def do_collector_thread(self):
    """Scheduling loop that runs each collector in its own thread.

    Once per second, until ``stopper.interrupted`` is set, start every
    collector whose ``next_check`` time is reached and that has no run
    still in progress, then reap the runs that have finished.
    """
    logger.log('COLLECTOR thread launched', part='check')
    # collector name -> thread of the run currently in progress
    running = {}
    while not stopper.interrupted:
        now = int(time.time())
        for entry in self.collectors.itervalues():
            colname = entry['name']
            inst = entry['inst']
            # Nothing to do if a run is still in progress or it is not due yet
            if colname in running or now < entry['next_check']:
                continue
            logger.debug('COLLECTOR: launching collector %s' % colname, part='check')
            running[colname] = threader.create_and_launch(inst.main, name='collector-%s' % colname)
            # Next run is scheduled 10 units after the previous deadline
            entry['next_check'] += 10
            entry['last_check'] = now

        # Reap the runs that are over so the collector can be launched again
        finished = [cname for (cname, thr) in running.iteritems() if not thr.is_alive()]
        for cname in finished:
            running.pop(cname).join()

        time.sleep(1)
def set_node_leave(nname):
    """Put the node named ``nname`` in leave state.

    The node is searched by name in ``self.nodes`` (``self`` is taken from
    the enclosing scope) and handed to ``self.set_leave``.  When no node
    has that name the result of ``abort(404, ...)`` is returned instead.
    """
    node = None
    with self.nodes_lock:
        for n in self.nodes.values():
            if n['name'] == nname:
                node = n
    if node is None:
        return abort(404, 'This node is not found')
    # Log the node that matched the name, not the last one the loop visited
    logger.log('PUTTING LEAVE the node %s' % node, part='http')
    self.set_leave(node)
    return
def drop_db(self, h):
    """Forget the database registered under ``h`` and delete its files.

    ``h`` is the integer key of the database in ``self.dbs``; the same
    number, formatted with ``%d``, is the name of its directory under
    ``self.ttldb_dir``.  A missing entry or a missing directory is not an
    error.
    """
    # First drop our reference to the database
    with self.lock:
        try:
            del self.dbs[h]
        except (KeyError, IndexError):
            # A mapping raises KeyError (a sequence IndexError) when the
            # entry is not there: nothing to forget, not a problem
            pass

    # And remove the files of this database
    p = os.path.join(self.ttldb_dir, '%d' % h)
    logger.log("Deleting ttl database tree", p, part='kv')
    shutil.rmtree(p, ignore_errors=True)
def do_detector_thread(self):
    """Evaluation loop for the detectors of ``self.clust``.

    Once per second, until ``self.clust.interrupted`` is set, every
    detector whose ``interval`` has elapsed since its ``last_launch`` gets
    its ``apply_if`` expression evaluated; when it is true, each of its
    tags missing from ``self.clust.tags`` is reported in the debug log.
    """
    logger.log("DETECTOR thread launched", part="detector")
    while not self.clust.interrupted:
        for gen in self.clust.detectors.values():
            logger.debug("LOOK AT DETECTOR", gen)
            # todo manage like it should: only the "<n>s" form is parsed
            interval = int(gen["interval"].split("s")[0])
            # Read the clock once so the test and the stored value agree
            now = int(time.time())
            if gen["last_launch"] >= now - interval:
                continue

            logger.debug("LAUNCHING DETECTOR", gen, part="detector")
            gen["last_launch"] = now
            do_apply = evaluater.eval_expr(gen["apply_if"])
            logger.debug("DO APPLY?", do_apply, part="detector")
            if not do_apply:
                continue

            for tag in gen["tags"]:
                if tag not in self.clust.tags:
                    # NOTE(review): the tag is only reported here, it is
                    # not added to self.clust.tags - confirm this is wanted
                    logger.debug("ADDING NEW TAGS", tag, part="detector")
        time.sleep(1)
def set_dead(self, suspect):
    """Handle the information that the node described by ``suspect`` is dead.

    ``suspect`` is a node dict; its ``uuid`` and ``incarnation`` entries
    are required, ``tags`` and ``services`` are optional.  The information
    is ignored when the node is unknown, when its incarnation is older
    than the one we hold, or when we do not currently see the node as
    alive.  When it is about ourselves we refute it: our incarnation is
    bumped and an alive broadcast is stacked.  Otherwise our entry is
    updated to the dead state and a dead broadcast is stacked.
    """
    incarnation = suspect['incarnation']
    uuid = suspect['uuid']

    # One lookup covers both "never seen" and "removed by another thread"
    node = self.nodes.get(uuid, None)
    if node is None:
        return

    # Maybe this data is too old
    if incarnation < node['incarnation']:
        return

    # We only care about alive nodes, dead and suspect ones are not
    # interesting here
    if node['state'] != 'alive':
        return

    # Maybe it's us? Then answer that we are alive with a newer incarnation
    if uuid == self.uuid:
        logger.log('DEAD: SOMEONE THINK I AM DEAD, BUT I AM ALIVE', part='gossip')
        self.incarnation += 1
        node['incarnation'] = self.incarnation
        self.stack_alive_broadcast(node)
        return

    logger.log('DEAD: I put in dead node %s' % node['name'], part='gossip')
    # It is someone else: update our view of it and spread the news
    node['incarnation'] = incarnation
    node['state'] = 'dead'
    node['suspect_time'] = int(time.time())
    node['tags'] = suspect.get('tags', [])
    node['services'] = suspect.get('services', {})
    self.stack_dead_broadcast(node)
def set_alive(self, node, bootstrap=False, strong=False):
    """Register ``node`` as alive and gossip it when this is news.

    ``node`` is a node dict; its ``state`` entry is forced to ``'alive'``.
    Unless ``bootstrap`` is true, a node that has our own address and port
    is ignored.  An unknown uuid is stored and broadcast right away.  For
    a known uuid the stored entry is replaced only when ``strong`` is true
    or when the incarnation is strictly newer, and an alive broadcast is
    stacked when the incarnation is newer or when ``strong`` is true and
    the state changed.
    """
    addr = node['addr']
    port = node['port']
    name = node['name']
    incarnation = node['incarnation']
    uuid = node['uuid']
    state = 'alive'
    node['state'] = state

    # Maybe it's me? if so skip it
    if not bootstrap and addr == self.addr and port == self.port:
        return

    # A node we never saw: store it (under the lock) and spread the news
    if uuid not in self.nodes:
        logger.log("New node detected", node, part='gossip')
        with self.nodes_lock:
            self.nodes[uuid] = node
        self.stack_alive_broadcast(node)
        return

    prev = self.nodes.get(uuid, None)
    # Another thread may have removed the entry in the meantime
    if prev is None:
        return

    # NOTE(review): when `node` is the very dict already stored in
    # self.nodes, its state was overwritten above, so `change` is False
    change = (prev['state'] != state)
    newer = incarnation > prev['incarnation']

    # Without fresh data only a strong call is allowed to go on
    if not (strong or newer):
        return

    logger.debug('ALIVENODE', name, prev['state'], state, strong, change, incarnation, prev['incarnation'], (strong and change), newer)

    # From here on `strong or newer` holds: replace the stored entry,
    # under the lock so the other threads are happy
    with self.nodes_lock:
        self.nodes[uuid] = node

    # Only broadcast if it's a new data from somewhere else
    if (strong and change) or newer:
        logger.debug("Updating alive a node", prev, 'with', node)
        self.stack_alive_broadcast(node)
def join(self):
    """Try to enter the cluster by doing a push-pull with the seed nodes.

    Each entry of ``self.seeds`` is ``"addr"`` or ``"addr:port"``; without
    a port ``self.port`` is used.  Rounds are repeated, with a short pause
    in between, until ``self.nodes`` no longer holds exactly one entry, or
    ``self.interrupted`` or ``self.bootstrap`` is set.  With no seed at all
    the method returns immediately.
    """
    logger.log("We will try to join our seeds members", self.seeds, part='gossip')
    if not self.seeds:
        logger.log("No seeds nodes, I'm a bootstrap node?")
        return

    others = []
    for seed in self.seeds:
        elts = seed.split(':')
        addr = elts[0]
        port = int(elts[1]) if len(elts) > 1 else self.port
        others.append((addr, port))

    while True:
        # New random order at every round, so that every seed gets its
        # chance even when there are more than KGOSSIP of them
        random.shuffle(others)
        logger.log('JOINING myself %s is joining %s nodes' % (self.name, others), part='gossip')
        # Do not merge with more than KGOSSIP distant nodes per round
        for other in others[:KGOSSIP]:
            self.do_push_pull(other)
        # If we got enough nodes, we exit
        if len(self.nodes) != 1 or self.interrupted or self.bootstrap:
            return
        # Do not hammer the cpu....
        time.sleep(0.1)
def do_ping(self, other):
    """Ping ``other`` over UDP and fall back on relays when it stays silent.

    An encrypted ``ping`` message is sent to ``other['addr']`` /
    ``other['port']`` and an answer is awaited for 3 seconds.  Any answer
    marks the node alive (``set_alive(..., strong=True)``).  On a timeout
    or a name-resolution error, up to 3 other alive nodes are asked, with
    a ``ping-relay`` message, to ping it for us; when there is no such
    node, ``other`` is handed to ``set_suspect``.
    """
    ping_payload = {'type': 'ping', 'seqno': 0, 'node': other['name'], 'from': self.uuid}
    message = json.dumps(ping_payload)
    enc_message = encrypter.encrypt(message)
    addr = other['addr']
    port = other['port']
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)  # UDP
        try:
            sock.sendto(enc_message, (addr, port))
            logger.debug('PING waiting %s ack message' % other['name'], part='gossip')
            # Allow 3s to get an answer
            sock.settimeout(3)
            ret = sock.recv(65535)
        finally:
            # The direct-ping socket is not needed past this point; close
            # it so it is not leaked when the relay socket replaces it
            sock.close()
        logger.debug('PING got a return from %s' % other['name'], len(ret), part='gossip')
        # An answer? great it is alive!
        self.set_alive(other, strong=True)
    except (socket.timeout, socket.gaierror) as exp:
        logger.debug("PING: error joining the other node %s:%s : %s" % (addr, port, exp), part='gossip')
        logger.debug("PING: go indirect mode", part='gossip')
        with self.nodes_lock:
            possible_relays = [n for n in self.nodes.values()
                               if n['uuid'] != self.uuid and n != other and n['state'] == 'alive']

        if len(possible_relays) == 0:
            logger.log("PING: no possible relays for ping", part='gossip')
            self.set_suspect(other)
            # Nobody can relay: there is no answer to wait for
            return

        # Take at most 3 relays to ask ping
        relays = random.sample(possible_relays, min(len(possible_relays), 3))
        logger.debug('POSSIBLE RELAYS', relays)
        ping_relay_payload = {'type': 'ping-relay', 'seqno': 0, 'tgt': other['uuid'], 'from': self.uuid}
        message = json.dumps(ping_relay_payload)
        enc_message = encrypter.encrypt(message)
        # NOTE(review): this relay socket is not closed in this block;
        # presumably the code waiting for the relayed answer does it - confirm
        sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)  # UDP
        for r in relays:
            try:
                sock.sendto(enc_message, (r['addr'], r['port']))
                logger.debug('PING waiting ack message', part='gossip')
            except socket.error:
                logger.error('Cannot send a ping relay to %s:%s' % (r['addr'], r['port']), part='gossip')
def look_at_deads(self):
    """Expire the nodes that stayed too long in suspect or leave state.

    A suspect node whose ``suspect_time`` is older than the suspect
    timeout is switched to ``'dead'`` and a dead broadcast is stacked.  A
    leave node whose ``leave_time`` is older than three times that
    timeout is removed from ``self.nodes``.
    """
    # suspect a node for 5 * log(n+1) * interval
    node_scale = math.ceil(math.log10(float(len(self.nodes) + 1)))
    probe_interval = 1
    suspicion_mult = 5
    suspect_timeout = suspicion_mult * node_scale * probe_interval
    # a leave node is kept three times longer than a suspect one
    leave_timeout = suspect_timeout * 3

    now = int(time.time())
    with self.nodes_lock:
        for node in self.nodes.values():
            # Only look at suspect nodes of course...
            if node['state'] != 'suspect':
                continue
            stime = node.get('suspect_time', now)
            if stime < (now - suspect_timeout):
                logger.log("SUSPECT: NODE", node['name'], node['incarnation'], node['state'], "is NOW DEAD", part='gossip')
                node['state'] = 'dead'
                self.stack_dead_broadcast(node)
        # Snapshot of the nodes, so the leave pass below really has
        # something to look at and can run without holding the lock
        nodes = dict(self.nodes)

    # Now for leave nodes, this time we will really remove the entry from our nodes
    to_del = []
    for (uuid, node) in nodes.items():
        # Only look at leave nodes of course...
        if node['state'] != 'leave':
            continue
        ltime = node.get('leave_time', now)
        logger.debug("LEAVE TIME", node['name'], ltime, now - leave_timeout, (now - leave_timeout) - ltime, part='gossip')
        if ltime < (now - leave_timeout):
            logger.log("LEAVE: NODE", node['name'], node['incarnation'], node['state'], "is now definitivly leaved. We remove it from our nodes", part='gossip')
            to_del.append(uuid)

    # now really remove them from our list :)
    if to_del:
        with self.nodes_lock:
            for uuid in to_del:
                # already gone? it is what we want
                self.nodes.pop(uuid, None)
def clean_old(self):
    """Purge the hourly TTL databases that are below the time limit.

    The limit is ``NOW.now`` plus 3600, rounded down to a multiple of
    3600.  Every sub-directory of ``self.ttldb_dir`` whose name is an
    integer below that limit is treated as a database: each key it yields
    is deleted through ``self.kv.delete`` and the database itself is then
    dropped with ``self.drop_db``.
    """
    logger.debug("TTL clean old", part='kv')
    limit = ((NOW.now + 3600) // 3600) * 3600

    # Look at the databases directories, they are named after their hour
    for dname in os.listdir(self.ttldb_dir):
        try:
            bhour = int(dname)
        except ValueError:
            # someone added an entry that is not an int here, not ours
            continue

        # Not available for cleaning yet
        if bhour >= limit:
            continue

        logger.log("TTL bhour is too low!", bhour, part='kv')
        # Take the database and ask the cluster to delete every key in it,
        # whatever its value is
        cdb = self.get_ttl_db(bhour)
        for (k, v) in cdb.RangeIter():
            self.kv.delete(k)
        # All entries are gone, the index database can be removed too
        self.drop_db(bhour)
def bailout_after_leave(self):
    """Wait ten seconds, then publish the ``'interrupt'`` event."""
    grace_period = 10  # seconds left before asking everything to stop
    logger.log('Bailing out in few seconds. I was put in leave state')
    time.sleep(grace_period)
    logger.log('Exiting from a self leave message')
    # Per the original author's note, this makes every looping thread see
    # its interrupted flag set to True
    pubsub.pub('interrupt')
def set_leave(self, leaved):
    """Handle the information that the node described by ``leaved`` leaves.

    ``leaved`` is a node dict; its ``uuid`` and ``incarnation`` entries
    are required, ``tags`` and ``services`` are optional.  The information
    is ignored when the node is unknown or already in leave state.  When
    it is about ourselves it is accepted only with exactly our current
    incarnation; we then bump our incarnation, stack a leave broadcast and
    start a thread that publishes ``'interrupt'`` ten seconds later.  For
    another node it is accepted unless its incarnation is older than the
    one we hold; our entry is then updated and a leave broadcast stacked.
    """
    incarnation = leaved['incarnation']
    uuid = leaved['uuid']
    logger.debug("SET_LEAVE::", leaved, part='gossip')

    # One lookup covers both "never seen" and "removed by another thread"
    node = self.nodes.get(uuid, None)
    if node is None:
        return

    # Maybe we already know it's leaved, so don't update it
    if node['state'] == 'leave':
        return

    logger.debug("SET LEAVE %s and inner node %s" % (leaved, node), part='gossip')

    is_me = (uuid == self.uuid)
    if is_me:
        # For ourselves the incarnation must be exactly ours, so we are
        # sure it is really us that should leave
        if incarnation != node['incarnation']:
            logger.debug("LEAVE INCARNATION NOT THE SAME FOR MYSELF", part='gossip')
            return
    elif incarnation < node['incarnation']:
        # If not for me, use the classic 'not already known' rule
        logger.debug("LEAVE, NOT FOR ME, THE INCARNATION NUMBER IS TOO OLD", part='gossip')
        return

    logger.debug("SET LEAVE UUID and SELF.UUID", uuid, self.uuid, part='gossip')

    # Maybe it's us?? If so we must send our broadcast and exit in few seconds
    if is_me:
        logger.log('LEAVE: someone is asking me for leaving.', part='gossip')
        self.incarnation += 1
        node['incarnation'] = self.incarnation
        self.stack_leave_broadcast(node)

        def bailout_after_leave(self):
            logger.log('Bailing out in few seconds. I was put in leave state')
            time.sleep(10)
            logger.log('Exiting from a self leave message')
            # Ask every looping thread to stop
            pubsub.pub('interrupt')

        threader.create_and_launch(bailout_after_leave, args=(self,))
        return

    logger.log('LEAVING: The node %s is leaving' % node['name'], part='gossip')
    # It is someone else that leaves: update our view of it and spread the news
    node['incarnation'] = incarnation
    node['state'] = 'leave'
    node['leave_time'] = int(time.time())
    node['tags'] = leaved.get('tags', [])
    node['services'] = leaved.get('services', {})
    self.stack_leave_broadcast(node)
sock.settimeout(3*2) try: ret = sock.recv(65535) except socket.timeout: # still noone succed to ping it? I suspect it self.set_suspect(other) sock.close() return msg = json.loads(ret) sock.close() logger.debug('PING: got an answer from a relay', msg, part='gossip') logger.debug('RELAY set alive', other['name'], part='gossip') # Ok it's no more suspected, great :) self.set_alive(other, strong=True) except socket.error, exp: logger.log("PING: cannot join the other node %s:%s : %s" % (addr, port, exp), part='gossip') # Randomly push some gossip broadcast messages and send them to # KGOSSIP others nodes def do_gossip_push(self, dest): message = '' to_del = [] stack = [] tags = dest['tags'] for b in broadcaster.broadcasts: # not a valid node for this message, skip it if 'tag' in b and b['tag'] not in tags: continue old_message = message send = b['send']
def log(self, *args):
    """Forward the positional arguments, unchanged, to ``logger.log``."""
    logger.log(*args)
self.cur_value = f.read() f.close() except IOError, exp: logger.error('Cannot open path file %s : %s' % (self.g['path'], exp)) self.output = None self.template = '' self.buf = '' return False # If not exists or the value did change, regenerate it :) if not os.path.exists(self.g['path']) or self.cur_value != self.output: logger.debug('Generator %s generate a new value, writing it to %s' % (self.g['name'], self.g['path'])) try: f = open(self.g['path'], 'w') f.write(self.output) f.close() logger.log('Generator %s did generate a new file at %s' % (self.g['name'], self.g['path'])) return True except IOError, exp: logger.error('Cannot write path file %s : %s' % (self.g['path'], exp)) self.output = None self.template = '' self.buf = '' return False # If need launch the restart command, shoul not block too long of # course def launch_command(self): cmd = self.g['command'] try: p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True, preexec_fn=os.setsid) except Exception, exp: