class TableExtractor: def __init__(self, node_to_extract): self.node_to_extract = node_to_extract self.my_id = self._my_id = RandomId() self.msg_f = message.MsgFactory(PYMDHT_VERSION, self.my_id, None) self.querier = Querier() self.next_level = 159 self.last_extraction_ts = 0 def on_stop(self): pass #self._experimental_m.on_stop() def main_loop(self): msgs_to_send = [] current_time = time.time() if current_time > self.last_extraction_ts + 1: fn_msg = self.msg_f.outgoing_find_node_query( self.node_to_extract, self.node_to_extract.id.generate_close_id(self.next_level), None, None) if PING: fn_msg = self.msg_f.outgoing_ping_query( self.node_to_extract, None) msgs_to_send.append(fn_msg) self.last_extraction_ts = current_time self.next_level -= 1 # Take care of timeouts (self._next_timeout_ts, timeout_queries) = self.querier.get_timeout_queries() for query in timeout_queries: print 'TIMEOUT' timeout_call_ts, datagrams_to_send = self.querier.register_queries( msgs_to_send) return self.last_extraction_ts + 1, datagrams_to_send def on_datagram_received(self, datagram): data = datagram.data addr = datagram.addr datagrams_to_send = [] try: msg = self.msg_f.incoming_msg(datagram) except (message.MsgError): # ignore message return self._next_main_loop_call_ts, datagrams_to_send if msg.type == message.RESPONSE: related_query = self.querier.get_related_query(msg) if related_query and related_query.query == message.FIND_NODE: print 'level', version_repr(msg.version) for node_ in msg.nodes: print node_ else: print 'not related' return self.last_extraction_ts + 1, datagrams_to_send
class TableExtractor: def __init__(self, node_to_extract): self.node_to_extract = node_to_extract self.my_id = self._my_id = RandomId() self.msg_f = message.MsgFactory(PYMDHT_VERSION, self.my_id, None) self.querier = Querier() self.next_level = 159 self.last_extraction_ts = 0 def on_stop(self): pass # self._experimental_m.on_stop() def main_loop(self): msgs_to_send = [] current_time = time.time() if current_time > self.last_extraction_ts + 1: fn_msg = self.msg_f.outgoing_find_node_query( self.node_to_extract, self.node_to_extract.id.generate_close_id(self.next_level), None, None ) if PING: fn_msg = self.msg_f.outgoing_ping_query(self.node_to_extract, None) msgs_to_send.append(fn_msg) self.last_extraction_ts = current_time self.next_level -= 1 # Take care of timeouts (self._next_timeout_ts, timeout_queries) = self.querier.get_timeout_queries() for query in timeout_queries: print "TIMEOUT" timeout_call_ts, datagrams_to_send = self.querier.register_queries(msgs_to_send) return self.last_extraction_ts + 1, datagrams_to_send def on_datagram_received(self, datagram): data = datagram.data addr = datagram.addr datagrams_to_send = [] try: msg = self.msg_f.incoming_msg(datagram) except (message.MsgError): # ignore message return self._next_main_loop_call_ts, datagrams_to_send if msg.type == message.RESPONSE: related_query = self.querier.get_related_query(msg) if related_query and related_query.query == message.FIND_NODE: print "level", version_repr(msg.version) for node_ in msg.nodes: print node_ else: print "not related" return self.last_extraction_ts + 1, datagrams_to_send
class Crawler(object):
    """Crawl DHT nodes, driven step-by-step by an RCrawler object.

    main_loop() and on_datagram_received() each return a
    (next_main_loop_ts, datagrams_to_send) pair for the caller's reactor
    loop; main_loop() returns None once the crawl is done.

    NOTE(review): RCrawler, RandomId, message, Querier, PYMDHT_VERSION,
    START_PREFIX_LEN and PRINT_DOT_EACH come from the enclosing module.
    """

    def __init__(self, bootstrap_nodes):
        # The RCrawler is seeded with the first bootstrap node's ID and
        # is immediately fed every bootstrap node.
        self.rcrawler = RCrawler(set(), set(), START_PREFIX_LEN,
                                 bootstrap_nodes[0].id)
        self.rcrawler.got_nodes_handler(None, bootstrap_nodes)
        self.my_id = self._my_id = RandomId()
        self.msg_f = message.MsgFactory(PYMDHT_VERSION, self.my_id, None)
        self.querier = Querier()
        self.next_main_loop_ts = 0
        self.num_msgs = 0        # total datagrams sent so far
        self.ok_nodes = set()    # nodes that replied to a query
        self.dead_nodes = set()  # nodes whose queries timed out

    def on_stop(self):
        pass

    def main_loop(self):
        """Send the next crawl query and process query timeouts.

        Returns (next_main_loop_ts, datagrams_to_send), or None (after
        printing a summary) once the RCrawler reports it is done.
        """
        self.next_main_loop_ts = time.time() + .1
        if self.rcrawler.done:
            # Crawl finished: print the summary and stop rescheduling.
            print 'ind | ok dead | ok dead'
            self.rcrawler.print_result()
            print self.rcrawler.get_num_ok(), self.rcrawler.get_num_dead()
            print self.num_msgs, 'messages sent'
            for n in sorted(self.ok_nodes, key=attrgetter('ip')):
                print n
            return
        msgs_to_send = []
        # Ask the RCrawler which node to query next, and for what target.
        node_, target, rcrawler_obj = self.rcrawler.next()
        if target:
            # rcrawler_obj rides along as the query's experimental_obj so
            # the matching response/timeout can be routed back to it.
            msg = self.msg_f.outgoing_find_node_query(node_, target, None,
                                                      rcrawler_obj)
            #print 'target', `target`, 'to node', `node_.id`
            #print 'sending query to', extracting_node.node,
            #print extracting_node.node.id.log_distance(TARGET)
            msgs_to_send.append(msg)
        # Take care of timeouts
        (self._next_timeout_ts,
         timeout_queries) = self.querier.get_timeout_queries()
        for related_query in timeout_queries:
            #print 'timeout'
            related_query.experimental_obj.timeout_handler(
                related_query.dst_node)
            self.dead_nodes.add(related_query.dst_node)
        if msgs_to_send:
            timeout_call_ts, datagrams_to_send = self.querier.register_queries(
                msgs_to_send)
        else:
            datagrams_to_send = []
        self.num_msgs += len(datagrams_to_send)
        if datagrams_to_send and self.num_msgs % PRINT_DOT_EACH == 0:
            # Progress indicator: one dot every PRINT_DOT_EACH messages.
            sys.stdout.write('.')
            sys.stdout.flush()
        return self.next_main_loop_ts, datagrams_to_send

    def on_datagram_received(self, datagram):
        """Dispatch a response to the RCrawler object that sent the query.

        Returns (next_main_loop_ts, datagrams_to_send).
        """
        data = datagram.data
        addr = datagram.addr
        try:
            msg = self.msg_f.incoming_msg(datagram)
        except (message.MsgError):
            # ignore message
            return self.next_main_loop_ts, []
        if msg.type == message.RESPONSE:
            related_query = self.querier.get_related_query(msg)
            #print 'got reply',
            if related_query and related_query.experimental_obj:
                #print 'related >>>>>>>>>>>>>>>>>>>>>>', len(msg.nodes)
                nodes = msg.all_nodes
                node_ = msg.src_node
                related_query.experimental_obj.got_nodes_handler(node_, nodes)
                self.ok_nodes.add(node_)
        return self.next_main_loop_ts, []  # datagrams_to_send
class Crawler(object):
    """Crawl the DHT region whose node IDs share a target prefix.

    Nodes whose ID starts with the target prefix are tracked by the
    RCrawler; other discovered nodes are only used to route towards the
    region.  main_loop() and on_datagram_received() return a
    (next_main_loop_ts, datagrams_to_send) pair; main_loop() returns
    None once the crawl ends.

    NOTE(review): RCrawler, RandomId, message, Querier, PYMDHT_VERSION,
    START_PREFIX_LEN, EXTRACTION_DELAY and PRINT_DOT_EACH come from the
    enclosing module.
    """

    def __init__(self, bootstrap_nodes):
        # The crawl is centered on the first bootstrap node's ID.
        self.target = bootstrap_nodes[0].id
        target_prefix = self.target.get_prefix(START_PREFIX_LEN)
        print target_prefix
        self.rcrawler = RCrawler(target_prefix)
        for n in bootstrap_nodes:
            self.rcrawler.found_node_handler(n)
        self.pending_nodes = bootstrap_nodes  # nodes queued to be queried
        self.my_id = self._my_id = RandomId()
        self.msg_f = message.MsgFactory(PYMDHT_VERSION, self.my_id, None)
        self.querier = Querier()
        self.next_main_loop_ts = 0
        self.num_msgs = 0
        self.known_nodes = set(bootstrap_nodes)  # dedup for pending_nodes
        self.ok_nodes = set()    # nodes that replied
        self.dead_nodes = set()  # nodes whose queries timed out
        self.last_msg_ts = time.time()

    def on_stop(self):
        pass

    def main_loop(self):
        """Send the next query (bootstrap or pending node) and reap timeouts.

        The crawl ends after 4 seconds without any message being sent; a
        summary is then printed and None is returned.
        """
        self.next_main_loop_ts = time.time() + EXTRACTION_DELAY
        if time.time() > self.last_msg_ts + 4:  # self.rcrawler.done:
            print 'ind | ok dead | ok dead'
            self.rcrawler.print_result()
            print 'total OK/DEAD', len(self.rcrawler.ok_nodes),
            print len(self.rcrawler.dead_nodes)
            print self.num_msgs, 'messages sent'
            for n in sorted(self.ok_nodes, key=attrgetter('ip')):
                print n
            return
        target = None
        msgs_to_send = []
        # Periodically interleave RCrawler "bootstrap" queries with the
        # normal pending-node queries (more frequently early on).
        if ((self.num_msgs < 20 and self.num_msgs % 5 == 0) or
                (self.num_msgs < 100 and self.num_msgs % 10 == 0) or
                (self.num_msgs > 100 and self.num_msgs % 20 == 0) or
                (self.num_msgs > 100 and not self.pending_nodes)):
            dst_node, target = self.rcrawler.next_bootstrap_msg()
            if target:
                print 'O',
            else:
                print 'F',
        if not target and self.pending_nodes:
            dst_node = self.pending_nodes.pop(0)
            if dst_node.id.bin_str.startswith(self.rcrawler.target_prefix):
                # In-region node: query it for its own ID.
                self.rcrawler.pinged_node_handler(dst_node)
                target = dst_node.id
            else:
                # Out-of-region node: query it for the crawl target.
                target = self.target
        if target:
            # self is passed as experimental_obj so responses/timeouts
            # are routed back to this crawler.
            msg = self.msg_f.outgoing_find_node_query(
                dst_node, target, None, self)
            #print 'target', `target`, 'to node', `node_.id`
            #print 'sending query to', extracting_node.node,
            #print extracting_node.node.id.log_distance(TARGET)
            msgs_to_send.append(msg)
        # Take care of timeouts
        (self._next_timeout_ts,
         timeout_queries) = self.querier.get_timeout_queries()
        for related_query in timeout_queries:
            #print 'timeout'
            timeout_node = related_query.dst_node
            self.dead_nodes.add(timeout_node)
            if timeout_node.id.bin_str.startswith(self.rcrawler.target_prefix):
                # Only in-region timeouts are reported to the RCrawler.
                self.rcrawler.timeout_handler(timeout_node)
        if msgs_to_send:
            timeout_call_ts, datagrams_to_send = self.querier.register_queries(
                msgs_to_send)
            self.last_msg_ts = time.time()
        else:
            datagrams_to_send = []
        self.num_msgs += len(datagrams_to_send)
        if datagrams_to_send and self.num_msgs % PRINT_DOT_EACH == 0:
            #print target.hex
            sys.stdout.write('.')
            sys.stdout.flush()
        return self.next_main_loop_ts, datagrams_to_send

    def on_datagram_received(self, datagram):
        """Process a response: update RCrawler state and queue new nodes.

        Returns (next_main_loop_ts, datagrams_to_send).
        """
        data = datagram.data
        addr = datagram.addr
        try:
            msg = self.msg_f.incoming_msg(datagram)
        except (message.MsgError):
            # ignore message
            return self.next_main_loop_ts, []
        if msg.type == message.RESPONSE:
            related_query = self.querier.get_related_query(msg)
            #print 'got reply',
            if related_query and related_query.experimental_obj:
                nodes = msg.all_nodes
                src_node = msg.src_node
                print '%s >>>>>>>>>>>>>>>>>>>>>> %d' % (
                    src_node.id.hex, len(msg.nodes))
                if src_node.id.bin_str.startswith(self.rcrawler.target_prefix):
                    self.rcrawler.ok_node_handler(src_node)
                for n in nodes:
                    if n not in self.known_nodes:
                        # Early on (fewer than 3 OK nodes) accept any new
                        # node; afterwards only in-region nodes are queued.
                        add_node = len(self.ok_nodes) < 3
                        if n.id.bin_str.startswith(
                                self.rcrawler.target_prefix):
                            add_node = True
                            self.rcrawler.found_node_handler(n)
                        if add_node:
                            self.known_nodes.add(n)
                            self.pending_nodes.append(n)
                self.ok_nodes.add(src_node)
        return self.next_main_loop_ts, []  # datagrams_to_send
class Crawler(object):
    """Prefix-region DHT crawler (near-duplicate of the sibling variant).

    Crawls the region of the DHT whose node IDs begin with a prefix of
    the first bootstrap node's ID.  main_loop() and
    on_datagram_received() return (next_main_loop_ts, datagrams_to_send);
    main_loop() returns None when the crawl ends.

    NOTE(review): RCrawler, RandomId, message, Querier, PYMDHT_VERSION,
    START_PREFIX_LEN, EXTRACTION_DELAY and PRINT_DOT_EACH are
    module-level names not visible in this chunk.
    """

    def __init__(self, bootstrap_nodes):
        # Crawl target: the first bootstrap node's ID.
        self.target = bootstrap_nodes[0].id
        target_prefix = self.target.get_prefix(START_PREFIX_LEN)
        print target_prefix
        self.rcrawler = RCrawler(target_prefix)
        for n in bootstrap_nodes:
            self.rcrawler.found_node_handler(n)
        self.pending_nodes = bootstrap_nodes  # query queue
        self.my_id = self._my_id = RandomId()
        self.msg_f = message.MsgFactory(PYMDHT_VERSION, self.my_id, None)
        self.querier = Querier()
        self.next_main_loop_ts = 0
        self.num_msgs = 0
        self.known_nodes = set(bootstrap_nodes)  # dedup set
        self.ok_nodes = set()
        self.dead_nodes = set()
        self.last_msg_ts = time.time()

    def on_stop(self):
        pass

    def main_loop(self):
        """Send the next query and process timeouts.

        Ends the crawl (prints summary, returns None) after 4 idle
        seconds with no message sent.
        """
        self.next_main_loop_ts = time.time() + EXTRACTION_DELAY
        if time.time() > self.last_msg_ts + 4:  # self.rcrawler.done:
            print 'ind | ok dead | ok dead'
            self.rcrawler.print_result()
            print 'total OK/DEAD', len(self.rcrawler.ok_nodes),
            print len(self.rcrawler.dead_nodes)
            print self.num_msgs, 'messages sent'
            for n in sorted(self.ok_nodes, key=attrgetter('ip')):
                print n
            return
        target = None
        msgs_to_send = []
        # Interleave RCrawler bootstrap queries with pending-node
        # queries, denser at the start of the crawl.
        if ((self.num_msgs < 20 and self.num_msgs % 5 == 0) or
                (self.num_msgs < 100 and self.num_msgs % 10 == 0) or
                (self.num_msgs > 100 and self.num_msgs % 20 == 0) or
                (self.num_msgs > 100 and not self.pending_nodes)):
            dst_node, target = self.rcrawler.next_bootstrap_msg()
            if target:
                print 'O',
            else:
                print 'F',
        if not target and self.pending_nodes:
            dst_node = self.pending_nodes.pop(0)
            if dst_node.id.bin_str.startswith(self.rcrawler.target_prefix):
                # In-region node: ask it for its own ID.
                self.rcrawler.pinged_node_handler(dst_node)
                target = dst_node.id
            else:
                # Out-of-region node: ask it for the crawl target.
                target = self.target
        if target:
            # self acts as the experimental_obj for response routing.
            msg = self.msg_f.outgoing_find_node_query(dst_node, target,
                                                      None, self)
            #print 'target', `target`, 'to node', `node_.id`
            #print 'sending query to', extracting_node.node,
            #print extracting_node.node.id.log_distance(TARGET)
            msgs_to_send.append(msg)
        # Take care of timeouts
        (self._next_timeout_ts,
         timeout_queries) = self.querier.get_timeout_queries()
        for related_query in timeout_queries:
            #print 'timeout'
            timeout_node = related_query.dst_node
            self.dead_nodes.add(timeout_node)
            if timeout_node.id.bin_str.startswith(
                    self.rcrawler.target_prefix):
                # Only in-region timeouts are reported to the RCrawler.
                self.rcrawler.timeout_handler(timeout_node)
        if msgs_to_send:
            timeout_call_ts, datagrams_to_send = self.querier.register_queries(
                msgs_to_send)
            self.last_msg_ts = time.time()
        else:
            datagrams_to_send = []
        self.num_msgs += len(datagrams_to_send)
        if datagrams_to_send and self.num_msgs % PRINT_DOT_EACH == 0:
            #print target.hex
            sys.stdout.write('.')
            sys.stdout.flush()
        return self.next_main_loop_ts, datagrams_to_send

    def on_datagram_received(self, datagram):
        """Update RCrawler state from a response and queue new nodes.

        Returns (next_main_loop_ts, datagrams_to_send).
        """
        data = datagram.data
        addr = datagram.addr
        try:
            msg = self.msg_f.incoming_msg(datagram)
        except (message.MsgError):
            # ignore message
            return self.next_main_loop_ts, []
        if msg.type == message.RESPONSE:
            related_query = self.querier.get_related_query(msg)
            #print 'got reply',
            if related_query and related_query.experimental_obj:
                nodes = msg.all_nodes
                src_node = msg.src_node
                print '%s >>>>>>>>>>>>>>>>>>>>>> %d' % (src_node.id.hex,
                                                        len(msg.nodes))
                if src_node.id.bin_str.startswith(self.rcrawler.target_prefix):
                    self.rcrawler.ok_node_handler(src_node)
                for n in nodes:
                    if n not in self.known_nodes:
                        # Accept any node while fewer than 3 OK nodes are
                        # known; afterwards only in-region nodes.
                        add_node = len(self.ok_nodes) < 3
                        if n.id.bin_str.startswith(
                                self.rcrawler.target_prefix):
                            add_node = True
                            self.rcrawler.found_node_handler(n)
                        if add_node:
                            self.known_nodes.add(n)
                            self.pending_nodes.append(n)
                self.ok_nodes.add(src_node)
        return self.next_main_loop_ts, []  # datagrams_to_send
class Crawler(object): def __init__(self, bootstrap_nodes): self.rcrawler = RCrawler(set(), set(), 15, bootstrap_nodes[0].id) self.rcrawler.got_nodes_handler(None, bootstrap_nodes) self.my_id = self._my_id = RandomId() self.msg_f = message.MsgFactory(PYMDHT_VERSION, self.my_id, None) self.querier = Querier() self.next_main_loop_ts = 0 self.num_msgs = 0 def on_stop(self): pass def main_loop(self): self.next_main_loop_ts = time.time() + .1 if self.rcrawler.done: print 'ind | ok dead | ok dead' self.rcrawler.print_result() print self.rcrawler.get_num_ok(), self.rcrawler.get_num_dead() print self.num_msgs, 'messages sent' return msgs_to_send = [] node_, target, rcrawler_obj = self.rcrawler.next() if target: msg = self.msg_f.outgoing_find_node_query( node_, target, None, rcrawler_obj) #print 'target', `target`, 'to node', `node_.id` #print 'sending query to', extracting_node.node, #print extracting_node.node.id.log_distance(TARGET) msgs_to_send.append(msg) # Take care of timeouts (self._next_timeout_ts, timeout_queries) = self.querier.get_timeout_queries() for related_query in timeout_queries: #print 'timeout' related_query.experimental_obj.timeout_handler(related_query.dst_node) if msgs_to_send: timeout_call_ts, datagrams_to_send = self.querier.register_queries( msgs_to_send) else: datagrams_to_send = [] self.num_msgs += len(datagrams_to_send) if datagrams_to_send and self.num_msgs % 1 == 0: sys.stdout.write('.') sys.stdout.flush() return self.next_main_loop_ts, datagrams_to_send def on_datagram_received(self, datagram): data = datagram.data addr = datagram.addr try: msg = self.msg_f.incoming_msg(datagram) except(message.MsgError): # ignore message return self.next_main_loop_ts, datagrams_to_send if msg.type == message.RESPONSE: related_query = self.querier.get_related_query(msg) #print 'got reply', if related_query and related_query.experimental_obj: #print 'related >>>>>>>>>>>>>>>>>>>>>>', len(msg.nodes) try: nodes = msg.all_nodes except AttributeError: print '\nno 
nodes>>>>>>>', msg._msg_dict nodes = [] node_ = related_query.dst_node related_query.experimental_obj.got_nodes_handler(node_, nodes) return self.next_main_loop_ts, []#datagrams_to_send
class Crawler(object):
    """Crawl nodes close to a random target using an ExtractingQueue.

    The queue hands out (node, step_target) pairs; one find_node query
    is sent per main_loop() call until 4 seconds pass without a query
    being sent.  Both main_loop() and on_datagram_received() return a
    (next_call_ts, datagrams_to_send) pair; main_loop() returns None
    when the crawl is done.

    NOTE(review): ExtractingQueue, RandomId, message, Querier,
    PYMDHT_VERSION, EXTRACTION_DELAY and in_range come from the
    enclosing module.
    """

    def __init__(self, bootstrap_nodes):
        self.target = RandomId()  # random crawl target
        self.extracting_queue = ExtractingQueue(self.target)
        for node_ in bootstrap_nodes:
            is_new_node = self.extracting_queue.add_node(node_)
        self.my_id = self._my_id = RandomId()
        self.msg_f = message.MsgFactory(PYMDHT_VERSION, self.my_id, None)
        self.querier = Querier()
        self.last_extraction_ts = time.time()
        self.num_msgs = 0
        # Nodes in range of the target that have responded.
        self.nodes_inrange_w_response = set()

    def on_stop(self):
        pass  # self._experimental_m.on_stop()

    def main_loop(self):
        """Send the next extraction step; stop after 4 idle seconds.

        Returns (next_call_ts, datagrams_to_send), or None when done.
        """
        current_time = time.time()
        if current_time > self.last_extraction_ts + 4:
            return  # crawler DONE
        msgs_to_send = []
        # Once enough in-range nodes have replied, restrict further
        # steps to in-range nodes only.
        only_inrange = len(self.nodes_inrange_w_response) > 4
        extracting_node, step_target = \
            self.extracting_queue.next_node_step_target(only_inrange)
        if step_target:
            # extracting_node is attached as experimental_obj so the
            # response/timeout is routed back to it.
            msg = self.msg_f.outgoing_find_node_query(
                extracting_node.lookup_node, step_target, None,
                extracting_node)
            #print 'sending query to', extracting_node.node,
            #print extracting_node.node.id.log_distance(TARGET)
            msgs_to_send.append(msg)
            self.last_extraction_ts = current_time
        # Take care of timeouts
        (self._next_timeout_ts,
         timeout_queries) = self.querier.get_timeout_queries()
        for query in timeout_queries:
            #print 'timeout'
            query.experimental_obj.timeout_handler()
        if msgs_to_send:
            timeout_call_ts, datagrams_to_send = self.querier.register_queries(
                msgs_to_send)
        else:
            datagrams_to_send = []
        self.num_msgs += len(datagrams_to_send)
        if datagrams_to_send and self.num_msgs % 100 == 0:
            # Progress dot every 100 messages.
            sys.stdout.write('.')
            sys.stdout.flush()
        return current_time + .01, datagrams_to_send

    def on_datagram_received(self, datagram):
        """Feed a response's nodes back into the extracting queue.

        Returns (next_call_ts, datagrams_to_send).
        """
        data = datagram.data
        addr = datagram.addr
        datagrams_to_send = []
        try:
            msg = self.msg_f.incoming_msg(datagram)
        except(message.MsgError):
            # ignore message
            return self.last_extraction_ts + EXTRACTION_DELAY, datagrams_to_send
        if msg.type == message.RESPONSE:
            related_query = self.querier.get_related_query(msg)
            #print 'got reply',
            if related_query and related_query.experimental_obj:
                #print 'related >>>>>>>>>>>>>>>>>>>>>>', len(msg.nodes)
                try:
                    nodes = msg.nodes
                except AttributeError:
                    # Response without a nodes field; log and continue.
                    print '\nno nodes>>>>>>>', msg._msg_dict
                    nodes = []
                lookup_node = related_query.dst_node
                if in_range(lookup_node):
                    self.nodes_inrange_w_response.add(lookup_node)
                related_query.experimental_obj.add_found_nodes(nodes)
                new_nodes = False
                for node_ in nodes:
                    self.extracting_queue.add_node(node_)
        self.num_msgs += len(datagrams_to_send)
        return self.last_extraction_ts + EXTRACTION_DELAY, datagrams_to_send

    def get_bootstrap_nodes(self):
        """Return the nodes of the last 100 pinged queue entries, for
        reuse as bootstrap nodes."""
        return [en.lookup_node.node
                for en in self.extracting_queue.pinged_nodes[-100:]]

    def print_summary(self):
        # Delegate to the queue, then report the message count.
        self.extracting_queue.print_summary()
        print "Messages sent:", self.num_msgs

    def print_results(self):
        self.extracting_queue.print_results()