def __init__(self, network, store_type, output_dir=""):
     self.network = network
     self.store_type = store_type
     store_class = GenericStore.get_store_class(store_type)
     # Buffer to store network data
     self.gbuffer = GraphBuffer(self.network.label,
                                store_class,
                                output_dir=output_dir)
Exemplo n.º 2
0
 def __init__(self, network, store_type="gml"):
     """ Function to initialize a network crawler.
       Args:
         network: the network object to be crawled
         store_type: the type of storage. Valid values are "gml", "basic_shelve"
                     or "couchdb"
       Returns:
         An initialized crawler object
     """
     self.network = network
     store_class = GenericStore.get_store_class(store_type)
     # Buffer to store network data
     self.gbuffer = GraphBuffer(self.network.label, store_class)
     #self.recover = recover
     # Priority queue in memory that keeps track of nodes to visit
     self.pqueue = PriorityQueue(store_class,
                                 store_name=self.network.label + "_state")
 def __init__(self, network, store_type="gml"):
     """ Function to initialize a network crawler.
       Args:
         network: the network object to be crawled
         store_type: the type of storage. Valid values are "gml", "basic_shelve"
                     or "couchdb"
       Returns:
         An initialized crawler object
     """
     self.network = network
     store_class = GenericStore.get_store_class(store_type)
     # Buffer to store network data
     self.gbuffer = GraphBuffer(self.network.label, store_class)
     #self.recover = recover
     # Priority queue in memory that keeps track of nodes to visit
     self.pqueue = PriorityQueue(store_class, store_name=self.network.label +
                                                         "_state")
import itertools
import logging

# GenericStore, GraphBuffer, PriorityQueue and NetworkCrawlerInterface are
# defined elsewhere in this project (the storage backends, the graph write
# buffer, the persistent priority queue and the crawler base class).


class BFSNetworkCrawler(NetworkCrawlerInterface):
    """
    BFS crawler. Uses a weighted BFS implementation that gives more weight to
    nodes that are seen multiple times.
    """

    def __init__(self, network, store_type="gml"):
        """ Function to initialize a network crawler.
          Args:
            network: the network object to be crawled
            store_type: the type of storage. Valid values are "gml", "basic_shelve"
                        or "couchdb"
          Returns:
            An initialized crawler object
        """
        self.network = network
        store_class = GenericStore.get_store_class(store_type)
        # Buffer to store network data
        self.gbuffer = GraphBuffer(self.network.label, store_class)
        #self.recover = recover
        # Priority queue in memory that keeps track of nodes to visit
        self.pqueue = PriorityQueue(store_class, store_name=self.network.label +
                                                            "_state")

    """
    def set_seed_nodes(self, seed_values):
      for seed in seed_values:
        self.add_to_crawl(seed)
    """

    def add_to_crawl(self, node, priority=0):
        self.pqueue.push(node, priority)
        #self.visited[node] = False


    def __del__(self):
        self.close()

    def close(self):
        self.gbuffer.close()
        self.pqueue.close()

    def crawl(self, seed_nodes, max_nodes=10, recover=True, checkpoint_frequency=100):
        """ Function to crawl the network using BFS.
            Works in two modes: fresh or recover. If recover is False, then
    seed_nodes needs to be not None.
        """
        # if the crawl was stopped for some reason, recover parameter helps to
        # restart it from where it stopped.
        #if self.recover and not self.pqueue.is_empty():
        # Just trying to minimize conflict. May lead to duplicate storage.
        #  node_counter = itertools.count(self.gbuffer.nodes_store.get_maximum_id()+1)
        #  edge_counter = itertools.count(self.gbuffer.edges_store.get_maximum_id()+1)
        #  print("Node counter", node_counter)
        #else:
        #  self.set_seed_nodes(self.seed_nodes)

        # WARNING: initialize() assumes that checkpoint_freq is the same in the
        # current run as in the previous run that is being recovered.
        prior_num_nodes_visited, prior_num_edges_visited = self.pqueue.initialize(
            seed_nodes, do_recovery=recover, checkpoint_freq=checkpoint_frequency)
        node_counter = itertools.count(prior_num_nodes_visited)
        edge_counter = itertools.count(prior_num_edges_visited)
        iterations = 0
        try:
            while (not self.pqueue.is_empty()) and iterations < max_nodes:
                iterations += 1
                new_node = self.pqueue.pop()
                logging.info("Popped %s from queue and now starting to process it..." % new_node)
                print "Popped %s from queue and now starting to process it..." % new_node
                # Ignore if new_node has already been visited
                if new_node in self.pqueue.visited:
                    continue


                # Get details about the current node
                new_node_info = self.network.get_node_info(new_node)
                if new_node_info is None:
                    error_msg = "Crawler: Error in fetching node info. Skipping node %s" % new_node
                    logging.error(error_msg)
                    print error_msg
                    continue
                    #raise NameError
                # Assume dict output for all stores.
                new_node_edges_info = self.network.get_edges_info(new_node)
                if new_node_edges_info is None:
                    error_msg = "Crawler: Error in fetching node edges info. Skipping node %s" % new_node
                    logging.error(error_msg)
                    print error_msg
                    continue
                    #raise NameError

                logging.info("Crawler: Got information about %s" % new_node)

                print "Got all data"
                for edge_info in new_node_edges_info:
                    node = edge_info['target']

                    if node not in self.pqueue.visited:
                        # Update priority if node already exists in the queue
                        if node in self.pqueue.queue_dict:
                            node_priority = self.pqueue.mark_removed(node)
                            updated_priority = node_priority - 1
                            self.add_to_crawl(node, updated_priority)
                        # Otherwise just add the node to the queue
                        else:
                            self.add_to_crawl(node)

                # Now storing the data about the node and its edges
                print "Starting to store node info"
                new_node_info['id'] = next(node_counter)

                self.gbuffer.store_node(new_node, new_node_info)
                self.pqueue.mark_visited(new_node)
                for edge_info in new_node_edges_info:
                    edge_info['id'] = next(edge_counter)
                    self.gbuffer.store_edge(str(edge_info['id']), edge_info)
                logging.info("Processed %s \n" % new_node)
                print "Processed ", new_node
                if iterations % checkpoint_frequency == 0:
                    #curr_num_nodes_stored = self.gbuffer.nodes_store.get_maximum_id() + 1
                    curr_num_nodes_stored = self.gbuffer.nodes_store.get_count_records()
                    print("Making a checkpoint for pqueue state", iterations, curr_num_nodes_stored)
                    logging.info("Making a checkpoint for pqueue state: %d", curr_num_nodes_stored)
                    self.pqueue.save_checkpoint(curr_num_nodes_stored, self.gbuffer.edges_store.get_count_records(),
                                                checkpoint_frequency)
                    #print gc.collect()
        except (KeyboardInterrupt, SystemExit):
            self.pqueue.close()
            self.gbuffer.close()
        return
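
# ---------------------------------------------------------------------------
# Illustration (not part of the crawler): the class docstring above describes a
# weighted BFS in which a node's priority is decremented each time the node is
# rediscovered through another edge (the mark_removed / add_to_crawl block in
# crawl()). Assuming the queue pops the lowest priority first, as in the usual
# heapq recipe, frequently referenced nodes are therefore visited earlier. The
# helper below is a self-contained sketch of that idea using only heapq; the
# function name and the toy graph are illustrative and do not use the project's
# PriorityQueue API.
# ---------------------------------------------------------------------------
import heapq


def _weighted_bfs_order_sketch(adjacency, seeds):
    """Return the order in which a weighted BFS would visit the nodes."""
    counter = itertools.count()   # tie-breaker so equal priorities stay FIFO
    priorities = {}               # node -> lowest priority assigned so far
    heap = []                     # entries are (priority, tie, node)
    visited = set()
    order = []

    def push(node, priority):
        priorities[node] = priority
        heapq.heappush(heap, (priority, next(counter), node))

    for seed in seeds:
        push(seed, 0)

    while heap:
        priority, _, node = heapq.heappop(heap)
        # Skip already-processed nodes and stale entries superseded by a re-push.
        if node in visited or priority != priorities.get(node):
            continue
        visited.add(node)
        order.append(node)
        for neighbour in adjacency.get(node, []):
            if neighbour in visited:
                continue
            if neighbour in priorities:
                # Seen again: decrement the priority so it is popped sooner.
                push(neighbour, priorities[neighbour] - 1)
            else:
                push(neighbour, 0)
    return order

# Example (illustrative):
#   graph = {"a": ["b", "c", "d"], "b": ["e"], "c": ["e"], "d": ["f"],
#            "e": [], "f": []}
#   _weighted_bfs_order_sketch(graph, ["a"])  ->  ['a', 'b', 'c', 'e', 'd', 'f']
#   ("e" is referenced by both "b" and "c", so it overtakes "d".)
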
class FixedNodesCrawler(NetworkCrawlerInterface):
    """Crawler that fetches a fixed, explicit list of nodes (no traversal)."""

    def __init__(self, network, store_type, output_dir=""):
        self.network = network
        self.store_type = store_type
        store_class = GenericStore.get_store_class(store_type)
        # Buffer to store network data
        self.gbuffer = GraphBuffer(self.network.label, store_class,
                                   output_dir=output_dir)

    def crawl(self, nodes_list, start_from_index=1):
        """ Crawl the given list of nodes, skipping entries before
        start_from_index (1-based).
        """
        try:
            counter = 0
            for new_node in nodes_list:
                counter += 1
                if counter < start_from_index:
                    continue

                new_node_info = self.network.get_node_info(new_node)
                if new_node_info is None:
                    error_msg = "Crawler: Error in fetching node info. Skipping node %s" % new_node
                    logging.error(error_msg)
                    print error_msg
                    continue
                # Assume dict output for all stores.
                new_node_edges_info = self.network.get_edges_info(new_node)
                if new_node_edges_info is None:
                    error_msg = "Crawler: Error in fetching node edges info. Skipping node %s" % new_node
                    logging.error(error_msg)
                    print error_msg
                    continue
                    #raise NameError

                logging.info("Crawler: Got information about %s" % new_node)

                print "Got all data"

                # Now storing the data about the node and its edges
                print "Starting to store node info"
                FixedNodesCrawler.store_data(self.gbuffer, new_node, new_node_info, new_node_edges_info)
        except (KeyboardInterrupt, SystemExit):
            pass
        finally:
            # Close the buffer whether the crawl finished or was interrupted.
            self.close()

    def __del__(self):
        self.close()

    def close(self):
        self.gbuffer.close()

    @classmethod
    def store_data(cls, gbuffer, node_id, node_data, edges_data):
        """Store one node and its edges through the given GraphBuffer."""
        # This crawler keeps no running counters, so -1 is used as a placeholder id.
        node_data['id'] = -1
        gbuffer.store_node(node_id, node_data)

        edge_keys = []
        for edge_info in edges_data:
            edge_info['id'] = -1
            # Key each edge as "<source>_<target>", dropping non-ASCII characters.
            edge_source_str = edge_info['source'].encode('ascii', 'ignore')
            edge_target_str = edge_info['target'].encode('ascii', 'ignore')
            edge_keys.append(edge_source_str + '_' + edge_target_str)

        gbuffer.store_edges(edge_keys, edges_data)
        logging.info("Processed and stored successfully %s \n" % node_id)
        print "Processed and stored", node_id