def test_crawl(self):
    """Exercise the BFS crawler, simulating a crash and a recovery.

    First crawls only ``crash_after`` nodes, verifies the partial visit
    order, then restarts the crawler with ``recover=True`` and verifies
    that the full expected visit order is eventually produced.
    """
    crash_after = 2
    ckpt_every = 2
    seeds = ["11", "5"]

    # Phase 1: crawl a limited number of nodes to simulate an interrupted run.
    web = TestWebCaller("test", self.network_edges)
    self.crawler = BFSNetworkCrawler(web, store_type=self.STORE_TYPE)
    self.crawler.crawl(seed_nodes=seeds, max_nodes=crash_after,
                       recover=True, checkpoint_frequency=ckpt_every)
    visited = web.get_visit_order()
    self.compare_values(visited,
                        test_visit_order=self.correct_visit_order[:crash_after])
    self.crawler.close()

    # Phase 2: restart with recovery enabled and finish the crawl.
    self.crawler = BFSNetworkCrawler(web, store_type=self.STORE_TYPE)
    self.crawler.crawl(seed_nodes=seeds, max_nodes=10,
                       recover=True, checkpoint_frequency=ckpt_every)
    visited = web.get_visit_order()
    print("Node visit order is", visited)
    self.compare_values(visited, test_visit_order=self.correct_visit_order)
def test_crawl(self):
    """Run the BFS crawler twice: a truncated run, then a recovered run.

    The first crawl stops after a small number of nodes (simulating a
    crash); the second starts fresh with ``recover=True`` and must
    reproduce the complete expected BFS visit order.
    """
    partial_limit = 2
    checkpoint_every = 2

    network = TestWebCaller("test", self.network_edges)

    # Truncated crawl — stand-in for a process that died mid-run.
    self.crawler = BFSNetworkCrawler(network, store_type=self.STORE_TYPE)
    self.crawler.crawl(
        seed_nodes=["11", "5"],
        max_nodes=partial_limit,
        recover=True,
        checkpoint_frequency=checkpoint_every,
    )
    order_so_far = network.get_visit_order()
    self.compare_values(
        order_so_far,
        test_visit_order=self.correct_visit_order[:partial_limit],
    )
    self.crawler.close()

    # Recovered crawl — should pick up from checkpoints and finish.
    self.crawler = BFSNetworkCrawler(network, store_type=self.STORE_TYPE)
    self.crawler.crawl(
        seed_nodes=["11", "5"],
        max_nodes=10,
        recover=True,
        checkpoint_frequency=checkpoint_every,
    )
    final_order = network.get_visit_order()
    print("Node visit order is", final_order)
    self.compare_values(final_order, test_visit_order=self.correct_visit_order)
class TestBFSNetworkCrawler(unittest.TestCase):
    """Integration test for BFSNetworkCrawler with crash/recover semantics.

    Uses a small in-memory network (TestWebCaller) and checks both the
    BFS visit order and the node/edge data persisted by the crawler's
    graph buffer.
    """

    def setUp(self):
        # Adjacency list of the toy network used for every test.
        self.network_edges = {"11": ["1", "5", "8"], "1": ["11", "8"],
                              "8": ["1", "11", "5"], "5": ["11", "8"]}
        self.node_info_data = None
        self.node_connections_data = None
        # Expected BFS order when seeded with ["11", "5"].
        self.correct_visit_order = ["11", "5", "8", "1"]
        self.crawler = None
        self.STORE_TYPE = "sqlite"

    def tearDown(self):
        """Delete all files generated by the test. Disable for debugging if required."""
        print("Deleting all data files generated as a part of test tearDown().")
        self.crawler.gbuffer.destroy_stores()
        self.crawler.pqueue.destroy_state()

    def test_crawl(self):
        """Crawl a few nodes, simulate a crash, then recover and finish."""
        num_nodes_before_crash = 2
        checkpoint_freq = 2
        webnetwork = TestWebCaller("test", self.network_edges)

        # Phase 1: crawl only a prefix of the network (simulated crash).
        self.crawler = BFSNetworkCrawler(webnetwork, store_type=self.STORE_TYPE)
        self.crawler.crawl(seed_nodes=["11", "5"], max_nodes=num_nodes_before_crash,
                           recover=True, checkpoint_frequency=checkpoint_freq)
        node_visit_order = webnetwork.get_visit_order()
        self.compare_values(node_visit_order,
                            test_visit_order=self.correct_visit_order[:num_nodes_before_crash])
        self.crawler.close()

        # Phase 2: recover from checkpoints and complete the crawl.
        self.crawler = BFSNetworkCrawler(webnetwork, store_type=self.STORE_TYPE)
        self.crawler.crawl(seed_nodes=["11", "5"], max_nodes=10,
                           recover=True, checkpoint_frequency=checkpoint_freq)
        node_visit_order = webnetwork.get_visit_order()
        print("Node visit order is", node_visit_order)
        self.compare_values(node_visit_order, test_visit_order=self.correct_visit_order)

    def compare_values(self, real_visit_order, test_visit_order):
        """Compare the crawler's stored state against independently built expectations.

        real_visit_order: node ids in the order the web caller saw them
            (may contain repeats; de-duplicated preserving first occurrence).
        test_visit_order: the expected BFS visit order.
        """
        # Accessing the results of calling the source code.
        self.node_info_data = self.crawler.gbuffer.nodes_store
        print(self.node_info_data)
        self.node_connections_data = self.crawler.gbuffer.edges_store
        # De-duplicate while preserving first-visit order.
        self.node_visit_order = list(OrderedDict.fromkeys(real_visit_order))

        # Build the expected node/edge stores from a fresh web caller.
        test_node_info = {}
        test_node_edges_data = {}
        node_counter = 0
        edge_counter = 0
        test_webnetwork = TestWebCaller("test", self.network_edges)
        for i in test_visit_order:
            curr_node_info = test_webnetwork.get_node_info(i)
            curr_node_info['id'] = node_counter
            test_node_info.update({str(i): curr_node_info})
            curr_edges_info = test_webnetwork.get_edges_info(i)
            for edge_info in curr_edges_info:
                edge_info['id'] = edge_counter
                test_node_edges_data[str(edge_counter)] = edge_info
                edge_counter += 1
            node_counter += 1

        # FIX: was Python 2 `print self.node_visit_order` (SyntaxError in Python 3).
        print(self.node_visit_order)
        pprint(self.node_info_data)
        pprint(test_node_info)
        pprint(self.node_connections_data)
        pprint(test_node_edges_data)

        self.assertListEqual(test_visit_order, self.node_visit_order,
                             "Problem in order of BFS crawl")
        self.assertDictEqual(test_node_info, dict(self.node_info_data),
                             "Nodes do not match: Problem in BFS crawl.")
        self.assertDictEqual(test_node_edges_data, dict(self.node_connections_data),
                             "Edges do not match: Problem in BFS crawl.")
class TestBFSNetworkCrawler(unittest.TestCase):
    """Integration test for BFSNetworkCrawler covering crash and recovery.

    A toy network served by TestWebCaller is crawled twice: once with a
    node cap (simulated crash), then again with recovery enabled. The
    resulting visit order and persisted node/edge stores are validated.
    """

    def setUp(self):
        # Adjacency list of the toy network used for every test.
        self.network_edges = {"11": ["1", "5", "8"], "1": ["11", "8"],
                              "8": ["1", "11", "5"], "5": ["11", "8"]}
        self.node_info_data = None
        self.node_connections_data = None
        # Expected BFS order when seeded with ["11", "5"].
        self.correct_visit_order = ["11", "5", "8", "1"]
        self.crawler = None
        self.STORE_TYPE = "sqlite"

    def tearDown(self):
        """Delete all files generated by the test. Disable for debugging if required."""
        print("Deleting all data files generated as a part of test tearDown().")
        self.crawler.gbuffer.destroy_stores()
        self.crawler.pqueue.destroy_state()

    def test_crawl(self):
        """Crawl a prefix of the network, then recover and finish the crawl."""
        num_nodes_before_crash = 2
        checkpoint_freq = 2
        webnetwork = TestWebCaller("test", self.network_edges)

        # Phase 1: limited crawl standing in for an interrupted run.
        self.crawler = BFSNetworkCrawler(webnetwork, store_type=self.STORE_TYPE)
        self.crawler.crawl(
            seed_nodes=["11", "5"],
            max_nodes=num_nodes_before_crash,
            recover=True,
            checkpoint_frequency=checkpoint_freq,
        )
        node_visit_order = webnetwork.get_visit_order()
        self.compare_values(
            node_visit_order,
            test_visit_order=self.correct_visit_order[:num_nodes_before_crash],
        )
        self.crawler.close()

        # Phase 2: recover from checkpoints and complete the crawl.
        self.crawler = BFSNetworkCrawler(webnetwork, store_type=self.STORE_TYPE)
        self.crawler.crawl(seed_nodes=["11", "5"], max_nodes=10,
                           recover=True, checkpoint_frequency=checkpoint_freq)
        node_visit_order = webnetwork.get_visit_order()
        print("Node visit order is", node_visit_order)
        self.compare_values(node_visit_order, test_visit_order=self.correct_visit_order)

    def compare_values(self, real_visit_order, test_visit_order):
        """Compare crawler-persisted state with independently built expectations.

        real_visit_order: node ids in the order the web caller saw them
            (may contain repeats; de-duplicated preserving first occurrence).
        test_visit_order: the expected BFS visit order.
        """
        # Accessing the results of calling the source code.
        self.node_info_data = self.crawler.gbuffer.nodes_store
        print(self.node_info_data)
        self.node_connections_data = self.crawler.gbuffer.edges_store
        # De-duplicate while preserving first-visit order.
        self.node_visit_order = list(OrderedDict.fromkeys(real_visit_order))

        # Build the expected node/edge stores from a fresh web caller.
        test_node_info = {}
        test_node_edges_data = {}
        node_counter = 0
        edge_counter = 0
        test_webnetwork = TestWebCaller("test", self.network_edges)
        for i in test_visit_order:
            curr_node_info = test_webnetwork.get_node_info(i)
            curr_node_info["id"] = node_counter
            test_node_info.update({str(i): curr_node_info})
            curr_edges_info = test_webnetwork.get_edges_info(i)
            for edge_info in curr_edges_info:
                edge_info["id"] = edge_counter
                test_node_edges_data[str(edge_counter)] = edge_info
                edge_counter += 1
            node_counter += 1

        # FIX: was Python 2 `print self.node_visit_order` (SyntaxError in Python 3).
        print(self.node_visit_order)
        pprint(self.node_info_data)
        pprint(test_node_info)
        pprint(self.node_connections_data)
        pprint(test_node_edges_data)

        self.assertListEqual(test_visit_order, self.node_visit_order,
                             "Problem in order of BFS crawl")
        self.assertDictEqual(test_node_info, dict(self.node_info_data),
                             "Nodes do not match: Problem in BFS crawl.")
        self.assertDictEqual(
            test_node_edges_data, dict(self.node_connections_data),
            "Edges do not match: Problem in BFS crawl."
        )
# Script entry: configure logging, build the API client, then dispatch on the
# requested crawl mode from the command line.
logging.basicConfig(filename="socintpy.log", level=numeric_loglevel)

# Fetch api object using the settings from settings.py
api_args = get_args(settings)
api = get_api(api_args)

"""
# If you want to pass a dictionary (say params_dict), use **params_dict
result = api.get_data(user='******', method="user.getRecentTracks", max_results = 100)
print result
"""

# NOTE: api call error handling changed — one error for a user breaks and
# skips that user's remaining api calls. TEST this on the next crawler run.
# Also, checkpoint_frequency should stay at 1 until further notification.
cmd_params = cmd_script.get_cmd_parameters(sys.argv)
if cmd_params['mode'] == "fetch_data":
    # Set up the data crawl
    logging.info("STARTING FETCH_DATA CRAWL. STANDBY!")
    crawler = BFSNetworkCrawler(api, store_type="sqlite")
    # Start the data crawl
    crawler.crawl(seed_nodes=api.get_uniform_random_nodes(100), max_nodes=1000000,
                  recover=True, checkpoint_frequency=1)
elif cmd_params['mode'] == "retry_errors":
    nodes_with_error = cmd_script.get_nodes_with_error(logfile="socintpy_old.log")
    # FIX: was Python 2 `print nodes_with_error, len(nodes_with_error)`
    # (SyntaxError in Python 3; the rest of the file uses print()).
    print(nodes_with_error, len(nodes_with_error))
    logging.info("STARTING RETRY_ERRORS CRAWL. STANDBY!")
    crawler = FixedNodesCrawler(api, store_type="sqlite")
    nodes_stored, edges_stored = cmd_script.recover_num_items_stored(logfile="socinty_old.log")
    crawler.crawl(nodes_list=nodes_with_error, start_node_id=nodes_stored,
                  start_edge_id=edges_stored)