import sys
import time

sys.path.append("../web_crawler")
from web_crawler import WebCrawler
sys.path.append("..")
from privileges import construct_full_privilege, privileges_bigger_or_equal

master_crawler = WebCrawler.create_master(
    privileges=construct_full_privilege(),
    start_url="http://antyweb.pl/"
)

WebCrawler.create_worker(
    privileges=construct_full_privilege(),
    master=master_crawler,
    max_internal_expansion=5,
    max_external_expansion=3,
    max_crawling_depth=100,
)

master_crawler.run()
time.sleep(60 * 60 * 24 * 3)
master_crawler.terminate()
""" import sys import time sys.path.append("../web_crawler") from web_crawler import WebCrawler sys.path.append("..") from privileges import construct_full_privilege, privileges_bigger_or_equal master_crawler = WebCrawler.create_master ( privileges = construct_full_privilege(), start_url = "http://rss.wp.pl/" ) WebCrawler.create_worker ( master = master_crawler, privileges = construct_full_privilege(), max_internal_expansion = 10, max_database_updates = 10 ) master_crawler.run() time.sleep(120) master_crawler.terminate()
import sys
import time

sys.path.append("../web_crawler")
from web_crawler import WebCrawler
sys.path.append("..")
from privileges import construct_full_privilege, privileges_bigger_or_equal

# Placeholder value: the original definition of EXPORT_FILE is not shown in this excerpt.
EXPORT_FILE = "export_file.txt"

print('Output will be APPENDED to file named ' + EXPORT_FILE + '\n')

if len(sys.argv) == 1:
    sys.exit()

master_crawler = WebCrawler.create_master(
    privileges=construct_full_privilege(),
    start_url=str(sys.argv[1]),
)

WebCrawler.create_worker(
    privileges=construct_full_privilege(),
    master=master_crawler,
    max_external_expansion=1000,
    max_internal_expansion=4,
    max_crawling_depth=3,
    list_export=True,
    export_dicts=True,
    export_file=EXPORT_FILE,
)

master_crawler.run()

# Poll until the crawl finishes, then shut the master down.
while master_crawler.is_working():
    time.sleep(1)

master_crawler.terminate()
time.sleep(2)
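# A minimal sketch, not part of the original tests: the polling loop above can be wrapped
# in a helper so a crawl either finishes on its own or is cut off after a deadline. It
# relies only on the is_working() and terminate() calls used in these scripts; the
# wait_for_crawler name and its parameters are hypothetical, not part of the WebCrawler API.
import time


def wait_for_crawler(master, timeout_seconds, poll_interval=1):
    deadline = time.time() + timeout_seconds
    finished = False
    while time.time() < deadline:
        if not master.is_working():
            finished = True
            break
        time.sleep(poll_interval)
    # Shut the master down whether or not the crawl completed in time.
    master.terminate()
    return finished

# For example, wait_for_crawler(master_crawler, 60 * 60 * 24 * 3) could replace the fixed
# three-day time.sleep() used in the other tests, stopping early once the crawl completes.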
"""
This test thoroughly explores www.wykop.pl in search of RSS feeds.
"""
import sys
import time

sys.path.append("../web_crawler")
from web_crawler import WebCrawler
sys.path.append("..")
from privileges import construct_full_privilege, privileges_bigger_or_equal

master_crawler = WebCrawler.create_master(
    privileges=construct_full_privilege(),
    start_url="http://www.wykop.pl/"
)

WebCrawler.create_worker(
    privileges=construct_full_privilege(),
    master=master_crawler,
    max_crawling_depth=3
)

master_crawler.run()
time.sleep(60 * 60 * 24 * 3)
master_crawler.terminate()