def testScrapePageBatch(self):
    """Run an ArchiveExplorer over every link in the links file and log each result.

    The links are read with ``FileHandler.read_lines_from_file``. The title
    row from ``ArchiveDetail.get_title()`` is logged once up front, then one
    row per link is logged as soon as that link's exploration finishes.
    An exception raised for any link propagates and stops the batch.
    """
    save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
    file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_links.txt"
    CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
    domains_links = FileHandler.read_lines_from_file(file_path)
    for link in domains_links:
        # To debug a single snapshot, override `link` here, e.g. with
        # "http://web.archive.org/web/20140711025724/http://susodigital.com/".
        # A fresh event per link, so a stop signal for one run cannot leak
        # into the next one.
        stop_event = multiprocessing.Event()
        # Only the domain (second element) of the parsed link is used.
        _inner_link, domain, *_unused = LinkUtility.get_link_detail(link)
        root_domain = LinkChecker.get_root_domain(domain)[1]
        explorer = ArchiveExplorer(
            original_domain=root_domain,
            link=link,
            external_stop_event=stop_event,
            download_base_dir=FilePath.get_default_archive_dir(),
            max_thread=10,
            max_level=2,
        )
        explorer.run()
        archive_detail = explorer.get_archive_detail()
        CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
def testGetBestProfileBatch(self):
    """Look up the best archive for every domain in the source file and log it.

    Domains are read with ``FileHandler.read_lines_from_file``. The title row
    from ``ArchiveDetail.get_title()`` is logged first, followed by one row
    per domain whose lookup succeeds. A failure for a single domain is
    printed and skipped so the remaining domains are still processed.
    """
    file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_source.txt"
    domains = FileHandler.read_lines_from_file(file_path)
    save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive.csv"
    CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
    for domain in domains:
        print("begin domain:", domain)
        try:
            archive = ArchiveOrg.get_best_archive(
                root_domain=domain,
                thread_size=100,
                profile_check=10,
                pass_threshold=0.9,
                res_limit=2000,
            )
            CsvLogger.log_to_file_path(save_path, [archive.to_tuple()])
        except Exception as ex:
            # Deliberate best-effort: report the failure and move on to the
            # next domain instead of aborting the whole batch.
            print(ex)
def testScrapePage(self):
    """Run an ArchiveExplorer over one hard-coded archive link and log the result.

    After the exploration finishes, the title row from
    ``ArchiveDetail.get_title()`` and the explorer's archive detail row are
    logged to ``profile_archive_downloaded.csv``.
    """
    # Other snapshots that have been used with this test:
    #   http://web.archive.org/web/20111102054835/http://www.agfdh.org:80/
    #   http://web.archive.org/web/20130415001342/http://www.bbc.co.uk/
    link = "http://web.archive.org/web/20150425143742/http://susodigital.com/"
    stop_event = multiprocessing.Event()
    # Only the domain (second element) of the parsed link is used.
    _inner_link, domain, *_unused = LinkUtility.get_link_detail(link)
    root_domain = LinkChecker.get_root_domain(domain)[1]
    explorer = ArchiveExplorer(
        original_domain=root_domain,
        link=link,
        external_stop_event=stop_event,
        download_base_dir=FilePath.get_default_archive_dir(),
        max_thread=10,
        max_level=2,
    )
    explorer.run()
    save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
    CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
    archive_detail = explorer.get_archive_detail()
    CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])