def testScrapePageBatch(self):
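     # Scrape each archived link listed in profile_test_links.txt and append its ArchiveDetail row to the CSV log.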
     save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
     file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_links.txt"
     CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
     domains_links = FileHandler.read_lines_from_file(file_path)
     for link in domains_links:
         # link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
         # link = "http://web.archive.org/web/20130415001342/http://www.bbc.co.uk/"
         stop_event = multiprocessing.Event()
         inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(
             link)
         root_domain = LinkChecker.get_root_domain(domain)[1]
         path = "/index.html"
         link_s = LinkAttrs(link=link,
                            path=path,
                            ref_link="/",
                            shadow_ref_link="/",
                            source=path,
                            res_type=LinkUtility.EXT_WEBPAGE,
                            level=0)
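         # crawl and download this archived site (max_level=2) with up to max_thread=10 workers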
         explorer = ArchiveExplorer(
             original_domain=root_domain,
             link=link,
             external_stop_event=stop_event,
             download_base_dir=FilePath.get_default_archive_dir(),
             max_thread=10,
             max_level=2)
         explorer.run()
         archive_detail = explorer.get_archive_detail()
         CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
 def testExportCsv(self):
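     # Export rows from the "2015 Old" table of Sum.db to CSV, keeping only sites whose second column is positive.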
     from_addr = "/Users/superCat/Desktop/PycharmProjectPortable/sync/Sum.db"
     to_addr = "/Users/superCat/Desktop/PycharmProjectPortable/sync/2015_OLD.csv"
     table_name = "2015 Old"
     from_db = FilteredResultDB(table_name, db_addr=from_addr)
     data = [x for x in from_db.get_all_sites() if x[1] > 0]
     CsvLogger.log_to_file_path(to_addr, [FilteredResultDB.get_fields_names(),])
     CsvLogger.log_to_file_path(to_addr, data)
Example #3
 def testDA_bulk(self):
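     # Probe every Moz account in the backup CSV by requesting domain authority for one domain each;
     # working and failing accounts are written to separate CSV logs.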
     log_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/accounts/good_accounts.csv"
     bad_log_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/accounts/bad_accounts.csv"
     good_rows = []
     bad_rows = []
     data_counter = 0
     domains = []
     data_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/03-09-2015-Bad-Results.csv"
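     # collect the target domains from the first column of the results CSV, skipping the header row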
     with open(data_path, mode='r', newline='') as csv_file:
         rd = csv.reader(csv_file, delimiter=',')
         for row in rd:
             if data_counter > 0:
                 domains.append(row[0])
             data_counter += 1
     problem_account = []
     file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/accounts/good_accounts_backup.csv"
     count = 0
     work_count = 0
     non_work_count = 0
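     # pair each backed-up account with one domain and try a single API call per account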
     with open(file_path, mode='r', newline='') as csv_file:
         reader = csv.reader(csv_file, delimiter=',')
         for email, psd, user_name, access_id, api_key in reader:
             if count not in problem_account:
                 try:
                     print(
                         "email:",
                         email,
                         "psd:",
                         psd,
                         " user_name:",
                         user_name,
                         " access_id:",
                         access_id,
                     )
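                     # build a Moz client from this account's credentials and request DA for one domain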
                     account = MozCom(
                         SiteAccount(siteType=AccountType.Moz,
                                     userID=email,
                                     password=psd,
                                     AccessID=access_id,
                                     APIkey=api_key))
                     da = account.get_ranking_data(domains[count])
                     print("count: ", count, " access id:", access_id,
                           " site:", domains[count], " da:", da)
                     time.sleep(0.2)
                     work_count += 1
                     good_rows.append((count + 1, email, psd, user_name,
                                       access_id, api_key))
                 except Exception as ex:
                     bad_rows.append((count + 1, email, psd, user_name,
                                      access_id, api_key))
                     print(ex)
                     non_work_count += 1
             count += 1
     CsvLogger.log_to_file_path(log_path, good_rows)
     CsvLogger.log_to_file_path(bad_log_path, bad_rows)
     print("total:", count, " worked:", work_count, " not-worked:",
           non_work_count)
Example #4
 def add_proxies(self, proxies: list):
     if proxies is not None:
         converted = []
         for proxy in proxies:
             if isinstance(proxy, ProxyStruct):
                 # keep only well-formed proxy entries, flattened into CSV-friendly tuples
                 converted.append((proxy.addr, proxy.port, proxy.alt_port,
                                   proxy.user_name, proxy.psd))
         FileHandler.create_file_if_not_exist(self._file_path)
         CsvLogger.log_to_file_path(self._file_path, converted)
 def testGetBestProfileBatch(self):
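     # For each source domain, ask archive.org for its best snapshot profile and log it; failures are printed and skipped.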
     file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_source.txt"
     domains = FileHandler.read_lines_from_file(file_path)
     save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive.csv"
     CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
     for domain in domains:
         print("begin domain:", domain)
         try:
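             # the keyword arguments below bound how much of the archive is profiled before a snapshot is accepted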
             archive = ArchiveOrg.get_best_archive(root_domain=domain,
                                                   thread_size=100,
                                                   profile_check=10,
                                                   pass_threshold=0.9,
                                                   res_limit=2000)
             CsvLogger.log_to_file_path(save_path, [archive.to_tuple()])
         except Exception as ex:
             print(ex)
 def testScrapePage(self):
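     # Scrape a single archived page end to end and log its ArchiveDetail to the CSV.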
     # link = "http://web.archive.org/web/20111102054835/http://www.agfdh.org:80/"
     link = "http://web.archive.org/web/20150425143742/http://susodigital.com/"
     # link = "http://web.archive.org/web/20130415001342/http://www.bbc.co.uk/"
     stop_event = multiprocessing.Event()
     inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(link)
     root_domain = LinkChecker.get_root_domain(domain)[1]
     path = "/index.html"
     link_s = LinkAttrs(link=link, path=path, ref_link="/", shadow_ref_link="/", source=path, res_type=LinkUtility.EXT_WEBPAGE, level=0)
     explorer = ArchiveExplorer(original_domain=root_domain, link=link,
                                external_stop_event=stop_event,
                                download_base_dir=FilePath.get_default_archive_dir(), max_thread=10, max_level=2)
     explorer.run()
     save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
     CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
     archive_detail = explorer.get_archive_detail()
     CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
 def write_to_error_log(self, data: tuple):
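     # append one error record as a single CSV row in the change log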
     CsvLogger.log_to_file_path(self.change_log_file_path, [data])
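Every snippet above funnels its output through CsvLogger.log_to_file_path(path, rows), where rows is a list of tuples. The project's implementation is not shown here; the sketch below is only a guess at the behaviour the tests appear to rely on (creating the file's directory if needed and appending each tuple as one CSV row). CsvLoggerSketch and its body are hypothetical stand-ins, not the actual CsvLogger API.

import csv
import os


class CsvLoggerSketch:
    # Hypothetical stand-in for the project's CsvLogger; not its real implementation.

    @staticmethod
    def log_to_file_path(file_path: str, rows):
        # assumption: the real logger appends, so a title row written first stays on top
        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(file_path, mode='a', newline='') as f:
            writer = csv.writer(f, delimiter=',')
            for row in rows:
                writer.writerow(row)


# usage: first call writes a header row, later calls append data rows
CsvLoggerSketch.log_to_file_path("/tmp/demo.csv", [("domain", "da")])
CsvLoggerSketch.log_to_file_path("/tmp/demo.csv", [("example.com", 42)])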