Example #1
 def process_data_batch(self, data: collections.abc.Iterable, **kwargs):
     #print("MozFilter processing: ", data)
     account = kwargs.get("Account")
     temp = []
     try:
         if isinstance(data, collections.abc.Iterable) and isinstance(account, SiteAccount):
             temp = [x for x in data if isinstance(x, FilteredDomainData) and TldUtility.is_top_tld(x.domain)]
             check_list = [y.domain for y in temp]
             sleep_time = random.randint(self._min_sleep_time, self._max_wait)
             time.sleep(sleep_time)
             moz = MozCom(account)
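             # in throughput-debug mode, skip the real Moz API call and use placeholder scores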
             if not self._is_throughput_debug:
                 rankings = moz.get_ranking_data_batch(check_list, limit=len(check_list))
             else:
                 rankings = [100] * len(temp)
             for item, ranking in zip(temp, rankings):
                 item.da = ranking
         else:
             raise ValueError("account is none in process_data_batch()")
     except Exception as ex:
         ErrorLogger.log_error("MozFilter", ex, "process_data_batch() " + str(data) + " account: " + account.userID)
     finally:
         PrintLogger.print("Moz processed: " + str(data) + " with: " + account.userID)
         with self._sync_lock:
             job_done = [x for x in data if x is not None]
             self._job_done += len(job_done)
             if account is not None:
                 account.Available = True
             for item in temp:
                 if isinstance(item, FilteredDomainData):
                     # print("moz processed:", item.domain)
                     if item.da >= self._min_DA_value:
                         if not self._is_throughput_debug:
                             CsvLogger.log_to_file(self._log_file, [(item.domain, item.da)]) # log this to file
                         self._output_queue.put(item)
Example #2
 def process_data(self, data: FilteredDomainData, **kwargs):
     account = kwargs.get("Account")
     # is_domain_good = False
     is_spammed = False
     try:
         if isinstance(data, FilteredDomainData) and isinstance(account, SiteAccount):
             majestic = MajesticCom(account)
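             # the checks below raise on failure; the exception handlers record the reason on data.exception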
             if self._en_spam_check:
                 self._filter_domain_name(domain=data.domain)
                 # self._filter_anchor_text(majestic, data.domain)
                 # self._filter_ref_domains(majestic, data.domain)
             if self._en_tf_check:
                 data = self._filter_tf_cf_backlink_ratio(majestic, data)
             if not (data.tf >= self._min_tf and data.ref_domains >= self._min_ref_domains):
                 raise ValueError("tf or ref domains below threshold. tf: " + str(data.tf) + " cf: " + str(data.cf) + " ref domains: " + str(data.ref_domains))
             # if data.backlinks / data.ref_domains > self._max_backlink_to_ref_domain_ratio:
             #     raise MajesticSpamException("backlink to ref domain ratio is greater than {0:.1f}".format(self._max_backlink_to_ref_domain_ratio,))
             if self._en_spam_check:
                 self._filter_anchor_text(majestic, data.domain)
                 self._filter_ref_domains(majestic, data.domain)
             # is_domain_good = True
         else:
             raise ValueError("account is none in process_data")
     except MajesticSpamException as mjx_ex:
         is_spammed = True
         data.exception = str(mjx_ex)
     except Exception as ex:
         data.exception = str(ex)
         # ErrorLogger.log_error("MajesticFilter.process_data()", ex, str(data))
     finally:
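         # bookkeeping: count the job, release the account, and route the result to the output queue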
         PrintLogger.print("Majestic processed: '" + str(data) + "' with: " + account.userID)
         if isinstance(data, FilteredDomainData):
             with self._sync_lock:
                 self._job_done += 1
                 if account is not None:
                     account.Available = True
                 # if data.cf >= self._min_cf and data.tf >= self._min_tf:
                 if data.tf >= self._min_tf and data.cf >= self._min_cf and data.ref_domains >= self._min_ref_domains:
                 # if data.tf >= self._min_tf and data.ref_domains >= self._min_ref_domains:
                     #print("Majatic output:", data)
                     # PrintLogger.print("domain: " + data.domain + " is good.")
                     if not self._is_throughput_debug:
                         if is_spammed:
                             CsvLogger.log_to_file(self._bad_log_file, [data.to_tuple()], dir_path=FilePath.get_temp_db_dir())
                         else:
                             CsvLogger.log_to_file(self._log_file, [data.to_tuple()], dir_path=FilePath.get_temp_db_dir()) # log this to file
                     self._output_queue.put(data)
                     return data
                 # elif is_spammed:
                 #     if not self._is_throughput_debug:
                 #         CsvLogger.log_to_file(self._bad_log_file, [data.to_tuple()], dir_path=FilePath.get_temp_db_dir())
                 #     self._output_queue.put(data)
                     # return data
                 else:
                     if self._is_throughput_debug:
                         self._output_queue.put(data)
                     # return None
                     # print("domain: " + data.domain + " has exception:" + data.exception)
Example #3
 def process_data(self, data: FilteredDomainData, **kwargs):
     result_ok = False
     if isinstance(data, FilteredDomainData):
         try:
             if len(data.domain_var) == 0:
                 data.domain_var = data.domain
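             # count archive.org records for the domain that meet the minimum page size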
             links = ArchiveOrg.get_url_info(data.domain_var, min_size=self._min_page_size, limit=-100)
             count = len(links)
             data.archive = count
             if count < self._min_profile:
                 pass
                 # raise ValueError("profile count is less than:" + str(self._min_profile))
             result_ok = True
         except Exception as ex:
             if not self._is_throughput_debug:
                 pass
                 # ErrorLogger.log_error("ArchiveOrgFilter.process_data()", ex, data.domain_var)
         finally:
             with self._sync_lock:
                 self._job_done += 1
                 # with self._process_queue_lock:
                 if result_ok:
                     if not self._is_throughput_debug:
                         CsvLogger.log_to_file(self._log_file, [(data.domain, data.da, data.archive)]) # log this to file
                     self._output_queue.put(data)
                     # return data
                 else:
                     if self._is_throughput_debug:
                         self._output_queue.put(data)
                     # return None
     else:
         return None
Example #4
 def process_data(self, data: FilteredDomainData, **kwargs):
     #print("MozFilter processing: ", data)
     account = kwargs.get("Account")
     try:
         if isinstance(data, FilteredDomainData) and isinstance(account, SiteAccount):
             if TldUtility.is_top_tld(data.domain):
                 sleep_time = random.randint(self._min_sleep_time, self._max_wait)
                 time.sleep(sleep_time)
                 moz = MozCom(account)
                 if not self._is_throughput_debug:
                     ranking = moz.get_ranking_data(data.domain)
                 else:
                     ranking = 100
                 data.da = ranking
         else:
             raise ValueError("account is none in process_data")
     except Exception as ex:
         ErrorLogger.log_error("MozFilter", ex, "process_data() " + str(data) + " account: " + account.userID)
     finally:
         PrintLogger.print("Moz processed: " + str(data) + " with: " + account.userID)
         if isinstance(data, FilteredDomainData):
             with self._sync_lock:
                 self._job_done += 1
                 if account is not None:
                     account.Available = True
             if data.da >= self._min_DA_value:
                 if not self._is_throughput_debug:
                     CsvLogger.log_to_file(self._log_file, [(data.domain, data.da)]) # log this to file
                 self._output_queue.put(data)
 def testScrapePageBatch(self):
     save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
     file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_links.txt"
     CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
     domains_links = FileHandler.read_lines_from_file(file_path)
     for link in domains_links:
         # link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
         #link ="http://web.archive.org/web/20130415001342/http://www.bbc.co.uk/"
         stop_event = multiprocessing.Event()
         inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(link)
         root_domain = LinkChecker.get_root_domain(domain)[1]
         path = "/index.html"
         link_s = LinkAttrs(link=link,
                            path=path,
                            ref_link="/",
                            shadow_ref_link="/",
                            source=path,
                            res_type=LinkUtility.EXT_WEBPAGE,
                            level=0)
         explorer = ArchiveExplorer(
             original_domain=root_domain,
             link=link,
             external_stop_event=stop_event,
             download_base_dir=FilePath.get_default_archive_dir(),
             max_thread=10,
             max_level=2)
         explorer.run()
         archive_detail = explorer.get_archive_detail()
         CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
 def testExportCsv(self):
     from_addr = "/Users/superCat/Desktop/PycharmProjectPortable/sync/Sum.db"
     to_addr = "/Users/superCat/Desktop/PycharmProjectPortable/sync/2015_OLD.csv"
     table_name = "2015 Old"
     from_db = FilteredResultDB(table_name, db_addr=from_addr)
     data = [x for x in from_db.get_all_sites() if x[1] > 0]
     CsvLogger.log_to_file_path(to_addr, [FilteredResultDB.get_fields_names(),])
     CsvLogger.log_to_file_path(to_addr, data)
Example #7
 def testDA_bulk(self):
     log_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/accounts/good_accounts.csv"
     bad_log_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/accounts/bad_accounts.csv"
     good_rows = []
     bad_rows = []
     data_counter = 0
     domains = []
     data_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/03-09-2015-Bad-Results.csv"
     with open(data_path, mode='r', newline='') as csv_file:
         rd = csv.reader(csv_file, delimiter=',')
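         # the first row is a header; the counter below skips it and collects the domain column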
         for row in rd:
             if data_counter > 0:
                 domains.append(row[0])
             data_counter += 1
     problem_account = []
     file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/accounts/good_accounts_backup.csv"
     count = 0
     work_count = 0
     non_work_count = 0
     with open(file_path, mode='r', newline='') as csv_file:
         reader = csv.reader(csv_file, delimiter=',')
         for email, psd, user_name, access_id, api_key in reader:
             if count not in problem_account:
                 try:
                     print(
                         "email:",
                         email,
                         "psd:",
                         psd,
                         " user_name:",
                         user_name,
                         " access_id:",
                         access_id,
                     )
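                     # validate this account by requesting DA for a single domain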
                     account = MozCom(
                         SiteAccount(siteType=AccountType.Moz,
                                     userID=email,
                                     password=psd,
                                     AccessID=access_id,
                                     APIkey=api_key))
                     da = account.get_ranking_data(domains[count])
                     print("count: ", count, " access id:", access_id,
                           " site:", domains[count], " da:", da)
                     time.sleep(0.2)
                     work_count += 1
                     good_rows.append((count + 1, email, psd, user_name,
                                       access_id, api_key))
                 except Exception as ex:
                     bad_rows.append((count + 1, email, psd, user_name,
                                      access_id, api_key))
                     print(ex)
                     non_work_count += 1
             count += 1
     CsvLogger.log_to_file_path(log_path, good_rows)
     CsvLogger.log_to_file_path(bad_log_path, bad_rows)
     print("total:", count, " worked:", work_count, " not-worked:",
           non_work_count)
 def add_proxies(self, proxies: list):
     if proxies is not None:
         converted = []
         for proxy in proxies:
             if isinstance(proxy, ProxyStruct):
                 converted.append((proxy.addr, proxy.port, proxy.alt_port,
                                    proxy.user_name, proxy.psd))
         FileHandler.create_file_if_not_exist(self._file_path)
         CsvLogger.log_to_file_path(self._file_path, converted)
 def testGetBestProfileBatch(self):
     file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_source.txt"
     domains = FileHandler.read_lines_from_file(file_path)
     save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive.csv"
     CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
     for domain in domains:
         print("begin domain:", domain)
         try:
             archive = ArchiveOrg.get_best_archive(root_domain=domain, thread_size=100, profile_check=10, pass_threshold=0.9, res_limit=2000)
             CsvLogger.log_to_file_path(save_path, [archive.to_tuple()])
         except Exception as ex:
             print(ex)
 def testScrapePage(self):
     # link = "http://web.archive.org/web/20111102054835/http://www.agfdh.org:80/"
     link = "http://web.archive.org/web/20150425143742/http://susodigital.com/"
     #link ="http://web.archive.org/web/20130415001342/http://www.bbc.co.uk/"
     stop_event = multiprocessing.Event()
     inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(link)
     root_domain = LinkChecker.get_root_domain(domain)[1]
     path = "/index.html"
     link_s = LinkAttrs(link=link, path=path, ref_link="/", shadow_ref_link="/", source=path, res_type=LinkUtility.EXT_WEBPAGE, level=0)
     explorer = ArchiveExplorer(original_domain=root_domain, link=link,
                                external_stop_event=stop_event,
                                download_base_dir=FilePath.get_default_archive_dir(), max_thread=10, max_level=2)
     explorer.run()
     save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
     CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
     archive_detail = explorer.get_archive_detail()
     CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
Example #13
 def write_to_error_log(self, data: tuple):
     CsvLogger.log_to_file_path(self.change_log_file_path, [data])