def process_data_batch(self, data: collections.abc.Iterable, **kwargs):
    # print("MozFilter processing: ", data)
    # note: requires "import collections.abc"; collections.Iterable was removed in Python 3.10
    account = kwargs.get("Account")
    owner = account.userID if isinstance(account, SiteAccount) else "unknown"
    temp = []
    try:
        if isinstance(data, collections.abc.Iterable) and isinstance(account, SiteAccount):
            temp = [x for x in data
                    if isinstance(x, FilteredDomainData) and TldUtility.is_top_tld(x.domain)]
            check_list = [y.domain for y in temp]
            # randomized delay so concurrent workers do not hit the API in lockstep
            sleep_time = random.randint(self._min_sleep_time, self._max_wait)
            time.sleep(sleep_time)
            moz = MozCom(account)
            if not self._is_throughput_debug:
                rankings = moz.get_ranking_data_batch(check_list, limit=len(check_list))
            else:
                rankings = [100] * len(temp)
            # temp and rankings are parallel sequences; zip() pairs them safely
            for item, ranking in zip(temp, rankings):
                item.da = ranking
        else:
            raise ValueError("account is None in process_data_batch()")
    except Exception as ex:
        ErrorLogger.log_error("MozFilter", ex,
                              "process_data_batch() " + str(data) + " account: " + owner)
    finally:
        PrintLogger.print("Moz processed: " + str(data) + " with: " + owner)
        with self._sync_lock:
            job_done = [x for x in data if x is not None]
            self._job_done += len(job_done)
        if account is not None:
            account.Available = True
        for item in temp:
            if isinstance(item, FilteredDomainData):
                if item.da >= self._min_DA_value:
                    if not self._is_throughput_debug:
                        # log passing domains to file
                        CsvLogger.log_to_file(self._log_file, [(item.domain, item.da)])
                    self._output_queue.put(item)
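# --- Example (not part of the filter) ---
# A minimal, runnable sketch of the batch-alignment pattern used in
# process_data_batch() above: the filtered items and the rankings returned by
# the API are parallel sequences, so zip() pairs them without risking an
# IndexError if the API returns fewer results than requested. _Item is a
# stand-in for FilteredDomainData, purely for illustration.
class _Item:
    def __init__(self, domain: str):
        self.domain = domain
        self.da = 0

def _assign_rankings(items, rankings):
    # stops at the shorter sequence instead of indexing past the end
    for item, ranking in zip(items, rankings):
        item.da = ranking

if __name__ == "__main__":
    batch = [_Item("example.com"), _Item("example.org")]
    _assign_rankings(batch, [42, 57])
    assert [x.da for x in batch] == [42, 57]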
def process_data(self, data: FilteredDomainData, **kwargs): account = kwargs.get("Account") # is_domain_good = False is_spammed = False try: if isinstance(data, FilteredDomainData) and isinstance(account, SiteAccount): majestic = MajesticCom(account) if self._en_spam_check: self._filter_domain_name(domain=data.domain) # self._filter_anchor_text(majestic, data.domain) # self._filter_ref_domains(majestic, data.domain) if self._en_tf_check: data = self._filter_tf_cf_backlink_ratio(majestic, data) if not (data.tf >= self._min_tf and data.ref_domains >= self._min_ref_domains): raise ValueError("tf or cf doesn't match. tf:" + str(data.tf) + " cf: " + str(data.cf) + " ref domain: " + str(data.ref_domains)) # if data.backlinks / data.ref_domains > self._max_backlink_to_ref_domain_ratio: # raise MajesticSpamException("backlink to ref domain ratio is greater than {0:.1f}".format(self._max_backlink_to_ref_domain_ratio,)) if self._en_spam_check: self._filter_anchor_text(majestic, data.domain) self._filter_ref_domains(majestic, data.domain) # is_domain_good = True else: raise ValueError("account is none in process_data") except MajesticSpamException as mjx_ex: is_spammed = True data.exception = str(mjx_ex) except Exception as ex: data.exception = str(ex) # ErrorLogger.log_error("MajesticFilter.process_data()", ex, str(data)) finally: PrintLogger.print("Majestic processed: '" + str(data) + "' with: " + account.userID) if isinstance(data, FilteredDomainData): with self._sync_lock: self._job_done += 1 if account is not None: account.Available = True # if data.cf >= self._min_cf and data.tf >= self._min_tf: if data.tf >= self._min_tf and data.cf >= self._min_cf and data.ref_domains >= self._min_ref_domains: # if data.tf >= self._min_tf and data.ref_domains >= self._min_ref_domains: #print("Majatic output:", data) # PrintLogger.print("domain: " + data.domain + " is good.") if not self._is_throughput_debug: if is_spammed: CsvLogger.log_to_file(self._bad_log_file, [data.to_tuple()], dir_path=FilePath.get_temp_db_dir()) else: CsvLogger.log_to_file(self._log_file, [data.to_tuple()], dir_path=FilePath.get_temp_db_dir()) # log this to file self._output_queue.put(data) return data # elif is_spammed: # if not self._is_throughput_debug: # CsvLogger.log_to_file(self._bad_log_file, [data.to_tuple()], dir_path=FilePath.get_temp_db_dir()) # self._output_queue.put(data) # return data else: if self._is_throughput_debug: self._output_queue.put(data) # return None # print("domain: " + data.domain + " has exception:" + data.exception) else: pass
def process_data(self, data: FilteredDomainData, **kwargs):
    result_ok = False
    if not isinstance(data, FilteredDomainData):
        return None
    try:
        if len(data.domain_var) == 0:
            data.domain_var = data.domain
        links = ArchiveOrg.get_url_info(data.domain_var,
                                        min_size=self._min_page_size, limit=-100)
        data.archive = len(links)
        if data.archive < self._min_profile:
            # previously rejected the domain outright:
            # raise ValueError("profile count is less than: " + str(self._min_profile))
            pass
        result_ok = True
    except Exception as ex:
        if not self._is_throughput_debug:
            # errors are intentionally swallowed here:
            # ErrorLogger.log_error("ArchiveOrgFilter.process_data()", ex, data.domain_var)
            pass
    finally:
        with self._sync_lock:
            self._job_done += 1
        if result_ok:
            if not self._is_throughput_debug:
                # log (domain, DA, archive count) to file
                CsvLogger.log_to_file(self._log_file, [(data.domain, data.da, data.archive)])
            self._output_queue.put(data)
        elif self._is_throughput_debug:
            self._output_queue.put(data)
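# --- Example (not part of the filter) ---
# A runnable sketch of the soft-failure pattern in process_data() above: the
# archive count is always recorded, and a count below the minimum no longer
# rejects the domain (the raise is commented out). `fetch` stands in for
# ArchiveOrg.get_url_info, which is assumed to return a list of records.
def _count_archive_profile(fetch, domain: str, min_profile: int = 5) -> int:
    links = fetch(domain)
    count = len(links)
    if count < min_profile:
        pass  # previously: raise ValueError(...)
    return count

if __name__ == "__main__":
    assert _count_archive_profile(lambda d: ["rec1", "rec2"], "example.com") == 2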
def process_data(self, data: FilteredDomainData, **kwargs):
    # print("MozFilter processing: ", data)
    account = kwargs.get("Account")
    owner = account.userID if isinstance(account, SiteAccount) else "unknown"
    try:
        if isinstance(data, FilteredDomainData) and isinstance(account, SiteAccount):
            if TldUtility.is_top_tld(data.domain):
                # randomized delay so concurrent workers do not hit the API in lockstep
                sleep_time = random.randint(self._min_sleep_time, self._max_wait)
                time.sleep(sleep_time)
                moz = MozCom(account)
                if not self._is_throughput_debug:
                    ranking = moz.get_ranking_data(data.domain)
                else:
                    ranking = 100
                data.da = ranking
        else:
            raise ValueError("account is None in process_data()")
    except Exception as ex:
        ErrorLogger.log_error("MozFilter", ex,
                              "process_data() " + str(data) + " account: " + owner)
    finally:
        PrintLogger.print("Moz processed: " + str(data) + " with: " + owner)
        if isinstance(data, FilteredDomainData):
            with self._sync_lock:
                self._job_done += 1
            if account is not None:
                account.Available = True
            if data.da >= self._min_DA_value:
                if not self._is_throughput_debug:
                    # log passing domains to file
                    CsvLogger.log_to_file(self._log_file, [(data.domain, data.da)])
                self._output_queue.put(data)
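# --- Example (not part of the filter) ---
# A runnable sketch of the randomized throttle used before each Moz API call
# above: sleeping a random whole number of seconds between a minimum and a
# maximum spreads requests out over time. The bounds here are illustrative,
# not the filter's settings.
import random
import time

def _throttled_call(fn, min_sleep: int = 1, max_wait: int = 5):
    time.sleep(random.randint(min_sleep, max_wait))
    return fn()

if __name__ == "__main__":
    print(_throttled_call(lambda: "ranking fetched", min_sleep=0, max_wait=1))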
def testScrapePageBatch(self):
    save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
    file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_links.txt"
    CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
    domains_links = FileHandler.read_lines_from_file(file_path)
    for link in domains_links:
        stop_event = multiprocessing.Event()
        inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(link)
        root_domain = LinkChecker.get_root_domain(domain)[1]
        path = "/index.html"
        link_s = LinkAttrs(link=link, path=path, ref_link="/", shadow_ref_link="/",
                           source=path, res_type=LinkUtility.EXT_WEBPAGE, level=0)
        explorer = ArchiveExplorer(original_domain=root_domain, link=link,
                                   external_stop_event=stop_event,
                                   download_base_dir=FilePath.get_default_archive_dir(),
                                   max_thread=10, max_level=2)
        explorer.run()
        archive_detail = explorer.get_archive_detail()
        CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
def testExportCsv(self):
    from_addr = "/Users/superCat/Desktop/PycharmProjectPortable/sync/Sum.db"
    to_addr = "/Users/superCat/Desktop/PycharmProjectPortable/sync/2015_OLD.csv"
    table_name = "2015 Old"
    from_db = FilteredResultDB(table_name, db_addr=from_addr)
    data = [x for x in from_db.get_all_sites() if x[1] > 0]
    CsvLogger.log_to_file_path(to_addr, [FilteredResultDB.get_fields_names()])
    CsvLogger.log_to_file_path(to_addr, data)
def testDA_bulk(self):
    log_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/accounts/good_accounts.csv"
    bad_log_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/accounts/bad_accounts.csv"
    good_rows = []
    bad_rows = []
    data_counter = 0
    domains = []
    data_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/03-09-2015-Bad-Results.csv"
    with open(data_path, mode='r', newline='') as csv_file:
        rd = csv.reader(csv_file, delimiter=',')
        for row in rd:
            if data_counter > 0:  # skip the header row
                domains.append(row[0])
            data_counter += 1
    problem_account = []
    file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/accounts/good_accounts_backup.csv"
    count = 0
    work_count = 0
    non_work_count = 0
    with open(file_path, mode='r', newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        for email, psd, user_name, access_id, api_key in reader:
            if count not in problem_account:
                try:
                    print("email:", email, "psd:", psd, " user_name:", user_name,
                          " access_id:", access_id)
                    account = MozCom(SiteAccount(siteType=AccountType.Moz, userID=email,
                                                 password=psd, AccessID=access_id,
                                                 APIkey=api_key))
                    da = account.get_ranking_data(domains[count])
                    print("count:", count, " access id:", access_id,
                          " site:", domains[count], " da:", da)
                    time.sleep(0.2)
                    work_count += 1
                    good_rows.append((count + 1, email, psd, user_name, access_id, api_key))
                except Exception as ex:
                    bad_rows.append((count + 1, email, psd, user_name, access_id, api_key))
                    print(ex)
                    non_work_count += 1
            count += 1
    CsvLogger.log_to_file_path(log_path, good_rows)
    CsvLogger.log_to_file_path(bad_log_path, bad_rows)
    print("total:", count, " worked:", work_count, " not-worked:", non_work_count)
def add_proxies(self, proxies: list):
    if proxies is not None:
        converted = []
        for proxy in proxies:
            if isinstance(proxy, ProxyStruct):
                converted.append((proxy.addr, proxy.port, proxy.alt_port,
                                  proxy.user_name, proxy.psd))
        FileHandler.create_file_if_not_exist(self._file_path)
        CsvLogger.log_to_file_path(self._file_path, converted)
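# --- Example (not part of the class) ---
# A runnable sketch of the row format add_proxies() writes: one CSV row per
# proxy in the field order (addr, port, alt_port, user_name, psd) used above.
# _FakeProxy is a stand-in, since ProxyStruct's constructor signature is not
# shown in this file.
from collections import namedtuple

_FakeProxy = namedtuple("_FakeProxy", "addr port alt_port user_name psd")

def _proxy_rows(proxies):
    return [(p.addr, p.port, p.alt_port, p.user_name, p.psd) for p in proxies]

if __name__ == "__main__":
    rows = _proxy_rows([_FakeProxy("10.0.0.1", 8080, 8081, "user", "secret")])
    assert rows == [("10.0.0.1", 8080, 8081, "user", "secret")]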
def testGetBestProfileBatch(self):
    file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_source.txt"
    domains = FileHandler.read_lines_from_file(file_path)
    save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive.csv"
    CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
    for domain in domains:
        print("begin domain:", domain)
        try:
            archive = ArchiveOrg.get_best_archive(root_domain=domain, thread_size=100,
                                                  profile_check=10, pass_threshold=0.9,
                                                  res_limit=2000)
            CsvLogger.log_to_file_path(save_path, [archive.to_tuple()])
        except Exception as ex:
            print(ex)
def testScrapePage(self):
    # link = "http://web.archive.org/web/20111102054835/http://www.agfdh.org:80/"
    link = "http://web.archive.org/web/20150425143742/http://susodigital.com/"
    # link = "http://web.archive.org/web/20130415001342/http://www.bbc.co.uk/"
    stop_event = multiprocessing.Event()
    inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(link)
    root_domain = LinkChecker.get_root_domain(domain)[1]
    path = "/index.html"
    link_s = LinkAttrs(link=link, path=path, ref_link="/", shadow_ref_link="/",
                       source=path, res_type=LinkUtility.EXT_WEBPAGE, level=0)
    explorer = ArchiveExplorer(original_domain=root_domain, link=link,
                               external_stop_event=stop_event,
                               download_base_dir=FilePath.get_default_archive_dir(),
                               max_thread=10, max_level=2)
    explorer.run()
    save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
    CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
    archive_detail = explorer.get_archive_detail()
    CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
def write_to_error_log(self, data: tuple):
    CsvLogger.log_to_file_path(self.change_log_file_path, [data])