def process_data(self, data: FilteredDomainData, **kwargs):
    """
    Run the enabled Majestic checks on one domain record, then log and queue the outcome.
    :param data: the record to check. Its tf/cf/ref_domains may be replaced by
        _filter_tf_cf_backlink_ratio, and its ``exception`` attribute is set when a check fails.
    :param kwargs: the SiteAccount to query Majestic with is read from the key "Account".
    :return: data when its tf, cf and ref_domains all reach the configured minimums
        (even if it was flagged as spam or another error was recorded), otherwise None.
    """
    account = kwargs.get("Account")
    is_spammed = False
    result = None
    try:
        if isinstance(data, FilteredDomainData) and isinstance(account, SiteAccount):
            majestic = MajesticCom(account)
            if self._en_spam_check:
                self._filter_domain_name(domain=data.domain)
            if self._en_tf_check:
                data = self._filter_tf_cf_backlink_ratio(majestic, data)
            if not (data.tf >= self._min_tf and data.ref_domains >= self._min_ref_domains):
                raise ValueError("tf or cf doesn't match. tf:" + str(data.tf) + " cf: " + str(data.cf)
                                 + " ref domain: " + str(data.ref_domains))
            # The anchor-text and ref-domain checks run only after the tf / ref-domain
            # minimums have passed.
            if self._en_spam_check:
                self._filter_anchor_text(majestic, data.domain)
                self._filter_ref_domains(majestic, data.domain)
        else:
            raise ValueError("account is none in process_data")
    except MajesticSpamException as mjx_ex:
        is_spammed = True
        data.exception = str(mjx_ex)
    except Exception as ex:
        # Any other failure is recorded on the record instead of being propagated.
        data.exception = str(ex)
    finally:
        # The account may be missing (that is one of the handled failures above), so its
        # userID must not be dereferenced blindly: an AttributeError raised here would skip
        # the job counter update and the queue hand-off below.
        user_id = str(getattr(account, "userID", None))
        PrintLogger.print("Majestic processed: '" + str(data) + "' with: " + user_id)
        if isinstance(data, FilteredDomainData):
            with self._sync_lock:
                self._job_done += 1
                if account is not None:
                    account.Available = True  # hand the account back for reuse
            if data.tf >= self._min_tf and data.cf >= self._min_cf and data.ref_domains >= self._min_ref_domains:
                if not self._is_throughput_debug:
                    # Spam-flagged records go to the bad log, everything else to the normal log.
                    log_file = self._bad_log_file if is_spammed else self._log_file
                    CsvLogger.log_to_file(log_file, [data.to_tuple()], dir_path=FilePath.get_temp_db_dir())
                self._output_queue.put(data)
                result = data
            elif self._is_throughput_debug:
                # In throughput-debug mode rejected records are still forwarded.
                self._output_queue.put(data)
    # Returning after the try statement (not inside ``finally``) keeps in-flight
    # exceptions such as KeyboardInterrupt from being silently discarded.
    return result
def _filter_ref_domains(self, majestic: MajesticCom, domain: str) -> bool:
    """
    Check the countries of the referring domains Majestic reports for a domain.
    :param majestic: client used to fetch the referring domains.
    :param domain: the domain whose referring domains are inspected.
    :return: True if everything is ok, else raise Exception.
    :raises MajesticSpamException: when a referring domain from a bad country has more than
        30 backlinks, when 5 or more referring domains come from bad countries, or when more
        than 25 percent of the fetched referring domains come from bad countries.
    """
    max_bad_country_ratio = 0.25
    max_bad_country_count = 5
    max_backlinks_for_single_bad_country = 30
    ref_domains = majestic.get_ref_domains(domain, max_count=self._majestic_result_ref_domain_limit,
                                           is_dev=DomainFinderSrc.IS_DEBUG, fresh_data=True)
    total_record = len(ref_domains)
    bad_country_count = 0
    for ref_domain in ref_domains:
        if not isinstance(ref_domain, MajesticRefDomainStruct):
            continue
        if ref_domain.country not in self._bad_country:
            continue
        bad_country_count += 1
        if ref_domain.backlinks > max_backlinks_for_single_bad_country:
            raise MajesticSpamException("{0:s} from bad country has more than {1:d} backlinks."
                                        .format(ref_domain.domain, max_backlinks_for_single_bad_country))
    if bad_country_count >= max_bad_country_count:
        raise MajesticSpamException("too many bad countries, {0:d} detected.".format(bad_country_count,))
    # Only compute the ratio when there is something to divide by: an empty result
    # set has no bad countries and must not raise ZeroDivisionError.
    if total_record > 0:
        bad_country_ratio = bad_country_count / total_record
        if bad_country_ratio > max_bad_country_ratio:
            raise MajesticSpamException("bad country ratio in ref domains is too high: {0:.1f} percent."
                                        .format(bad_country_ratio*100,))
    return True
def _filter_tf_cf_backlink_ratio(self, majestic: MajesticCom, data: FilteredDomainData) -> FilteredDomainData:
    """
    Replace the metrics on data with those of a better balanced variant of the domain, if any.

    Majestic is queried for three variants of data.domain. A returned variant overwrites
    the metrics on data when its tf and cf reach the configured minimums, its cf/tf
    deviation is within self._cf_tf_deviation, and that deviation is strictly smaller
    than the one of the metrics currently held by data. Variants are examined in the
    order returned, each compared against whatever data holds at that point.
    :param majestic: client used to fetch the cf/tf figures.
    :param data: the record to update; it is modified in place.
    :return: the same data object.
    """
    unknown_deviation = 999  # used when the record has no positive tf to compare against

    def balance_deviation(cf, tf):
        # Smaller of |1 - cf/tf| and |1 - tf/cf|; callers guarantee tf > 0.
        if cf == 0:
            # tf/cf is undefined here, so only the cf/tf form can be used
            # (it evaluates to 1) instead of raising ZeroDivisionError.
            return abs(1 - cf / tf)
        return min(abs(1 - cf / tf), abs(1 - tf / cf))

    ranking = majestic.get_cf_tf_list(["http://"+data.domain,
                                       "www."+data.domain,
                                       "http://www."+data.domain],
                                      is_dev=DomainFinderSrc.IS_DEBUG)
    if ranking is None or len(ranking) == 0:
        return data
    for item in ranking:
        if not isinstance(item, MajesticComStruct):
            continue
        if not item.tf > 0:
            continue  # a variant without a positive tf is never a candidate
        item_deviation = balance_deviation(item.cf, item.tf)
        if data.tf > 0:
            data_deviation = balance_deviation(data.cf, data.tf)
        else:
            data_deviation = unknown_deviation
        is_strong_enough = item.tf >= self._min_tf and item.cf >= self._min_cf
        is_better_balanced = item_deviation < data_deviation and item_deviation <= self._cf_tf_deviation
        if is_strong_enough and is_better_balanced:
            data.domain_var = item.domain
            data.tf = item.tf
            data.cf = item.cf
            data.backlinks = item.backlinks
            data.ref_domains = item.ref_domains
            data.topic = item.topic
    return data
def _filter_anchor_text(self, majestic: MajesticCom, domain: str) -> bool:
    """
    Check the anchor texts Majestic reports for a domain.
    :param majestic: client used to fetch the anchor text information.
    :param domain: the domain whose anchor texts are inspected.
    :return: True if everything ok, else raise Exception.
    :raises MajesticSpamException: when there are 2 or fewer anchor variations, when an
        anchor that does not contain the brand accounts for more than 25 percent of the
        referring domains, or when an anchor contains a spam word and no white-listed keyword.
    """
    # Element 6 of the get_root_domain() result is used as the brand name
    # NOTE(review): presumably the domain name without its suffix; confirm against LinkChecker.
    brand = LinkChecker.get_root_domain(domain)[6]
    min_anchor_variation_limit = 2
    no_follow_limit = 0.5
    non_brand_share_limit = 0.25
    anchor_list, total_backlinks, _deleted, nofollow, total_ref_domains \
        = majestic.get_anchor_text_info(domain=domain, max_count=self._majestic_result_anchor_limit,
                                        is_dev=DomainFinderSrc.IS_DEBUG, fresh_data=True)
    # NOTE(review): the condition rejects exactly 2 variations as well, although the
    # message says "less than 2"; confirm which one is intended.
    if len(anchor_list) <= min_anchor_variation_limit:
        raise MajesticSpamException("number of anchor variation is less than 2.")

    # A total of zero would make the ratio undefined; treat it as "no nofollow links".
    nofollow_ratio = nofollow / total_backlinks if total_backlinks else 0
    if nofollow_ratio > no_follow_limit:
        # The nofollow rule itself is disabled, but as in the original if/elif chain a
        # high nofollow ratio still bypasses the per-anchor checks below.
        return True
    if len(self._spam_anchor) == 0:
        # The per-anchor checks only run when spam words are configured.
        return True

    for anchor, ref_domains, _total_links, _deleted_links, _no_follow_links in anchor_list:
        is_branded = brand in anchor or brand in anchor.replace(' ', '')
        if not is_branded:
            # Guard the share computation against a reported total of zero ref domains.
            share = ref_domains / total_ref_domains if total_ref_domains else 0
            if share > non_brand_share_limit:
                raise MajesticSpamException("non branded anchor '{0:s}' exceeded limit {1:.2f}.".format(anchor, share))
        for spam in self._spam_anchor:
            if spam in anchor and not any(x in anchor for x in self._white_keyword_list):
                raise MajesticSpamException("anchor {0:s} is in spam word {1:s}".format(anchor, spam))
    return True
from DomainFinderSrc.MajesticCom import * from DomainFinderSrc.SiteConst import * majestic_account = SiteAccount(siteType=AccountType.Majestic, userID="*****@*****.**", password="******", APIkey="1BB1D141D20CAF35D331F086F55C1CEE") majestic = MajesticCom(majestic_account) moz_account = SiteAccount(siteType=AccountType.Moz, userID="*****@*****.**", password="******", AccessID="mozscape-320a4616a8", APIkey="f03c19321b0973573137288c647b31ea") moz_account_fake = SiteAccount(siteType=AccountType.Moz, userID="*****@*****.**", password="******", AccessID="mozscape-44a37bfcd5", APIkey="bedefa75b4c17317a94a421108974f1d") amazon_ec2_account = SiteAccount( siteType=AccountType.AmazonEC2, userID="*****@*****.**", AccessID="AKIAIPA2WM3ILJWR2KSA", APIkey="7EisLQmbOv04ExZM9Fj1rxnmWiKw8wae5shRPDdx") buy_proxy_org_account = SiteAccount(siteType=AccountType.BuyProxyOrg, userID="*****@*****.**", password="******", AccessID="49885",