def apply_domain_limit(entries, domain_limit):
    """
    Drop URLs belonging to domains that occur too often.

    The domain of every entry is obtained with ``subdomains(entries)`` and
    tallied. Any domain whose tally reaches or exceeds ``domain_limit``
    (the comparison is ``>=``) is treated as over the limit, and every
    entry whose ``subdomain`` or ``rootdomain`` is one of those domains is
    removed.

    Args:
        entries: iterable of sanitised URLs. It is iterated more than once,
            so it must be re-iterable (e.g. a list or set), and its items
            must be hashable.
        domain_limit: number of occurrences at which a domain is
            considered to be over the limit.

    Returns:
        An ``applied_domain_limit`` namedtuple with two fields:
        ``remaining_urls`` (set of entries that survived) and
        ``exceeded_domains`` (set of domains that hit the limit).
    """
    # Function-scope import so this block does not rely on the module's
    # import list already providing Counter.
    from collections import Counter

    domain_counts = Counter(subdomains(entries))

    # Iterate the distinct domains only; duplicates cannot change the result.
    exceeded_domains = {
        domain
        for domain, count in domain_counts.items()
        if count >= domain_limit
    }

    # An entry is kept only if neither its subdomain nor its root domain
    # is over the limit.
    remaining_urls = {
        entry
        for entry in entries
        if subdomain(entry) not in exceeded_domains
        and rootdomain(entry) not in exceeded_domains
    }

    applied_domain_limit = namedtuple('applied_domain_limit',
                                      'remaining_urls exceeded_domains')
    return applied_domain_limit(remaining_urls, exceeded_domains)
def apply_disavow(disavow_entries, urls_list):
    """
    Split a list of URLs into those a disavow file covers and the rest.

    A URL counts as disavowed when it is one of the disavowed links, or
    when ``subdomain(url)`` is one of the disavowed domains.

    Args:
        disavow_entries: mapping that may contain a ``'urls'`` key (links
            to disavow) and/or a ``'domains'`` key (domains to disavow).
            Either key may be absent.
        urls_list: the URLs to test; they are passed through
            ``normalize_and_dedupe_with_counts`` before being checked.

    Returns:
        A dict with the keys ``'disavowed'`` and ``'non_disavowed'`` (lists
        in the order the cleaned URLs were produced), ``'domains_entered'``,
        ``'unique_domains_entered'``, ``'urls_entered_to_test'``,
        ``'unique_urls_entered_to_test'``, ``'total_disavowed_links'`` and
        ``'total_remaining_links'``. When ``'urls'`` is supplied the dict
        additionally holds ``'disavow_links_entered'`` and
        ``'unique_disavow_links_entered'``.
    """
    output_dict = {}

    # Sets give constant-time membership tests in the loop below.
    disavow_links = set()
    if 'urls' in disavow_entries:
        disavow_links_details = normalize_and_dedupe_with_counts(
            disavow_entries['urls'])
        disavow_links = set(disavow_links_details.clean_urls)
        output_dict['disavow_links_entered'] = (
            disavow_links_details.urls_entered)
        output_dict['unique_disavow_links_entered'] = (
            disavow_links_details.unique_urls_entered)

    # Defaults used when no 'domains' key is supplied. Previously the
    # domain counts were read from a variable that only existed inside the
    # 'domains' branch, so a urls-only call raised UnboundLocalError.
    # NOTE(review): 0 assumes these fields are plain counts - confirm.
    disavow_domains = set()
    domains_entered = 0
    unique_domains_entered = 0
    if 'domains' in disavow_entries:
        disavow_domains_details = normalize_and_dedupe_with_counts(
            disavow_entries['domains'])
        disavow_domains = set(subdomains(disavow_domains_details.clean_urls))
        domains_entered = disavow_domains_details.urls_entered
        unique_domains_entered = disavow_domains_details.unique_urls_entered

    urls_to_test_details = normalize_and_dedupe_with_counts(urls_list)

    disavowed_urls = []
    non_disavowed_urls = []
    for url in urls_to_test_details.clean_urls:
        if url in disavow_links or subdomain(url) in disavow_domains:
            disavowed_urls.append(url)
        else:
            non_disavowed_urls.append(url)

    output_dict.update({
        'disavowed': disavowed_urls,
        'non_disavowed': non_disavowed_urls,
        'domains_entered': domains_entered,
        'unique_domains_entered': unique_domains_entered,
        'urls_entered_to_test': urls_to_test_details.urls_entered,
        'unique_urls_entered_to_test':
            urls_to_test_details.unique_urls_entered,
        'total_disavowed_links': len(disavowed_urls),
        'total_remaining_links': len(non_disavowed_urls),
    })
    return output_dict
def apply_disavow(disavow_entries, urls_list):
    """
    Split a list of URLs into those a disavow file covers and the rest.

    A URL counts as disavowed when it is one of the disavowed links, or
    when ``subdomain(url)`` is one of the disavowed domains.

    Args:
        disavow_entries: mapping that may contain a ``'urls'`` key (links
            to disavow) and/or a ``'domains'`` key (domains to disavow).
            Either key may be absent.
        urls_list: the URLs to test; they are passed through
            ``normalize_and_dedupe_with_counts`` before being checked.

    Returns:
        A dict with the keys ``'disavowed'`` and ``'non_disavowed'`` (lists
        in the order the cleaned URLs were produced), ``'domains_entered'``,
        ``'unique_domains_entered'``, ``'urls_entered_to_test'``,
        ``'unique_urls_entered_to_test'``, ``'total_disavowed_links'`` and
        ``'total_remaining_links'``. When ``'urls'`` is supplied the dict
        additionally holds ``'disavow_links_entered'`` and
        ``'unique_disavow_links_entered'``.
    """
    output_dict = {}

    # Sets give constant-time membership tests in the loop below.
    disavow_links = set()
    if 'urls' in disavow_entries:
        disavow_links_details = normalize_and_dedupe_with_counts(
            disavow_entries['urls'])
        disavow_links = set(disavow_links_details.clean_urls)
        output_dict['disavow_links_entered'] = (
            disavow_links_details.urls_entered)
        output_dict['unique_disavow_links_entered'] = (
            disavow_links_details.unique_urls_entered)

    # Defaults used when no 'domains' key is supplied. Previously the
    # domain counts were read from a variable that only existed inside the
    # 'domains' branch, so a urls-only call raised UnboundLocalError.
    # NOTE(review): 0 assumes these fields are plain counts - confirm.
    disavow_domains = set()
    domains_entered = 0
    unique_domains_entered = 0
    if 'domains' in disavow_entries:
        disavow_domains_details = normalize_and_dedupe_with_counts(
            disavow_entries['domains'])
        disavow_domains = set(subdomains(disavow_domains_details.clean_urls))
        domains_entered = disavow_domains_details.urls_entered
        unique_domains_entered = disavow_domains_details.unique_urls_entered

    urls_to_test_details = normalize_and_dedupe_with_counts(urls_list)

    disavowed_urls = []
    non_disavowed_urls = []
    for url in urls_to_test_details.clean_urls:
        if url in disavow_links or subdomain(url) in disavow_domains:
            disavowed_urls.append(url)
        else:
            non_disavowed_urls.append(url)

    output_dict.update({
        'disavowed': disavowed_urls,
        'non_disavowed': non_disavowed_urls,
        'domains_entered': domains_entered,
        'unique_domains_entered': unique_domains_entered,
        'urls_entered_to_test': urls_to_test_details.urls_entered,
        'unique_urls_entered_to_test':
            urls_to_test_details.unique_urls_entered,
        'total_disavowed_links': len(disavowed_urls),
        'total_remaining_links': len(non_disavowed_urls),
    })
    return output_dict
def disavow_file_to_dict(file_contents, domain_limit=False):
    """
    Turn the contents of a disavow file into a summary dictionary.

    The contents are parsed with ``import_from_file_contents``; the result
    is indexed by ``'urls'`` and ``'domains'``. Links already covered by a
    domain entry are dropped via ``apply_disavow``. When ``domain_limit``
    is truthy, ``apply_domain_limit`` is also run on the surviving links
    and the domains it reports are added to the domain entries.

    Args:
        file_contents: raw disavow file contents, handed unchanged to
            ``import_from_file_contents``.
        domain_limit: falsy to skip the per-domain limit, otherwise the
            limit passed on to ``apply_domain_limit``.

    Returns:
        A dict with the keys ``'domain_entries'`` (list), ``'url_entries'``,
        ``'urls_entered_count'``, ``'urls_disavowed_count'``,
        ``'unique_urls_entered_count'`` and
        ``'domain_entries_entered_count'``.
    """
    parsed = import_from_file_contents(file_contents)

    raw_urls = parsed['urls']
    url_details = normalize_and_dedupe_with_counts(raw_urls)
    remaining_links = url_details.clean_urls

    raw_domains = parsed['domains']
    domain_details = normalize_and_dedupe_with_counts(raw_domains)
    domain_entries = subdomains(domain_details.clean_urls)

    # Links whose domain is already disavowed do not need their own entry.
    if domain_entries:
        remaining_links = apply_disavow(
            {"domains": raw_domains}, raw_urls)['non_disavowed']

    all_domains = set(domain_entries)

    if domain_limit:
        remaining_links, limit_domains = apply_domain_limit(
            remaining_links, domain_limit)
        all_domains.update(limit_domains)

        if domain_entries:
            # NOTE(review): the values returned here are never read, so
            # this call has no effect on the dictionary returned below
            # (all_domains was already filled in above). It looks like a
            # leftover from an abandoned "total domains disavowed" count -
            # confirm whether the de-duplicated values were meant to feed
            # 'domain_entries'.
            _unused_domains, _unused_new_domains = remove_redundant_domains(
                domain_entries, limit_domains)

    return {
        'domain_entries': list(all_domains),
        'url_entries': remaining_links,
        'urls_entered_count': url_details.urls_entered,
        'urls_disavowed_count': len(remaining_links),
        'unique_urls_entered_count': url_details.unique_urls_entered,
        'domain_entries_entered_count': domain_details.urls_entered,
    }
def disavow_file_to_dict(file_contents, domain_limit=False):
    """
    Turn the contents of a disavow file into a summary dictionary.

    The contents are parsed with ``import_from_file_contents``; the result
    is indexed by ``'urls'`` and ``'domains'``. Links already covered by a
    domain entry are dropped via ``apply_disavow``. When ``domain_limit``
    is truthy, ``apply_domain_limit`` is also run on the surviving links
    and the domains it reports are added to the domain entries.

    Args:
        file_contents: raw disavow file contents, handed unchanged to
            ``import_from_file_contents``.
        domain_limit: falsy to skip the per-domain limit, otherwise the
            limit passed on to ``apply_domain_limit``.

    Returns:
        A dict with the keys ``'domain_entries'`` (list), ``'url_entries'``,
        ``'urls_entered_count'``, ``'urls_disavowed_count'``,
        ``'unique_urls_entered_count'`` and
        ``'domain_entries_entered_count'``.
    """
    parsed = import_from_file_contents(file_contents)

    raw_urls = parsed['urls']
    url_details = normalize_and_dedupe_with_counts(raw_urls)
    remaining_links = url_details.clean_urls

    raw_domains = parsed['domains']
    domain_details = normalize_and_dedupe_with_counts(raw_domains)
    domain_entries = subdomains(domain_details.clean_urls)

    # Links whose domain is already disavowed do not need their own entry.
    if domain_entries:
        remaining_links = apply_disavow(
            {"domains": raw_domains}, raw_urls)['non_disavowed']

    all_domains = set(domain_entries)

    if domain_limit:
        remaining_links, limit_domains = apply_domain_limit(
            remaining_links, domain_limit)
        all_domains.update(limit_domains)

        if domain_entries:
            # NOTE(review): the values returned here are never read, so
            # this call has no effect on the dictionary returned below
            # (all_domains was already filled in above). It looks like a
            # leftover from an abandoned "total domains disavowed" count -
            # confirm whether the de-duplicated values were meant to feed
            # 'domain_entries'.
            _unused_domains, _unused_new_domains = remove_redundant_domains(
                domain_entries, limit_domains)

    return {
        'domain_entries': list(all_domains),
        'url_entries': remaining_links,
        'urls_entered_count': url_details.urls_entered,
        'urls_disavowed_count': len(remaining_links),
        'unique_urls_entered_count': url_details.unique_urls_entered,
        'domain_entries_entered_count': domain_details.urls_entered,
    }