def apply_domain_limit(entries, domain_limit):
    """
    Split sanitised URLs into those to keep and the domains to disavow.

    Every domain (as returned by subdomains()) that occurs at least
    `domain_limit` times among the entries is collected as an exceeded
    domain. An entry is kept only when neither its subdomain nor its root
    domain is among the exceeded domains.

    Returns a named tuple with two sets: `remaining_urls` (the entries
    that were kept) and `exceeded_domains` (the domains that reached the
    limit).
    """
    domain_names = subdomains(entries)

    # Tally how often each domain occurs.
    tally = {}
    for name in domain_names:
        tally[name] = tally.get(name, 0) + 1

    # A domain is over the limit as soon as its tally reaches the limit.
    over_limit = {
        name for name in domain_names if tally[name] >= domain_limit
    }

    # Keep the entries that are not covered by an over-limit domain.
    kept_urls = {
        entry
        for entry in entries
        if not (subdomain(entry) in over_limit
                or rootdomain(entry) in over_limit)
    }

    result_type = namedtuple('applied_domain_limit',
                             'remaining_urls exceeded_domains')
    return result_type(kept_urls, over_limit)
def remove_redundant_domains(old_domains, new_domains):
    """
    Drop domain entries that clash between the old and the new domains.

    An old domain is kept only when its root domain is not one of the new
    domains; a new domain is kept only when it is not already one of the
    old domains.

    Returns a tuple of two lists: (kept old domains, kept new domains).
    Both lists are built from sets, so they hold no duplicates and their
    order is not defined.
    """
    kept_old = {
        domain for domain in old_domains
        if rootdomain(domain) not in new_domains
    }
    kept_new = {
        domain for domain in new_domains
        if domain not in old_domains
    }
    return (list(kept_old), list(kept_new))
def combine_with_original_disavow(file_contents, disavow_entries):
    """
    Rebuild a disavow file from its original text and the processed entries.

    Walks the original file line by line so that the order of the entries
    and the comments are preserved. For each line it emits either the
    cleaned entry or a '#' comment explaining why the entry was replaced.

    Args:
        file_contents: the text of the original disavow file.
        disavow_entries: mapping with the keys 'domain_entries' and
            'url_entries', holding the domains and the links that should
            stay disavowed (presumably the output of
            disavow_file_to_dict() - confirm against the caller).

    Returns:
        A list of output lines (strings without line terminators).
    """
    output = []
    urls_encountered = set()
    domains_encountered = set()

    for raw_entry in file_contents.splitlines():
        # Blank and whitespace-only lines are not carried over.
        if not raw_entry or raw_entry.isspace():
            continue

        # Strip quotes for lines wrapped in quotes
        if raw_entry.startswith('"') and raw_entry.endswith('"'):
            raw_entry = raw_entry[1:-1]
            if not raw_entry:
                # The line was only '"' or '""': nothing is left to
                # process, so treat it like a blank line instead of
                # failing on an empty string further down.
                continue

        if raw_entry.startswith('#'):
            # line is a comment, so we just keep it
            output.append(raw_entry)
            continue

        if raw_entry.startswith('domain:'):
            # line is a domain entry; a falsy result from normalize()
            # marks the entry as invalid
            domain_normalized = normalize(raw_entry[7:])
            if not domain_normalized:
                output.append('# invalid entry - ' + raw_entry)
                continue

            clean_domain = subdomain(domain_normalized)
            if clean_domain not in disavow_entries['domain_entries']:
                # The domain is not among the entries to keep; it is
                # dropped without a replacement line.
                continue

            if clean_domain not in domains_encountered:
                output.append('domain:' + clean_domain)
                domains_encountered.add(clean_domain)
            else:
                output.append(
                    '# domain entry already present - ' + clean_domain)
            continue

        # line is a url entry
        url_normalized = normalize(raw_entry)
        if not url_normalized:
            output.append('# invalid entry - ' + raw_entry)
            continue

        url_subdomain = subdomain(url_normalized)
        url_rootdomain = rootdomain(url_normalized)

        # A link covered by a domain entry is replaced by that domain
        # entry the first time the domain shows up, and by a comment
        # afterwards. The subdomain is checked before the root domain.
        if url_subdomain in disavow_entries['domain_entries']:
            if url_subdomain not in domains_encountered:
                domains_encountered.add(url_subdomain)
                output.append('domain:' + url_subdomain)
            else:
                output.append(
                    '# link now disavowed via new domain entry - '
                    + raw_entry)
        elif url_rootdomain in disavow_entries['domain_entries']:
            if url_rootdomain not in domains_encountered:
                domains_encountered.add(url_rootdomain)
                output.append('domain:' + url_rootdomain)
            else:
                output.append(
                    '# link now disavowed via new domain entry - '
                    + raw_entry)
        elif url_normalized in disavow_entries['url_entries']:
            if url_normalized not in urls_encountered:
                output.append(url_normalized)
                urls_encountered.add(url_normalized)
            else:
                output.append('# link entry already present')
        else:
            output.append(
                '# error occurred, not sure what to do with this - '
                + raw_entry)

    return output
def combine_with_original_disavow(file_contents, disavow_entries):
    """
    Rebuild a disavow file from its original text and the processed entries.

    Walks the original file line by line so that the order of the entries
    and the comments are preserved. For each line it emits either the
    cleaned entry or a '#' comment explaining why the entry was replaced.

    Args:
        file_contents: the text of the original disavow file.
        disavow_entries: mapping with the keys 'domain_entries' and
            'url_entries', holding the domains and the links that should
            stay disavowed (presumably the output of
            disavow_file_to_dict() - confirm against the caller).

    Returns:
        A list of output lines (strings without line terminators).
    """
    output = []
    urls_encountered = set()
    domains_encountered = set()

    for raw_entry in file_contents.splitlines():
        # Blank and whitespace-only lines are not carried over.
        if not raw_entry or raw_entry.isspace():
            continue

        # Strip quotes for lines wrapped in quotes
        if raw_entry.startswith('"') and raw_entry.endswith('"'):
            raw_entry = raw_entry[1:-1]
            if not raw_entry:
                # The line was only '"' or '""': nothing is left to
                # process, so treat it like a blank line instead of
                # failing on an empty string further down.
                continue

        if raw_entry.startswith('#'):
            # line is a comment, so we just keep it
            output.append(raw_entry)
            continue

        if raw_entry.startswith('domain:'):
            # line is a domain entry; a falsy result from normalize()
            # marks the entry as invalid
            domain_normalized = normalize(raw_entry[7:])
            if not domain_normalized:
                output.append('# invalid entry - ' + raw_entry)
                continue

            clean_domain = subdomain(domain_normalized)
            if clean_domain not in disavow_entries['domain_entries']:
                # The domain is not among the entries to keep; it is
                # dropped without a replacement line.
                continue

            if clean_domain not in domains_encountered:
                output.append('domain:' + clean_domain)
                domains_encountered.add(clean_domain)
            else:
                output.append(
                    '# domain entry already present - ' + clean_domain)
            continue

        # line is a url entry
        url_normalized = normalize(raw_entry)
        if not url_normalized:
            output.append('# invalid entry - ' + raw_entry)
            continue

        url_subdomain = subdomain(url_normalized)
        url_rootdomain = rootdomain(url_normalized)

        # A link covered by a domain entry is replaced by that domain
        # entry the first time the domain shows up, and by a comment
        # afterwards. The subdomain is checked before the root domain.
        if url_subdomain in disavow_entries['domain_entries']:
            if url_subdomain not in domains_encountered:
                domains_encountered.add(url_subdomain)
                output.append('domain:' + url_subdomain)
            else:
                output.append(
                    '# link now disavowed via new domain entry - '
                    + raw_entry)
        elif url_rootdomain in disavow_entries['domain_entries']:
            if url_rootdomain not in domains_encountered:
                domains_encountered.add(url_rootdomain)
                output.append('domain:' + url_rootdomain)
            else:
                output.append(
                    '# link now disavowed via new domain entry - '
                    + raw_entry)
        elif url_normalized in disavow_entries['url_entries']:
            if url_normalized not in urls_encountered:
                output.append(url_normalized)
                urls_encountered.add(url_normalized)
            else:
                output.append('# link entry already present')
        else:
            output.append(
                '# error occurred, not sure what to do with this - '
                + raw_entry)

    return output