def __init__(self):
    """
    Build the rule table from every TLD document.

    ``self.rules`` maps the last dot-separated label of each stored
    ``tld`` value to a list of compiled regular expressions, one per
    stored value, in the order the documents are returned. The pattern
    text for each value is produced by ``self.regexpize``.
    """
    self.rules = {}
    for entry in TLD.objects():
        last_label = entry.tld.split('.')[-1]
        # Create the bucket on first sight of this label, then add to it.
        bucket = self.rules.setdefault(last_label, [])
        bucket.append(re.compile(self.regexpize(entry.tld)))
def populate_tlds(drop):
    """
    Populate default set of TLDs into the system.

    The TLD collection is always dropped and rebuilt from
    ``extras/effective_tld_names.dat`` (resolved relative to
    ``settings.SITE_ROOT``). Blank lines and lines starting with ``//``
    are skipped; every other line is upserted as a TLD.

    :param drop: Drop the existing collection before trying to populate.
                 The collection is dropped regardless of this value; a
                 falsy value only prints a notice saying so.
    :type drop: boolean
    """
    if not drop:
        print("Drop protection does not apply to effective TLDs")
    TLD.drop_collection()
    path = os.path.join(settings.SITE_ROOT, '..', 'extras',
                        'effective_tld_names.dat')
    count = 0
    # Use a context manager so the handle is closed even if an upsert
    # raises, and iterate lazily instead of loading the whole file.
    with open(path, 'r') as handle:
        for line in handle:
            line = line.strip()
            if line and not line.startswith('//'):
                TLD.objects(tld=line).update_one(set__tld=line, upsert=True)
                count += 1
    print("Effective TLDs: added %s TLDs!" % count)
def update_tlds(data=None):
    """
    Update the TLD list in the database.

    Each line read from ``data`` is right-stripped; blank lines and lines
    starting with ``//`` are skipped, and every other line is upserted as
    a TLD. Afterwards the module-level ``tld_parser`` is rebound to a
    freshly constructed ``etld`` instance.

    :param data: The TLD data.
    :type data: file handle.
    :returns: dict with key "success" (boolean). ``success`` is False when
              ``data`` is missing or falsy, True otherwise.
    """
    # Without this declaration the assignment below would only create a
    # local that is thrown away on return, leaving the module-level
    # parser stale.
    # NOTE(review): assumes this module keeps its parser under the name
    # ``tld_parser``, as the comment below states -- confirm.
    global tld_parser

    if not data:
        return {'success': False}
    line = data.readline()
    while line:
        line = line.rstrip()
        if line and not line.startswith('//'):
            TLD.objects(tld=line).update_one(set__tld=line, upsert=True)
        line = data.readline()
    # Update the package local tld_parser with the new domain info
    tld_parser = etld()
    return {'success': True}
def extract_domains(data):
    """
    Extract domain names from free text.

    A candidate is one or more dot-terminated labels followed by a final
    label of two or more letters. A candidate is kept only when its final
    label exists as a ``tld`` value in the TLD collection.

    :param data: The text to search.
    :type data: str
    :returns: list of matching domain strings in order of appearance
              (duplicates are kept).
    """
    # A label: starts and ends with an alphanumeric, hyphens allowed in
    # between, at most 63 characters.
    label = r'[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?'
    # The previous pattern ended in a malformed character class that
    # matched any run of dots, '[' and letters. That swallowed a trailing
    # sentence period (so "example.com." produced an empty TLD and was
    # dropped) and cut the match at the first digit or hyphen in a later
    # label. Spell out the label/dot structure explicitly instead.
    pattern = r'(?:' + label + r'\.)+[a-zA-Z]{2,}'

    final_domains = []
    tld_known = {}  # per-call cache: one database lookup per distinct TLD
    for item in re.findall(pattern, data):
        tld = item.split(".")[-1]
        if tld not in tld_known:
            try:
                tld_known[tld] = bool(TLD.objects(tld=tld).first())
            except Exception:
                # Best effort: a failed lookup skips this candidate
                # rather than aborting the whole extraction.
                continue
        if tld_known[tld]:
            final_domains.append(item)
    return final_domains
def extract_emails(data):
    """
    Extract e-mail addresses from free text.

    A candidate is kept only when the text after its last dot exists as a
    ``tld`` value in the TLD collection.

    :param data: The text to search.
    :type data: str
    :returns: list of matching address strings in order of appearance
              (duplicates are kept).
    """
    # A domain label: starts and ends with an alphanumeric, hyphens
    # allowed in between.
    label = r'[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?'
    # The previous pattern ended in a malformed character class that
    # matched any run of dots, '[' and letters. That swallowed a trailing
    # sentence period (so "a@example.com." produced an empty TLD and was
    # dropped) and cut the match at the first digit or hyphen in a later
    # label. Spell out "zero or more .label, then .alphabetic-TLD".
    # NOTE(review): the '.' right after '@' accepts ANY character as the
    # first character of the domain and forces the first label to be at
    # least two characters long; kept as-is -- confirm whether intended.
    pattern = (r'[a-zA-Z0-9-\.\+]+@.' + label +
               r'(?:\.' + label + r')*\.[a-zA-Z]{2,}')

    final_emails = []
    tld_known = {}  # per-call cache: one database lookup per distinct TLD
    for item in re.findall(pattern, data):
        tld = item.split(".")[-1]
        if tld not in tld_known:
            try:
                tld_known[tld] = bool(TLD.objects(tld=tld).first())
            except Exception:
                # Best effort: a failed lookup skips this candidate
                # rather than aborting the whole extraction.
                continue
        if tld_known[tld]:
            final_emails.append(item)
    return final_emails
def extract_emails(data):
    """
    Extract the distinct e-mail addresses found in free text.

    A candidate is kept only when the text after its last dot exists as a
    ``tld`` value in the TLD collection.

    :param data: The text to search.
    :type data: str
    :returns: sorted list of unique matching address strings.
    """
    # A domain label: starts and ends with an alphanumeric, hyphens
    # allowed in between.
    label = r'[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?'
    # The previous pattern allowed exactly one label before the final
    # alphabetic label, so "user@example.co.uk" was returned truncated as
    # "user@example.co" and "user@mail.example.com" was dropped. Allow any
    # number of intermediate labels before the TLD.
    # NOTE(review): the '.' right after '@' accepts ANY character as the
    # first character of the domain and forces the first label to be at
    # least two characters long; kept as-is -- confirm whether intended.
    pattern = (r'[a-zA-Z0-9-\.\+]+@.' + label +
               r'(?:\.' + label + r')*\.[a-zA-Z]{2,}')

    unique_emails = set()  # set membership instead of scanning a list
    tld_known = {}  # per-call cache: one database lookup per distinct TLD
    for item in re.findall(pattern, data):
        if item in unique_emails:
            continue
        tld = item.split(".")[-1]
        if tld not in tld_known:
            try:
                tld_known[tld] = bool(TLD.objects(tld=tld).first())
            except Exception:
                # Best effort: a failed lookup skips this candidate
                # rather than aborting the whole extraction.
                continue
        if tld_known[tld]:
            unique_emails.add(item)
    return sorted(unique_emails)