def urlscan_screenshot(url):
    # First, let's see if it's been indexed already.
    domain = tldextract.extract(url).fqdn
    print("\t\t[-] Looking up domain screenshot {}".format(domain))
    if lookup_screenshot(url):
        print("\t\t[-] Already screenshotted...")
        return
    else:
        print("\t\t[-] No screenshot yet, sending to urlscan...")
        urlscanio = "https://urlscan.io/api/v1/scan/"
        key = urlscanapikey
        headers = {"Content-Type": "application/json", "API-Key": key}
        data = {"url": url, "public": "on"}
        try:
            r = requests.post(urlscanio, headers=headers, json=data)
            save_screenshot(url, r.json()['result'])
        except Exception as e:
            print(e)
        time.sleep(2)

def get_title_for_classifier(self):
    title = self.soup.find("title")
    domain = tldextract.extract(self.current_link)
    return_string = domain.domain + ", " + re.sub(
        r'[^\w]', " ",
        title.get_text().strip().lower().replace(domain.domain, ""))
    return return_string

def __init__(self, url, config=None, **kwargs):
    """The config object for this source will be passed into all of this
    source's children articles unless specified otherwise or re-set.
    """
    if (url is None) or ('://' not in url) or (url[:4] != 'http'):
        raise Exception('Input url is bad!')

    self.config = config or Configuration()
    self.config = utils.extend_config(self.config, kwargs)
    self.extractor = self.config.extractor(self.config)

    self.url = url
    self.url = urls.prepare_url(url)

    self.domain = urls.get_domain(self.url)
    self.scheme = urls.get_scheme(self.url)

    self.categories = []
    self.feeds = []
    self.articles = []

    self.html = ''
    self.doc = None

    self.logo_url = ''
    self.favicon = ''
    self.brand = tldextract.extract(self.url).domain
    self.description = ''

    self.is_parsed = False
    self.is_downloaded = False

def enviar_noticias(resultado, id_chat, Nombre_Grupo, provincias, tema):
    try:
        url_api = "bot1477154971:AAHz2Ok9QD8bwzkAxIqqZc64GNPeqGjuRTI/sendMessage"
        temas = ''.join(tema)
        for m in resultado:
            titulo = m[1]
            linkNoticia = m[0]
            descripcion = m[2]
            contenido = m[3]
            extracted = tldextract.extract(linkNoticia)
            medio = "{}.{}".format(extracted.domain, extracted.suffix)
            medio = medio.replace(".com", "").replace(".ar", "")
            mensaje = ("Medio: " + medio + "\n\n" + " Última Noticia: " + titulo
                       + "\n\n" + " Ver más en ->" + linkNoticia)
            requests.post('https://api.telegram.org/' + url_api,
                          data={'chat_id': id_chat, 'text': mensaje})
            print(requests.status_codes)
            try:
                mycursor = mydb.cursor()
                sql = "INSERT INTO noticias_enviadas (link,tema,id_grupo) " \
                      "VALUES (%s, %s, %s) "
                val = (linkNoticia, temas, id_chat)
                mycursor.execute(sql, val)
                mydb.commit()
                print("insertó correctamente el link: " + linkNoticia + "")
            except Exception as e:
                print("El Link ya fue guardado: " + linkNoticia + "")
    except Exception as e:
        print(" 279 - enviar ", e)

def compare_to(self, compared_url):
    """
    Takes another website's URL, counts how many times this website's most
    common word occurs there, and prints the comparison in a table.
    """
    # find number of letters & most common word in current website
    num_of_letters, most_common_word = self.get_text_info()

    # find number of occurrences in the compared website
    text = self.parse(compared_url)
    compared_url_occur = text.count(most_common_word[0])

    # create a table and print data
    table = PrettyTable(['Description', 'Value'])
    table.add_row(['Number of Letters in ' + self.name, num_of_letters])
    table.add_row([
        'Most Common Word in ' + self.name,
        "The word '" + str(most_common_word[0]) + "'" +
        " occurs " + str(most_common_word[1]) + " times"
    ])
    table.add_row([
        'Number of Occurrences in ' + tldextract.extract(compared_url).domain,
        str(compared_url_occur)
    ])
    print(table)

async def select_proxy(self, url: str) -> str:
    if not self._redis_client:
        await self.setup()
    await self.cleanup()
    domain = tldextract.extract(url).domain
    while True:
        free_slots = await asyncio.gather(*[
            proxy.get_number_of_free_slots(domain, self._redis_client)
            for proxy in self._proxies
        ])
        slots_and_proxies = list(zip(free_slots, self._proxies))
        shuffle(slots_and_proxies)
        proxies = sorted(slots_and_proxies, key=lambda d: d[0], reverse=True)
        if proxies[0][0] > 0:
            return await proxies[0][1].get_url(domain, self._redis_client)
        else:
            # No proxy available right now. Wait.
            await asyncio.sleep(5)

async def resolve(self, domain):
    if not self.bootstraped:
        await self.bootstrap()
    tld = tldextract.extract(domain).suffix
    apex_domain = tldextract.extract(domain).registered_domain
    try:
        authorative_server = self.server_list[tld]
        async with aiohttp.ClientSession() as session:
            res = await self._fetch_json(
                session, "%sdomain/%s" % (authorative_server, apex_domain))
            if res:
                entry = RdapDomainEntry.from_rdap_response(res)
                return entry
            else:
                raise LookupError('no valid rdap data received')
    except KeyError:
        raise LookupError("No rdap server found for domain: %s" % domain)

def update_json_information(self, site):
    time.sleep(3)
    print("Updating json with recording information")
    with open(site["json_path"], "r") as f:
        json_data = json.loads(f.read())

    # Generate the eTLD+1 list
    # Bug 1585598 - Validate list of sites used for testing Fission
    etdl = []
    for item in json_data["http_protocol"].keys():
        base_url = ".".join(item.split(".")[-2:])
        if base_url not in etdl:
            etdl.append(base_url)

    self.information["etdl"] = etdl
    self.information["proxy"] = self.proxy
    self.information["url"] = site["url"]
    self.information["domain"] = tldextract.extract(site["url"]).domain
    self.information["label"] = site.get("label")

    json_data["info"] = self.information

    with open(site["json_path"], "w") as f:
        f.write(json.dumps(json_data, sort_keys=True, indent=2))

def getTargetValidityProblems(self):
    """Verify that each target has a valid TLD in order to prevent the
    problematic rewrites described in EFForg/https-everywhere/issues/10877.
    In particular, right-wildcard targets are excluded from this test.

    Returns an array of strings reporting any coverage problems if they
    exist, or an empty list if coverage is sufficient.
    """
    problems = self._determineTestApplication()

    # Next, make sure each target has a valid TLD and doesn't overlap with others
    for target in self.targets:
        # If it's a wildcard, check which other targets it covers
        if '*' in target:
            target_re = regex.escape(target)
            if target_re.startswith(r'\*'):
                target_re = target_re[2:]
            else:
                target_re = r'\A' + target_re
            target_re = regex.compile(
                target_re.replace(r'\*', r'[^.]*') + r'\Z')
            others = [other for other in self.targets
                      if other != target and target_re.search(other)]
            if others:
                problems.append("%s: Target '%s' also covers %s" % (
                    self.filename, target, others))

        # Ignore right-wildcard targets for TLD checks
        if target.endswith(".*"):
            continue

        # Ignore if target is an ipv4 address
        try:
            socket.inet_aton(target)
            continue
        except:
            pass

        # Ignore if target is an ipv6 address
        try:
            socket.inet_pton(socket.AF_INET6, target)
            continue
        except:
            pass

        # Extract TLD from target if possible
        res = tldextract.extract(target)
        if res.suffix == "":
            problems.append("%s: Target '%s' missing eTLD" % (
                self.filename, target))
        elif res.domain == "":
            problems.append("%s: Target '%s' containing entire eTLD" % (
                self.filename, target))

    return problems

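# Illustrative tldextract results behind the two eTLD checks above (the first
# target is a made-up example; "co.uk" is a real public suffix):
#   tldextract.extract("example.notarealtld") -> suffix == ""  ("missing eTLD")
#   tldextract.extract("co.uk")               -> domain == ""  ("containing entire eTLD")
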
def is_polite(self, url):
    ext = tldextract.extract(url)
    # Join only the non-empty parts so a missing subdomain does not produce a
    # leading dot (e.g. ".example.com").
    URL = ".".join(part for part in (ext.subdomain, ext.domain, ext.suffix) if part)
    rp = urllib.robotparser.RobotFileParser()
    URL = "https://" + URL + "/robots.txt"
    rp.set_url(URL)
    rp.read()
    flag = rp.can_fetch("*", url)
    return flag

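# A minimal standalone sketch of the same robots.txt check for use outside the
# class above; the function name can_fetch and the default user agent are
# illustrative choices, not part of the original code.
import urllib.robotparser

import tldextract


def can_fetch(url, user_agent="*"):
    # Rebuild the host from the tldextract parts, skipping empty pieces.
    ext = tldextract.extract(url)
    host = ".".join(part for part in (ext.subdomain, ext.domain, ext.suffix) if part)
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url("https://" + host + "/robots.txt")
    rp.read()
    return rp.can_fetch(user_agent, url)
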
async def register_status_code(self, url: str, status_code: int, proxy_url: str):
    domain = tldextract.extract(url).domain
    for proxy in self._proxies:
        if proxy._url == proxy_url:
            await proxy.register_status_code(status_code, domain, self._redis_client)
            break

def get_hostname(hostname):
    h_data = tldextract.extract(hostname)
    if h_data.subdomain:
        hostname = h_data.subdomain
    elif h_data.registered_domain:
        hostname = h_data.registered_domain
    return hostname

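# Illustrative behaviour of get_hostname (the example hostnames are made up):
#   get_hostname("mail.example.com") -> "mail"         (subdomain is preferred)
#   get_hostname("example.com")      -> "example.com"  (falls back to the registered domain)
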
def lookup_screenshot(url):
    domain = tldextract.extract(url).fqdn
    if os.path.exists("screenshots/mapping.json"):
        f = open("screenshots/mapping.json", 'r')
        current = f.read()
        f.close()
        current = json.loads(current)
        return current.get(domain, False)
    else:
        return False

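# Sketch of the screenshots/mapping.json layout assumed by lookup_screenshot
# above and save_screenshot further down: a flat JSON object keyed by FQDN,
# whose value is whatever urlscan returned in its 'result' field (the example
# entry is illustrative).
#
#   {
#       "www.example.com": "https://urlscan.io/result/<uuid>/"
#   }
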
def get_domain(uri):
    """
    Given a URI, returns its domain (without "http://").
    :param uri:
    :return:
    """
    parsed_uri = urlparse(uri)
    netloc = '{uri.netloc}'.format(uri=parsed_uri)
    extract_result = tldextract.extract(netloc)
    domain = extract_result.domain + DOT + extract_result.suffix
    return domain.strip().replace("\"\"", "")

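# Illustrative result, assuming the module-level DOT constant is "." (the URI
# is a made-up example):
#   get_domain("http://www.example.co.uk/page") -> "example.co.uk"
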
def get_hosts_set():
    for num in range(len(URLS)):
        with open(f'ads_{num}.txt') as f:
            for line in f:
                if line.startswith('#') or ('0.0.0.0' not in line and '127.0.0.1' not in line):
                    continue
                domain = line.strip().split(' ')[1]
                extracted = tldextract.extract(domain)
                if extracted.suffix:
                    yield extracted

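# Assumed hosts-file line format parsed by get_hosts_set: an IP sink followed
# by the blocked domain (the entry below is a made-up example).
#
#   0.0.0.0 ads.example.com
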
def classify_links(self):
    self.same_domain_links = []
    self.sub_directory_links = []
    self.same_directory_links = []
    self.super_directory_links = []
    self.sub_domain_links = []
    self.super_domain_links = []
    self.sister_domain_links = []
    self.external_links = []

    dir_url = '/'.join(self.url.split('://')[1].split('/')[:-1])
    url_tld = extract(self.url)

    for link in self.all_links:
        link_tld = extract(link['href'])
        # NOTE: older tldextract releases exposed the public suffix as `.tld`;
        # current releases call it `.suffix`.
        if link_tld.suffix == url_tld.suffix and link_tld.domain == url_tld.domain:
            # internal link within the domain name e.g. bbc.co.uk
            if link_tld.subdomain == url_tld.subdomain:
                # we are at the same subdomain
                self.same_domain_links.append(link)
                if link['href'].startswith(dir_url):
                    remainder = link['href'].replace(dir_url, '')
                    r_segs = remainder.split('/')
                    if len(r_segs) == 2:
                        self.same_directory_links.append(link)
                    elif len(r_segs) > 2:
                        self.sub_directory_links.append(link)
                else:
                    self.super_directory_links.append(link)
            elif link_tld.subdomain.endswith(url_tld.subdomain):
                # this is a subdomain of the current_domain
                self.sub_domain_links.append(link)
            elif url_tld.subdomain.endswith(link_tld.subdomain):
                # this is a superdomain of the current domain
                self.super_domain_links.append(link)
            else:
                # this must be a sister domain of some kind
                self.sister_domain_links.append(link)
        else:
            # external link
            self.external_links.append(link)

def identify_short_url_by_suffix(self):
    """
    Check whether a URL's suffix is one of the common URL-shortener suffixes.
    :param url_to_tld_dict: URL:TLD dict extracted from the text content
    :return: URL:TLD dict after the suffix check
    """
    # URL:TLD dict of URLs that may be short links
    short_url_to_tld_dict = {}

    # Path to the project root directory
    rootPath = os.path.split(curPath)[0]
    # Absolute path to the list of URL-shortener provider domains, used to
    # collect the common short-link suffixes
    short_url_service_domain_list_path_absolute = rootPath + "/data/short_url_services_list.txt"
    short_url_service_domain_list = loads_file_from_txt_to_list(
        fpath=short_url_service_domain_list_path_absolute)

    # Set of domain suffixes used by common URL-shortener providers
    suffix_set = set()
    for url in short_url_service_domain_list:
        suffix = tldextract.extract(url)[2]
        suffix_set.add(suffix)

    # Generic suffixes are not treated as suspicious short-link suffixes
    common_suffix_list = ['com', 'net', 'info', 'org', '']
    for common_suffix in common_suffix_list:
        # discard() avoids a KeyError when a suffix is not in the set
        suffix_set.discard(common_suffix)
    # print("Set of common URL-shortener provider suffixes: {0}".format(suffix_set))

    for url in self.url_to_tld_dict.keys():
        flag = False
        input_url_suffix = tldextract.extract(self.url_to_tld_dict[url])[2]
        # TLD of the incoming URL
        tld = self.url_to_tld_dict[url]
        for suffix in suffix_set:
            if input_url_suffix == suffix:
                flag = True
                short_url_to_tld_dict[url] = tld
        if flag:
            self.judge_dict[url].append(True)
        else:
            self.judge_dict[url].append(False)

    return short_url_to_tld_dict

def get_base_url(url):
    """
    Takes a url as input and returns the subdomain (if any), domain and suffix
    concatenated to form the base url of the website. Uses the tldextract
    library.
    """
    tld = tldextract.extract(url)
    print(tld.subdomain, ' - ', tld.domain, ' - ', tld.suffix)
    if tld.subdomain != "":
        base_url = '.'.join([tld.subdomain, tld.domain, tld.suffix])
    else:
        base_url = '.'.join([tld.domain, tld.suffix])
    return base_url

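# Illustrative calls (the URLs are made-up examples):
#   get_base_url("https://blog.example.co.uk/post/1") -> "blog.example.co.uk"
#   get_base_url("https://example.com")               -> "example.com"
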
def get_domain(url):
    """
    Return the domain without any subdomain

    >>> get_domain("http://blog.example.com")
    'example.com'
    >>> get_domain("http://www.example.com")
    'example.com'
    >>> get_domain("http://deeper.blog.example.co.uk")
    'example.co.uk'
    """
    return ".".join(tldextract.extract(url)[-2:])

def get_open_page_rank(self, url):
    try:
        o = tldextract.extract(url)
        domain = ('%s.%s' % (o.domain, o.suffix))
        try:
            pginfo = self.page_rank.pg[domain]
        except KeyError:
            config.logger.warn('page rank information for domain [' + domain + '] not found')
            return MISSING_FEATURE * 2, True
        return [pginfo['page_rank_decimal'], pginfo['rank']], False
    except Exception as e:
        config.logger.error(repr(e))
        return MISSING_FEATURE * 2, True

def post_reformat(post):
    return [
        post.id,  # 0 post id
        Markup((post.title[:120] + '...') if len(post.title) > 120 else post.title),  # 1 post title
        Markup(post.url),  # 2 full url
        tldextract.extract(post.url).registered_domain,  # 3 domain only url
        Markup(post.username),  # 4 user name
        datetime.datetime.fromtimestamp(post.datetime).strftime('%c'),  # 5 tooltip time
        human_datetime(post.datetime),  # 6 time since post
        post.tags,  # 7 tags
        len(post.votes.split(';')),  # 8 votes
        len(post.reports.split(';')),  # 9 reports
        int(time.time() - post.datetime)  # 10 epoch seconds
    ]

def save_screenshot(url, urlscanurl):
    domain = tldextract.extract(url).fqdn
    if os.path.exists("screenshots/mapping.json"):
        f = open("screenshots/mapping.json", 'r')
        current = f.read()
        f.close()
        current = json.loads(current)
        current[domain] = urlscanurl
        f = open("screenshots/mapping.json", 'w')
        f.write(json.dumps(current))
        f.close()
    else:
        current = {domain: urlscanurl}
        f = open("screenshots/mapping.json", 'w')
        f.write(json.dumps(current))
        f.close()

def get_url_info(url, guess_and_check, max_depth):
    # Create a new Link instance
    link = Link()
    # Save the extracted URL
    link.url = url
    # Remove markdown and other artifacts from the URL
    link.url_clean = remove_markdown(url)
    # Check if the clean URL is valid, if so continue with the next steps
    link.url_clean_is_valid = check_if_valid_url(link.url_clean)
    if link.url_clean_is_valid:
        link.is_amp = check_if_amp(link.url_clean)
        if link.is_amp:
            link.is_cached = check_if_cached(link.url_clean)
            link.domain = tldextract.extract(link.url_clean).domain
            link = get_canonical(link, guess_and_check, max_depth)
    return link

def is_valid_url(url):
    extract = tldextract.extract(url)
    if not extract.suffix:
        return False

    # Validators catches 'most' invalid urls, but there are some issues and exceptions
    # that are not really likely to cause any major issues in our software. The other
    # alternative is another library with other quirks.
    # see: https://github.com/kvesteri/validators/
    # Note that this library does not account for 'idna' / punycode encoded domains,
    # so you have to convert them yourself. Luckily:
    # 'аренда.орг' -> 'xn--80aald4bq.xn--c1avg'
    # 'google.com' -> 'google.com'
    valid_domain = domain(url.encode('idna').decode())
    if valid_domain is not True:
        return False

    return True

def handle(self, *args, **options):
    imported_domains = 0
    already_existing_domains = 0
    file_path = options.get('file', None)

    with open(file_path, newline='') as file:
        line = file.readline()
        while line:
            # Strip the trailing newline so tldextract sees a clean hostname.
            url = line.strip()
            line = file.readline()

            if not urlparse(url).scheme:
                # Prepend a scheme to the current url so it parses consistently.
                url = 'http://' + url

            # This uses a list of public suffixes
            tld_extract_result = tldextract.extract(url)
            if tld_extract_result.subdomain:
                url_full_domain = '.'.join([
                    tld_extract_result.subdomain,
                    tld_extract_result.domain,
                    tld_extract_result.suffix
                ])
            elif tld_extract_result.suffix:
                url_full_domain = '.'.join(
                    [tld_extract_result.domain, tld_extract_result.suffix])
            else:
                url_full_domain = tld_extract_result.domain

            if ProxyBlacklistedDomain.objects.filter(
                    domain=url_full_domain).exists():
                logger.info('Domain %s already exists, not importing.' % url_full_domain)
                already_existing_domains += 1
                continue

            ProxyBlacklistedDomain.objects.create(domain=url_full_domain)
            logger.info('Imported domain %s successfully.' % url_full_domain)
            imported_domains += 1

    logger.info(
        'Finished. Imported %d domains and skipped %d as they already existed'
        % (imported_domains, already_existing_domains))

def get_url_info(url, use_gac, max_depth) -> Link:
    link = Link(canonicals=[])
    origin = UrlMeta(url=remove_markdown(url))
    origin.is_valid = check_if_valid_url(origin.url)
    origin.is_amp = check_if_amp(origin.url) and not any(
        map(origin.url.__contains__, static.DENYLISTED_DOMAINS))
    if origin.is_valid:
        if origin.is_amp:
            origin.is_cached = check_if_cached(origin.url)
            origin.domain = tldextract.extract(origin.url).domain
            link.origin = origin
            link = get_canonicals(link=link, max_depth=max_depth, use_gac=use_gac)
    return link

def valid_image_url(url):
    url = clean_url(url)
    if not url.startswith(('http://', 'https://')):
        return False

    path = urlparse(url).path

    # input url is not in valid form (scheme, netloc, tld)
    if not path.startswith('/'):
        return False

    # the '/' which may exist at the end of the url provides us no information
    if path.endswith('/'):
        path = path[:-1]

    # '/story/cnn/blahblah/index.html' --> ['story', 'cnn', 'blahblah', 'index.html']
    path_chunks = [x for x in path.split('/') if len(x) > 0]

    # siphon out the file type. eg: jpeg, png
    if len(path_chunks) > 0:
        file_type = urls.url_to_filetype(url)

        # if the file type is not an allowed media type, reject instantly
        if file_type and file_type not in ALLOWED_TYPES:
            return False

        last_chunk = path_chunks[-1].split('.')
        # the file type is not of use to us anymore, remove it from the url
        if len(last_chunk) > 1:
            path_chunks[-1] = last_chunk[-2]

    # extract the tld (top level domain)
    tld_dat = tldextract.extract(url)
    tld = tld_dat.domain.lower()

    if tld in urls.BAD_DOMAINS:
        return False

    for d in BAD_URLS:
        if d in url:
            return False

    return True

def create_tenant(domain='', **extra_fields):
    extra_fields.setdefault('name', '')
    extra_fields.setdefault('country', '')
    extra_fields.setdefault('currency', '')

    if domain and not freemail.is_free(domain):
        # We can guess some field values based on the domain.
        tld = tldextract.extract(domain)
        geo_ip = GeoIP2()

        if not extra_fields['name']:
            # Use the domain of the email address as tenant name.
            extra_fields['name'] = tld.domain.title()

        if not extra_fields['country']:
            try:
                country_code = geo_ip.country(
                    tld.registered_domain).get('country_code')
            except (gaierror, AddressNotFoundError):
                pass
            else:
                if country_code in [c[0] for c in COUNTRIES]:
                    extra_fields['country'] = country_code

        if extra_fields['country'] and not extra_fields['currency']:
            currency = get_territory_currencies(
                extra_fields['country'])[-1]
            if currency in [c[0] for c in CURRENCIES]:
                extra_fields['currency'] = currency

    if settings.BILLING_ENABLED:
        # Chargebee needs extra info on who to bill, so for now only create the plans
        # without activating the trial.
        plan, created = Plan.objects.get_or_create(
            name=settings.CHARGEBEE_PRO_TRIAL_PLAN_NAME)
        billing = Billing.objects.create(plan=plan)
    else:
        billing = Billing.objects.create()

    tenant = Tenant.objects.create(billing=billing, **extra_fields)
    create_defaults_for_tenant(tenant)

    return tenant

def enviar_noticias(arr, id_chat, Nombre_Grupo, provincias, tema):
    try:
        url_api = "bot1477154971:AAHz2Ok9QD8bwzkAxIqqZc64GNPeqGjuRTI/sendMessage"
        temas = ''.join(tema)
        for m in arr:
            linkPortal = m[1]
            linkNoticia = m[0]
            mycursor = mydb.cursor()
            #sql = "SELECT * FROM noticias_enviadas WHERE link = '"+str(linkNoticia)+"'" +""+""
            #mycursor.execute(sql)
            #records = cursor.fetchall()
            #if records == []:
            extracted = tldextract.extract(linkPortal)
            medio = "{}.{}".format(extracted.domain, extracted.suffix)
            medio = medio.replace(".com", "").replace(".ar", "")
            linktitulo = requests.get(m[0]).text
            titulo = funcion_BuscaTitulo(linktitulo)
            if isinstance(titulo, str):
                mensaje = ("Medio: " + medio + "\n\n" + " Última Noticia: " + titulo
                           + "\n\n" + " Ver más en ->" + linkNoticia)
                requests.post('https://api.telegram.org/' + url_api,
                              data={'chat_id': id_chat, 'text': mensaje})
                print(requests.status_codes)
                try:
                    mycursor = mydb.cursor()
                    sql = "INSERT INTO noticias_enviadas (link,tema,id_grupo) " \
                          "VALUES (%s, %s, %s) "
                    val = (linkNoticia, temas, id_chat)
                    mycursor.execute(sql, val)
                    mydb.commit()
                    print("insertó correctamente el link: " + linkNoticia + "")
                except Exception as e:
                    print("El Link ya fue guardado: " + linkNoticia + "")
    except Exception as e:
        print(" 279 - enviar ", e)

def set_url(self, url):
    url = url.strip()
    res = extract(url)

    segs = []
    # NOTE: older tldextract releases exposed the public suffix as `.tld`;
    # current releases call it `.suffix`.
    for seg in [res.subdomain, res.domain, res.suffix]:
        if seg:
            segs.append(seg)
    domain_str = '.'.join(segs)

    domain = self.get_domain_entry(domain_str)
    if not domain:
        # create the domain guessing the values
        domain = Domain(url=domain_str)
        domain.save()

    setattr(self, 'domain_name', domain_str)
    setattr(self, 'domain_data', domain)
    setattr(self, 'link', url.split(domain_str, 1)[1])

    self.get_soup()
    self.set_title()
    self.set_date()
    self.set_content()