def fetch_meta_attributes(input_html):
    """Collect title/keyword/description meta tags from a news article's HTML.

    Which meta attributes are read depends on the publisher, looked up via the
    domain of the page's og:url tag.
    """
    meta_content = {}
    soup_obj = BeautifulSoup(input_html, 'lxml')
    # NOTE: 'indiatimes' and 'afternoondc' appear twice below; the later entries
    # silently override the earlier ones.
    site_lookup_dict = {
        'thequint': [{'property': 'og:title'}, {'name': ['keywords', 'title', 'description']}],
        'indiatimes': [{'name': ['keywords', 'title', 'description']}, {'property': 'og:title'}],
        'afternoondc': [{'property': 'og:title'}, {'name': ['keywords', 'title', 'description']}],
        'dailyo': [{'property': 'og:title'}, {'name': ['keywords', 'title', 'description']}],
        'qz': [{'property': 'og:title'}, {'name': ['title', 'news_keywords', 'description']}],
        'scroll': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'dnaindia': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'firstpost': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'thehansindia': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'thehindu': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'thehindubusinessline': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'hindustantimes': [{'property': ['og:title', 'og:description']}, {'name': 'keywords'}],
        'indianexpress': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'mid-day': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'livemint': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'orissapost': [{'property': ['og:title', 'og:description']}, {'name': ['title', 'keywords', 'description']}],
        'dailypioneer': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'sundayguardianlive': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],  # NO keywords
        'thestatesman': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'telegraphindia': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],  # NO keywords
        'tribuneindia': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'kashmirreader': [{'property': ['og:title', 'og:description']}, {'name': ['title', 'keywords', 'description']}],  # NO keywords
        'asianage': [{'property': ['og:title', 'og:description']}, {'name': 'Keywords'}],
        'bangaloremirror': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'newindianexpress': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'deccanchronicle': [{'property': ['og:title', 'og:description']}, {'name': ['Keywords']}],
        'deccanherald': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'economictimes': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'financialexpress': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'mydigitalfc': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'afternoondc': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'business-standard': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'speakingtree': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'cricbuzz': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'indiatimes': [{'name': ['keywords', 'title', 'description']}, {'property': 'og:title'}],
        'femina': [{'name': ['keywords', 'title', 'description']}, {'property': 'og:title'}]
    }
    try:
        input_url = soup_obj.find('meta', {'property': 'og:url'}).get('content')
        for curr_attr_map in site_lookup_dict[tld_extract(input_url).domain]:
            if type(curr_attr_map) is dict:
                meta_map = curr_attr_map
            else:
                meta_map = {curr_attr_map: site_lookup_dict[tld_extract(input_url).domain][curr_attr_map]}
            meta_tags = soup_obj.find_all('meta', meta_map)
            for curr_tag in meta_tags:
                tag_content = curr_tag.get('content')
                if curr_tag.get('name'):
                    # <meta name=...> tags are keyed by their lowercased name
                    meta_content[curr_tag.get('name').lower()] = tag_content
                else:
                    # Open Graph tags are keyed by their property, minus the 'og:' prefix
                    meta_content[curr_tag.get('property').replace('og:', '').lower()] = tag_content
    except (KeyError, TypeError, AttributeError, UnboundLocalError):
        pass
    return meta_content
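
# A minimal usage sketch (illustrative, not from the original source). It assumes
# BeautifulSoup ('bs4'), the lxml parser and tldextract (aliased as tld_extract above)
# are importable, and uses a made-up HTML fragment for a domain that is present in
# the lookup table.
def _demo_fetch_meta_attributes():
    sample_html = """
    <html><head>
      <meta property="og:url" content="https://www.thehindu.com/news/sample-article.ece"/>
      <meta property="og:title" content="Sample headline"/>
      <meta name="keywords" content="sample, demo"/>
      <meta name="description" content="A short description."/>
    </head><body></body></html>
    """
    # Expected (roughly): {'title': 'Sample headline',
    #                      'keywords': 'sample, demo',
    #                      'description': 'A short description.'}
    return fetch_meta_attributes(sample_html)
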
def get_domain_parts(domain: str) -> TldExtractResult:
    """Wrapper for calling tldextract, as it logs things about file locks we don't care about."""
    logger = logging.getLogger()
    level = logger.getEffectiveLevel()
    logger.setLevel(logging.ERROR)
    domain_parts: TldExtractResult = tld_extract(domain)
    logging.getLogger().setLevel(level)
    return domain_parts
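
# Hedged example (not from the original source): assuming tld_extract is
# tldextract.extract, the returned result exposes .subdomain, .domain and .suffix;
# the exact split depends on tldextract's bundled public suffix list.
def _demo_get_domain_parts():
    parts = get_domain_parts("https://sensors.example.co.uk/api")
    # parts.subdomain == 'sensors', parts.domain == 'example', parts.suffix == 'co.uk'
    return parts
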
def naked_domain(url):
    # Extracts the host part of a URL: any subdomain plus the registered domain.
    # Works indiscriminately on full URLs or plain domains.
    res = tld_extract(url)
    if not res.subdomain or res.subdomain == '':
        return res.registered_domain
    else:
        return ".".join([res.subdomain, res.registered_domain])
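
# Illustrative sketch: unlike parent_domain further below, naked_domain keeps any
# subdomain. The example hostnames are made up; results assume tldextract's
# public suffix data.
def _demo_naked_domain():
    assert naked_domain("https://example.com/page") == "example.com"
    assert naked_domain("sports.au.yahoo.com") == "sports.au.yahoo.com"
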
def build_ea_scheme_and_naming_authority(
    host: str, host_auth_start_month: Optional[str] = None
) -> str:
    """
    This function creates the host identification part of USEF's EA1 addressing scheme,
    so everything but the locally unique string.

    If not given nor configured, host_auth_start_month is the start of the next month for
    localhost.
    """
    domain_parts: TldExtractResult = tld_extract(host)
    if host_auth_start_month is None:
        config_var_domain_key = ".".join(
            filter(
                lambda x: x,
                [domain_parts.subdomain, domain_parts.domain, domain_parts.suffix],
            )
        )
        if config_var_domain_key in current_app.config.get(
            "FLEXMEASURES_HOSTS_AND_AUTH_START", {}
        ):
            host_auth_start_month = current_app.config.get(
                "FLEXMEASURES_HOSTS_AND_AUTH_START", {}
            )[config_var_domain_key]
        elif domain_parts.domain in ("localhost", "127.0.0.1"):
            host_auth_start_month = get_first_day_of_next_month().strftime("%Y-%m")
        else:
            raise Exception(
                f"Could not find out when authority for {config_var_domain_key} started."
                " Is FLEXMEASURES_HOSTS_AND_AUTH_START configured for it?"
            )
    regex = r"^\d{4}-\d{2}$"
    if not re.search(regex, host_auth_start_month):
        raise ValueError(f"{host_auth_start_month} should adhere to the format {regex}.")
    if not int(host_auth_start_month[-2:]) in range(1, 13):
        raise ValueError(
            f"Month in {host_auth_start_month} should be in the range of 1 to 12."
        )
    reversed_domain_name = reverse_domain_name(domain_parts)
    if reversed_domain_name == "":
        raise Exception(f"Could not make domain name from {host}!")
    return f"{ADDR_SCHEME}.{host_auth_start_month}.{reversed_domain_name}"
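
# Hedged sketch of the expected output shape. The hostname is made up, and
# ADDR_SCHEME is a constant from the surrounding project (not defined in this
# snippet). Passing host_auth_start_month explicitly skips the Flask config
# lookup, so no app context is needed for this sketch.
def _demo_build_ea_scheme_and_naming_authority():
    addr = build_ea_scheme_and_naming_authority("company.flexmeasures.io", "2021-01")
    # -> f"{ADDR_SCHEME}.2021-01.io.flexmeasures.company"
    #    (reverse domain notation, see reverse_domain_name below)
    return addr
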
def parseURL(url):
    """parses a url and returns a dictionary of the components

    combines the results of tldextract and urlparse

    eg url = 'http://sports.au.yahoo.com/something/other.html?things'

    urlparse(url):
        ParseResult(scheme='http', netloc='sports.au.yahoo.com',
                    path='/something/other.html', params='', query='things', fragment='')

    extract(url):
        ExtractResult(subdomain='sports.au', domain='yahoo', suffix='com')
    """
    url = url.lower()
    url_components = urlparse(url)
    tld_components = tld_extract(url)
    components = {
        'subdomain': tld_components.subdomain,
        'tld': tld_components.domain + "." + tld_components.suffix,
        'path': url_components.path.split('/')
    }
    return components
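
# Illustrative call using the URL from the docstring above; the expected output
# shown in the comment is an assumption based on how urlparse and tldextract
# split that URL.
def _demo_parseURL():
    components = parseURL("http://sports.au.yahoo.com/something/other.html?things")
    # components == {'subdomain': 'sports.au',
    #                'tld': 'yahoo.com',
    #                'path': ['', 'something', 'other.html']}
    return components
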
def reverse_domain_name(domain: Union[str, TldExtractResult]) -> str:
    """
    Returns the reverse notation of the domain.
    You can pass in a string domain or an extraction result from tldextract.
    """
    if isinstance(domain, str):
        domain_parts = tld_extract(domain)
    else:
        domain_parts = domain

    suffix = domain_parts.suffix
    if suffix != "":
        if "." in suffix:
            suffix = ".".join(suffix.split(".")[::-1])
        suffix = f"{suffix}."

    domain = domain_parts.domain

    reversed_subdomain = ""
    if domain_parts.subdomain != "":
        sd_list = ".".join(domain_parts.subdomain.split(".")[::-1])
        reversed_subdomain = f".{sd_list}"

    return f"{suffix}{domain}{reversed_subdomain}"
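
# A few illustrative checks (example domains are assumptions, not from the
# original source): suffix and subdomain segments are each reversed, and the
# suffix leads the result. Relies on tldextract's public suffix data.
def _demo_reverse_domain_name():
    assert reverse_domain_name("flexmeasures.io") == "io.flexmeasures"
    assert reverse_domain_name("bbc.co.uk") == "uk.co.bbc"
    assert reverse_domain_name("sports.au.yahoo.com") == "com.yahoo.au.sports"
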
def parent_domain(domain):
    """Return only the registered (second-level) domain, dropping any subdomains."""
    return tld_extract(domain).registered_domain
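
# Quick illustrative check of the contrast with naked_domain above:
# parent_domain discards subdomains. The hostname is made up.
def _demo_parent_domain():
    assert parent_domain("sports.au.yahoo.com") == "yahoo.com"
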
def analyze_js_source(domain, input_):
    """Find the type of source of the javascript tags that are given as input.

    Classifies the source of a script tag into internal and external includes
    and into inline scripts. If external resources are loaded via a 'src'
    attribute, it is determined whether the resources are loaded from the same
    page ('include-local') or not ('include-external'). Resources that are
    loaded from the same domain (including subdomains) are also identified as
    'include-local'.

    The dicts returned contain the following keys:

    * type
        one of 'inline-js', 'inline-unknown', 'inline-json', 'include-local',
        'include-external', or 'include' (only if domain is None)
    * src
        *(for type == include\*)* the full included path
    * src_path
        *(for type == include\*)* relative path to included resource
    * src_domain
        *(for type == include-external)* domain from which resource is included
    * content
        *(for type == inline-\*)* content of the script tag

    Parameters
    ----------
    domain : str or None
        Domain the tags have been scraped from. Used to detect whether scripts
        are local or external includes. Expects the domain to be given up to
        the second level (e.g. domain="nic.at"). If domain is None, script tags
        that load from an absolute path are marked as "include", since in this
        case external and local includes can not be distinguished.
    input_ : list of (dict, str)
        List of script tags to analyze, where the dictionary contains the
        attributes of the tag and the str the text of the tag.

    Returns
    -------
    js_code : list of dict
        the script sources found. Each dict describes a single <script> tag
        found, with the keys listed above.
    """
    sources = []
    try:
        for attrs, text in input_:
            result = {}
            if "src" in attrs:
                # These are (possibly external) includes
                url = attrs["src"]
                result["src"] = url
                p_url = urlparse(url)
                result["src_path"] = p_url.path
                if p_url.netloc:
                    o_tld = tld_extract(p_url.netloc)
                    script_domain = o_tld.domain + "." + o_tld.suffix
                    if domain is None:
                        result["type"] = "include"
                        result["src_domain"] = script_domain
                    elif script_domain != domain:
                        result["type"] = "include-external"
                        result["src_domain"] = script_domain
                    else:
                        result["type"] = "include-local"
                else:
                    if result["src_path"].startswith("./"):
                        result["src_path"] = \
                            result["src_path"].replace("./", "/", 1)
                    result["type"] = "include-local"
            else:
                result["content"] = text
                # if tag has no type attr it is assumed text/javascript
                if "type" not in attrs:
                    result["type"] = "inline-js"
                # NOTE: basestring implies Python 2; on Python 3 this would be str
                elif (isinstance(attrs["type"], basestring) and
                      (attrs["type"].lower() in KNOWN_SCRIPT_MIME_TYPES)):
                    result["type"] = "inline-%s" % (
                        KNOWN_SCRIPT_MIME_TYPES[attrs["type"].lower()])
                else:
                    result["type"] = "inline-unknown"
            sources.append(result)
    except TypeError as e:
        raise ValueError("could not iterate over input_ "
                         "- expected list of tuples (dict,str) - %s" % str(e))
    return sources
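
# Illustrative sketch (tags and URLs are made up). It deliberately avoids the
# KNOWN_SCRIPT_MIME_TYPES / basestring branch, which needs names from the
# surrounding (Python 2) module, by using only src includes and an inline tag
# without a 'type' attribute.
def _demo_analyze_js_source():
    tags = [
        ({"src": "https://cdn.example.net/lib.js"}, ""),   # external include
        ({"src": "/static/app.js"}, ""),                    # relative path -> local
        ({}, "console.log('hi');"),                          # inline, no type attr
    ]
    results = analyze_js_source("nic.at", tags)
    # results[0]['type'] == 'include-external', results[0]['src_domain'] == 'example.net'
    # results[1]['type'] == 'include-local'
    # results[2]['type'] == 'inline-js'
    return results
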