Example #1
def fetch_meta_attributes(input_html):
    meta_content = {}
    soup_obj = BeautifulSoup(input_html, 'lxml')
    site_lookup_dict = {
        'thequint': [{'property': 'og:title'}, {'name': ['keywords', 'title', 'description']}],
        'dailyo': [{'property': 'og:title'}, {'name': ['keywords', 'title', 'description']}],
        'qz': [{'property': 'og:title'}, {'name': ['title', 'news_keywords', 'description']}],
        'scroll': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'dnaindia': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'firstpost': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'thehansindia': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'thehindu': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'thehindubusinessline': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'hindustantimes': [{'property': ['og:title', 'og:description']}, {'name': 'keywords'}],
        'indianexpress': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'mid-day': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'livemint': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'orissapost': [{'property': ['og:title', 'og:description']}, {'name': ['title', 'keywords', 'description']}],
        'dailypioneer': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'sundayguardianlive': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],  # NO keywords
        'thestatesman': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'telegraphindia': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],  # NO keywords
        'tribuneindia': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'kashmirreader': [{'property': ['og:title', 'og:description']}, {'name': ['title', 'keywords', 'description']}],  # NO keywords
        'asianage': [{'property': ['og:title', 'og:description']}, {'name': 'Keywords'}],
        'bangaloremirror': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'newindianexpress': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'deccanchronicle': [{'property': ['og:title', 'og:description']}, {'name': ['Keywords']}],
        'deccanherald': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'economictimes': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'financialexpress': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'mydigitalfc': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'afternoondc': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'business-standard': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'speakingtree': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'cricbuzz': [{'property': 'og:title'}, {'name': ['title', 'keywords', 'description']}],
        'indiatimes': [{'name': ['keywords', 'title', 'description']}, {'property': 'og:title'}],
        'femina': [{'name': ['keywords', 'title', 'description']}, {'property': 'og:title'}]

    }
    try:
        input_url = soup_obj.find('meta', {'property': 'og:url'}).get('content')
        for curr_attr_map in site_lookup_dict[tld_extract(input_url).domain]:
            # Each entry in the site's lookup list is an attribute dict that find_all can use directly.
            meta_tags = soup_obj.find_all('meta', curr_attr_map)
            for curr_tag in meta_tags:
                tag_content = curr_tag.get('content')
                if curr_tag.get('name'):
                    meta_content[curr_tag.get('name').lower()] = tag_content
                else:
                    meta_content[curr_tag.get('property').replace('og:', '').lower()] = tag_content
    except (KeyError, TypeError, AttributeError):
        # Unknown site, or no og:url meta tag in the page; return whatever was collected so far.
        pass
    return meta_content
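A minimal way to exercise the snippet above, assuming it sits in a module with the imports shown below (the tld_extract alias for tldextract.extract is an assumption) and that lxml is installed for the parser it uses; the sample HTML and printed result are illustrative only:

from bs4 import BeautifulSoup                    # assumed import for the snippet above
from tldextract import extract as tld_extract    # assumed alias for tldextract.extract

sample_html = """
<html><head>
  <meta property="og:url" content="https://www.thehindu.com/news/sample-article.ece">
  <meta property="og:title" content="Sample headline">
  <meta name="keywords" content="india, politics">
  <meta name="description" content="A short summary.">
</head></html>
"""

# The og:url meta tag is used to pick the per-site attribute map ('thehindu' here).
print(fetch_meta_attributes(sample_html))
# e.g. {'title': 'Sample headline', 'keywords': 'india, politics', 'description': 'A short summary.'}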
Example #2
def get_domain_parts(domain: str) -> TldExtractResult:
    """wrapper for calling tldextract as it logs things about file locks we don't care about."""
    logger = logging.getLogger()
    level = logger.getEffectiveLevel()
    logger.setLevel(logging.ERROR)
    domain_parts: TldExtractResult = tld_extract(domain)
    logger.setLevel(level)  # restore the previous log level
    return domain_parts
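A quick usage sketch, assuming tld_extract is tldextract.extract and TldExtractResult is its result type; the URL is illustrative:

# tldextract's file-lock log chatter is suppressed while extracting.
parts = get_domain_parts("https://forum.example.co.uk/some/path")
print(parts.subdomain, parts.domain, parts.suffix)   # -> forum example co.uk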
Example #3
def naked_domain(url):
    # This function extracts the domain name part of a URL.
    # It accepts either full URLs or bare domain names.
    res = tld_extract(url)

    if not res.subdomain:
        return res.registered_domain
    else:
        return ".".join([res.subdomain, res.registered_domain])
Example #4
def build_ea_scheme_and_naming_authority(
        host: str, host_auth_start_month: Optional[str] = None) -> str:
    """
    This function creates the host identification part of
    USEF's EA1 addressing scheme, so everything but the locally unique string.

    If not given nor configured, host_auth_start_month is the start of the next month for
    localhost.
    """
    domain_parts: TldExtractResult = tld_extract(host)

    if host_auth_start_month is None:
        config_var_domain_key = ".".join(
            filter(
                lambda x: x,
                [
                    domain_parts.subdomain, domain_parts.domain,
                    domain_parts.suffix
                ],
            ))
        if config_var_domain_key in current_app.config.get(
                "FLEXMEASURES_HOSTS_AND_AUTH_START", {}):
            host_auth_start_month = current_app.config.get(
                "FLEXMEASURES_HOSTS_AND_AUTH_START", {})[config_var_domain_key]
        elif domain_parts.domain in ("localhost", "127.0.0.1"):
            host_auth_start_month = get_first_day_of_next_month().strftime(
                "%Y-%m")
        else:
            raise Exception(
                f"Could not find out when authority for {config_var_domain_key} started. Is FLEXMEASURES_HOSTS_AND_AUTH_START configured for it?"
            )
    regex = r"^\d{4}-\d{2}$"
    if not re.search(regex, host_auth_start_month):
        raise ValueError(
            f"{host_auth_start_month} should adhere to the format {regex}.")
    if not int(host_auth_start_month[-2:]) in range(1, 13):
        raise ValueError(
            f"Month in {host_auth_start_month} should be in the range of 1 to 12."
        )

    reversed_domain_name = reverse_domain_name(domain_parts)
    if reversed_domain_name == "":
        raise Exception(f"Could not make domain name from {host}!")
    return f"{ADDR_SCHEME}.{host_auth_start_month}.{reversed_domain_name}"
Example #5
def parseURL(url):
	"""parses a url and returns a dictionary of the components
	
	combines the results of tldextract and urlparse eg
	url = 'http://sports.au.yahoo.com/something/other.html?things'
	
	urlparse(url):
		ParseResult(scheme='http', netloc='sports.au.yahoo.com', path='/something/other.html', params='', query='things', fragment='')
	
	extract(url):
		ExtractResult(subdomain='sports.au', domain='yahoo', suffix='com')	
	"""
	url = url.lower()
	url_components = urlparse(url)
	tld_components = tld_extract(url)
	
	components = {
		'subdomain': tld_components.subdomain,
		'tld': tld_components.domain + "." + tld_components.suffix,
		'path': url_components.path.split('/')
	}
	
	return components
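For the docstring's sample URL, the returned dictionary looks like this (assuming urlparse comes from urllib.parse and tld_extract is tldextract.extract):

parseURL('http://sports.au.yahoo.com/something/other.html?things')
# -> {'subdomain': 'sports.au',
#     'tld': 'yahoo.com',
#     'path': ['', 'something', 'other.html']}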
Example #6
def reverse_domain_name(domain: Union[str, TldExtractResult]) -> str:
    """
    Returns the reverse notation of the domain.
    You can pass in a string domain or an extraction result from tldextract
    """
    if isinstance(domain, str):
        domain_parts = tld_extract(domain)
    else:
        domain_parts = domain

    suffix = domain_parts.suffix
    if suffix != "":
        if "." in suffix:
            suffix = ".".join(suffix.split(".")[::-1])
        suffix = f"{suffix}."

    domain = domain_parts.domain

    reversed_subdomain = ""
    if domain_parts.subdomain != "":
        sd_list = ".".join(domain_parts.subdomain.split(".")[::-1])
        reversed_subdomain = f".{sd_list}"

    return f"{suffix}{domain}{reversed_subdomain}"
Example #7
def parent_domain(domain):
    return tld_extract(domain).registered_domain
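Illustrative calls; registered_domain is the domain plus its public suffix:

parent_domain("sports.au.yahoo.com")   # -> 'yahoo.com'
parent_domain("blog.example.co.uk")    # -> 'example.co.uk'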
Example #8
def analyze_js_source(domain, input_):
    """Find the type of source of the javascript tags that are given as input.

    Classifies the source of a script tag into internal and external includes
    and into inline scripts.

    If a resource is loaded via a 'src' attribute, it is determined whether it
    is loaded from the same domain ('include-local') or from an external domain
    ('include-external'). Resources that are loaded from a subdomain of the
    given domain are also identified as 'include-local'.

    The dicts returned contain the following keys:

    * type
        one of 'inline-js', 'inline-unknown', 'inline-json', 'include-local',
        'include-external', or 'include' (only if domain is None)
    * src *(for type == include\*)*
        the full included path
    * src_path *(for type == include\*)*
        relative path to included resource
    * 'src_domain' *(for type == include-external)*
        domain from which resource is included
    * 'content' *(for type == inline-\*)*
        content of the script tag


    Parameters
    ----------
    domain : str or None
        Domain the tags have been scraped from. Used to detect whether scripts
        are local or external includes. Expects the domain to be given up to
        the second level (e.g. domain="nic.at"). If domain is None, script
        tags that load from an absolute path are marked as "include", since
        in this case external and local includes can not be distinguished.
    input_ : list of (dict,str)
        List of script tags to analyze where the dictionary contains
        the attributes of the tag and the str the text of the tag.

    Returns
    -------
    js_code : list of dict
        the script sources found. Each dict describes a single <script> tag
        found; the possible keys are described above.

    """
    sources = []
    try:
        for attrs, text in input_:
            result = {}
            if "src" in attrs:
                # These are (possibly external) includes
                url = attrs["src"]
                result["src"] = url
                p_url = urlparse(url)
                result["src_path"] = p_url.path
                if p_url.netloc:
                    o_tld = tld_extract(p_url.netloc)
                    script_domain = o_tld.domain + "." + o_tld.suffix
                    if domain is None:
                        result["type"] = "include"
                        result["src_domain"] = script_domain
                    elif script_domain != domain:
                        result["type"] = "include-external"
                        result["src_domain"] = script_domain
                    else:
                        result["type"] = "include-local"
                else:
                    if result["src_path"].startswith("./"):
                        result["src_path"] = \
                            result["src_path"].replace("./", "/", 1)
                    result["type"] = "include-local"
            else:
                result["content"] = text
                # if tag has no type attr it is assumed text/javascript
                if "type" not in attrs:
                    result["type"] = "inline-js"
                elif (isinstance(attrs["type"], str)
                      and (attrs["type"].lower() in KNOWN_SCRIPT_MIME_TYPES)):
                    result["type"] = "inline-%s" % (
                        KNOWN_SCRIPT_MIME_TYPES[attrs["type"].lower()])
                else:
                    result["type"] = "inline-unknown"
            sources.append(result)
    except TypeError as e:
        raise ValueError("could not iterate over input_ "
                         "- expected list of tuples (dict,str) - %s" % str(e))
    return sources
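A small usage sketch, assuming tld_extract is tldextract.extract and that KNOWN_SCRIPT_MIME_TYPES is the module-level mapping the snippet refers to; the tags below are illustrative:

tags = [
    ({"src": "https://cdn.example.com/lib.js"}, ""),   # include from another domain
    ({"src": "/static/app.js"}, ""),                   # relative src -> local include
    ({}, "console.log('hi');"),                        # inline script, no type attribute
]
for entry in analyze_js_source("nic.at", tags):
    print(entry["type"])
# -> include-external, include-local, inline-js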