Exemplo n.º 1
0
def link_report(df):
    """Print a report of the links found in the ``Tweet`` column of *df*.

    Every URL matched in the tweet text is expanded with ``uex.expand`` and
    reduced to a domain with ``uex.get_domain``. The function prints the
    total number of links followed by a table of per-domain counts, sorted
    from most to least frequent.

    Args:
        df: Object indexable by ``'Tweet'`` that yields an iterable of tweet
            strings (presumably a pandas DataFrame -- confirm with callers).

    Returns:
        None. The report is written to standard output.
    """
    # Build the searchable text in one pass instead of repeated ``+=``.
    all_tweet_text = " ".join(df['Tweet'])

    # Raw string: ``\(`` and ``\)`` are invalid escape sequences in a plain
    # string literal and trigger a warning on current Python versions.
    all_urls = re.findall(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        all_tweet_text)

    print("*" * 50)
    print("Total number of links : " + str(len(all_urls)))
    print("*" * 50)

    # Tally how often each domain occurs, keeping first-seen order.
    domain_count = {}
    for link in all_urls:
        domain = uex.get_domain(uex.expand(link))
        domain_count[domain] = domain_count.get(domain, 0) + 1

    domain_df = pd.DataFrame.from_dict(domain_count,
                                       orient='index',
                                       columns=['Count'])

    print(domain_df.sort_values(by=['Count'], ascending=False).to_string())
def find_tld_from_email(x):
    """Return the domain for the host part of an e-mail address.

    The text after the last ``@`` (or the whole string when there is no
    ``@``) is handed to ``get_domain``. When that text is empty it is
    returned unchanged and ``get_domain`` is not called.
    """
    host = x.rpartition('@')[2]
    return get_domain(host) if host else host
Exemplo n.º 3
0
def amp_parser(body: element.Tag) -> List[Dict]:
    """
    Collect AMP links: ``a`` tags that carry a ``data-amp`` attribute.

    Each link starts out as an 'amp-card'. Depending on the surrounding
    markup it may instead be reported as a search result
    ('amp-search_result_2', '_3' or '_3b'), in which case an ancestor of
    the link is reported in place of the link itself. Links whose domain
    is 'google.com' get a '_google' suffix, and elements with a
    ``data-amp-st`` attribute are reported as 'amp-visual_stories'.

    Returns one ``element_to_dict`` row per ``a`` tag found.
    """
    data = []
    for elm in body.find_all("a", attrs={'data-amp': True}):
        # Defaults; refined by the checks below.
        category = 'amp-card'
        url = elm["data-amp"]
        domain = get_domain(url)
        # Third ancestor (great-grandparent) of the link.
        parent = elm.parent.parent.parent
        # Only look for search-result markup when the link is not a visual
        # story, its great-grandparent is not a list item, its grandparent
        # has no data-hveid, and its fifth ancestor is not a g-card.
        if ('data-amp-st' not in elm.attrs and parent.get('role') != 'listitem'
                and not elm.parent.parent.get('data-hveid')
                and not parent.parent.parent.name == 'g-card'):
            # A span with a class, a string, and more than 50 characters of
            # text under the great-grandparent: report the great-grandparent.
            # (False in attrs presumably means "attribute absent" -- see the
            # Beautiful Soup find_all documentation.)
            if any(e for e in parent.find_all('span',
                                              recursive=True,
                                              text=True,
                                              attrs={
                                                  "role": False,
                                                  "aria-level": False,
                                                  "class": True
                                              }) if len(e.text) > 50):
                elm = parent
                category = 'amp-search_result_2'

            else:
                # Otherwise inspect the grandparent's matching divs. Every
                # matching div is visited, so the last one that qualifies
                # decides the category.
                parent = elm.parent.parent
                for div in parent.find_all('div',
                                           recursive=True,
                                           attrs={
                                               "role": False,
                                               'style': False,
                                               'data-ved': False,
                                               'jscontroller': False,
                                               "aria-level": False,
                                               "class": True
                                           }):
                    if div.text:
                        elm = parent
                        category = 'amp-search_result_3'
                    else:
                        if any(_ for _ in div.find_all(
                                'span', text=True, attrs={'class': True})
                               if len(_.text) > 50):
                            elm = parent
                            category = 'amp-search_result_3b'

        if domain == 'google.com':
            category += '_google'
        # Checked on elm as it stands now, which may be an ancestor of the
        # original link; overrides any category chosen above.
        if 'data-amp-st' in elm.attrs:
            category = 'amp-visual_stories'
        row = element_to_dict(elm, url=url, domain=domain, category=category)
        data.append(row)

    return data
Exemplo n.º 4
0
def __custom_filter(url):
    """Tell whether *url* should be treated as a shortened URL.

    A URL counts as shortened when its domain is one of the custom
    shortener domains (lajunta.es, opgob.es or chng.it) or when
    ``urlexpander.is_short`` reports it as short.

    Arguments:
        url {str} -- Input url (to be checked for being a shortened url)

    Returns:
        [bool] -- True if the url is shortened, False otherwise
    """
    custom_shorteners = ['lajunta.es', 'opgob.es', 'chng.it']
    if urlexpander.get_domain(url) in custom_shorteners:
        return True
    return bool(urlexpander.is_short(url))
Exemplo n.º 5
0
def knowledge_panel_factoids_parser(body: element.Tag) -> List[Dict]:
    """
    Collect knowledge-panel factoids that are plain text rather than links.

    Scans ``div`` elements that have a ``lang`` attribute and a
    ``data-attrid`` starting with kc:, ss:, hw: or okra:. Within them, every
    ``span`` with more than one character of text is reported as an
    'answer-knowledge_graph_factoid', unless it contains an organic link or
    sits inside an ``a`` tag within four ancestor levels.
    """
    panel_filter = {
        'data-attrid': re.compile('^(kc:|ss:|hw:|okra:)'),
        'lang': True,
    }
    span_filter = {'role': False, 'aria-level': False, 'jsaction': False}

    data = []
    for panel in body.find_all('div', attrs=panel_filter):
        for span in panel.find_all('span', recursive=True, attrs=span_filter):
            # Skip spans with no text or only a single character.
            if not span.text or len(span.text) <= 1:
                continue

            is_plain_text = True

            # A link inside the span disqualifies it unless the link is a
            # javascript/google.com one or its domain starts with '/'.
            for anchor in span.find_all('a', href=True):
                anchor_domain = get_domain(anchor['href'])
                if (anchor_domain not in javascript + ['google.com']
                        and anchor_domain[0] != '/'):
                    is_plain_text = False

            # Walk up to four ancestors looking for an enclosing link.
            ancestor = span
            for _ in range(4):
                ancestor = ancestor.parent
                if ancestor.name == 'a':
                    is_plain_text = False
                    break

            if is_plain_text:
                data.append(
                    element_to_dict(
                        span, category='answer-knowledge_graph_factoid'))
    return data
Exemplo n.º 6
0
def get_domain(url):
    """Return the domain name of *url*, or ``None`` if *url* is not a string."""
    if not isinstance(url, str):
        return None
    return urlexpander.get_domain(url)
Exemplo n.º 7
0
def link_parser(body):
    """
    Parses all ``a`` tags that have an ``href`` attribute and are not
    matched by the ``data-amp`` filter.

    Decides if the url is `organic`, or from a Google property
    such as "youtube" or google ad services. For some layouts an ancestor
    of the link is reported instead of the link itself, so that the row
    covers the whole search result.

    Returns a list with one ``element_to_dict`` row per reported link.
    """
    data = []
    for elm in body.find_all('a', href=True, attrs={'data-amp': False}):
        url = elm['href']
        domain = get_domain(url)
        # Default category; refined by the branches below.
        category = 'link-google'
        # `javascript` is a list defined elsewhere in the module.
        if url in javascript:
            domain = 'google.com'
            category = 'link-javascript'
            # skip call, share, and save icons
            if any(elm.find_all('div', text=re.compile('Call|Share|Save'))):
                continue

        # links to Google Ad services...
        # NOTE(review): domain[0] assumes get_domain returned a non-empty
        # string -- confirm for empty or unusual hrefs.
        elif domain[0] == '/':
            if domain.split('?')[0] == '/aclk':  # check this
                category = 'ads-google_ad_services'
            domain = 'google.com'

        elif domain == 'googleadservices.com':
            category = 'ads-google_ad_services'

        elif domain not in javascript + ['google.com']:
            # NOTE: the string below is a bare expression statement that
            # serves as a comment; it has no runtime effect.
            """
            This is mostly logic for organic, but it applies to links
            that are from Google. This is to find then entire element
            link + hyperlink for search results
            """
            category = 'organic'
            # Only look for surrounding result text when the link has
            # neither a data-ved nor a target attribute and contains no
            # g-img element.
            if (not any(e for e in elm.attrs if e in ['data-ved', 'target'])
                    and not any(elm.find_all('g-img', recursive=True))):
                # get the sibling of the parent of the link
                elm_potential_text = elm.parent.find_next_sibling('div')
                if elm_potential_text and not 'data-attrid' in elm.parent.attrs:
                    # Layout 1: the text lives in a div that follows the
                    # link's parent.
                    if any(
                            elm_potential_text.find_all('div',
                                                        recursive=True,
                                                        attrs={
                                                            "role": False,
                                                            "aria-level":
                                                            False,
                                                            "jsname": False
                                                        })):
                        category = 'organic-search_result_1a'
                        elm = elm.parent  # limit this
                        # Climb one more level when the parent has no
                        # data-ved attribute.
                        if 'data-ved' not in elm.attrs:
                            elm = elm.parent

                    elif any(
                            elm_potential_text.find_all('span',
                                                        recursive=True,
                                                        text=True,
                                                        attrs={
                                                            "role": False,
                                                            "aria-level": False
                                                        })):
                        category = 'organic-search_result_1b'
                        elm = elm.parent.parent
                else:
                    # Layout 2: the text lives in a div that follows the
                    # link's grandparent; skipped when the link sits
                    # directly inside an h3.
                    elm_potential_text = elm.parent.parent.find_next_sibling(
                        'div')
                    if elm_potential_text and elm.parent.name != 'h3':
                        if any(
                                elm_potential_text.find_all('div',
                                                            recursive=True,
                                                            text=True,
                                                            attrs={
                                                                "role": False,
                                                                "aria-level":
                                                                False,
                                                                "jsname": False
                                                            })):
                            category = 'organic-search_result_2a'
                            elm = elm.parent.parent.parent

                        elif any(
                                elm_potential_text.find_all('span',
                                                            recursive=True,
                                                            text=True,
                                                            attrs={
                                                                "role": False,
                                                                "aria-level":
                                                                False
                                                            })):
                            category = 'organic-search_result_2b'
                            elm = elm.parent.parent.parent
                        elif any(
                                elm_potential_text.find_all(
                                    'table',
                                    recursive=True,
                                    attrs={"class": True})):
                            category = 'organic-search_result_2c'
                            elm = elm.parent.parent.parent
                # tweets
                # Checked on elm as it stands now, which may already be an
                # ancestor of the original link; reports the third ancestor.
                if 'gws-twitter-link' in elm.attrs.get('class', []):
                    for _ in range(3):
                        elm = elm.parent
                    category = 'organic-tweet_1'

        # set categories for Google products
        if domain == 'youtube.com':
            # Keep the layout suffix but swap the 'organic-' prefix.
            if 'organic-' in category:
                category = category.replace('organic-', 'link-youtube_')
            else:
                category = 'link-youtube'
            # For elements with a tabindex, report their first descendant
            # div instead (if there is one).
            if 'tabindex' in elm.attrs:
                for div in elm.find_all('div'):
                    elm = div
                    break

        elif domain in javascript + ['google.com']:
            if 'organic-' in category:
                category = category.replace('organic-', 'link-google_')
            else:
                category = 'link-google'
            # More specific Google categories; the first matching test wins.
            if 'data-merchant-id' in elm.attrs:
                category = 'ads-merchant'
            elif 'aclk?' in url:
                category = 'ads-google_ad_services'
            elif elm.parent.parent.name == 'g-tray-header':
                # Styled tray header: report the header (grandparent) itself.
                check = elm.parent.parent
                if 'style' in check.attrs:
                    for _ in range(2):
                        elm = elm.parent
                    if 'organic-' in category:
                        category = category.replace('organic-',
                                                    'link-button_2_')
                    else:
                        category = 'link-button_2'
            elif elm.parent.parent.parent.name == 'g-inner-card' and elm.name == 'a':
                # Link inside an inner card: report the card (third ancestor).
                if 'organic-' in category:
                    category = category.replace('organic-', 'link-google_2_')
                else:
                    category = 'link-google_2'
                for _ in range(3):
                    elm = elm.parent

        row = element_to_dict(elm, url=url, domain=domain, category=category)
        data.append(row)

    return data