def link_report(df):
    """Print the number of links found in the tweets and a per-domain tally.

    Every value of ``df['Tweet']`` is joined into one text, all http(s)
    URLs are extracted from it with a regular expression, and each URL is
    passed through ``uex.expand`` and then ``uex.get_domain``. The domains
    are counted and printed as a table sorted by descending count.

    Args:
        df: Object indexable by ``'Tweet'`` that yields the tweet texts
            (strings) when iterated, e.g. a pandas DataFrame.

    Returns:
        None. The report is written to standard output.
    """
    from collections import Counter

    # Raw string: the pattern contains ``\(`` and ``\)``, which are invalid
    # escape sequences in a normal string literal and trigger a warning.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # One join instead of repeated ``+=`` keeps this linear in the text size.
    all_tweet_links = " ".join(df['Tweet'])
    all_urls = re.findall(url_pattern, all_tweet_links)

    print("*" * 50)
    print("Total number of links : " + str(len(all_urls)))
    print("*" * 50)

    # Tally the domain of every link after it went through ``uex.expand``.
    domain_count = Counter(
        uex.get_domain(uex.expand(link)) for link in all_urls
    )

    domain_df = pd.DataFrame.from_dict(dict(domain_count),
                                       orient='index',
                                       columns=['Count'])
    print(domain_df.sort_values(by=['Count'], ascending=False).to_string())
def find_tld_from_email(x):
    """Return the domain for the part of an e-mail address after the '@'.

    The text following the last '@' (or the whole string when there is no
    '@') is handed to ``get_domain``. When that text is empty it is
    returned unchanged and ``get_domain`` is not called.
    """
    # Keep only what follows the last '@'.
    host = x.rpartition('@')[2]
    if not host:
        # Nothing to look up.
        return host
    return get_domain(host)
def amp_parser(body: element.Tag) -> List[Dict]:
    """
    Parse AMP links: `a` tags found with a `data-amp` attribute filter.

    AMP links look like regular links. This parser makes a distinction
    between search results with text and regular links, and records it in
    the `category` of each row.

    Args:
        body: Parsed markup to search.

    Returns:
        One row per matching `a` tag, built by `element_to_dict` from the
        chosen element together with its `url`, `domain` and `category`.
    """
    data = []
    for elm in body.find_all("a", attrs={'data-amp': True}):
        # Default category; refined below from the surrounding markup.
        category = 'amp-card'
        # The link target is read from `data-amp`.
        url = elm["data-amp"]
        domain = get_domain(url)
        # Third ancestor of the link.
        parent = elm.parent.parent.parent
        if ('data-amp-st' not in elm.attrs and
                parent.get('role') != 'listitem' and
                not elm.parent.parent.get('data-hveid') and
                not parent.parent.parent.name == 'g-card'):
            # Is there a span under `parent` (filtered on `class`, `role`
            # and `aria-level`) whose text is longer than 50 characters?
            if any(e for e in parent.find_all('span',
                                              recursive=True,
                                              text=True,
                                              attrs={
                                                  "role": False,
                                                  "aria-level": False,
                                                  "class": True
                                              }) if len(e.text) > 50):
                # Report the third ancestor instead of the bare link.
                elm = parent
                category = 'amp-search_result_2'
            else:
                # Retry one level closer to the link (second ancestor).
                parent = elm.parent.parent
                for div in parent.find_all('div',
                                           recursive=True,
                                           attrs={
                                               "role": False,
                                               'style': False,
                                               'data-ved': False,
                                               'jscontroller': False,
                                               "aria-level": False,
                                               "class": True
                                           }):
                    # The loop does not stop at the first hit, so the last
                    # qualifying div determines the category.
                    if div.text:
                        elm = parent
                        category = 'amp-search_result_3'
                    else:
                        if any(_ for _ in div.find_all(
                                'span', text=True, attrs={'class': True})
                               if len(_.text) > 50):
                            elm = parent
                            category = 'amp-search_result_3b'
        if domain == 'google.com':
            category += '_google'
        # Checked on whichever element `elm` refers to at this point;
        # overrides any category chosen above.
        if 'data-amp-st' in elm.attrs:
            category = 'amp-visual_stories'
        row = element_to_dict(elm, url=url, domain=domain, category=category)
        data.append(row)
    return data
def __custom_filter(url):
    """Tell whether ``url`` is a shortened URL.

    Besides what ``urlexpander.is_short`` recognises, URLs whose domain is
    one of the custom shortener domains (lajunta.es, opgob.es or chng.it)
    also count as shortened.

    Arguments:
        url {str} -- Input url (to be determined if it is a shortened url)

    Returns:
        [bool] -- True if url is shortened, False otherwise
    """
    custom_shorteners = ('lajunta.es', 'opgob.es', 'chng.it')
    if urlexpander.get_domain(url) in custom_shorteners:
        return True
    # Normalise to a strict bool, whatever is_short hands back.
    return bool(urlexpander.is_short(url))
def knowledge_panel_factoids_parser(body: element.Tag) -> List[Dict]:
    """
    Factoids parsed from the open web.

    Looks inside `div` tags that have a `lang` attribute and whose
    `data-attrid` starts with `kc:`, `ss:`, `hw:` or `okra:`, and keeps
    spans with more than one character of text. Only finds those without
    links; the links will show up as 'link-google' ala the link_parser.

    Returns one `element_to_dict` row per kept span, with category
    'answer-knowledge_graph_factoid'.
    """
    panel_attrs = {
        'data-attrid': re.compile('^(kc:|ss:|hw:|okra:)'),
        'lang': True
    }
    span_attrs = {'role': False, 'aria-level': False, 'jsaction': False}

    rows = []
    for panel in body.find_all('div', attrs=panel_attrs):
        for span in panel.find_all('span', recursive=True, attrs=span_attrs):
            # Ignore empty and single-character spans.
            if not span.text or len(span.text) <= 1:
                continue

            link_free = True

            # The span must not contain an organic link.
            for anchor in span.find_all('a', href=True):
                anchor_domain = get_domain(anchor['href'])
                if anchor_domain in javascript + ['google.com']:
                    continue
                if anchor_domain[0] != '/':
                    link_free = False

            # The span must not sit inside a link: inspect up to four
            # ancestors, stopping at the first `a`.
            ancestor = span
            for _ in range(4):
                ancestor = ancestor.parent
                if ancestor.name == 'a':
                    link_free = False
                    break

            if link_free:
                rows.append(
                    element_to_dict(
                        span, category='answer-knowledge_graph_factoid'))
    return rows
def get_domain(url):
    """Return the domain name of ``url``; ``None`` if it is not a string."""
    if not isinstance(url, str):
        # Non-string input has no domain to report.
        return None
    return urlexpander.get_domain(url)
def link_parser(body):
    """
    Parses all a tags with `href` attributes.

    Decides if the url is `organic`, or from a Google property such as
    "youtube" or google ad services, and records the decision in the
    `category` of each row. Depending on the surrounding markup, the
    element stored in the row may be an ancestor of the link (so the row
    covers the whole search result) rather than the link itself.

    The search also passes `'data-amp': False`; presumably this leaves
    links carrying `data-amp` to the AMP parser -- confirm.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed listing. Confirm against the authoritative
    source, in particular (a) whether the search-result layout checks run
    for every link or only for organic ones, (b) which `if` the `else`
    before the `*_2a/2b/2c` checks belongs to, and (c) the level of
    `domain = 'google.com'` in the relative-url branch.

    Args:
        body: Parsed markup to search.

    Returns:
        A list with one `element_to_dict` row per link that was not
        skipped.
    """
    data = []
    for elm in body.find_all('a', href=True, attrs={'data-amp': False}):
        url = elm['href']
        domain = get_domain(url)
        # Default category; refined by the checks below.
        category = 'link-google'
        if url in javascript:
            domain = 'google.com'
            category = 'link-javascript'
        # skip call, share, and save icons
        if any(elm.find_all('div', text=re.compile('Call|Share|Save'))):
            continue
        # links to Google Ad services...
        elif domain[0] == '/':
            # A domain starting with '/' is treated as a relative url.
            if domain.split('?')[0] == '/aclk':  # check this
                category = 'ads-google_ad_services'
            domain = 'google.com'
        elif domain == 'googleadservices.com':
            category = 'ads-google_ad_services'
        elif domain not in javascript + ['google.com']:
            """ This is mostly logic for organic, but it applies to links that are from Google. This is to find then entire element link + hyperlink for search results """
            category = 'organic'
        # Layout checks: only for links without `data-ved`/`target`
        # attributes and without a nested `g-img`.
        if (not any(e for e in elm.attrs if e in ['data-ved', 'target']) and
                not any(elm.find_all('g-img', recursive=True))):
            # get the sibling of the parent of the link
            elm_potential_text = elm.parent.find_next_sibling('div')
            if elm_potential_text and not 'data-attrid' in elm.parent.attrs:
                if any(
                        elm_potential_text.find_all('div',
                                                    recursive=True,
                                                    attrs={
                                                        "role": False,
                                                        "aria-level": False,
                                                        "jsname": False
                                                    })):
                    category = 'organic-search_result_1a'
                    elm = elm.parent
                    # limit this
                    if 'data-ved' not in elm.attrs:
                        elm = elm.parent
                elif any(
                        elm_potential_text.find_all('span',
                                                    recursive=True,
                                                    text=True,
                                                    attrs={
                                                        "role": False,
                                                        "aria-level": False
                                                    })):
                    category = 'organic-search_result_1b'
                    elm = elm.parent.parent
            else:
                # Fall back to the next `div` sibling of the grandparent.
                elm_potential_text = elm.parent.parent.find_next_sibling(
                    'div')
                if elm_potential_text and elm.parent.name != 'h3':
                    if any(
                            elm_potential_text.find_all('div',
                                                        recursive=True,
                                                        text=True,
                                                        attrs={
                                                            "role": False,
                                                            "aria-level": False,
                                                            "jsname": False
                                                        })):
                        category = 'organic-search_result_2a'
                        elm = elm.parent.parent.parent
                    elif any(
                            elm_potential_text.find_all('span',
                                                        recursive=True,
                                                        text=True,
                                                        attrs={
                                                            "role": False,
                                                            "aria-level": False
                                                        })):
                        category = 'organic-search_result_2b'
                        elm = elm.parent.parent.parent
                    elif any(
                            elm_potential_text.find_all(
                                'table', recursive=True,
                                attrs={"class": True})):
                        category = 'organic-search_result_2c'
                        elm = elm.parent.parent.parent
        # tweets
        if 'gws-twitter-link' in elm.attrs.get('class', []):
            # Report the third ancestor of the current element.
            for _ in range(3):
                elm = elm.parent
            category = 'organic-tweet_1'
        # set categories for Google products
        if domain == 'youtube.com':
            if 'organic-' in category:
                category = category.replace('organic-', 'link-youtube_')
            else:
                category = 'link-youtube'
            if 'tabindex' in elm.attrs:
                # Switch to the first descendant `div`, if there is one.
                for div in elm.find_all('div'):
                    elm = div
                    break
        elif domain in javascript + ['google.com']:
            if 'organic-' in category:
                category = category.replace('organic-', 'link-google_')
            else:
                category = 'link-google'
            if 'data-merchant-id' in elm.attrs:
                category = 'ads-merchant'
            elif 'aclk?' in url:
                category = 'ads-google_ad_services'
            elif elm.parent.parent.name == 'g-tray-header':
                check = elm.parent.parent
                if 'style' in check.attrs:
                    # Report the grandparent (the `g-tray-header`).
                    for _ in range(2):
                        elm = elm.parent
                    if 'organic-' in category:
                        category = category.replace('organic-',
                                                    'link-button_2_')
                    else:
                        category = 'link-button_2'
            elif elm.parent.parent.parent.name == 'g-inner-card' and elm.name == 'a':
                if 'organic-' in category:
                    category = category.replace('organic-', 'link-google_2_')
                else:
                    category = 'link-google_2'
                # Report the third ancestor (the `g-inner-card`).
                for _ in range(3):
                    elm = elm.parent
        row = element_to_dict(elm, url=url, domain=domain, category=category)
        data.append(row)
    return data