def extract_url_and_title(link):
    if isinstance(link.contents[0], bs.Tag):
        if link.contents[0].name == 'img':
            img_target = link.contents[0].get('src')
            return link.get('href'), '(img){0}'.format(img_target)
        else:
            title = remove_text_formatting_markup_from_fragments(link.contents)
            return link.get('href'), title
    else:
        return link.get('href'), remove_text_formatting_markup_from_fragments(link.contents)
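
# Hedged usage sketch, not part of the original source: it shows how
# extract_url_and_title above turns an <a> wrapping an <img> into an
# '(img)...' title, while a plain anchor keeps its visible text. It assumes
# the module's `bs` alias is bs4-compatible; the demo name is hypothetical.
def _demo_extract_url_and_title():
    from bs4 import BeautifulSoup
    img_link = BeautifulSoup('<a href="/gallery"><img src="pic.png"/></a>', 'html.parser').a
    # The <img> child takes the first branch, so no text cleanup is needed.
    assert extract_url_and_title(img_link) == ('/gallery', '(img)pic.png')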
def extract_text_content_and_links_from_articletext(main_content, has_intro=True):
    article_text = main_content

    in_text_tagged_urls = []
    all_cleaned_paragraphs = []
    all_rough_paragraphs = []
    all_plaintext_urls = []
    embedded_tweets = []

    def is_text_content(blob):
        if isinstance(blob, bs.Tag) and blob.name in TEXT_MARKUP_TAGS:
            return True
        if isinstance(blob, bs.NavigableString):
            return True
        return False

    text_fragments = [c for c in article_text.contents if is_text_content(c)]

    if text_fragments:
        # we first need to avoid treating embedded tweets as text
        for paragraph in text_fragments:
            if isinstance(paragraph, bs.NavigableString):
                all_cleaned_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph))
                all_rough_paragraphs.append(paragraph)
            else:
                if not paragraph.find("blockquote", {"class": "twitter-tweet"}):
                    in_text_links = extract_and_tag_in_text_links(paragraph)
                    in_text_tagged_urls.extend(in_text_links)
                    all_cleaned_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph))
                    all_rough_paragraphs.append(paragraph)
                else:
                    embedded_tweets.extend(
                        twitter_utils.extract_rendered_tweet(paragraph, DHNET_NETLOC, DHNET_INTERNAL_SITES))

        # extracting plaintext links
        for paragraph in all_rough_paragraphs:
            plaintext_urls = extract_plaintext_urls_from_text(
                remove_text_formatting_and_links_from_fragments(paragraph))
            for url in plaintext_urls:
                tags = classify_and_tag(url, DHNET_NETLOC, DHNET_INTERNAL_SITES)
                tags.update(["plaintext", "in text"])
                all_plaintext_urls.append(make_tagged_url(url, url, tags))
    else:
        all_cleaned_paragraphs = []

    return all_cleaned_paragraphs, in_text_tagged_urls + all_plaintext_urls + embedded_tweets
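
# The helpers extract_plaintext_urls_from_text and
# remove_text_formatting_and_links_from_fragments used above are defined
# elsewhere in the project. As a rough, hedged approximation of the former
# (an assumption, not the project's actual implementation), a regex-based
# stand-in could look like this:
import re

_PLAINTEXT_URL_RE = re.compile(r'https?://[^\s<>"]+')

def _sketch_extract_plaintext_urls(clean_text):
    # Return every bare http(s) URL left in text that was already stripped
    # of markup and <a> links.
    return _PLAINTEXT_URL_RE.findall(clean_text)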
def sanitize_paragraph(paragraph):
    """Returns plain text article"""
    sanitized_paragraph = [remove_text_formatting_markup_from_fragments(fragment, strip_chars='\t\r\n')
                           for fragment in paragraph.contents
                           if not isinstance(fragment, BeautifulSoup.Comment)]
    return ''.join(sanitized_paragraph)
def extract_intro(main_content):
    hat = main_content.find('div', {'id': 'articleHat'})
    if hat:
        return remove_text_formatting_markup_from_fragments(hat.contents, strip_chars=' \t\r\n')
    else:
        return u''
def extract_intro(soup):
    intro_box = soup.find(attrs={"class": "intro"})
    tagged_urls = []
    if intro_box:
        intro_fragments = intro_box.find_all('b')
        intro = utils.remove_text_formatting_markup_from_fragments(intro_fragments)

        inline_links = intro_box.find_all("a")
        titles_and_urls = [extract_title_and_url_from_bslink(i) for i in inline_links]
        plaintext_urls = utils.extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(intro))

        for title, url, base_tags in titles_and_urls:
            tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
            tags.update(base_tags)
            tags.add('in intro')
            tagged_urls.append(tagging.make_tagged_url(url, title, tags))

        for url in plaintext_urls:
            tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
            tags.add('in intro')
            tags.add('plaintext')
            tagged_urls.append(tagging.make_tagged_url(url, url, tags))
    else:
        intro = ""
    return intro, tagged_urls
def extract_text_content_and_links(soup):
    tagged_urls = list()
    inline_links = []
    text = list()

    article_body = soup.find(attrs={"class": "article-body"})
    text_fragments = article_body.find_all("p")
    other_fragments = article_body.find_all("h2", {"style": "display: inline; font-size: 1em; padding: 0px; margin: 0px;"})
    all_fragments = text_fragments + other_fragments

    if all_fragments:
        for paragraph in text_fragments:
            text.append(u"".join(remove_text_formatting_markup_from_fragments(paragraph)))
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(paragraph))
            for url in plaintext_urls:
                tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
                tags.update(['plaintext', 'in text'])
                tagged_urls.append(tagging.make_tagged_url(url, url, tags))
    else:
        text = u""

    for p in all_fragments:
        link = p.find_all("a")
        inline_links.extend(link)

    titles_and_urls = [extract_title_and_url_from_bslink(i) for i in inline_links]
    for title, url, base_tags in titles_and_urls:
        tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
        tags.update(base_tags)
        tags.add('in text')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    return text, tagged_urls
def extract_url_and_title(item):
    url = item.a.get('href')
    title = remove_text_formatting_markup_from_fragments(item.a.contents)
    tags = set()
    if not title:
        title = u'No Title'
        tags.add('ghost link')
    return url, title, tags
def cleanup_text_fragment(text_fragment):
    """
    Recursively cleans up a text fragment (e.g. nested tags).
    Returns a plain text string with no formatting info whatsoever.
    """
    if isinstance(text_fragment, bs.Tag):
        return remove_text_formatting_markup_from_fragments(text_fragment.contents)
    else:
        return text_fragment
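
# Hedged sketch of the counterpart helper (an assumption; the project's real
# remove_text_formatting_markup_from_fragments lives elsewhere and is richer):
# cleanup_text_fragment above recurses through it, so together the two
# functions flatten arbitrarily nested tags into plain text.
def _sketch_remove_markup_from_fragments(fragments):
    return u''.join(cleanup_text_fragment(f) for f in fragments)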
def extract_text_content_and_links(main_content):
    article_text = main_content.find('div', {'id': 'articleText'})

    in_text_tagged_urls = []
    all_rough_paragraphs = []
    all_clean_paragraphs = []
    all_plaintext_urls = []
    embedded_tweets = []

    def is_text_content(blob):
        if isinstance(blob, BeautifulSoup.Tag) and blob.name in TEXT_MARKUP_TAGS:
            return True
        if isinstance(blob, BeautifulSoup.NavigableString):
            return True
        return False

    text_fragments = [c for c in article_text.contents if is_text_content(c)]

    if text_fragments:
        for paragraph in text_fragments:
            if isinstance(paragraph, BeautifulSoup.NavigableString):
                all_clean_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph, strip_chars=' \t\r\n'))
                all_rough_paragraphs.append(paragraph)
            else:
                if not paragraph.find('blockquote', {'class': 'twitter-tweet'}):
                    in_text_links = extract_and_tag_in_text_links(paragraph)
                    in_text_tagged_urls.extend(in_text_links)
                    all_clean_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph, strip_chars=' \t\r\n'))
                    all_rough_paragraphs.append(paragraph)
                else:
                    embedded_tweets.extend(
                        twitter_utils.extract_rendered_tweet(paragraph, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES))

        for p in all_rough_paragraphs:
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(p))
            for url in plaintext_urls:
                tags = classify_and_tag(url, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
                tags.update(['plaintext', 'in text'])
                all_plaintext_urls.append(make_tagged_url(url, url, tags))
    else:
        all_clean_paragraphs = []

    return all_clean_paragraphs, in_text_tagged_urls + all_plaintext_urls + embedded_tweets
def extract_intro(main_article):
    left_column = main_article.find('div', {'id': 'leftCol'})
    intro_container = left_column.find('h2', recursive=False)
    if intro_container:
        intro = remove_text_formatting_markup_from_fragments(intro_container.contents)
    else:
        intro = None
    return intro
def extract_article_data(source):
    """ """
    if hasattr(source, "read"):
        html_content = source.read()
    else:
        try:
            html_content = fetch_html_content(source)
        except HTTPError as e:
            if e.code == 404:
                return None, None
            else:
                raise
        except Exception:
            raise

    soup = make_soup_from_html_content(html_content)
    main_content = soup.find("div", {"id": "maincontent"})

    if main_content and main_content.h1:
        title = remove_text_formatting_markup_from_fragments(main_content.h1.contents)
        pub_date, pub_time = extract_date_from_maincontent(main_content)
        category = extract_category_from_maincontent(main_content)
        author_name = extract_author_name_from_maincontent(main_content)

        article_text = main_content.find("div", {"id": "articleText"})
        if article_has_intro(article_text):
            intro = extract_intro_from_articletext(article_text)
            text, in_text_links = extract_text_content_and_links_from_articletext(article_text)
        else:
            intro = u""
            text, in_text_links = extract_text_content_and_links_from_articletext(article_text, False)

        audio_content_links = ipm_utils.extract_embedded_audio_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        sidebox_links = ipm_utils.extract_and_tag_associated_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        bottom_links = ipm_utils.extract_bottom_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        embedded_content_links = extract_links_to_embedded_content(main_content)
        all_links = in_text_links + sidebox_links + embedded_content_links + bottom_links + audio_content_links
        updated_tagged_urls = update_tagged_urls(all_links, ipm_utils.DHNET_SAME_OWNER)

        fetched_datetime = datetime.today()

        # print generate_test_func('twizz_stream', 'dhnet', dict(tagged_urls=updated_tagged_urls))
        # save_sample_data_file(html_content, source, 'twizz_stream', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/dhnet')
        # import os
        # generate_unittest("links_tweet_with_emoji", "dhnet", dict(urls=updated_tagged_urls), html_content, source, os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/dhnet"), True)

        new_article = ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                                  updated_tagged_urls,
                                  category, author_name,
                                  intro, text)
        return new_article, html_content
    else:
        return None, html_content
def extract_intro_from_articletext(article_text):
    """
    Finds the introduction paragraph, returns a string with the text.
    """
    # intro text seems to always be in the first paragraph.
    if article_has_intro(article_text):
        intro_paragraph = article_text.p
        return remove_text_formatting_markup_from_fragments(intro_paragraph.contents)
    # but sometimes there is no intro.
    else:
        return u""
def select_title_and_url(selector, tag_name):
    url = selector.select("./@href").extract()[0]
    title = selector.select(".//text()").extract()
    if title:
        title = remove_text_formatting_markup_from_fragments(title[0])
        tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        tags = tags.union([tag_name])
    else:
        tags = set([tag_name, constants.GHOST_LINK_TAG])
        title = constants.GHOST_LINK_TITLE
    return make_tagged_url(url, title, tags)
def extract_intro(soup):
    if soup.find(attrs={"class": "article-content"}).h3:
        intro_box = soup.find(attrs={"class": "article-content"})

        def extract_links_from_intro(fragment):
            tagged_urls = list()
            inline_links = fragment.find_all('a')
            titles_and_urls = [extract_title_and_url_from_bslink(i) for i in inline_links]
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(fragment))

            for title, url, base_tags in titles_and_urls:
                tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
                tags.update(base_tags)
                tags.add('in intro')
                tagged_urls.append(tagging.make_tagged_url(url, title, tags))

            for url in plaintext_urls:
                tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
                tags.add('in intro')
                tags.add('plaintext')
                tagged_urls.append(tagging.make_tagged_url(url, url, tags))

            return tagged_urls

        if len(intro_box.find("h3").contents) > 0:
            fragment = intro_box.find("h3").contents[0]
            tagged_urls = extract_links_from_intro(intro_box.find("h3"))
            intro = remove_text_formatting_markup_from_fragments(fragment, strip_chars='\t\r\n').rstrip()
            return intro, tagged_urls

        if intro_box.find("h3").find_next_sibling("p"):
            fragment = intro_box.find("h3").find_next_sibling("p")
            tagged_urls = extract_links_from_intro(fragment)
            intro = remove_text_formatting_markup_from_fragments(fragment, strip_chars='\t\r\n')
            return intro, tagged_urls
        else:
            return [], []
    else:
        return [], []
def extract_title_and_url_from_bslink(link):
    base_tags = []
    if link.get('href'):
        url = link.get('href')
    else:
        url = constants.GHOST_LINK_URL
        base_tags.append(constants.GHOST_LINK_TAG)

    if link.contents:
        title = remove_text_formatting_markup_from_fragments(link.contents)
    else:
        title = constants.GHOST_LINK_TITLE
        base_tags.append(constants.GHOST_LINK_TAG)

    return title, url, base_tags
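
# Hedged usage sketch (the demo name is hypothetical, the constants are the
# project's own): an anchor with neither an href nor contents falls back to
# the ghost-link constants on both checks, so the ghost tag is recorded
# twice in base_tags.
def _demo_ghost_link():
    from bs4 import BeautifulSoup
    empty = BeautifulSoup('<a></a>', 'html.parser').a
    title, url, base_tags = extract_title_and_url_from_bslink(empty)
    assert url == constants.GHOST_LINK_URL
    assert title == constants.GHOST_LINK_TITLE
    assert base_tags == [constants.GHOST_LINK_TAG, constants.GHOST_LINK_TAG]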
def extract_headlines_from_column_3(column):
    stories = column.findAll('div', {'class': 'octetFun'})
    last_story = column.findAll('div', {'class': 'octetFun noborder'})
    if last_story:
        stories.append(last_story[0])

    headlines = list()
    for story in stories:
        if story.h3.a.contents:
            clean_title = remove_text_formatting_markup_from_fragments(story.h3.a.contents)
            if story.h3.a.get('href'):
                title_and_url = clean_title, story.h3.a.get('href')
                headlines.append(title_and_url)
    return headlines
def extract_text_and_links_from_paragraph(paragraph_hxs):
    def separate_img_and_text_links(links):
        img_links = [l for l in links if l.select("./img")]
        text_links = [l for l in links if l not in img_links]
        return ([extract_title_and_url(link) for link in text_links],
                [extract_img_link_info(link) for link in img_links])

    links = paragraph_hxs.select(".//a")
    titles_and_urls, img_targets_and_urls = separate_img_and_text_links(links)

    tagged_urls = list()
    for title, url in titles_and_urls:
        tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
        tags.update(['in text'])
        if title == constants.GHOST_LINK_TITLE:
            tags.update([constants.GHOST_LINK_TAG])
        tagged_urls.append(make_tagged_url(url, title, tags))

    for img_target, url in img_targets_and_urls:
        tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
        tags.update(['in text', 'embedded image'])
        tagged_urls.append(make_tagged_url(url, img_target, tags))

    # plaintext urls
    text_fragments = paragraph_hxs.select("./text()").extract()
    if text_fragments:
        text = u"".join(remove_text_formatting_markup_from_fragments(text_fragments))
        for paragraph in text_fragments:
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(paragraph))
            for url in plaintext_urls:
                tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
                tags.update(['plaintext', 'in text'])
                tagged_urls.append(make_tagged_url(url, url, tags))
    else:
        text = u""

    # iframes
    iframes = paragraph_hxs.select(".//iframe")
    for iframe in iframes:
        target_url, tags = extract_and_tag_iframe_source(iframe)
        tagged_urls.append(make_tagged_url(target_url, "__EMBEDDED_IFRAME__", tags))

    return text, tagged_urls
def extract_external_links(main_article):
    container = main_article.find('div', {'class': 'art_ext_links'})
    if container:
        link_list = container.ul
        items = link_list.findAll('li')
        urls_and_titles = [(i.a.get('href'), remove_text_formatting_markup_from_fragments(i.a.contents))
                           for i in items]
        tagged_urls = list()
        for url, title in urls_and_titles:
            tags = classify_and_tag(url, RTLINFO_OWN_NETLOC, RTLINFO_INTERNAL_SITES)
            tagged_urls.append(make_tagged_url(url, title, tags))
        return tagged_urls
    else:
        return []
def extract_related_links(main_article):
    container = main_article.find('div', {'class': 'relatedArticles'})
    if container:
        left_list, right_list = container.findAll('ul')
        all_list_items = [link_list.findAll('li', recursive=False) for link_list in (left_list, right_list)]
        tagged_urls = list()
        for item in chain(*all_list_items):
            url, title = item.a.get('href'), remove_text_formatting_markup_from_fragments(item.a.contents)
            tags = classify_and_tag(url, RTLINFO_OWN_NETLOC, RTLINFO_INTERNAL_SITES)
            tags.add('associated')
            tagged_urls.append(make_tagged_url(url, title, tags))
        return tagged_urls
    else:
        return []
def extract_embedded_links_from_articlebody(article_body):
    embedded_links = list()

    for link in extract_usable_links(article_body):
        url = link.get('href')
        title = remove_text_formatting_markup_from_fragments(link.contents)
        tags = classify_and_tag(url, RTLINFO_OWN_NETLOC, RTLINFO_INTERNAL_SITES)
        tags.add('in text')
        embedded_links.append(make_tagged_url(url, title, tags))

    for embedded_video_frame in article_body.findAll('iframe'):
        url = embedded_video_frame.get('src')
        title = '[Video] {0}'.format(url)
        tags = classify_and_tag(url, RTLINFO_OWN_NETLOC, RTLINFO_INTERNAL_SITES)
        tags = tags.union(['in text', 'embedded'])
        embedded_links.append(make_tagged_url(url, title, tags))

    return embedded_links
def extract_links_and_text_content(main_article):
    article_body = main_article.find('div', {'class': 'articleBody rtl_margin_top_25'})
    embedded_links = extract_embedded_links_from_articlebody(article_body)

    all_paragraphs = article_body.findAll('p', recursive=False)
    cleaned_up_paragraphs = list()
    all_plaintext_urls = list()
    for p in all_paragraphs:
        paragraph = remove_text_formatting_markup_from_fragments(p.contents)
        plaintext_urls = extract_plaintext_urls_from_text(paragraph)
        for url in plaintext_urls:
            tags = classify_and_tag(url, RTLINFO_OWN_NETLOC, RTLINFO_INTERNAL_SITES)
            tags = tags.union(['in text', 'plaintext'])
            all_plaintext_urls.append(make_tagged_url(url, url, tags))
        cleaned_up_paragraphs.append(paragraph)

    all_links = embedded_links + all_plaintext_urls
    return all_links, cleaned_up_paragraphs
def extract_article_data_new_style(source, hxs):
    """ """
    category = hxs.select("//nav [contains(@id,'breadcrumb')]//li").extract()
    datetime_string = hxs.select("//div [@class='row content']//time/@datetime").extract()
    if not datetime_string:
        raise ValueError("Could not find the date, update the parser")

    parsed_datetime = datetime_from_iso8601(datetime_string[0])
    pub_date, pub_time = parsed_datetime.date(), parsed_datetime.time()
    fetched_datetime = datetime.now()

    title = hxs.select("//header//h1/text()").extract()
    if not title:
        raise ValueError()
    title = title[0]

    content_hxs = hxs.select("//div [@class='entry-content']")
    author_fragments = content_hxs.select(".//p [@class='copyright']/text()").extract()
    author = remove_text_formatting_markup_from_fragments(author_fragments, strip_chars='\r\n\t ')

    intro, intro_links = extract_intro_and_links_new(content_hxs)
    content, content_links = extract_content_and_links_new(content_hxs)

    other_div_hxs = content_hxs.select("//div [@class='entry-content']/div [not(contains(@class, 'entry-'))]")
    content_media_links = extract_links_from_other_divs(other_div_hxs)

    related_links = extract_related_links(hxs)
    media_links = extract_links_from_embbeded_media(content_hxs)
    tag_links = extract_links_from_tags(hxs)

    all_links = it.chain(intro_links, content_links, media_links, content_media_links, related_links, tag_links)
    updated_tagged_urls = update_tagged_urls(all_links, LAVENIR_SAME_OWNER)

    article_data = ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                               updated_tagged_urls, category, author, intro, content)
    return article_data
def extract_text_and_links_from_paragraph(paragraph):
    def extract_url_and_title(link):
        if isinstance(link.contents[0], bs.Tag):
            if link.contents[0].name == 'img':
                img_target = link.contents[0].get('src')
                return link.get('href'), '(img){0}'.format(img_target)
            else:
                title = remove_text_formatting_markup_from_fragments(link.contents)
                return link.get('href'), title
        else:
            return link.get('href'), remove_text_formatting_markup_from_fragments(link.contents)

    # Why do we filter on link.contents? Because sometimes there
    # are <a id="more"></a> links which point to nothing.
    # Awesome.
    urls_and_titles = [extract_url_and_title(link)
                       for link in paragraph.findAll('a', recursive=False)
                       if link.contents]

    tagged_urls = list()
    for url, title in urls_and_titles:
        tags = classify_and_tag(url, SUDPRESSE_OWN_NETLOC, SUDPRESSE_INTERNAL_SITES)
        tags.update(['in text'])
        tagged_urls.append(make_tagged_url(url, title, tags))

    text_fragments = paragraph.contents
    if text_fragments:
        text = u"".join(remove_text_formatting_markup_from_fragments(text_fragments))
        plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(text_fragments))
        for url in plaintext_urls:
            tags = classify_and_tag(url, SUDPRESSE_OWN_NETLOC, SUDPRESSE_INTERNAL_SITES)
            tags.update(['plaintext', 'in text'])
            tagged_urls.append(make_tagged_url(url, url, tags))
    else:
        text = u""

    return text, tagged_urls
def extract_text_content_and_links(soup):
    article_text = []
    inline_links = []
    plaintext_urls = []

    content_box = soup.find(attrs={"id": "detail_content"})
    text = content_box.find_all(attrs={"class": "clear"})
    for fragment in text:
        paragraphs = fragment.find_all("p", recursive=False)
        for p in paragraphs:
            clean_text = remove_text_formatting_markup_from_fragments(p, strip_chars="\n")
            if clean_text:
                article_text.append(clean_text)
            found_plaintext_links = utils.extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(p))
            plaintext_urls.extend(found_plaintext_links)
            link = p.find_all("a")
            inline_links.extend(link)

    titles_and_urls = [extract_title_and_url_from_bslink(i) for i in inline_links]

    tagged_urls = list()
    for title, url, base_tags in titles_and_urls:
        tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
        tags.update(base_tags)
        tags.add('in text')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    for url in plaintext_urls:
        tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
        tags.add('in text')
        tags.add('plaintext')
        tagged_urls.append(tagging.make_tagged_url(url, url, tags))

    return article_text, tagged_urls
def extract_text_content(story):
    """
    Finds the story's body, cleans up the text to remove all html formatting.
    Returns a list of strings, one per found paragraph, and all the
    plaintext urls, as TaggedURLs.
    """
    story = story.find('div', {'id': 'story_body'})
    paragraphs = story.findAll('p', recursive=False)

    tagged_urls = list()
    # extract regular, in text links
    inline_links = list()
    plaintext_urls = list()
    text = list()

    if paragraphs:
        for paragraph in paragraphs:
            text.append(u"".join(remove_text_formatting_markup_from_fragments(paragraph)))
            links = paragraph.findAll('a', recursive=True)
            inline_links.extend(links)
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(paragraph))
            for url in plaintext_urls:
                tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
                tags.update(['plaintext', 'in text'])
                tagged_urls.append(make_tagged_url(url, url, tags))

        titles_and_urls = [extract_title_and_url_from_bslink(i) for i in inline_links if not i.find('img')]
        for title, url, base_tags in titles_and_urls:
            tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
            tags.add('in text')
            tagged_urls.append(tagging.make_tagged_url(url, title, tags))
    else:
        text = u""

    return text, tagged_urls
def extract_title(main_article):
    left_column = main_article.find('div', {'id': 'leftCol'})
    title = left_column.find('h1', {'class': 'rtl_font_weight_normal'})
    return remove_text_formatting_markup_from_fragments(title.contents)
def extract_category(soup):
    category_box = soup.find(attrs={"class": "actua_nav"})
    links = category_box.find_all('a')
    return [utils.remove_text_formatting_markup_from_fragments(link.contents[0]) for link in links]
def find_embedded_media_in_multimedia_box(multimedia_box):
    tagged_urls = list()
    all_sections = multimedia_box.findAll("section")
    for section in all_sections:
        if 'photo' in section.attrs['class']:
            continue
        elif 'poll' in section.attrs['class']:
            continue
        elif 'asset' in section.attrs['class']:
            url = section.find('a').get('href')
            title = section.find('a').contents[0]
            tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
            tags.add('embedded')
            tagged_urls.append(tagging.make_tagged_url(url, title, tags))
        elif 'video' in section.attrs['class']:
            # it might be an iframe
            if section.find("iframe"):
                iframe = section.find("iframe")
                url = iframe.get("src")
                if url:
                    tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
                    tags.add('embedded')
                    tags.add('iframe')
                    tagged_urls.append(tagging.make_tagged_url(url, url, tags))
                else:
                    raise ValueError("There seems to be an iframe but we could not find a link. Please update the parser.")
            elif section.find("embed"):
                embedded_stuff = section.find("embed")
                url = embedded_stuff.get("src")
                if url:
                    tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
                    tags.add('embedded')
                    tagged_urls.append(tagging.make_tagged_url(url, url, tags))
                else:
                    raise ValueError("There seems to be an embedded video but we could not find a link. Please update the parser.")
            else:
                raise ValueError("There seems to be an embedded video but we could not identify it. Please update the parser.")
        elif 'snippet' in section.attrs['class']:
            # it might be a tweet
            tweets = section.find_all(attrs={"class": "twitter-tweet"})
            if tweets:
                for tweet in tweets:
                    links = tweet.find_all("a")
                    for link in links:
                        if link.get("data-datetime"):
                            url = link.get("href")
                            tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
                            tags.add('embedded')
                            tags.add('tweet')
                            tagged_urls.append(tagging.make_tagged_url(url, url, tags))

            # it might be an embedded javascript object that shows a twitter account or query
            twitter_widget = section.find_all(attrs={"class": "tweet_widget"})
            if twitter_widget:
                if len(twitter_widget) == 1:
                    if twitter_widget[0].find('script'):
                        script_url = twitter_widget[0].find('script').get('src')
                        if twitter_utils.is_twitter_widget_url(script_url):
                            title, url, tags = twitter_utils.get_widget_type(twitter_widget[0].findAll('script')[1].contents[0])
                            tags |= tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
                            tags |= set(['script', 'embedded'])
                            tagged_urls.append(tagging.make_tagged_url(url, title, tags))
                        elif section.find("script"):
                            script_url = section.find('script').get('src')
                            if twitter_utils.is_twitter_widget_url(script_url):
                                title, url, tags = twitter_utils.get_widget_type(section.findAll('script')[1].contents[0])
                                tags |= tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
                                tags |= set(['script', 'embedded'])
                                tagged_urls.append(tagging.make_tagged_url(url, title, tags))
                            else:
                                raise ValueError("Embedded script of unknown type was detected ('{0}'). Update the parser.".format(script_url))
                    elif twitter_widget[0].find('noscript'):
                        noscript = twitter_widget[0].find('noscript')
                        link = noscript.find('a')
                        if link:
                            url = link.get('href')
                            title = remove_text_formatting_markup_from_fragments(link.contents)
                            all_tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
                            all_tags |= set(['script', 'embedded'])
                            tagged_urls.append(tagging.make_tagged_url(url, title, all_tags))
                        else:
                            raise ValueError("No link was found in the <noscript> section. Update the parser.")
                    else:
                        raise ValueError("Could not extract fallback noscript url for this embedded javascript object. Update the parser.")
                else:
                    raise ValueError("There seems to be more than one embedded twitter widget in the SNIPPET, check this.")

            # it might be a spotify container
            spotify_widget = section.find(attrs={"class": "spotify"})
            if spotify_widget:
                if spotify_widget.find("iframe").get("src"):
                    url = spotify_widget.find("iframe").get("src")
                    all_tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
                    all_tags |= set(['spotify', 'embedded'])
                    tagged_urls.append(tagging.make_tagged_url(url, url, all_tags))
                else:
                    raise ValueError("There seems to be a spotify widget but we could not find a link.")
        else:
            raise ValueError("There seems to be an undefined embedded media here, you should check.")
    return tagged_urls
def sanitize_paragraph(paragraph):
    """Removes image links, paragraph markers, and formatting"""
    return remove_text_formatting_markup_from_fragments(paragraph)
def extract_link_and_title(link):
    return link.get("href"), remove_text_formatting_markup_from_fragments(link.contents)