def extract_text_content_and_links_from_articletext(main_content, has_intro=True):
    """Collect cleaned paragraphs and tagged links from an article text node.

    The direct children of ``main_content`` are filtered down to text
    fragments: ``bs.NavigableString`` instances and ``bs.Tag`` instances
    whose name is listed in ``TEXT_MARKUP_TAGS``.  Tag fragments that contain
    a ``blockquote`` with class ``twitter-tweet`` are handed to
    ``twitter_utils.extract_rendered_tweet`` and are not treated as text.

    Args:
        main_content: node whose ``contents`` attribute is iterated
            (presumably a BeautifulSoup tag -- confirm against callers).
        has_intro: accepted for interface compatibility; this function does
            not read it.

    Returns:
        A ``(paragraphs, links)`` tuple.  ``paragraphs`` holds the result of
        ``remove_text_formatting_markup_from_fragments`` for every kept
        fragment.  ``links`` is the concatenation, in this order, of the
        in-text tagged links, the plaintext tagged URLs and the embedded
        tweets.  Both lists are empty when there is no text fragment.
    """

    def _is_textual(node):
        # Same test order as before: markup tags first, then bare strings.
        return (
            (isinstance(node, bs.Tag) and node.name in TEXT_MARKUP_TAGS)
            or isinstance(node, bs.NavigableString)
        )

    fragments = [node for node in main_content.contents if _is_textual(node)]
    if not fragments:
        return [], []

    cleaned = []
    raw = []
    tagged_links = []
    tweets = []

    # First pass: keep embedded tweets out of the text, gather everything else.
    for fragment in fragments:
        is_tag = not isinstance(fragment, bs.NavigableString)

        if is_tag and fragment.find("blockquote", {"class": "twitter-tweet"}):
            tweets.extend(
                twitter_utils.extract_rendered_tweet(fragment, DHNET_NETLOC, DHNET_INTERNAL_SITES)
            )
            continue

        if is_tag:
            # Only tags are scanned for in-text links; bare strings are not.
            tagged_links.extend(extract_and_tag_in_text_links(fragment))

        cleaned.append(remove_text_formatting_markup_from_fragments(fragment))
        raw.append(fragment)

    # Second pass: URLs written out as plain text inside the kept fragments.
    plaintext_links = []
    for fragment in raw:
        stripped = remove_text_formatting_and_links_from_fragments(fragment)
        for url in extract_plaintext_urls_from_text(stripped):
            tags = classify_and_tag(url, DHNET_NETLOC, DHNET_INTERNAL_SITES)
            tags.update(["plaintext", "in text"])
            plaintext_links.append(make_tagged_url(url, url, tags))

    return cleaned, tagged_links + plaintext_links + tweets
def extract_text_content_and_links(main_content):
    """Collect cleaned paragraphs and tagged links from the article text div.

    The ``div`` with id ``articleText`` is looked up inside ``main_content``
    and its direct children are filtered down to text fragments:
    ``BeautifulSoup.NavigableString`` instances and ``BeautifulSoup.Tag``
    instances whose name is listed in ``TEXT_MARKUP_TAGS``.  Tag fragments
    containing a ``blockquote`` with class ``twitter-tweet`` are handed to
    ``twitter_utils.extract_rendered_tweet`` and are not treated as text.

    NOTE(review): the lookup result is used without a check; if ``find``
    yields ``None`` (presumably when the div is absent), reading
    ``.contents`` raises ``AttributeError`` -- confirm callers expect that.

    Args:
        main_content: node providing ``find`` (presumably a BeautifulSoup
            tag -- confirm against callers).

    Returns:
        A ``(paragraphs, links)`` tuple.  ``paragraphs`` holds the result of
        ``remove_text_formatting_markup_from_fragments`` (called with an
        explicit whitespace ``strip_chars``) for every kept fragment.
        ``links`` is the concatenation, in this order, of the in-text tagged
        links, the plaintext tagged URLs and the embedded tweets.  Both lists
        are empty when there is no text fragment.
    """
    text_container = main_content.find('div', {'id': 'articleText'})

    def _keep(child):
        # Same test order as before: markup tags first, then bare strings.
        if isinstance(child, BeautifulSoup.Tag) and child.name in TEXT_MARKUP_TAGS:
            return True
        return isinstance(child, BeautifulSoup.NavigableString)

    kept_children = [child for child in text_container.contents if _keep(child)]
    if not kept_children:
        return [], []

    clean_paragraphs = []
    rough_paragraphs = []
    link_tags = []
    tweet_tags = []

    # First pass: route each fragment to either the tweet list or the text lists.
    for child in kept_children:
        if isinstance(child, BeautifulSoup.NavigableString):
            holds_tweet = False
        else:
            holds_tweet = bool(child.find('blockquote', {'class': 'twitter-tweet'}))
            if not holds_tweet:
                # Only non-tweet tags are scanned for in-text links.
                link_tags.extend(extract_and_tag_in_text_links(child))

        if holds_tweet:
            tweet_tags.extend(
                twitter_utils.extract_rendered_tweet(child, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
            )
        else:
            clean_paragraphs.append(
                remove_text_formatting_markup_from_fragments(child, strip_chars=' \t\r\n')
            )
            rough_paragraphs.append(child)

    # Second pass: URLs written out as plain text inside the kept fragments.
    plaintext_tags = []
    for rough in rough_paragraphs:
        bare_text = remove_text_formatting_and_links_from_fragments(rough)
        found_urls = extract_plaintext_urls_from_text(bare_text)
        for url in found_urls:
            tags = classify_and_tag(url, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
            tags.update(['plaintext', 'in text'])
            plaintext_tags.append(make_tagged_url(url, url, tags))

    return clean_paragraphs, link_tags + plaintext_tags + tweet_tags