def extract_links_from_text_hxs(hxs):
    tagged_urls = list()

    # in-text urls: take all the <a>, except what might be inside a rendered tweet
    intext_link_hxs = hxs.select(".//a")
    for link_hxs in intext_link_hxs:
        title, url = extract_title_and_url(link_hxs)
        tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        tags.add('in text')
        tagged_urls.append(make_tagged_url(url, title, tags))

    # plaintext urls
    raw_content = hxs.select(".//p/text()").extract()
    if raw_content:
        for paragraph in raw_content:
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(paragraph))
            for url in plaintext_urls:
                tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
                tags.update(['plaintext', 'in text'])
                tagged_urls.append(make_tagged_url(url, url, tags))

    # embedded objects
    iframe_sources = hxs.select(".//iframe/@src").extract()
    for url in iframe_sources:
        tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        tags = tags.union(['in text', 'embedded', 'iframe'])
        tagged_urls.append(make_tagged_url(url, url, tags))

    return tagged_urls

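# Hedged usage sketch, not part of the original module: how this extractor
# might be driven from a raw HTML string with scrapy's old-style
# HtmlXPathSelector. The `html_content` parameter is an illustrative
# assumption; the "entry-body" container is the one this parser uses below.
def _demo_extract_links_from_text_hxs(html_content):
    from scrapy.selector import HtmlXPathSelector
    hxs = HtmlXPathSelector(text=html_content)
    body_hxs = hxs.select("//div [@class='entry-body']")  # assumed article container
    return extract_links_from_text_hxs(body_hxs)
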
def extract_text_content_and_links(soup):
    tagged_urls = list()
    inline_links = []
    text = list()

    article_body = soup.find(attrs={"class": "article-body"})
    text_fragments = article_body.find_all("p")
    other_fragments = article_body.find_all("h2", {"style": "display: inline; font-size: 1em; padding: 0px; margin: 0px;"})
    all_fragments = text_fragments + other_fragments

    if all_fragments:
        # text comes from the <p> fragments only; the inline <h2> fragments
        # are scanned for links in the loop below
        for paragraph in text_fragments:
            text.append(u"".join(remove_text_formatting_markup_from_fragments(paragraph)))
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(paragraph))
            for url in plaintext_urls:
                tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
                tags.update(['plaintext', 'in text'])
                tagged_urls.append(tagging.make_tagged_url(url, url, tags))
    else:
        text = u""

    for p in all_fragments:
        link = p.find_all("a")
        inline_links.extend(link)

    titles_and_urls = [extract_title_and_url_from_bslink(i) for i in inline_links]
    for title, url, base_tags in titles_and_urls:
        tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
        tags.update(base_tags)
        tags.add('in text')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    return text, tagged_urls

def extract_links_from_sidebar_box(soup):
    tagged_urls = list()
    sidebar_box = soup.find(attrs={"class": "teas_article_306 mar10 clear clearfix relatedcomponents"})

    # there are links to articles
    if sidebar_box:
        articles = sidebar_box.find_all(attrs={"class": "clearfix"})
        links = articles[0].find_all("a")
        titles_and_urls = [extract_title_and_url_from_bslink(link) for link in links]
        for title, url, base_tags in titles_and_urls:
            tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
            tags.update(base_tags)
            tags.add('sidebar box')
            tagged_urls.append(tagging.make_tagged_url(url, title, tags))

        # and also links to thematic tags
        keyword_boxes = sidebar_box.find_all(attrs={"class": "bt_meer_over clearfix"})
        for keyword_box in keyword_boxes:
            links = keyword_box.find_all("a")
            titles_and_urls = [extract_title_and_url_from_bslink(link) for link in links]
            for title, url, base_tags in titles_and_urls:
                tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
                tags.update(base_tags)
                tags.add('keyword')
                tags.add('sidebar box')
                tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    return tagged_urls

def extract_intro(soup):
    intro_box = soup.find(attrs={"class": "intro"})
    tagged_urls = []
    if intro_box:
        intro_fragments = intro_box.find_all('b')
        intro = utils.remove_text_formatting_markup_from_fragments(intro_fragments)
        inline_links = intro_box.find_all("a")
        titles_and_urls = [extract_title_and_url_from_bslink(i) for i in inline_links]
        plaintext_urls = utils.extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(intro))

        for title, url, base_tags in titles_and_urls:
            tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
            tags.update(base_tags)
            tags.add('in intro')
            tagged_urls.append(tagging.make_tagged_url(url, title, tags))

        for url in plaintext_urls:
            tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
            tags.add('in intro')
            tags.add('plaintext')
            tagged_urls.append(tagging.make_tagged_url(url, url, tags))
    else:
        intro = ""
    return intro, tagged_urls

def extract_links_from_embedded_content(story):
    tagged_urls = []

    # generic iframes
    iframe_items = story.findAll("iframe", recursive=True)
    for iframe in iframe_items:
        url = iframe.get('src')
        all_tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
        tagged_urls.append(make_tagged_url(url, url, all_tags | set(['embedded', 'iframe'])))

    # extract embedded storify
    scripts = story.findAll('script', recursive=True)
    for script in scripts:
        url = script.get('src')
        if url:
            scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
            if netloc == "storify.com":
                # strip the literal ".js" suffix (rstrip(".js") would eat any
                # trailing '.', 'j' or 's' characters instead)
                if url.endswith(".js"):
                    url = url[:-len(".js")]
                all_tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
                tagged_urls.append(make_tagged_url(url, url, all_tags | set(['embedded', 'storify'])))

    # reconstruct the kplayer URL
    kplayer = story.find('div', {'class': 'containerKplayer'})
    if kplayer:
        kplayer_flash = kplayer.find('div', {'class': 'flash_kplayer'})
        url_part1 = kplayer_flash.object['data']
        url_part2 = kplayer_flash.object.find('param', {'name': 'flashVars'})['value']
        if url_part1 is not None and url_part2 is not None:
            url = "%s?%s" % (url_part1, url_part2)
            all_tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
            tagged_urls.append(make_tagged_url(url, url, all_tags | set(['embedded', 'kplayer'])))
        else:
            raise ValueError("We couldn't find a URL in the flash player. Update the parser.")

    return tagged_urls

def extract_tagged_url_from_embedded_script(script, site_netloc, site_internal_sites):
    if script.get('src'):
        script_url = script.get('src')
        if twitter_utils.is_twitter_widget_url(script_url):
            if script.contents:
                title, url, tags = twitter_utils.get_widget_type(script.contents[0])
            else:
                # sometimes the TWTR.Widget code is in the next <script> container. Whee.
                sibling_script = script.findNextSibling('script')
                title, url, tags = twitter_utils.get_widget_type(sibling_script.contents[0])
            tags |= classify_and_tag(url, site_netloc, site_internal_sites)
            tags |= set(['script', 'embedded'])
            return make_tagged_url(url, title, tags)
        else:
            if script.findNextSibling('noscript'):
                noscript = script.findNextSibling('noscript')
                link = noscript.find('a')
                if link:
                    url = link.get('href')
                    title = remove_text_formatting_markup_from_fragments(link.contents)
                    all_tags = classify_and_tag(url, site_netloc, site_internal_sites)
                    all_tags |= set(['script', 'embedded'])
                    return make_tagged_url(url, title, all_tags)
                else:
                    raise ValueError("No link was found in the <noscript> section. Update the parser.")
            else:
                raise ValueError("Could not extract fallback noscript url for this embedded javascript object. Update the parser.")
    else:
        raise ValueError("Embedded script of unknown type was detected. Update the parser.")

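# Hedged usage sketch (assumption, not original code): drive the
# embedded-script extractor over every <script> in an article body parsed
# with BeautifulSoup, skipping the unknown embed types it refuses.
def _demo_extract_all_embedded_scripts(article_body, netloc, internal_sites):
    tagged_urls = []
    for script in article_body.findAll('script', recursive=True):
        try:
            tagged_urls.append(extract_tagged_url_from_embedded_script(script, netloc, internal_sites))
        except ValueError:
            continue  # unknown embed type; a real parser might log this instead
    return tagged_urls
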
def extract_associated_links(hxs):
    links = hxs.select("//div[@id='picture']/descendant::div[@class='bloc-01']//a")
    all_tagged_urls = []

    if links:
        def extract_url_and_title(link_hxs):
            url = link_hxs.select('@href').extract()[0]
            title = u"".join(link_hxs.select("text()").extract())
            tags = set()
            if not title:
                title = u'No Title'
                tags.add(constants.GHOST_LINK_TAG)
            if not url:
                url = u''
                tags.add('no target')
            return url, title, tags

        for item in links:
            url, title, tags = extract_url_and_title(item)
            tags.update(classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES))
            link_type = item.select('@class').extract()
            if link_type and link_type[0] in LINK_TYPE_TO_TAG:
                tags.update(LINK_TYPE_TO_TAG[link_type[0]])
            tags.add("sidebar box")
            all_tagged_urls.append(make_tagged_url(url, title, tags))

    media_links = hxs.select("//div[@id='picture']/descendant::div[@class='wrappAllMedia']/div")
    for i, item in enumerate(media_links):
        if item.select('./img'):
            pass  # images are lame
        elif item.select(".//div[starts-with(@id, 'media-youtube')]"):
            youtube_div = item.select(".//div[starts-with(@id, 'media-youtube')]")
            youtube_object = youtube_div.select("./object")
            url = hxs_media_utils.extract_url_from_youtube_object(youtube_object)
            tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
            tags |= set(['youtube', 'embedded', 'video'])
            title = parser_constants.NO_TITLE
            all_tagged_urls.append(make_tagged_url(url, title, tags))
        elif item.select(".//div[contains(@class, 'emvideo-kewego')]"):
            kplayer_div = item.select(".//div[contains(@class, 'emvideo-kewego')]")
            kplayer_object = kplayer_div.select("./object")
            url = hxs_media_utils.extract_url_from_kplayer_object(kplayer_object)
            tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
            tags |= set(['kewego', 'embedded', 'video'])
            title = parser_constants.NO_TITLE
            all_tagged_urls.append(make_tagged_url(url, title, tags))
        elif not item.select("./div/text()"):
            pass  # empty divs are lame
        else:
            raise ValueError("The media box contains something other than an image or a youtube video. Update your parser")

    return all_tagged_urls

def extract_associated_links(article):
    links_block = article.find('div', {'class': 'bloc-01'})

    if links_block:
        link_list = links_block.find('ul')

        def extract_url_and_title(item):
            url = item.a.get('href')
            title = remove_text_formatting_markup_from_fragments(item.a.contents)
            tags = set()
            if not title:
                title = u'No Title'
                tags.add('ghost link')
            return url, title, tags

        all_tagged_urls = list()
        for item in link_list.findAll('li'):
            url, title, tags = extract_url_and_title(item)
            tags.update(classify_and_tag(url, SUDPRESSE_OWN_NETLOC, SUDPRESSE_INTERNAL_SITES))
            link_type = item.get('class')
            if link_type in LINK_TYPE_TO_TAG:
                tags.update(LINK_TYPE_TO_TAG[link_type])
            tags.add("sidebar box")
            all_tagged_urls.append(make_tagged_url(url, title, tags))
        return all_tagged_urls
    else:
        return []

def extract_content_and_links(hxs):
    content_paragraphs_hxs = hxs.select("//div [@id='article']/p[starts-with(@class, 'publiele')]/following-sibling::p")
    all_content_paragraphs, all_tagged_urls = list(), list()

    # process paragraphs
    for p in content_paragraphs_hxs:
        text, tagged_urls = extract_text_and_links_from_paragraph(p)
        all_content_paragraphs.append(text)
        all_tagged_urls.extend(tagged_urls)

    # extract embedded videos
    divs = hxs.select("//div [@id='article']/p[starts-with(@class, 'publiele')]/following-sibling::div/div [@class='bottomVideos']")
    for div in divs:
        urls = div.select("./div [contains(@class, 'emvideo-kewego')]//video/@poster").extract()
        for url in urls:
            tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
            tags.update(['bottom', 'video', 'embedded'])
            all_tagged_urls.append(make_tagged_url(url, url, tags))

    new_media_items = hxs.select("//div [@class='digital-wally_digitalobject']//li")
    all_tagged_urls.extend(extract_links_from_media_items(new_media_items))

    return all_content_paragraphs, all_tagged_urls

def extract_embedded_media(soup):
    tagged_urls = list()

    # extract embedded media from any iframe in the article body
    content_box = soup.find(attrs={"id": "detail_content"})
    text = content_box.find_all(attrs={"class": "clear"})
    for fragment in text:
        for p in fragment.find_all("p", recursive=False):
            embedded_container = p.findAll("iframe")
            for x in embedded_container:
                url = x.get("src")
                tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
                tags.add('embedded')
                tags.add('iframe')
                tags.add('in text')
                tagged_urls.append(tagging.make_tagged_url(url, url, tags))

    # some embedded media are not in the article body, but embedded in the art_aside container
    art_aside = soup.find_all(attrs={"class": "art_aside"})
    if art_aside:
        for section in art_aside:
            tagged_urls.extend(find_embedded_media_in_multimedia_box(section))

    # same, but in the art_bottom container
    art_bottom = soup.find_all(attrs={"class": "art_bottom"})
    if art_bottom:
        for section in art_bottom:
            tagged_urls.extend(find_embedded_media_in_multimedia_box(section))

    return tagged_urls

def extract_embedded_audio_links(main_content, netloc, associated_sites):
    strong_article_links = main_content.find("div", {"id": "strongArticleLinks"})
    if not strong_article_links:
        return []

    embedded_audio_link_list = strong_article_links.find("ul", {"id": "audioContents"})
    if not embedded_audio_link_list:
        return []

    tagged_urls = []
    for item in embedded_audio_link_list.findAll("li", recursive=False):
        if item.object:
            flash_obj = item.object
            data_url = flash_obj.get("data")
            if data_url:
                source_url = media_utils.extract_source_url_from_dewplayer(data_url)
                title = item.text
                tags = classify_and_tag(source_url, netloc, associated_sites)
                tags |= set(["sidebar box", "audio", "embedded"])
                tagged_url = make_tagged_url(source_url, title, tags)
                tagged_urls.append(tagged_url)
            else:
                raise ValueError("Could not find the source url for the flash object. Fix your parser.")
        else:
            raise ValueError("Could not find the flash object for embedded audio. Fix your parser.")

    return tagged_urls

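# Hedged sketch of what media_utils.extract_source_url_from_dewplayer is
# assumed to do: dewplayer embeds usually carry the audio target in an mp3=
# query parameter (e.g. ".../dewplayer.swf?mp3=http://host/file.mp3"). The
# real helper lives in media_utils; this stand-in is illustrative only.
def _extract_source_url_from_dewplayer_sketch(dewplayer_url):
    import urlparse
    query = urlparse.urlparse(dewplayer_url).query
    return urlparse.parse_qs(query)['mp3'][0]
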
def extract_text_and_links_from_paragraph(paragraph_hxs):
    def separate_img_and_text_links(links):
        img_links = [l for l in links if l.select("./img")]
        text_links = [l for l in links if l not in img_links]
        return [extract_title_and_url(link) for link in text_links], [extract_img_link_info(link) for link in img_links]

    links = paragraph_hxs.select(".//a")
    titles_and_urls, img_targets_and_urls = separate_img_and_text_links(links)

    tagged_urls = list()
    for title, url in titles_and_urls:
        tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
        tags.update(['in text'])
        if title == constants.GHOST_LINK_TITLE:
            tags.update([constants.GHOST_LINK_TAG])
        tagged_urls.append(make_tagged_url(url, title, tags))

    for img_target, url in img_targets_and_urls:
        tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
        tags.update(['in text', 'embedded image'])
        tagged_urls.append(make_tagged_url(url, img_target, tags))

    # plaintext urls
    text_fragments = paragraph_hxs.select("./text()").extract()
    if text_fragments:
        text = u"".join(remove_text_formatting_markup_from_fragments(text_fragments))
        for paragraph in text_fragments:
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(paragraph))
            for url in plaintext_urls:
                tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
                tags.update(['plaintext', 'in text'])
                tagged_urls.append(make_tagged_url(url, url, tags))
    else:
        text = u""

    # iframes
    iframes = paragraph_hxs.select(".//iframe")
    for iframe in iframes:
        target_url, tags = extract_and_tag_iframe_source(iframe)
        tagged_urls.append(make_tagged_url(target_url, "__EMBEDDED_IFRAME__", tags))

    return text, tagged_urls

def extract_links_from_embbeded_media(content_hxs):
    body_hxs = content_hxs.select(".//div [@class='entry-body']")
    tagged_urls = []
    for script_hxs in body_hxs.select('./script'):
        snippet = script_hxs.select('./text()').extract()
        if len(snippet) > 0 and media_utils.ignore_snippet(snippet[0]):
            continue
        script_src = script_hxs.select("./@src").extract()
        if not script_src:
            raise ValueError("Found a <script> with no src attr.")

        if script_src[0].startswith("//platform.twitter.com/widgets.js"):
            previous_blockquote = script_hxs.select("./preceding-sibling::blockquote[1]")
            if previous_blockquote:
                # the class attribute may hold several names (e.g. alignment
                # classes), so look for 'twitter-tweet' inside it
                blockquote_classes = previous_blockquote[0].select("./@class").extract()
                if blockquote_classes and 'twitter-tweet' in blockquote_classes[0]:
                    url = previous_blockquote.select('./a[last()]/@href').extract()[0]
                    tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
                    title = u"[RENDERED TWEET]"
                    tags |= set(['embedded', 'tweet'])
                    tagged_urls.append(make_tagged_url(url, title, tags))
                else:
                    raise ValueError("This blockquote does not appear to be a tweet.")
            else:
                raise ValueError("Found a twitter widget <script> without its companion blockquote.")
        elif script_src[0].startswith("http://storify.com"):
            url = script_src[0]
            title = constants.RENDERED_STORIFY_TITLE
            tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
            tags |= set(['embedded', 'storify'])
            tagged_urls.append(make_tagged_url(url, title, tags))
        else:
            noscript_hxs = script_hxs.select('./following-sibling::noscript[1]')
            if noscript_hxs:
                link_hxs = noscript_hxs.select('a')
                title, url = extract_title_and_url(link_hxs)
                tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
                title = constants.RENDERED_TWEET_TITLE
                tags |= set(['embedded'])
                tagged_urls.append(make_tagged_url(url, title, tags))
            else:
                raise ValueError("Found an embedded <script> with no fallback <noscript> link. Update the parser.")
    return tagged_urls

def extract_kplayer_infos(kplayer_flash, title, site_netloc, site_internal_sites):
    url_part1 = kplayer_flash.object["data"]
    url_part2 = kplayer_flash.object.find("param", {"name": "flashVars"})["value"]
    if url_part1 is not None and url_part2 is not None:
        url = "%s?%s" % (url_part1, url_part2)
        all_tags = classify_and_tag(url, site_netloc, site_internal_sites)
        return make_tagged_url(url, title, all_tags | set(["video", "embedded", "kplayer"]))
    else:
        raise ValueError("We couldn't find a URL in the flash player. Update the parser.")

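# Worked example (made-up values) of the URL reconstruction performed above:
# the swf location from <object data="..."> plus its flashVars query string.
def _demo_kplayer_url():
    url_part1 = "http://sll.kewego.com/swf/p3/epix.swf"  # object['data'] (illustrative)
    url_part2 = "language_code=fr&playerKey=abc123"      # flashVars value (illustrative)
    return "%s?%s" % (url_part1, url_part2)
    # -> "http://sll.kewego.com/swf/p3/epix.swf?language_code=fr&playerKey=abc123"
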
def extract_and_tag_url_from_iframe(item):
    embedded_frame = item.select(".//iframe")
    if embedded_frame:
        target_url, tags = extract_and_tag_iframe_source(embedded_frame)
        tags |= classify_and_tag(target_url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
        # same placeholder title used for in-text iframes elsewhere in this parser
        return make_tagged_url(target_url, "__EMBEDDED_IFRAME__", tags)
    else:
        return None

def extract_tagged_url_from_associated_link(link_list_item, netloc, associated_sites, additional_tags=[]):
    # sometimes list items are used to show things which aren't links
    # but more like unclickable ads
    url = link_list_item.a.get("href")
    title = remove_text_formatting_markup_from_fragments(link_list_item.a.contents).strip()
    tags = classify_and_tag(url, netloc, associated_sites)
    tags |= set(additional_tags)
    tagged_url = make_tagged_url(url, title, tags)
    return tagged_url

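# Hedged usage sketch (not original code): tag every <li> of an
# associated-links <ul>, skipping the unclickable, link-less items the comment
# above warns about. The 'bottom box' tag is an illustrative choice.
def _demo_tag_associated_link_list(link_list, netloc, associated_sites):
    tagged_urls = []
    for item in link_list.findAll('li'):
        if item.a:  # guard: extract_tagged_url_from_associated_link assumes item.a exists
            tagged_urls.append(extract_tagged_url_from_associated_link(
                item, netloc, associated_sites, additional_tags=['bottom box']))
    return tagged_urls
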
def extract_embedded_links_from_articlebody(article_body):
    embedded_links = list()

    for link in extract_usable_links(article_body):
        url = link.get('href')
        title = remove_text_formatting_markup_from_fragments(link.contents)
        tags = classify_and_tag(url, RTLINFO_OWN_NETLOC, RTLINFO_INTERNAL_SITES)
        tags.add('in text')
        embedded_links.append(make_tagged_url(url, title, tags))

    for embedded_video_frame in article_body.findAll('iframe'):
        url = embedded_video_frame.get('src')
        title = '[Video] {0}'.format(url)
        tags = classify_and_tag(url, RTLINFO_OWN_NETLOC, RTLINFO_INTERNAL_SITES)
        tags = tags.union(['in text', 'embedded'])
        embedded_links.append(make_tagged_url(url, title, tags))

    return embedded_links

def extract_links_from_intro(fragment):
    tagged_urls = list()
    inline_links = fragment.find_all('a')
    titles_and_urls = [extract_title_and_url_from_bslink(i) for i in inline_links]
    plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(fragment))

    for title, url, base_tags in titles_and_urls:
        tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
        tags.update(base_tags)
        tags.add('in intro')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    for url in plaintext_urls:
        tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
        tags.add('in intro')
        tags.add('plaintext')
        tagged_urls.append(tagging.make_tagged_url(url, url, tags))

    return tagged_urls

def extract_links_from_tags(hxs):
    tag_navbar_hxs = hxs.select("//nav [@class='entry-tags']")
    tagged_urls = list()
    for link_hxs in tag_navbar_hxs.select("./ul/li/a"):
        title, url = extract_title_and_url(link_hxs)
        tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        tags |= set(['keyword'])
        tagged_urls.append(make_tagged_url(url, title, tags))
    return tagged_urls

def extract_related_links(hxs):
    aside_hxs = hxs.select("//div//aside [@class='entry-related']")
    tagged_urls = list()
    related_link_hxs = aside_hxs.select(".//ul/li//a")
    for link_hxs in related_link_hxs:
        title, url = extract_title_and_url(link_hxs)
        tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        tags |= set(['bottom box', 'related'])
        tagged_urls.append(make_tagged_url(url, title, tags))
    return tagged_urls

def select_title_and_url(selector, tag_name):
    url = selector.select("./@href").extract()[0]
    title = selector.select(".//text()").extract()
    if title:
        title = remove_text_formatting_markup_from_fragments(title[0])
        tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        tags = tags.union([tag_name])
    else:
        tags = set([tag_name, constants.GHOST_LINK_TAG])
        title = constants.GHOST_LINK_TITLE
    return make_tagged_url(url, title, tags)

def classify_and_make_tagged_url(urls_and_titles, additional_tags=set()):
    """
    Classify (with tags) every element in a list of (url, title) tuples
    Returns a list of TaggedURLs
    """
    tagged_urls = []
    for url, title in urls_and_titles:
        tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
        if is_on_same_domain(url):
            tags.update(['internal site'])
        tagged_urls.append(make_tagged_url(url, title, tags | additional_tags))
    return tagged_urls

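# Example call with made-up values: classify two scraped links and stamp both
# with a shared provenance tag. (Assumes the LESOIR constants are in scope.)
def _demo_classify_and_make_tagged_url():
    urls_and_titles = [
        (u'http://www.lesoir.be/sports/some-article', u'Some article'),  # illustrative
        (u'http://example.com/elsewhere', u'External link'),             # illustrative
    ]
    return classify_and_make_tagged_url(urls_and_titles, additional_tags=set(['sidebar box']))
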
def extract_embedded_media_in_article(soup):
    tagged_urls = list()
    story = soup.find(attrs={'class': 'article-body'})
    scripts = story.findAll('script', recursive=True)
    for script in scripts:
        url = script.get('src')
        if url:
            scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
            if netloc == "storify.com":
                # strip the literal ".js" suffix (rstrip strips characters, not a suffix)
                if url.endswith(".js"):
                    url = url[:-len(".js")]
                all_tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
                tagged_urls.append(tagging.make_tagged_url(url, url, all_tags | set(['embedded', 'storify'])))
    return tagged_urls

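# Why the explicit suffix slice above: str.rstrip(".js") strips any run of the
# characters '.', 'j', 's' from the right, not the literal ".js" suffix. A
# small self-checking demonstration (URLs are made up):
def _demo_rstrip_pitfall():
    url = "http://storify.com/user/paris.js"
    assert url.rstrip(".js") == "http://storify.com/user/pari"    # over-strips
    assert url[:-len(".js")] == "http://storify.com/user/paris"   # intended
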
def extract_links_from_sidebar_box(soup):
    tagged_urls = list()
    sidebar_boxes = soup.find_all('div', {'class': 'box alt'})
    if sidebar_boxes:
        for sidebar_box in sidebar_boxes:
            links = sidebar_box.find_all('a')
            titles_and_urls = [extract_title_and_url_from_bslink(link) for link in links]
            for title, url, base_tags in titles_and_urls:
                tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
                tags.update(base_tags)
                tags.add('sidebar box')
                tagged_urls.append(tagging.make_tagged_url(url, title, tags))
    return tagged_urls

def extract_text_and_links_from_paragraph(paragraph):
    def extract_url_and_title(link):
        if isinstance(link.contents[0], bs.Tag) and link.contents[0].name == 'img':
            img_target = link.contents[0].get('src')
            return link.get('href'), '(img){0}'.format(img_target)
        else:
            return link.get('href'), remove_text_formatting_markup_from_fragments(link.contents)

    # Why do we filter on link.contents? Because sometimes there
    # are <a id="more"></a> links which point to nothing.
    # Awesome.
    urls_and_titles = [extract_url_and_title(link) for link in paragraph.findAll('a', recursive=False) if link.contents]

    tagged_urls = list()
    for url, title in urls_and_titles:
        tags = classify_and_tag(url, SUDPRESSE_OWN_NETLOC, SUDPRESSE_INTERNAL_SITES)
        tags.update(['in text'])
        tagged_urls.append(make_tagged_url(url, title, tags))

    text_fragments = paragraph.contents
    if text_fragments:
        text = u"".join(remove_text_formatting_markup_from_fragments(text_fragments))
        plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(text_fragments))
        for url in plaintext_urls:
            tags = classify_and_tag(url, SUDPRESSE_OWN_NETLOC, SUDPRESSE_INTERNAL_SITES)
            tags.update(['plaintext', 'in text'])
            tagged_urls.append(make_tagged_url(url, url, tags))
    else:
        text = u""

    return text, tagged_urls

def extract_embedded_media(article):
    tagged_urls = list()

    # extract any iframe from maincontent
    iframes = article.findAll("iframe")
    for media in iframes:
        url = media.get('src')
        tags = classify_and_tag(url, SUDPRESSE_OWN_NETLOC, SUDPRESSE_INTERNAL_SITES)
        tags.add('embedded')
        tags.add('iframe')
        tagged_url = make_tagged_url(url, url, tags)
        tagged_urls.append(tagged_url)

    return tagged_urls

def extract_article_tags(soup):
    tagged_urls = list()
    meta_box = soup.find(attrs={"class": "meta"})
    tag_box = meta_box.find(attrs={'class': 'tags'})
    if tag_box:
        links = tag_box.find_all("a")
        titles_and_urls = [extract_title_and_url_from_bslink(link) for link in links]
        for title, url, base_tags in titles_and_urls:
            tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
            tags.update(base_tags)
            tags.add('keyword')
            tagged_urls.append(tagging.make_tagged_url(url, title, tags))
    return tagged_urls

def classify_and_make_tagged_url(urls_and_titles, additional_tags=set()):
    """
    Classify (with tags) every element in a list of (url, title) tuples
    Returns a list of TaggedURLs
    """
    tagged_urls = []
    for url, title in urls_and_titles:
        tags = classify_and_tag(url, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        if is_on_same_domain(url):
            tags = tags.union(["internal site", "internal"])
        all_tags = tags.union(additional_tags)
        tagged_urls.append(make_tagged_url(url, title, all_tags))
    return tagged_urls

def extract_text_content_and_links_from_articletext(main_content, has_intro=True):
    article_text = main_content

    in_text_tagged_urls = []
    all_cleaned_paragraphs = []
    all_rough_paragraphs = []
    all_plaintext_urls = []
    embedded_tweets = []

    def is_text_content(blob):
        if isinstance(blob, bs.Tag) and blob.name in TEXT_MARKUP_TAGS:
            return True
        if isinstance(blob, bs.NavigableString):
            return True
        return False

    text_fragments = [c for c in article_text.contents if is_text_content(c)]

    if text_fragments:
        # we first need to avoid treating embedded tweets as text
        for paragraph in text_fragments:
            if isinstance(paragraph, bs.NavigableString):
                all_cleaned_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph))
                all_rough_paragraphs.append(paragraph)
            else:
                if not paragraph.find("blockquote", {"class": "twitter-tweet"}):
                    in_text_links = extract_and_tag_in_text_links(paragraph)
                    in_text_tagged_urls.extend(in_text_links)
                    all_cleaned_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph))
                    all_rough_paragraphs.append(paragraph)
                else:
                    embedded_tweets.extend(
                        twitter_utils.extract_rendered_tweet(paragraph, DHNET_NETLOC, DHNET_INTERNAL_SITES))

        # extracting plaintext links
        for paragraph in all_rough_paragraphs:
            plaintext_urls = extract_plaintext_urls_from_text(
                remove_text_formatting_and_links_from_fragments(paragraph))
            for url in plaintext_urls:
                tags = classify_and_tag(url, DHNET_NETLOC, DHNET_INTERNAL_SITES)
                tags.update(["plaintext", "in text"])
                all_plaintext_urls.append(make_tagged_url(url, url, tags))
    else:
        all_cleaned_paragraphs = []

    return all_cleaned_paragraphs, in_text_tagged_urls + all_plaintext_urls + embedded_tweets

def extract_text_content(story):
    """
    Finds the story's body, cleans up the text to remove all html formatting.
    Returns a list of strings, one per found paragraph, and all the
    plaintext urls, as TaggedURLs
    """
    story = story.find('div', {'id': 'story_body'})
    paragraphs = story.findAll('p', recursive=False)
    tagged_urls = list()

    # extract regular, in text links
    inline_links = list()
    text = list()
    if paragraphs:
        for paragraph in paragraphs:
            text.append(u"".join(remove_text_formatting_markup_from_fragments(paragraph)))
            links = paragraph.findAll('a', recursive=True)
            inline_links.extend(links)
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(paragraph))
            for url in plaintext_urls:
                tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
                tags.update(['plaintext', 'in text'])
                tagged_urls.append(make_tagged_url(url, url, tags))
    else:
        text = u""

    titles_and_urls = [extract_title_and_url_from_bslink(i) for i in inline_links if not i.find('img')]
    for title, url, base_tags in titles_and_urls:
        tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
        tags.update(base_tags)
        tags.add('in text')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    return text, tagged_urls