def extract_url_and_title(link):
    if isinstance(link.contents[0], bs.Tag):
        if link.contents[0].name == 'img':
            img_target = link.contents[0].get('src')
            return link.get('href'), '(img){0}'.format(img_target)
        else:
            title = remove_text_formatting_markup_from_fragments(link.contents)
            return link.get('href'), title
    else:
        return link.get('href'), remove_text_formatting_markup_from_fragments(link.contents)
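
# Hedged usage sketch, not part of the original source: it shows how
# extract_url_and_title above turns an <a> wrapping an <img> into an
# '(img)...' title, while a plain anchor keeps its visible text. It assumes
# the module's `bs` alias is bs4-compatible; the demo name is hypothetical.
def _demo_extract_url_and_title():
    from bs4 import BeautifulSoup
    img_link = BeautifulSoup('<a href="/gallery"><img src="pic.png"/></a>', 'html.parser').a
    # The <img> child takes the first branch, so no text cleanup is needed.
    assert extract_url_and_title(img_link) == ('/gallery', '(img)pic.png')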
def extract_text_content_and_links_from_articletext(main_content, has_intro=True):
    article_text = main_content

    in_text_tagged_urls = []
    all_cleaned_paragraphs = []
    all_rough_paragraphs = []
    all_plaintext_urls = []
    embedded_tweets = []

    def is_text_content(blob):
        if isinstance(blob, bs.Tag) and blob.name in TEXT_MARKUP_TAGS:
            return True
        if isinstance(blob, bs.NavigableString):
            return True
        return False

    text_fragments = [c for c in article_text.contents if is_text_content(c)]

    if text_fragments:
        # we first need to avoid treating embedded tweets as text
        for paragraph in text_fragments:
            if isinstance(paragraph, bs.NavigableString):
                all_cleaned_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph))
                all_rough_paragraphs.append(paragraph)
            else:
                if not paragraph.find("blockquote", {"class": "twitter-tweet"}):
                    in_text_links = extract_and_tag_in_text_links(paragraph)
                    in_text_tagged_urls.extend(in_text_links)
                    all_cleaned_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph))
                    all_rough_paragraphs.append(paragraph)
                else:
                    embedded_tweets.extend(
                        twitter_utils.extract_rendered_tweet(paragraph, DHNET_NETLOC, DHNET_INTERNAL_SITES))

        # extracting plaintext links
        for paragraph in all_rough_paragraphs:
            plaintext_urls = extract_plaintext_urls_from_text(
                remove_text_formatting_and_links_from_fragments(paragraph))
            for url in plaintext_urls:
                tags = classify_and_tag(url, DHNET_NETLOC, DHNET_INTERNAL_SITES)
                tags.update(["plaintext", "in text"])
                all_plaintext_urls.append(make_tagged_url(url, url, tags))
    else:
        all_cleaned_paragraphs = []

    return all_cleaned_paragraphs, in_text_tagged_urls + all_plaintext_urls + embedded_tweets
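
# The helpers extract_plaintext_urls_from_text and
# remove_text_formatting_and_links_from_fragments used above are defined
# elsewhere in the project. As a rough, hedged approximation of the former
# (an assumption, not the project's actual implementation), a regex-based
# stand-in could look like this:
import re

_PLAINTEXT_URL_RE = re.compile(r'https?://[^\s<>"]+')

def _sketch_extract_plaintext_urls(clean_text):
    # Return every bare http(s) URL left in text that was already stripped
    # of markup and <a> links.
    return _PLAINTEXT_URL_RE.findall(clean_text)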
def sanitize_paragraph(paragraph):
    """Returns plain text article"""
    sanitized_paragraph = [remove_text_formatting_markup_from_fragments(fragment, strip_chars='\t\r\n')
                           for fragment in paragraph.contents
                           if not isinstance(fragment, BeautifulSoup.Comment)]
    return ''.join(sanitized_paragraph)
def extract_intro(main_content):
    hat = main_content.find('div', {'id': 'articleHat'})
    if hat:
        return remove_text_formatting_markup_from_fragments(hat.contents, strip_chars=' \t\r\n')
    else:
        return u''
def extract_intro(soup):
    intro_box = soup.find(attrs={"class": "intro"})
    tagged_urls = []
    if intro_box:
        intro_fragments = intro_box.find_all('b')
        intro = utils.remove_text_formatting_markup_from_fragments(intro_fragments)

        inline_links = intro_box.find_all("a")
        titles_and_urls = [extract_title_and_url_from_bslink(i) for i in inline_links]
        plaintext_urls = utils.extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(intro))

        for title, url, base_tags in titles_and_urls:
            tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
            tags.update(base_tags)
            tags.add('in intro')
            tagged_urls.append(tagging.make_tagged_url(url, title, tags))

        for url in plaintext_urls:
            tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
            tags.add('in intro')
            tags.add('plaintext')
            tagged_urls.append(tagging.make_tagged_url(url, url, tags))
    else:
        intro = ""
    return intro, tagged_urls
def extract_text_content_and_links(soup):
    tagged_urls = list()
    inline_links = []
    text = list()

    article_body = soup.find(attrs={"class": "article-body"})
    text_fragments = article_body.find_all("p")
    other_fragments = article_body.find_all("h2", {"style": "display: inline; font-size: 1em; padding: 0px; margin: 0px;"})
    all_fragments = text_fragments + other_fragments

    if all_fragments:
        for paragraph in text_fragments:
            text.append(u"".join(remove_text_formatting_markup_from_fragments(paragraph)))
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(paragraph))
            for url in plaintext_urls:
                tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
                tags.update(['plaintext', 'in text'])
                tagged_urls.append(tagging.make_tagged_url(url, url, tags))
    else:
        text = u""

    for p in all_fragments:
        link = p.find_all("a")
        inline_links.extend(link)

    titles_and_urls = [extract_title_and_url_from_bslink(i) for i in inline_links]
    for title, url, base_tags in titles_and_urls:
        tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
        tags.update(base_tags)
        tags.add('in text')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    return text, tagged_urls
def extract_url_and_title(item):
    url = item.a.get('href')
    title = remove_text_formatting_markup_from_fragments(item.a.contents)
    tags = set()
    if not title:
        title = u'No Title'
        tags.add('ghost link')
    return url, title, tags
def cleanup_text_fragment(text_fragment):
    """
    Recursively cleans up a text fragment (e.g. nested tags).
    Returns a plain text string with no formatting info whatsoever.
    """
    if isinstance(text_fragment, bs.Tag):
        return remove_text_formatting_markup_from_fragments(text_fragment.contents)
    else:
        return text_fragment
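
# Hedged sketch of the counterpart helper (an assumption; the project's real
# remove_text_formatting_markup_from_fragments lives elsewhere and is richer):
# cleanup_text_fragment above recurses through it, so together the two
# functions flatten arbitrarily nested tags into plain text.
def _sketch_remove_markup_from_fragments(fragments):
    return u''.join(cleanup_text_fragment(f) for f in fragments)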
def extract_text_content_and_links(main_content):
    article_text = main_content.find('div', {'id': 'articleText'})

    in_text_tagged_urls = []
    all_rough_paragraphs = []
    all_clean_paragraphs = []
    all_plaintext_urls = []
    embedded_tweets = []

    def is_text_content(blob):
        if isinstance(blob, BeautifulSoup.Tag) and blob.name in TEXT_MARKUP_TAGS:
            return True
        if isinstance(blob, BeautifulSoup.NavigableString):
            return True
        return False

    text_fragments = [c for c in article_text.contents if is_text_content(c)]

    if text_fragments:
        for paragraph in text_fragments:
            if isinstance(paragraph, BeautifulSoup.NavigableString):
                all_clean_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph, strip_chars=' \t\r\n'))
                all_rough_paragraphs.append(paragraph)
            else:
                if not paragraph.find('blockquote', {'class': 'twitter-tweet'}):
                    in_text_links = extract_and_tag_in_text_links(paragraph)
                    in_text_tagged_urls.extend(in_text_links)
                    all_clean_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph, strip_chars=' \t\r\n'))
                    all_rough_paragraphs.append(paragraph)
                else:
                    embedded_tweets.extend(
                        twitter_utils.extract_rendered_tweet(paragraph, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES))

        for p in all_rough_paragraphs:
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(p))
            for url in plaintext_urls:
                tags = classify_and_tag(url, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
                tags.update(['plaintext', 'in text'])
                all_plaintext_urls.append(make_tagged_url(url, url, tags))
    else:
        all_clean_paragraphs = []

    return all_clean_paragraphs, in_text_tagged_urls + all_plaintext_urls + embedded_tweets
def extract_intro(main_article):
    left_column = main_article.find('div', {'id': 'leftCol'})
    intro_container = left_column.find('h2', recursive=False)
    if intro_container:
        intro = remove_text_formatting_markup_from_fragments(intro_container.contents)
    else:
        intro = None
    return intro
def extract_article_data(source):
    """ """
    if hasattr(source, "read"):
        html_content = source.read()
    else:
        try:
            html_content = fetch_html_content(source)
        except HTTPError as e:
            if e.code == 404:
                return None, None
            else:
                raise
        except Exception:
            raise

    soup = make_soup_from_html_content(html_content)
    main_content = soup.find("div", {"id": "maincontent"})

    if main_content and main_content.h1:
        title = remove_text_formatting_markup_from_fragments(main_content.h1.contents)
        pub_date, pub_time = extract_date_from_maincontent(main_content)
        category = extract_category_from_maincontent(main_content)
        author_name = extract_author_name_from_maincontent(main_content)

        article_text = main_content.find("div", {"id": "articleText"})
        if article_has_intro(article_text):
            intro = extract_intro_from_articletext(article_text)
            text, in_text_links = extract_text_content_and_links_from_articletext(article_text)
        else:
            intro = u""
            text, in_text_links = extract_text_content_and_links_from_articletext(article_text, False)

        audio_content_links = ipm_utils.extract_embedded_audio_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        sidebox_links = ipm_utils.extract_and_tag_associated_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        bottom_links = ipm_utils.extract_bottom_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        embedded_content_links = extract_links_to_embedded_content(main_content)
        all_links = in_text_links + sidebox_links + embedded_content_links + bottom_links + audio_content_links
        updated_tagged_urls = update_tagged_urls(all_links, ipm_utils.DHNET_SAME_OWNER)

        fetched_datetime = datetime.today()

        # print generate_test_func('twizz_stream', 'dhnet', dict(tagged_urls=updated_tagged_urls))
        # save_sample_data_file(html_content, source, 'twizz_stream', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/dhnet')
        # import os
        # generate_unittest("links_tweet_with_emoji", "dhnet", dict(urls=updated_tagged_urls), html_content, source, os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/dhnet"), True)

        new_article = ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                                  updated_tagged_urls,
                                  category, author_name,
                                  intro, text)
        return new_article, html_content
    else:
        return None, html_content
def extract_intro_from_articletext(article_text):
    """
    Finds the introduction paragraph, returns a string with the text.
    """
    # intro text seems to always be in the first paragraph.
    if article_has_intro(article_text):
        intro_paragraph = article_text.p
        return remove_text_formatting_markup_from_fragments(intro_paragraph.contents)
    # but sometimes there is no intro.
    else:
        return u""
def select_title_and_url(selector, tag_name):
    url = selector.select("./@href").extract()[0]
    title = selector.select(".//text()").extract()
    if title:
        title = remove_text_formatting_markup_from_fragments(title[0])
        tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        tags = tags.union([tag_name])
    else:
        tags = set([tag_name, constants.GHOST_LINK_TAG])
        title = constants.GHOST_LINK_TITLE
    return make_tagged_url(url, title, tags)
def extract_intro(soup):
    if soup.find(attrs={"class": "article-content"}).h3:
        intro_box = soup.find(attrs={"class": "article-content"})

        def extract_links_from_intro(fragment):
            tagged_urls = list()
            inline_links = fragment.find_all('a')
            titles_and_urls = [extract_title_and_url_from_bslink(i) for i in inline_links]
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(fragment))

            for title, url, base_tags in titles_and_urls:
                tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
                tags.update(base_tags)
                tags.add('in intro')
                tagged_urls.append(tagging.make_tagged_url(url, title, tags))

            for url in plaintext_urls:
                tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
                tags.add('in intro')
                tags.add('plaintext')
                tagged_urls.append(tagging.make_tagged_url(url, url, tags))

            return tagged_urls

        if len(intro_box.find("h3").contents) > 0:
            fragment = intro_box.find("h3").contents[0]
            tagged_urls = extract_links_from_intro(intro_box.find("h3"))
            intro = remove_text_formatting_markup_from_fragments(fragment, strip_chars='\t\r\n').rstrip()
            return intro, tagged_urls

        if intro_box.find("h3").find_next_sibling("p"):
            fragment = intro_box.find("h3").find_next_sibling("p")
            tagged_urls = extract_links_from_intro(fragment)
            intro = remove_text_formatting_markup_from_fragments(fragment, strip_chars='\t\r\n')
            return intro, tagged_urls
        else:
            return [], []
    else:
        return [], []
def extract_title_and_url_from_bslink(link):
    base_tags = []
    if link.get('href'):
        url = link.get('href')
    else:
        url = constants.GHOST_LINK_URL
        base_tags.append(constants.GHOST_LINK_TAG)

    if link.contents:
        title = remove_text_formatting_markup_from_fragments(link.contents)
    else:
        title = constants.GHOST_LINK_TITLE
        base_tags.append(constants.GHOST_LINK_TAG)

    return title, url, base_tags
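
# Hedged usage sketch (the demo name is hypothetical, the constants are the
# project's own): an anchor with neither an href nor contents falls back to
# the ghost-link constants on both checks, so the ghost tag is recorded
# twice in base_tags.
def _demo_ghost_link():
    from bs4 import BeautifulSoup
    empty = BeautifulSoup('<a></a>', 'html.parser').a
    title, url, base_tags = extract_title_and_url_from_bslink(empty)
    assert url == constants.GHOST_LINK_URL
    assert title == constants.GHOST_LINK_TITLE
    assert base_tags == [constants.GHOST_LINK_TAG, constants.GHOST_LINK_TAG]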
def extract_headlines_from_column_3(column):
    stories = column.findAll('div', {'class': 'octetFun'})
    last_story = column.findAll('div', {'class': 'octetFun noborder'})
    if last_story:
        stories.append(last_story[0])

    headlines = list()
    for story in stories:
        if story.h3.a.contents:
            clean_title = remove_text_formatting_markup_from_fragments(story.h3.a.contents)
            if story.h3.a.get('href'):
                title_and_url = clean_title, story.h3.a.get('href')
                headlines.append(title_and_url)
    return headlines
def extract_text_and_links_from_paragraph(paragraph_hxs):
    def separate_img_and_text_links(links):
        img_links = [l for l in links if l.select("./img")]
        text_links = [l for l in links if l not in img_links]
        return ([extract_title_and_url(link) for link in text_links],
                [extract_img_link_info(link) for link in img_links])

    links = paragraph_hxs.select(".//a")
    titles_and_urls, img_targets_and_urls = separate_img_and_text_links(links)

    tagged_urls = list()
    for title, url in titles_and_urls:
        tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
        tags.update(['in text'])
        if title == constants.GHOST_LINK_TITLE:
            tags.update([constants.GHOST_LINK_TAG])
        tagged_urls.append(make_tagged_url(url, title, tags))

    for img_target, url in img_targets_and_urls:
        tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
        tags.update(['in text', 'embedded image'])
        tagged_urls.append(make_tagged_url(url, img_target, tags))

    # plaintext urls
    text_fragments = paragraph_hxs.select("./text()").extract()
    if text_fragments:
        text = u"".join(remove_text_formatting_markup_from_fragments(text_fragments))
        for paragraph in text_fragments:
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(paragraph))
            for url in plaintext_urls:
                tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
                tags.update(['plaintext', 'in text'])
                tagged_urls.append(make_tagged_url(url, url, tags))
    else:
        text = u""

    # iframes
    iframes = paragraph_hxs.select(".//iframe")
    for iframe in iframes:
        target_url, tags = extract_and_tag_iframe_source(iframe)
        tagged_urls.append(make_tagged_url(target_url, "__EMBEDDED_IFRAME__", tags))

    return text, tagged_urls
def extract_external_links(main_article):
    container = main_article.find('div', {'class': 'art_ext_links'})
    if container:
        link_list = container.ul
        items = link_list.findAll('li')
        urls_and_titles = [(i.a.get('href'), remove_text_formatting_markup_from_fragments(i.a.contents))
                           for i in items]
        tagged_urls = list()
        for url, title in urls_and_titles:
            tags = classify_and_tag(url, RTLINFO_OWN_NETLOC, RTLINFO_INTERNAL_SITES)
            tagged_urls.append(make_tagged_url(url, title, tags))
        return tagged_urls
    else:
        return []
def extract_related_links(main_article):
    container = main_article.find('div', {'class': 'relatedArticles'})
    if container:
        left_list, right_list = container.findAll('ul')
        all_list_items = [link_list.findAll('li', recursive=False) for link_list in (left_list, right_list)]
        tagged_urls = list()
        for item in chain(*all_list_items):
            url, title = item.a.get('href'), remove_text_formatting_markup_from_fragments(item.a.contents)
            tags = classify_and_tag(url, RTLINFO_OWN_NETLOC, RTLINFO_INTERNAL_SITES)
            tags.add('associated')
            tagged_urls.append(make_tagged_url(url, title, tags))
        return tagged_urls
    else:
        return []
def extract_embedded_links_from_articlebody(article_body):
    embedded_links = list()

    for link in extract_usable_links(article_body):
        url = link.get('href')
        title = remove_text_formatting_markup_from_fragments(link.contents)
        tags = classify_and_tag(url, RTLINFO_OWN_NETLOC, RTLINFO_INTERNAL_SITES)
        tags.add('in text')
        embedded_links.append(make_tagged_url(url, title, tags))

    for embedded_video_frame in article_body.findAll('iframe'):
        url = embedded_video_frame.get('src')
        title = '[Video] {0}'.format(url)
        tags = classify_and_tag(url, RTLINFO_OWN_NETLOC, RTLINFO_INTERNAL_SITES)
        tags = tags.union(['in text', 'embedded'])
        embedded_links.append(make_tagged_url(url, title, tags))

    return embedded_links
def extract_links_and_text_content(main_article):
    article_body = main_article.find('div', {'class': 'articleBody rtl_margin_top_25'})
    embedded_links = extract_embedded_links_from_articlebody(article_body)

    all_paragraphs = article_body.findAll('p', recursive=False)
    cleaned_up_paragraphs = list()
    all_plaintext_urls = list()
    for p in all_paragraphs:
        paragraph = remove_text_formatting_markup_from_fragments(p.contents)
        plaintext_urls = extract_plaintext_urls_from_text(paragraph)
        for url in plaintext_urls:
            tags = classify_and_tag(url, RTLINFO_OWN_NETLOC, RTLINFO_INTERNAL_SITES)
            tags = tags.union(['in text', 'plaintext'])
            all_plaintext_urls.append(make_tagged_url(url, url, tags))
        cleaned_up_paragraphs.append(paragraph)

    all_links = embedded_links + all_plaintext_urls
    return all_links, cleaned_up_paragraphs
def extract_article_data_new_style(source, hxs):
    """ """
    category = hxs.select("//nav [contains(@id,'breadcrumb')]//li").extract()
    datetime_string = hxs.select("//div [@class='row content']//time/@datetime").extract()
    if not datetime_string:
        raise ValueError("Could not find the date, update the parser")

    parsed_datetime = datetime_from_iso8601(datetime_string[0])
    pub_date, pub_time = parsed_datetime.date(), parsed_datetime.time()
    fetched_datetime = datetime.now()

    title = hxs.select("//header//h1/text()").extract()
    if not title:
        raise ValueError()
    title = title[0]

    content_hxs = hxs.select("//div [@class='entry-content']")
    author_fragments = content_hxs.select(".//p [@class='copyright']/text()").extract()
    author = remove_text_formatting_markup_from_fragments(author_fragments, strip_chars='\r\n\t ')

    intro, intro_links = extract_intro_and_links_new(content_hxs)
    content, content_links = extract_content_and_links_new(content_hxs)

    other_div_hxs = content_hxs.select("//div [@class='entry-content']/div [not(contains(@class, 'entry-'))]")
    content_media_links = extract_links_from_other_divs(other_div_hxs)

    related_links = extract_related_links(hxs)
    media_links = extract_links_from_embbeded_media(content_hxs)
    tag_links = extract_links_from_tags(hxs)

    all_links = it.chain(intro_links, content_links, media_links, content_media_links, related_links, tag_links)
    updated_tagged_urls = update_tagged_urls(all_links, LAVENIR_SAME_OWNER)

    article_data = ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                               updated_tagged_urls, category, author, intro, content)
    return article_data
def extract_text_and_links_from_paragraph(paragraph):
    def extract_url_and_title(link):
        if isinstance(link.contents[0], bs.Tag):
            if link.contents[0].name == 'img':
                img_target = link.contents[0].get('src')
                return link.get('href'), '(img){0}'.format(img_target)
            else:
                title = remove_text_formatting_markup_from_fragments(link.contents)
                return link.get('href'), title
        else:
            return link.get('href'), remove_text_formatting_markup_from_fragments(link.contents)

    # Why do we filter on link.contents? Because sometimes there
    # are <a id="more"></a> links which point to nothing.
    # Awesome.
    urls_and_titles = [extract_url_and_title(link)
                       for link in paragraph.findAll('a', recursive=False)
                       if link.contents]

    tagged_urls = list()
    for url, title in urls_and_titles:
        tags = classify_and_tag(url, SUDPRESSE_OWN_NETLOC, SUDPRESSE_INTERNAL_SITES)
        tags.update(['in text'])
        tagged_urls.append(make_tagged_url(url, title, tags))

    text_fragments = paragraph.contents
    if text_fragments:
        text = u"".join(remove_text_formatting_markup_from_fragments(text_fragments))
        plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(text_fragments))
        for url in plaintext_urls:
            tags = classify_and_tag(url, SUDPRESSE_OWN_NETLOC, SUDPRESSE_INTERNAL_SITES)
            tags.update(['plaintext', 'in text'])
            tagged_urls.append(make_tagged_url(url, url, tags))
    else:
        text = u""

    return text, tagged_urls
def extract_text_content_and_links(soup):
    article_text = []
    inline_links = []
    plaintext_urls = []

    content_box = soup.find(attrs={"id": "detail_content"})
    text = content_box.find_all(attrs={"class": "clear"})
    for fragment in text:
        paragraphs = fragment.find_all("p", recursive=False)
        for p in paragraphs:
            clean_text = remove_text_formatting_markup_from_fragments(p, strip_chars="\n")
            if clean_text:
                article_text.append(clean_text)
            found_plaintext_links = utils.extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(p))
            plaintext_urls.extend(found_plaintext_links)
            link = p.find_all("a")
            inline_links.extend(link)

    titles_and_urls = [extract_title_and_url_from_bslink(i) for i in inline_links]

    tagged_urls = list()
    for title, url, base_tags in titles_and_urls:
        tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
        tags.update(base_tags)
        tags.add('in text')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    for url in plaintext_urls:
        tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
        tags.add('in text')
        tags.add('plaintext')
        tagged_urls.append(tagging.make_tagged_url(url, url, tags))

    return article_text, tagged_urls
def extract_text_content(story):
    """
    Finds the story's body, cleans up the text to remove all html formatting.
    Returns a list of strings, one per found paragraph, and all the
    plaintext urls, as TaggedURLs.
    """
    story = story.find('div', {'id': 'story_body'})
    paragraphs = story.findAll('p', recursive=False)

    tagged_urls = list()
    # extract regular, in text links
    inline_links = list()
    plaintext_urls = list()
    text = list()

    if paragraphs:
        for paragraph in paragraphs:
            text.append(u"".join(remove_text_formatting_markup_from_fragments(paragraph)))
            links = paragraph.findAll('a', recursive=True)
            inline_links.extend(links)
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(paragraph))
            for url in plaintext_urls:
                tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
                tags.update(['plaintext', 'in text'])
                tagged_urls.append(make_tagged_url(url, url, tags))

        titles_and_urls = [extract_title_and_url_from_bslink(i) for i in inline_links if not i.find('img')]
        for title, url, base_tags in titles_and_urls:
            tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
            tags.add('in text')
            tagged_urls.append(tagging.make_tagged_url(url, title, tags))
    else:
        text = u""

    return text, tagged_urls
def extract_title(main_article):
    left_column = main_article.find('div', {'id': 'leftCol'})
    title = left_column.find('h1', {'class': 'rtl_font_weight_normal'})
    return remove_text_formatting_markup_from_fragments(title.contents)
def extract_category(soup):
    category_box = soup.find(attrs={"class": "actua_nav"})
    links = category_box.find_all('a')
    return [utils.remove_text_formatting_markup_from_fragments(link.contents[0]) for link in links]
def find_embedded_media_in_multimedia_box(multimedia_box):
    tagged_urls = list()
    all_sections = multimedia_box.findAll("section")
    for section in all_sections:
        if 'photo' in section.attrs['class']:
            continue
        elif 'poll' in section.attrs['class']:
            continue
        elif 'asset' in section.attrs['class']:
            url = section.find('a').get('href')
            title = section.find('a').contents[0]
            tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
            tags.add('embedded')
            tagged_urls.append(tagging.make_tagged_url(url, title, tags))
        elif 'video' in section.attrs['class']:
            # it might be an iframe
            if section.find("iframe"):
                iframe = section.find("iframe")
                url = iframe.get("src")
                if url:
                    tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
                    tags.add('embedded')
                    tags.add('iframe')
                    tagged_urls.append(tagging.make_tagged_url(url, url, tags))
                else:
                    raise ValueError("There seems to be an iframe but we could not find a link. Please update the parser.")
            elif section.find("embed"):
                embedded_stuff = section.find("embed")
                url = embedded_stuff.get("src")
                if url:
                    tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
                    tags.add('embedded')
                    tagged_urls.append(tagging.make_tagged_url(url, url, tags))
                else:
                    raise ValueError("There seems to be an embedded video but we could not find a link. Please update the parser.")
            else:
                raise ValueError("There seems to be an embedded video but we could not identify it. Please update the parser.")
        elif 'snippet' in section.attrs['class']:
            # it might be a tweet
            tweets = section.find_all(attrs={"class": "twitter-tweet"})
            if tweets:
                for tweet in tweets:
                    links = tweet.find_all("a")
                    for link in links:
                        if link.get("data-datetime"):
                            url = link.get("href")
                            tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
                            tags.add('embedded')
                            tags.add('tweet')
                            tagged_urls.append(tagging.make_tagged_url(url, url, tags))

            # it might be an embedded javascript object that shows a twitter account or query
            twitter_widget = section.find_all(attrs={"class": "tweet_widget"})
            if twitter_widget:
                if len(twitter_widget) == 1:
                    if twitter_widget[0].find('script'):
                        script_url = twitter_widget[0].find('script').get('src')
                        if twitter_utils.is_twitter_widget_url(script_url):
                            title, url, tags = twitter_utils.get_widget_type(twitter_widget[0].findAll('script')[1].contents[0])
                            tags |= tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
                            tags |= set(['script', 'embedded'])
                            tagged_urls.append(tagging.make_tagged_url(url, title, tags))
                        elif section.find("script"):
                            script_url = section.find('script').get('src')
                            if twitter_utils.is_twitter_widget_url(script_url):
                                title, url, tags = twitter_utils.get_widget_type(section.findAll('script')[1].contents[0])
                                tags |= tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
                                tags |= set(['script', 'embedded'])
                                tagged_urls.append(tagging.make_tagged_url(url, title, tags))
                            else:
                                raise ValueError("Embedded script of unknown type was detected ('{0}'). Update the parser.".format(script_url))
                    elif twitter_widget[0].find('noscript'):
                        noscript = twitter_widget[0].find('noscript')
                        link = noscript.find('a')
                        if link:
                            url = link.get('href')
                            title = remove_text_formatting_markup_from_fragments(link.contents)
                            all_tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
                            all_tags |= set(['script', 'embedded'])
                            tagged_urls.append(tagging.make_tagged_url(url, title, all_tags))
                        else:
                            raise ValueError("No link was found in the <noscript> section. Update the parser.")
                    else:
                        raise ValueError("Could not extract fallback noscript url for this embedded javascript object. Update the parser.")
                else:
                    raise ValueError("There seems to be more than one embedded twitter widget in the SNIPPET, check this.")

            # it might be a spotify container
            spotify_widget = section.find(attrs={"class": "spotify"})
            if spotify_widget:
                if spotify_widget.find("iframe").get("src"):
                    url = spotify_widget.find("iframe").get("src")
                    all_tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
                    all_tags |= set(['spotify', 'embedded'])
                    tagged_urls.append(tagging.make_tagged_url(url, url, all_tags))
                else:
                    raise ValueError("There seems to be a spotify widget but we could not find a link.")
        else:
            raise ValueError("There seems to be an undefined embedded media here, you should check.")
    return tagged_urls
def sanitize_paragraph(paragraph):
    """Removes image links, paragraph markers, and formatting"""
    return remove_text_formatting_markup_from_fragments(paragraph)
def extract_link_and_title(link):
    return link.get("href"), remove_text_formatting_markup_from_fragments(link.contents)