def extract_links_from_text_hxs(hxs):
    tagged_urls = list()

    # in-text urls: take all the <a>, except what might be inside a rendered tweet
    intext_link_hxs = hxs.select(".//a")
    for link_hxs in intext_link_hxs:
        title, url = extract_title_and_url(link_hxs)
        tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        tags.add('in text')
        tagged_urls.append(make_tagged_url(url, title, tags))

    # plaintext urls
    raw_content = hxs.select(".//p/text()").extract()
    if raw_content:
        for paragraph in raw_content:
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(paragraph))
            for url in plaintext_urls:
                tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
                tags.update(['plaintext', 'in text'])
                tagged_urls.append(make_tagged_url(url, url, tags))

    # embedded objects
    iframe_sources = hxs.select(".//iframe/@src").extract()
    for url in iframe_sources:
        tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        tags = tags.union(['in text', 'embedded', 'iframe'])
        tagged_urls.append(make_tagged_url(url, url, tags))

    return tagged_urls

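# Hedged usage sketch, not part of the original module: how this extractor
# might be driven from a raw HTML string with scrapy's old-style
# HtmlXPathSelector. The `html_content` parameter is an illustrative
# assumption; the "entry-body" container is the one this parser uses below.
def _demo_extract_links_from_text_hxs(html_content):
    from scrapy.selector import HtmlXPathSelector
    hxs = HtmlXPathSelector(text=html_content)
    body_hxs = hxs.select("//div [@class='entry-body']")  # assumed article container
    return extract_links_from_text_hxs(body_hxs)
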
def extract_text_content_and_links(soup):
    tagged_urls = list()
    inline_links = []
    text = list()

    article_body = soup.find(attrs={"class": "article-body"})
    text_fragments = article_body.find_all("p")
    other_fragments = article_body.find_all("h2", {"style": "display: inline; font-size: 1em; padding: 0px; margin: 0px;"})
    all_fragments = text_fragments + other_fragments

    if all_fragments:
        # text comes from the <p> fragments only; the inline <h2> fragments
        # are scanned for links in the loop below
        for paragraph in text_fragments:
            text.append(u"".join(remove_text_formatting_markup_from_fragments(paragraph)))
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(paragraph))
            for url in plaintext_urls:
                tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
                tags.update(['plaintext', 'in text'])
                tagged_urls.append(tagging.make_tagged_url(url, url, tags))
    else:
        text = u""

    for p in all_fragments:
        link = p.find_all("a")
        inline_links.extend(link)

    titles_and_urls = [extract_title_and_url_from_bslink(i) for i in inline_links]
    for title, url, base_tags in titles_and_urls:
        tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
        tags.update(base_tags)
        tags.add('in text')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    return text, tagged_urls

def extract_links_from_sidebar_box(soup):
    tagged_urls = list()
    sidebar_box = soup.find(attrs={"class": "teas_article_306 mar10 clear clearfix relatedcomponents"})

    # there are links to articles
    if sidebar_box:
        articles = sidebar_box.find_all(attrs={"class": "clearfix"})
        links = articles[0].find_all("a")
        titles_and_urls = [extract_title_and_url_from_bslink(link) for link in links]
        for title, url, base_tags in titles_and_urls:
            tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
            tags.update(base_tags)
            tags.add('sidebar box')
            tagged_urls.append(tagging.make_tagged_url(url, title, tags))

        # and also links to thematic tags
        keyword_boxes = sidebar_box.find_all(attrs={"class": "bt_meer_over clearfix"})
        for keyword_box in keyword_boxes:
            links = keyword_box.find_all("a")
            titles_and_urls = [extract_title_and_url_from_bslink(link) for link in links]
            for title, url, base_tags in titles_and_urls:
                tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
                tags.update(base_tags)
                tags.add('keyword')
                tags.add('sidebar box')
                tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    return tagged_urls

def extract_intro(soup):
    intro_box = soup.find(attrs={"class": "intro"})
    tagged_urls = []
    if intro_box:
        intro_fragments = intro_box.find_all('b')
        intro = utils.remove_text_formatting_markup_from_fragments(intro_fragments)
        inline_links = intro_box.find_all("a")
        titles_and_urls = [extract_title_and_url_from_bslink(i) for i in inline_links]
        plaintext_urls = utils.extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(intro))

        for title, url, base_tags in titles_and_urls:
            tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
            tags.update(base_tags)
            tags.add('in intro')
            tagged_urls.append(tagging.make_tagged_url(url, title, tags))

        for url in plaintext_urls:
            tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
            tags.add('in intro')
            tags.add('plaintext')
            tagged_urls.append(tagging.make_tagged_url(url, url, tags))
    else:
        intro = ""
    return intro, tagged_urls

def extract_links_from_embedded_content(story):
    tagged_urls = []

    # generic iframes
    iframe_items = story.findAll("iframe", recursive=True)
    for iframe in iframe_items:
        url = iframe.get('src')
        all_tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
        tagged_urls.append(make_tagged_url(url, url, all_tags | set(['embedded', 'iframe'])))

    # extract embedded storify
    scripts = story.findAll('script', recursive=True)
    for script in scripts:
        url = script.get('src')
        if url:
            scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
            if netloc == "storify.com":
                # strip the literal ".js" suffix (rstrip(".js") would eat any
                # trailing '.', 'j' or 's' characters instead)
                if url.endswith(".js"):
                    url = url[:-len(".js")]
                all_tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
                tagged_urls.append(make_tagged_url(url, url, all_tags | set(['embedded', 'storify'])))

    # reconstruct the kplayer URL
    kplayer = story.find('div', {'class': 'containerKplayer'})
    if kplayer:
        kplayer_flash = kplayer.find('div', {'class': 'flash_kplayer'})
        url_part1 = kplayer_flash.object['data']
        url_part2 = kplayer_flash.object.find('param', {'name': 'flashVars'})['value']
        if url_part1 is not None and url_part2 is not None:
            url = "%s?%s" % (url_part1, url_part2)
            all_tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
            tagged_urls.append(make_tagged_url(url, url, all_tags | set(['embedded', 'kplayer'])))
        else:
            raise ValueError("We couldn't find a URL in the flash player. Update the parser.")

    return tagged_urls

def extract_tagged_url_from_embedded_script(script, site_netloc, site_internal_sites):
    if script.get('src'):
        script_url = script.get('src')
        if twitter_utils.is_twitter_widget_url(script_url):
            if script.contents:
                title, url, tags = twitter_utils.get_widget_type(script.contents[0])
            else:
                # sometimes the TWTR.Widget code is in the next <script> container. Whee.
                sibling_script = script.findNextSibling('script')
                title, url, tags = twitter_utils.get_widget_type(sibling_script.contents[0])
            tags |= classify_and_tag(url, site_netloc, site_internal_sites)
            tags |= set(['script', 'embedded'])
            return make_tagged_url(url, title, tags)
        else:
            if script.findNextSibling('noscript'):
                noscript = script.findNextSibling('noscript')
                link = noscript.find('a')
                if link:
                    url = link.get('href')
                    title = remove_text_formatting_markup_from_fragments(link.contents)
                    all_tags = classify_and_tag(url, site_netloc, site_internal_sites)
                    all_tags |= set(['script', 'embedded'])
                    return make_tagged_url(url, title, all_tags)
                else:
                    raise ValueError("No link was found in the <noscript> section. Update the parser.")
            else:
                raise ValueError("Could not extract fallback noscript url for this embedded javascript object. Update the parser.")
    else:
        raise ValueError("Embedded script of unknown type was detected. Update the parser.")

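# Hedged usage sketch (assumption, not original code): drive the
# embedded-script extractor over every <script> in an article body parsed
# with BeautifulSoup, skipping the unknown embed types it refuses.
def _demo_extract_all_embedded_scripts(article_body, netloc, internal_sites):
    tagged_urls = []
    for script in article_body.findAll('script', recursive=True):
        try:
            tagged_urls.append(extract_tagged_url_from_embedded_script(script, netloc, internal_sites))
        except ValueError:
            continue  # unknown embed type; a real parser might log this instead
    return tagged_urls
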
def extract_associated_links(hxs):
    links = hxs.select("//div[@id='picture']/descendant::div[@class='bloc-01']//a")
    all_tagged_urls = []

    if links:
        def extract_url_and_title(link_hxs):
            url = link_hxs.select('@href').extract()[0]
            title = u"".join(link_hxs.select("text()").extract())
            tags = set()
            if not title:
                title = u'No Title'
                tags.add(constants.GHOST_LINK_TAG)
            if not url:
                url = u''
                tags.add('no target')
            return url, title, tags

        for item in links:
            url, title, tags = extract_url_and_title(item)
            tags.update(classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES))
            link_type = item.select('@class').extract()
            if link_type and link_type[0] in LINK_TYPE_TO_TAG:
                tags.update(LINK_TYPE_TO_TAG[link_type[0]])
            tags.add("sidebar box")
            all_tagged_urls.append(make_tagged_url(url, title, tags))

    media_links = hxs.select("//div[@id='picture']/descendant::div[@class='wrappAllMedia']/div")
    for i, item in enumerate(media_links):
        if item.select('./img'):
            pass  # images are lame
        elif item.select(".//div[starts-with(@id, 'media-youtube')]"):
            youtube_div = item.select(".//div[starts-with(@id, 'media-youtube')]")
            youtube_object = youtube_div.select("./object")
            url = hxs_media_utils.extract_url_from_youtube_object(youtube_object)
            tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
            tags |= set(['youtube', 'embedded', 'video'])
            title = parser_constants.NO_TITLE
            all_tagged_urls.append(make_tagged_url(url, title, tags))
        elif item.select(".//div[contains(@class, 'emvideo-kewego')]"):
            kplayer_div = item.select(".//div[contains(@class, 'emvideo-kewego')]")
            kplayer_object = kplayer_div.select("./object")
            url = hxs_media_utils.extract_url_from_kplayer_object(kplayer_object)
            tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
            tags |= set(['kewego', 'embedded', 'video'])
            title = parser_constants.NO_TITLE
            all_tagged_urls.append(make_tagged_url(url, title, tags))
        elif not item.select("./div/text()"):
            pass  # empty divs are lame
        else:
            raise ValueError("The media box contains something other than an image or a youtube video. Update your parser")

    return all_tagged_urls

def extract_associated_links(article):
    links_block = article.find('div', {'class': 'bloc-01'})

    if links_block:
        link_list = links_block.find('ul')

        def extract_url_and_title(item):
            url = item.a.get('href')
            title = remove_text_formatting_markup_from_fragments(item.a.contents)
            tags = set()
            if not title:
                title = u'No Title'
                tags.add('ghost link')
            return url, title, tags

        all_tagged_urls = list()
        for item in link_list.findAll('li'):
            url, title, tags = extract_url_and_title(item)
            tags.update(classify_and_tag(url, SUDPRESSE_OWN_NETLOC, SUDPRESSE_INTERNAL_SITES))
            link_type = item.get('class')
            if link_type in LINK_TYPE_TO_TAG:
                tags.update(LINK_TYPE_TO_TAG[link_type])
            tags.add("sidebar box")
            all_tagged_urls.append(make_tagged_url(url, title, tags))
        return all_tagged_urls
    else:
        return []

def extract_content_and_links(hxs):
    content_paragraphs_hxs = hxs.select("//div [@id='article']/p[starts-with(@class, 'publiele')]/following-sibling::p")
    all_content_paragraphs, all_tagged_urls = list(), list()

    # process paragraphs
    for p in content_paragraphs_hxs:
        text, tagged_urls = extract_text_and_links_from_paragraph(p)
        all_content_paragraphs.append(text)
        all_tagged_urls.extend(tagged_urls)

    # extract embedded videos
    divs = hxs.select("//div [@id='article']/p[starts-with(@class, 'publiele')]/following-sibling::div/div [@class='bottomVideos']")
    for div in divs:
        urls = div.select("./div [contains(@class, 'emvideo-kewego')]//video/@poster").extract()
        for url in urls:
            tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
            tags.update(['bottom', 'video', 'embedded'])
            all_tagged_urls.append(make_tagged_url(url, url, tags))

    new_media_items = hxs.select("//div [@class='digital-wally_digitalobject']//li")
    all_tagged_urls.extend(extract_links_from_media_items(new_media_items))

    return all_content_paragraphs, all_tagged_urls

def extract_embedded_media(soup):
    tagged_urls = list()

    # extract embedded media from any iframe in the article body
    content_box = soup.find(attrs={"id": "detail_content"})
    text = content_box.find_all(attrs={"class": "clear"})
    for fragment in text:
        for p in fragment.find_all("p", recursive=False):
            embedded_container = p.findAll("iframe")
            for x in embedded_container:
                url = x.get("src")
                tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
                tags.add('embedded')
                tags.add('iframe')
                tags.add('in text')
                tagged_urls.append(tagging.make_tagged_url(url, url, tags))

    # some embedded media are not in the article body, but embedded in the art_aside container
    art_aside = soup.find_all(attrs={"class": "art_aside"})
    if art_aside:
        for section in art_aside:
            tagged_urls.extend(find_embedded_media_in_multimedia_box(section))

    # same, but in the art_bottom container
    art_bottom = soup.find_all(attrs={"class": "art_bottom"})
    if art_bottom:
        for section in art_bottom:
            tagged_urls.extend(find_embedded_media_in_multimedia_box(section))

    return tagged_urls

def extract_embedded_audio_links(main_content, netloc, associated_sites):
    strong_article_links = main_content.find("div", {"id": "strongArticleLinks"})
    if not strong_article_links:
        return []

    embedded_audio_link_list = strong_article_links.find("ul", {"id": "audioContents"})
    if not embedded_audio_link_list:
        return []

    tagged_urls = []
    for item in embedded_audio_link_list.findAll("li", recursive=False):
        if item.object:
            flash_obj = item.object
            data_url = flash_obj.get("data")
            if data_url:
                source_url = media_utils.extract_source_url_from_dewplayer(data_url)
                title = item.text
                tags = classify_and_tag(source_url, netloc, associated_sites)
                tags |= set(["sidebar box", "audio", "embedded"])
                tagged_url = make_tagged_url(source_url, title, tags)
                tagged_urls.append(tagged_url)
            else:
                raise ValueError("Could not find the source url for the flash object. Fix your parser.")
        else:
            raise ValueError("Could not find the flash object for embedded audio. Fix your parser.")

    return tagged_urls

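# Hedged sketch of what media_utils.extract_source_url_from_dewplayer is
# assumed to do: dewplayer embeds usually carry the audio target in an mp3=
# query parameter (e.g. ".../dewplayer.swf?mp3=http://host/file.mp3"). The
# real helper lives in media_utils; this stand-in is illustrative only.
def _extract_source_url_from_dewplayer_sketch(dewplayer_url):
    import urlparse
    query = urlparse.urlparse(dewplayer_url).query
    return urlparse.parse_qs(query)['mp3'][0]
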
def extract_text_and_links_from_paragraph(paragraph_hxs):
    def separate_img_and_text_links(links):
        img_links = [l for l in links if l.select("./img")]
        text_links = [l for l in links if l not in img_links]
        return [extract_title_and_url(link) for link in text_links], [extract_img_link_info(link) for link in img_links]

    links = paragraph_hxs.select(".//a")
    titles_and_urls, img_targets_and_urls = separate_img_and_text_links(links)

    tagged_urls = list()
    for title, url in titles_and_urls:
        tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
        tags.update(['in text'])
        if title == constants.GHOST_LINK_TITLE:
            tags.update([constants.GHOST_LINK_TAG])
        tagged_urls.append(make_tagged_url(url, title, tags))

    for img_target, url in img_targets_and_urls:
        tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
        tags.update(['in text', 'embedded image'])
        tagged_urls.append(make_tagged_url(url, img_target, tags))

    # plaintext urls
    text_fragments = paragraph_hxs.select("./text()").extract()
    if text_fragments:
        text = u"".join(remove_text_formatting_markup_from_fragments(text_fragments))
        for paragraph in text_fragments:
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(paragraph))
            for url in plaintext_urls:
                tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
                tags.update(['plaintext', 'in text'])
                tagged_urls.append(make_tagged_url(url, url, tags))
    else:
        text = u""

    # iframes
    iframes = paragraph_hxs.select(".//iframe")
    for iframe in iframes:
        target_url, tags = extract_and_tag_iframe_source(iframe)
        tagged_urls.append(make_tagged_url(target_url, "__EMBEDDED_IFRAME__", tags))

    return text, tagged_urls

def extract_links_from_embbeded_media(content_hxs):
    body_hxs = content_hxs.select(".//div [@class='entry-body']")
    tagged_urls = []
    for script_hxs in body_hxs.select('./script'):
        snippet = script_hxs.select('./text()').extract()
        if len(snippet) > 0 and media_utils.ignore_snippet(snippet[0]):
            continue
        script_src = script_hxs.select("./@src").extract()
        if not script_src:
            raise ValueError("Found a <script> with no src attr.")

        if script_src[0].startswith("//platform.twitter.com/widgets.js"):
            previous_blockquote = script_hxs.select("./preceding-sibling::blockquote[1]")
            if previous_blockquote:
                # the class attribute may hold several names (e.g. alignment
                # classes), so look for 'twitter-tweet' inside it
                blockquote_classes = previous_blockquote[0].select("./@class").extract()
                if blockquote_classes and 'twitter-tweet' in blockquote_classes[0]:
                    url = previous_blockquote.select('./a[last()]/@href').extract()[0]
                    tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
                    title = u"[RENDERED TWEET]"
                    tags |= set(['embedded', 'tweet'])
                    tagged_urls.append(make_tagged_url(url, title, tags))
                else:
                    raise ValueError("This blockquote does not appear to be a tweet.")
            else:
                raise ValueError("Found a twitter widget <script> without its companion blockquote.")
        elif script_src[0].startswith("http://storify.com"):
            url = script_src[0]
            title = constants.RENDERED_STORIFY_TITLE
            tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
            tags |= set(['embedded', 'storify'])
            tagged_urls.append(make_tagged_url(url, title, tags))
        else:
            noscript_hxs = script_hxs.select('./following-sibling::noscript[1]')
            if noscript_hxs:
                link_hxs = noscript_hxs.select('a')
                title, url = extract_title_and_url(link_hxs)
                tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
                title = constants.RENDERED_TWEET_TITLE
                tags |= set(['embedded'])
                tagged_urls.append(make_tagged_url(url, title, tags))
            else:
                raise ValueError("Found an embedded <script> with no fallback <noscript> link. Update the parser.")
    return tagged_urls

def extract_kplayer_infos(kplayer_flash, title, site_netloc, site_internal_sites):
    url_part1 = kplayer_flash.object["data"]
    url_part2 = kplayer_flash.object.find("param", {"name": "flashVars"})["value"]
    if url_part1 is not None and url_part2 is not None:
        url = "%s?%s" % (url_part1, url_part2)
        all_tags = classify_and_tag(url, site_netloc, site_internal_sites)
        return make_tagged_url(url, title, all_tags | set(["video", "embedded", "kplayer"]))
    else:
        raise ValueError("We couldn't find a URL in the flash player. Update the parser.")

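# Worked example (made-up values) of the URL reconstruction performed above:
# the swf location from <object data="..."> plus its flashVars query string.
def _demo_kplayer_url():
    url_part1 = "http://sll.kewego.com/swf/p3/epix.swf"  # object['data'] (illustrative)
    url_part2 = "language_code=fr&playerKey=abc123"      # flashVars value (illustrative)
    return "%s?%s" % (url_part1, url_part2)
    # -> "http://sll.kewego.com/swf/p3/epix.swf?language_code=fr&playerKey=abc123"
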
def extract_and_tag_url_from_iframe(item):
    embedded_frame = item.select(".//iframe")
    if embedded_frame:
        target_url, tags = extract_and_tag_iframe_source(embedded_frame)
        tags |= classify_and_tag(target_url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
        # same placeholder title used for in-text iframes elsewhere in this parser
        return make_tagged_url(target_url, "__EMBEDDED_IFRAME__", tags)
    else:
        return None

def extract_tagged_url_from_associated_link(link_list_item, netloc, associated_sites, additional_tags=[]):
    # sometimes list items are used to show things which aren't links
    # but more like unclickable ads
    url = link_list_item.a.get("href")
    title = remove_text_formatting_markup_from_fragments(link_list_item.a.contents).strip()
    tags = classify_and_tag(url, netloc, associated_sites)
    tags |= set(additional_tags)
    tagged_url = make_tagged_url(url, title, tags)
    return tagged_url

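# Hedged usage sketch (not original code): tag every <li> of an
# associated-links <ul>, skipping the unclickable, link-less items the comment
# above warns about. The 'bottom box' tag is an illustrative choice.
def _demo_tag_associated_link_list(link_list, netloc, associated_sites):
    tagged_urls = []
    for item in link_list.findAll('li'):
        if item.a:  # guard: extract_tagged_url_from_associated_link assumes item.a exists
            tagged_urls.append(extract_tagged_url_from_associated_link(
                item, netloc, associated_sites, additional_tags=['bottom box']))
    return tagged_urls
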
def extract_embedded_links_from_articlebody(article_body):
    embedded_links = list()

    for link in extract_usable_links(article_body):
        url = link.get('href')
        title = remove_text_formatting_markup_from_fragments(link.contents)
        tags = classify_and_tag(url, RTLINFO_OWN_NETLOC, RTLINFO_INTERNAL_SITES)
        tags.add('in text')
        embedded_links.append(make_tagged_url(url, title, tags))

    for embedded_video_frame in article_body.findAll('iframe'):
        url = embedded_video_frame.get('src')
        title = '[Video] {0}'.format(url)
        tags = classify_and_tag(url, RTLINFO_OWN_NETLOC, RTLINFO_INTERNAL_SITES)
        tags = tags.union(['in text', 'embedded'])
        embedded_links.append(make_tagged_url(url, title, tags))

    return embedded_links

def extract_links_from_intro(fragment):
    tagged_urls = list()
    inline_links = fragment.find_all('a')
    titles_and_urls = [extract_title_and_url_from_bslink(i) for i in inline_links]
    plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(fragment))

    for title, url, base_tags in titles_and_urls:
        tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
        tags.update(base_tags)
        tags.add('in intro')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    for url in plaintext_urls:
        tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
        tags.add('in intro')
        tags.add('plaintext')
        tagged_urls.append(tagging.make_tagged_url(url, url, tags))

    return tagged_urls

def extract_links_from_tags(hxs):
    tag_navbar_hxs = hxs.select("//nav [@class='entry-tags']")
    tagged_urls = list()
    for link_hxs in tag_navbar_hxs.select("./ul/li/a"):
        title, url = extract_title_and_url(link_hxs)
        tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        tags |= set(['keyword'])
        tagged_urls.append(make_tagged_url(url, title, tags))
    return tagged_urls

def extract_related_links(hxs):
    aside_hxs = hxs.select("//div//aside [@class='entry-related']")
    tagged_urls = list()
    related_link_hxs = aside_hxs.select(".//ul/li//a")
    for link_hxs in related_link_hxs:
        title, url = extract_title_and_url(link_hxs)
        tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        tags |= set(['bottom box', 'related'])
        tagged_urls.append(make_tagged_url(url, title, tags))
    return tagged_urls

def select_title_and_url(selector, tag_name):
    url = selector.select("./@href").extract()[0]
    title = selector.select(".//text()").extract()
    if title:
        title = remove_text_formatting_markup_from_fragments(title[0])
        tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        tags = tags.union([tag_name])
    else:
        tags = set([tag_name, constants.GHOST_LINK_TAG])
        title = constants.GHOST_LINK_TITLE
    return make_tagged_url(url, title, tags)

def classify_and_make_tagged_url(urls_and_titles, additional_tags=set()):
    """
    Classify (with tags) every element in a list of (url, title) tuples
    Returns a list of TaggedURLs
    """
    tagged_urls = []
    for url, title in urls_and_titles:
        tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
        if is_on_same_domain(url):
            tags.update(['internal site'])
        tagged_urls.append(make_tagged_url(url, title, tags | additional_tags))
    return tagged_urls

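# Example call with made-up values: classify two scraped links and stamp both
# with a shared provenance tag. (Assumes the LESOIR constants are in scope.)
def _demo_classify_and_make_tagged_url():
    urls_and_titles = [
        (u'http://www.lesoir.be/sports/some-article', u'Some article'),  # illustrative
        (u'http://example.com/elsewhere', u'External link'),             # illustrative
    ]
    return classify_and_make_tagged_url(urls_and_titles, additional_tags=set(['sidebar box']))
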
def extract_embedded_media_in_article(soup):
    tagged_urls = list()
    story = soup.find(attrs={'class': 'article-body'})
    scripts = story.findAll('script', recursive=True)
    for script in scripts:
        url = script.get('src')
        if url:
            scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
            if netloc == "storify.com":
                # strip the literal ".js" suffix (rstrip strips characters, not a suffix)
                if url.endswith(".js"):
                    url = url[:-len(".js")]
                all_tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
                tagged_urls.append(tagging.make_tagged_url(url, url, all_tags | set(['embedded', 'storify'])))
    return tagged_urls

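# Why the explicit suffix slice above: str.rstrip(".js") strips any run of the
# characters '.', 'j', 's' from the right, not the literal ".js" suffix. A
# small self-checking demonstration (URLs are made up):
def _demo_rstrip_pitfall():
    url = "http://storify.com/user/paris.js"
    assert url.rstrip(".js") == "http://storify.com/user/pari"    # over-strips
    assert url[:-len(".js")] == "http://storify.com/user/paris"   # intended
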
def extract_links_from_sidebar_box(soup):
    tagged_urls = list()
    sidebar_boxes = soup.find_all('div', {'class': 'box alt'})
    if sidebar_boxes:
        for sidebar_box in sidebar_boxes:
            links = sidebar_box.find_all('a')
            titles_and_urls = [extract_title_and_url_from_bslink(link) for link in links]
            for title, url, base_tags in titles_and_urls:
                tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
                tags.update(base_tags)
                tags.add('sidebar box')
                tagged_urls.append(tagging.make_tagged_url(url, title, tags))
    return tagged_urls

def extract_text_and_links_from_paragraph(paragraph):
    def extract_url_and_title(link):
        if isinstance(link.contents[0], bs.Tag) and link.contents[0].name == 'img':
            img_target = link.contents[0].get('src')
            return link.get('href'), '(img){0}'.format(img_target)
        else:
            return link.get('href'), remove_text_formatting_markup_from_fragments(link.contents)

    # Why do we filter on link.contents? Because sometimes there
    # are <a id="more"></a> links which point to nothing.
    # Awesome.
    urls_and_titles = [extract_url_and_title(link) for link in paragraph.findAll('a', recursive=False) if link.contents]

    tagged_urls = list()
    for url, title in urls_and_titles:
        tags = classify_and_tag(url, SUDPRESSE_OWN_NETLOC, SUDPRESSE_INTERNAL_SITES)
        tags.update(['in text'])
        tagged_urls.append(make_tagged_url(url, title, tags))

    text_fragments = paragraph.contents
    if text_fragments:
        text = u"".join(remove_text_formatting_markup_from_fragments(text_fragments))
        plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(text_fragments))
        for url in plaintext_urls:
            tags = classify_and_tag(url, SUDPRESSE_OWN_NETLOC, SUDPRESSE_INTERNAL_SITES)
            tags.update(['plaintext', 'in text'])
            tagged_urls.append(make_tagged_url(url, url, tags))
    else:
        text = u""

    return text, tagged_urls

def extract_embedded_media(article):
    tagged_urls = list()

    # extract any iframe from maincontent
    iframes = article.findAll("iframe")
    for media in iframes:
        url = media.get('src')
        tags = classify_and_tag(url, SUDPRESSE_OWN_NETLOC, SUDPRESSE_INTERNAL_SITES)
        tags.add('embedded')
        tags.add('iframe')
        tagged_url = make_tagged_url(url, url, tags)
        tagged_urls.append(tagged_url)

    return tagged_urls

def extract_article_tags(soup):
    tagged_urls = list()
    meta_box = soup.find(attrs={"class": "meta"})
    tag_box = meta_box.find(attrs={'class': 'tags'})
    if tag_box:
        links = tag_box.find_all("a")
        titles_and_urls = [extract_title_and_url_from_bslink(link) for link in links]
        for title, url, base_tags in titles_and_urls:
            tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
            tags.update(base_tags)
            tags.add('keyword')
            tagged_urls.append(tagging.make_tagged_url(url, title, tags))
    return tagged_urls

def classify_and_make_tagged_url(urls_and_titles, additional_tags=set()):
    """
    Classify (with tags) every element in a list of (url, title) tuples
    Returns a list of TaggedURLs
    """
    tagged_urls = []
    for url, title in urls_and_titles:
        tags = classify_and_tag(url, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        if is_on_same_domain(url):
            tags = tags.union(["internal site", "internal"])
        all_tags = tags.union(additional_tags)
        tagged_urls.append(make_tagged_url(url, title, all_tags))
    return tagged_urls

def extract_text_content_and_links_from_articletext(main_content, has_intro=True):
    article_text = main_content

    in_text_tagged_urls = []
    all_cleaned_paragraphs = []
    all_rough_paragraphs = []
    all_plaintext_urls = []
    embedded_tweets = []

    def is_text_content(blob):
        if isinstance(blob, bs.Tag) and blob.name in TEXT_MARKUP_TAGS:
            return True
        if isinstance(blob, bs.NavigableString):
            return True
        return False

    text_fragments = [c for c in article_text.contents if is_text_content(c)]

    if text_fragments:
        # we first need to avoid treating embedded tweets as text
        for paragraph in text_fragments:
            if isinstance(paragraph, bs.NavigableString):
                all_cleaned_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph))
                all_rough_paragraphs.append(paragraph)
            else:
                if not paragraph.find("blockquote", {"class": "twitter-tweet"}):
                    in_text_links = extract_and_tag_in_text_links(paragraph)
                    in_text_tagged_urls.extend(in_text_links)
                    all_cleaned_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph))
                    all_rough_paragraphs.append(paragraph)
                else:
                    embedded_tweets.extend(
                        twitter_utils.extract_rendered_tweet(paragraph, DHNET_NETLOC, DHNET_INTERNAL_SITES))

        # extracting plaintext links
        for paragraph in all_rough_paragraphs:
            plaintext_urls = extract_plaintext_urls_from_text(
                remove_text_formatting_and_links_from_fragments(paragraph))
            for url in plaintext_urls:
                tags = classify_and_tag(url, DHNET_NETLOC, DHNET_INTERNAL_SITES)
                tags.update(["plaintext", "in text"])
                all_plaintext_urls.append(make_tagged_url(url, url, tags))
    else:
        all_cleaned_paragraphs = []

    return all_cleaned_paragraphs, in_text_tagged_urls + all_plaintext_urls + embedded_tweets

def extract_text_content(story):
    """
    Finds the story's body, cleans up the text to remove all html formatting.
    Returns a list of strings, one per found paragraph, and all the
    plaintext urls, as TaggedURLs
    """
    story = story.find('div', {'id': 'story_body'})
    paragraphs = story.findAll('p', recursive=False)
    tagged_urls = list()

    # extract regular, in text links
    inline_links = list()
    text = list()
    if paragraphs:
        for paragraph in paragraphs:
            text.append(u"".join(remove_text_formatting_markup_from_fragments(paragraph)))
            links = paragraph.findAll('a', recursive=True)
            inline_links.extend(links)
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(paragraph))
            for url in plaintext_urls:
                tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
                tags.update(['plaintext', 'in text'])
                tagged_urls.append(make_tagged_url(url, url, tags))
    else:
        text = u""

    titles_and_urls = [extract_title_and_url_from_bslink(i) for i in inline_links if not i.find('img')]
    for title, url, base_tags in titles_and_urls:
        tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
        tags.update(base_tags)
        tags.add('in text')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    return text, tagged_urls