Example #1
    def run(self):
        """
        Fetch homepage URLs, look up content item IDs, and set the number
        of minutes each item has been on the homepage.
        """
        p = self.options.pop('page')
        for link in pageone.get(p, **self.options):
            u = link.get('url')

            # prepare the url: try the cheap, non-canonicalized form first,
            # and fall back to full canonicalization on a lookup miss.
            u = url.prepare(u, canonicalize=False)
            if u and u not in self.url_lookup:
                u = url.prepare(u, canonicalize=True)

            # yield metrics
            if u and u in self.url_lookup:
                cids = self.url_lookup[u]
                for cid in cids:
                    yield {
                        'datetime': dates.now(),
                        'content_item_id': cid,
                        'metrics': {
                            'time_on_homepage': self.recipe.get('minutes', 60)
                        }
                    }
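The two-pass preparation above can be read as a standalone helper: try the cheap form first and pay for canonicalization only on a lookup miss. A minimal sketch, assuming the same `url` module used throughout these examples (`resolve_content_ids` itself is hypothetical):

def resolve_content_ids(raw_url, url_lookup):
    # quick form first; canonicalize only if the quick form misses.
    u = url.prepare(raw_url, canonicalize=False)
    if u and u not in url_lookup:
        u = url.prepare(raw_url, canonicalize=True)
    return url_lookup.get(u, [])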
Example #2
def _prepare_url(o, field, source=None, **kw):
    """
    Prepare a url
    """
    if field not in o:
        return None
    if o[field] is None:
        return None

    if kw.get('canonicalize', False):
        return url.prepare(o[field], source=source, **kw)

    # prepare the url before attempting the cached request.
    u = url.prepare(o[field], source=source, expand=False, canonicalize=False)
    cache_response = url_cache.get(u)

    return cache_response.value
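A hypothetical call, assuming `url_cache.get` always returns a response object exposing `.value` (the absence of a None-check above implies a compute-on-miss cache rather than a plain dict):

record = {'url': 'nyti.ms/1oxYm3e'}
prepared = _prepare_url(record, 'url', source='http://www.nytimes.com')
missing = _prepare_url(record, 'title')  # absent field -> None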
Example #3
    def format(self, obj):
        """
        For now, all of these options are standard to Twitter events.
        """
        # set the status.
        obj['status'] = self.options.get('event_status', 'pending')

        # prepare url (these are formatted as redirects).
        obj['url'] = url.prepare(obj['url'], expand=False, canonicalize=False)

        # ignore bad domains / org's own domains.
        if self._is_bad_domain(obj['url']):
            return

        # extract and merge article data.
        if url.is_article(obj['url']):
            data = article.extract(obj['url'], type=None)
            if data:
                obj.update(data)
                obj.pop('type', None)
                obj.pop('site_name', None)
                obj.pop('favicon', None)

        # set source id:
        _id = obj.pop('id', obj.get('url', gen_uuid()))
        if ":" in _id:
            _id = _id.split(':')[-1]
        obj['source_id'] = _id

        # TODO: Make formatting more elegant.
        if self.options.get('set_event_title', None):
            obj['title'] = self.options.get(
                'set_event_title').format(**self._fmt(obj))

        if self.options.get('set_event_description', None):
            obj['description'] = self.options.get(
                'set_event_description').format(**self._fmt(obj))

        if self.options.get('set_event_tag_ids', None) and \
           len(self.options.get('set_event_tag_ids')):

            obj['tag_ids'] = self.options.get('set_event_tag_ids')

        # hack: the app can't handle this field being a list.
        if self.options.get('set_event_content_items', None):
            if 'content_item_ids' not in obj:
                obj['content_item_ids'] = []
            for c in self.options.get('set_event_content_items', []):
                if isinstance(c, dict):
                    if c.get('id', None):
                        obj['content_item_ids'].append(c.get('id'))
                elif isinstance(c, int):
                    obj['content_item_ids'].append(c)
        # filter links.
        if self.options.get('must_link', False) \
           and not len(obj.get('links', [])):
            return None
        return obj
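The `set_event_title` / `set_event_description` options above are plain str.format templates filled from the event fields. A minimal standalone sketch (the real `_fmt` helper is not shown; this version, which just stringifies values, is an assumption):

def _fmt(obj):
    return {k: str(v) for k, v in obj.items()}

template = '{title} ({status})'
obj = {'title': 'New mention', 'status': 'pending'}
print(template.format(**_fmt(obj)))  # -> New mention (pending)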
Example #4
    def work(self, raw_url):
        """
        Standardize + cache a raw url,
        returning its standardized url + global bitly url.
        """
        # standardize the url
        if url.is_abs(raw_url):
            source = raw_url
        else:
            source = None
        return url.prepare(raw_url, source=source, canonicalize=True, expand=True)
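Given the behaviour exercised by the tests in Example #7 below, a call like this is expected to expand and canonicalize a short link (the `worker` instance is hypothetical):

result = worker.work('http://bit.ly/1kzIQWw')
# expected: 'http://www.fromscratchradio.com/show/marc-dacosta'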
Example #5
    def _gen_lookups(self):
        """
        Create a mapping of url -> content item ids.
        """
        # create containers
        self.url_lookup = defaultdict(list)

        # populate with ALL content items.
        for c in self.api.orgs.simple_content():
            u = c.pop('url', None)
            if u:
                self.url_lookup[url.prepare(u)].append(c['id'])
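A minimal standalone sketch of the lookup being built above, with plain string keys standing in for `url.prepare`'s output (so it runs without the `url` module):

from collections import defaultdict

content_items = [
    {'id': 1, 'url': 'http://example.com/a'},
    {'id': 2, 'url': 'http://example.com/a'},  # same url, second item
    {'id': 3, 'url': 'http://example.com/b'},
]
url_lookup = defaultdict(list)
for c in content_items:
    u = c.pop('url', None)
    if u:
        url_lookup[u].append(c['id'])
# url_lookup: {'http://example.com/a': [1, 2], 'http://example.com/b': [3]}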
Example #6
def prepare_url(o, field, source=None):
    """
    Prepare a url
    """
    if field not in o:
        return None
    if o[field] is None:
        return None
    # prepare the url first, then send it to the canonicalization cache
    u = url.prepare(o[field], source=source, canonicalize=False, expand=False)
    cache_response = url_cache.get(u)
    return cache_response.value
Example #7
    def test_unshorten_url(self):

        cases = [
            ('http://nyti.ms/1oxYm3e',
             'http://www.nytimes.com/video/movies/100000002920951/anatomy-8216the-fault-in-our-stars8217.html'),
            ('nyti.ms/1oxYm3e',
             'http://www.nytimes.com/video/movies/100000002920951/anatomy-8216the-fault-in-our-stars8217.html'),
            ('http://bit.ly/1kzIQWw',
             'http://www.fromscratchradio.com/show/marc-dacosta'),
            ('bit.ly/aaaaaa', 'http://bit.ly/aaaaaa'),
            ('http://ow.ly/i/5OTms', 'http://ow.ly/i/5OTms'),
            # ('http://j.mp/1jBOKo1', 'http://earthfix.info/portables')
        ]
        for c in cases:
            test, truth = c
            try:
                test = url.prepare(test)
                assert(test == truth)
            except AssertionError:
                print "failed on %s" % test
                raise
Example #8
def extract(source_url):
    """
    Article extraction. Method is as follows:
    1. Get html from url.
    2. Canonicalize URL.
    3. If not canonical, prepare the url.
    4. Extract meta tags.
    5. If embedly is active, use it for content extraction.
    6. If embedly doesn't return content or is not active, use readability.
    7. If readability doesn't return content, use the article tag.
    8. If authors aren't detected from meta tags, detect them in the article body.
    """

    # fetch page
    page_html = network.get(source_url)

    # something failed.
    if not page_html:
        log.warning("Failed to extract html from {}".format(source_url))
        return None

    soup = BeautifulSoup(page_html)

    # get canonical url
    canonical_url = meta.canonical_url(soup)
    if not canonical_url:
        canonical_url = url.prepare(
            source_url, source=source_url, canonicalize=False)

    # domain
    domain = url.get_domain(canonical_url)

    # get meta tags + other data
    data = {
        'url': canonical_url,
        'domain': domain,
        'title': meta.title(soup, canonical_url),
        'description': meta.description(soup, canonical_url),
        'img_url': meta.img_url(soup, canonical_url),
        'created': meta.publish_date(soup, canonical_url),
        'favicon': meta.favicon(soup, canonical_url),
        'site_name': meta.site_name(soup, canonical_url),
        'page_type': meta.page_type(soup, canonical_url),
        'authors': author.extract(soup),
        'body': None
    }

    # embed videos
    if url.is_video(canonical_url):
        data['body'] = embed.video(canonical_url)
        return data

    # extract article body
    if settings.EMBEDLY_ENABLED:
        data['body'] = body_via_embedly(canonical_url)
    if not data['body']:
        data['body'] = body_via_readability(page_html, canonical_url)

    # extract body from article tag
    body, raw_html = body_via_article_tag(soup, canonical_url)

    # merge body
    if not data['body']:
        data['body'] = body

    # get creators from raw article html
    if not len(data['authors']) and raw_html:
        data['authors'] = author.extract(raw_html, tags=author.OPTIMISTIC_TAGS)

        # remove site name from authors
        if data.get('site_name'):
            data['authors'] = [
                a.replace(data['site_name'].upper(), "").strip()
                for a in data['authors']
            ]

    # # get links from raw_html + content
    # links = [u for u in url.from_any(data['body']) if source_url not in u]
    # for u in url.from_any(raw_html, source=source_url):
    #     if u not in links and (u != source_url or not u.startswith(source_url)):
    #         links.append(u)

    # # split out internal / external links / article links
    # data['links'] = url.categorize_links(links, domain)

    return data
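The body extraction above is a three-step fallback chain. A condensed sketch, assuming each helper returns a falsy value on failure (which the `if not data['body']` checks imply):

def extract_body(soup, page_html, canonical_url):
    body = None
    if settings.EMBEDLY_ENABLED:
        body = body_via_embedly(canonical_url)                  # first choice
    if not body:
        body = body_via_readability(page_html, canonical_url)   # fallback
    if not body:
        body, _raw = body_via_article_tag(soup, canonical_url)  # last resort
    return body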
Example #9
    def test_prepare_with_redirect_back(self):
        source = 'https://www.revealnews.org/article/a-brief-history-of-the-modern-strawberry/'
        u = '//cdn.embedly.com/widgets/media.html?url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DfPxUIz5GHAE&src=http%3A%2F%2Fwww.youtube.com%2Fembed%2FfPxUIz5GHAE&type=text%2Fhtml&key=1b74e47c9db441f8a998fb6138abca72&schema=youtube'
        out = url.prepare(u, source)
        assert(out == 'http://www.youtube.com/watch?v=fPxUIz5GHAE')
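What this assertion relies on, sketched with the stdlib alone (this is not the library's actual implementation): the protocol-relative embedly widget url carries its real target in the `url=` query parameter.

from urlparse import urlparse, parse_qs  # Python 2, matching these examples

u = ('//cdn.embedly.com/widgets/media.html'
     '?url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DfPxUIz5GHAE')
qs = parse_qs(urlparse('http:' + u).query)
print(qs['url'][0])  # -> http://www.youtube.com/watch?v=fPxUIz5GHAE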