def run(self):
    """
    Fetch homepage URLs, look up content item IDs, and set the number
    of minutes each item has been on the homepage.
    """
    p = self.options.pop('page')
    for link in pageone.get(p, **self.options):
        u = link.get('url')

        # smartly handle urls
        u = url.prepare(u, canonicalize=False)
        if u and u not in self.url_lookup:
            u = url.prepare(u, canonicalize=True)

        # yield metrics
        if u and u in self.url_lookup:
            cids = self.url_lookup[u]
            for cid in cids:
                yield {
                    'datetime': dates.now(),
                    'content_item_id': cid,
                    'metrics': {
                        'time_on_homepage': self.recipe.get('minutes', 60)
                    }
                }
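# Hedged sketch of a single record yielded by ``run`` above: the keys come
# straight from the generator; the concrete values are illustrative only.
example_metric = {
    'datetime': '2015-06-01T12:00:00Z',   # dates.now() in the real code
    'content_item_id': 123,               # one id from self.url_lookup[u]
    'metrics': {
        'time_on_homepage': 60,           # recipe default when 'minutes' is unset
    },
}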
def _prepare_url(o, field, source=None, **kw):
    """
    Prepare a url.
    """
    if field not in o:
        return None
    if o[field] is None:
        return None
    if kw.get('canonicalize', False):
        return url.prepare(o[field], source=source, **kw)

    # prepare urls before attempting a cached request.
    u = url.prepare(o[field], source=source, expand=False, canonicalize=False)
    cache_response = url_cache.get(u)
    return cache_response.value
def format(self, obj):
    """
    For now all of these options are standard to twitter events.
    """
    # set the status.
    obj['status'] = self.options.get('event_status', 'pending')

    # prepare the url (these are formatted as redirects).
    obj['url'] = url.prepare(obj['url'], expand=False, canonicalize=False)

    # ignore bad domains / the org's own domains.
    if self._is_bad_domain(obj['url']):
        return

    # extract and merge article data.
    if url.is_article(obj['url']):
        data = article.extract(obj['url'], type=None)
        if data:
            obj.update(data)
            obj.pop('type', None)
            obj.pop('site_name', None)
            obj.pop('favicon', None)

    # set the source id.
    _id = obj.pop('id', obj.get('url', gen_uuid()))
    if ":" in _id:
        _id = _id.split(':')[-1]
    obj['source_id'] = _id

    # TODO: Make formatting more elegant.
    if self.options.get('set_event_title', None):
        obj['title'] = self.options.get(
            'set_event_title').format(**self._fmt(obj))

    if self.options.get('set_event_description', None):
        obj['description'] = self.options.get(
            'set_event_description').format(**self._fmt(obj))

    if self.options.get('set_event_tag_ids', None) and \
            len(self.options.get('set_event_tag_ids')):
        obj['tag_ids'] = self.options.get('set_event_tag_ids')

    # hack because the app can't handle this field being a list.
    if self.options.get('set_event_content_items', None):
        if 'content_item_ids' not in obj:
            obj['content_item_ids'] = []
        for c in self.options.get('set_event_content_items', []):
            if isinstance(c, dict):
                if c.get('id', None):
                    obj['content_item_ids'].append(c.get('id'))
            elif isinstance(c, int):
                obj['content_item_ids'].append(c)

    # filter out events that don't link to anything.
    if self.options.get('must_link', False) \
            and not len(obj.get('links', [])):
        return None

    return obj
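# Hedged sketch of a recipe ``options`` dict that exercises every branch of
# ``format`` above. The option names are taken from the method itself; the
# values are invented for illustration.
options = {
    'event_status': 'pending',
    'set_event_title': '{title}',                 # templated via self._fmt(obj)
    'set_event_description': '{description}',
    'set_event_tag_ids': [1, 2],
    'set_event_content_items': [{'id': 5}, 6],    # dicts with 'id' or bare ints
    'must_link': True,                            # drop events with no links
}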
def work(self, raw_url):
    """
    Standardize + cache a raw url, returning its standardized
    url + global bitly url.
    """
    # standardize the url
    if url.is_abs(raw_url):
        source = raw_url
    else:
        source = None
    return url.prepare(raw_url, source=source, canonicalize=True, expand=True)
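# Minimal, self-contained sketch of the standardize-then-prepare pattern in
# ``work`` above. ``prepare_stub`` is a stand-in for url.prepare (an
# assumption, not the real call); the real function also expands shortlinks,
# as exercised by ``test_unshorten_url`` below.
def prepare_stub(raw_url, source=None, canonicalize=True, expand=True):
    # the real url.prepare resolves redirects + canonicalizes here.
    return raw_url if raw_url.startswith('http') else 'http://' + raw_url

def work_sketch(raw_url):
    # absolute urls can act as their own source; relative ones cannot.
    source = raw_url if raw_url.startswith('http') else None
    return prepare_stub(raw_url, source=source, canonicalize=True, expand=True)

print work_sketch('nyti.ms/1oxYm3e')   # -> 'http://nyti.ms/1oxYm3e' from the stub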
def _gen_lookups(self):
    """
    Create a tree of url > content item ids.
    """
    # create containers
    self.url_lookup = defaultdict(list)

    # populate with ALL content items.
    for c in self.api.orgs.simple_content():
        u = c.pop('url', None)
        if u:
            self.url_lookup[url.prepare(u)].append(c['id'])
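# Self-contained sketch of the lookup-building pattern in ``_gen_lookups``
# above; ``items`` stands in for self.api.orgs.simple_content() and the raw
# urls for their prepared forms (both assumptions, for illustration only).
from collections import defaultdict

items = [
    {'id': 1, 'url': 'http://example.com/story'},
    {'id': 2, 'url': 'http://example.com/story'},   # same url, second id
    {'id': 3, 'url': 'http://example.com/other'},
]
url_lookup = defaultdict(list)
for c in items:
    u = c.pop('url', None)
    if u:
        url_lookup[u].append(c['id'])
# url_lookup == {'http://example.com/story': [1, 2],
#                'http://example.com/other': [3]}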
def prepare_url(o, field, source=None):
    """
    Prepare a url.
    """
    if field not in o:
        return None
    if o[field] is None:
        return None

    # prepare the url before sending it to the canonicalization cache.
    u = url.prepare(o[field], source=source, canonicalize=False, expand=False)
    cache_response = url_cache.get(u)
    return cache_response.value
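# Self-contained sketch of the cache-keyed canonicalization pattern used by
# ``prepare_url`` above: normalize cheaply first so equivalent raw urls map
# to one cache key, then let the cache hold the expensive canonical result.
# Both helpers below are stand-ins, not the real url / url_cache APIs.
_cache = {}

def cheap_prepare(u):
    # stands in for url.prepare(..., canonicalize=False, expand=False)
    return u.strip().lower()

def cached_canonicalize(u):
    key = cheap_prepare(u)
    if key not in _cache:
        _cache[key] = key.rstrip('/')   # stands in for full canonicalization
    return _cache[key]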
def test_unshorten_url(self):
    cases = [
        ('http://nyti.ms/1oxYm3e',
         'http://www.nytimes.com/video/movies/100000002920951/anatomy-8216the-fault-in-our-stars8217.html'),
        ('nyti.ms/1oxYm3e',
         'http://www.nytimes.com/video/movies/100000002920951/anatomy-8216the-fault-in-our-stars8217.html'),
        ('http://bit.ly/1kzIQWw',
         'http://www.fromscratchradio.com/show/marc-dacosta'),
        ('bit.ly/aaaaaa', 'http://bit.ly/aaaaaa'),
        ('http://ow.ly/i/5OTms', 'http://ow.ly/i/5OTms'),
        # ('http://j.mp/1jBOKo1', 'http://earthfix.info/portables')
    ]
    for test, truth in cases:
        try:
            prepared = url.prepare(test)
            assert prepared == truth
        except AssertionError:
            print "failed on %s" % test
            raise
def extract(source_url):
    """
    Article extraction. The method is as follows:
    1. Get html from the url.
    2. Canonicalize the URL.
    3. If not canonical, prepare the url.
    4. Extract meta tags.
    5. If embedly is active, use it for content extraction.
    6. If embedly doesn't return content or is not active, use readability.
    7. If readability doesn't return content, use the article tag.
    8. If authors aren't detected from meta tags, detect them in the article body.
    """
    # fetch page
    page_html = network.get(source_url)

    # something failed.
    if not page_html:
        log.warning("Failed to extract html from {}".format(source_url))
        return None

    soup = BeautifulSoup(page_html)

    # get canonical url
    canonical_url = meta.canonical_url(soup)
    if not canonical_url:
        canonical_url = url.prepare(
            source_url, source=source_url, canonicalize=False)

    # domain
    domain = url.get_domain(canonical_url)

    # get meta tags + other data
    data = {
        'url': canonical_url,
        'domain': domain,
        'title': meta.title(soup, canonical_url),
        'description': meta.description(soup, canonical_url),
        'img_url': meta.img_url(soup, canonical_url),
        'created': meta.publish_date(soup, canonical_url),
        'favicon': meta.favicon(soup, canonical_url),
        'site_name': meta.site_name(soup, canonical_url),
        'page_type': meta.page_type(soup, canonical_url),
        'authors': author.extract(soup),
        'body': None
    }

    # embed videos
    if url.is_video(canonical_url):
        data['body'] = embed.video(canonical_url)
        return data

    # extract article body
    if settings.EMBEDLY_ENABLED:
        data['body'] = body_via_embedly(canonical_url)
    if not data['body']:
        data['body'] = body_via_readability(page_html, canonical_url)

    # extract body from article tag
    body, raw_html = body_via_article_tag(soup, canonical_url)

    # merge body
    if not data['body']:
        data['body'] = body

    # get creators from raw article html
    if not len(data['authors']) and raw_html:
        data['authors'] = author.extract(raw_html, tags=author.OPTIMISTIC_TAGS)

    # remove site name from authors
    if data.get('site_name'):
        data['authors'] = [
            a.replace(data['site_name'].upper(), "").strip()
            for a in data['authors']
        ]

    # # get links from raw_html + content
    # links = [u for u in url.from_any(data['body']) if source_url not in u]
    # for u in url.from_any(raw_html, source=source_url):
    #     if u not in links and (u != source_url or not u.startswith(source_url)):
    #         links.append(u)
    # # split out internal / external links / article links
    # data['links'] = url.categorize_links(links, domain)

    return data
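# Hedged usage sketch for ``extract`` above; it needs the module's real
# dependencies (network, BeautifulSoup, meta, author, ...) to run. Every
# key read below is populated by the function; the url is illustrative.
data = extract('http://www.fromscratchradio.com/show/marc-dacosta')
if data:
    print data['url']        # canonical url, falling back to url.prepare(...)
    print data['authors']    # meta tags first, article body as a fallback
    print data['body']       # embedly -> readability -> <article> tag fallback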
def test_prepare_with_redirect_back(self):
    source = 'https://www.revealnews.org/article/a-brief-history-of-the-modern-strawberry/'
    u = '//cdn.embedly.com/widgets/media.html?url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DfPxUIz5GHAE&src=http%3A%2F%2Fwww.youtube.com%2Fembed%2FfPxUIz5GHAE&type=text%2Fhtml&key=1b74e47c9db441f8a998fb6138abca72&schema=youtube'
    out = url.prepare(u, source)
    assert out == 'http://www.youtube.com/watch?v=fPxUIz5GHAE'