def test_reveal(self):
    """Smoke-test article extraction against a live Reveal News page."""
    page = 'https://www.revealnews.org/article/a-brief-history-of-the-modern-strawberry/'
    data = article.extract(page)
    # byline + classification
    assert 'ARIANE WU' in data['authors']
    assert data['page_type'] == 'article'
    # headline / summary metadata
    assert data['title'] == 'A Brief History of the Modern Strawberry'
    assert data['description'] == 'This short stop-motion animation explains how clever advertising tactics and certain pesticides helped make the strawberry cheaply and widely available in the U.S.'
    # site-level metadata
    assert data['domain'] == 'revealnews.org'
    assert data['site_name'] == 'Reveal'
    assert data['created'] == datetime.datetime(2014, 11, 11, 0, 57, tzinfo=pytz.utc)
    assert data['favicon'] == 'https://www.revealnews.org/wp-content/themes/reveal2015/static/images/cir/favicon.ico'
    assert data['img_url'] == 'https://www.revealnews.org/wp-content/uploads/2015/02/Strawberry-CA0.png'
    # body text + canonical url
    assert 'it seems that strawberries are served with just about everything' in data['body']
    assert data['url'] == page
def format(self, obj):
    """
    For now all of these options are standard to twitter events.

    Normalize a raw twitter event dict into the app's event schema,
    mutating and returning `obj`. Returns None when the event should be
    dropped: its url is on a bad/own domain, or `must_link` is set and
    no links were extracted.
    """
    # set the status.
    obj['status'] = self.options.get('event_status', 'pending')

    # prepare url (these are formatted as redirects).
    obj['url'] = url.prepare(obj['url'], expand=False, canonicalize=False)

    # ignore bad domains / org's own domains.
    if self._is_bad_domain(obj['url']):
        return None

    # extract and merge article data.
    if url.is_article(obj['url']):
        data = article.extract(obj['url'], type=None)
        if data:
            obj.update(data)
            # these extracted fields belong to content items, not events.
            obj.pop('type', None)
            obj.pop('site_name', None)
            obj.pop('favicon', None)

    # set source id: prefer the raw id, then the url, then a fresh uuid.
    # A sentinel keeps the fallbacks lazy — the original evaluated
    # gen_uuid() on every call even when an id was present — while still
    # honoring an explicit `id` of None exactly like dict.pop did.
    _missing = object()
    _id = obj.pop('id', _missing)
    if _id is _missing:
        _id = obj.get('url', _missing)
        if _id is _missing:
            _id = gen_uuid()
    if ":" in _id:
        # ids arrive namespaced (e.g. "tag:twitter:123"); keep the tail.
        _id = _id.split(':')[-1]
    obj['source_id'] = _id

    # TODO: Make formatting more elegant.
    title_tmpl = self.options.get('set_event_title', None)
    if title_tmpl:
        obj['title'] = title_tmpl.format(**self._fmt(obj))

    desc_tmpl = self.options.get('set_event_description', None)
    if desc_tmpl:
        obj['description'] = desc_tmpl.format(**self._fmt(obj))

    # a non-empty list is truthy, so the original's extra len() check
    # was redundant.
    tag_ids = self.options.get('set_event_tag_ids', None)
    if tag_ids:
        obj['tag_ids'] = tag_ids

    # hack because the app cant handle this field being a list.
    if self.options.get('set_event_content_items', None):
        obj.setdefault('content_item_ids', [])
        for c in self.options.get('set_event_content_items', []):
            if isinstance(c, dict):
                if c.get('id', None):
                    obj['content_item_ids'].append(c.get('id'))
            elif isinstance(c, int):
                obj['content_item_ids'].append(c)

    # filter links.
    if self.options.get('must_link', False) \
            and not obj.get('links', []):
        return None
    return obj
def test_nytimes(self):
    """Smoke-test article extraction against a live NYTimes page."""
    shared_url = 'http://www.nytimes.com/2015/06/05/fashion/mens-style/farewell-my-lovely-cigarettes.html?smid=tw-share&_r=0'
    result = article.extract(shared_url)
    # byline + classification
    assert 'CHOIRE SICHA' in result['authors']
    assert result['page_type'] == 'article'
    # headline / summary metadata
    assert result['title'] == 'Farewell, My Lovely Cigarettes'
    assert result['description'] == 'A lifelong smoker takes his final puff and looks back on a 30-year habit.'
    # site-level metadata
    assert result['domain'] == 'nytimes.com'
    assert result['site_name'] == 'Nytimes'
    assert result['created'] == datetime.datetime(2015, 6, 3, 0, 0, tzinfo=pytz.utc)
    assert result['favicon'] == 'http://static01.nyt.com/favicon.ico'
    assert result['img_url'] == 'http://static01.nyt.com/images/2015/06/05/fashion/05RITESOFPASSAGE1/05RITESOFPASSAGE1-facebookJumbo.jpg'
    assert 'Someone could easily get cut' in result['body']
    # tracking params should be stripped from the canonical url.
    assert result['url'] == 'http://www.nytimes.com/2015/06/05/fashion/mens-style/farewell-my-lovely-cigarettes.html'
def test_reveal(self):
    """Smoke-test article extraction against a live Reveal News page."""
    target = 'https://www.revealnews.org/article/a-brief-history-of-the-modern-strawberry/'
    extracted = article.extract(target)
    # byline + classification
    assert 'ARIANE WU' in extracted['authors']
    assert extracted['page_type'] == 'article'
    # headline / summary metadata
    assert extracted['title'] == 'A Brief History of the Modern Strawberry'
    assert extracted['description'] == 'This short stop-motion animation explains how clever advertising tactics and certain pesticides helped make the strawberry cheaply and widely available in the U.S.'
    # site-level metadata
    assert extracted['domain'] == 'revealnews.org'
    assert extracted['site_name'] == 'Reveal'
    assert extracted['created'] == datetime.datetime(2014, 11, 11, 0, 57, tzinfo=pytz.utc)
    assert extracted['favicon'] == 'https://www.revealnews.org/wp-content/themes/reveal2015/static/images/cir/favicon.ico'
    assert extracted['img_url'] == 'https://www.revealnews.org/wp-content/uploads/2015/02/Strawberry-CA0.png'
    # body text + canonical url
    assert 'it seems that strawberries are served with just about everything' in extracted['body']
    assert extracted['url'] == target
def test_propublica(self):
    """Smoke-test article extraction against a live ProPublica page."""
    link = 'http://www.propublica.org/article/congress-to-consider-scaling-down-group-homes-for-troubled-children'
    out = article.extract(link)
    # single-author byline must come back as an exact one-element list.
    assert ['JOAQUIN SAPIEN'] == out['authors']
    assert out['page_type'] == 'article'
    # headline / summary metadata
    assert out['title'] == 'Congress to Consider Scaling Down Group Homes for Troubled Children'
    assert out['description'] == 'At a hearing in Washington, a renewed call for addressing the violence and neglect that plagues group homes for foster youth.'
    # site-level metadata
    assert out['domain'] == 'propublica.org'
    assert out['site_name'] == 'ProPublica'
    assert out['created'] == datetime.datetime(2015, 5, 20, 17, 47, 13, tzinfo=pytz.utc)
    assert 'www.propublica.org/favicon.ico' in out['favicon']
    assert out['img_url'] == 'http://www.propublica.org/images/ngen/gypsy_og_image/20150520-group-home-hearing-1200x630.jpg'
    # body text + canonical url
    assert 'finding that children had repeatedly been sent to facilities that were rife with abuse and that had become known recruiting grounds for pimp' in out['body']
    assert out['url'] == link
def work(self, url, type='article'):
    """
    Extract structured metadata for a raw url via article.extract.

    NOTE(review): the previous docstring ("Standardize + cache a raw url
    returning it's standardized url + global bitly url.") described a
    different worker — this body only delegates to article.extract.
    Also note the `url` parameter shadows the `url` module used elsewhere
    in this file, so that module is unreachable inside this method.

    :param url: the raw url to extract from.
    :param type: extraction type forwarded to article.extract
        (defaults to 'article'; shadows the `type` builtin).
    :return: whatever article.extract returns for this url/type.
    """
    return article.extract(url, type=type)
def _extract(self, url, type):
    """
    Dispatch metadata extraction by content type.

    :param url: the url to extract from.
    :param type: the content type; only 'article' is supported.
    :return: the result of article.extract(url) for type 'article'.
    :raises NotImplementedError: for any other type.
    """
    if type == 'article':
        return article.extract(url)
    # Bug fix: the original did `raise NotImplemented(...)`.
    # NotImplemented is the binary-operator sentinel and is not callable,
    # so that line raised a TypeError instead of the intended exception.
    raise NotImplementedError(
        "NewsLynx only has support for Article Extraction.")
def test_multiple_authors(self):
    """Extraction should find both authors on a co-bylined article."""
    story = 'http://www.propublica.org/article/new-snowden-documents-reveal-secret-memos-expanding-spying'
    parsed = article.extract(story)
    author_count = len(parsed.get('authors', []))
    assert author_count == 2
def test_multiple_authors(self):
    """Extraction should find both authors on a co-bylined article."""
    piece = 'http://www.propublica.org/article/new-snowden-documents-reveal-secret-memos-expanding-spying'
    fields = article.extract(piece)
    assert len(fields.get('authors', [])) == 2