def test_dm1(self):
    """Daily Mail article credited to four authors."""
    url = (
        'http://www.dailymail.co.uk/news/article-2633025/'
        'London-cleric-convicted-NYC-terrorism-trial.html'
    )
    response = urls_response(url)
    expected_sfn = '{{sfn | Malm | Witheridge | Drury | Bates | 2014}}'
    expected_cite = (
        '* {{cite web '
        '| last=Malm '
        '| first=Sara '
        '| last2=Witheridge '
        '| first2=Annette '
        '| last3=Drury '
        '| first3=Ian '
        '| last4=Bates '
        '| first4=Daniel '
        '| title=Hate preacher Abu Hamza GUILTY of setting up US terror '
        'training camps '
        '| website=Mail Online '
        '| date=2014-05-19 '
        '| url=http://www.dailymail.co.uk/news/article-2633025/'
        'London-cleric-convicted-NYC-terrorism-trial.html '
        '| ref=harv '
        '| accessdate='
    )
    # Both the short footnote and the full citation must list all authors.
    self.assertIn(expected_sfn, response.sfn)
    self.assertIn(expected_cite, response.cite)
def test_oth12(self):
    """Times of India article; the sfn must cite Kashyap, 2001.

    The previous docstring said the author could not be detected, but the
    assertion below expects the author's last name in the short footnote.
    """
    # NOTE(review): another method in this file is also named test_oth12.
    # If both live in the same class, the later definition replaces the
    # earlier one and only one of them runs -- confirm and rename.
    url = (
        'http://timesofindia.indiatimes.com/city/pune/'
        'UK-allows-working-visas-for-Indian-students/'
        'articleshow/1163528927.cms?'
    )
    response = urls_response(url)
    expected_sfn = "{{sfn | Kashyap | 2001}}"
    self.assertIn(expected_sfn, response.sfn)
def test_bbc1(self):
    """BBC News article with no authors; ref falls back to the site name."""
    url = 'https://www.bbc.com/news/world-asia-27653361'
    response = urls_response(url)
    # The request uses https while the expected citation carries the http
    # form of the same address.
    expected_cite = (
        "* {{cite web "
        "| title=US 'received Qatar assurances' on Afghan prisoner deal "
        "| website=BBC News "
        "| date=2014-06-01 "
        "| url=http://www.bbc.com/news/world-asia-27653361 "
        "| ref={{sfnref | BBC News | 2014}} "
        "| accessdate="
    )
    self.assertIn(expected_cite, response.cite)
def test_bbc3(self):
    """BBC News science article requested over https, one author.

    Originally described as the https version of "bbc2", whose page was
    noted to differ a lot.
    """
    url = 'https://www.bbc.com/news/science-environment-23814524'
    response = urls_response(url)
    expected_cite = (
        '* {{cite web '
        '| last=Gage '
        '| first=Suzi '
        '| title=Sea otter return boosts ailing seagrass in California '
        '| website=BBC News '
        '| date=2013-08-26 '
        '| url=http://www.bbc.com/news/science-environment-23814524 '
        '| ref=harv '
        '| accessdate='
    )
    self.assertIn(expected_cite, response.cite)
def test_tgd3(self):
    """TG Daily page with "Staff" in the author name.

    The expected citation has no author fields and uses the website name
    for the ref.
    """
    url = (
        'http://www.tgdaily.com/space-features/'
        '82906-sma-reveals-giant-star-cluster-in-the-making'
    )
    response = urls_response(url)
    expected_cite = (
        '* {{cite web '
        '| title=SMA reveals giant star cluster in the making '
        '| website=TG Daily '
        '| date=2013-12-17 '
        '| url=http://www.tgdaily.com/space-features/'
        '82906-sma-reveals-giant-star-cluster-in-the-making '
        '| ref={{sfnref | TG Daily | 2013}} '
        '| accessdate='
    )
    self.assertIn(expected_cite, response.cite)
def test_oth14(self):
    """The Independent (independent.co.uk) article without author fields."""
    url = (
        'http://www.independent.co.uk/news/business/'
        'the-investment-column-tt-group-1103208.html'
    )
    response = urls_response(url)
    expected_cite = (
        '* {{cite web '
        '| title=The Investment column: TT Group '
        '| website=The Independent '
        '| date=1999-06-29 '
        '| url=http://www.independent.co.uk/news/business/'
        'the-investment-column-tt-group-1103208.html '
        '| ref={{sfnref | The Independent | 1999}} '
        '| accessdate='
    )
    self.assertIn(expected_cite, response.cite)
def test_nyt5(self):
    """NYT-hosted IHT article; special case for the date format.

    According to the original note, the date is not in the usual meta tags.
    """
    url = (
        'https://www.nytimes.com/2007/06/13/world/americas/'
        '13iht-whale.1.6123654.html'
    )
    response = urls_response(url)
    expected_cite = (
        '* {{cite web '
        '| title=19th-century harpoon gives clue on whales '
        '| website=International Herald Tribune '
        '| date=2007-06-13 '
        '| url=https://www.nytimes.com/2007/06/13/world/americas/'
        '13iht-whale.1.6123654.html '
        '| ref={{sfnref | International Herald Tribune | 2007}} '
        '| accessdate='
    )
    self.assertIn(expected_cite, response.cite)
def test_bbc5(self):
    """news.bbc.co.uk business story with a single author."""
    url = 'http://news.bbc.co.uk/2/hi/business/2570109.stm'
    response = urls_response(url)
    expected_cite = (
        "* {{cite web "
        "| last=Madslien "
        "| first=Jorn "
        "| title=Inside the Bentley factory "
        "| website=BBC NEWS "
        "| date=2002-12-24 "
        "| url=http://news.bbc.co.uk/2/hi/business/2570109.stm "
        "| ref=harv "
        "| accessdate="
    )
    self.assertIn(expected_cite, response.cite)
def test_oth13(self):
    """HighBeam Research (highbeam.com) document with a single author."""
    url = 'http://www.highbeam.com/doc/1P3-3372742961.html'
    response = urls_response(url)
    expected_cite = (
        '* {{cite web '
        '| last=Martin '
        '| first=Tracy '
        '| title=Dynamometers Explained '
        '| website=HighBeam Research '
        '| date=2014-07-01 '
        '| url=http://www.highbeam.com/doc/1P3-3372742961.html '
        '| ref=harv '
        '| accessdate='
    )
    self.assertIn(expected_cite, response.cite)
def test_tgd2(self):
    """TG Daily page where author and date are hard to find.

    The expected citation has a date but no author fields.
    """
    url = (
        'http://www.tgdaily.com/web/'
        '100381-apple-might-buy-beats-for-32-billion'
    )
    response = urls_response(url)
    expected_cite = (
        '* {{cite web '
        '| title=Apple might buy Beats for $3.2 billion '
        '| website=TG Daily '
        '| date=2014-05-09 '
        '| url=http://www.tgdaily.com/web/'
        '100381-apple-might-buy-beats-for-32-billion '
        '| ref={{sfnref | TG Daily | 2014}} '
        '| accessdate='
    )
    self.assertIn(expected_cite, response.cite)
def test_nyt3(self):
    """Old-style New York Times article with one author."""
    url = 'http://www.nytimes.com/2007/12/25/world/africa/25kenya.html'
    response = urls_response(url)
    # The http address is requested; the citation is expected to carry the
    # https form.
    expected_cite = (
        '* {{cite web '
        '| last=Gettleman '
        '| first=Jeffrey '
        '| title=Election Rules Complicate Kenya Race '
        '| website=The New York Times '
        '| date=2007-12-25 '
        '| url=https://www.nytimes.com/2007/12/25/world/africa/'
        '25kenya.html '
        '| ref=harv '
        '| accessdate='
    )
    self.assertIn(expected_cite, response.cite)
def test_bbc6(self):
    """bbc.com science article with a single author."""
    url = 'http://www.bbc.com/news/science-environment-26267918'
    response = urls_response(url)
    expected_cite = (
        "* {{cite web "
        "| last=Amos "
        "| first=Jonathan "
        "| title=European Space Agency picks Plato planet-hunting mission "
        "| website=BBC News "
        "| date=2014-02-20 "
        "| url=http://www.bbc.com/news/science-environment-26267918 "
        "| ref=harv "
        "| accessdate="
    )
    self.assertIn(expected_cite, response.cite)
def test_tgd1(self):
    """ABC News blog post; the author must be Brian Ross.

    Guards against the wrong author "| last=News | first=ABC".
    """
    url = 'http://abcnews.go.com/blogs/headlines/2006/12/saddam_executed/'
    response = urls_response(url)
    expected_cite = (
        '* {{cite web '
        '| last=Ross '
        '| first=Brian '
        '| title=Saddam Executed; An Era Comes to an End '
        '| website=ABC News Blogs '
        '| date=2006-12-30 '
        '| url=http://abcnews.go.com/blogs/headlines/2006/12/'
        'saddam_executed/ '
        '| ref=harv '
        '| accessdate='
    )
    self.assertIn(expected_cite, response.cite)
def test_bbc4(self):
    """news.bbc.co.uk Newsnight story with a single author."""
    url = 'http://news.bbc.co.uk/2/hi/programmes/newsnight/5178122.stm'
    response = urls_response(url)
    expected_cite = (
        "* {{cite web "
        "| last=Jones "
        "| first=Meirion "
        "| title=Malaria advice 'risks lives' "
        "| website=BBC NEWS "
        "| date=2006-07-13 "
        "| url="
        "http://news.bbc.co.uk/2/hi/programmes/newsnight/5178122.stm "
        "| ref=harv "
        "| accessdate="
    )
    self.assertIn(expected_cite, response.cite)
def test_oth12(self):
    """thebulletin.org article with a single author."""
    # NOTE(review): another method in this file is also named test_oth12.
    # If both live in the same class, the later definition replaces the
    # earlier one and only one of them runs -- confirm and rename.
    url = 'http://thebulletin.org/evidence-shows-iron-dome-not-working7318'
    response = urls_response(url)
    expected_cite = (
        '* {{cite web '
        '| last=Postol '
        '| first=Theodore A. '
        '| title=The evidence that shows Iron Dome is not working '
        '| website=Bulletin of the Atomic Scientists '
        '| date=2014-07-19 '
        '| url=http://thebulletin.org/'
        'evidence-shows-iron-dome-not-working7318 '
        '| ref=harv '
        '| accessdate='
    )
    self.assertIn(expected_cite, response.cite)
def test_oth11(self):
    """Business News Daily article requested with a tracking query string.

    The expected citation URL does not contain the ``cmpid`` query.
    """
    url = (
        'http://www.businessnewsdaily.com/6762-male-female-entrepreneurs'
        '.html?cmpid=514642_20140715_27858876'
    )
    response = urls_response(url)
    expected_cite = (
        '* {{cite web '
        '| last=Helmrich '
        '| first=Brittney '
        '| title=Male vs. Female Entrepreneurs: How Are They Different? '
        '| website=Business News Daily '
        '| date=2014-07-10 '
        '| url=http://www.businessnewsdaily.com/6762-male-female-'
        'entrepreneurs.html '
        '| ref=harv '
        '| accessdate='
    )
    self.assertIn(expected_cite, response.cite)
def test_oth7(self):
    """MIT News article.

    Per the original note, the page contains a "By Topic" line and its
    byline contains ' | '.
    """
    url = (
        'http://news.mit.edu/2014/'
        'traffic-lights-theres-a-better-way-0707'
    )
    response = urls_response(url)
    expected_cite = (
        '* {{cite web '
        '| last=Chandler '
        '| first=David L. '
        '| title=Traffic lights: There’s a better way '
        '| website=MIT News '
        '| date=2014-07-07 '
        '| url=http://news.mit.edu/2014/'
        'traffic-lights-theres-a-better-way-0707 '
        '| ref=harv '
        '| accessdate='
    )
    self.assertIn(expected_cite, response.cite)
def test_oth1(self):
    """ensani.ir page; title is obtained by home-title comparison.

    The expected citation is in Persian, has two authors and carries
    ``language=fa``.
    """
    url = 'http://www.ensani.ir/fa/content/326173/default.aspx'
    response = urls_response(url)
    expected_cite = (
        '* {{cite web '
        '| last=جلیلیان '
        '| first=شهرام '
        '| last2=نیا '
        '| first2=امیر علی '
        '| title=ورود کاسی ها به میان رودان و پیامدهای آن '
        '| website=پرتال جامع علوم انسانی '
        '| date=2014-05-20 '
        '| url=http://www.ensani.ir/fa/content/326173/default.aspx '
        '| language=fa '
        '| ref=harv '
        '| accessdate='
    )
    self.assertIn(expected_cite, response.cite)
def test_bg3(self):
    """bostonmagazine.com post; author tags return unrelated authors.

    Only David S. Bernstein is expected in the citation.
    """
    url = (
        'http://www.bostonmagazine.com/news/blog/2013/08/21/'
        'juliette-kayyem-jumps-in-for-guv/'
    )
    response = urls_response(url)
    expected_cite = (
        '* {{cite web '
        '| last=Bernstein '
        '| first=David S. '
        '| title=Juliette Kayyem Is Running for Governor of Massachusetts '
        '| website=Boston Magazine '
        '| date=2013-08-21 '
        '| url=http://www.bostonmagazine.com/news/blog/2013/08/21/'
        'juliette-kayyem-jumps-in-for-guv/ '
        '| ref=harv '
        '| accessdate='
    )
    self.assertIn(expected_cite, response.cite)
def test_oth6(self):
    """Fars News page: detection of the website name."""
    url = 'http://www.farsnews.com/newstext.php?nn=13930418000036'
    response = urls_response(url)
    # Fars News puts 'خبرگزاری فارس' (its own name) in og:author, which is
    # wrong; that is why the name shows up as the author and is not
    # italicized in the sfn.
    expected_cite = (
        '* {{cite web '
        '| author=خبرگزاری فارس '
        '| title=آیت\u200cالله محمدی گیلانی دارفانی را وداع گفت '
        '| website=خبرگزاری فارس '
        '| date=2014-07-09 '
        '| url=http://www.farsnews.com/newstext.php?nn=13930418000036 '
        '| language=fa '
        '| ref=harv '
        '| accessdate='
    )
    self.assertIn('{{sfn | خبرگزاری فارس | 2014}}', response.sfn)
    self.assertIn(expected_cite, response.cite)
def test_nyt1(self):
    """New-style New York Times article with one author.

    The request URL carries a query string that is absent from the
    expected citation URL.
    """
    url = (
        'http://www.nytimes.com/2014/05/30/business/international/'
        'on-the-internet-the-right-to-forget-vs-the-right-to-know.html?'
        'hp&_r=0'
    )
    response = urls_response(url)
    expected_cite = (
        '* {{cite web '
        '| last=Hakim '
        '| first=Danny '
        '| title=Right to Be Forgotten? Not That Easy '
        '| website=The New York Times '
        '| date=2014-05-29 '
        '| url=https://www.nytimes.com/2014/05/30/business/international/'
        'on-the-internet-the-right-to-forget-vs-the-right-to-know.html '
        '| ref=harv '
        '| accessdate='
    )
    self.assertIn(expected_cite, response.cite)
def test_wp1(self):
    """Washington Post, one author, 2005.

    The publication date is different from the last edit date; the
    expected citation date is 2005-09-04.
    """
    url = (
        'http://www.washingtonpost.com/wp-dyn/content/article/2005/09/02/'
        'AR2005090200822.html'
    )
    response = urls_response(url)
    expected_sfn = '{{sfn | Sachs | 2005}}'
    expected_cite = (
        '* {{cite web '
        '| last=Sachs '
        '| first=Andrea '
        '| title=March of the Migration '
        '| website=Washington Post '
        '| date=2005-09-04 '
        '| url=http://www.washingtonpost.com/wp-dyn/content/article/'
        '2005/09/02/AR2005090200822.html '
        '| ref=harv '
        '| accessdate='
    )
    self.assertIn(expected_sfn, response.sfn)
    self.assertIn(expected_cite, response.cite)
def test_oth8(self):
    """Guardian article with two authors.

    Per the original note, the authors are mentioned in other tags, too.
    """
    url = (
        'http://www.theguardian.com/world/2014/jul/14/'
        'israel-drone-launched-gaza-ashdod'
    )
    response = urls_response(url)
    expected_cite = (
        '* {{cite web '
        '| last=Beaumont '
        '| first=Peter '
        '| last2=Crowcroft '
        '| first2=Orlando '
        '| title=Israel says it has shot down drone launched from Gaza '
        '| website=the Guardian '
        '| date=2014-07-14 '
        '| url=http://www.theguardian.com/world/2014/jul/14/'
        'israel-drone-launched-gaza-ashdod '
        '| ref=harv '
        '| accessdate='
    )
    self.assertIn(expected_cite, response.cite)
def test_oth4(self):
    """Live Science article whose rel="author" tag has invalid information.

    The request URL carries a ``cmpid`` query that is absent from the
    expected citation URL.
    """
    url = (
        'http://www.livescience.com/'
        '46619-sterile-neutrino-experiment-beginning.html?'
        'cmpid=514645_20140702_27078936'
    )
    response = urls_response(url)
    expected_cite = (
        "* {{cite web "
        "| last=Ghose "
        "| first=Tia "
        "| title='Revolutionary' Physics: "
        "Do Sterile Neutrinos Lurk in the Universe? "
        "| website=Live Science "
        "| date=2014-07-01 "
        "| url=http://www.livescience.com/"
        "46619-sterile-neutrino-experiment-beginning.html "
        "| ref=harv "
        "| accessdate="
    )
    self.assertIn(expected_cite, response.cite)
def test_nyt6(self):
    """New York Times article whose author's last name is O'Connor."""
    url = (
        'http://www.nytimes.com/2003/10/09/us/'
        'adding-weight-to-suspicion-sonar-is-linked-to-whale-deaths.html'
    )
    response = urls_response(url)
    expected_cite = (
        "* {{cite web "
        "| last=O'Connor "
        "| first=Anahad "
        "| title=Adding Weight to Suspicion, "
        "Sonar Is Linked to Whale Deaths "
        "| website=The New York Times "
        "| date=2003-10-09 "
        "| url=https://www.nytimes.com/2003/10/09/us/"
        "adding-weight-to-suspicion-sonar-is-linked-to-whale-deaths.html "
        "| ref=harv "
        "| accessdate="
    )
    self.assertIn(expected_cite, response.cite)
def test_dt3(self):
    """Telegraph article, one author, 2011.

    The expected citation URL contains a ``science/`` path segment that
    the requested URL does not.
    """
    url = (
        'http://www.telegraph.co.uk/news/8323909/'
        'The-sperm-whale-works-in-extraordinary-ways.html'
    )
    response = urls_response(url)
    expected_sfn = '{{sfn | Whitehead | 2011}}'
    expected_cite = (
        "* {{cite web "
        "| last=Whitehead "
        "| first=Hal "
        "| title=The sperm whale works in extraordinary ways "
        "| website=Telegraph.co.uk "
        "| date=2011-02-15 "
        "| url=http://www.telegraph.co.uk/news/science/8323909/"
        "The-sperm-whale-works-in-extraordinary-ways.html "
        "| ref=harv "
        "| accessdate="
    )
    self.assertIn(expected_sfn, response.sfn)
    self.assertIn(expected_cite, response.cite)
def test_dt2(self):
    """Telegraph article, one author, 2003."""
    url = (
        'http://www.telegraph.co.uk/news/science/science-news/3313298/'
        'Marine-collapse-linked-to-whale-decline.html'
    )
    response = urls_response(url)
    expected_sfn = '{{sfn | Highfield | 2003}}'
    expected_cite = (
        "* {{cite web "
        "| last=Highfield "
        "| first=Roger "
        "| title=Marine 'collapse' linked to whale decline "
        "| website=Telegraph.co.uk "
        "| date=2003-09-29 "
        "| url=http://www.telegraph.co.uk/news/science/science-news/"
        "3313298/Marine-collapse-linked-to-whale-decline.html "
        "| ref=harv "
        "| accessdate="
    )
    self.assertIn(expected_sfn, response.sfn)
    self.assertIn(expected_cite, response.cite)
def test_oth15(self):
    """ISNA page containing <link property="og:site_name" href="ایسنا" />.

    The expected citation URL is the short form ending at the news id.
    """
    url = (
        'http://www.isna.ir/news/95110603890/'
        '%D8%A8%D8%B1%D8%AC%D8%A7%D9%85-%D8%B4%D8%B1%D8%A7%DB%8C%D8%B7-'
        '%D8%A8%DB%8C%D9%86-%D8%A7%D9%84%D9%85%D9%84%D9%84%DB%8C-'
        '%D8%A7%DB%8C%D8%B1%D8%A7%D9%86-%D8%B1%D8%A7-'
        '%DA%A9%D8%A7%D9%85%D9%84%D8%A7-%D9%85%D8%AA%D8%AD%D9%88%D9%84-'
        '%DA%A9%D8%B1%D8%AF'
    )
    response = urls_response(url)
    expected_cite = (
        '* {{cite web '
        '| title=برجام شرایط بینالمللی ایران را کاملا متحول کرد '
        '| website=ایسنا '
        '| date=2017-01-25 '
        '| url=http://www.isna.ir/news/95110603890/ '
        '| language=fa '
        '| ref={{sfnref | ایسنا | 2017}} '
        '| accessdate='
    )
    self.assertIn(expected_cite, response.cite)
def test_bg1(self):
    """boston.com article rendered with the date format '%B %d, %Y'."""
    url = (
        'http://www.boston.com/cars/news-and-reviews/2014/06/28/hot-rod'
        '-stamps-google-road-prospectus/hylbVi9qonAwBIH10CwiDP/story.html'
    )
    # The second positional argument selects the output date format.
    response = urls_response(url, '%B %d, %Y')
    expected_cite = (
        '* {{cite web '
        '| last=Griffith '
        '| first=Bill '
        '| title=Hot Rod Stamps; Google on Road; A GM Prospectus '
        '| website=Boston.com '
        '| date=June 29, 2014 '
        '| url=http://www.boston.com/cars/news-and-reviews/'
        '2014/06/28/hot-rod-stamps-google-road-prospectus/'
        'hylbVi9qonAwBIH10CwiDP/story.html '
        '| ref=harv '
        '| accessdate='
    )
    self.assertIn(expected_cite, response.cite)
def waybackmachine_response(archive_url: str, date_format: str = '%Y-%m-%d'):
    """Create the response namedtuple for an archive URL.

    The archive page is fetched in this process while the original URL is
    fetched concurrently by ``original_url2dict`` in a child process that
    reports back through a pipe.

    :param archive_url: URL of the archived copy. If ``URL_FULLMATCH`` does
        not match it, it is handled as an ordinary URL by ``urls_response``.
    :param date_format: stored in the result dict under ``'date_format'``
        (and forwarded to ``urls_response`` in the fallback case).
    :return: the value of ``dictionary_to_response`` for the merged data,
        or an error ``Response`` (``error=100``) when the archive page
        raises ``ContentTypeError`` or ``ContentLengthError``.

    The ``'dead-url'`` entry is set to ``'no'`` when the original page was
    fetched and its title matches the archive's, ``'unfit'`` when it was
    fetched but the titles differ, and ``'yes'`` when the child process
    returned a falsy result.
    """
    m = URL_FULLMATCH(archive_url)
    if not m:
        # Could not parse the archive_url. Treat as an ordinary URL.
        return urls_response(archive_url, date_format)
    archive_year, archive_month, archive_day, original_url = m.groups()
    parent_conn, child_conn = Pipe()
    original_process = Process(
        target=original_url2dict, args=(original_url, child_conn))
    original_process.start()
    archive_fetched = False
    try:
        archive_dict = url2dict(archive_url)
        archive_fetched = True
    except (ContentTypeError, ContentLengthError) as e:
        logger.exception(archive_url)
        return Response(
            sfnt='Could not process the request.', ctnt=e, error=100)
    finally:
        if not archive_fetched:
            # The child's result will never be read on this path. Stop and
            # reap it so it is not left running (or blocked on sending into
            # a pipe nobody reads), then release both pipe ends.
            original_process.terminate()
            original_process.join()
            parent_conn.close()
            child_conn.close()
    archive_dict['date_format'] = date_format
    archive_dict['url'] = original_url
    archive_dict['archive-url'] = archive_url
    archive_dict['archive-date'] = date(
        int(archive_year), int(archive_month), int(archive_day))
    # Receive before joining so the child is never left waiting for its
    # data to be read.
    original_dict = parent_conn.recv()
    original_process.join()
    if original_dict:
        # The original_process has been successful.
        if (
            original_dict['title'] == archive_dict['title']
            or original_dict['soup-title'] == archive_dict['soup-title']
        ):
            # The original title is the same as the archive title, so the
            # data from the live page can be used.
            archive_dict.update(original_dict)
            archive_dict['dead-url'] = 'no'
        else:
            # The titles differ, which means that the content probably has
            # changed and the original data cannot be trusted.
            archive_dict['dead-url'] = 'unfit'
    else:
        archive_dict['dead-url'] = 'yes'
    if archive_dict['website'] == 'Internet Archive':
        # Use the original site's host name instead of the archive's name.
        archive_dict['website'] = (
            urlparse(original_url).hostname.replace('www.', ''))
    return dictionary_to_response(archive_dict)