Exemplo n.º 1
0
 def test_dm1(self):
     """Daily Mail article with four authors."""
     url = (
         'http://www.dailymail.co.uk/news/article-2633025/'
         'London-cleric-convicted-NYC-terrorism-trial.html'
     )
     response = urls_response(url)
     expected_sfn = '{{sfn | Malm | Witheridge | Drury | Bates | 2014}}'
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| last=Malm',
         '| first=Sara',
         '| last2=Witheridge',
         '| first2=Annette',
         '| last3=Drury',
         '| first3=Ian',
         '| last4=Bates',
         '| first4=Daniel',
         ('| title=Hate preacher Abu Hamza GUILTY of setting up US terror '
          'training camps'),
         '| website=Mail Online',
         '| date=2014-05-19',
         ('| url=http://www.dailymail.co.uk/news/article-2633025/'
          'London-cleric-convicted-NYC-terrorism-trial.html'),
         '| ref=harv',
         '| accessdate=',
     ))
     self.assertIn(expected_sfn, response.sfn)
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 2
0
 def test_oth12(self):
     """Times of India (author could not be detected); expect Kashyap."""
     # NOTE(review): another test named test_oth12 appears later in this
     # listing. If both live in the same TestCase, the later definition
     # silently replaces this one -- confirm and rename if so.
     url = (
         'http://timesofindia.indiatimes.com/city/pune/'
         'UK-allows-working-visas-for-Indian-students/'
         'articleshow/1163528927.cms?'
     )
     response = urls_response(url)
     expected_sfn = "{{sfn | Kashyap | 2001}}"
     self.assertIn(expected_sfn, response.sfn)
Exemplo n.º 3
0
 def test_bbc1(self):
     """BBC News article with no authors; ref is an sfnref on the site."""
     # The input is the https address; the expected url field is http.
     url = 'https://www.bbc.com/news/world-asia-27653361'
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = " ".join((
         "* {{cite web",
         "| title=US 'received Qatar assurances' on Afghan prisoner deal",
         "| website=BBC News",
         "| date=2014-06-01",
         "| url=http://www.bbc.com/news/world-asia-27653361",
         "| ref={{sfnref | BBC News | 2014}}",
         "| accessdate=",
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 4
0
 def test_bbc3(self):
     """HTTPS version of bbc2 (the page differs a lot); one author."""
     url = 'https://www.bbc.com/news/science-environment-23814524'
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| last=Gage',
         '| first=Suzi',
         '| title=Sea otter return boosts ailing seagrass in California',
         '| website=BBC News',
         '| date=2013-08-26',
         '| url=http://www.bbc.com/news/science-environment-23814524',
         '| ref=harv',
         '| accessdate=',
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 5
0
 def test_tgd3(self):
     """TG Daily page with "Staff" in the author name; no author expected."""
     url = (
         'http://www.tgdaily.com/space-features/'
         '82906-sma-reveals-giant-star-cluster-in-the-making'
     )
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| title=SMA reveals giant star cluster in the making',
         '| website=TG Daily',
         '| date=2013-12-17',
         ('| url=http://www.tgdaily.com/space-features/'
          '82906-sma-reveals-giant-star-cluster-in-the-making'),
         '| ref={{sfnref | TG Daily | 2013}}',
         '| accessdate=',
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 6
0
 def test_oth14(self):
     """independent.co.uk article; cite has no author fields."""
     url = (
         'http://www.independent.co.uk/news/business/'
         'the-investment-column-tt-group-1103208.html'
     )
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| title=The Investment column: TT Group',
         '| website=The Independent',
         '| date=1999-06-29',
         ('| url=http://www.independent.co.uk/news/business/'
          'the-investment-column-tt-group-1103208.html'),
         '| ref={{sfnref | The Independent | 1999}}',
         '| accessdate=',
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 7
0
 def test_nyt5(self):
     """Special case for date format (date is not in the usual meta tags)."""
     url = (
         'https://www.nytimes.com/2007/06/13/world/americas/'
         '13iht-whale.1.6123654.html'
     )
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| title=19th-century harpoon gives clue on whales',
         '| website=International Herald Tribune',
         '| date=2007-06-13',
         ('| url=https://www.nytimes.com/2007/06/13/world/americas/'
          '13iht-whale.1.6123654.html'),
         '| ref={{sfnref | International Herald Tribune | 2007}}',
         '| accessdate=',
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 8
0
 def test_bbc5(self):
     """news.bbc.co.uk business article with one author."""
     url = 'http://news.bbc.co.uk/2/hi/business/2570109.stm'
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = " ".join((
         "* {{cite web",
         "| last=Madslien",
         "| first=Jorn",
         "| title=Inside the Bentley factory",
         "| website=BBC NEWS",
         "| date=2002-12-24",
         "| url=http://news.bbc.co.uk/2/hi/business/2570109.stm",
         "| ref=harv",
         "| accessdate=",
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 9
0
 def test_oth13(self):
     """highbeam.com document with one author."""
     url = 'http://www.highbeam.com/doc/1P3-3372742961.html'
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| last=Martin',
         '| first=Tracy',
         '| title=Dynamometers Explained',
         '| website=HighBeam Research',
         '| date=2014-07-01',
         '| url=http://www.highbeam.com/doc/1P3-3372742961.html',
         '| ref=harv',
         '| accessdate=',
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 10
0
 def test_tgd2(self):
     """TG Daily page where author and date are hard to find."""
     url = (
         'http://www.tgdaily.com/web/'
         '100381-apple-might-buy-beats-for-32-billion'
     )
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| title=Apple might buy Beats for $3.2 billion',
         '| website=TG Daily',
         '| date=2014-05-09',
         ('| url=http://www.tgdaily.com/web/'
          '100381-apple-might-buy-beats-for-32-billion'),
         '| ref={{sfnref | TG Daily | 2014}}',
         '| accessdate=',
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 11
0
 def test_nyt3(self):
     """NYT 2007 article (old style), one author."""
     # The input is the http address; the expected url field is https.
     url = 'http://www.nytimes.com/2007/12/25/world/africa/25kenya.html'
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| last=Gettleman',
         '| first=Jeffrey',
         '| title=Election Rules Complicate Kenya Race',
         '| website=The New York Times',
         '| date=2007-12-25',
         ('| url=https://www.nytimes.com/2007/12/25/world/africa/'
          '25kenya.html'),
         '| ref=harv',
         '| accessdate=',
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 12
0
 def test_bbc6(self):
     """bbc.com science article with one author."""
     url = 'http://www.bbc.com/news/science-environment-26267918'
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = " ".join((
         "* {{cite web",
         "| last=Amos",
         "| first=Jonathan",
         "| title=European Space Agency picks Plato planet-hunting mission",
         "| website=BBC News",
         "| date=2014-02-20",
         "| url=http://www.bbc.com/news/science-environment-26267918",
         "| ref=harv",
         "| accessdate=",
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 13
0
 def test_tgd1(self):
     """ABC News blog; wrong author would be `last=News | first=ABC`."""
     url = 'http://abcnews.go.com/blogs/headlines/2006/12/saddam_executed/'
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| last=Ross',
         '| first=Brian',
         '| title=Saddam Executed; An Era Comes to an End',
         '| website=ABC News Blogs',
         '| date=2006-12-30',
         ('| url=http://abcnews.go.com/blogs/headlines/2006/12/'
          'saddam_executed/'),
         '| ref=harv',
         '| accessdate=',
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 14
0
 def test_bbc4(self):
     """news.bbc.co.uk Newsnight article with one author."""
     url = 'http://news.bbc.co.uk/2/hi/programmes/newsnight/5178122.stm'
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = " ".join((
         "* {{cite web",
         "| last=Jones",
         "| first=Meirion",
         "| title=Malaria advice 'risks lives'",
         "| website=BBC NEWS",
         "| date=2006-07-13",
         ("| url="
          "http://news.bbc.co.uk/2/hi/programmes/newsnight/5178122.stm"),
         "| ref=harv",
         "| accessdate=",
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 15
0
 def test_oth12(self):
     """thebulletin.org article with one author."""
     # NOTE(review): another test named test_oth12 appears earlier in this
     # listing. If both live in the same TestCase, this definition
     # silently replaces the earlier one -- confirm and rename if so.
     url = 'http://thebulletin.org/evidence-shows-iron-dome-not-working7318'
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| last=Postol',
         '| first=Theodore A.',
         '| title=The evidence that shows Iron Dome is not working',
         '| website=Bulletin of the Atomic Scientists',
         '| date=2014-07-19',
         ('| url=http://thebulletin.org/'
          'evidence-shows-iron-dome-not-working7318'),
         '| ref=harv',
         '| accessdate=',
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 16
0
 def test_oth11(self):
     """Business News Daily; expected url has no cmpid query string."""
     url = (
         'http://www.businessnewsdaily.com/6762-male-female-entrepreneurs'
         '.html?cmpid=514642_20140715_27858876'
     )
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| last=Helmrich',
         '| first=Brittney',
         '| title=Male vs. Female Entrepreneurs: How Are They Different?',
         '| website=Business News Daily',
         '| date=2014-07-10',
         ('| url=http://www.businessnewsdaily.com/6762-male-female-'
          'entrepreneurs.html'),
         '| ref=harv',
         '| accessdate=',
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 17
0
 def test_oth7(self):
     """MIT News: has a By Topic line and the byline contains ' | '."""
     url = (
         'http://news.mit.edu/2014/'
         'traffic-lights-theres-a-better-way-0707'
     )
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| last=Chandler',
         '| first=David L.',
         '| title=Traffic lights: There’s a better way',
         '| website=MIT News',
         '| date=2014-07-07',
         ('| url=http://news.mit.edu/2014/'
          'traffic-lights-theres-a-better-way-0707'),
         '| ref=harv',
         '| accessdate=',
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 18
0
 def test_oth1(self):
     """ensani.ir (Persian): get title by hometitle comparison."""
     url = 'http://www.ensani.ir/fa/content/326173/default.aspx'
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| last=جلیلیان',
         '| first=شهرام',
         '| last2=نیا',
         '| first2=امیر علی',
         '| title=ورود کاسی ها به میان رودان و پیامدهای آن',
         '| website=پرتال جامع علوم انسانی',
         '| date=2014-05-20',
         '| url=http://www.ensani.ir/fa/content/326173/default.aspx',
         '| language=fa',
         '| ref=harv',
         '| accessdate=',
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 19
0
 def test_bg3(self):
     """bostonmagazine.com; its author tags return unrelated authors."""
     url = (
         'http://www.bostonmagazine.com/news/blog/2013/08/21/'
         'juliette-kayyem-jumps-in-for-guv/'
     )
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| last=Bernstein',
         '| first=David S.',
         '| title=Juliette Kayyem Is Running for Governor of Massachusetts',
         '| website=Boston Magazine',
         '| date=2013-08-21',
         ('| url=http://www.bostonmagazine.com/news/blog/2013/08/21/'
          'juliette-kayyem-jumps-in-for-guv/'),
         '| ref=harv',
         '| accessdate=',
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 20
0
 def test_oth6(self):
     """farsnews.com: detection of the website name."""
     url = 'http://www.farsnews.com/newstext.php?nn=13930418000036'
     response = urls_response(url)
     # Fars News uses 'خبرگزاری فارس' as og:author, which is wrong, and
     # that is why its name is not italicized in the sfn.
     expected_sfn = '{{sfn | خبرگزاری فارس | 2014}}'
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| author=خبرگزاری فارس',
         '| title=آیت\u200cالله محمدی گیلانی دارفانی را وداع گفت',
         '| website=خبرگزاری فارس',
         '| date=2014-07-09',
         '| url=http://www.farsnews.com/newstext.php?nn=13930418000036',
         '| language=fa',
         '| ref=harv',
         '| accessdate=',
     ))
     self.assertIn(expected_sfn, response.sfn)
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 21
0
 def test_nyt1(self):
     """NYT 2014 article (new style), one author."""
     # The input is http with a query string; the expected url field is
     # https without the query string.
     url = (
         'http://www.nytimes.com/2014/05/30/business/international/'
         'on-the-internet-the-right-to-forget-vs-the-right-to-know.html?'
         'hp&_r=0'
     )
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| last=Hakim',
         '| first=Danny',
         '| title=Right to Be Forgotten? Not That Easy',
         '| website=The New York Times',
         '| date=2014-05-29',
         ('| url=https://www.nytimes.com/2014/05/30/business/international/'
          'on-the-internet-the-right-to-forget-vs-the-right-to-know.html'),
         '| ref=harv',
         '| accessdate=',
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 22
0
 def test_wp1(self):
     """1 author, 2005; the pubdate is different from the last edit date."""
     url = (
         'http://www.washingtonpost.com/wp-dyn/content/article/2005/09/02/'
         'AR2005090200822.html'
     )
     response = urls_response(url)
     expected_sfn = '{{sfn | Sachs | 2005}}'
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| last=Sachs',
         '| first=Andrea',
         '| title=March of the Migration',
         '| website=Washington Post',
         '| date=2005-09-04',
         ('| url=http://www.washingtonpost.com/wp-dyn/content/article/'
          '2005/09/02/AR2005090200822.html'),
         '| ref=harv',
         '| accessdate=',
     ))
     self.assertIn(expected_sfn, response.sfn)
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 23
0
 def test_oth8(self):
     """Two Guardian authors that are mentioned in other tags, too."""
     url = (
         'http://www.theguardian.com/world/2014/jul/14/'
         'israel-drone-launched-gaza-ashdod'
     )
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| last=Beaumont',
         '| first=Peter',
         '| last2=Crowcroft',
         '| first2=Orlando',
         '| title=Israel says it has shot down drone launched from Gaza',
         '| website=the Guardian',
         '| date=2014-07-14',
         ('| url=http://www.theguardian.com/world/2014/jul/14/'
          'israel-drone-launched-gaza-ashdod'),
         '| ref=harv',
         '| accessdate=',
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 24
0
 def test_oth4(self):
     """Live Science: rel="author" tag contains invalid information."""
     # The expected url field has no cmpid query string.
     url = (
         'http://www.livescience.com/'
         '46619-sterile-neutrino-experiment-beginning.html?'
         'cmpid=514645_20140702_27078936'
     )
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = " ".join((
         "* {{cite web",
         "| last=Ghose",
         "| first=Tia",
         ("| title='Revolutionary' Physics: "
          "Do Sterile Neutrinos Lurk in the Universe?"),
         "| website=Live Science",
         "| date=2014-07-01",
         ("| url=http://www.livescience.com/"
          "46619-sterile-neutrino-experiment-beginning.html"),
         "| ref=harv",
         "| accessdate=",
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 25
0
 def test_nyt6(self):
     """NYT article whose author's last name is O'Connor."""
     url = (
         'http://www.nytimes.com/2003/10/09/us/'
         'adding-weight-to-suspicion-sonar-is-linked-to-whale-deaths.html'
     )
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = " ".join((
         "* {{cite web",
         "| last=O'Connor",
         "| first=Anahad",
         ("| title=Adding Weight to Suspicion, "
          "Sonar Is Linked to Whale Deaths"),
         "| website=The New York Times",
         "| date=2003-10-09",
         ("| url=https://www.nytimes.com/2003/10/09/us/"
          "adding-weight-to-suspicion-sonar-is-linked-to-whale-deaths.html"),
         "| ref=harv",
         "| accessdate=",
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 26
0
 def test_dt3(self):
     """Telegraph article, 1 author, 2011."""
     # The expected url field contains a /science/ segment that the input
     # address does not have.
     url = (
         'http://www.telegraph.co.uk/news/8323909/'
         'The-sperm-whale-works-in-extraordinary-ways.html'
     )
     response = urls_response(url)
     expected_sfn = '{{sfn | Whitehead | 2011}}'
     # Consecutive template fields are separated by exactly one space.
     expected_cite = " ".join((
         "* {{cite web",
         "| last=Whitehead",
         "| first=Hal",
         "| title=The sperm whale works in extraordinary ways",
         "| website=Telegraph.co.uk",
         "| date=2011-02-15",
         ("| url=http://www.telegraph.co.uk/news/science/8323909/"
          "The-sperm-whale-works-in-extraordinary-ways.html"),
         "| ref=harv",
         "| accessdate=",
     ))
     self.assertIn(expected_sfn, response.sfn)
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 27
0
 def test_dt2(self):
     """Telegraph article, 1 author, 2003."""
     url = (
         'http://www.telegraph.co.uk/news/science/science-news/3313298/'
         'Marine-collapse-linked-to-whale-decline.html'
     )
     response = urls_response(url)
     expected_sfn = '{{sfn | Highfield | 2003}}'
     # Consecutive template fields are separated by exactly one space.
     expected_cite = " ".join((
         "* {{cite web",
         "| last=Highfield",
         "| first=Roger",
         "| title=Marine 'collapse' linked to whale decline",
         "| website=Telegraph.co.uk",
         "| date=2003-09-29",
         ("| url=http://www.telegraph.co.uk/news/science/science-news/"
          "3313298/Marine-collapse-linked-to-whale-decline.html"),
         "| ref=harv",
         "| accessdate=",
     ))
     self.assertIn(expected_sfn, response.sfn)
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 28
0
 def test_oth15(self):
     """Page contains <link property="og:site_name" href="ایسنا" />."""
     # The expected url field stops after the numeric id; the
     # percent-encoded part of the input address is not in it.
     url = (
         'http://www.isna.ir/news/95110603890/'
         '%D8%A8%D8%B1%D8%AC%D8%A7%D9%85-%D8%B4%D8%B1%D8%A7%DB%8C%D8%B7-'
         '%D8%A8%DB%8C%D9%86-%D8%A7%D9%84%D9%85%D9%84%D9%84%DB%8C-'
         '%D8%A7%DB%8C%D8%B1%D8%A7%D9%86-%D8%B1%D8%A7-'
         '%DA%A9%D8%A7%D9%85%D9%84%D8%A7-%D9%85%D8%AA%D8%AD%D9%88%D9%84-'
         '%DA%A9%D8%B1%D8%AF'
     )
     response = urls_response(url)
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| title=برجام شرایط بین‌المللی ایران را کاملا متحول کرد',
         '| website=ایسنا',
         '| date=2017-01-25',
         '| url=http://www.isna.ir/news/95110603890/',
         '| language=fa',
         '| ref={{sfnref | ایسنا | 2017}}',
         '| accessdate=',
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 29
0
 def test_bg1(self):
     """boston.com article requested with date format '%B %d, %Y'."""
     url = (
         'http://www.boston.com/cars/news-and-reviews/2014/06/28/hot-rod'
         '-stamps-google-road-prospectus/hylbVi9qonAwBIH10CwiDP/story.html'
     )
     response = urls_response(url, '%B %d, %Y')
     # Consecutive template fields are separated by exactly one space.
     expected_cite = ' '.join((
         '* {{cite web',
         '| last=Griffith',
         '| first=Bill',
         '| title=Hot Rod Stamps; Google on Road; A GM Prospectus',
         '| website=Boston.com',
         '| date=June 29, 2014',
         ('| url=http://www.boston.com/cars/news-and-reviews/'
          '2014/06/28/hot-rod-stamps-google-road-prospectus/'
          'hylbVi9qonAwBIH10CwiDP/story.html'),
         '| ref=harv',
         '| accessdate=',
     ))
     self.assertIn(expected_cite, response.cite)
Exemplo n.º 30
0
def waybackmachine_response(archive_url: str, date_format: str = '%Y-%m-%d'):
    """Create the response namedtuple for a web-archive URL.

    ``archive_url`` is matched with ``URL_FULLMATCH``; on a match it yields
    the archive year, month, day and the original URL. The archive page is
    processed with ``url2dict`` in this process while ``original_url2dict``
    runs for the original URL in a child process and reports back through
    a pipe.

    The 'dead-url' key of the resulting dict is set to:
    - 'no' when the child returned data and its 'title' or 'soup-title'
      equals the archive's (the child's data is then merged in);
    - 'unfit' when the child returned data but neither title matches;
    - 'yes' when the child returned nothing.

    :param archive_url: the URL to build the citation for.
    :param date_format: stored as 'date_format' in the citation dict, and
        forwarded to ``urls_response`` when ``archive_url`` does not match.
    :return: the value of ``dictionary_to_response`` for the merged dict;
        the value of ``urls_response`` if ``archive_url`` does not match;
        or a ``Response`` with ``error=100`` if ``url2dict`` raises
        ``ContentTypeError`` or ``ContentLengthError``.
    """
    m = URL_FULLMATCH(archive_url)
    if not m:
        # Could not parse the archive_url. Treat as an ordinary URL.
        return urls_response(archive_url, date_format)
    archive_year, archive_month, archive_day, original_url = m.groups()
    parent_conn, child_conn = Pipe()
    original_process = Process(
        target=original_url2dict, args=(original_url, child_conn))
    original_process.start()
    # The child has its own handle to the sending end now. Dropping ours
    # lets recv() below raise EOFError, instead of blocking forever, if the
    # child exits without sending anything.
    child_conn.close()
    try:
        archive_dict = url2dict(archive_url)
    except (ContentTypeError, ContentLengthError) as e:
        logger.exception(archive_url)
        # The child's result is not going to be used; stop and reap it so
        # that it is not left behind as an orphaned/zombie process.
        original_process.terminate()
        original_process.join()
        parent_conn.close()
        return Response(sfnt='Could not process the request.',
                        ctnt=e,
                        error=100)
    archive_dict['date_format'] = date_format
    archive_dict['url'] = original_url
    archive_dict['archive-url'] = archive_url
    archive_dict['archive-date'] = date(
        int(archive_year), int(archive_month), int(archive_day))
    try:
        # Receive before join(): a child blocked on send() never exits.
        original_dict = parent_conn.recv()
    except EOFError:
        # The child ended without reporting; same as an unsuccessful fetch.
        original_dict = None
    finally:
        parent_conn.close()
    original_process.join()
    if original_dict:
        # The original_process has been successful. Its data is trusted
        # only if the original title is the same as the archive title;
        # otherwise the content has probably changed since it was archived.
        if (original_dict['title'] == archive_dict['title']
                or original_dict['soup-title'] == archive_dict['soup-title']):
            archive_dict.update(original_dict)
            archive_dict['dead-url'] = 'no'
        else:
            archive_dict['dead-url'] = 'unfit'
    else:
        archive_dict['dead-url'] = 'yes'
    if archive_dict['website'] == 'Internet Archive':
        # Prefer the original site's host name over the archive's own name.
        hostname = urlparse(original_url).hostname
        # hostname is None when original_url has no network location; the
        # website value is left as is in that case.
        if hostname:
            # Strip only a leading 'www.', not occurrences inside the name.
            if hostname.startswith('www.'):
                hostname = hostname[len('www.'):]
            archive_dict['website'] = hostname
    return dictionary_to_response(archive_dict)