Exemplo n.º 1
0
 def process_item(self, html_page):
     """Extract an article from a crawled page and add it to the output corpus.

     The publish date comes from ``examine`` and the title and main text
     from boilerpipe's ``ArticleExtractor``. When ``self.art_ok`` accepts the
     body, the text is split into sentences and the article is handed to
     ``self.output_corpus.add_instance`` if at least three sentences remain.

     Processing is best-effort: any exception raised along the way is logged
     and the page is skipped, so a single bad page does not propagate an
     error to the caller.

     Args:
         html_page: Mapping read for the keys ``'html'``, ``'source'``,
             ``'url'`` and ``'timestamp'``.

     Returns:
         None. The only result is the optional call to
         ``self.output_corpus.add_instance``.
     """
     import logging

     try:
         publish_date = examine(html_page['html'])
         # Imported inside the try block so that a failing import is handled
         # like any other per-page failure.
         from boilerpipe.extract import Extractor
         extractor = Extractor(extractor='ArticleExtractor',
                               html=html_page['html'])
         body = str(extractor.getText())
         title = str(extractor.source.getTitle())
         art = {
             'title': title,
             'body': body,
             'lang': self.lang,
             'source': html_page['source'],
             'url': html_page['url'],
             'crawl_date': html_page['timestamp'],
             'publish_date': publish_date,
             'article_id': sha1(html_page['url'].encode('utf-8')).hexdigest(),
             'sentences': []
         }
         if not self.art_ok(body):
             return

         # Non-breaking spaces become plain spaces, and literal backslash-n
         # sequences become real newlines.
         content = body.replace(u'\xa0', u' ').replace('\\n', '\n')
         if self.lang == 'en':
             sents = sent_tokenize(content)
         else:
             sents = []
             for para in content.split('\n'):
                 sents += sentence_split(para, self.lang)
             # NOTE(review): check_sent filtering only happens on this
             # non-English path - confirm that this is intended.
             sents = [sent for sent in sents if self.check_sent(sent)]
         art['sentences'] = sents
         if len(sents) >= 3:
             self.output_corpus.add_instance(art)
     except Exception:
         # Keep the best-effort contract (skip the page), but leave a trace
         # instead of silently hiding the failure.
         if isinstance(html_page, dict):
             page_url = html_page.get('url', '<unknown>')
         else:
             page_url = '<unknown>'
         logging.getLogger(__name__).warning(
             'Skipping page %r: article extraction failed',
             page_url, exc_info=True)
Exemplo n.º 2
0
def test_download():
    '''Check fetch_url() on several URLs and examine() on what was fetched.'''
    # Inputs holding only a space or a very long run of zeros give no result.
    assert examine(' ', False) is None
    assert examine('0'*int(10e7), False) is None

    # Nothing is expected back for the status/404 URL.
    assert fetch_url('https://httpbin.org/status/404') is None

    # The status/200 URL is expected to give None as well, and examine()
    # has to accept that None as input.
    empty_response = fetch_url('https://httpbin.org/status/200')
    assert empty_response is None
    assert examine(empty_response) is None

    # The links/2/2 URL returns content, but examine() finds nothing in it.
    links_content = fetch_url('https://httpbin.org/links/2/2')
    assert links_content is not None
    assert examine(links_content) is None

    # Same expectation for the /html URL, with False as second argument.
    html_content = fetch_url('https://httpbin.org/html')
    assert html_content is not None
    assert examine(html_content, False) is None
Exemplo n.º 3
0
def test_download():
    '''Check fetch_url() on several URLs and examine() on what was fetched.'''
    # NOTE(review): the three checks below are disabled; the reason is not
    # visible here, so they are kept for reference.
    #assert fetch_url('https://www.iana.org/404') is None
    #assert fetch_url('https://www.google.com/blank.html') is None
    #assert fetch_url('https://blank.org') is None

    # Nothing is expected back for the status/404 URL.
    assert fetch_url('https://httpbin.org/status/404') is None

    # The status/200 URL is expected to give None as well, and examine()
    # has to accept that None as input.
    no_content = fetch_url('https://httpbin.org/status/200')
    assert no_content is None
    assert examine(no_content) is None

    # The links/2/2 URL returns content, but examine() finds nothing in it.
    links_content = fetch_url('https://httpbin.org/links/2/2')
    assert links_content is not None
    assert examine(links_content) is None

    # Same expectation for the /html URL, with False as second argument.
    html_content = fetch_url('https://httpbin.org/html')
    assert html_content is not None
    assert examine(html_content, False) is None
Exemplo n.º 4
0
def test_cli():
    '''Check examine() with extensive_bool enabled, with and without maxdate.'''
    iso_page = '<html><body>2016-07-12</body></html>'
    expected = '2016-07-12'

    # Inputs holding only a space or a very long run of zeros give no result.
    assert examine(' ', extensive_bool=True) is None
    assert examine('0'*int(10e7), extensive_bool=True) is None

    # A date written as "12. Juli 2016" inside an entry-date span is returned
    # in YYYY-MM-DD form.
    assert examine('<html><body><span class="entry-date">12. Juli 2016</span></body></html>', True) == expected

    # A bare ISO date in the page body is returned unchanged.
    assert examine(iso_page, extensive_bool=True) == expected

    # A maxdate before the page's date gives None; a later one keeps the date.
    assert examine(iso_page, extensive_bool=True, maxdate='2015-01-01') is None
    assert examine(iso_page, extensive_bool=True, maxdate='2017-12-31') == expected

    # A maxdate that is not a real calendar date still yields the page's date.
    assert examine(iso_page, extensive_bool=True, maxdate='2017-41-41') == expected
Exemplo n.º 5
0
def test_cli():
    '''Check examine() when True is passed as its second argument.'''
    expected = '2016-07-12'

    # Inputs holding only a space or a very long run of zeros give no result.
    assert examine(' ', True) is None
    assert examine('0'*int(10e7), True) is None

    # A date written as "12. Juli 2016" inside an entry-date span is returned
    # in YYYY-MM-DD form.
    assert examine('<html><body><span class="entry-date">12. Juli 2016</span></body></html>', True) == expected

    # A bare ISO date in the page body is returned unchanged.
    assert examine('<html><body>2016-07-12</body></html>', True) == expected