def test_get_query(self):
    # The query embedded in the header of test_text must be extracted
    # verbatim (including its line breaks), while the header of test_text2
    # must produce no query at all.
    expected_query = (
        u'(((Japan OR Fukushima) AND (Erdbeben OR nuklear OR Tsunami'
        ' OR Krise\nOR Katastrophe OR Tepco)) '
        ' AND date(geq(7/3/2011) AND leq(31/8/2011)) AND\n'
        'pub(B\xf6rsen Zeitung OR Frankfurter Rundschau OR '
        'taz OR die tageszeitung))'
    )

    # Case 1: header that carries a query.
    header_text, _ = split_header(self.test_text)
    parsed_header = parse_header(header_text)
    self.assertEqual(get_query(parsed_header), expected_query)

    # Case 2: header for which get_query is expected to return None.
    header_text, _ = split_header(self.test_text2)
    parsed_header = parse_header(header_text)
    self.assertIsNone(get_query(parsed_header))
def test_parse_no_header(self):
    # For test_text2 the header part must be empty once U+FEFF (byte order
    # mark) characters and surrounding whitespace are removed, and the body
    # must still split into exactly one item.
    raw_header, remainder = split_header(self.test_text2)
    cleaned_header = raw_header.replace(u'\ufeff', '').strip()
    self.assertFalse(bool(cleaned_header))

    body_parts = list(split_body(remainder))
    self.assertEqual(len(body_parts), 1)
def split(self):
    # Convenience helper: run split_header on the test_text fixture and
    # hand its result back unchanged.
    header_and_body = split_header(self.test_text)
    return header_and_body
def test_kop_as_headline(self):
    # Some LexisNexis files contain "KOP: " instead of "HEADLINE: ".
    # The first item split from test_text3's body must still parse to the
    # expected title.
    # NOTE(review): another test_kop_as_headline is defined later in this
    # file; if both live in the same class, this one is shadowed and never
    # runs -- confirm.
    _, body_text = split_header(self.test_text3)
    first_item = next(split_body(body_text))
    parsed = parse_article(first_item)
    self.assertEqual(
        "Gretta Duisenberg oprichtster van Palestina-groep",
        parsed['title'],
    )
def test_kop_as_headline(self):
    # Some LexisNexis files contain "KOP: " instead of "HEADLINE: ".
    # The first item split from test_text3's body, once parsed and passed
    # through body_to_article, must expose the expected headline.
    # NOTE(review): an earlier test_kop_as_headline is defined in this
    # file; if both live in the same class, this definition replaces the
    # earlier one -- confirm.
    _, body_text = split_header(self.test_text3)
    first_item = next(split_body(body_text))
    parsed_fields = parse_article(first_item)
    converted = body_to_article(*parsed_fields)
    self.assertEqual(
        "Gretta Duisenberg oprichtster van Palestina-groep",
        converted.headline,
    )