def test_docs_serialization_format(self):
    '''
    Checks that the serialization format returned by Jeremia matches the
    stored reference output.

    Two documents are submitted as a batch via ``submit_documents`` and
    the first one again on its own via ``submit_document``; both results
    are compared with JSON reference files located below ``data/``.
    '''
    import json
    from eWRT.util.module_path import get_resource

    def load_reference(relative_path):
        # use a context manager so that the file handle is closed
        # deterministically rather than leaked
        with open(get_resource(__file__, relative_path)) as reference:
            return json.load(reference)

    def canonical(document):
        # dictionaries cannot be ordered on Python 3; a key-sorted JSON
        # dump yields identical sort keys for equal documents
        return json.dumps(document, sort_keys=True, default=repr)

    DOCS = [{'id': 7,
             'body': 'Ehre sei Gott.',
             'title': '',
             'format': 'text/html',
             'header': {'test': 'testvalue'}},
            {'id': 8,
             'body': '',
             'title': 'Guten Tag!',
             'format': 'text/html',
             'header': {}}]
    REFERENCE_MULTI = load_reference(
        'data/jeremia_reference_output_documents.json')
    REFERENCE_SINGLE = load_reference(
        'data/jeremia_reference_output_single_document.json')

    # document list: the order of the returned documents is irrelevant
    j = Jeremia()
    result = j.submit_documents(DOCS)
    assert (sorted(REFERENCE_MULTI, key=canonical) ==
            sorted(result, key=canonical))

    # single document
    result = j.submit_document(DOCS[0])
    assert REFERENCE_SINGLE == result
def test_missing_space_tokenattribute(self):
    '''
    Submits each test text as a single-document batch and checks the
    number of tokens found in the first sentence of the result.
    '''
    client = Jeremia()
    expected_token_counts = {
        'Min. 25 000 Kč - řidiči nákladních automobilů, tahačů a… Úřad práce Písek http://t.co/QowX6PQjrR': 17,
        'Retos de la #RSE (II): 1. Más autocrítica en las memorias de sostenibilidad': 17,
    }

    for body, expected_count in expected_token_counts.items():
        # wrap the text into a one-element document list
        batch = [{
            'id': 'alpha',
            'body': body,
            'title': '',
            'format': 'text/html',
            'header': {}
        }]
        response = client.submit_documents(documents=batch)
        first_xml = list(response)[0]['xml_content']
        first_sentence = XMLContent(first_xml).sentences[0]
        assert len(list(first_sentence.tokens)) == expected_count
def test_custom_headers(self):
    '''
    Verifies that custom headers are preserved and that no headers are
    added to a document submitted with an empty header.
    '''
    with_headers = {
        'id': 123,
        'body': 'Hier wird die Zensur zuschlagen. Der zweite Satz ist aber okay.',
        'title': 'Testdokument :)',
        'format': 'text/html',
        'header': {'dc:author': 'Ana',
                   'dc:source': 'http://test.org'},
    }
    without_headers = {
        'id': 124,
        'body': 'Das zweite Dokument.',
        'title': 'Zweites Testdokument :)',
        'format': 'text/html',
        'header': {},
    }
    # opening tag expected for the document without custom headers
    expected_page_tag = (
        '<wl:page xmlns:wl="http://www.weblyzard.com/wl/2013#" '
        'xmlns:dc="http://purl.org/dc/elements/1.1/" '
        'wl:id="124" dc:format="text/html" xml:lang="de" '
        'wl:nilsimsa="8030473ac029f400680409349e47100e00a29585c04a25ec808342b4c0a1aec8">'
    )

    client = Jeremia()
    first, second = client.submit_documents((with_headers, without_headers))

    # the results may arrive in either order; make `first` the document
    # that carries the custom headers
    if first['content_id'] == '124':
        first, second = second, first

    header_xml = first['xml_content']
    assert 'dc:source="http://test.org"' in header_xml
    assert 'dc:author="Ana"' in header_xml
    assert expected_page_tag in second['xml_content']
def test_custom_headers(self):
    '''
    Verifies that custom headers are preserved and that no headers are
    added to a document submitted with an empty header. The opening
    page tag is compared case-insensitively.
    '''
    expected_page_tag = (
        '<wl:page xmlns:wl="http://www.weblyzard.com/wl/2013#" '
        'xmlns:dc="http://purl.org/dc/elements/1.1/" '
        'wl:id="124" dc:format="text/html" xml:lang="de" '
        'wl:nilsimsa="8030473ac029f400680409349e47100e00a29585c04a25ec808342b4c0a1aec8">'
    )
    submitted = (
        {
            'id': 123,
            'body': 'Hier wird die Zensur zuschlagen. Der zweite Satz ist aber okay.',
            'title': 'Testdokument :)',
            'format': 'text/html',
            'header': {
                'dc:author': 'Ana',
                'dc:source': 'http://test.org',
            },
        },
        {
            'id': 124,
            'body': 'Das zweite Dokument.',
            'title': 'Zweites Testdokument :)',
            'format': 'text/html',
            'header': {},
        },
    )

    first, second = Jeremia().submit_documents(submitted)

    # swap the documents if the one without headers was returned first
    if first['content_id'] == '124':
        first, second = second, first

    for expected_attribute in ('dc:source="http://test.org"',
                               'dc:author="Ana"'):
        assert expected_attribute in first['xml_content']

    assert expected_page_tag.lower() in second['xml_content'].lower()
def get_weblyzard_xml_documents(corpus_documents):
    '''
    Pre-processes the corpus documents: they are submitted to Jeremia and
    every returned ``xml_content`` is wrapped in an XMLContent object.

    :param corpus_documents: the documents passed on to
        ``Jeremia.submit_documents``.
    :returns: a dictionary mapping each ``content_id`` to its XMLContent
        object.
    '''
    client = Jeremia()
    parsed_documents = []
    for processed in client.submit_documents(corpus_documents):
        parsed_documents.append(XMLContent(processed['xml_content']))
    return dict((parsed.content_id, parsed) for parsed in parsed_documents)
def get_weblyzard_xml_documents(corpus_documents):
    '''
    Converts the corpus documents into weblyzard XML objects by submitting
    them to Jeremia.

    :param corpus_documents: the documents passed on to
        ``Jeremia.submit_documents``.
    :returns: a dictionary with one XMLContent object per returned
        document, keyed by the object's ``content_id``.
    '''
    results = Jeremia().submit_documents(corpus_documents)
    xml_documents = [XMLContent(entry['xml_content']) for entry in results]

    documents_by_id = {}
    for xml_document in xml_documents:
        documents_by_id[xml_document.content_id] = xml_document
    return documents_by_id
def test_sentence_splitting(self):
    '''
    Submits the first document of ``self.DOCS`` and checks that the
    returned XML contains a ``wl:is_title`` marker. The result, its XML
    and the extracted sentences are printed for inspection.
    '''
    client = Jeremia()
    for processed in client.submit_documents(self.DOCS[:1]):
        print(processed)
        raw_xml = processed['xml_content']

        # extract the sentence texts from the parsed XML
        sentence_texts = []
        for entry in XMLContent(raw_xml).sentences:
            sentence_texts.append(entry.sentence)

        print(raw_xml)
        assert 'wl:is_title' in raw_xml
        print(sentence_texts)
def test_illegal_xml_format_filtering(self):
    '''
    Submits a document whose body contains the control character 0x1A
    and checks that the returned XML can still be parsed, i.e. that the
    first sentence of every result is available.
    '''
    DOCS = [{'id': 'alpha',
             'body': 'This is an illegal XML Sequence: J\x1amica',
             'title': 'Hello "world" more ',
             'format': 'text/html',
             'header': {}}]

    j = Jeremia()
    for doc in j.submit_documents(DOCS):
        xml = XMLContent(doc['xml_content'])
        print(doc['xml_content'])
        # identity check: `!= None` would invoke a user-defined
        # comparison operator instead of testing for None itself
        assert xml.sentences[0].sentence is not None
def test_illegal_xml_format_filtering(self):
    '''
    Verifies that a body containing the control character 0x1A still
    yields parsable XML: the first sentence of every returned document
    must be present.
    '''
    DOCS = [{
        'id': 'alpha',
        'body': 'This is an illegal XML Sequence: J\x1amica',
        'title': 'Hello "world" more ',
        'format': 'text/html',
        'header': {}
    }]

    j = Jeremia()
    for doc in j.submit_documents(DOCS):
        xml = XMLContent(doc['xml_content'])
        print(doc['xml_content'])
        # compare against None by identity rather than with `!=`
        assert xml.sentences[0].sentence is not None
def test_single_document_with_annotations(self):
    '''
    Tests the handling of single document annotations.

    A document with two title and two body annotations is submitted; all
    four annotations must be returned and their ``start``/``end`` indices
    must match the values expected for the sentence identified by the
    annotation's ``md5sum``.

    The test is skipped for Jeremia versions older than 0.0.4.
    '''
    import re

    def version_tuple(version):
        # The version is compared numerically component by component; a
        # plain string comparison would rank "0.0.10" below "0.0.4" and
        # fails on Python 3 if the service returns bytes.
        if isinstance(version, bytes):
            version = version.decode('ascii', 'ignore')
        return tuple(int(part) for part in re.findall(r'\d+', str(version)))

    DOC = {'id': 12,
           'body': 'UBS has finally succeeded. They obtained a 10% share of CS.',
           'title': 'UBS versus Credit Suisse.',
           'format': 'text/html',
           'title_annotation': [{'start': 0, 'end': 3,
                                 'surfaceForm': 'UBS',
                                 'key': 'http://dbpedia.org/UBS'},
                                {'start': 11, 'end': 24,
                                 'surfaceForm': 'Credit Suisse',
                                 'key': 'http://dbpedia.org/Credit Suisse'}],
           'body_annotation': [{'start': 0, 'end': 3,
                                'surfaceForm': 'UBS',
                                'key': 'http://dbpedia.org/UBS'},
                               {'start': 56, 'end': 58,
                                'surfaceForm': 'CS',
                                'key': 'http://dbpedia.org/Credit Suisse'}],
           'header': {},
           }
    j = Jeremia()

    # this test requires Jeremia version 0.0.4+; report it as skipped
    # rather than letting it pass silently
    if version_tuple(j.version()) < (0, 0, 4):
        self.skipTest('requires Jeremia version 0.0.4+')

    print('submitting document with annotations...')
    result = j.submit_document(DOC)

    # check: all annotations have been preserved
    print(result)
    assert len(result['annotation']) == 4

    # check: annotations
    for annotation in result['annotation']:
        # title
        if annotation['md5sum'] == '8e3f3deac5e6c01dab521c07e3a60d7b':
            assert annotation['start'] == 0 or annotation['start'] == 11
            assert annotation['end'] == 3 or annotation['end'] == 24
        # first body sentence
        elif annotation['md5sum'] == 'ffafdc744dcda3d58ab6eafc86ad99b1':
            assert annotation['start'] == 0
            assert annotation['end'] == 3
        # second body sentence with adjusted indices
        elif annotation['md5sum'] == '25faaf0960a68ae741125ca436b330ee':
            assert annotation['start'] == 29
            assert annotation['end'] == 31
def test_missing_space_tokenattribute(self):
    '''
    Submits each test text as a single-document batch and checks the
    number of tokens found in the first sentence of the result.
    '''
    def text_as_doc(text):
        ''' Wraps ``text`` into a one-element document list. '''
        return [{'id': 'alpha',
                 'body': text,
                 'title': '',
                 'format': 'text/html',
                 'header': {}}]

    j = Jeremia()
    test_texts = {
        'Min. 25 000 Kč - řidiči nákladních automobilů, tahačů a… Úřad práce Písek http://t.co/QowX6PQjrR': 17,
        'Retos de la #RSE (II): 1. Más autocrítica en las memorias de sostenibilidad': 17,
    }

    # dict.items() works on Python 2 and 3, whereas iteritems() was
    # removed in Python 3
    for text, token_number in test_texts.items():
        result = j.submit_documents(documents=text_as_doc(text))
        res_xml = list(result)[0]['xml_content']
        assert len(
            list(XMLContent(res_xml).sentences[0].tokens)) == token_number
def test_docs_serialization_format(self):
    '''
    Compares the output of Jeremia with stored reference results.

    The documents are submitted once as a list (``submit_documents``) and
    once individually (``submit_document``); the results must equal the
    content of the JSON reference files located below ``data/``.
    '''
    import json
    from eWRT.util.module_path import get_resource

    def read_json_resource(relative_path):
        # the with-statement guarantees that the file is closed again
        with open(get_resource(__file__, relative_path)) as handle:
            return json.load(handle)

    def sort_key(document):
        # a list of dictionaries cannot be sorted directly on Python 3;
        # equal documents produce the same key-sorted JSON text
        return json.dumps(document, sort_keys=True, default=repr)

    DOCS = [{
        'id': 7,
        'body': 'Ehre sei Gott.',
        'title': '',
        'format': 'text/html',
        'header': {
            'test': 'testvalue'
        }
    }, {
        'id': 8,
        'body': '',
        'title': 'Guten Tag!',
        'format': 'text/html',
        'header': {}
    }]
    REFERENCE_MULTI = read_json_resource(
        'data/jeremia_reference_output_documents.json')
    REFERENCE_SINGLE = read_json_resource(
        'data/jeremia_reference_output_single_document.json')

    # document list: compare independently of the order of the results
    j = Jeremia()
    result = j.submit_documents(DOCS)
    assert (sorted(REFERENCE_MULTI, key=sort_key) ==
            sorted(result, key=sort_key))

    # single document
    result = j.submit_document(DOCS[0])
    assert REFERENCE_SINGLE == result
def test_blacklist(self):
    ''' Tests the blacklist-based sentence filtering. '''
    censored = 'Hier wird die Zensur zuschlagen.'
    permitted = 'Der zweite Satz ist aber okay.'

    source_id = 1
    blacklist = ['6e44889df94d6408bbeeab8837bfbe01',
                 '422d7f2000393b8c50a37f9d363ad511']
    docs = [{'id': 123,
             'body': 'Hier wird die Zensur zuschlagen. Der zweite Satz ist aber okay.',
             'title': 'Testdokument :)',
             'format': 'text/html',
             'header': {}}]

    client = Jeremia()

    def submit_and_get_sentences(**options):
        # submits the test document and returns the last entry of the
        # extracted sentence collections
        return self._get_sentences(
            client.submit_documents(docs, **options)).pop()

    # with an active blacklist
    client.update_blacklist(source_id=source_id, blacklist=blacklist)
    sentences = submit_and_get_sentences(source_id=source_id)
    assert censored not in sentences
    assert permitted in sentences

    # the stored blacklist items
    assert blacklist == client.get_blacklist(source_id)

    # submission without a source id, i.e. without blacklist
    sentences = submit_and_get_sentences()
    assert censored in sentences
    assert permitted in sentences

    # after clearing the blacklist
    client.clear_blacklist(source_id)
    sentences = submit_and_get_sentences(source_id=source_id)
    assert censored in sentences
    assert permitted in sentences

    # the blacklist is empty now
    assert [] == client.get_blacklist(source_id)
def test_single_document_processing(self):
    '''
    Submits the second document of ``self.DOCS`` on its own and checks
    that the result differs from the empty string.
    '''
    client = Jeremia()
    print('submitting document...')
    annotated = client.submit_document(self.DOCS[1])
    is_not_empty = annotated != ""
    self.assertTrue(is_not_empty)
def test_has_queued_threads_exception(self):
    '''
    Queries ``has_queued_threads`` on a client created for
    ``http://localhost:8080`` and expects a result equal to True.
    '''
    local_client = Jeremia(url='http://localhost:8080')
    assert local_client.has_queued_threads() == True
def test_has_queued_threads(self):
    '''
    Checks that ``has_queued_threads`` yields a value that compares equal
    to either True or False.
    '''
    queued = Jeremia().has_queued_threads()
    is_boolean_like = queued == True or queued == False
    assert is_boolean_like
def test_illegal_input_args(self):
    '''
    Submitting an empty document list must raise a ValueError.
    '''
    client = Jeremia()
    self.assertRaises(ValueError, client.submit_documents, [])
def test_blacklist(self):
    ''' Tests the blacklist-based sentence filtering. '''
    first_sentence = 'Hier wird die Zensur zuschlagen.'
    second_sentence = 'Der zweite Satz ist aber okay.'

    source_id = 1
    blacklist = [
        '6e44889df94d6408bbeeab8837bfbe01',
        '422d7f2000393b8c50a37f9d363ad511',
    ]
    docs = [{
        'id': 123,
        'body': 'Hier wird die Zensur zuschlagen. Der zweite Satz ist aber okay.',
        'title': 'Testdokument :)',
        'format': 'text/html',
        'header': {},
    }]

    jeremia = Jeremia()

    # step 1: the blacklist is applied to submissions with the source id
    jeremia.update_blacklist(source_id=source_id, blacklist=blacklist)
    processed = jeremia.submit_documents(docs, source_id=source_id)
    sentences = self._get_sentences(processed).pop()
    assert first_sentence not in sentences
    assert second_sentence in sentences

    # step 2: the blacklist can be read back
    assert blacklist == jeremia.get_blacklist(source_id)

    # step 3: submissions without a source id are not filtered
    processed = jeremia.submit_documents(docs)
    sentences = self._get_sentences(processed).pop()
    assert first_sentence in sentences
    assert second_sentence in sentences

    # step 4: clearing the blacklist disables the filtering
    jeremia.clear_blacklist(source_id)
    processed = jeremia.submit_documents(docs, source_id=source_id)
    sentences = self._get_sentences(processed).pop()
    assert first_sentence in sentences
    assert second_sentence in sentences

    # step 5: the blacklist is reported as empty
    assert [] == jeremia.get_blacklist(source_id)
def test_single_document_with_annotations(self):
    '''
    Tests the handling of single document annotations.

    The submitted document carries two title and two body annotations.
    All four must be present in the result, with ``start``/``end``
    indices matching the values expected for the sentence identified by
    the annotation's ``md5sum``.

    The test is skipped for Jeremia versions older than 0.0.4.
    '''
    import re

    def parse_version(raw_version):
        # Compare versions numerically: a byte-wise comparison ranks
        # b"0.0.10" below b"0.0.4". Both bytes and text are accepted.
        if isinstance(raw_version, bytes):
            raw_version = raw_version.decode('ascii', 'ignore')
        return tuple(int(number)
                     for number in re.findall(r'\d+', str(raw_version)))

    DOC = {
        'id': 12,
        'body': 'UBS has finally succeeded. They obtained a 10% share of CS.',
        'title': 'UBS versus Credit Suisse.',
        'format': 'text/html',
        'title_annotation': [{
            'start': 0,
            'end': 3,
            'surfaceForm': 'UBS',
            'key': 'http://dbpedia.org/UBS'
        }, {
            'start': 11,
            'end': 24,
            'surfaceForm': 'Credit Suisse',
            'key': 'http://dbpedia.org/Credit Suisse'
        }],
        'body_annotation': [{
            'start': 0,
            'end': 3,
            'surfaceForm': 'UBS',
            'key': 'http://dbpedia.org/UBS'
        }, {
            'start': 56,
            'end': 58,
            'surfaceForm': 'CS',
            'key': 'http://dbpedia.org/Credit Suisse'
        }],
        'header': {},
    }
    j = Jeremia()

    # this test requires Jeremia version 0.0.4+; mark it as skipped
    # instead of reporting a silent success
    if parse_version(j.version()) < (0, 0, 4):
        self.skipTest('requires Jeremia version 0.0.4+')

    print('submitting document with annotations...')
    result = j.submit_document(DOC)

    # check: all annotations have been preserved
    print(result)
    assert len(result['annotation']) == 4

    # check: annotations
    for annotation in result['annotation']:
        # title
        if annotation['md5sum'] == '8e3f3deac5e6c01dab521c07e3a60d7b':
            assert annotation['start'] == 0 or annotation['start'] == 11
            assert annotation['end'] == 3 or annotation['end'] == 24
        # first body sentence
        elif annotation['md5sum'] == 'ffafdc744dcda3d58ab6eafc86ad99b1':
            assert annotation['start'] == 0
            assert annotation['end'] == 3
        # second body sentence with adjusted indices
        elif annotation['md5sum'] == '25faaf0960a68ae741125ca436b330ee':
            assert annotation['start'] == 29
            assert annotation['end'] == 31
def test_batch_processing(self):
    '''
    Submits all documents of ``self.DOCS`` as one batch and expects
    twenty results.
    '''
    processed = Jeremia().submit_documents(self.DOCS)
    number_of_results = len(processed)
    self.assertEqual(number_of_results, 20)
# single document result = j.submit_document(DOCS[0]) assert REFERENCE_SINGLE == result def test_suite(): ''' add support for calling Jeremia tests as part of a test suite ''' suite = unittest.TestSuite() suite.addTest(unittest.makeSuite(JeremiaTest, 'test')) return suite if __name__ == '__main__': if len(argv) > 1: txt = argv[1] docs = {'id': '192292', 'body': txt, 'title': '', 'format': 'text/html', 'header': {'test': 'testvalue'}} j = Jeremia() docs['body_annotation'] = [ {'start': 0, 'end': 3, 'key': 'test annotation'}] l = j.submit_document(docs) print(l) else: unittest.main()
''' add support for calling Jeremia tests as part of a test suite ''' suite = unittest.TestSuite() suite.addTest(unittest.makeSuite(JeremiaTest, 'test')) return suite if __name__ == '__main__': if len(argv) > 1: txt = argv[1] docs = { 'id': '192292', 'body': txt, 'title': '', 'format': 'text/html', 'header': { 'test': 'testvalue' } } j = Jeremia() docs['body_annotation'] = [{ 'start': 0, 'end': 3, 'key': 'test annotation' }] l = j.submit_document(docs) print(l) else: unittest.main()