def test_pipeline_extract_content_extracts_fulltext_correctly(self):
    """
    Tests the extract_content method. Checks that the full text written to
    disk matches the full text that we expect to be written to disk.

    N. B.
    Do not let the name extract_content portray anything. It is simply to
    keep the same naming convention as the other workers. extract_content
    is the main method the worker will run.

    :return: no return
    """
    self.dict_item['file_format'] = 'txt'
    pipeline_payload = [self.dict_item]

    return_payload = writer.extract_content(pipeline_payload)

    # Only truthiness is checked: the second positional argument of
    # assertTrue is the failure message, not an expected value.
    self.assertTrue(return_payload)

    # The full text file is looked up next to the meta file, under the
    # name 'fulltext.txt.gz'.
    fulltext_path = self.dict_item['meta_path'].replace(
        'meta.json', 'fulltext.txt.gz')
    fulltext_content = reader.read_file(fulltext_path, json_format=False)

    self.assertEqual(self.dict_item['fulltext'], fulltext_content)
def test_forced_extraction(self):
    """
    Tests that when a user specifies 'force_extract' that the full text is
    extracted regardless of its underlying reason for being or not being
    extracted.

    The check-if-extract task is run with the extraction task's ``delay``
    mocked; the payload captured by that mock is then passed directly to
    the extraction task, and the meta and full text files are checked on
    disk for every path in ``self.expected_paths``.

    :return: no return
    """
    sys.path.append(self.app.conf['PROJ_HOME'])
    from run import read_links_from_file

    # User loads the list of full text files and publishes them to the
    # first queue
    records = read_links_from_file(
        self.test_publish, force_extract=True, force_send=False)

    self.helper_get_details(self.test_publish)
    self.assertEqual(
        len(records.bibcode), self.nor,
        'The number of records should match'
        ' the number of lines. It does not: '
        '{0} [{1}]'.format(len(records.bibcode), self.nor))

    self.assertEqual(len(records.payload), 1)

    # Call the task to check if it should be extracted but mock the
    # extraction task
    with patch.object(tasks.task_extract, 'delay',
                      return_value=None) as task_extract:
        message = records.payload[0]
        tasks.task_check_if_extract(message)
        self.assertTrue(task_extract.called)

        # 'index_date' is deliberately not compared by value; only its
        # presence is asserted below.
        expected = {
            'UPDATE': 'FORCE_TO_EXTRACT',
            'bibcode': 'test4',
            'file_format': 'txt',
            'ft_source': '{}/tests/test_unit/stub_data/test.txt'.format(
                self.app.conf['PROJ_HOME']),
            'meta_path': '{}/tests/test_unit/stub_data/te/st/4/meta.json'.format(
                self.app.conf['PROJ_HOME']),
            'provider': 'TEST',
        }
        actual = task_extract.call_args[0][0]

        # Compare values as well as keys: set(expected).issubset(actual)
        # would only verify that the key names are present.
        for key, value in expected.items():
            self.assertIn(key, actual)
            self.assertEqual(
                actual[key], value,
                'Unexpected value for {0!r}: {1!r} != {2!r}'.format(
                    key, actual[key], value))
        self.assertIn('index_date', actual)

    # Now we do call the extraction task with the proper arguments; the
    # downstream tasks' ``delay`` methods are mocked for the duration.
    with patch.object(tasks.task_output_results, 'delay',
                      return_value=None) as task_output_results, \
            patch.object(tasks.task_identify_facilities, 'delay',
                         return_value=None):
        tasks.task_extract(actual)
        self.assertTrue(task_output_results.called)

    # After the extractor, the meta writer should write all the payloads to
    # disk in the correct folders
    for path in self.expected_paths:
        meta_path = os.path.join(path, 'meta.json')
        self.assertTrue(os.path.exists(meta_path),
                        'Meta file not created: {0}'.format(path))

        # A failed assertion above aborts the test, so the file exists here.
        with open(meta_path, 'r') as meta_file:
            meta_content = meta_file.read()
        self.assertIn(
            'FORCE_TO_EXTRACT', meta_content,
            'meta file does not contain the right extract keyword: {0}'
            .format(meta_content))

        fulltext_path = os.path.join(path, 'fulltext.txt.gz')
        self.assertTrue(os.path.exists(fulltext_path),
                        'Full text file not created: {0}'.format(path))

        fulltext_content = reader.read_file(fulltext_path, json_format=False)
        self.assertEqual(fulltext_content,
                         "Introduction THIS IS AN INTERESTING TITLE")
def test_extra_acknowledment(self):
    """
    Submits a record whose bibcode should result in an acknowledgements
    file being created, and checks that the meta, full text and
    acknowledgements files are written with the expected content.

    The check-if-extract task is run with the extraction task's ``delay``
    mocked; the payload captured by that mock is then passed directly to
    the extraction task. Removal of the generated files is not performed
    in this method (presumably handled by the test fixture's teardown).

    :return: no return
    """
    sys.path.append(self.app.conf['PROJ_HOME'])
    from run import read_links_from_file

    # User loads the list of full text files and publishes them to the
    # first queue
    records = read_links_from_file(
        self.test_publish, force_extract=False, force_send=False)

    self.helper_get_details(self.test_publish)
    self.assertEqual(
        len(records.bibcode), self.nor,
        'The number of records should match'
        ' the number of lines. It does not: '
        '{0} [{1}]'.format(len(records.bibcode), self.nor))

    self.assertEqual(len(records.payload), 1)

    # Call the task to check if it should be extracted but mock the
    # extraction task
    with patch.object(tasks.task_extract, 'delay',
                      return_value=None) as task_extract:
        message = records.payload[0]
        tasks.task_check_if_extract(message)
        self.assertTrue(task_extract.called)

        # 'index_date' is deliberately not compared by value; only its
        # presence is asserted below.
        expected = {
            'UPDATE': 'NOT_EXTRACTED_BEFORE',
            'bibcode': 'test1',
            'file_format': 'xml',
            'ft_source': '{}/tests/test_integration/stub_data/full_test_elsevier.xml'.format(
                self.app.conf['PROJ_HOME']),
            'meta_path': '{}/tests/test_unit/stub_data/te/st/1/meta.json'.format(
                self.app.conf['PROJ_HOME']),
            'provider': 'Elsevier',
        }
        actual = task_extract.call_args[0][0]

        # Compare values as well as keys: set(expected).issubset(actual)
        # would only verify that the key names are present.
        for key, value in expected.items():
            self.assertIn(key, actual)
            self.assertEqual(
                actual[key], value,
                'Unexpected value for {0!r}: {1!r} != {2!r}'.format(
                    key, actual[key], value))
        self.assertIn('index_date', actual)

    # Now we do call the extraction task with the proper arguments; the
    # downstream tasks' ``delay`` methods are mocked for the duration.
    with patch.object(tasks.task_output_results, 'delay',
                      return_value=None) as task_output_results, \
            patch.object(tasks.task_identify_facilities, 'delay',
                         return_value=None):
        tasks.task_extract(actual)
        self.assertTrue(task_output_results.called)

    # After the extractor, the meta writer should write all the payloads to
    # disk in the correct folders
    for path in self.expected_paths:
        meta_path = os.path.join(path, 'meta.json')
        self.assertTrue(os.path.exists(meta_path),
                        'Meta file not created: {0}'.format(path))

        # A failed assertion above aborts the test, so the file exists here.
        with open(meta_path, 'r') as meta_file:
            meta_content = meta_file.read()
        self.assertIn(
            'NOT_EXTRACTED_BEFORE', meta_content,
            'meta file does not contain the right extract keyword: {0}'
            .format(meta_content))

        fulltext_path = os.path.join(path, 'fulltext.txt.gz')
        self.assertTrue(os.path.exists(fulltext_path),
                        'Full text file not created: {0}'.format(path))

        # unless changed, tests/test_integration/stub_data/full_test_elsevier.xml
        fulltext_content = reader.read_file(fulltext_path, json_format=False)
        self.assertEqual(
            fulltext_content,
            '1 Introduction JOURNAL CONTENT Acknowledgments THANK YOU Appendix A APPENDIX TITLE APPENDIX'
        )

        acknowledgments_path = os.path.join(path, 'acknowledgements.txt.gz')
        self.assertTrue(
            os.path.exists(acknowledgments_path),
            'Acknowledgements file not created: {0}'.format(path))

        acknowledgements_content = reader.read_file(
            acknowledgments_path, json_format=False)
        self.assertEqual(acknowledgements_content,
                         "Acknowledgments THANK YOU")
def test_extraction_wrong_fulltext_filename(self):
    """
    Processes a record whose bibcode already has a meta file on disk that
    names a different full text source ('wrong_source') than the one now
    supplied, and checks that the record is flagged 'DIFFERING_FULL_TEXT'
    and extracted again.

    The stale meta file is written by this test before the check-if-extract
    task runs. Removal of the generated files is not performed in this
    method (presumably handled by the test fixture's teardown).

    :return: no return
    """
    sys.path.append(self.app.conf['PROJ_HOME'])
    from run import read_links_from_file

    # User loads the list of full text files and publishes them to the
    # first queue
    records = read_links_from_file(
        self.test_publish, force_extract=False, force_send=False)

    self.helper_get_details(self.test_publish)
    self.assertEqual(
        len(records.bibcode), self.nor,
        'The number of records should match'
        ' the number of lines. It does not: '
        '{0} [{1}]'.format(len(records.bibcode), self.nor))

    self.assertEqual(len(records.payload), 1)

    # Make the fake data to use: a pre-existing meta file whose
    # 'ft_source' differs from the source of the incoming record.
    if not os.path.exists(self.meta_path):
        os.makedirs(self.meta_path)

    test_meta_content = {
        'index_date': datetime.utcnow().isoformat() + 'Z',
        'bibcode': 'test4',
        'provider': 'mnras',
        'ft_source': 'wrong_source',
    }

    with open(self.test_expected, 'w') as test_meta_file:
        json.dump(test_meta_content, test_meta_file)

    # Call the task to check if it should be extracted but mock the
    # extraction task
    with patch.object(tasks.task_extract, 'delay',
                      return_value=None) as task_extract:
        message = records.payload[0]
        tasks.task_check_if_extract(message)
        self.assertTrue(task_extract.called)

        # 'index_date' is deliberately not compared by value; only its
        # presence is asserted below.
        expected = {
            'UPDATE': 'DIFFERING_FULL_TEXT',
            'bibcode': 'test4',
            'file_format': 'txt',
            'ft_source': '{}/tests/test_unit/stub_data/test.txt'.format(
                self.app.conf['PROJ_HOME']),
            'meta_path': '{}/tests/test_unit/stub_data/te/st/4/meta.json'.format(
                self.app.conf['PROJ_HOME']),
            'provider': 'TEST',
        }
        actual = task_extract.call_args[0][0]

        # Compare values as well as keys: set(expected).issubset(actual)
        # would only verify that the key names are present.
        for key, value in expected.items():
            self.assertIn(key, actual)
            self.assertEqual(
                actual[key], value,
                'Unexpected value for {0!r}: {1!r} != {2!r}'.format(
                    key, actual[key], value))
        self.assertIn('index_date', actual)

    # Now we do call the extraction task with the proper arguments; the
    # downstream tasks' ``delay`` methods are mocked for the duration.
    with patch.object(tasks.task_output_results, 'delay',
                      return_value=None) as task_output_results, \
            patch.object(tasks.task_identify_facilities, 'delay',
                         return_value=None):
        tasks.task_extract(actual)
        self.assertTrue(task_output_results.called)

    # After the extractor, the meta writer should write all the payloads to
    # disk in the correct folders
    for path in self.expected_paths:
        meta_path = os.path.join(path, 'meta.json')
        self.assertTrue(os.path.exists(meta_path),
                        'Meta file not created: {0}'.format(path))

        # A failed assertion above aborts the test, so the file exists here.
        with open(meta_path, 'r') as meta_file:
            meta_content = meta_file.read()
        self.assertIn(
            'DIFFERING_FULL_TEXT', meta_content,
            'meta file does not contain the right extract keyword: {0}'
            .format(meta_content))

        fulltext_path = os.path.join(path, 'fulltext.txt.gz')
        self.assertTrue(os.path.exists(fulltext_path),
                        'Full text file not created: {0}'.format(path))

        fulltext_content = reader.read_file(fulltext_path, json_format=False)
        self.assertEqual(fulltext_content,
                         "Introduction THIS IS AN INTERESTING TITLE")
def test_full_range_of_file_format_extraction(self):
    """
    Processes a links file containing all the relevant document types,
    runs the check-if-extract and extraction tasks for each record, and
    checks that the meta file and full text were written for every path in
    ``self.expected_paths``.

    When ``self.grobid_service`` is set, the service URL is stubbed with
    httpretty and any 'grobid_fulltext.xml' found on disk is compared with
    the stubbed response. Removal of the generated files is not performed
    in this method (presumably handled by the test fixture's teardown).

    :return: no return
    """
    sys.path.append(self.app.conf['PROJ_HOME'])
    from run import read_links_from_file

    # Bound unconditionally: it is read below whenever a grobid output file
    # exists, even if no grobid service was configured for this run.
    expected_grobid_fulltext = "<hello/>"
    if self.grobid_service is not None:
        # NOTE(review): httpretty is enabled here but not disabled in this
        # method -- confirm the fixture's teardown disables it.
        httpretty.enable()
        httpretty.register_uri(httpretty.POST, self.grobid_service,
                               body=expected_grobid_fulltext,
                               status=200)

    # User loads the list of full text files and publishes them to the
    # first queue
    records = read_links_from_file(
        self.test_publish, force_extract=False, force_send=False)

    self.helper_get_details(self.test_publish)
    self.assertEqual(
        len(records.bibcode), self.nor,
        'The number of records should match'
        ' the number of lines. It does not: '
        '{0} [{1}]'.format(len(records.bibcode), self.nor))

    self.assertEqual(len(records.payload), 6)

    # Make sure the directory for the meta data exists
    if not os.path.exists(self.meta_path):
        os.makedirs(self.meta_path)

    # Call the task to check if it should be extracted but mock the
    # extraction task
    extraction_arguments_set = []
    expected_update = 'NOT_EXTRACTED_BEFORE'
    with patch.object(tasks.task_extract, 'delay',
                      return_value=None) as task_extract:
        for message in records.payload:
            tasks.task_check_if_extract(message)
            self.assertTrue(task_extract.called)
            # Payload of the most recent call to the mocked ``delay``.
            actual = task_extract.call_args[0][0]
            self.assertEqual(
                actual['UPDATE'], expected_update,
                'This should be {0}, but is in fact: {1}'
                .format(expected_update, actual['UPDATE']))
            extraction_arguments_set.append(actual)

    # Now we do call the extraction task with the proper arguments; the
    # downstream tasks' ``delay`` methods are mocked for the duration.
    with patch.object(tasks.task_output_results, 'delay',
                      return_value=None) as task_output_results, \
            patch.object(tasks.task_identify_facilities, 'delay',
                         return_value=None):
        for arguments in extraction_arguments_set:
            tasks.task_extract(arguments)
            self.assertTrue(task_output_results.called)

    # Expected full text, indexed in the same order as self.expected_paths.
    expected_fulltext_content = (
        u"Introduction THIS IS AN INTERESTING TITLE",
        u"Introduction THIS IS AN INTERESTING TITLE",
        u"I. INTRODUCTION INTRODUCTION GOES HERE Manual Entry TABLE I. TEXT a NOTES a TEXT\nAPPENDIX: APPENDIX TITLE GOES HERE APPENDIX CONTENT",
        u'1 Introduction JOURNAL CONTENT Acknowledgments THANK YOU Appendix A APPENDIX TITLE APPENDIX',
        u"No Title AA 999, 999-999 (1999) DOI: 99.9999/9999-9999:99999999 TITLE AUTHOR AFFILIATION Received 99 MONTH 1999 / Accepted 99 MONTH 1999 Abstract ABSTRACT Key words: KEYWORD INTRODUCTION SECTION Table 1: TABLE TABLE (1) COPYRIGHT",
        #u"Introduction\nTHIS IS AN INTERESTING TITLE\n", # PDFBox
        u"Introduction THIS IS AN INTERESTING TITLE",  # pdftotext
    )

    # After the extractor, the meta writer should write all the payloads to
    # disk in the correct folders
    for i, path in enumerate(self.expected_paths):
        meta_path = os.path.join(path, 'meta.json')
        self.assertTrue(
            os.path.exists(meta_path),
            'Meta file not created: {0}'.format(path)
        )

        # A failed assertion above aborts the test, so the file exists here.
        with open(meta_path, 'r') as meta_file:
            meta_content = meta_file.read()
        self.assertIn(
            'NOT_EXTRACTED_BEFORE', meta_content,
            'meta file does not contain the right extract keyword: {0}'
            .format(meta_content)
        )

        fulltext_path = os.path.join(path, 'fulltext.txt.gz')
        self.assertTrue(
            os.path.exists(fulltext_path),
            'Full text file not created: {0}'.format(path)
        )

        fulltext_content = reader.read_file(fulltext_path, json_format=False)
        self.assertEqual(fulltext_content, expected_fulltext_content[i])

        # The grobid output is optional: it is only compared when present.
        grobid_fulltext_path = os.path.join(path, 'grobid_fulltext.xml')
        if os.path.exists(grobid_fulltext_path):
            with open(grobid_fulltext_path, 'r') as grobid_fulltext_file:
                grobid_fulltext_content = grobid_fulltext_file.read()
            self.assertEqual(grobid_fulltext_content,
                             expected_grobid_fulltext)