def task_extract(message):
    """
    Extracts the full text for the given record(s), writes every result to
    the filesystem and queues it for output to master.

    :param message: a single record or a list of records; anything that is
        not a list is wrapped in a one-element list before extraction
    :return: no return
    """
    logger.debug('Extract content: %s', message)
    if not isinstance(message, list):
        message = [message]

    results = extraction.extract_content(
        message, extract_pdf_script=app.conf['EXTRACT_PDF_SCRIPT'])
    logger.debug('Results: %s', results)

    for r in results:
        logger.debug("Calling 'write_content' with '%s'", str(r))
        # Write locally to filesystem
        writer.write_content(r)

        # Build the message for master; optional fields are only included
        # when they are present and non-empty
        msg = {
            'bibcode': r['bibcode'],
            'body': r['fulltext'],
        }
        for x in ('acknowledgements', 'dataset'):
            if x in r and r[x]:
                msg[x] = r[x]

        # Queue the message exactly once per result. The send is not
        # conditioned on the fulltext being non-empty, so results that only
        # carry the optional fields are still delivered; the previous second,
        # conditional send queued every non-empty result twice.
        logger.debug("Calling 'task_output_results' with '%s'", msg)
        task_output_results.delay(msg)
def task_extract(message):
    """
    Extracts the full text from the given location, stores each result on
    the filesystem and sends it on via 'task_output_results'.

    :param message: one record, or a list of records, to extract; a non-list
        value is turned into a single-item list
    :return: no return
    """
    logger.debug('Extract content: %s', message)
    if not isinstance(message, list):
        message = [message]

    results = extraction.extract_content(
        message, extract_pdf_script=app.conf['EXTRACT_PDF_SCRIPT'])
    logger.debug('Results: %s', results)

    for r in results:
        logger.debug("Calling 'write_content' with '%s'", str(r))
        # Write locally to filesystem
        writer.write_content(r)

        # Send results to master: mandatory fields first, then whichever
        # optional fields are present with a non-empty value
        msg = {
            'bibcode': r['bibcode'],
            'body': r['fulltext'],
        }
        for x in ('acknowledgements', 'dataset'):
            if x in r and r[x]:
                msg[x] = r[x]

        # One delivery per result, regardless of whether the fulltext is
        # empty. A second send guarded by "fulltext != ''" used to follow
        # this one, which duplicated the message for every non-empty result.
        logger.debug("Calling 'task_output_results' with '%s'", msg)
        task_output_results.delay(msg)
def task_extract(message):
    """
    Runs the full text extraction on the incoming record(s), persists every
    result and hands it over to 'task_output_results'.

    Once all results have been handled, the facility identification task is
    queued with the (list-wrapped) input when the
    RUN_NER_FACILITIES_AFTER_EXTRACTION setting is truthy.

    :param message: a single record or a list of records
    :return: no return
    """
    logger.debug('Extract content: %s', message)
    message = message if isinstance(message, list) else [message]

    extracted = extraction.extract_content(
        message,
        extract_pdf_script=app.conf['EXTRACT_PDF_SCRIPT'],
    )
    logger.debug('Results: %s', extracted)

    optional_fields = ('acknowledgements', 'dataset', 'facility')
    for record in extracted:
        logger.debug("Calling 'write_content' with '%s'", str(record))
        # Keep a local copy on the filesystem
        writer.write_content(record)

        # Payload for master: mandatory fields plus any optional field that
        # is present and non-empty
        payload = {'bibcode': record['bibcode'], 'body': record['fulltext']}
        payload.update(
            (field, record[field])
            for field in optional_fields
            if field in record and record[field]
        )

        # Deliberately sent even when the fulltext is empty, so that the
        # other components (acks, etc) still reach master
        logger.debug("Calling 'task_output_results' with '%s'", payload)
        logger.info("Calling task_output_results...")
        task_output_results.delay(payload)

    # Optional named-entity recognition over the same input
    if app.conf['RUN_NER_FACILITIES_AFTER_EXTRACTION']:
        task_identify_facilities.delay(message)
def test_that_we_can_extract_all_content_from_payload_input(self):
    """
    Tests the extract_content method against the stub record held in
    self.dict_item: every key listed under rules.META_CONTENT['teixml'] has
    to be present in the first extracted result.

    :return: no return
    """
    extracted = extraction.extract_content([self.dict_item])

    expected_keys = set(rules.META_CONTENT['teixml'].keys())
    found_keys = extracted[0].keys()

    self.assertTrue(expected_keys.issubset(found_keys))
def test_that_we_can_extract_all_content_from_payload_input(self):
    """
    Runs extract_content on a one-item payload built from self.dict_item and
    verifies that the keys of rules.META_CONTENT['teixml'] are a subset of
    the keys of the first result.

    :return: no return
    """
    single_item_payload = [self.dict_item]
    results = extraction.extract_content(single_item_payload)

    wanted = set(rules.META_CONTENT['teixml'].keys())
    all_present = wanted.issubset(results[0].keys())

    self.assertTrue(all_present)
def test_that_we_can_extract_all_content_from_payload_input(self):
    """
    Tests the extract_content method. This checks that every key listed
    under rules.META_CONTENT['xml'] is present in the content extracted
    from the stub record in self.dict_item.

    :return: no return
    """
    pay_load = [self.dict_item]

    content = extraction.extract_content(pay_load)

    # Report the keys that are missing instead of a bare "False is not
    # true", so a failure points straight at the offending field
    expected_keys = set(rules.META_CONTENT['xml'].keys())
    missing_keys = expected_keys.difference(content[0].keys())
    self.assertFalse(
        missing_keys,
        'Keys missing from extracted content: {0}'.format(missing_keys))
def test_multi_file(self):
    """
    Some entries in fulltext/all.links specify multiple files; typically
    the first has the text from the article while the rest have the text
    from tables.

    NOTE(review): self.test_multi_file is read as the 'ft_source' value, so
    presumably the fixture setup assigns an instance attribute with the same
    name as this method - confirm.

    :return: no return
    """
    self.dict_item = {
        'ft_source': self.test_multi_file,
        'file_format': 'xml',
        'provider': 'MNRAS',
        'bibcode': 'test',
    }

    extracted = extraction.extract_content([self.dict_item])
    fulltext = extracted[0]['fulltext']

    # the marker text has to appear twice in the combined fulltext
    self.assertEqual(2, fulltext.count('Entry 1'))
def test_that_we_can_extract_all_content_from_payload_input(self):
    """
    Tests the extract_content method: the content extracted from the stub
    record in self.dict_item must contain every key listed under
    rules.META_CONTENT['xml'].

    :return: no return
    """
    pay_load = [self.dict_item]

    content = extraction.extract_content(pay_load)

    # Compute the missing keys explicitly so that a failing run names them
    required_keys = set(rules.META_CONTENT['xml'].keys())
    missing_keys = required_keys.difference(content[0].keys())
    self.assertFalse(
        missing_keys,
        'Keys missing from extracted content: {0}'.format(missing_keys))
def task_extract_grobid(message):
    """
    Extracts the structured full text from the given location and writes
    each result to the filesystem.

    :param message: a single record or a list of records; every record must
        provide a 'file_format' entry (a missing one raises KeyError)
    :return: no return
    """
    logger.debug('Extract grobid content: %s', message)
    if not isinstance(message, list):
        message = [message]

    # Modify the file format to force the use of GrobidPDFExtractor. The
    # suffix is applied to copies of the records: appending in place altered
    # the caller's dicts, so running the task again on the same objects
    # stacked the suffix ("-grobid-grobid").
    grobid_message = [
        dict(msg, file_format=msg['file_format'] + "-grobid")
        for msg in message
    ]

    results = extraction.extract_content(
        grobid_message, grobid_service=app.conf['GROBID_SERVICE'])
    logger.debug('Grobid results: %s', results)

    for r in results:
        logger.debug("Calling 'write_content' with '%s'", str(r))
        # Write locally to filesystem
        writer.write_content(r)
def test_multi_file(self):
    """
    Covers entries in fulltext/all.links that specify multiple files:
    typically the first one has the text from the article while the rest
    have the text from tables.

    :return: no return
    """
    multi_file_item = {
        'ft_source': self.test_multi_file,
        'file_format': 'xml',
        'provider': 'MNRAS',
        'bibcode': 'test'
    }
    self.dict_item = multi_file_item

    content = extraction.extract_content([self.dict_item])

    # does the fulltext contain two copies of the file's contents
    occurrences = content[0]['fulltext'].count('Entry 1')
    self.assertEqual(2, occurrences)