def task_extract(message):
    """
    Extracts the full text from the given location, writes it to the local
    filesystem and pushes each result to the output task.

    :param message: a single payload or a list of payloads; a single payload
        is wrapped in a list before being passed to
        ``extraction.extract_content``
    :return: no return
    """
    logger.debug('Extract content: %s', message)
    if not isinstance(message, list):
        message = [message]

    results = extraction.extract_content(message, extract_pdf_script=app.conf['EXTRACT_PDF_SCRIPT'])
    logger.debug('Results: %s', results)

    for r in results:
        logger.debug("Calling 'write_content' with '%s'", str(r))
        # Write locally to filesystem
        writer.write_content(r)

        # Send results to master
        msg = {
            'bibcode': r['bibcode'],
            'body': r['fulltext'],
        }
        # Optional components are only forwarded when present and non-empty
        for x in ('acknowledgements', 'dataset'):
            if x in r and r[x]:
                msg[x] = r[x]

        # Dispatch exactly once per result, whether or not the fulltext is
        # empty, so the optional components still reach the master. An
        # additional dispatch gated on a non-empty fulltext would only send
        # the very same message a second time.
        logger.debug("Calling 'task_output_results' with '%s'", msg)
        task_output_results.delay(msg)
def task_extract(message):
    """
    Extracts the full text from the given location, writes it to the local
    filesystem and pushes each result to the output task.

    :param message: a single payload or a list of payloads; a single payload
        is wrapped in a list before being passed to
        ``extraction.extract_content``
    :return: no return
    """
    logger.debug('Extract content: %s', message)
    if not isinstance(message, list):
        message = [message]

    results = extraction.extract_content(
        message, extract_pdf_script=app.conf['EXTRACT_PDF_SCRIPT'])
    logger.debug('Results: %s', results)

    for r in results:
        logger.debug("Calling 'write_content' with '%s'", str(r))
        # Write locally to filesystem
        writer.write_content(r)

        # Send results to master
        msg = {
            'bibcode': r['bibcode'],
            'body': r['fulltext'],
        }
        # Optional components are only forwarded when present and non-empty
        for x in ('acknowledgements', 'dataset'):
            if x in r and r[x]:
                msg[x] = r[x]

        # One dispatch per result: sending unconditionally keeps the optional
        # components flowing to the master even when the fulltext is empty,
        # and avoids delivering the same message twice when it is not.
        logger.debug("Calling 'task_output_results' with '%s'", msg)
        task_output_results.delay(msg)
def task_extract(message):
    """
    Run content extraction for the given payload(s), persist every result
    locally and hand it over to the output task.

    :param message: one payload or a list of payloads; a lone payload is
        wrapped in a list first
    :return: no return
    """
    logger.debug('Extract content: %s', message)
    message = message if isinstance(message, list) else [message]

    pdf_script = app.conf['EXTRACT_PDF_SCRIPT']
    results = extraction.extract_content(message, extract_pdf_script=pdf_script)
    logger.debug('Results: %s', results)

    optional_fields = ('acknowledgements', 'dataset', 'facility')
    for result in results:
        logger.debug("Calling 'write_content' with '%s'", str(result))
        # Persist on the local filesystem before notifying anyone
        writer.write_content(result)

        # Payload for the master: mandatory fields first, then whichever
        # optional fields are present and non-empty
        payload = {'bibcode': result['bibcode'], 'body': result['fulltext']}
        payload.update({
            field: result[field]
            for field in optional_fields
            if field in result and result[field]
        })

        # Sent regardless of whether the fulltext is empty, so that the other
        # components (acks, etc) are still output/sent to master
        logger.debug("Calling 'task_output_results' with '%s'", payload)
        logger.info("Calling task_output_results...")
        task_output_results.delay(payload)

    # NOTE(review): the NER task receives the whole incoming batch, so it is
    # placed after the loop here - confirm against the intended layout.
    if app.conf['RUN_NER_FACILITIES_AFTER_EXTRACTION']:
        # perform named-entity recognition
        task_identify_facilities.delay(message)
def task_extract_grobid(message):
    """
    Extracts the structured full text from the given location

    :param message: a single payload or a list of payloads; each payload must
        carry a 'file_format' entry. The caller's payloads are not modified.
    :return: no return
    :raises KeyError: if a payload has no 'file_format' entry
    """
    logger.debug('Extract grobid content: %s', message)
    if not isinstance(message, list):
        message = [message]

    # Modify file format to force the use of GrobidPDFExtractor. This is done
    # on shallow copies: appending the suffix in place would alter the
    # caller's payloads, and running the task again on the same payload would
    # then produce a doubled "-grobid-grobid" suffix.
    message = [
        dict(msg, file_format=msg['file_format'] + "-grobid")
        for msg in message
    ]

    results = extraction.extract_content(message, grobid_service=app.conf['GROBID_SERVICE'])
    logger.debug('Grobid results: %s', results)

    for r in results:
        logger.debug("Calling 'write_content' with '%s'", str(r))
        # Write locally to filesystem
        writer.write_content(r)
def test_loads_the_content_correctly_and_makes_full_text_file(self):
    """
    Tests the write_content method. Checks that the full text file is
    created and saved to disk.

    :return: no return
    """
    # Only the on-disk side effect is verified; the return value is not used
    writer.write_content(self.dict_item)

    self.assertTrue(
        os.path.exists(self.full_text_file),
        msg='Full text file was not created: {0}'.format(self.full_text_file)
    )
def test_loads_the_content_correctly_and_makes_folders(self):
    """
    Tests the write_content method. Checks that the folder to contain the
    full text and meta data is created.

    :return: no return
    """
    # Only the on-disk side effect is verified; the return value is not used
    writer.write_content(self.dict_item)

    self.assertTrue(
        os.path.exists(self.bibcode_pair_tree),
        msg='Pair tree folder was not created: {0}'.format(self.bibcode_pair_tree)
    )