def test_that_file_should_be_updated_if_forced(self):
    """
    Checks the UPDATE flag produced by check_if_extract with and without
    a forced extraction.

    The same stub file (``self.test_file_exists``) is loaded twice, once
    with ``force_extract=True`` and once with ``force_extract=False``. Both
    payloads are passed through ``checker.check_if_extract`` and the UPDATE
    flag of the first PDF entry of each result is compared with the
    expected value.

    :return: no return
    """
    extract_path = self.app.conf['FULLTEXT_EXTRACT_PATH']

    forced_stream = utils.FileInputStream(self.test_file_exists)
    forced_stream.extract(force_extract=True)

    unforced_stream = utils.FileInputStream(self.test_file_exists)
    unforced_stream.extract(force_extract=False)

    payload_true = checker.check_if_extract(
        forced_stream.payload, extract_path
    )
    payload_false = checker.check_if_extract(
        unforced_stream.payload, extract_path
    )

    # Verify both PDF lists are populated before indexing into them, so an
    # empty result is reported as a test failure and not as an IndexError.
    self.assertTrue(len(payload_true['PDF']) != 0)
    self.assertTrue(len(payload_false['PDF']) != 0)

    first_doc_true = payload_true['PDF'][0]
    # Must be taken from payload_false; reading payload_true here would
    # mean the unforced case is never actually examined.
    first_doc_false = payload_false['PDF'][0]

    # assertEqual, not assertTrue: assertTrue(value, msg) treats the second
    # argument as the failure message and passes for any non-empty value.
    self.assertEqual(first_doc_true['UPDATE'], 'FORCE_TO_EXTRACT')
    # NOTE(review): 'DIFFERING_FULL_TEXT' is the value the test has always
    # named for the unforced case - confirm it matches the stub fixture.
    self.assertEqual(first_doc_false['UPDATE'], 'DIFFERING_FULL_TEXT')
def test_that_file_should_be_updated_if_forced(self):
    """
    Compares the UPDATE flag reported by check_if_extract for a forced and
    an unforced extraction of the same stub file.

    ``self.test_file_exists`` is read into two ``utils.FileInputStream``
    objects; one is extracted with ``force_extract=True`` and the other
    with ``force_extract=False``. Each payload is run through
    ``checker.check_if_extract`` and the first PDF entry of each result is
    inspected.

    :return: no return
    """
    FileInputStream_true = utils.FileInputStream(self.test_file_exists)
    FileInputStream_true.extract(force_extract=True)

    FileInputStream_false = utils.FileInputStream(self.test_file_exists)
    FileInputStream_false.extract(force_extract=False)

    payload_true = checker.check_if_extract(
        FileInputStream_true.payload,
        self.app.conf['FULLTEXT_EXTRACT_PATH'])
    payload_false = checker.check_if_extract(
        FileInputStream_false.payload,
        self.app.conf['FULLTEXT_EXTRACT_PATH'])

    # Each payload gets its own non-empty check, performed before the
    # list is indexed so that an empty list fails as an assertion.
    self.assertTrue(len(payload_true['PDF']) != 0)
    first_doc_true = payload_true['PDF'][0]

    self.assertTrue(len(payload_false['PDF']) != 0)
    # Read from payload_false so the unforced result is what gets checked.
    first_doc_false = payload_false['PDF'][0]

    # The second argument of assertTrue is only a failure message, so the
    # flag has to be compared with assertEqual to be checked at all.
    self.assertEqual(first_doc_true['UPDATE'], 'FORCE_TO_EXTRACT')
    # NOTE(review): the expected unforced flag is carried over unchanged
    # from the previous version of this test - verify against the fixture.
    self.assertEqual(first_doc_false['UPDATE'], 'DIFFERING_FULL_TEXT')
def test_output_dictionary_contains_everything_we_need(self):
    """
    Tests the check_if_extract function.

    Loads the single-document stub file, passes its payload through
    check_if_extract and verifies that the first 'Standard' entry carries
    exactly the expected set of keys and that its file format is 'txt'.

    :return: no return
    """
    stream = utils.FileInputStream(self.test_single_document)
    stream.extract()

    payload = checker.check_if_extract(
        stream.payload,
        self.app.conf['FULLTEXT_EXTRACT_PATH'])

    # Text type of the running interpreter: str on Python 3, unicode on 2.
    text_type = str if sys.version_info > (3, ) else unicode

    wanted_keys = sorted(
        text_type(name)
        for name in (
            'ft_source',
            'bibcode',
            'provider',
            'file_format',
            'UPDATE',
            'meta_path',
            'index_date',
        )
    )

    found_keys = sorted(payload['Standard'][0].keys())
    found_format = payload['Standard'][0]['file_format']

    self.assertListEqual(found_keys, wanted_keys)
    self.assertEqual(found_format, 'txt')
def test_output_dictionary_contains_everything_we_need(self):
    """
    Tests the check_if_extract function.

    Runs the function on a stub file that contains one document and then
    ensures that the first 'Standard' entry of the output holds exactly the
    expected meta data keys. It also checks that the file format associated
    with that entry is 'txt'.

    :return: no return
    """
    FileInputStream = utils.FileInputStream(self.test_single_document)
    FileInputStream.extract()

    payload = checker.check_if_extract(
        FileInputStream.payload, self.app.conf['FULLTEXT_EXTRACT_PATH']
    )

    # ``unicode`` only exists on Python 2; fall back to ``str`` so the test
    # also runs on Python 3 instead of dying with a NameError.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    expected_content = sorted(
        text_type(i) for i in [
            'ft_source', 'bibcode', 'provider', 'file_format', 'UPDATE',
            'meta_path', 'index_date',
        ]
    )

    # sorted() accepts both the Python 2 key list and the Python 3 key
    # view; calling .sort() on the view raises AttributeError on Python 3.
    actual_content = sorted(payload['Standard'][0].keys())
    actual_format = payload['Standard'][0]['file_format']

    self.assertListEqual(actual_content, expected_content)
    self.assertEqual(actual_format, 'txt')
def task_check_if_extract(message):
    """
    Checks if the file needs to be extracted and pushes to the correct
    extraction queue.

    ``message`` is wrapped in a list when it is not one already and handed,
    together with ``app.conf['FULLTEXT_EXTRACT_PATH']``, to
    ``checker.check_if_extract``. Every entry found under the 'PDF' and
    'Standard' keys of the result is sent to ``task_extract``; entries
    under 'PDF' are additionally sent to ``task_extract_grobid`` when
    ``app.conf['GROBID_SERVICE']`` is not None. Any other key is logged as
    an error and skipped.

    :param message: a single item, or a list of items, to be checked
    :return: no return
    """
    logger.debug('Checking content: %s', message)
    if not isinstance(message, list):
        message = [message]

    logger.debug(
        "Calling 'check_if_extract' with message '%s' and path '%s'",
        message, app.conf['FULLTEXT_EXTRACT_PATH'])

    results = checker.check_if_extract(
        message, app.conf['FULLTEXT_EXTRACT_PATH'])
    logger.debug('Results: %s', results)

    if not results:
        return

    for key in results:
        if key not in ('PDF', 'Standard'):
            # The two values are passed as separate arguments: a single
            # tuple would be one argument for two placeholders and the
            # record would fail to format instead of being logged.
            logger.error(
                'Unknown type: %s and message: %s', key, results[key])
            continue

        for msg in results[key]:
            logger.debug("Calling 'task_extract' with message '%s'", msg)
            task_extract.delay(msg)
            if app.conf['GROBID_SERVICE'] is not None and key == 'PDF':
                logger.debug(
                    "Calling 'task_extract_grobid' with message '%s'", msg)
                task_extract_grobid.delay(msg)
def test_file_should_be_extracted(self):
    """
    Tests the check_if_extract function.

    Counts the occurrences of 'pdf' in the stub file and treats the
    remaining non-empty lines as 'Standard files', e.g., XML, HTTP, HTML,
    etc. It then runs check_if_extract on the payload and ensures that the
    number of PDF entries carrying an expected UPDATE flag equals the PDF
    count, and that the number of Standard entries carrying an expected
    UPDATE flag equals the Standard count.

    :return: no return
    """
    FileInputStream = utils.FileInputStream(self.test_file)
    FileInputStream.extract()

    with open(self.test_file, 'r') as in_f:
        text = in_f.read()

    pdf_number = text.count('pdf')
    non_empty_lines = [line for line in text.split('\n') if line != '']
    standard_number = len(non_empty_lines) - pdf_number

    payload = checker.check_if_extract(
        FileInputStream.payload, self.app.conf['FULLTEXT_EXTRACT_PATH']
    )

    # One shared definition for both queues. The PDF check previously had
    # two adjacent literals with no comma (and a stray quote), which merged
    # them into one string that matched neither flag.
    expected_flags = (
        u'STALE_CONTENT',
        u'DIFFERING_FULL_TEXT',
        u'MISSING_FULL_TEXT',
        u'NOT_EXTRACTED_BEFORE',
    )

    # ``or []`` keeps the old behaviour of yielding an empty list when a
    # queue is empty or otherwise falsy.
    pdf_compare = [
        content for content in (payload['PDF'] or [])
        if content['UPDATE'] in expected_flags
    ]
    standard_compare = [
        content for content in (payload['Standard'] or [])
        if content['UPDATE'] in expected_flags
    ]

    # assertEqual reports both numbers on failure.
    self.assertEqual(len(pdf_compare), pdf_number)
    self.assertEqual(len(standard_compare), standard_number)
def test_file_should_be_extracted(self):
    """
    Tests the check_if_extract function.

    The number of 'pdf' matches in the stub file is taken as the expected
    size of the PDF queue; the remaining non-empty lines are taken as the
    expected size of the Standard queue (XML, HTTP, HTML, etc.). After
    running check_if_extract, the entries of each queue whose UPDATE flag
    is one of the expected values are counted and compared with those
    numbers.

    :return: no return
    """
    FileInputStream = utils.FileInputStream(self.test_file)
    FileInputStream.extract()

    with open(self.test_file, 'r') as in_f:
        text = in_f.read()

    pdf_number = len(re.findall('pdf', text))
    standard_number = \
        len([i for i in text.split('\n') if i != '']) - pdf_number

    payload = checker.check_if_extract(
        FileInputStream.payload,
        self.app.conf['FULLTEXT_EXTRACT_PATH'])

    pdf_payload = payload['PDF']
    standard_payload = payload['Standard']

    # Every flag is a separate element. Without the comma between the last
    # two literals Python joins them into a single string, so neither
    # 'MISSING_FULL_TEXT' nor 'NOT_EXTRACTED_BEFORE' would be accepted.
    accepted_flags = [
        u'STALE_CONTENT',
        u'DIFFERING_FULL_TEXT',
        u'MISSING_FULL_TEXT',
        u'NOT_EXTRACTED_BEFORE',
    ]

    if pdf_payload:
        pdf_compare = [
            content for content in pdf_payload
            if content['UPDATE'] in accepted_flags
        ]
    else:
        pdf_compare = []

    if standard_payload:
        standard_compare = [
            content for content in standard_payload
            if content['UPDATE'] in accepted_flags
        ]
    else:
        standard_compare = []

    # assertEqual shows the two differing counts when the check fails.
    self.assertEqual(len(pdf_compare), pdf_number)
    self.assertEqual(len(standard_compare), standard_number)
def test_that_no_payload_gets_sent_if_there_is_no_content(self):
    """
    Tests check_if_extract function.

    The single-document stub is expected to contain no PDF files, so the
    'PDF' list of the returned payload must be empty while the 'Standard'
    list must not be.

    :return: no return
    """
    stream = utils.FileInputStream(self.test_single_document)
    stream.extract()

    result = checker.check_if_extract(
        stream.payload,
        self.app.conf['FULLTEXT_EXTRACT_PATH'],
    )

    # Nothing may be queued for PDF extraction ...
    self.assertFalse(result['PDF'])
    # ... but the document itself must show up in the Standard queue.
    standard_count = len(result['Standard'])
    self.assertTrue(standard_count != 0)
def test_that_no_payload_gets_sent_if_there_is_no_content(self):
    """
    Tests check_if_extract function.

    Runs check_if_extract on the payload of the single-document stub, which
    is expected to reference no PDF files. The test asserts that the 'PDF'
    entry of the result is empty and that the 'Standard' entry has at least
    one element.

    :return: no return
    """
    input_stream = utils.FileInputStream(self.test_single_document)
    input_stream.extract()
    stub_payload = input_stream.payload

    checked = checker.check_if_extract(
        stub_payload, self.app.conf['FULLTEXT_EXTRACT_PATH'])

    pdf_docs = checked['PDF']
    self.assertFalse(pdf_docs)

    standard_docs = checked['Standard']
    self.assertTrue(len(standard_docs) != 0)
def task_check_if_extract(message):
    """
    Checks if the file needs to be extracted and pushes to the correct
    extraction queue.

    A non-list ``message`` is first wrapped in a list. The list is passed
    with ``app.conf['FULLTEXT_EXTRACT_PATH']`` to
    ``checker.check_if_extract``, and the result is walked key by key:
    entries under 'PDF' or 'Standard' are queued with
    ``task_extract.delay``; 'PDF' entries are also queued with
    ``task_extract_grobid.delay`` if ``app.conf['GROBID_SERVICE']`` is not
    None. Keys other than those two are reported through ``logger.error``.

    :param message: one item or a list of items to run the check on
    :return: no return
    """
    logger.debug('Checking content: %s', message)
    if not isinstance(message, list):
        message = [message]

    logger.debug("Calling 'check_if_extract' with message '%s' and path '%s'",
                 message, app.conf['FULLTEXT_EXTRACT_PATH'])

    results = checker.check_if_extract(message,
                                       app.conf['FULLTEXT_EXTRACT_PATH'])
    logger.debug('Results: %s', results)

    if results:
        for key in results:
            if key == 'PDF' or key == 'Standard':
                for msg in results[key]:
                    logger.debug("Calling 'task_extract' with message '%s'",
                                 msg)
                    task_extract.delay(msg)
                    if app.conf['GROBID_SERVICE'] is not None \
                            and key == 'PDF':
                        logger.debug("Calling 'task_extract_grobid' with "
                                     "message '%s'", msg)
                        task_extract_grobid.delay(msg)
            else:
                # Positional arguments, one per placeholder. Wrapping them
                # in a tuple supplies a single argument for two '%s' and
                # makes the log record fail to format.
                logger.error('Unknown type: %s and message: %s',
                             key, results[key])