Пример #1
0
    def test_pipeline_extract_content_extracts_fulltext_correctly(self):
        """
        Tests the extract_content method. Checks that the full text written to
        disk matches the full text that we expect to be written to disk.

        N. B.
        Do not let the name extract_content portray anything. It is simply to
        keep the same naming convention as the other workers. extract_content
        is the main method the worker will run.

        :return: no return
        """

        self.dict_item['file_format'] = 'txt'
        pipeline_payload = [self.dict_item]

        return_payload = writer.extract_content(pipeline_payload)

        # The second positional argument of assertTrue is the failure message,
        # not an expected value, so only the truthiness of the payload is
        # (and always was) checked here.
        self.assertTrue(return_payload)

        # The full text file sits next to the meta file on disk.
        fulltext_path = self.dict_item['meta_path'].replace(
            'meta.json', 'fulltext.txt.gz')
        fulltext_content = reader.read_file(fulltext_path, json_format=False)

        self.assertEqual(self.dict_item['fulltext'], fulltext_content)
Пример #2
0
    def test_forced_extraction(self):
        """
        Tests that when a user specifies 'force_extract' that the full text
        is extracted regardless of its underlying reason for being or not
        being extracted.

        The check task is run with the extraction task mocked, the captured
        message is then fed to the real extraction task (with its downstream
        tasks mocked), and finally the files written to disk are verified.

        :return: no return
        """
        sys.path.append(self.app.conf['PROJ_HOME'])
        from run import read_links_from_file

        # User loads the list of full text files and publishes them to the
        # first queue
        records = read_links_from_file(self.test_publish,
                                       force_extract=True,
                                       force_send=False)

        self.helper_get_details(self.test_publish)
        self.assertEqual(
            len(records.bibcode), self.nor,
            'The number of records should match'
            ' the number of lines. It does not: '
            '{0} [{1}]'.format(len(records.bibcode), self.nor))

        self.assertEqual(len(records.payload), 1)

        # Call the task to check if it should be extracted but mock the
        # extraction task
        with patch.object(tasks.task_extract, 'delay',
                          return_value=None) as task_extract:
            message = records.payload[0]
            tasks.task_check_if_extract(message)
            self.assertTrue(task_extract.called)
            # 'index_date' is not pinned here; only its presence is asserted
            # below.
            expected = {
                'UPDATE':
                'FORCE_TO_EXTRACT',
                'bibcode':
                'test4',
                'file_format':
                'txt',
                'ft_source':
                '{}/tests/test_unit/stub_data/test.txt'.format(
                    self.app.conf['PROJ_HOME']),
                'meta_path':
                '{}/tests/test_unit/stub_data/te/st/4/meta.json'.format(
                    self.app.conf['PROJ_HOME']),
                'provider':
                'TEST'
            }
            actual = task_extract.call_args[0][0]
            # NOTE(review): set(expected) holds only the dictionary keys, so
            # this verifies that every expected key is present in the message;
            # the values are not compared.
            self.assertTrue(set(expected).issubset(actual))
            self.assertIn('index_date', actual)

        with patch.object(tasks.task_output_results,
                          'delay',
                          return_value=None) as task_output_results:
            with patch.object(tasks.task_identify_facilities,
                              'delay',
                              return_value=None):
                # Now we do call the extraction task with the proper arguments
                tasks.task_extract(actual)
                self.assertTrue(task_output_results.called)

        # After the extractor, the meta writer should write all the payloads to
        # disk in the correct folders
        for path in self.expected_paths:
            meta_path = os.path.join(path, 'meta.json')
            self.assertTrue(os.path.exists(meta_path),
                            'Meta file not created: {0}'.format(path))

            # The assertion above aborts the test when the file is missing, so
            # it can be read unconditionally.
            with open(meta_path, 'r') as meta_file:
                meta_content = meta_file.read()
            self.assertTrue(
                'FORCE_TO_EXTRACT' in meta_content,
                'meta file does not contain the right extract keyword: {0}'
                .format(meta_content))

            fulltext_path = os.path.join(path, 'fulltext.txt.gz')
            self.assertTrue(os.path.exists(fulltext_path),
                            'Full text file not created: {0}'.format(path))

            fulltext_content = reader.read_file(fulltext_path,
                                                json_format=False)
            self.assertEqual(fulltext_content,
                             "Introduction THIS IS AN INTERESTING TITLE")
    def test_extra_acknowledment(self):
        """
        Loads a record whose bibcode should result in an acknowledgements
        file being created. Runs the check task (with the extraction task
        mocked) and then the extraction task itself (with its downstream
        tasks mocked), and verifies that the meta, full text and
        acknowledgements files are written to disk with the expected content.

        :return: no return
        """
        sys.path.append(self.app.conf['PROJ_HOME'])
        from run import read_links_from_file

        # User loads the list of full text files and publishes them to the
        # first queue
        records = read_links_from_file(self.test_publish,
                                       force_extract=False,
                                       force_send=False)

        self.helper_get_details(self.test_publish)
        self.assertEqual(
            len(records.bibcode), self.nor,
            'The number of records should match'
            ' the number of lines. It does not: '
            '{0} [{1}]'.format(len(records.bibcode), self.nor))

        self.assertEqual(len(records.payload), 1)

        # Call the task to check if it should be extracted but mock the
        # extraction task
        with patch.object(tasks.task_extract, 'delay',
                          return_value=None) as task_extract:
            message = records.payload[0]
            tasks.task_check_if_extract(message)
            self.assertTrue(task_extract.called)
            # 'index_date' is not pinned here; only its presence is asserted
            # below.
            expected = {
                'UPDATE':
                'NOT_EXTRACTED_BEFORE',
                'bibcode':
                'test1',
                'file_format':
                'xml',
                'ft_source':
                '{}/tests/test_integration/stub_data/full_test_elsevier.xml'.
                format(self.app.conf['PROJ_HOME']),
                'meta_path':
                '{}/tests/test_unit/stub_data/te/st/1/meta.json'.format(
                    self.app.conf['PROJ_HOME']),
                'provider':
                'Elsevier'
            }
            actual = task_extract.call_args[0][0]
            # NOTE(review): set(expected) holds only the dictionary keys, so
            # this verifies that every expected key is present in the message;
            # the values are not compared.
            self.assertTrue(set(expected).issubset(actual))
            self.assertIn('index_date', actual)

        with patch.object(tasks.task_output_results,
                          'delay',
                          return_value=None) as task_output_results:
            with patch.object(tasks.task_identify_facilities,
                              'delay',
                              return_value=None):
                # Now we do call the extraction task with the proper arguments
                tasks.task_extract(actual)
                self.assertTrue(task_output_results.called)

        # After the extractor, the meta writer should write all the payloads to
        # disk in the correct folders
        for path in self.expected_paths:
            meta_path = os.path.join(path, 'meta.json')
            self.assertTrue(os.path.exists(meta_path),
                            'Meta file not created: {0}'.format(path))

            # Each existence assertion aborts the test when the file is
            # missing, so the files can be read unconditionally afterwards.
            with open(meta_path, 'r') as meta_file:
                meta_content = meta_file.read()
            self.assertTrue(
                'NOT_EXTRACTED_BEFORE' in meta_content,
                'meta file does not contain the right extract keyword: {0}'
                .format(meta_content))

            fulltext_path = os.path.join(path, 'fulltext.txt.gz')
            self.assertTrue(os.path.exists(fulltext_path),
                            'Full text file not created: {0}'.format(path))

            # Expected text corresponds to
            # tests/test_integration/stub_data/full_test_elsevier.xml; update
            # it if that stub file changes.
            fulltext_content = reader.read_file(fulltext_path,
                                                json_format=False)
            self.assertEqual(
                fulltext_content,
                '1 Introduction JOURNAL CONTENT Acknowledgments THANK YOU Appendix A APPENDIX TITLE APPENDIX'
            )

            acknowledgments_path = os.path.join(path,
                                                'acknowledgements.txt.gz')
            self.assertTrue(
                os.path.exists(acknowledgments_path),
                'Acknowledgements file not created: {0}'.format(path))

            acknowledgements_content = reader.read_file(
                acknowledgments_path, json_format=False)
            self.assertEqual(acknowledgements_content,
                             "Acknowledgments THANK YOU")
Пример #4
0
    def test_extraction_wrong_fulltext_filename(self):
        """
        Processes a record whose full text source path differs from the one
        stored in a previously written meta file. A fake meta file with a
        different 'ft_source' is written first; the check task must then
        flag the record as DIFFERING_FULL_TEXT and the extraction must write
        the meta and full text files to disk.

        :return: no return
        """
        sys.path.append(self.app.conf['PROJ_HOME'])
        from run import read_links_from_file

        # User loads the list of full text files and publishes them to the
        # first queue
        records = read_links_from_file(self.test_publish,
                                       force_extract=False,
                                       force_send=False)

        self.helper_get_details(self.test_publish)
        self.assertEqual(
            len(records.bibcode), self.nor,
            'The number of records should match'
            ' the number of lines. It does not: '
            '{0} [{1}]'.format(len(records.bibcode), self.nor))

        self.assertEqual(len(records.payload), 1)

        # Make the fake data to use: a pre-existing meta file whose
        # 'ft_source' does not match the source of the record under test
        if not os.path.exists(self.meta_path):
            os.makedirs(self.meta_path)

        test_meta_content = {
            'index_date': datetime.utcnow().isoformat() + 'Z',
            'bibcode': 'test4',
            'provider': 'mnras',
            'ft_source': 'wrong_source'
        }
        with open(self.test_expected, 'w') as test_meta_file:
            json.dump(test_meta_content, test_meta_file)

        # Call the task to check if it should be extracted but mock the
        # extraction task
        with patch.object(tasks.task_extract, 'delay',
                          return_value=None) as task_extract:
            message = records.payload[0]
            tasks.task_check_if_extract(message)
            self.assertTrue(task_extract.called)
            # 'index_date' is not pinned here; only its presence is asserted
            # below.
            expected = {
                'UPDATE':
                'DIFFERING_FULL_TEXT',
                'bibcode':
                'test4',
                'file_format':
                'txt',
                'ft_source':
                '{}/tests/test_unit/stub_data/test.txt'.format(
                    self.app.conf['PROJ_HOME']),
                'meta_path':
                '{}/tests/test_unit/stub_data/te/st/4/meta.json'.format(
                    self.app.conf['PROJ_HOME']),
                'provider':
                'TEST'
            }
            actual = task_extract.call_args[0][0]
            # NOTE(review): set(expected) holds only the dictionary keys, so
            # this verifies that every expected key is present in the message;
            # the values are not compared.
            self.assertTrue(set(expected).issubset(actual))
            self.assertIn('index_date', actual)

        with patch.object(tasks.task_output_results,
                          'delay',
                          return_value=None) as task_output_results:
            with patch.object(tasks.task_identify_facilities,
                              'delay',
                              return_value=None):
                # Now we do call the extraction task with the proper arguments
                tasks.task_extract(actual)
                self.assertTrue(task_output_results.called)

        # After the extractor, the meta writer should write all the payloads to
        # disk in the correct folders
        for path in self.expected_paths:
            meta_path = os.path.join(path, 'meta.json')
            self.assertTrue(os.path.exists(meta_path),
                            'Meta file not created: {0}'.format(path))

            # The assertion above aborts the test when the file is missing, so
            # it can be read unconditionally.
            with open(meta_path, 'r') as meta_file:
                meta_content = meta_file.read()
            self.assertTrue(
                'DIFFERING_FULL_TEXT' in meta_content,
                'meta file does not contain the right extract keyword: {0}'
                .format(meta_content))

            fulltext_path = os.path.join(path, 'fulltext.txt.gz')
            self.assertTrue(os.path.exists(fulltext_path),
                            'Full text file not created: {0}'.format(path))

            fulltext_content = reader.read_file(fulltext_path,
                                                json_format=False)
            self.assertEqual(fulltext_content,
                             "Introduction THIS IS AN INTERESTING TITLE")
Пример #5
0
    def test_full_range_of_file_format_extraction(self):
        """
        Loads a file containing all the relevant document types, runs the
        check task for every record (with the extraction task mocked), then
        runs the extraction task for each captured message (with its
        downstream tasks mocked) and checks the content written to disk for
        every expected path.

        :return: no return
        """
        sys.path.append(self.app.conf['PROJ_HOME'])
        from run import read_links_from_file

        # Bound unconditionally: it is compared against further down whenever
        # a grobid output file exists, even if no grobid service is configured
        expected_grobid_fulltext = "<hello/>"
        if self.grobid_service is not None:
            # NOTE(review): httpretty is enabled here but never disabled in
            # this method - confirm that the fixture teardown takes care of it
            httpretty.enable()
            httpretty.register_uri(httpretty.POST, self.grobid_service,
                                   body=expected_grobid_fulltext,
                                   status=200)

        # User loads the list of full text files and publishes them to the
        # first queue
        records = read_links_from_file(self.test_publish,
                                       force_extract=False,
                                       force_send=False)

        self.helper_get_details(self.test_publish)
        self.assertEqual(
            len(records.bibcode), self.nor,
            'The number of records should match'
            ' the number of lines. It does not: '
            '{0} [{1}]'.format(len(records.bibcode), self.nor))

        self.assertEqual(len(records.payload), 6)

        # Make the fake data to use
        if not os.path.exists(self.meta_path):
            os.makedirs(self.meta_path)

        # Call the task to check if it should be extracted but mock the
        # extraction task, collecting the message each call would have sent
        with patch.object(tasks.task_extract, 'delay',
                          return_value=None) as task_extract:
            extraction_arguments_set = []
            expected_update = 'NOT_EXTRACTED_BEFORE'
            for message in records.payload:
                tasks.task_check_if_extract(message)
                self.assertTrue(task_extract.called)
                actual = task_extract.call_args[0][0]
                self.assertEqual(
                    actual['UPDATE'], expected_update,
                    'This should be {0}, but is in fact: {1}'
                    .format(expected_update, actual['UPDATE']))
                extraction_arguments_set.append(actual)

        with patch.object(tasks.task_output_results, 'delay',
                          return_value=None) as task_output_results:
            with patch.object(tasks.task_identify_facilities, 'delay',
                              return_value=None):
                # Now we do call the extraction task with the proper arguments
                for arguments in extraction_arguments_set:
                    tasks.task_extract(arguments)
                    self.assertTrue(task_output_results.called)

        # Expected full text per entry of self.expected_paths, in the same
        # order (the loop below indexes this tuple by position).
        expected_fulltext_content = (
            u"Introduction THIS IS AN INTERESTING TITLE",
            u"Introduction THIS IS AN INTERESTING TITLE",
            u"I. INTRODUCTION INTRODUCTION GOES HERE Manual Entry TABLE I. TEXT a NOTES a TEXT\nAPPENDIX: APPENDIX TITLE GOES HERE APPENDIX CONTENT",
            u'1 Introduction JOURNAL CONTENT Acknowledgments THANK YOU Appendix A APPENDIX TITLE APPENDIX',
            u"No Title AA 999, 999-999 (1999) DOI: 99.9999/9999-9999:99999999 TITLE AUTHOR AFFILIATION Received 99 MONTH 1999 / Accepted 99 MONTH 1999 Abstract ABSTRACT Key words: KEYWORD INTRODUCTION SECTION Table 1: TABLE TABLE (1) COPYRIGHT",
            # pdftotext output; the original author noted that PDFBox would
            # instead yield u"Introduction\nTHIS IS AN INTERESTING TITLE\n"
            u"Introduction THIS IS AN INTERESTING TITLE",
        )

        # After the extractor, the meta writer should write all the payloads to
        # disk in the correct folders
        for i, path in enumerate(self.expected_paths):
            meta_path = os.path.join(path, 'meta.json')
            self.assertTrue(
                os.path.exists(meta_path),
                'Meta file not created: {0}'.format(path)
            )

            # The assertion above aborts the test when the file is missing, so
            # it can be read unconditionally.
            with open(meta_path, 'r') as meta_file:
                meta_content = meta_file.read()
            self.assertTrue(
                'NOT_EXTRACTED_BEFORE' in meta_content,
                'meta file does not contain the right extract keyword: {0}'
                .format(meta_content)
            )

            fulltext_path = os.path.join(path, 'fulltext.txt.gz')
            self.assertTrue(
                os.path.exists(fulltext_path),
                'Full text file not created: {0}'.format(path)
            )

            fulltext_content = reader.read_file(fulltext_path,
                                                json_format=False)
            self.assertEqual(fulltext_content, expected_fulltext_content[i])

            # The grobid output is optional: it is only checked when present
            grobid_fulltext_path = os.path.join(path, 'grobid_fulltext.xml')
            if os.path.exists(grobid_fulltext_path):
                with open(grobid_fulltext_path, 'r') as grobid_fulltext_file:
                    grobid_fulltext_content = grobid_fulltext_file.read()
                self.assertEqual(grobid_fulltext_content,
                                 expected_grobid_fulltext)