Пример #1
0
    def test_pipeline_extract_content_extracts_meta_text_correctly(self):
        """
        Checks that the meta.json file written to disk by
        writer.extract_content holds the same values as the input item.

        N. B.
        The name extract_content does not imply that anything is extracted
        here. It only follows the naming convention of the other workers:
        extract_content is the main method the worker will run.

        :return: no return
        """

        self.dict_item['file_format'] = 'txt'
        pipeline_payload = [self.dict_item]

        return_payload = writer.extract_content(pipeline_payload)

        # assertTrue only checks truthiness; its second argument is the
        # failure message, not an expected value.
        self.assertTrue(
            return_payload,
            'extract_content returned a falsy payload: {0!r}'
            .format(return_payload)
        )

        with open(self.dict_item['meta_path'], 'r') as meta_file:
            meta_dict = json.load(meta_file)

        self.assertEqual(self.dict_item['ft_source'], meta_dict['ft_source'])
        self.assertEqual(self.dict_item['bibcode'], meta_dict['bibcode'])
        self.assertEqual(self.dict_item['provider'], meta_dict['provider'])
        self.assertEqual(self.dict_item['UPDATE'], meta_dict['UPDATE'])
Пример #2
0
    def pipeline_extract(self, format_):
        """
        Helper function that runs writer.extract_content for the given file
        format and checks that the meta.json content on disk matches the
        input item.

        N. B.
        The name extract_content does not imply that anything is extracted
        here. It only follows the naming convention of the other workers:
        extract_content is the main method the worker will run.

        :param format_: file format to be in the meta.json
        :return: no return
        """

        self.dict_item['file_format'] = format_
        pipeline_payload = [self.dict_item]

        return_payload = writer.extract_content(pipeline_payload)

        # assertEqual reports both values on failure, unlike
        # assertTrue(a == b) which only reports "False is not true".
        self.assertEqual(return_payload, '["MNRAS2014"]')

        with open(self.dict_item['meta_path'], 'r') as meta_file:
            meta_dict = json.load(meta_file)

        self.assertEqual(self.dict_item['ft_source'], meta_dict['ft_source'])
        self.assertEqual(self.dict_item['bibcode'], meta_dict['bibcode'])
        self.assertEqual(self.dict_item['provider'], meta_dict['provider'])
        self.assertEqual(self.dict_item['UPDATE'], meta_dict['UPDATE'])
Пример #3
0
    def test_pipeline_extract_content_extracts_fulltext_correctly(self):
        """
        Checks that the full text written to disk by writer.extract_content
        matches the full text of the input item.

        N. B.
        The name extract_content does not imply that anything is extracted
        here. It only follows the naming convention of the other workers:
        extract_content is the main method the worker will run.

        :return: no return
        """

        self.dict_item['file_format'] = 'txt'
        pipeline_payload = [self.dict_item]

        return_payload = writer.extract_content(pipeline_payload)

        # assertTrue only checks truthiness; its second argument is the
        # failure message, not an expected value.
        self.assertTrue(
            return_payload,
            'extract_content returned a falsy payload: {0!r}'
            .format(return_payload)
        )

        # The full text file is looked up next to the meta file.
        full_text_path = self.dict_item['meta_path'].replace(
            'meta.json', 'fulltext.txt'
        )
        with open(full_text_path, 'r') as full_text_file:
            full_text = full_text_file.read()

        self.assertEqual(self.dict_item['fulltext'], full_text)
Пример #4
0
    def test_pipeline_extract_content_extracts_fulltext_correctly(self):
        """
        Runs writer.extract_content on a 'txt' item and verifies that the
        fulltext.txt file read back from disk equals the item's full text.

        N. B.
        Despite its name, extract_content is not expected to extract
        anything here; the name is kept for consistency with the other
        workers, where extract_content is the main method the worker runs.

        :return: no return
        """

        self.dict_item['file_format'] = 'txt'

        return_payload = writer.extract_content([self.dict_item])

        # The second positional argument of assertTrue is the failure
        # message, so the original "1" was never compared against anything.
        self.assertTrue(
            return_payload,
            'Expected a non-empty payload, got: {0!r}'.format(return_payload)
        )

        meta_path = self.dict_item['meta_path']
        with open(meta_path.replace('meta.json', 'fulltext.txt'),
                  'r') as full_text_file:
            full_text = full_text_file.read()

        self.assertEqual(self.dict_item['fulltext'], full_text)
Пример #5
0
    def test_temporary_file_is_made_and_moved(self):
        """
        Checks the two-step write used by the worker: the content is first
        written to a temporary output file, and that file is then moved to
        the expected output name.

        N. B.
        The name extract_content does not imply that anything is extracted
        here. It only follows the naming convention of the other workers:
        extract_content is the main method the worker will run.

        :return: no return
        """

        # Run the worker once, then delete the meta file so that the move
        # step below has to put it back.
        writer.extract_content([self.dict_item])
        os.remove(self.meta_file)

        # Strip the file name to get the location handed to the temp writer.
        output_location = self.meta_file.replace('meta.json', '')
        staged_file = writer.write_to_temp_file(
            self.dict_item, output_location
        )
        self.assertTrue(os.path.exists(staged_file))

        # After the move the temporary file is gone and the meta file exists.
        writer.move_temp_file_to_file(staged_file, self.meta_file)
        self.assertFalse(os.path.exists(staged_file))
        self.assertTrue(os.path.exists(self.meta_file))
Пример #6
0
    def test_temporary_file_is_made_and_moved(self):
        """
        Exercises writer.write_to_temp_file and writer.move_temp_file_to_file
        directly: a temporary file must exist after the write, and after the
        move only the final meta file may remain.

        N. B.
        Despite its name, extract_content is not expected to extract
        anything here; the name is kept for consistency with the other
        workers, where extract_content is the main method the worker runs.

        :return: no return
        """

        item = self.dict_item
        final_path = self.meta_file

        writer.extract_content([item])
        # Remove the meta file so its later existence is due to the move.
        os.remove(final_path)

        tmp_name = writer.write_to_temp_file(
            item, final_path.replace('meta.json', '')
        )
        tmp_created = os.path.exists(tmp_name)
        self.assertTrue(tmp_created)

        writer.move_temp_file_to_file(tmp_name, final_path)
        tmp_still_there = os.path.exists(tmp_name)
        self.assertFalse(tmp_still_there)
        final_created = os.path.exists(final_path)
        self.assertTrue(final_created)
Пример #7
0
    def test_write_worker_returns_content(self):
        """
        Checks that the payload returned by writer.extract_content, which
        will go on to another RabbitMQ queue, is in the format that we
        expect.

        N. B.
        The name extract_content does not imply that anything is extracted
        here. It only follows the naming convention of the other workers:
        extract_content is the main method the worker will run.

        :return: no return
        """

        payload = writer.extract_content([self.dict_item])

        # The content is compared, not the length, so the failure message
        # says so; assertEqual also reports both values.
        self.assertEqual(payload, '["MNRAS2014"]',
                         'Payload does not match: {0}'.format(payload))
Пример #8
0
    def test_write_worker_returns_content(self):
        """
        Runs writer.extract_content on a single item and verifies the exact
        payload string it returns. Per the original notes, this payload is
        passed on to another RabbitMQ queue.

        N. B.
        Despite its name, extract_content is not expected to extract
        anything here; the name is kept for consistency with the other
        workers, where extract_content is the main method the worker runs.

        :return: no return
        """

        expected = '["MNRAS2014"]'
        payload = writer.extract_content([self.dict_item])

        # Previously assertTrue(payload == ...) with a message about
        # "Length", although the whole content is what gets compared.
        self.assertEqual(
            payload,
            expected,
            'Unexpected payload: {0!r} (expected {1!r})'
            .format(payload, expected)
        )
Пример #9
0
    def pipeline_extract(self, format_):
        """
        Helper function that runs writer.extract_content for the given file
        format, then reads the meta.json back from disk and compares it
        field by field with the input item.

        N. B.
        Despite its name, extract_content is not expected to extract
        anything here; the name is kept for consistency with the other
        workers, where extract_content is the main method the worker runs.

        :param format_: file format to be in the meta.json
        :return: no return
        """

        self.dict_item['file_format'] = format_

        return_payload = writer.extract_content([self.dict_item])

        # assertEqual shows both values on failure; assertTrue(a == b)
        # would only report "False is not true".
        self.assertEqual(return_payload, '["MNRAS2014"]')

        with open(self.dict_item['meta_path'], 'r') as meta_file:
            meta_dict = json.load(meta_file)

        # Same four fields as before, with the key named in the failure
        # message so a mismatch is easy to locate.
        for key in ('ft_source', 'bibcode', 'provider', 'UPDATE'):
            self.assertEqual(
                self.dict_item[key],
                meta_dict[key],
                'meta.json differs from input for key {0!r}'.format(key)
            )
Пример #10
0
    def test_acknowledgements_file_is_created(self):
        """
        Checks that writer.extract_content creates both a fulltext.txt and an
        acknowledgements.txt file when the item has acknowledgements content.

        N. B.
        The name extract_content does not imply that anything is extracted
        here. It only follows the naming convention of the other workers:
        extract_content is the main method the worker will run.

        :return: no return
        """

        self.dict_item['acknowledgements'] = "Thank you"
        # The returned payload is not needed; only the files on disk are
        # checked.
        writer.extract_content([self.dict_item])

        # Name the missing path on failure; the previous message was the
        # result of os.path.exists, i.e. always "False".
        self.assertTrue(
            os.path.exists(self.full_text_file),
            msg='File was not created: {0}'.format(self.full_text_file)
        )
        self.assertTrue(
            os.path.exists(self.acknowledgement_file),
            msg='File was not created: {0}'.format(self.acknowledgement_file)
        )
Пример #11
0
    def test_acknowledgements_file_is_created(self):
        """
        Gives the item an acknowledgements entry, runs
        writer.extract_content, and verifies that both the full text file
        and the acknowledgements file exist afterwards.

        N. B.
        Despite its name, extract_content is not expected to extract
        anything here; the name is kept for consistency with the other
        workers, where extract_content is the main method the worker runs.

        :return: no return
        """

        self.dict_item['acknowledgements'] = "Thank you"
        # Return value intentionally ignored: this test only looks at the
        # files on disk.
        writer.extract_content([self.dict_item])

        # Report which path is missing instead of the boolean result of
        # os.path.exists, which is always False when the assertion fails.
        for expected_path in (self.full_text_file,
                              self.acknowledgement_file):
            self.assertTrue(
                os.path.exists(expected_path),
                msg='Expected file is missing: {0}'.format(expected_path)
            )
Пример #12
0
    def test_pipeline_extract_content_extracts_meta_text_correctly(self):
        """
        Runs writer.extract_content on a 'txt' item, reads the meta.json
        back from disk and compares it field by field with the input item.

        N. B.
        Despite its name, extract_content is not expected to extract
        anything here; the name is kept for consistency with the other
        workers, where extract_content is the main method the worker runs.

        :return: no return
        """

        self.dict_item['file_format'] = 'txt'

        return_payload = writer.extract_content([self.dict_item])

        # The second positional argument of assertTrue is the failure
        # message, so the original "1" was never compared against anything.
        self.assertTrue(
            return_payload,
            'Expected a non-empty payload, got: {0!r}'.format(return_payload)
        )

        with open(self.dict_item['meta_path'], 'r') as meta_file:
            meta_dict = json.load(meta_file)

        # Same four fields as before, with the key named in the failure
        # message so a mismatch is easy to locate.
        for key in ('ft_source', 'bibcode', 'provider', 'UPDATE'):
            self.assertEqual(
                self.dict_item[key],
                meta_dict[key],
                'meta.json differs from input for key {0!r}'.format(key)
            )