示例#1
0
    def test_that_file_should_be_updated_if_forced(self):
        """
        Checks that check_if_extract labels a document according to the
        force_extract flag given to FileInputStream.extract.

        The same stub link file is extracted twice, once with
        force_extract=True and once with force_extract=False, and both
        payloads are run through check_if_extract. The first PDF entry of the
        forced payload is expected to carry the FORCE_TO_EXTRACT flag, while
        the first PDF entry of the unforced payload is expected to carry the
        DIFFERING_FULL_TEXT flag.

        :return: no return
        """

        extract_path = self.app.conf['FULLTEXT_EXTRACT_PATH']

        forced_stream = utils.FileInputStream(self.test_file_exists)
        forced_stream.extract(force_extract=True)

        unforced_stream = utils.FileInputStream(self.test_file_exists)
        unforced_stream.extract(force_extract=False)

        payload_true = checker.check_if_extract(
            forced_stream.payload, extract_path)
        payload_false = checker.check_if_extract(
            unforced_stream.payload, extract_path)

        # Check that both PDF queues are populated before indexing into them,
        # so an empty queue is reported as a test failure, not an IndexError
        self.assertTrue(len(payload_true['PDF']) != 0)
        self.assertTrue(len(payload_false['PDF']) != 0)

        first_doc_true = payload_true['PDF'][0]
        first_doc_false = payload_false['PDF'][0]

        # assertEqual is required here: assertTrue(value, 'FLAG') would treat
        # the flag as the failure message and pass for any truthy value
        self.assertEqual(first_doc_true['UPDATE'], 'FORCE_TO_EXTRACT')
        self.assertEqual(first_doc_false['UPDATE'], 'DIFFERING_FULL_TEXT')
示例#2
0
    def test_file_should_be_updated_if_missing_fulltext(self):
        """
        Tests the meta_needs_update function. Loads the meta content for the
        first payload entry from disk and copies every key except 'ft_source'
        into a new meta content dictionary. The meta_needs_update function is
        then expected to report the MISSING_FULL_TEXT flag for that
        dictionary.

        :return: no return
        """

        extract_path = self.app.conf['FULLTEXT_EXTRACT_PATH']

        FileInputStream = utils.FileInputStream(self.test_file_exists)
        FileInputStream.extract()
        payload = FileInputStream.payload[0]

        meta_content = checker.load_meta_file(payload, extract_path)

        # Drop only the full text path, keep everything else untouched
        new_meta_content = {
            key: value for key, value in meta_content.items()
            if key != 'ft_source'
        }

        # Pass the payload entry (not the stream object), matching how the
        # other meta_needs_update tests call the function
        updated = checker.meta_needs_update(
            payload, new_meta_content, extract_path)

        self.assertEqual(
            updated, 'MISSING_FULL_TEXT',
            'The ft_source should need updating, not {0}'.format(updated))
示例#3
0
    def test_file_extract_meta(self):
        """
        Tests the load_meta_file function. The meta file belonging to the
        first payload entry of the stub link file is loaded, and the result is
        checked to be non-empty and equal to the expected stub meta data.

        :return: no return
        """

        expected_meta = {
            u'bibcode': u'foobar',
            u'provider': u'MNRAS',
            u'index_date': u'2015-01-21T16:18:40.249050Z',
            u'ft_source': u'tests/test_unit/stub_data/te/st/test.pdf'
        }

        stream = utils.FileInputStream(self.test_file_exists)
        stream.extract()
        first_entry = stream.payload[0]

        extract_path = self.app.conf['FULLTEXT_EXTRACT_PATH']
        content = checker.load_meta_file(first_entry, extract_path)

        has_content = len(content) > 0
        self.assertTrue(
            has_content, 'Did not extract the meta data correctly')
        self.assertEqual(content, expected_meta)
示例#4
0
    def test_output_dictionary_contains_everything_we_need(self):
        """
        Tests the check_if_extract function. Runs it on the single-document
        stub file and verifies that the first entry of the 'Standard' list
        exposes exactly the expected set of keys, and that its file format is
        'txt'.

        :return: no return
        """

        stream = utils.FileInputStream(self.test_single_document)
        stream.extract()

        payload = checker.check_if_extract(
            stream.payload, self.app.conf['FULLTEXT_EXTRACT_PATH'])

        # Build the expected keys with the text type of the running
        # interpreter (unicode on Python 2, str on Python 3)
        text_type = str if sys.version_info > (3, ) else unicode
        wanted_keys = sorted(
            text_type(name) for name in (
                'ft_source', 'bibcode', 'provider', 'file_format', 'UPDATE',
                'meta_path', 'index_date'
            )
        )

        found_keys = sorted(payload['Standard'][0].keys())
        found_format = payload['Standard'][0]['file_format']

        self.assertListEqual(found_keys, wanted_keys)
        self.assertEqual(found_format, 'txt')
示例#5
0
    def test_file_should_be_updated_if_content_is_stale(self):
        """
        Tests the meta_needs_update function. The full text file referenced by
        the first payload entry is rewritten so that it is newer than the meta
        data file, and the meta content is loaded with its full text path
        prefixed by PROJ_HOME. The meta_needs_update function is then expected
        to supply the STALE_CONTENT flag.

        :return: no return
        """

        conf = self.app.conf

        stream = utils.FileInputStream(self.test_file_exists)
        stream.extract()
        document = stream.payload[0]

        # Rewrite the full text so that it is newer than the meta.json
        with open(document['ft_source'], 'w') as full_text:
            full_text.write('PDF')

        # Not a nicer way to do this without cleaning up some tests
        meta = checker.load_meta_file(document, conf['FULLTEXT_EXTRACT_PATH'])
        meta['ft_source'] = os.path.join(conf['PROJ_HOME'], meta['ft_source'])

        status = checker.meta_needs_update(
            document, meta, conf['FULLTEXT_EXTRACT_PATH'])

        failure_note = \
            'The file content should be stale, not {0} ({1}\n{2})'.format(
                status, document, meta)
        self.assertEqual(status, 'STALE_CONTENT', failure_note)
示例#6
0
    def test_file_should_be_updated_if_content_differs_to_input(self):
        """
        Tests the meta_needs_update function. The meta content of the first
        payload entry is loaded from disk and its full text path is blanked
        out. The meta_needs_update function is then expected to supply the
        DIFFERING_FULL_TEXT flag.

        :return: no return
        """

        extract_path = self.app.conf['FULLTEXT_EXTRACT_PATH']

        stream = utils.FileInputStream(self.test_file_exists)
        stream.extract()
        document = stream.payload[0]

        meta = checker.load_meta_file(document, extract_path)
        # Replace the recorded full text path with an empty one
        meta['ft_source'] = ''

        status = checker.meta_needs_update(document, meta, extract_path)

        failure_note = \
            'The ft_source should need updating, not {0}'.format(status)
        self.assertEqual(status, 'DIFFERING_FULL_TEXT', failure_note)
示例#7
0
    def test_file_should_be_extracted(self):
        """
        Tests the check_if_extract function. Counts the occurrences of 'pdf'
        in the link file, and treats the remaining non-empty lines as
        'Standard files', e.g., XML, HTTP, HTML, etc. The payload is then run
        through check_if_extract, and the number of entries carrying one of
        the expected UPDATE flags must match those counts for both the PDF
        list and the Standard list.

        :return: no return
        """

        # Flags that mark a document as needing extraction. One shared tuple
        # keeps the PDF and Standard checks in agreement.
        update_flags = (
            u'STALE_CONTENT', u'DIFFERING_FULL_TEXT',
            u'MISSING_FULL_TEXT', u'NOT_EXTRACTED_BEFORE',
        )

        FileInputStream = utils.FileInputStream(self.test_file)
        FileInputStream.extract()

        with open(self.test_file, 'r') as in_f:
            text = in_f.read()
        pdf_number = len(re.findall('pdf', text))
        standard_number = \
            len([line for line in text.split('\n') if line != '']) - pdf_number

        payload = checker.check_if_extract(
            FileInputStream.payload, self.app.conf['FULLTEXT_EXTRACT_PATH'])

        # 'or []' keeps the behaviour of treating a falsy list as empty
        pdf_compare = [
            content for content in (payload['PDF'] or [])
            if content['UPDATE'] in update_flags
        ]
        standard_compare = [
            content for content in (payload['Standard'] or [])
            if content['UPDATE'] in update_flags
        ]

        self.assertEqual(len(pdf_compare), pdf_number)
        self.assertEqual(len(standard_compare), standard_number)
示例#8
0
def read_links_from_file(file_input, force_extract=False, force_send=False):
    """
    Builds a utils.FileInputStream for the given link file, runs its extract
    method with the supplied flags, and hands the stream object back.

    :param file_input: path to the link file
    :param force_extract: did the user bypass the internal checks
    :param force_send: always send results to master, even for already
        extracted files
    :return: the utils.FileInputStream instance after extract was called
    """

    link_stream = utils.FileInputStream(file_input)
    link_stream.extract(
        force_extract=force_extract,
        force_send=force_send,
    )
    return link_stream
示例#9
0
    def test_file_stream_input_extract_list(self):
        """
        Tests the extract method. After extracting the stub link file, it
        checks that the expected bibcode, full text path and provider are
        present in the corresponding attributes of the stream.

        :return: no return
        """

        FileInputStream = utils.FileInputStream(self.test_file_stub)
        # Only the attributes set on the stream are checked; the return value
        # of extract is not used
        FileInputStream.extract()

        self.assertIn('2015MNRAS.446.4239E', FileInputStream.bibcode)
        self.assertIn(os.path.join(self.app.conf['PROJ_HOME'], 'test.pdf'),
                      FileInputStream.full_text_path)
        self.assertIn('MNRAS', FileInputStream.provider)
示例#10
0
    def test_that_no_payload_gets_sent_if_there_is_no_content(self):
        """
        Tests the check_if_extract function on the single-document stub file.
        The 'PDF' entry of the returned payload must be empty (falsy), while
        the 'Standard' entry must contain at least one document.

        :return: no return
        """
        stream = utils.FileInputStream(self.test_single_document)
        stream.extract()

        extract_path = self.app.conf['FULLTEXT_EXTRACT_PATH']
        payload = checker.check_if_extract(stream.payload, extract_path)

        pdf_queue = payload['PDF']
        self.assertFalse(pdf_queue)

        standard_count = len(payload['Standard'])
        self.assertTrue(standard_count != 0)
示例#11
0
    def test_file_not_extracted_before(self):
        """
        Tests the meta_output_exists function. For the first payload entry of
        test_file_stub, the function is expected to return False, i.e. no
        meta file is found for it.

        :return: no return
        """
        stream = utils.FileInputStream(self.test_file_stub)
        stream.extract()
        first_entry = stream.payload[0]

        extract_path = self.app.conf['FULLTEXT_EXTRACT_PATH']
        exists = checker.meta_output_exists(first_entry, extract_path)

        self.assertEqual(exists, False)
示例#12
0
    def test_file_stream_input_extract_file(self):
        """
        Tests the extract method. The number of bibcodes held by the stream
        after extraction must equal the number of lines in the input file,
        which is counted by opening the file directly.

        :return: no return
        """

        stream = utils.FileInputStream(self.test_file)
        stream.extract()

        # Count the lines independently of the class under test
        with open(self.test_file, 'r') as handle:
            line_total = sum(1 for _ in handle)

        extracted_total = len(stream.bibcode)
        self.assertEqual(
            extracted_total, line_total,
            'Did not extract the correct number of records from the input file'
        )
示例#13
0
    def test_file_extracted_before(self):
        """
        Tests the meta_output_exists function. For the first payload entry of
        test_file_exists, the function is expected to return True, i.e. a
        meta file is found for it.

        :return: no return
        """

        stream = utils.FileInputStream(self.test_file_exists)
        stream.extract()
        first_entry = stream.payload[0]

        extract_path = self.app.conf['FULLTEXT_EXTRACT_PATH']
        exists = checker.meta_output_exists(first_entry, extract_path)

        failure_note = \
            'Could not establish that this file has been extracted before'
        self.assertEqual(exists, True, failure_note)