Пример #1
0
    def test_that_file_should_be_updated_if_forced(self):
        """
        Check the UPDATE flag produced with and without a forced extraction.

        The same stub file is loaded twice, once extracted with
        ``force_extract=True`` and once with ``force_extract=False``. Both
        payloads are run through ``checker.check_if_extract`` and each result
        must have a non-empty 'PDF' list whose first document carries an
        UPDATE flag.

        :return: no return
        """

        extract_path = self.app.conf['FULLTEXT_EXTRACT_PATH']

        forced_stream = utils.FileInputStream(self.test_file_exists)
        forced_stream.extract(force_extract=True)

        unforced_stream = utils.FileInputStream(self.test_file_exists)
        unforced_stream.extract(force_extract=False)

        payload_true = checker.check_if_extract(
            forced_stream.payload, extract_path
        )
        payload_false = checker.check_if_extract(
            unforced_stream.payload, extract_path
        )

        # Check both lists are non-empty before indexing, so an empty result
        # is reported as an assertion failure instead of an IndexError.
        self.assertTrue(len(payload_true['PDF']) != 0)
        self.assertTrue(len(payload_false['PDF']) != 0)

        first_doc_true = payload_true['PDF'][0]
        # Read the unforced document from payload_false; it was previously
        # taken from payload_true, so the unforced result was never checked.
        first_doc_false = payload_false['PDF'][0]

        # NOTE(review): the second argument of assertTrue is only the failure
        # message, so these assertions check that UPDATE is truthy, not that
        # it equals the named flag. Switching to assertEqual would pin the
        # exact values -- confirm them against the stub data first.
        self.assertTrue(first_doc_true['UPDATE'],
                        'FORCE_TO_EXTRACT')
        self.assertTrue(first_doc_false['UPDATE'],
                        'DIFFERING_FULL_TEXT')
Пример #2
0
    def test_that_file_should_be_updated_if_forced(self):
        """
        Compare the checker output for a forced and an unforced extraction.

        Two ``FileInputStream`` objects are built from the same stub file and
        extracted with ``force_extract=True`` and ``force_extract=False``
        respectively. Each payload is passed to ``checker.check_if_extract``;
        the test requires a non-empty 'PDF' list in both results and an
        UPDATE flag on the first document of each.

        :return: no return
        """

        FileInputStream_true = utils.FileInputStream(self.test_file_exists)
        FileInputStream_true.extract(force_extract=True)

        FileInputStream_false = utils.FileInputStream(self.test_file_exists)
        FileInputStream_false.extract(force_extract=False)

        payload_true = checker.check_if_extract(
            FileInputStream_true.payload,
            self.app.conf['FULLTEXT_EXTRACT_PATH'])
        payload_false = checker.check_if_extract(
            FileInputStream_false.payload,
            self.app.conf['FULLTEXT_EXTRACT_PATH'])

        # Guard the [0] lookups below: an empty list should fail the test
        # with an assertion error, not crash with an IndexError.
        self.assertTrue(len(payload_true['PDF']) != 0)
        self.assertTrue(len(payload_false['PDF']) != 0)

        first_doc_true = payload_true['PDF'][0]
        # Use payload_false here; the original read payload_true a second
        # time, which left the unforced result untested.
        first_doc_false = payload_false['PDF'][0]

        # NOTE(review): assertTrue treats its second argument as the failure
        # message, so only truthiness of UPDATE is verified. If the exact
        # flag values are intended, use assertEqual after confirming them
        # against the stub data.
        self.assertTrue(first_doc_true['UPDATE'], 'FORCE_TO_EXTRACT')
        self.assertTrue(first_doc_false['UPDATE'], 'DIFFERING_FULL_TEXT')
Пример #3
0
    def test_output_dictionary_contains_everything_we_need(self):
        """
        Verify the metadata keys and file format of a checked document.

        Extracts the single-document stub file, runs its payload through
        ``checker.check_if_extract`` and compares the sorted keys of the first
        'Standard' entry with the expected metadata names. It also checks
        that the entry's ``file_format`` is 'txt'.

        :return: no return
        """

        stream = utils.FileInputStream(self.test_single_document)
        stream.extract()

        extract_path = self.app.conf['FULLTEXT_EXTRACT_PATH']
        payload = checker.check_if_extract(stream.payload, extract_path)

        # Pick the text type of the running interpreter: ``unicode`` only
        # exists on Python 2 and is never evaluated on Python 3.
        text_type = str if sys.version_info > (3, ) else unicode

        key_names = (
            'ft_source', 'bibcode', 'provider', 'file_format', 'UPDATE',
            'meta_path', 'index_date',
        )
        wanted_keys = sorted(text_type(name) for name in key_names)

        found_keys = sorted(payload['Standard'][0].keys())
        found_format = payload['Standard'][0]['file_format']

        self.assertListEqual(found_keys, wanted_keys)
        self.assertEqual(found_format, 'txt')
Пример #4
0
    def test_output_dictionary_contains_everything_we_need(self):
        """
        Tests the check_if_extract function. Runs the function on a stub file
        that contains one document and then ensures that the output dictionary
        contains all the expected meta data. It also checks that the correct
        file format has been associated to it.

        Works on both Python 2 and Python 3: it does not rely on the
        ``unicode`` builtin or on ``dict.keys()`` returning a sortable list.

        :return: no return
        """

        FileInputStream = utils.FileInputStream(self.test_single_document)
        FileInputStream.extract()

        payload = checker.check_if_extract(
            FileInputStream.payload, self.app.conf['FULLTEXT_EXTRACT_PATH']
        )

        # No explicit unicode() conversion is needed: on Python 3 ``str`` is
        # already the text type, and on Python 2 ASCII ``str`` and
        # ``unicode`` values compare equal.
        expected_content = sorted([
            'ft_source', 'bibcode',
            'provider', 'file_format',
            'UPDATE', 'meta_path',
            'index_date',
        ])

        # sorted() accepts the keys view returned on Python 3, whereas
        # ``.keys().sort()`` only works on the Python 2 list.
        actual_content = sorted(payload['Standard'][0].keys())
        actual_format = payload['Standard'][0]['file_format']

        self.assertListEqual(actual_content, expected_content)
        self.assertEqual(actual_format, 'txt')
Пример #5
0
def task_check_if_extract(message):
    """
    Checks if the file needs to be extracted and pushes to the correct
    extraction queue.

    ``message`` may be a single item or a list of items; a single item is
    wrapped in a list before it is handed to ``checker.check_if_extract``.
    Every entry returned under the 'PDF' or 'Standard' key is dispatched to
    ``task_extract``. 'PDF' entries are additionally dispatched to
    ``task_extract_grobid`` when ``app.conf['GROBID_SERVICE']`` is not None.
    Any other key in the result is logged as an error.

    :param message: one item, or a list of items, to check
    :return: no return
    """
    logger.debug('Checking content: %s', message)
    if not isinstance(message, list):
        message = [message]

    extract_path = app.conf['FULLTEXT_EXTRACT_PATH']
    logger.debug("Calling 'check_if_extract' with message '%s' and path '%s'",
                 message, extract_path)

    results = checker.check_if_extract(message, extract_path)
    logger.debug('Results: %s', results)
    if not results:
        return

    for key in results:
        if key not in ('PDF', 'Standard'):
            # Pass the values as separate arguments: a single tuple would be
            # treated as one argument for two placeholders and the record
            # would fail to format.
            logger.error('Unknown type: %s and message: %s',
                         key, results[key])
            continue

        for msg in results[key]:
            logger.debug("Calling 'task_extract' with message '%s'", msg)
            task_extract.delay(msg)
            if app.conf['GROBID_SERVICE'] is not None and key == 'PDF':
                logger.debug(
                    "Calling 'task_extract_grobid' with message '%s'", msg)
                task_extract_grobid.delay(msg)
Пример #6
0
    def test_file_should_be_extracted(self):
        """
        Tests the check_if_extract function.

        The expected number of PDF entries is the number of occurrences of
        the substring 'pdf' in the stub file. The expected number of
        'Standard' entries is the number of non-empty lines minus that PDF
        count. The payload is run through ``checker.check_if_extract``, and
        the entries in each queue that carry one of the expected UPDATE flags
        must match those counts.

        :return: no return
        """

        FileInputStream = utils.FileInputStream(self.test_file)
        FileInputStream.extract()

        with open(self.test_file, 'r') as in_f:
            text = in_f.read()
        pdf_number = len(re.findall('pdf', text))
        non_empty_lines = [line for line in text.split('\n') if line != '']
        standard_number = len(non_empty_lines) - pdf_number

        payload = checker.check_if_extract(
            FileInputStream.payload, self.app.conf['FULLTEXT_EXTRACT_PATH']
        )

        # One shared tuple for both queues. The PDF list previously lacked a
        # comma (and had a stray quote) between the last two flags, so
        # Python joined them into a single string that matched nothing.
        expected_flags = (
            u'STALE_CONTENT',
            u'DIFFERING_FULL_TEXT',
            u'MISSING_FULL_TEXT',
            u'NOT_EXTRACTED_BEFORE',
        )

        # ``or []`` keeps the original tolerance for a falsy queue value.
        pdf_compare = [
            content for content in (payload['PDF'] or [])
            if content['UPDATE'] in expected_flags
        ]
        standard_compare = [
            content for content in (payload['Standard'] or [])
            if content['UPDATE'] in expected_flags
        ]

        # assertEqual reports both values on failure; assertTrue(a == b, n)
        # only used ``n`` as the failure message.
        self.assertEqual(len(pdf_compare), pdf_number)
        self.assertEqual(len(standard_compare), standard_number)
Пример #7
0
    def test_file_should_be_extracted(self):
        """
        Tests the check_if_extract function.

        Counts the occurrences of the substring 'pdf' in the stub file as the
        expected number of PDF entries, and treats the remaining non-empty
        lines as 'Standard' entries. After running the payload through
        ``checker.check_if_extract`` it checks that, in each queue, the
        number of entries with an expected UPDATE flag equals the
        corresponding count.

        :return: no return
        """

        FileInputStream = utils.FileInputStream(self.test_file)
        FileInputStream.extract()

        with open(self.test_file, 'r') as in_f:
            text = in_f.read()
        pdf_re = re.compile('pdf')
        pdf_number = len(pdf_re.findall(text))
        standard_number = \
            len([i for i in text.split('\n') if i != '']) - pdf_number

        payload = checker.check_if_extract(
            FileInputStream.payload, self.app.conf['FULLTEXT_EXTRACT_PATH'])
        pdf_payload = payload['PDF']
        standard_payload = payload['Standard']

        # The PDF flag list used to be missing a comma (and had a stray
        # double quote), which joined two adjacent literals into one bogus
        # flag. Both queues now share the same four flags.
        update_flags = [
            u'STALE_CONTENT', u'DIFFERING_FULL_TEXT',
            u'MISSING_FULL_TEXT', u'NOT_EXTRACTED_BEFORE'
        ]

        if pdf_payload:
            pdf_compare = [
                content for content in pdf_payload
                if content['UPDATE'] in update_flags
            ]
        else:
            pdf_compare = []

        if standard_payload:
            standard_compare = [
                content for content in standard_payload
                if content['UPDATE'] in update_flags
            ]
        else:
            standard_compare = []

        # assertEqual shows the mismatching counts on failure, whereas
        # assertTrue(a == b, n) treated ``n`` as the failure message only.
        self.assertEqual(len(pdf_compare), pdf_number)
        self.assertEqual(len(standard_compare), standard_number)
Пример #8
0
    def test_that_no_payload_gets_sent_if_there_is_no_content(self):
        """
        Verify that check_if_extract yields no PDF entries for PDF-free input.

        The stub data is expected to contain no PDF files, so the 'PDF' entry
        of the returned payload must be falsy while the 'Standard' entry must
        be non-empty.

        :return: no return
        """
        stream = utils.FileInputStream(self.test_single_document)
        stream.extract()

        extract_path = self.app.conf['FULLTEXT_EXTRACT_PATH']
        result = checker.check_if_extract(stream.payload, extract_path)

        pdf_docs = result['PDF']
        self.assertFalse(pdf_docs)

        standard_docs = result['Standard']
        self.assertTrue(len(standard_docs) != 0)
Пример #9
0
    def test_that_no_payload_gets_sent_if_there_is_no_content(self):
        """
        Check the payload split for a stub that should hold no PDF files.

        Runs ``checker.check_if_extract`` on the extracted single-document
        stub and asserts that nothing was placed in the 'PDF' list, while the
        'Standard' list holds at least one entry.

        :return: no return
        """
        input_stream = utils.FileInputStream(self.test_single_document)
        input_stream.extract()

        checked = checker.check_if_extract(
            input_stream.payload,
            self.app.conf['FULLTEXT_EXTRACT_PATH'],
        )

        # Nothing may be queued for PDF extraction ...
        self.assertFalse(checked['PDF'])
        # ... but the standard queue must have received content.
        standard_count = len(checked['Standard'])
        self.assertTrue(standard_count != 0)
Пример #10
0
def task_check_if_extract(message):
    """
    Checks if the file needs to be extracted and pushes to the correct
    extraction queue.

    A non-list ``message`` is wrapped in a list, then passed to
    ``checker.check_if_extract`` together with the configured
    'FULLTEXT_EXTRACT_PATH'. Entries returned under 'PDF' or 'Standard' are
    sent to ``task_extract``. 'PDF' entries are also sent to
    ``task_extract_grobid`` when ``app.conf['GROBID_SERVICE']`` is not None.
    Results under any other key are reported through ``logger.error``.

    :param message: one item, or a list of items, to check
    :return: no return
    """
    logger.debug('Checking content: %s', message)
    if not isinstance(message, list):
        message = [message]

    logger.debug("Calling 'check_if_extract' with message '%s' and path '%s'", message, app.conf['FULLTEXT_EXTRACT_PATH'])

    results = checker.check_if_extract(message, app.conf['FULLTEXT_EXTRACT_PATH'])
    logger.debug('Results: %s', results)
    if results:
        for key in results:
            if key == 'PDF' or key == 'Standard':
                for msg in results[key]:
                    logger.debug("Calling 'task_extract' with message '%s'", msg)
                    task_extract.delay(msg)
                    if app.conf['GROBID_SERVICE'] is not None and key == 'PDF':
                        logger.debug("Calling 'task_extract_grobid' with message '%s'", msg)
                        task_extract_grobid.delay(msg)
            else:
                # Each placeholder needs its own argument; passing one tuple
                # made the log record fail to format and dropped the message.
                logger.error('Unknown type: %s and message: %s', key, results[key])