Пример #1
0
def task_extract(message):
    """
    Extracts the full text from the given location and pushes to the writing
    queue.

    :param message: a single extraction request or a list of them; a single
        request is wrapped into a one-element list before extraction
    :return: no return
    """
    logger.debug('Extract content: %s', message)
    if not isinstance(message, list):
        message = [message]

    results = extraction.extract_content(message, extract_pdf_script=app.conf['EXTRACT_PDF_SCRIPT'])
    logger.debug('Results: %s', results)
    for r in results:
        logger.debug("Calling 'write_content' with '%s'", str(r))
        # Write locally to filesystem
        writer.write_content(r)

        # Send results to master
        msg = {
            'bibcode': r['bibcode'],
            'body': r['fulltext'],
        }
        # Optional fields are only attached when present and non-empty
        for x in ('acknowledgements', 'dataset'):
            if x in r and r[x]:
                msg[x] = r[x]

        # Queue the message exactly once per result. It is sent even when the
        # fulltext is empty so that the optional fields are still forwarded;
        # a second, conditional call would queue the same message twice.
        logger.debug("Calling 'task_output_results' with '%s'", msg)
        task_output_results.delay(msg)
Пример #2
0
def task_extract(message):
    """
    Extracts the full text from the given location and pushes to the writing
    queue.

    :param message: a single extraction request or a list of them; a single
        request is wrapped into a one-element list before extraction
    :return: no return
    """
    logger.debug('Extract content: %s', message)
    if not isinstance(message, list):
        message = [message]

    results = extraction.extract_content(
        message, extract_pdf_script=app.conf['EXTRACT_PDF_SCRIPT'])
    logger.debug('Results: %s', results)
    for r in results:
        logger.debug("Calling 'write_content' with '%s'", str(r))
        # Write locally to filesystem
        writer.write_content(r)

        # Send results to master
        msg = {
            'bibcode': r['bibcode'],
            'body': r['fulltext'],
        }
        # Optional fields are only attached when present and non-empty
        for x in ('acknowledgements', 'dataset'):
            if x in r and r[x]:
                msg[x] = r[x]

        # Queue the message exactly once per result. It is sent even when the
        # fulltext is empty so that the optional fields are still forwarded;
        # a second, conditional call would queue the same message twice.
        logger.debug("Calling 'task_output_results' with '%s'", msg)
        task_output_results.delay(msg)
Пример #3
0
def task_extract(message):
    """
    Run fulltext extraction for the given request(s), store every result on
    the local filesystem and queue it for delivery to master.

    :param message: one extraction request, or a list of them; a single
        request is wrapped into a list first
    :return: no return
    """
    logger.debug('Extract content: %s', message)
    if not isinstance(message, list):
        message = [message]

    extracted = extraction.extract_content(
        message,
        extract_pdf_script=app.conf['EXTRACT_PDF_SCRIPT'],
    )
    logger.debug('Results: %s', extracted)

    optional_fields = ('acknowledgements', 'dataset', 'facility')
    for record in extracted:
        logger.debug("Calling 'write_content' with '%s'", str(record))
        # Persist the record locally before forwarding it
        writer.write_content(record)

        # Mandatory part of the message for master
        outgoing = {'bibcode': record['bibcode'], 'body': record['fulltext']}
        # Optional parts are attached only when present and non-empty
        outgoing.update({
            field: record[field]
            for field in optional_fields
            if field in record and record[field]
        })

        # The fulltext is deliberately not checked for emptiness here, so the
        # other components (acks, etc) still reach master
        logger.debug("Calling 'task_output_results' with '%s'", outgoing)
        logger.info("Calling task_output_results...")
        task_output_results.delay(outgoing)

    # Optionally follow up with named-entity recognition on the same requests
    if app.conf['RUN_NER_FACILITIES_AFTER_EXTRACTION']:
        task_identify_facilities.delay(message)
Пример #4
0
    def test_that_we_can_extract_all_content_from_payload_input(self):
        """
        Checks extract_content against the stub payload: every key listed in
        rules.META_CONTENT['teixml'] must be present in the first result.

        :return: no return
        """
        extracted = extraction.extract_content([self.dict_item])

        required_keys = set(rules.META_CONTENT['teixml'].keys())
        returned_keys = extracted[0].keys()
        self.assertTrue(required_keys.issubset(returned_keys))
Пример #5
0
    def test_that_we_can_extract_all_content_from_payload_input(self):
        """
        Exercises extract_content with a one-item payload and verifies that
        the first result exposes all keys of rules.META_CONTENT['teixml'].

        :return: no return
        """
        payload = [self.dict_item]
        results = extraction.extract_content(payload)

        # Every configured meta-data key has to show up in the output
        meta_keys = set(rules.META_CONTENT['teixml'].keys())
        all_present = meta_keys.issubset(results[0].keys())
        self.assertTrue(all_present)
Пример #6
0
    def test_that_we_can_extract_all_content_from_payload_input(self):
        """
        Tests the extract_content method. This checks that all of the XML meta
        data keys listed in rules.META_CONTENT['xml'] are present in the
        content extracted from the stub XML data.

        :return: no return
        """

        pay_load = [self.dict_item]

        content = extraction.extract_content(pay_load)

        # Every expected meta-data key must appear in the first result
        self.assertTrue(
            set(rules.META_CONTENT['xml'].keys()).issubset(content[0].keys()))
Пример #7
0
    def test_multi_file(self):
        """
        Some entries in fulltext/all.links specify multiple files.

        Typically the first has text from the article while the rest have the
        text from tables.

        :return: no return
        """
        multi_file_item = {
            'ft_source': self.test_multi_file,
            'file_format': 'xml',
            'provider': 'MNRAS',
            'bibcode': 'test',
        }
        self.dict_item = multi_file_item

        extracted = extraction.extract_content([self.dict_item])

        # The fulltext should contain two copies of the file's contents
        occurrences = extracted[0]['fulltext'].count('Entry 1')
        self.assertEqual(2, occurrences)
Пример #8
0
    def test_that_we_can_extract_all_content_from_payload_input(self):
        """
        Tests the extract_content method. This checks that all of the XML meta
        data keys listed in rules.META_CONTENT['xml'] are present in the
        content extracted from the stub XML data.

        :return: no return
        """

        pay_load = [self.dict_item]

        content = extraction.extract_content(pay_load)

        # Every expected meta-data key must appear in the first result
        self.assertTrue(
            set(rules.META_CONTENT['xml'].keys()).issubset(content[0].keys())
        )
Пример #9
0
    def task_extract_grobid(message):
        """
        Extract the structured full text for the given request(s) and write
        each result to the local filesystem.

        :param message: one extraction request, or a list of them; each
            request's 'file_format' value is modified in place
        :return: no return
        """
        logger.debug('Extract grobid content: %s', message)
        if not isinstance(message, list):
            message = [message]

        # Modify the file format to force the use of GrobidPDFExtractor
        for request in message:
            request['file_format'] += "-grobid"

        extracted = extraction.extract_content(
            message,
            grobid_service=app.conf['GROBID_SERVICE'],
        )
        logger.debug('Grobid results: %s', extracted)

        for record in extracted:
            logger.debug("Calling 'write_content' with '%s'", str(record))
            # Persist the record on the local filesystem
            writer.write_content(record)
Пример #10
0
    def test_multi_file(self):
        """
        Some entries in fulltext/all.links specify multiple files.

        Typically the first has text from the article while the rest have the
        text from tables.

        :return: no return
        """
        self.dict_item = {'ft_source': self.test_multi_file,
                          'file_format': 'xml',
                          'provider': 'MNRAS',
                          'bibcode': 'test'}

        payload = [self.dict_item]
        first_result = extraction.extract_content(payload)[0]

        # The fulltext should contain two copies of the file's contents
        copies_found = first_result['fulltext'].count('Entry 1')
        self.assertEqual(2, copies_found)
Пример #11
0
    def task_extract_grobid(message):
        """
        Produce the structured full text for the given request(s) and store
        every result on the local filesystem.

        :param message: one extraction request, or a list of them; each
            request's 'file_format' value is modified in place
        :return: no return
        """
        logger.debug('Extract grobid content: %s', message)
        if not isinstance(message, list):
            message = [message]

        # Modify the file format to force the use of GrobidPDFExtractor
        for item in message:
            item['file_format'] += "-grobid"

        service = app.conf['GROBID_SERVICE']
        grobid_results = extraction.extract_content(message, grobid_service=service)
        logger.debug('Grobid results: %s', grobid_results)

        for entry in grobid_results:
            logger.debug("Calling 'write_content' with '%s'", str(entry))
            # Write the entry to the local filesystem
            writer.write_content(entry)