Exemplo n.º 1
0
def test_harvesting_arxiv_workflow_accepted(
    mocked, db_only_app, record_oai_arxiv_plots):
    """Run the 'article' workflow on an arXiv record and approve it.

    The workflow is expected to halt first; the object is then marked as
    approved and core, the workflow is continued, and the object must end
    up in the COMPLETED state.
    """
    from invenio_workflows import (
        start, WorkflowEngine, ObjectStatus, workflow_object_class
    )
    from dojson.contrib.marc21.utils import create_record
    from invenio_db import db
    from inspirehep.dojson.hep import hep
    from inspirehep.modules.converter.xslt import convert

    # OAI arXiv XML -> MARCXML -> MARC dict -> HEP JSON.
    marcxml = convert(record_oai_arxiv_plots, "oaiarXiv2marcxml.xsl")
    marc_dict = create_record(marcxml)
    hep_json = hep.do(marc_dict)

    with db_only_app.app_context():
        uuid = start('article', [hep_json])

        engine = WorkflowEngine.from_uuid(uuid)
        halted = engine.processed_objects[0]

        # The workflow stops and waits for a decision.
        assert halted.status == ObjectStatus.HALTED
        assert halted.data_type == "hep"

        # Both the PDF and the tarball must be attached.
        for attachment in ("1407.7587.pdf", "1407.7587.tar.gz"):
            assert halted.files[attachment]

        # The publication note must have been extracted.
        publication_info = halted.data.get('publication_info')
        assert publication_info
        first_note = publication_info[0]
        assert first_note
        assert first_note.get('year') == "2014"
        assert first_note.get('journal_title') == "J. Math. Phys."

        # No decision has been recorded so far.
        assert "approved" not in halted.extra_data

        # Record the decision by hand and persist it.
        # FIXME Should be accept, but record validation prevents us.
        halted.remove_action()
        halted.extra_data["approved"] = True
        halted.extra_data["core"] = True
        halted.save()

        db.session.commit()

    with db_only_app.app_context():
        engine = WorkflowEngine.from_uuid(uuid)
        resumed = engine.processed_objects[0]
        resumed_id = resumed.id
        resumed.continue_workflow()

        # Reload the object and check that the workflow ran to the end.
        reloaded = workflow_object_class.get(resumed_id)
        assert reloaded.status == ObjectStatus.COMPLETED
Exemplo n.º 2
0
def spawn_arXiv_workflow_from_oai_harvest(request, records, name, **kwargs):
    """Schedule one "article" workflow per harvested arXiv record.

    Nothing is scheduled when the request endpoint is not listed in
    ``ARXIV_URLS``, when a ``spider`` or ``workflow`` keyword argument is
    given (inspire-crawler handles those harvests), or when the "article"
    workflow is not registered, in which case a warning is logged.
    """
    from flask import current_app
    from invenio_workflows import start, workflows

    # Only harvests coming from arXiv are handled here.
    if request.endpoint not in ARXIV_URLS:
        return

    # Harvests carrying either option are taken care of by inspire-crawler.
    if kwargs.get('spider') or kwargs.get('workflow'):
        return

    workflow_name = "article"

    if workflow_name not in workflows:
        message = "{0} not in available workflows. Skipping workflow {1}."
        current_app.logger.warning(message.format(workflow_name, name))
        return

    for harvested in records:
        # OAI arXiv XML -> MARCXML -> MARC dict -> HEP JSON.
        marcxml = convert(six.text_type(harvested), "oaiarXiv2marcxml.xsl")
        hep_record = hep.do(create_record(marcxml))
        start.delay(workflow_name, data=[hep_record])
Exemplo n.º 3
0
def spawn_arXiv_workflow_from_oai_harvest(request, records, name, **kwargs):
    """Convert harvested arXiv records and queue an "article" workflow each.

    Returns without scheduling anything when:

    * the request endpoint is not one of ``ARXIV_URLS``;
    * a ``spider`` or ``workflow`` keyword argument is set (inspire-crawler
      is in charge of those harvests);
    * "article" is not among the available workflows (a warning is logged).
    """
    from flask import current_app
    from invenio_workflows import start, workflows

    from_arxiv = request.endpoint in ARXIV_URLS
    if not from_arxiv:
        return

    handled_by_crawler = kwargs.get('spider') or kwargs.get('workflow')
    if handled_by_crawler:
        return

    target = "article"

    if target not in workflows:
        current_app.logger.warning(
            "{0} not in available workflows. Skipping workflow {1}.".format(
                target, name
            )
        )
        return

    for oai_record in records:
        oai_xml = six.text_type(oai_record)
        marc_record = create_record(convert(oai_xml, "oaiarXiv2marcxml.xsl"))
        start.delay(target, data=[hep.do(marc_record)])
Exemplo n.º 4
0
    def _author_list(obj, eng):
        """Extract an author list from the XML files of the arXiv tarball.

        The tarball named ``<arxiv_id>.tar.gz`` is taken from the files
        already attached to the model, or fetched with
        ``get_tarball_for_model`` and attached when it is missing. It is
        unpacked into ``<tarball>_files`` and every extracted ``.xml`` file
        is checked against ``REGEXP_AUTHLIST``. The first matching file is
        converted with ``stylesheet``, merged into the record and reported
        through the "authors" and "number_of_authors" task results.

        NOTE(review): ``stylesheet`` is not defined in this function; it is
        presumably bound by the enclosing scope -- confirm.

        :param obj: workflow object being processed.
        :param eng: workflow engine running the task.
        :return: ``None``. Returns early, without updating the model, when
            no tarball can be obtained or the tarball is invalid.
        """
        from inspirehep.modules.converter.xslt import convert

        model = eng.workflow_definition.model(obj)
        record = get_record_from_model(model)
        arxiv_id = get_arxiv_id_from_record(record)
        existing_file = get_file_by_name(model, "{0}.tar.gz".format(arxiv_id))

        if not existing_file:
            # We download it
            tarball = get_tarball_for_model(eng, arxiv_id)

            if tarball is None:
                obj.log.error("No tarball found")
                return
            add_file_by_name(model, tarball)
        else:
            tarball = existing_file.get_syspath()

        sub_dir = os.path.abspath("{0}_files".format(tarball))
        try:
            file_list = untar(tarball, sub_dir)
        except InvalidTarball:
            obj.log.error("Invalid tarball {0}".format(tarball))
            return
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))

        xml_files_list = [filename for filename in file_list
                          if filename.endswith(".xml")]
        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        for xml_file in xml_files_list:
            # The context manager closes the file even when read() raises.
            with open(xml_file, "r") as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors_xml = convert(xml_content, stylesheet)
                authorlist_record = get_json_from_marcxml(authors_xml)[0]
                record.update(authorlist_record)
                obj.update_task_results(
                    "authors",
                    [{
                        "name": "authors",
                        "results": authorlist_record["authors"]
                    }]
                )
                obj.update_task_results(
                    "number_of_authors",
                    [{
                        "name": "number_of_authors",
                        "results": authorlist_record["number_of_authors"]
                    }]
                )
                # Only the first matching XML file is used.
                break
        # Reached whether or not an author list was found.
        model.update()
def test_harvesting_arxiv_workflow_accepted(mocked, small_app,
                                            record_oai_arxiv_plots):
    """Check that an approved arXiv harvest completes the 'article' workflow.

    The workflow must halt after the first run; once the object is flagged
    as approved and core and the workflow is continued, the object must be
    COMPLETED.
    """
    from invenio_workflows import (start, WorkflowEngine, ObjectStatus,
                                   workflow_object_class)
    from dojson.contrib.marc21.utils import create_record
    from invenio_db import db
    from inspirehep.dojson.hep import hep
    from inspirehep.modules.converter.xslt import convert

    # Build the HEP JSON input: OAI XML -> MARCXML -> MARC dict -> HEP JSON.
    as_marcxml = convert(record_oai_arxiv_plots, "oaiarXiv2marcxml.xsl")
    as_marc = create_record(as_marcxml)
    as_hep = hep.do(as_marc)

    with small_app.app_context():
        run_uuid = start('article', [as_hep])

        engine = WorkflowEngine.from_uuid(run_uuid)
        wf_object = engine.processed_objects[0]

        assert wf_object.status == ObjectStatus.HALTED
        assert wf_object.data_type == "hep"

        # The PDF and the tarball are expected among the attached files.
        assert wf_object.files["1407.7587.pdf"]
        assert wf_object.files["1407.7587.tar.gz"]

        # A publication note is expected in the record data.
        notes = wf_object.data.get('publication_info')
        assert notes
        note = notes[0]
        assert note
        assert note.get('year') == "2014"
        assert note.get('journal_title') == "J. Math. Phys."

        # Nobody has taken a decision on this object yet.
        assert "approved" not in wf_object.extra_data

        # Store the approval directly on the object, then persist it.
        # FIXME Should be accept, but record validation prevents us.
        wf_object.remove_action()
        wf_object.extra_data["approved"] = True
        wf_object.extra_data["core"] = True
        wf_object.save()

        db.session.commit()

    with small_app.app_context():
        engine = WorkflowEngine.from_uuid(run_uuid)
        wf_object = engine.processed_objects[0]
        wf_object_id = wf_object.id
        wf_object.continue_workflow()

        # Fetch the object again: the workflow must have finished.
        final_object = workflow_object_class.get(wf_object_id)
        assert final_object.status == ObjectStatus.COMPLETED
Exemplo n.º 6
0
def already_harvested_on_legacy_record():
    """Provide record fixture.

    Loads ``fixtures/oai_arxiv_record_already_on_legacy.xml`` from this
    package, converts it to MARCXML with the ``oaiarXiv2marcxml.xsl``
    stylesheet, then to a MARC dict, and finally to HEP JSON.

    :return: the HEP JSON produced by ``hep.do``.
    """
    # pkg_resources resource names are always '/'-separated, whatever the
    # platform, so the name is written out instead of built with os.path.
    record_oai_arxiv_plots = pkg_resources.resource_string(
        __name__,
        'fixtures/oai_arxiv_record_already_on_legacy.xml'
    )
    # Convert to MARCXML, then dict, then HEP JSON
    record_oai_arxiv_plots_marcxml = convert(
        record_oai_arxiv_plots,
        "oaiarXiv2marcxml.xsl"
    )
    record_marc = create_record(record_oai_arxiv_plots_marcxml)
    json_data = hep.do(record_marc)

    return json_data
Exemplo n.º 7
0
def record():
    """Provide record fixture.

    Loads ``fixtures/oai_arxiv_record_with_plots.xml`` from this package,
    converts it to MARCXML with the ``oaiarXiv2marcxml.xsl`` stylesheet,
    then to a MARC dict, and finally to HEP JSON. When the result has a
    ``preprint_date`` key, its value is replaced with today's date in ISO
    format.

    :return: the HEP JSON produced by ``hep.do``.
    """
    # pkg_resources resource names are always '/'-separated, whatever the
    # platform, so the name is written out instead of built with os.path.
    record_oai_arxiv_plots = pkg_resources.resource_string(
        __name__,
        'fixtures/oai_arxiv_record_with_plots.xml'
    )
    # Convert to MARCXML, then dict, then HEP JSON
    record_oai_arxiv_plots_marcxml = convert(
        record_oai_arxiv_plots,
        "oaiarXiv2marcxml.xsl"
    )
    record_marc = create_record(record_oai_arxiv_plots_marcxml)
    json_data = hep.do(record_marc)

    # Only an existing preprint date is overwritten; none is added.
    if 'preprint_date' in json_data:
        json_data['preprint_date'] = datetime.date.today().isoformat()

    return json_data
Exemplo n.º 8
0
def spawn_arXiv_workflow_from_oai_harvest(request, records, name, **kwargs):
    """Schedule one "arxiv_ingestion" workflow per harvested arXiv record.

    Nothing is scheduled when the request endpoint is not the arXiv OAI
    URL, or when "arxiv_ingestion" is not among the available workflows;
    a warning is logged in the latter case.
    """
    from flask import current_app
    from invenio_workflows import start, workflows

    is_arxiv = request.endpoint == "http://export.arxiv.org/oai2"
    if not is_arxiv:
        return

    workflow_name = "arxiv_ingestion"

    if workflow_name not in workflows:
        message = "{0} not in available workflows. Skipping workflow {1}."
        current_app.logger.warning(message.format(workflow_name, name))
        return

    for harvested in records:
        # OAI arXiv XML -> MARCXML -> MARC dict -> HEP JSON.
        marcxml = convert(six.text_type(harvested), "oaiarXiv2marcxml.xsl")
        hep_record = hep.do(create_record(marcxml))
        start.delay(workflow_name, data=[hep_record])
Exemplo n.º 9
0
def test_harvesting_arxiv_workflow_rejected(
    mocked_api_request_beard_block, mocked_api_request_magpie,
    mocked_api_request_beard, mocked_download,
    app, record_oai_arxiv_plots):
    """Run the 'article' workflow on an arXiv record and reject it.

    The workflow is expected to halt with files and predictions attached;
    the object is then marked as not approved, the workflow is continued,
    and the object must end up in the COMPLETED state.
    """
    from invenio_workflows import (
        start, WorkflowEngine, ObjectStatus, workflow_object_class
    )
    from dojson.contrib.marc21.utils import create_record
    from invenio_db import db
    from inspirehep.dojson.hep import hep
    from inspirehep.modules.converter.xslt import convert

    # OAI arXiv XML -> MARCXML -> MARC dict -> HEP JSON.
    marcxml = convert(record_oai_arxiv_plots, "oaiarXiv2marcxml.xsl")
    marc_dict = create_record(marcxml)
    hep_json = hep.do(marc_dict)

    service_urls = {
        "BEARD_API_URL": "http://example.com/beard",
        "MAGPIE_API_URL": "http://example.com/magpie",
    }

    with app.app_context():
        # The service URLs are only patched in while the workflow starts.
        with mock.patch.dict(app.config, service_urls):
            uuid = start('article', [hep_json])

        engine = WorkflowEngine.from_uuid(uuid)
        halted = engine.processed_objects[0]

        assert halted.status == ObjectStatus.HALTED
        assert halted.data_type == "hep"

        # PDF and tarball must be attached, plus at least one more file.
        for attachment in ("1407.7587.pdf", "1407.7587.tar.gz"):
            assert halted.files[attachment]

        assert len(halted.files) > 2

        # The publication note must have been extracted.
        publication_info = halted.data.get('publication_info')
        assert publication_info
        first_note = publication_info[0]
        assert first_note
        assert first_note.get('year') == "2014"
        assert first_note.get('journal_title') == "J. Math. Phys."

        # Relevance prediction.
        relevance = halted.extra_data.get("relevance_prediction")
        assert relevance
        assert relevance['decision'] == "Rejected"
        assert relevance['scores']['Rejected'] == 0.8358207729691823

        # Experiments prediction.
        experiments = halted.extra_data.get("experiments_prediction")
        assert experiments
        expected_experiments = [['CMS', 0.7549515247344971]]
        assert experiments['experiments'] == expected_experiments

        # Keywords prediction.
        keywords = halted.extra_data.get("keywords_prediction")
        assert keywords
        expected_keyword = {
            "label": "galaxy",
            "score": 0.29424679279327393,
            "accept": True,
        }
        assert expected_keyword in keywords['keywords']

        # No decision has been recorded so far.
        assert "approved" not in halted.extra_data

        # Record the rejection by hand and persist it; "core" is left unset.
        # FIXME Should be accept, but record validation prevents us.
        halted.remove_action()
        halted.extra_data["approved"] = False
        halted.save()

        db.session.commit()

    with app.app_context():
        engine = WorkflowEngine.from_uuid(uuid)
        resumed = engine.processed_objects[0]
        resumed_id = resumed.id
        resumed.continue_workflow()

        # Reload the object: the workflow must have run to the end.
        reloaded = workflow_object_class.get(resumed_id)
        assert reloaded.status == ObjectStatus.COMPLETED
Exemplo n.º 10
0
def test_harvesting_arxiv_workflow_rejected(mocked_api_request_beard_block,
                                            mocked_api_request_magpie,
                                            mocked_api_request_beard,
                                            mocked_download, small_app,
                                            record_oai_arxiv_plots):
    """Check that a rejected arXiv harvest completes the 'article' workflow.

    The workflow must halt with files and predictions attached; once the
    object is flagged as not approved and the workflow is continued, the
    object must be COMPLETED.
    """

    # Build the HEP JSON input: OAI XML -> MARCXML -> MARC dict -> HEP JSON.
    as_marcxml = convert(record_oai_arxiv_plots, "oaiarXiv2marcxml.xsl")
    as_marc = create_record(as_marcxml)
    as_hep = hep.do(as_marc)

    service_urls = {
        "BEARD_API_URL": "http://example.com/beard",
        "MAGPIE_API_URL": "http://example.com/magpie",
    }

    with small_app.app_context():
        # The service URLs are only patched in while the workflow starts.
        with mock.patch.dict(small_app.config, service_urls):
            run_uuid = start('article', [as_hep])

        engine = WorkflowEngine.from_uuid(run_uuid)
        wf_object = engine.processed_objects[0]

        assert wf_object.status == ObjectStatus.HALTED
        assert wf_object.data_type == "hep"

        # PDF and tarball are expected, plus at least one more file.
        assert wf_object.files["1407.7587.pdf"]
        assert wf_object.files["1407.7587.tar.gz"]

        assert len(wf_object.files) > 2

        # A publication note is expected in the record data.
        notes = wf_object.data.get('publication_info')
        assert notes
        note = notes[0]
        assert note
        assert note.get('year') == 2014
        assert note.get('journal_title') == "J. Math. Phys."

        # Relevance prediction.
        relevance = wf_object.extra_data.get("relevance_prediction")
        assert relevance
        assert relevance['decision'] == "Rejected"
        assert relevance['scores']['Rejected'] == 0.8358207729691823

        # Experiments prediction.
        experiments = wf_object.extra_data.get("experiments_prediction")
        assert experiments
        expected_experiments = [['CMS', 0.7549515247344971]]
        assert experiments['experiments'] == expected_experiments

        # Keywords prediction.
        keywords = wf_object.extra_data.get("keywords_prediction")
        assert keywords
        expected_keyword = {
            "label": "galaxy",
            "score": 0.29424679279327393,
            "accept": True,
        }
        assert expected_keyword in keywords['keywords']

        # Nobody has taken a decision on this object yet.
        assert "approved" not in wf_object.extra_data

        # Store the rejection on the object and persist it; "core" is left
        # unset.
        # FIXME Should be accept, but record validation prevents us.
        wf_object.remove_action()
        wf_object.extra_data["approved"] = False
        wf_object.save()

        db.session.commit()

    with small_app.app_context():
        engine = WorkflowEngine.from_uuid(run_uuid)
        wf_object = engine.processed_objects[0]
        wf_object_id = wf_object.id
        wf_object.continue_workflow()

        # Fetch the object again: the workflow must have finished.
        final_object = workflow_object_class.get(wf_object_id)
        assert final_object.status == ObjectStatus.COMPLETED