예제 #1
0
def test_arxiv_author_list_handles_auto_ignore_comment():
    """Run the author-list task against the 1703.09986 fixture tarball.

    The task must complete and return ``None``.  The test name suggests
    the fixture contains an "auto-ignore" comment; the fixture itself is
    not visible from this test.
    """
    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']

    tarball_path = pkg_resources.resource_filename(
        __name__,
        os.path.join('fixtures', '1703.09986.tar.gz'),
    )

    # record/1519995
    record = {
        'arxiv_eprints': [
            {'categories': ['hep-ex'], 'value': '1703.09986'},
        ],
    }
    attached_files = MockFiles({
        '1703.09986.tar.gz': AttrDict({
            'file': AttrDict({'uri': tarball_path}),
        }),
    })
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    workflow_obj = MockObj(record, {}, files=attached_files)
    workflow_eng = MockEng()

    task = arxiv_author_list()
    outcome = task(workflow_obj, workflow_eng)

    assert outcome is None
예제 #2
0
def test_arxiv_author_list_with_missing_tarball():
    """Check the task logs a skip message when no matching tarball exists.

    The only attached file is named ``jessica.jones.tar.gz``, so no file
    called ``1703.09986.tar.gz`` is available.  The task must return
    ``None`` and the info log must contain the skip message.
    """
    eprints_subschema = load_schema('hep')['properties']['arxiv_eprints']

    # record/1519995
    record = {
        'arxiv_eprints': [
            {'categories': ['hep-ex'], 'value': '1703.09986'},
        ],
    }
    validate(record['arxiv_eprints'], eprints_subschema)

    unrelated_file = AttrDict({
        'file': AttrDict({'uri': 'alias.investigations'}),
    })
    attached_files = MockFiles({'jessica.jones.tar.gz': unrelated_file})

    workflow_obj = MockObj(record, {}, files=attached_files)
    workflow_eng = MockEng()

    task = arxiv_author_list()
    skip_message = 'Skipping author list extraction, no tarball with name "1703.09986.tar.gz" found'

    assert task(workflow_obj, workflow_eng) is None

    info_log = workflow_obj.log._info.getvalue()
    assert skip_message in info_log
예제 #3
0
def test_arxiv_author_list_logs_on_error(mock_untar):
    """Check the arXiv id reaches the info log when untarring fails.

    ``mock_untar`` (presumably injected by a patch decorator that is not
    shown here) is set up to raise ``InvalidTarball``.  The task must
    still return ``None`` and the info log must mention ``1605.07707``.
    """
    mock_untar.side_effect = InvalidTarball

    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']

    fixture_path = pkg_resources.resource_filename(
        __name__,
        os.path.join('fixtures', '1605.07707'),
    )

    # synthetic data
    record = {
        'arxiv_eprints': [
            {'categories': ['hep-th'], 'value': '1605.07707'},
        ],
    }
    attached_files = MockFiles({
        '1605.07707.tar.gz': AttrDict({
            'file': AttrDict({'uri': fixture_path}),
        }),
    })
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    workflow_obj = MockObj(record, {}, files=attached_files)
    workflow_eng = MockEng()

    task = arxiv_author_list()
    assert task(workflow_obj, workflow_eng) is None

    info_log = workflow_obj.log._info.getvalue()
    assert '1605.07707' in info_log
def test_arxiv_author_list_handles_multiple_author_xml_files():
    """Check the authors extracted from the multiple-author-lists fixture.

    After the task runs on ``1703.09986.multiple_author_lists.tar.gz``,
    ``obj.data['authors']`` must equal the two expected author entries.
    """
    def build_author(full_name, inspire_id, cern_id):
        """Return one expected author entry with the shared affiliation."""
        return {
            'affiliations': [{'value': 'Yerevan Phys. Inst.'}],
            'ids': [
                {'value': inspire_id, 'schema': 'INSPIRE ID'},
                {'value': cern_id, 'schema': 'CERN'},
            ],
            'full_name': full_name,
        }

    hep_schema = load_schema('hep')
    properties = hep_schema['properties']

    tarball_path = pkg_resources.resource_filename(
        __name__,
        os.path.join('fixtures', '1703.09986.multiple_author_lists.tar.gz'),
    )

    # record/1519995
    record = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': [
            {'categories': ['hep-ex'], 'value': '1703.09986'},
        ],
    }
    validate(record['arxiv_eprints'], properties['arxiv_eprints'])

    tarball_entry = AttrDict({'file': AttrDict({'uri': tarball_path})})
    attached_files = MockFiles({'1703.09986.tar.gz': tarball_entry})

    workflow_obj = MockObj(record, {}, files=attached_files)
    workflow_eng = MockEng()

    task = arxiv_author_list()
    task(workflow_obj, workflow_eng)

    expected_authors = [
        build_author('Sirunyan, Albert M', 'INSPIRE-00312131', 'CERN-432142'),
        build_author('Weary, Jake', 'INSPIRE-00312132', 'CERN-432143'),
    ]
    # Make sure the expectation itself is valid against the schema.
    validate(expected_authors, properties['authors'])

    assert workflow_obj.data.get('authors') == expected_authors
예제 #5
0
def test_arxiv_author_list_handles_multiple_author_xml_files():
    """Verify the author list produced from a tarball with several lists.

    The task is run on the ``1703.09986.multiple_author_lists.tar.gz``
    fixture; the resulting ``authors`` field must match the two expected
    entries exactly.
    """
    schema_properties = load_schema('hep')['properties']

    archive = pkg_resources.resource_filename(
        __name__,
        os.path.join('fixtures', '1703.09986.multiple_author_lists.tar.gz'),
    )

    # record/1519995
    eprints = [{'categories': ['hep-ex'], 'value': '1703.09986'}]
    payload = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': eprints,
    }
    validate(payload['arxiv_eprints'], schema_properties['arxiv_eprints'])

    mock_files = MockFiles({
        '1703.09986.tar.gz': AttrDict({'file': AttrDict({'uri': archive})}),
    })

    wf_object = MockObj(payload, {}, files=mock_files)
    wf_engine = MockEng()

    extract_authors = arxiv_author_list()
    extract_authors(wf_object, wf_engine)

    # (full name, INSPIRE id, CERN id) for each expected author.
    author_rows = (
        ('Sirunyan, Albert M', 'INSPIRE-00312131', 'CERN-432142'),
        ('Weary, Jake', 'INSPIRE-00312132', 'CERN-432143'),
    )
    expected_authors = [
        {
            'affiliations': [{'value': 'Yerevan Phys. Inst.'}],
            'ids': [
                {'value': inspire_id, 'schema': 'INSPIRE ID'},
                {'value': cern_id, 'schema': 'CERN'},
            ],
            'full_name': full_name,
        }
        for full_name, inspire_id, cern_id in author_rows
    ]
    validate(expected_authors, schema_properties['authors'])

    actual_authors = wf_object.data.get('authors')
    assert actual_authors == expected_authors
예제 #6
0
def test_arxiv_author_list_does_not_produce_latex():
    """Check the extracted author name is plain unicode.

    Running the task on the ``1802.03388.tar.gz`` fixture must return
    ``None`` and leave a single author whose ``full_name`` is the unicode
    string ``u'Åkesson, Torsten Paul Ake'``.
    """
    schema_properties = load_schema('hep')['properties']

    tarball_path = pkg_resources.resource_filename(
        __name__,
        os.path.join('fixtures', '1802.03388.tar.gz'),
    )

    record = {
        'arxiv_eprints': [
            {'categories': ['hep-ex'], 'value': '1802.03388'},
        ],
    }
    validate(record['arxiv_eprints'], schema_properties['arxiv_eprints'])

    tarball_entry = AttrDict({'file': AttrDict({'uri': tarball_path})})
    attached_files = MockFiles({'1802.03388.tar.gz': tarball_entry})

    expected_author = {
        'affiliations': [{'value': 'Lund U.'}],
        'ids': [{'value': 'INSPIRE-00061248', 'schema': 'INSPIRE ID'}],
        'full_name': u'Åkesson, Torsten Paul Ake',
    }
    expected_authors = [expected_author]
    # Make sure the expectation itself is valid against the schema.
    validate(expected_authors, schema_properties['authors'])

    workflow_obj = MockObj(record, {}, files=attached_files)
    workflow_eng = MockEng()

    task = arxiv_author_list()
    outcome = task(workflow_obj, workflow_eng)

    assert outcome is None
    assert workflow_obj.data.get('authors') == expected_authors
예제 #7
0
def test_arxiv_author_list_logs_on_error(mock_os, mock_untar):
    """Check the exact error-log message produced for an invalid tarball.

    ``mock_untar`` is configured to raise ``InvalidTarball`` and
    ``mock_os.path.abspath`` is pointed at a fresh temporary directory.
    Both mocks are presumably injected by patch decorators that are not
    shown here.  The task must return ``None`` and the error log must
    contain exactly the expected message.
    """
    mock_untar.side_effect = InvalidTarball

    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-th',
                ],
                'value': '1605.07707',
            },
        ],
    }  # synthetic data
    extra_data = {}
    files = MockFiles({
        '1605.07707.tar.gz':
        AttrDict({
            'file':
            AttrDict({
                'uri': 'http://export.arxiv.org/e-print/1605.07707',
            })
        })
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    default_arxiv_author_list = arxiv_author_list()

    # Create the directory before entering the ``try``: if ``mkdtemp``
    # itself failed inside it, the ``finally`` clause would raise a
    # NameError on ``temporary_dir`` and hide the real error.
    temporary_dir = mkdtemp()
    try:
        mock_os.path.abspath.return_value = temporary_dir

        assert default_arxiv_author_list(obj, eng) is None

        expected = 'Invalid tarball http://export.arxiv.org/e-print/1605.07707 for arxiv_id 1605.07707'
        result = obj.log._error.getvalue()

        assert expected == result
    finally:
        rmtree(temporary_dir)
def test_arxiv_author_list_only_overrides_authors():
    """Check the task leaves ``arxiv_eprints`` and ``$schema`` untouched.

    The expected values are taken from a deep copy made before the task
    runs.  Comparing against ``data`` itself would be vacuous if ``MockObj``
    keeps a reference to the dict it is given (its implementation is not
    visible here), because ``obj.data`` and ``data`` would then be the
    same object.
    """
    from copy import deepcopy

    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1703.09986.tar.gz'))

    data = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-ex',
                ],
                'value': '1703.09986',
            },
        ],
    }  # record/1519995
    validate(data['arxiv_eprints'], subschema)

    # Independent snapshot of the input, unaffected by any in-place change.
    expected = deepcopy(data)

    extra_data = {}
    files = MockFiles({
        '1703.09986.tar.gz': AttrDict({
            'file': AttrDict({
                'uri': filename,
            })
        })
    })

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    default_arxiv_author_list = arxiv_author_list()
    default_arxiv_author_list(obj, eng)

    assert 'arxiv_eprints' in obj.data
    assert obj.data['arxiv_eprints'] == expected['arxiv_eprints']
    assert '$schema' in obj.data
    assert obj.data['$schema'] == expected['$schema']
예제 #9
0
def test_arxiv_author_list_only_overrides_authors():
    """Check that fields other than ``authors`` survive the task unchanged.

    ``arxiv_eprints`` and ``$schema`` are compared against a deep copy
    taken before the task runs.  Comparing against ``data`` directly
    proves nothing if ``MockObj`` stores the very dict it receives (its
    implementation is not visible here), since both sides of the
    comparison would then be the same object.
    """
    from copy import deepcopy

    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1703.09986.tar.gz'))

    data = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-ex',
                ],
                'value': '1703.09986',
            },
        ],
    }  # record/1519995
    validate(data['arxiv_eprints'], subschema)

    # Frozen copy of the input to compare against after the task has run.
    original_data = deepcopy(data)

    extra_data = {}
    files = MockFiles({
        '1703.09986.tar.gz': AttrDict({
            'file': AttrDict({
                'uri': filename,
            })
        })
    })

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    default_arxiv_author_list = arxiv_author_list()
    default_arxiv_author_list(obj, eng)

    assert 'arxiv_eprints' in obj.data
    assert obj.data['arxiv_eprints'] == original_data['arxiv_eprints']
    assert '$schema' in obj.data
    assert obj.data['$schema'] == original_data['$schema']
def test_arxiv_author_list_handles_auto_ignore_comment(mock_os):
    """Run the author-list task on the 1703.09986 fixture in a temp dir.

    ``mock_os.path.abspath`` (``mock_os`` is presumably injected by a
    patch decorator that is not shown here) is pointed at a fresh
    temporary directory, which is removed afterwards.  The task must
    return ``None``.
    """
    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1703.09986.tar.gz'))

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-ex',
                ],
                'value': '1703.09986',
            },
        ],
    }  # record/1519995
    extra_data = {}
    files = MockFiles({
        '1703.09986.tar.gz': AttrDict({
            'file': AttrDict({
                'uri': filename,
            })
        })
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    # Pass ``files`` by keyword, as the other tests in this file do.
    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    default_arxiv_author_list = arxiv_author_list()

    # Create the directory before entering the ``try``: if ``mkdtemp``
    # itself failed inside it, the ``finally`` clause would raise a
    # NameError on ``temporary_dir`` and hide the real error.
    temporary_dir = mkdtemp()
    try:
        mock_os.path.abspath.return_value = temporary_dir

        assert default_arxiv_author_list(obj, eng) is None
    finally:
        rmtree(temporary_dir)
def test_arxiv_author_list_logs_on_error(mock_untar):
    """Check the exact info-log message produced for an invalid tarball.

    ``mock_untar`` (presumably injected by a patch decorator that is not
    shown here) is set up to raise ``InvalidTarball``.  The task must
    return ``None`` and the info log must equal the expected message.
    """
    mock_untar.side_effect = InvalidTarball

    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']

    # synthetic data
    record = {
        'arxiv_eprints': [
            {'categories': ['hep-th'], 'value': '1605.07707'},
        ],
    }
    remote_tarball = AttrDict({
        'file': AttrDict({
            'uri': 'http://export.arxiv.org/e-print/1605.07707',
        }),
    })
    attached_files = MockFiles({'1605.07707.tar.gz': remote_tarball})
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    workflow_obj = MockObj(record, {}, files=attached_files)
    workflow_eng = MockEng()

    task = arxiv_author_list()
    assert task(workflow_obj, workflow_eng) is None

    expected_log = 'Invalid tarball http://export.arxiv.org/e-print/1605.07707 for arxiv_id 1605.07707'
    actual_log = workflow_obj.log._info.getvalue()

    assert expected_log == actual_log
예제 #12
0
                set_core_in_extra_data,
            ],
            mark('auto-approved', False),
        ),
    ),
]

ENHANCE_RECORD = [
    IF(
        is_arxiv_paper,
        [
            populate_arxiv_document,
            arxiv_package_download,
            arxiv_plot_extract,
            arxiv_derive_inspire_categories,
            arxiv_author_list("authorlist2marcxml.xsl"),
        ]
    ),
    IF(
        is_submission,
        populate_submission_document,
    ),
    download_documents,
    normalize_journal_titles,
    refextract,
    count_reference_coreness,
    extract_journal_info,
    populate_journal_coverage,
    classify_paper(
        only_core_tags=False,
        spires=True,
예제 #13
0
class Article(object):
    """Article ingestion workflow for Literature collection.

    The ``workflow`` attribute is an ordered list of steps.  As written it
    covers, in order: schema setup and record signals, matching against
    existing or pending records (with separate branches for submissions
    and non-submissions), arXiv-specific processing, classification and
    guessing steps, a relevance check that halts or rejects the record,
    and finally upload/storage plus ticket handling for accepted records.
    """
    # Workflow identifiers; how they are consumed is not visible here.
    name = "HEP"
    data_type = "hep"

    # Ordered steps; ``IF`` / ``IF_ELSE`` wrap conditionally executed
    # sub-lists of steps.
    workflow = [
        # Make sure schema is set for proper indexing in Holding Pen
        set_schema,
        # Emit record signals to receive metadata enrichment
        emit_record_signals,
        # Query locally or via legacy search API to see if article
        # is already ingested and this is an update
        IF(article_exists, [
            mark('match-found', True),
        ]),
        IF_ELSE(
            is_submission,
            [
                # Article matching for submissions
                # ================================
                IF(pending_in_holding_pen, [
                    mark('already-in-holding-pen', True),
                ]),
                # Special RT integration for submissions
                # ======================================
                create_ticket(
                    template="literaturesuggest/tickets/curator_submitted.html",
                    queue="HEP_add_user",
                    context_factory=new_ticket_context,
                    ticket_id_key="ticket_id"),
                reply_ticket(
                    template="literaturesuggest/tickets/user_submitted.html",
                    context_factory=reply_ticket_context,
                    keep_new=True),
            ],
            [
                # Article matching for non-submissions
                # ====================================
                # Query holding pen to see if we already have this article ingested
                #
                # NOTE on updates:
                #     If the same article has been harvested before and the
                #     ingestion has been completed, process is continued
                #     to allow for updates.
                IF(pending_in_holding_pen, [
                    mark('already-in-holding-pen', True),
                    mark('delete', True),
                ]),
                IF(
                    is_arxiv_paper,
                    [
                        # FIXME: This filtering step should be removed when this
                        #        workflow includes arXiv CORE harvesting
                        IF(already_harvested, [
                            mark('already-ingested', True),
                            mark('stop', True),
                        ]),
                        # FIXME: This filtering step should be removed when:
                        #        old previously rejected records are treated
                        #        differently e.g. good auto-reject heuristics or better
                        #        time based filtering (5 days is quite random now).
                        IF(previously_rejected(), [
                            mark('already-ingested', True),
                            mark('stop', True),
                        ]),
                    ]),
                # Act on the 'delete' / 'stop' marks set by the checks above.
                IF(is_marked('delete'),
                   [update_old_object, delete_self_and_stop_processing]),
                IF(is_marked('stop'), [stop_processing]),
            ]),
        #
        # Article Processing
        # ==================
        IF(is_arxiv_paper, [
            arxiv_fulltext_download,
            arxiv_plot_extract,
            arxiv_refextract,
            arxiv_author_list("authorlist2marcxml.xsl"),
        ]),
        extract_journal_info,
        classify_paper(
            taxonomy="HEPont.rdf",
            only_core_tags=False,
            spires=True,
            with_author_keywords=True,
        ),
        filter_core_keywords,
        guess_categories,
        IF(is_experimental_paper, [
            guess_experiments,
        ]),
        guess_keywords,
        # Predict action for a generic HEP paper based only on title
        # and abstract.
        guess_coreness,  # ("arxiv_skip_astro_title_abstract.pickle)
        # Check if we shall halt or auto-reject
        # =====================================
        # NOTE: User submissions are always relevant
        IF_ELSE(is_record_relevant, [
            halt_record(action="hep_approval"),
        ], [reject_record("Article automatically rejected"), stop_processing]),
        IF_ELSE(is_record_accepted, [
            IF(article_exists, [
                IF_ELSE(is_submission, [
                    # NOTE(review): reply_ticket and close_ticket are listed
                    # after stop_processing; if stop_processing halts the
                    # workflow immediately they never run - confirm order.
                    reject_record('Article was already found on INSPIRE'),
                    stop_processing,
                    reply_ticket(
                        template=
                        "literaturesuggest/tickets/user_rejected_exists.html",
                        context_factory=reply_ticket_context),
                    close_ticket(ticket_id_key="ticket_id"),
                ], [
                    halt_record(action="merge_approval"),
                ]),
            ]),
            add_core,
            add_note_entry,
            filter_keywords,
            user_pdf_get,
            # Either push the record remotely (update or insert, depending
            # on whether the article already exists) or store it.
            IF_ELSE(shall_push_remotely, [
                IF_ELSE(article_exists, [
                    prepare_update_payload(extra_data_key="update_payload"),
                    send_robotupload(marcxml_processor=hep2marc,
                                     mode="correct",
                                     extra_data_key="update_payload"),
                ], [
                    send_robotupload(marcxml_processor=hep2marc,
                                     mode="insert"),
                ])
            ], [store_record]),
            IF(is_submission, [
                IF(curation_ticket_needed, [
                    create_ticket(
                        template="literaturesuggest/tickets/curation_core.html",
                        queue="HEP_curation",
                        context_factory=curation_ticket_context,
                        ticket_id_key="curation_ticket_id")
                ]),
                reply_ticket(
                    template="literaturesuggest/tickets/user_accepted.html",
                    context_factory=reply_ticket_context),
            ]),
        ], [
            # NOTE(review): no template is passed to this reply_ticket call,
            # unlike the others in this workflow - confirm a default applies.
            IF(is_submission,
               [reply_ticket(context_factory=reply_ticket_context)])
        ]),
        close_ticket(ticket_id_key="ticket_id")
    ]
예제 #14
0
        [stop_processing]
    ),
]


ENHANCE_RECORD = [
    # Article Processing
    # ==================
    IF(
        is_arxiv_paper,
        [
            arxiv_fulltext_download,
            arxiv_plot_extract,
            arxiv_refextract,
            arxiv_derive_inspire_categories,
            arxiv_author_list("authorlist2marcxml.xsl"),
        ]
    ),
    extract_journal_info,
    classify_paper(
        taxonomy="HEPont.rdf",
        only_core_tags=False,
        spires=True,
        with_author_keywords=True,
    ),
    filter_core_keywords,
    guess_categories,
    # TODO: adapt the output of guess_experiment so that it
    # can be stored in ElasticSearch (see issue #2054).
    # IF(
    #     is_experimental_paper,