def test_match_reference_for_data_config():
    """Test reference matcher for the data configuration.

    A record in the ``Data`` collection is created in the ``records-data``
    index with a DOI; a reference carrying the same DOI is expected to be
    linked to that data record.
    """

    cited_record_json = {
        '$schema': 'http://localhost:5000/schemas/records/data.json',
        '_collections': ['Data'],
        'control_number': 1,
        'dois': [{
            'value': '10.5281/zenodo.11020'
        }],
    }

    # The cited record lives in the data index under the ``dat`` PID type,
    # not among the literature records.
    TestRecordMetadata.create_from_kwargs(json=cited_record_json,
                                          index_name='records-data',
                                          pid_type='dat')

    reference = {
        'reference': {
            'dois': ['10.5281/zenodo.11020'],
            'publication_info': {
                'year': 2007
            }
        }
    }

    reference = match_reference(reference)

    # The matched reference must point at the data record endpoint.
    assert reference['record']['$ref'] == 'http://localhost:5000/api/data/1'
def test_match_reference_on_texkey():
    """A reference that only carries a texkey is linked to the record
    holding the same texkey, and stays schema-valid before and after."""
    texkey = 'Giudice:2007fh'

    record_json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        '_collections': ['Literature'],
        'control_number': 1,
        'document_type': ['article'],
        'texkeys': [texkey],
        'titles': [{'title': 'The Strongly-Interacting Light Higgs'}],
    }
    TestRecordMetadata.create_from_kwargs(
        json=record_json,
        index_name='records-hep',
    )

    unmatched = {'reference': {'texkey': texkey}}

    references_schema = load_schema('hep')['properties']['references']

    # The input must be valid before matching...
    assert validate([unmatched], references_schema) is None

    matched = match_reference(unmatched)
    expected_ref = 'http://localhost:5000/api/literature/1'

    assert matched['record']['$ref'] == expected_ref
    # ...and the enriched reference must still be valid afterwards.
    assert validate([matched], references_schema) is None
def test_match_reference_ignores_deleted():
    """A reference whose DOI belongs only to a record flagged as deleted
    must come back from the matcher without a ``record`` key."""
    doi = '10.1371/journal.pone.0188398'

    deleted_record_json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        '_collections': ['Literature'],
        'control_number': 1,
        'document_type': ['article'],
        'deleted': True,
        'dois': [{'value': doi}],
    }
    TestRecordMetadata.create_from_kwargs(
        json=deleted_record_json,
        index_name='records-hep',
    )

    unmatched = {'reference': {'dois': [doi]}}

    references_schema = load_schema('hep')['properties']['references']
    assert validate([unmatched], references_schema) is None

    matched = match_reference(unmatched)

    assert 'record' not in matched
# Example #4
# 0
def refextract(obj, eng):
    """Fill ``obj.data['references']`` with extracted and matched references.

    When the workflow data already contains ``references``, they are
    re-extracted from their raw form, passed through ``match_reference``,
    deduplicated and stored back; nothing else is done in that case.

    Otherwise two candidate lists are built: one from the document yielded
    by ``get_document_in_workflow`` (if any), and one from the
    ``formdata.references`` value of ``obj.extra_data`` (if non-empty).
    The longer list is stored on the workflow object; when both have the
    same non-zero length the text-based list is kept. When both are empty
    only a log message is emitted.

    Args:
        obj: a workflow object.
        eng: a workflow engine (not used by this task).

    Returns:
        None

    """
    if 'references' in obj.data:
        from_raw = extract_references_from_raw_refs(obj.data['references'])
        matched = [match_reference(reference) for reference in from_raw]
        # The logged count is taken before duplicates are dropped.
        obj.log.info('Extracted %d references from raw refs.', len(matched))
        obj.data['references'] = dedupe_list(matched)
        return

    source = get_source(obj.data)

    # Candidate 1: references found in the attached document.
    pdf_references = []
    with get_document_in_workflow(obj) as tmp_document:
        if tmp_document:
            from_pdf = extract_references_from_pdf(tmp_document, source)
            pdf_references = dedupe_list(
                [match_reference(reference) for reference in from_pdf]
            )

    # Candidate 2: references typed in by the submitter.
    text_references = []
    text = get_value(obj.extra_data, 'formdata.references')
    if text:
        from_text = extract_references_from_text(text, source)
        text_references = dedupe_list(
            [match_reference(reference) for reference in from_text]
        )

    pdf_count = len(pdf_references)
    text_count = len(text_references)

    if pdf_count == 0 and text_count == 0:
        obj.log.info('No references extracted.')
        return

    if pdf_count > text_count:
        obj.log.info('Extracted %d references from PDF.', pdf_count)
        obj.data['references'] = pdf_references
        return

    # Reaching this point means text_count >= pdf_count: ties go to the text.
    obj.log.info('Extracted %d references from text.', text_count)
    obj.data['references'] = text_references
def test_match_reference_for_jcap_and_jhep_config():
    """Check the matcher configuration used for JCAP and JHEP.

    The cited record and the reference share the same JHEP publication
    info (volume, article id, start page and year); the reference is
    expected to be linked to the cited literature record.
    """
    jhep_publication = {
        'artid': '045',
        'journal_title': 'JHEP',
        'journal_volume': '06',
        'page_start': '045',
        'year': 2007,
    }

    record_json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        '_collections': ['Literature'],
        'control_number': 1,
        'document_type': ['article'],
        # Separate copies so the record and the reference never alias.
        'publication_info': [dict(jhep_publication)],
        'titles': [{'title': 'The Strongly-Interacting Light Higgs'}],
    }
    TestRecordMetadata.create_from_kwargs(
        json=record_json,
        index_name='records-hep',
    )

    unmatched = {'reference': {'publication_info': dict(jhep_publication)}}

    references_schema = load_schema('hep')['properties']['references']
    assert validate([unmatched], references_schema) is None

    matched = match_reference(unmatched)
    expected_ref = 'http://localhost:5000/api/literature/1'

    assert matched['record']['$ref'] == expected_ref
    assert validate([matched], references_schema) is None