Exemplo n.º 1
0
def convert_marcxml(source):
    """Lazily convert a MARCXML dump into JSON records.

    ``source`` must provide ``read()``. Its content is cut into chunks with
    ``split_blob`` and each chunk is parsed with ``create_record``. The
    dojson model is chosen by the first ``_collection_in_record`` check that
    succeeds (checks run in a fixed order), defaulting to ``hep``. Each
    converted record is passed through ``strip_empty_values`` and yielded.
    """
    from dojson.contrib.marc21.utils import create_record, split_blob

    from inspirehep.dojson.utils import strip_empty_values
    from inspirehep.dojson.hep import hep
    from inspirehep.dojson.institutions import institutions
    from inspirehep.dojson.journals import journals
    from inspirehep.dojson.experiments import experiments
    from inspirehep.dojson.hepnames import hepnames
    from inspirehep.dojson.jobs import jobs
    from inspirehep.dojson.conferences import conferences

    # Probed top to bottom; the first collection found decides the model.
    models_by_collection = (
        ('institution', institutions),
        ('experiment', experiments),
        ('journals', journals),
        ('hepnames', hepnames),
        ('job', jobs),
        ('jobhidden', jobs),
        ('conferences', conferences),
    )

    for blob in split_blob(source.read()):
        marc_record = create_record(blob)
        for collection, candidate in models_by_collection:
            if _collection_in_record(marc_record, collection):
                model = candidate
                break
        else:
            # No known collection matched: treat the record as HEP.
            model = hep
        yield strip_empty_values(model.do(marc_record))
def test_superseded_institutions_from_110__x_z():
    """Paired 110 $x/$z subfields become 'superseded' related_institutes."""
    marcxml = (
        '<datafield tag="110" ind1=" " ind2=" ">'
        '  <subfield code="a">University of Pittsburgh</subfield>'
        '  <subfield code="t">U. Pittsburgh</subfield>'
        '  <subfield code="u">U. Pittsburgh (main)</subfield>'
        '  <subfield code="x">Pittsburgh U., Dept. Phil.</subfield>'
        '  <subfield code="x">Pittsburgh U., Med. School</subfield>'
        '  <subfield code="z">908047</subfield>'
        '  <subfield code="z">905042</subfield>'
        '</datafield>'
    )  # record/1272953

    # One entry per $x name, each pointing at the $z record in the same position.
    want = [
        {
            'name': 'Pittsburgh U., Dept. Phil.',
            'relation_type': 'superseded',
            'curated_relation': True,
            'record': {
                '$ref': 'http://localhost:5000/api/institutions/908047',
            },
        },
        {
            'name': 'Pittsburgh U., Med. School',
            'relation_type': 'superseded',
            'curated_relation': True,
            'record': {
                '$ref': 'http://localhost:5000/api/institutions/905042',
            },
        },
    ]
    got = institutions.do(create_record(marcxml))

    assert want == got['related_institutes']
def test_related_institutes_from__510_a_w_0():
    """510 with $w 't' yields a schema-valid 'parent' related institute."""
    subschema = load_schema('institutions')['properties']['related_institutes']

    marcxml = (
        '<datafield tag="510" ind1=" " ind2=" ">'
        '  <subfield code="0">1385404</subfield>'
        '  <subfield code="a">U. Caen (main)</subfield>'
        '  <subfield code="w">t</subfield>'
        '</datafield>'
    )  # record/1430106

    want = [
        {
            'name': 'U. Caen (main)',
            'record': {
                '$ref': 'http://localhost:5000/api/institutions/1385404',
            },
            'relation_type': 'parent',
            'curated_relation': True,
        },
    ]
    got = institutions.do(create_record(marcxml))

    # Validate against the schema first, then compare the exact content.
    assert validate(got['related_institutes'], subschema) is None
    assert want == got['related_institutes']
def test_address_from_marcxml_371__a_b_c_d_e_double_g():
    """A 371 field with a repeated identical $g gives one country_code."""
    marcxml = (
        '<datafield tag="371" ind1=" " ind2=" ">'
        '  <subfield code="a">Philosophenweg 16</subfield>'
        '  <subfield code="b">Heidelberg</subfield>'
        '  <subfield code="c">Baden-Wuerttemberg</subfield>'
        '  <subfield code="d">Germany</subfield>'
        '  <subfield code="e">69120</subfield>'
        '  <subfield code="g">DE</subfield>'
        '  <subfield code="g">DE</subfield>'
        '</datafield>'
    )

    want = [
        {
            'original_address': [
                'Philosophenweg 16',
            ],
            'postal_code': '69120',
            'city': 'Heidelberg',
            'state': 'Baden-Wuerttemberg',
            'country': 'Germany',
            'country_code': 'DE',
        },
    ]
    converted = institutions.do(create_record(marcxml))
    got = clean_record(converted)

    assert want == got['address']
def test_extra_words_from_410__decuple_g():
    """Ten 410 $g subfields are all kept, in order, as extra_words."""
    marcxml = (
        '<datafield tag="410" ind1=" " ind2=" ">'
        '  <subfield code="g">Institut Theoretische Physik,</subfield>'
        '  <subfield code="g">RWTH, Inst.</subfield>'
        '  <subfield code="g">institute A</subfield>'
        '  <subfield code="g">III. Physikalisches Institut, Technische Hochschule Aachen, Aachen, West</subfield>'
        '  <subfield code="g">physics</subfield>'
        '  <subfield code="g">52056</subfield>'
        '  <subfield code="g">D-52056</subfield>'
        '  <subfield code="g">DE-52056</subfield>'
        '  <subfield code="g">phys</subfield>'
        '  <subfield code="g">I. Physikalisches Institut</subfield>'
        '</datafield>'
    )  # record/902624

    # Same values and order as the $g subfields above.
    want = [
        'Institut Theoretische Physik,',
        'RWTH, Inst.',
        'institute A',
        'III. Physikalisches Institut, Technische Hochschule Aachen, Aachen, West',
        'physics',
        '52056',
        'D-52056',
        'DE-52056',
        'phys',
        'I. Physikalisches Institut',
    ]
    converted = institutions.do(create_record(marcxml))
    got = clean_record(converted)

    assert want == got['extra_words']
def test_address_from_marcxml_371__a_b_c_d_double_e_g():
    """Two 371 $e subfields are joined with ', ' into one postal_code."""
    marcxml = (
        '<datafield tag="371" ind1=" " ind2=" ">'
        '  <subfield code="a">Philosophenweg 16</subfield>'
        '  <subfield code="b">Heidelberg</subfield>'
        '  <subfield code="c">Baden-Wuerttemberg</subfield>'
        '  <subfield code="d">Germany</subfield>'
        '  <subfield code="e">69120</subfield>'
        '  <subfield code="e">DE-119</subfield>'
        '  <subfield code="g">DE</subfield>'
        '</datafield>'
    )

    want = [
        {
            'original_address': [
                'Philosophenweg 16',
            ],
            'postal_code': '69120, DE-119',
            'city': 'Heidelberg',
            'state': 'Baden-Wuerttemberg',
            'country': 'Germany',
            'country_code': 'DE',
        },
    ]
    converted = institutions.do(create_record(marcxml))
    got = clean_record(converted)

    assert want == got['address']
def test_related_institutes_from__510_a_w_0_successor():
    """510 with $w 'b' yields a schema-valid 'successor' related institute."""
    subschema = load_schema('institutions')['properties']['related_institutes']

    marcxml = (
        '<datafield tag="510" ind1=" " ind2=" ">'
        '  <subfield code="0">911753</subfield>'
        '  <subfield code="a">HZB, Berlin</subfield>'
        '  <subfield code="w">b</subfield>'
        '</datafield>'
    )  # record/902831

    want = [
        {
            'name': 'HZB, Berlin',
            'record': {
                '$ref': 'http://localhost:5000/api/institutions/911753',
            },
            'relation_type': 'successor',
            'curated_relation': True,
        },
    ]
    got = institutions.do(create_record(marcxml))

    # Validate against the schema first, then compare the exact content.
    assert validate(got['related_institutes'], subschema) is None
    assert want == got['related_institutes']
def test_address_from_marcxml_371__a_b_c_d_e_double_g():
    """A 371 field with a repeated $g gives one schema-valid address.

    The expected address carries no 'country' key and 'original_address'
    is a plain string.
    """
    subschema = load_schema('institutions')['properties']['address']

    marcxml = (
        '<datafield tag="371" ind1=" " ind2=" ">'
        '  <subfield code="a">Philosophenweg 16</subfield>'
        '  <subfield code="b">Heidelberg</subfield>'
        '  <subfield code="c">Baden-Wuerttemberg</subfield>'
        '  <subfield code="d">Germany</subfield>'
        '  <subfield code="e">69120</subfield>'
        '  <subfield code="g">DE</subfield>'
        '  <subfield code="g">DE</subfield>'
        '</datafield>'
    )

    want = [
        {
            'original_address': 'Philosophenweg 16',
            'postal_code': '69120',
            'city': 'Heidelberg',
            'state': 'Baden-Wuerttemberg',
            'country_code': 'DE',
        },
    ]
    got = institutions.do(create_record(marcxml))

    assert validate(got['address'], subschema) is None
    assert want == got['address']
def test_related_institutes_from__510_a_w_0_other():
    """510 with $w 'r' yields a schema-valid 'other' related institute."""
    subschema = load_schema('institutions')['properties']['related_institutes']

    marcxml = (
        '<datafield tag="510" ind1=" " ind2=" ">'
        '  <subfield code="0">945696</subfield>'
        '  <subfield code="a">UMass Amherst</subfield>'
        '  <subfield code="w">r</subfield>'
        '</datafield>'
    )  # record/902971

    want = [
        {
            'name': 'UMass Amherst',
            'record': {
                '$ref': 'http://localhost:5000/api/institutions/945696',
            },
            'relation_type': 'other',
            'curated_relation': True,
        },
    ]
    got = institutions.do(create_record(marcxml))

    # Validate against the schema first, then compare the exact content.
    assert validate(got['related_institutes'], subschema) is None
    assert want == got['related_institutes']
def test_ICN_legacy_ICN_institution_department_and_department_acryonym_from_110__a_b_t_u():
    """110 $a/$b/$t/$u fill ICN, legacy_ICN, institution and department keys.

    The trailing '(PTM)' in $b is expected to be split off into
    'department_acronym'.
    """
    properties = load_schema('institutions')['properties']

    marcxml = (
        '<datafield tag="110" ind1=" " ind2=" ">'
        '  <subfield code="a">Université Libre de Bruxelles</subfield>'
        '  <subfield code="b">Physique Theorique et Mathematique (PTM)</subfield>'
        '  <subfield code="t">U. Libre Brussels, PTM</subfield>'
        '  <subfield code="u">Brussels U., PTM</subfield>'
        '</datafield>'
    )  # record/909579

    want = {
        'ICN': [
            'U. Libre Brussels, PTM',
        ],
        'legacy_ICN': 'Brussels U., PTM',
        'institution': [
            u'Université Libre de Bruxelles',
        ],
        'department': [
            'Physique Theorique et Mathematique',
        ],
        'department_acronym': 'PTM',
    }
    got = institutions.do(create_record(marcxml))

    # Only the keys listed in ``want`` are checked; each must be valid and equal.
    for key, value in want.items():
        assert validate(got[key], properties[key]) is None
        assert got[key] == value
def test_ICN_legacy_ICN_institution_and_institution_acronym_from_110__a_t_u():
    """110 $a/$t/$u fill ICN, legacy_ICN, institution and its acronym.

    The trailing '(CERN)' in $a is expected to be split off into
    'institution_acronym'.
    """
    properties = load_schema('institutions')['properties']

    marcxml = (
        '<datafield tag="110" ind1=" " ind2=" ">'
        '  <subfield code="a">European Organization for Nuclear Research (CERN)</subfield>'
        '  <subfield code="t">CERN, Geneva</subfield>'
        '  <subfield code="u">CERN</subfield>'
        '</datafield>'
    )  # record/902725

    want = {
        'ICN': [
            'CERN, Geneva',
        ],
        'legacy_ICN': 'CERN',
        'institution': [
            'European Organization for Nuclear Research',
        ],
        'institution_acronym': 'CERN',
    }
    got = institutions.do(create_record(marcxml))

    # Only the keys listed in ``want`` are checked; each must be valid and equal.
    for key, value in want.items():
        assert validate(got[key], properties[key]) is None
        assert got[key] == value
Exemplo n.º 12
0
def test_core_from_690c_a_core():
    """690C $a 'CORE' results in a truthy 'core' value."""
    marcxml = (
        '<datafield tag="690" ind1="C" ind2=" ">'
        '  <subfield code="a">CORE</subfield>'
        '</datafield>'
    )  # record/902645

    converted = institutions.do(create_record(marcxml))
    got = strip_empty_values(converted)

    assert got['core']
def test_no_location_from_034__f():
    """A 034 field carrying only $f produces no 'location' key."""
    marcxml = (
        '<datafield tag="034" ind1=" " ind2=" ">'
        '  <subfield code="f">50.7736</subfield>'
        '</datafield>'
    )  # synthetic data

    got = institutions.do(create_record(marcxml))

    assert 'location' not in got
def test_core_from_690c_a_noncore():
    """690C $a 'NONCORE' results in a falsy 'core' value."""
    # The subfield attribute must be quoted on both sides (code="a");
    # otherwise the fixture is not well-formed XML and the 690C $a rule
    # is never exercised with the intended input.
    snippet = (
        '<datafield tag="690" ind1="C" ind2=" ">'
        '  <subfield code="a">NONCORE</subfield>'
        '</datafield>'
    )  # record/916025

    result = clean_record(institutions.do(create_record(snippet)))

    assert not result['core']
def test_no_location_from_034__d():
    """A 034 field carrying only $d produces no 'location' key."""
    marcxml = (
        '<datafield tag="034" ind1=" " ind2=" ">'
        '  <subfield code="d">6.07532</subfield>'
        '</datafield>'
    )  # synthetic data

    converted = institutions.do(create_record(marcxml))
    got = clean_record(converted)

    assert 'location' not in got
Exemplo n.º 16
0
def test_no_location_from_invalid_034__d_f():
    """Non-numeric 034 $d/$f values produce no 'location' key."""
    marcxml = (
        '<datafield tag="034" ind1=" " ind2=" ">'
        '  <subfield code="d">foo</subfield>'
        '  <subfield code="f">bar</subfield>'
        '</datafield>'
    )  # synthetic data

    converted = institutions.do(create_record(marcxml))
    got = strip_empty_values(converted)

    assert 'location' not in got
def test_timezone_from_043__t():
    """043 $t is exposed as a one-element 'timezone' list."""
    marcxml = (
        '<datafield tag="043" ind1=" " ind2=" ">'
        '  <subfield code="t">+05</subfield>'
        '</datafield>'
    )  # record/902635

    want = ['+05']
    converted = institutions.do(create_record(marcxml))
    got = clean_record(converted)

    assert want == got['timezone']
def test_no_location_from_034__double_d():
    """A 034 field with two $d subfields produces no 'location' key."""
    marcxml = (
        '<datafield tag="034" ind1=" " ind2=" ">'
        '   <subfield code="d">32.540776</subfield>'
        '   <subfield code="d">15.561010</subfield>'
        '</datafield>'
    )  # record/1442294

    got = institutions.do(create_record(marcxml))

    assert 'location' not in got
def test_field_activity_from_372__a():
    """372 $a 'Research center' is reported as 'Research Center'."""
    marcxml = (
        '<datafield tag="372" ind1=" " ind2=" ">'
        '  <subfield code="a">Research center</subfield>'
        '</datafield>'
    )

    want = ['Research Center']
    converted = institutions.do(create_record(marcxml))
    got = clean_record(converted)

    assert want == got['field_activity']
Exemplo n.º 20
0
def create_record(data, force=False, dry_run=False):
    """Convert one MARCXML record to JSON and optionally store it.

    The record is parsed with ``marc_create_record`` and converted with the
    dojson model selected by the first matching ``_collection_in_record``
    check (``hep`` when none matches). Empty values are stripped from the
    result.

    :param data: MARCXML of a single record.
    :param force: when true and the converted JSON has a ``control_number``
        or ``recid`` key, create or overwrite the stored record.
    :param dry_run: when true, only convert; nothing is merged into the
        database session.
    :returns: ``(recid, json)`` on a dry run; ``(control_number, record)``
        after the record has been stored; ``None`` otherwise (``force`` is
        false or the JSON carries no identifier).
    :raises Exception: any error raised during conversion or storage is
        re-raised after being recorded.
    """
    marc_record = marc_create_record(data)
    recid = None
    if '001' in marc_record:
        recid = int(marc_record['001'][0])

    # Stays None on dry runs and when there is no (truthy) recid. The error
    # handler below must check it: previously it tested ``recid`` only, so a
    # failing dry run raised UnboundLocalError and hid the real error.
    prod_record = None
    if not dry_run and recid:
        prod_record = InspireProdRecords(recid=recid)
        prod_record.marcxml = data

    try:
        if _collection_in_record(marc_record, 'institution'):
            model = institutions
        elif _collection_in_record(marc_record, 'experiment'):
            model = experiments
        elif _collection_in_record(marc_record, 'journals'):
            model = journals
        elif _collection_in_record(marc_record, 'hepnames'):
            model = hepnames
        elif _collection_in_record(marc_record, 'job') or \
                _collection_in_record(marc_record, 'jobhidden'):
            model = jobs
        elif _collection_in_record(marc_record, 'conferences'):
            model = conferences
        else:
            model = hep
        json_record = strip_empty_values(model.do(marc_record))

        if dry_run:
            return recid, json_record

        if force and any(
            key in json_record for key in ('control_number', 'recid')
        ):
            # ``control_number`` takes precedence over ``recid``.
            try:
                control_number = json_record['control_number']
            except KeyError:
                control_number = json_record['recid']
            control_number = int(control_number)
            # Searches if record already exists.
            stored = Record.get_record(control_number)
            if stored is None:
                # Adds the record to the db session.
                rec = RecordModel(id=control_number)
                db.session.merge(rec)
                stored = Record.create(json_record)
            else:
                stored = Record(json_record, model=stored.model)
                stored.commit()
            if prod_record is not None:
                prod_record.successful = True
                db.session.merge(prod_record)
            logger.info("Elaborated record {}".format(control_number))
            return control_number, dict(stored)
    except Exception:
        if prod_record is not None:
            prod_record.successful = False
            db.session.merge(prod_record)
        if recid:
            logger.exception("Error in elaborating record ID {}".format(recid))
        raise
def test_name_from_110__a():
    """110 $a alone gives a single-entry nested 'name' list."""
    marcxml = (
        '<datafield tag="110" ind1=" " ind2=" ">'
        '  <subfield code="a">Mid-America Christian U.</subfield>'
        '</datafield>'
    )  # record/1439728

    want = [['Mid-America Christian U.']]
    converted = institutions.do(create_record(marcxml))
    got = clean_record(converted)

    assert want == got['name']
def test_hidden_notes_from_595__a():
    """595 $a text (including non-ASCII characters) lands in hidden_notes."""
    marcxml = (
        '<datafield tag="595" ind1=" " ind2=" ">'
        '  <subfield code="a">The Division is located inside the Department of Physics and Astronomy of the University of Catania Scientific Campus ("Città Universitaria" or "Cittadella"). Via Santa Sofia 64 95123 CATANIA</subfield>'
        '</datafield>'
    )  # record/902879

    want = [
        u'The Division is located inside the Department of Physics and Astronomy of the University of Catania Scientific Campus ("Città Universitaria" or "Cittadella"). Via Santa Sofia 64 95123 CATANIA',
    ]
    converted = institutions.do(create_record(marcxml))
    got = clean_record(converted)

    assert want == got['hidden_notes']
def test_non_public_notes_from_667__a():
    """667 $a text lands in non_public_notes."""
    marcxml = (
        '<datafield tag="667" ind1=" " ind2=" ">'
        '  <subfield code="a">Former ICN = Negev U.</subfield>'
        '</datafield>'
    )  # record/902663

    want = ['Former ICN = Negev U.']
    converted = institutions.do(create_record(marcxml))
    got = clean_record(converted)

    assert want == got['non_public_notes']
def test_location_from_034__f():
    """034 $f alone gives a 'location' holding only a float latitude."""
    marcxml = (
        '<datafield tag="034" ind1=" " ind2=" ">'
        '  <subfield code="f">50.7736</subfield>'
        '</datafield>'
    )  # synthetic data

    want = {'latitude': 50.7736}
    converted = institutions.do(create_record(marcxml))
    got = clean_record(converted)

    assert want == got['location']
def test_historical_data_from_6781_a():
    """678 (ind1 '1') $a text lands unchanged in historical_data."""
    marcxml = (
        '<datafield tag="678" ind1="1" ind2=" ">'
        '  <subfield code="a">Became IFH (Inst for Hochenergiephysik)in 1968. Since 1992 the official name of the Inst. is simply DESY Zeuthen. Changed 1/26/99 AMR</subfield>'
        '</datafield>'
    )  # record/902666

    want = [
        'Became IFH (Inst for Hochenergiephysik)in 1968. Since 1992 the official name of the Inst. is simply DESY Zeuthen. Changed 1/26/99 AMR',
    ]
    converted = institutions.do(create_record(marcxml))
    got = clean_record(converted)

    assert want == got['historical_data']
def test_public_notes_from_680__a():
    """680 note text (including non-ASCII characters) lands in public_notes.

    NOTE(review): the fixture uses subfield ``i`` although the test name
    says ``a`` -- confirm which one is intended.
    """
    marcxml = (
        '<datafield tag="680" ind1=" " ind2=" ">'
        '  <subfield code="i">2nd address: Organisation Européenne pour la Recherche Nucléaire (CERN), F-01631 Prévessin Cedex, France</subfield>'
        '</datafield>'
    )  # record/902725

    want = [
        u'2nd address: Organisation Européenne pour la Recherche Nucléaire (CERN), F-01631 Prévessin Cedex, France',
    ]
    converted = institutions.do(create_record(marcxml))
    got = clean_record(converted)

    assert want == got['public_notes']
def test_name_variants_from_410__9_with_invalid_source():
    """A 410 field whose $9 is 'Tech' yields no name_variants."""
    marcxml = (
        '<datafield tag="410" ind1=" " ind2=" ">'
        '  <subfield code="9">Tech</subfield>'
        '  <subfield code="a">CIIT</subfield>'
        '  <subfield code="g">Inst</subfield>'
        '</datafield>'
    )  # record/1338296

    want = {}
    got = institutions.do(create_record(marcxml))

    # A missing key counts the same as an empty mapping.
    assert want == got.get('name_variants', {})
def test_name_from_110__b_t_u():
    """Without $a, 'name' holds the 110 $u value followed by the $t value."""
    marcxml = (
        '<datafield tag="110" ind1=" " ind2=" ">'
        '   <subfield code="b">Institute of Physics</subfield>'
        '   <subfield code="t">Inst. Phys., Belgrade</subfield>'
        '   <subfield code="u">Belgrade, Inst. Phys.</subfield>'
        '</datafield>'
    )   # record/903416

    want = [['Belgrade, Inst. Phys.', 'Inst. Phys., Belgrade']]
    converted = institutions.do(create_record(marcxml))
    got = clean_record(converted)

    assert want == got['name']
def test_name_from_110__a_b_u():
    """'name' holds the 110 $a value followed by the $u value; $b is left out."""
    marcxml = (
        '<datafield tag="110" ind1=" " ind2=" ">'
        '  <subfield code="a">Fukushima University</subfield>'
        '  <subfield code="b">Department of Physics</subfield>'
        '  <subfield code="u">Fukushima U.</subfield>'
        '</datafield>'
    )  # record/902812

    want = [['Fukushima University', 'Fukushima U.']]
    converted = institutions.do(create_record(marcxml))
    got = clean_record(converted)

    assert want == got['name']
Exemplo n.º 30
0
def test_location_from_034__d():
    """034 $d alone gives a 'location' holding only a float longitude."""
    marcxml = (
        '<datafield tag="034" ind1=" " ind2=" ">'
        '  <subfield code="d">6.07532</subfield>'
        '</datafield>'
    )  # synthetic data

    want = {'longitude': 6.07532}
    converted = institutions.do(create_record(marcxml))
    got = strip_empty_values(converted)

    assert want == got['location']