def convert_marcxml(source):
    """Yield one cleaned JSON record per MARCXML record found in ``source``.

    ``source`` must expose ``read()``; its whole content is read at once and
    split into individual record blobs. Every record is converted with the
    dojson model of the first collection it matches (in the order of the
    table below), falling back to ``hep``, and the output is passed through
    ``strip_empty_values`` before being yielded.
    """
    from dojson.contrib.marc21.utils import create_record, split_blob
    from inspirehep.dojson.utils import strip_empty_values
    from inspirehep.dojson.hep import hep
    from inspirehep.dojson.institutions import institutions
    from inspirehep.dojson.journals import journals
    from inspirehep.dojson.experiments import experiments
    from inspirehep.dojson.hepnames import hepnames
    from inspirehep.dojson.jobs import jobs
    from inspirehep.dojson.conferences import conferences

    # Order matters: the first entry with a matching collection wins.
    models_by_collection = (
        (('institution',), institutions),
        (('experiment',), experiments),
        (('journals',), journals),
        (('hepnames',), hepnames),
        (('job', 'jobhidden'), jobs),
        (('conferences',), conferences),
    )

    for blob in split_blob(source.read()):
        marc = create_record(blob)
        for collections, model in models_by_collection:
            if any(_collection_in_record(marc, name) for name in collections):
                break
        else:
            # No known collection on the record: treat it as literature.
            model = hep
        yield strip_empty_values(model.do(marc))
def test_multiple_issn_from_marcxml_022():
    """Two 022 fields yield two ISSN entries with lower-cased media."""
    marcxml = (
        '<record>'
        ' <datafield tag="022" ind1=" " ind2=" ">'
        ' <subfield code="a">2349-2716</subfield>'
        ' <subfield code="b">Online</subfield>'
        ' </datafield>'
        ' <datafield tag="022" ind1=" " ind2=" ">'
        ' <subfield code="a">2349-6088</subfield>'
        ' <subfield code="b">Print</subfield>'
        ' </datafield>'
        '</record>'
    )
    record = create_record(marcxml)
    converted = clean_record(journals.do(record))

    expected_issn = [
        {'medium': 'online', 'value': '2349-2716'},
        {'medium': 'print', 'value': '2349-6088'},
    ]
    assert expected_issn == converted['issn']
def test_coden_from_double_030__a_2():
    """Two 030 fields give two schema-valid ``coden`` values, in order."""
    coden_schema = load_schema('journals')['properties']['coden']

    marcxml = (
        '<record>'
        ' <datafield tag="030" ind1=" " ind2=" ">'
        ' <subfield code="2">CODEN</subfield>'
        ' <subfield code="a">00686</subfield>'
        ' </datafield>'
        ' <datafield tag="030" ind1=" " ind2=" ">'
        ' <subfield code="2">CODEN</subfield>'
        ' <subfield code="a">VLUFB</subfield>'
        ' </datafield>'
        '</record>'
    )  # record/1213834
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_coden = ['00686', 'VLUFB']
    assert validate(converted['coden'], coden_schema) is None
    assert expected_coden == converted['coden']
def test_issn_from_double_022__a_b():
    """Two 022 fields with $a and $b give two schema-valid ISSN entries."""
    issn_schema = load_schema('journals')['properties']['issn']

    marcxml = (
        '<record>'
        ' <datafield tag="022" ind1=" " ind2=" ">'
        ' <subfield code="a">1812-9471</subfield>'
        ' <subfield code="b">Print</subfield>'
        ' </datafield>'
        ' <datafield tag="022" ind1=" " ind2=" ">'
        ' <subfield code="a">1817-5805</subfield>'
        ' <subfield code="b">Online</subfield>'
        ' </datafield>'
        '</record>'
    )  # record/1513418
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_issn = [
        {'medium': 'print', 'value': '1812-9471'},
        {'medium': 'online', 'value': '1817-5805'},
    ]
    assert validate(converted['issn'], issn_schema) is None
    assert expected_issn == converted['issn']
def test_publisher_from_643__b():
    """A single 643 $b becomes a one-element ``publisher`` list."""
    marcxml = (
        '<datafield tag="643" ind1=" " ind2=" ">'
        ' <subfield code="b">ANITA PUBLICATIONS, INDIA</subfield>'
        '</datafield>'
    )  # record/1211888
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_publisher = ['ANITA PUBLICATIONS, INDIA']
    assert expected_publisher == converted['publisher']
def test_issn_from_marcxml_022_with_b_no_a():
    """A 022 field carrying only $b (no $a) must not produce ``issn``."""
    marcxml = (
        '<record>'
        ' <datafield tag="022" ind1=" " ind2=" ">'
        ' <subfield code="b">9780486632827</subfield>'
        ' </datafield> '
        '</record>'
    )
    record = create_record(marcxml)
    converted = journals.do(record)

    assert 'issn' not in converted
def create_record(data, force=False, dry_run=False):
    """Convert one MARCXML blob to JSON and optionally store it.

    The blob is parsed with ``marc_create_record`` and converted with the
    dojson model of the first collection found on it (``hep`` when none
    matches); the output goes through ``strip_empty_values``.

    :param data: MARCXML of a single record.
    :param force: must be true, and the JSON must carry ``control_number``
        or ``recid``, for the record to be written.
    :param dry_run: when true, return ``(recid, json)`` right after the
        conversion without touching the database.
    :returns: ``(recid, json)`` in dry-run mode, ``(control_number,
        dict(record))`` once the record has been stored.
    :raises Exception: any error from conversion or storage is logged and
        re-raised; for tracked records the failure is also recorded on the
        ``InspireProdRecords`` row.

    NOTE(review): when ``force`` is false, or the JSON has neither
    ``control_number`` nor ``recid``, nothing is stored and the function
    falls through returning ``None`` -- confirm this is intended.
    """
    record = marc_create_record(data)

    recid = None
    if '001' in record:
        recid = int(record['001'][0])

    # The outcome is only tracked for real (non dry-run) runs of records that
    # carry an id. Keeping an explicit ``None`` otherwise lets the error
    # handler below run safely in dry-run mode instead of failing with a
    # NameError that would mask the original exception.
    prod_record = None
    if not dry_run and recid:
        prod_record = InspireProdRecords(recid=recid)
        prod_record.marcxml = data

    try:
        if _collection_in_record(record, 'institution'):
            json = strip_empty_values(institutions.do(record))
        elif _collection_in_record(record, 'experiment'):
            json = strip_empty_values(experiments.do(record))
        elif _collection_in_record(record, 'journals'):
            json = strip_empty_values(journals.do(record))
        elif _collection_in_record(record, 'hepnames'):
            json = strip_empty_values(hepnames.do(record))
        elif _collection_in_record(record, 'job') or \
                _collection_in_record(record, 'jobhidden'):
            json = strip_empty_values(jobs.do(record))
        elif _collection_in_record(record, 'conferences'):
            json = strip_empty_values(conferences.do(record))
        else:
            json = strip_empty_values(hep.do(record))

        if dry_run:
            return recid, json

        if force and any(key in json for key in ('control_number', 'recid')):
            try:
                control_number = json['control_number']
            except KeyError:
                control_number = json['recid']
            control_number = int(control_number)

            # Searches if record already exists.
            record = Record.get_record(control_number)
            if record is None:
                # Adds the record to the db session.
                rec = RecordModel(id=control_number)
                db.session.merge(rec)
                record = Record.create(json)
            else:
                record = Record(json, model=record.model)
                record.commit()

            if prod_record is not None:
                prod_record.successful = True
                db.session.merge(prod_record)
            logger.info("Elaborated record {}".format(control_number))
            return control_number, dict(record)
    except Exception:
        if prod_record is not None:
            prod_record.successful = False
            db.session.merge(prod_record)
        logger.exception("Error in elaborating record ID {}".format(recid))
        raise
def test_coden_from_030__a_2():
    """A single 030 field with $2 CODEN puts its $a value in ``coden``."""
    marcxml = (
        '<datafield tag="030" ind1=" " ind2=" ">'
        ' <subfield code="2">CODEN</subfield>'
        ' <subfield code="a">HERAS</subfield>'
        '</datafield>'
    )  # record/1211568
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_coden = ['HERAS']
    assert expected_coden == converted['coden']
def test_publisher_from_643__b():
    """A single 643 $b survives ``clean_record`` as a one-element list."""
    marcxml = (
        '<datafield tag="643" ind1=" " ind2=" ">'
        ' <subfield code="b">ANITA PUBLICATIONS, INDIA</subfield>'
        '</datafield>'
    )  # record/1211888
    record = create_record(marcxml)
    converted = clean_record(journals.do(record))

    expected_publisher = ['ANITA PUBLICATIONS, INDIA']
    assert expected_publisher == converted['publisher']
def test_issn_from_marcxml_022_with_b_no_a():
    """After cleaning, a 022 field with only $b leaves no ``issn`` key."""
    marcxml = (
        '<record>'
        ' <datafield tag="022" ind1=" " ind2=" ">'
        ' <subfield code="b">9780486632827</subfield>'
        ' </datafield> '
        '</record>'
    )
    record = create_record(marcxml)
    converted = clean_record(journals.do(record))

    assert 'issn' not in converted
def test_coden_from_030__a_2():
    """A single 030 $a with $2 CODEN survives ``clean_record``."""
    marcxml = (
        '<datafield tag="030" ind1=" " ind2=" ">'
        ' <subfield code="2">CODEN</subfield>'
        ' <subfield code="a">HERAS</subfield>'
        '</datafield>'
    )  # record/1211568
    record = create_record(marcxml)
    converted = clean_record(journals.do(record))

    expected_coden = ['HERAS']
    assert expected_coden == converted['coden']
def create_record(recid, record, force=False, dry_run=False, validation=False):
    """Convert a MARC record to JSON, optionally validate and store it.

    The dojson model is chosen from the first collection found on ``record``
    (``hep`` when none matches) and the output goes through
    ``strip_empty_values``.

    :param recid: identifier used in the validation message and returned in
        dry-run mode.
    :param record: MARC record as accepted by the dojson models.
    :param force: must be true, and the JSON must carry ``control_number``
        or ``recid``, for the record to be written.
    :param dry_run: when true, return right after conversion/validation.
    :param validation: when true, run ``validate`` on the JSON; a
        ``ValidationError`` is logged as a warning and reported through the
        returned ``errors`` string instead of being raised.
    :returns: ``(errors, recid, json)`` in dry-run mode, otherwise
        ``(errors, control_number, dict(record))`` once stored.

    NOTE(review): when the ``force``/identifier condition is not met the
    function falls through and returns ``None`` -- confirm with callers.
    """
    # Order matters: the first entry with a matching collection wins.
    models_by_collection = (
        (('institution',), institutions),
        (('experiment',), experiments),
        (('journals',), journals),
        (('hepnames',), hepnames),
        (('job', 'jobhidden'), jobs),
        (('conferences',), conferences),
    )
    for collections, model in models_by_collection:
        if any(_collection_in_record(record, name) for name in collections):
            break
    else:
        model = hep
    json_record = strip_empty_values(model.do(record))

    errors = ""
    if validation:
        try:
            validate(json_record)
        except ValidationError as err:
            errors = "ValidationError: Record {0}: {1}".format(recid, err)
            current_app.logger.warning(errors)

    if dry_run:
        return errors, recid, json_record

    if force and any(key in json_record for key in ('control_number', 'recid')):
        # Prefer 'control_number', fall back to 'recid'.
        control_number = int(
            json_record['control_number']
            if 'control_number' in json_record
            else json_record['recid']
        )

        with db.session.begin_nested():
            # Look for an already stored record with this id.
            existing = Record.get_record(control_number)
            if existing is None:
                # Register the model row in the session, then create.
                db.session.merge(RecordModel(id=control_number))
                stored = Record.create(json_record)
            else:
                stored = Record(json_record, model=existing.model)
                stored.commit()

        logger.info("Elaborated record {}".format(control_number))
        return errors, control_number, dict(stored)
def test_publisher_from_643__b():
    """A single 643 $b gives a schema-valid one-element ``publisher``."""
    publisher_schema = load_schema('journals')['properties']['publisher']

    marcxml = (
        '<datafield tag="643" ind1=" " ind2=" ">'
        ' <subfield code="b">ANITA PUBLICATIONS, INDIA</subfield>'
        '</datafield>'
    )  # record/1211888
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_publisher = [
        'ANITA PUBLICATIONS, INDIA',
    ]
    assert validate(converted['publisher'], publisher_schema) is None
    assert expected_publisher == converted['publisher']
def test_short_titles_from_marcxml_711():
    """A 711 $a becomes a ``short_titles`` entry with a ``title`` key."""
    marcxml = (
        '<record>'
        ' <datafield tag="711" ind1=" " ind2=" ">'
        ' <subfield code="a">Phys.Rev.ST Accel.Beams</subfield>'
        ' </datafield>'
        '</record>'
    )
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_short_titles = [{'title': 'Phys.Rev.ST Accel.Beams'}]
    assert expected_short_titles == converted['short_titles']
def test_issn_from_022__a_b_electronic():
    """$b 'electronic' maps to medium 'online' and is kept as comment."""
    marcxml = (
        '<datafield tag="022" ind1=" " ind2=" ">'
        ' <subfield code="a">2469-9888</subfield>'
        ' <subfield code="b">electronic</subfield>'
        '</datafield>'
    )  # record/1415879
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_issn = [
        {'comment': 'electronic', 'medium': 'online', 'value': '2469-9888'},
    ]
    assert expected_issn == converted['issn']
def test_title_variants_from_marcxml_730():
    """A 730 $a becomes a ``title_variants`` entry with a ``title`` key."""
    marcxml = (
        '<record>'
        ' <datafield tag="730" ind1=" " ind2=" ">'
        ' <subfield code="a">PHYSICAL REVIEW SPECIAL TOPICS ACCELERATORS AND BEAMS</subfield>'
        ' </datafield>'
        '</record>'
    )
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_variants = [
        {'title': 'PHYSICAL REVIEW SPECIAL TOPICS ACCELERATORS AND BEAMS'},
    ]
    assert expected_variants == converted['title_variants']
def test_titles_from_marcxml_130_with_single_a():
    """A 130 field with only $a becomes a ``titles`` entry with ``title``."""
    marcxml = (
        '<record>'
        ' <datafield tag="130" ind1=" " ind2=" ">'
        ' <subfield code="a">Physical Review Special Topics - Accelerators and Beams</subfield>'
        ' </datafield>'
        '</record>'
    )
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_titles = [
        {'title': 'Physical Review Special Topics - Accelerators and Beams'},
    ]
    assert expected_titles == converted['titles']
def test_issn_from_marcxml_022_with_a():
    """A 022 field with only $a gives an ISSN entry without a medium."""
    marcxml = (
        '<record>'
        ' <datafield tag="022" ind1=" " ind2=" ">'
        ' <subfield code="a">2213-1337</subfield>'
        ' </datafield> '
        '</record>'
    )
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_issn = [{'value': '2213-1337'}]
    assert expected_issn == converted['issn']
def test_coden_from_030__a_2():
    """A single 030 $a with $2 CODEN gives a schema-valid ``coden`` list."""
    coden_schema = load_schema('journals')['properties']['coden']

    marcxml = (
        '<datafield tag="030" ind1=" " ind2=" ">'
        ' <subfield code="2">CODEN</subfield>'
        ' <subfield code="a">HERAS</subfield>'
        '</datafield>'
    )  # record/1211568
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_coden = [
        'HERAS',
    ]
    assert validate(converted['coden'], coden_schema) is None
    assert expected_coden == converted['coden']
def test_publisher_from_double_643__b():
    """Two 643 $b values are collected into ``publisher`` in order."""
    marcxml = (
        '<record>'
        ' <datafield tag="643" ind1=" " ind2=" ">'
        ' <subfield code="b">Elsevier</subfield>'
        ' </datafield>'
        ' <datafield tag="643" ind1=" " ind2=" ">'
        ' <subfield code="b">Science Press</subfield>'
        ' </datafield>'
        '</record>'
    )  # record/1212635
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_publisher = ['Elsevier', 'Science Press']
    assert expected_publisher == converted['publisher']
def test_title_variants_from_730__a():
    """A 730 $a gives a schema-valid ``title_variants`` entry."""
    variants_schema = load_schema('journals')['properties']['title_variants']

    marcxml = (
        '<datafield tag="730" ind1=" " ind2=" ">'
        ' <subfield code="a">PHYSICAL REVIEW SPECIAL TOPICS ACCELERATORS AND BEAMS</subfield>'
        '</datafield>'
    )  # record/1212820
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_variants = [
        {
            'title': 'PHYSICAL REVIEW SPECIAL TOPICS ACCELERATORS AND BEAMS',
        },
    ]
    assert validate(converted['title_variants'], variants_schema) is None
    assert expected_variants == converted['title_variants']
def overdo_marc_dict(record):
    """Convert a MARC record dict to cleaned JSON with the matching model.

    The model of the first collection found on ``record`` is used (checked
    in the order of the table below); ``hep`` is the fallback. The model
    output is passed through ``clean_record`` before being returned.
    """
    # Order matters: the first entry with a matching collection wins.
    models_by_collection = (
        (('institution',), institutions),
        (('experiment',), experiments),
        (('journals',), journals),
        (('hepnames',), hepnames),
        (('job', 'jobhidden'), jobs),
        (('conferences',), conferences),
    )
    for collections, model in models_by_collection:
        if any(_collection_in_record(record, name) for name in collections):
            return clean_record(model.do(record))
    return clean_record(hep.do(record))
def test_titles_from_marcxml_130_with_single_a():
    """A 130 field with only $a survives cleaning as a ``titles`` entry."""
    marcxml = (
        '<record>'
        ' <datafield tag="130" ind1=" " ind2=" ">'
        ' <subfield code="a">Physical Review Special Topics - Accelerators and Beams</subfield>'
        ' </datafield>'
        '</record>'
    )
    record = create_record(marcxml)
    converted = clean_record(journals.do(record))

    expected_titles = [
        {'title': 'Physical Review Special Topics - Accelerators and Beams'},
    ]
    assert expected_titles == converted['titles']
def test_short_titles_from_711__a():
    """A 711 $a gives a schema-valid ``short_titles`` entry."""
    short_titles_schema = load_schema('journals')['properties']['short_titles']

    marcxml = (
        '<datafield tag="711" ind1=" " ind2=" ">'
        ' <subfield code="a">Phys.Rev.ST Accel.Beams</subfield>'
        '</datafield>'
    )  # record/1212820
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_short_titles = [
        {
            'title': 'Phys.Rev.ST Accel.Beams',
        },
    ]
    assert validate(converted['short_titles'], short_titles_schema) is None
    assert expected_short_titles == converted['short_titles']
def test_title_variants_from_marcxml_730():
    """A 730 $a survives cleaning as a ``title_variants`` entry."""
    marcxml = (
        '<record>'
        ' <datafield tag="730" ind1=" " ind2=" ">'
        ' <subfield code="a">PHYSICAL REVIEW SPECIAL TOPICS ACCELERATORS AND BEAMS</subfield>'
        ' </datafield>'
        '</record>'
    )
    record = create_record(marcxml)
    converted = clean_record(journals.do(record))

    expected_variants = [
        {'title': 'PHYSICAL REVIEW SPECIAL TOPICS ACCELERATORS AND BEAMS'},
    ]
    assert expected_variants == converted['title_variants']
def overdo_marc_dict(record):
    """Convert a MARC record dict to JSON with the matching dojson model.

    The model of the first collection found on ``record`` is used (checked
    in the order of the table below); ``hep`` is the fallback. The model
    output is returned as-is, without any cleaning step.
    """
    # Order matters: the first entry with a matching collection wins.
    models_by_collection = (
        (('institution',), institutions),
        (('experiment',), experiments),
        (('journals',), journals),
        (('hepnames',), hepnames),
        (('job', 'jobhidden'), jobs),
        (('conferences',), conferences),
    )
    for collections, model in models_by_collection:
        if any(_collection_in_record(record, name) for name in collections):
            return model.do(record)
    return hep.do(record)
def test_issn_from_022__a():
    """A 022 field with only $a gives a schema-valid ISSN without medium."""
    issn_schema = load_schema('journals')['properties']['issn']

    marcxml = (
        '<datafield tag="022" ind1=" " ind2=" ">'
        ' <subfield code="a">2213-1337</subfield>'
        '</datafield> '
    )  # record/1445059
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_issn = [
        {
            'value': '2213-1337',
        },
    ]
    assert validate(converted['issn'], issn_schema) is None
    assert expected_issn == converted['issn']
def test_journal_titles_from_130__a():
    """A 130 $a gives a schema-valid ``journal_titles`` entry."""
    titles_schema = load_schema('journals')['properties']['journal_titles']

    marcxml = (
        '<datafield tag="130" ind1=" " ind2=" ">'
        ' <subfield code="a">Physical Review Special Topics - Accelerators and Beams</subfield>'
        '</datafield>'
    )
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_titles = [
        {
            'title': 'Physical Review Special Topics - Accelerators and Beams',
        },
    ]
    assert validate(converted['journal_titles'], titles_schema) is None
    assert expected_titles == converted['journal_titles']
def test_short_titles_from_marcxml_711():
    """A 711 $a survives cleaning as a ``short_titles`` entry."""
    marcxml = (
        '<record>'
        ' <datafield tag="711" ind1=" " ind2=" ">'
        ' <subfield code="a">Phys.Rev.ST Accel.Beams</subfield>'
        ' </datafield>'
        '</record>'
    )
    record = create_record(marcxml)
    converted = clean_record(journals.do(record))

    expected_short_titles = [{'title': 'Phys.Rev.ST Accel.Beams'}]
    assert expected_short_titles == converted['short_titles']
def test_issn_from_marcxml_022_with_a():
    """After cleaning, a 022 $a alone gives an ISSN entry with no medium."""
    marcxml = (
        '<record>'
        ' <datafield tag="022" ind1=" " ind2=" ">'
        ' <subfield code="a">2213-1337</subfield>'
        ' </datafield> '
        '</record>'
    )
    record = create_record(marcxml)
    converted = clean_record(journals.do(record))

    expected_issn = [{'value': '2213-1337'}]
    assert expected_issn == converted['issn']
def test_issn_from_022__a_b_electronic():
    """After cleaning, $b 'electronic' is medium 'online' plus a comment."""
    marcxml = (
        '<datafield tag="022" ind1=" " ind2=" ">'
        ' <subfield code="a">2469-9888</subfield>'
        ' <subfield code="b">electronic</subfield>'
        '</datafield>'
    )  # record/1415879
    record = create_record(marcxml)
    converted = clean_record(journals.do(record))

    expected_issn = [
        {'comment': 'electronic', 'medium': 'online', 'value': '2469-9888'},
    ]
    assert expected_issn == converted['issn']
def test_issn_from_marcxml_022_with_a_and_b():
    """$b 'Print' is normalized to the lower-case medium 'print'."""
    marcxml = (
        '<record>'
        ' <datafield tag="022" ind1=" " ind2=" ">'
        ' <subfield code="a">2213-1337</subfield>'
        ' <subfield code="b">Print</subfield>'
        ' </datafield>'
        '</record>'
    )
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_issn = [{'medium': 'print', 'value': '2213-1337'}]
    assert expected_issn == converted['issn']
def test_titles_from_marcxml_130_with_a_and_b():
    """130 $a and $b map to ``title`` and ``subtitle`` of one entry."""
    marcxml = (
        '<record>'
        ' <datafield tag="130" ind1=" " ind2=" ">'
        ' <subfield code="a">Humana Mente</subfield>'
        ' <subfield code="b">Journal of Philosophical Studies</subfield>'
        ' </datafield>'
        '</record>'
    )
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_titles = [
        {'title': 'Humana Mente', 'subtitle': 'Journal of Philosophical Studies'},
    ]
    assert expected_titles == converted['titles']
def test_publisher_from_double_643__b():
    """Two 643 $b values survive cleaning as an ordered ``publisher``."""
    marcxml = (
        '<record>'
        ' <datafield tag="643" ind1=" " ind2=" ">'
        ' <subfield code="b">Elsevier</subfield>'
        ' </datafield>'
        ' <datafield tag="643" ind1=" " ind2=" ">'
        ' <subfield code="b">Science Press</subfield>'
        ' </datafield>'
        '</record>'
    )  # record/1212635
    record = create_record(marcxml)
    converted = clean_record(journals.do(record))

    expected_publisher = ['Elsevier', 'Science Press']
    assert expected_publisher == converted['publisher']
def test_coden_from_double_030__a_2():
    """Two 030 fields contribute their $a values to ``coden`` in order."""
    marcxml = (
        '<record>'
        ' <datafield tag="030" ind1=" " ind2=" ">'
        ' <subfield code="2">CODEN</subfield>'
        ' <subfield code="a">00686</subfield>'
        ' </datafield>'
        ' <datafield tag="030" ind1=" " ind2=" ">'
        ' <subfield code="2">CODEN</subfield>'
        ' <subfield code="a">VLUFB</subfield>'
        ' </datafield>'
        '</record>'
    )
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_coden = ['00686', 'VLUFB']
    assert expected_coden == converted['coden']
def test_titles_from_marcxml_130_with_a_and_b():
    """After cleaning, 130 $a/$b are ``title``/``subtitle`` of one entry."""
    marcxml = (
        '<record>'
        ' <datafield tag="130" ind1=" " ind2=" ">'
        ' <subfield code="a">Humana Mente</subfield>'
        ' <subfield code="b">Journal of Philosophical Studies</subfield>'
        ' </datafield>'
        '</record>'
    )
    record = create_record(marcxml)
    converted = clean_record(journals.do(record))

    expected_titles = [
        {'title': 'Humana Mente', 'subtitle': 'Journal of Philosophical Studies'},
    ]
    assert expected_titles == converted['titles']
def test_issn_from_marcxml_022_with_a_and_b():
    """After cleaning, $b 'Print' is stored as the medium 'print'."""
    marcxml = (
        '<record>'
        ' <datafield tag="022" ind1=" " ind2=" ">'
        ' <subfield code="a">2213-1337</subfield>'
        ' <subfield code="b">Print</subfield>'
        ' </datafield>'
        '</record>'
    )
    record = create_record(marcxml)
    converted = clean_record(journals.do(record))

    expected_issn = [{'medium': 'print', 'value': '2213-1337'}]
    assert expected_issn == converted['issn']
def test_journal_titles_from_130__a_b():
    """130 $a/$b give a schema-valid entry with ``title`` and ``subtitle``."""
    titles_schema = load_schema('journals')['properties']['journal_titles']

    marcxml = (
        '<datafield tag="130" ind1=" " ind2=" ">'
        ' <subfield code="a">Humana Mente</subfield>'
        ' <subfield code="b">Journal of Philosophical Studies</subfield>'
        '</datafield>'
    )
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_titles = [
        {'title': 'Humana Mente', 'subtitle': 'Journal of Philosophical Studies'},
    ]
    assert validate(converted['journal_titles'], titles_schema) is None
    assert expected_titles == converted['journal_titles']
def test_coden_from_double_030__a_2():
    """Two 030 $a values survive cleaning as an ordered ``coden`` list."""
    marcxml = (
        '<record>'
        ' <datafield tag="030" ind1=" " ind2=" ">'
        ' <subfield code="2">CODEN</subfield>'
        ' <subfield code="a">00686</subfield>'
        ' </datafield>'
        ' <datafield tag="030" ind1=" " ind2=" ">'
        ' <subfield code="2">CODEN</subfield>'
        ' <subfield code="a">VLUFB</subfield>'
        ' </datafield>'
        '</record>'
    )
    record = create_record(marcxml)
    converted = clean_record(journals.do(record))

    expected_coden = ['00686', 'VLUFB']
    assert expected_coden == converted['coden']
def test_issn_from_022__a_b_handles_electronic():
    """$b 'electronic' gives a schema-valid ISSN: 'online' plus comment."""
    issn_schema = load_schema('journals')['properties']['issn']

    marcxml = (
        '<datafield tag="022" ind1=" " ind2=" ">'
        ' <subfield code="a">2469-9888</subfield>'
        ' <subfield code="b">electronic</subfield>'
        '</datafield>'
    )  # record/1415879
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_issn = [
        {'comment': 'electronic', 'medium': 'online', 'value': '2469-9888'},
    ]
    assert validate(converted['issn'], issn_schema) is None
    assert expected_issn == converted['issn']
def test_issn_from_marcxml_022_with_a_and_b_and_comment():
    """$b 'ebook' maps to medium 'online'; the raw value goes to comment."""
    marcxml = (
        '<record>'
        ' <datafield tag="022" ind1=" " ind2=" ">'
        ' <subfield code="a">2213-1337</subfield>'
        ' <subfield code="b">ebook</subfield>'
        ' </datafield>'
        '</record>'
    )
    record = create_record(marcxml)
    converted = journals.do(record)

    expected_issn = [
        {'medium': 'online', 'value': '2213-1337', 'comment': 'ebook'},
    ]
    assert expected_issn == converted['issn']
def test_multiple_title_variants_from_marcxml_730():
    """Two 730 $a values become two ``title_variants`` entries, in order."""
    marcxml = (
        '<record>'
        ' <datafield tag="730" ind1=" " ind2=" ">'
        ' <subfield code="a">PHYS REV SPECIAL TOPICS ACCELERATORS BEAMS</subfield>'
        ' </datafield>'
        ' <datafield tag="730" ind1=" " ind2=" ">'
        ' <subfield code="a">PHYSICS REVIEW ST ACCEL BEAMS</subfield>'
        ' </datafield>'
        '</record>'
    )
    record = create_record(marcxml)
    converted = strip_empty_values(journals.do(record))

    expected_variants = [
        {'title': 'PHYS REV SPECIAL TOPICS ACCELERATORS BEAMS'},
        {'title': 'PHYSICS REVIEW ST ACCEL BEAMS'},
    ]
    assert expected_variants == converted['title_variants']
def create_record(record, force=True, dry_run=False):
    """Convert a MARC record to cleaned JSON with the matching dojson model.

    The model of the first collection found on ``record`` is used (checked
    in the order of the table below); ``hep`` is the fallback. The output is
    passed through ``strip_empty_values``.

    :param record: MARC record as accepted by the dojson models.
    :param force: accepted but not used by this function.
    :param dry_run: when true, the result is wrapped as ``(errors, json)``.
    :returns: ``(errors, json)`` in dry-run mode, where ``errors`` is always
        the empty string here; otherwise the JSON alone.
    """
    # Order matters: the first entry with a matching collection wins.
    models_by_collection = (
        (('institution',), institutions),
        (('experiment',), experiments),
        (('journals',), journals),
        (('hepnames',), hepnames),
        (('job', 'jobhidden'), jobs),
        (('conferences',), conferences),
    )
    for collections, model in models_by_collection:
        if any(_collection_in_record(record, name) for name in collections):
            break
    else:
        model = hep
    json_record = strip_empty_values(model.do(record))

    errors = ""
    return (errors, json_record) if dry_run else json_record