def test_address_from_marcxml_371__a_b_c_d_double_e_g():
    """A 371 field with two ``e`` subfields yields one joined postal code."""
    marcxml = (
        '<datafield tag="371" ind1=" " ind2=" ">'
        ' <subfield code="a">Philosophenweg 16</subfield>'
        ' <subfield code="b">Heidelberg</subfield>'
        ' <subfield code="c">Baden-Wuerttemberg</subfield>'
        ' <subfield code="d">Germany</subfield>'
        ' <subfield code="e">69120</subfield>'
        ' <subfield code="e">DE-119</subfield>'
        ' <subfield code="g">DE</subfield>'
        '</datafield>'
    )

    # Both "e" values are expected in a single comma-separated string.
    heidelberg = {
        'original_address': ['Philosophenweg 16'],
        'city': 'Heidelberg',
        'state': 'Baden-Wuerttemberg',
        'country': 'Germany',
        'country_code': 'DE',
        'postal_code': '69120, DE-119',
    }
    wanted = [heidelberg]

    converted = institutions.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['address']
def test_contact_details_from_multiple_marcxml_270():
    """Two 270 fields produce two contact entries, in document order."""
    marcxml = (
        '<record> '
        ' <datafield tag="270" ind1=" " ind2=" ">'
        ' <subfield code="m">[email protected]</subfield>'
        ' <subfield code="p">Manfred Lindner</subfield>'
        ' </datafield>'
        ' <datafield tag="270" ind1=" " ind2=" ">'
        ' <subfield code="p">Wynton Marsalis</subfield>'
        ' </datafield>'
        '</record>'
    )

    # NOTE(review): the e-mail text in the fixture above and the one expected
    # below are different literals -- confirm which address is intended.
    first_contact = {
        'email': '*****@*****.**',
        'name': 'Manfred Lindner',
    }
    # The second field carries no "m" subfield, so only a name is expected.
    second_contact = {'name': 'Wynton Marsalis'}
    wanted = [first_contact, second_contact]

    converted = conferences.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['contact_details']
def test_address_from_marcxml_371__a_b_c_d_e_double_g():
    """A 371 field repeating the same ``g`` value yields one country code."""
    marcxml = (
        '<datafield tag="371" ind1=" " ind2=" ">'
        ' <subfield code="a">Philosophenweg 16</subfield>'
        ' <subfield code="b">Heidelberg</subfield>'
        ' <subfield code="c">Baden-Wuerttemberg</subfield>'
        ' <subfield code="d">Germany</subfield>'
        ' <subfield code="e">69120</subfield>'
        ' <subfield code="g">DE</subfield>'
        ' <subfield code="g">DE</subfield>'
        '</datafield>'
    )

    heidelberg = {
        'original_address': ['Philosophenweg 16'],
        'city': 'Heidelberg',
        'state': 'Baden-Wuerttemberg',
        'country': 'Germany',
        'country_code': 'DE',
        'postal_code': '69120',
    }
    wanted = [heidelberg]

    converted = institutions.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['address']
def test_address_from_multiple_marcxml__111_c():
    """Each 111 ``c`` subfield becomes its own address entry."""
    marcxml = (
        '<record>'
        ' <datafield tag="111" ind1=" " ind2=" ">'
        ' <subfield code="c">Austin, Tex.</subfield>'
        ' </datafield>'
        ' <datafield tag="111" ind1=" " ind2=" ">'
        ' <subfield code="c">Den Haag, Nederlands</subfield>'
        ' </datafield>'
        '</record>'
    )

    austin = {
        'original_address': 'Austin, Tex.',
        'state': 'US-TX',
        'country_code': 'US',
    }
    den_haag = {
        'original_address': 'Den Haag, Nederlands',
        'country_code': 'NL',
    }
    wanted = [austin, den_haag]

    converted = conferences.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['address']
def collections(self, key, value):
    """Collection this record belongs to.

    Builds one ``{'primary', 'secondary', 'deleted'}`` entry per incoming
    field from its ``a``, ``b`` and ``c`` subfields and appends it to the
    entries already stored under ``collections`` on ``self``.

    :param key: the matched field key (unused here).
    :param value: a single field or a sequence of fields; normalised with
        ``utils.force_list``.
    :returns: the accumulated entries, passed through ``strip_empty_values``
        when any entry holds a list value, otherwise through
        ``remove_duplicates_from_list_of_dicts``.
    """
    def get_value(field):
        primary = field.get('a')
        if isinstance(primary, list):
            # A repeated "a" subfield: only its first value is kept.
            primary = primary[0]
        return {
            'primary': primary,
            'secondary': field.get('b'),
            'deleted': field.get('c'),
        }

    collections = self.get('collections', [])
    collections.extend(get_value(field) for field in utils.force_list(value))

    # De-duplication is skipped as soon as one entry carries a list value
    # (presumably because it needs hashable values -- confirm against
    # remove_duplicates_from_list_of_dicts).
    contains_list = any(
        isinstance(entry_value, list)
        for entry in collections
        for entry_value in entry.values()
    )
    if contains_list:
        return strip_empty_values(collections)
    return inspire_dojson_utils.remove_duplicates_from_list_of_dicts(
        collections)
def update():
    """View for INSPIRE author update form.

    Reads ``recid`` from the request. Without one the client is redirected
    to the "new author" form. Otherwise the MARCXML export of that record is
    fetched from ``AUTHORS_UPDATE_BASE_URL``, converted with the ``hepnames``
    model and used to pre-fill the update form.

    Pre-filling is best effort: if the request fails, or the response holds
    no ``<record>`` element, the form is rendered with only ``recid`` set.
    """
    from dojson.contrib.marc21.utils import create_record

    from inspirehep.dojson.hepnames import hepnames

    recid = request.values.get('recid', 0, type=int)
    if not recid:
        return redirect(url_for("inspirehep_authors_holdingpen.new"))

    data = {}
    try:
        url = os.path.join(
            current_app.config["AUTHORS_UPDATE_BASE_URL"],
            "record", str(recid), "export", "xm")
        xml = requests.get(url)
        record_regex = re.compile(
            r"\<record\>.*\<\/record\>", re.MULTILINE | re.DOTALL)
        found = record_regex.search(xml.content)
        # search() returns None when the response has no <record> element
        # (e.g. an error page); calling .group() on it would raise an
        # AttributeError that the handler below does not catch.
        if found is not None:
            data = strip_empty_values(
                hepnames.do(create_record(found.group())))
            convert_for_form(data)
    except requests.exceptions.RequestException:
        # Deliberate best effort: fall back to an empty form.
        pass
    data["recid"] = recid

    form = AuthorUpdateForm(data=data, is_update=True)
    ctx = {
        "action": url_for('.submitupdate'),
        "name": "authorUpdateForm",
        "id": "authorUpdateForm",
    }

    # FIXME create template in authors module
    return render_template('authors/forms/update_form.html', form=form, **ctx)
def update():
    """View for INSPIRE author update form.

    Reads ``recid`` from the request. Without one the client is redirected
    to the "new author" form. Otherwise the MARCXML export of that record is
    fetched from ``AUTHORS_UPDATE_BASE_URL``, converted with the ``hepnames``
    model and used to pre-fill the update form.

    Pre-filling is best effort: if the request fails, or the response holds
    no ``<record>`` element, the form is rendered with only ``recid`` set.
    """
    from dojson.contrib.marc21.utils import create_record

    from inspirehep.dojson.hepnames import hepnames

    recid = request.values.get('recid', 0, type=int)
    if not recid:
        return redirect(url_for("inspirehep_authors.new"))

    data = {}
    try:
        url = os.path.join(
            current_app.config["AUTHORS_UPDATE_BASE_URL"],
            "record", str(recid), "export", "xm")
        xml = requests.get(url)
        record_regex = re.compile(
            r"\<record\>.*\<\/record\>", re.MULTILINE | re.DOTALL)
        found = record_regex.search(xml.content)
        # search() returns None when the response has no <record> element
        # (e.g. an error page); calling .group() on it would raise an
        # AttributeError that the handler below does not catch.
        if found is not None:
            data = strip_empty_values(
                hepnames.do(create_record(found.group())))
            convert_for_form(data)
    except requests.exceptions.RequestException:
        # Deliberate best effort: fall back to an empty form.
        pass
    data["recid"] = recid

    form = AuthorUpdateForm(data=data)
    ctx = {
        "action": url_for('.submitupdate'),
        "name": "authorUpdateForm",
        "id": "authorUpdateForm",
    }

    # FIXME create template in authors module
    return render_template('authors/forms/update_form.html', form=form, **ctx)
def test_positions_from_371__a_m_r_z():
    """A 371 field with ``a``, ``m``, ``r`` and ``z`` becomes one position."""
    marcxml = (
        '<datafield tag="371" ind1=" " ind2=" ">'
        ' <subfield code="a">Antwerp U.</subfield>'
        ' <subfield code="m">[email protected]</subfield>'
        ' <subfield code="r">SENIOR</subfield>'
        ' <subfield code="z">Current</subfield>'
        '</datafield>'
    )  # record/997958

    # NOTE(review): the e-mail text in the fixture above and the one expected
    # below are different literals -- confirm which address is intended.
    antwerp_position = {
        'institution': {'name': 'Antwerp U.'},
        'email': '*****@*****.**',
        'rank': 'SENIOR',
        '_rank': 'SENIOR',
        'status': 'Current',
        'curated_relation': False,
    }
    wanted = [antwerp_position]

    converted = hepnames.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['positions']
def test_extra_words_from_410__decuple_g():
    """Ten ``g`` subfields of one 410 field are all kept, in order."""
    marcxml = (
        '<datafield tag="410" ind1=" " ind2=" ">'
        ' <subfield code="g">Institut Theoretische Physik,</subfield>'
        ' <subfield code="g">RWTH, Inst.</subfield>'
        ' <subfield code="g">institute A</subfield>'
        ' <subfield code="g">III. Physikalisches Institut, Technische Hochschule Aachen, Aachen, West</subfield>'
        ' <subfield code="g">physics</subfield>'
        ' <subfield code="g">52056</subfield>'
        ' <subfield code="g">D-52056</subfield>'
        ' <subfield code="g">DE-52056</subfield>'
        ' <subfield code="g">phys</subfield>'
        ' <subfield code="g">I. Physikalisches Institut</subfield>'
        '</datafield>'
    )  # record/902624

    wanted = [
        'Institut Theoretische Physik,',
        'RWTH, Inst.',
        'institute A',
        'III. Physikalisches Institut, Technische Hochschule Aachen, Aachen, West',
        'physics',
        '52056',
        'D-52056',
        'DE-52056',
        'phys',
        'I. Physikalisches Institut',
    ]

    converted = institutions.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['extra_words']
def test_multiple_issn_from_marcxml_022():
    """Two 022 fields give two ISSN entries with lower-cased media."""
    marcxml = (
        '<record>'
        ' <datafield tag="022" ind1=" " ind2=" ">'
        ' <subfield code="a">2349-2716</subfield>'
        ' <subfield code="b">Online</subfield>'
        ' </datafield>'
        ' <datafield tag="022" ind1=" " ind2=" ">'
        ' <subfield code="a">2349-6088</subfield>'
        ' <subfield code="b">Print</subfield>'
        ' </datafield>'
        '</record>'
    )

    online_issn = {'value': '2349-2716', 'medium': 'online'}
    print_issn = {'value': '2349-6088', 'medium': 'print'}
    wanted = [online_issn, print_issn]

    converted = journals.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['issn']
def test_hidden_notes_from_595__a_9_and_595__double_a_9():
    """Every 595 ``a`` value becomes a hidden note tagged with its source."""
    marcxml = (
        '<record>'
        ' <datafield tag="595" ind1=" " ind2=" ">'
        ' <subfield code="9">SPIRES-HIDDEN</subfield>'
        ' <subfield code="a">Title changed from ALLCAPS</subfield>'
        ' </datafield>'
        ' <datafield tag="595" ind1=" " ind2=" ">'
        ' <subfield code="9">SPIRES-HIDDEN</subfield>'
        ' <subfield code="a">TeXtitle from script</subfield>'
        ' <subfield code="a">no affiliation (not clear pn the fulltext)</subfield>'
        ' </datafield>'
        '</record>'
    )  # record/109310

    # One note per "a" subfield: the second field contributes two of them.
    wanted = [
        {'value': 'Title changed from ALLCAPS', 'source': 'SPIRES-HIDDEN'},
        {'value': 'TeXtitle from script', 'source': 'SPIRES-HIDDEN'},
        {
            'value': 'no affiliation (not clear pn the fulltext)',
            'source': 'SPIRES-HIDDEN',
        },
    ]

    converted = hep.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['hidden_notes']
def convert_marcxml(source):
    """Convert MARC XML to JSON.

    Generator: reads ``source`` in full, splits it into records and yields
    one converted, empty-value-stripped dictionary per record. The dojson
    model is picked from the first collection (in the order listed below)
    found in the record; ``hep`` is the fallback.
    """
    from dojson.contrib.marc21.utils import create_record, split_blob

    from inspirehep.dojson.utils import strip_empty_values
    from inspirehep.dojson.hep import hep
    from inspirehep.dojson.institutions import institutions
    from inspirehep.dojson.journals import journals
    from inspirehep.dojson.experiments import experiments
    from inspirehep.dojson.hepnames import hepnames
    from inspirehep.dojson.jobs import jobs
    from inspirehep.dojson.conferences import conferences

    # Ordered: the first matching collection decides the model.
    models_by_collection = (
        ('institution', institutions),
        ('experiment', experiments),
        ('journals', journals),
        ('hepnames', hepnames),
        ('job', jobs),
        ('jobhidden', jobs),
        ('conferences', conferences),
    )

    for blob in split_blob(source.read()):
        marc = create_record(blob)
        for collection, candidate in models_by_collection:
            if _collection_in_record(marc, collection):
                model = candidate
                break
        else:
            model = hep
        yield strip_empty_values(model.do(marc))
def test_single_doi():
    """A single 0247 DOI field yields exactly one DOI entry."""
    marcxml = (
        '<record><datafield tag="024" ind1="7" ind2=" ">'
        '<subfield code="2">DOI</subfield>'
        '<subfield code="a">10.1088/0264-9381/31/24/245004</subfield>'
        '</datafield></record>'
    )
    wanted = [{'value': '10.1088/0264-9381/31/24/245004'}]

    converted = hep.do(create_record(marcxml))
    dois = strip_empty_values(converted)['dois']

    assert dois == wanted
def test_core_from_690c_a_noncore():
    """A 690C field whose ``a`` subfield is NONCORE yields a falsy ``core``."""
    # The subfield attribute used to read ``code=a"`` (missing opening
    # quote), which is not well-formed XML; it is now a proper ``code="a"``.
    snippet = (
        '<datafield tag="690" ind1="C" ind2=" ">'
        ' <subfield code="a">NONCORE</subfield>'
        '</datafield>'
    )  # record/916025

    result = strip_empty_values(institutions.do(create_record(snippet)))

    assert not result['core']
def test_non_public_notes_from_667__a():
    """The 667 ``a`` subfield is exposed as a one-item non-public note list."""
    marcxml = (
        '<datafield tag="667" ind1=" " ind2=" ">'
        ' <subfield code="a">Former ICN = Negev U.</subfield>'
        '</datafield>'
    )  # record/902663
    wanted = ['Former ICN = Negev U.']

    converted = institutions.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['non_public_notes']
def test_field_activity_from_372__a():
    """The 372 ``a`` subfield is exposed as a one-item field activity list."""
    marcxml = (
        '<datafield tag="372" ind1=" " ind2=" ">'
        ' <subfield code="a">Research center</subfield>'
        '</datafield>'
    )
    wanted = ['Research center']

    converted = institutions.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['field_activity']
def test_name_from_110__a():
    """A 110 field with only ``a`` yields a single nested name list."""
    marcxml = (
        '<datafield tag="110" ind1=" " ind2=" ">'
        ' <subfield code="a">Mid-America Christian U.</subfield>'
        '</datafield>'
    )  # record/1439728

    # One inner list per 110 field.
    names_of_field = ['Mid-America Christian U.']
    wanted = [names_of_field]

    converted = institutions.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['name']
def test_hidden_notes_from_595__a():
    """A 595 ``a`` note with non-ASCII text comes back as a unicode string."""
    marcxml = (
        '<datafield tag="595" ind1=" " ind2=" ">'
        ' <subfield code="a">The Division is located inside the Department of Physics and Astronomy of the University of Catania Scientific Campus ("Città Universitaria" or "Cittadella"). Via Santa Sofia 64 95123 CATANIA</subfield>'
        '</datafield>'
    )  # record/902879

    wanted = [
        u'The Division is located inside the Department of Physics and Astronomy of the University of Catania Scientific Campus ("Città Universitaria" or "Cittadella"). Via Santa Sofia 64 95123 CATANIA',
    ]

    converted = institutions.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['hidden_notes']
def test_timezone_from_043__t():
    """The 043 ``t`` subfield is exposed as a one-item timezone list."""
    marcxml = (
        '<datafield tag="043" ind1=" " ind2=" ">'
        ' <subfield code="t">+05</subfield>'
        '</datafield>'
    )  # record/902635
    wanted = ['+05']

    converted = institutions.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['timezone']
def test_no_location_from_invalid_034__d_f():
    """Non-numeric 034 ``d``/``f`` values must not produce a location."""
    marcxml = (
        '<datafield tag="034" ind1=" " ind2=" ">'
        ' <subfield code="d">foo</subfield>'
        ' <subfield code="f">bar</subfield>'
        '</datafield>'
    )  # synthetic data

    converted = institutions.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert 'location' not in cleaned
def test_single_doi():
    """One 0247 field marked as DOI converts to one ``dois`` entry."""
    marcxml = (
        '<record><datafield tag="024" ind1="7" ind2=" ">'
        '<subfield code="2">DOI</subfield>'
        '<subfield code="a">10.1088/0264-9381/31/24/245004</subfield>'
        '</datafield></record>'
    )
    only_doi = {'value': '10.1088/0264-9381/31/24/245004'}

    marc_record = create_record(marcxml)
    cleaned = strip_empty_values(hep.do(marc_record))

    assert cleaned['dois'] == [only_doi]
def test_name_from_110__a_b_u():
    """With ``a``, ``b`` and ``u`` present, only ``a`` and ``u`` are names."""
    marcxml = (
        '<datafield tag="110" ind1=" " ind2=" ">'
        ' <subfield code="a">Fukushima University</subfield>'
        ' <subfield code="b">Department of Physics</subfield>'
        ' <subfield code="u">Fukushima U.</subfield>'
        '</datafield>'
    )  # record/902812

    # The "b" value (department) is not expected among the names.
    names_of_field = ['Fukushima University', 'Fukushima U.']
    wanted = [names_of_field]

    converted = institutions.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['name']
def test_location_from_034__f():
    """A lone numeric 034 ``f`` subfield yields a latitude-only location."""
    marcxml = (
        '<datafield tag="034" ind1=" " ind2=" ">'
        ' <subfield code="f">50.7736</subfield>'
        '</datafield>'
    )  # synthetic data
    wanted = {'latitude': 50.7736}

    converted = institutions.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['location']
def test_historical_data_from_6781_a():
    """The 6781 ``a`` subfield is exposed verbatim as historical data."""
    marcxml = (
        '<datafield tag="678" ind1="1" ind2=" ">'
        ' <subfield code="a">Became IFH (Inst for Hochenergiephysik)in 1968. Since 1992 the official name of the Inst. is simply DESY Zeuthen. Changed 1/26/99 AMR</subfield>'
        '</datafield>'
    )  # record/902666

    wanted = [
        'Became IFH (Inst for Hochenergiephysik)in 1968. Since 1992 the official name of the Inst. is simply DESY Zeuthen. Changed 1/26/99 AMR',
    ]

    converted = institutions.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['historical_data']
def create_record(data, force=False, dry_run=False):
    """Convert one MARCXML blob to JSON and optionally store it.

    The blob is parsed with ``marc_create_record`` and converted with the
    dojson model matching the first collection found in the record (``hep``
    when none matches).

    :param data: MARCXML of a single record.
    :param force: when true and the converted JSON has a ``control_number``
        or ``recid`` key, the record is created or overwritten.
    :param dry_run: when true, nothing is stored and ``(recid, json)`` is
        returned right after the conversion.
    :returns: ``(recid, json)`` on a dry run; ``(control_number, record
        dict)`` after storing; ``None`` when nothing was stored.
    :raises Exception: any error is logged and re-raised; when a production
        record is being tracked it is first flagged as unsuccessful.
    """
    record = marc_create_record(data)
    recid = None
    if '001' in record:
        recid = int(record['001'][0])

    # Only tracked on real runs with a known recid. Keeping an explicit None
    # lets the error handler below run safely on dry runs too; previously it
    # hit an unbound ``prod_record`` and masked the original exception.
    prod_record = None
    if not dry_run and recid:
        prod_record = InspireProdRecords(recid=recid)
        prod_record.marcxml = data

    try:
        if _collection_in_record(record, 'institution'):
            json = strip_empty_values(institutions.do(record))
        elif _collection_in_record(record, 'experiment'):
            json = strip_empty_values(experiments.do(record))
        elif _collection_in_record(record, 'journals'):
            json = strip_empty_values(journals.do(record))
        elif _collection_in_record(record, 'hepnames'):
            json = strip_empty_values(hepnames.do(record))
        elif _collection_in_record(record, 'job') or \
                _collection_in_record(record, 'jobhidden'):
            json = strip_empty_values(jobs.do(record))
        elif _collection_in_record(record, 'conferences'):
            json = strip_empty_values(conferences.do(record))
        else:
            json = strip_empty_values(hep.do(record))

        if dry_run:
            return recid, json

        if force and any(key in json for key in ('control_number', 'recid')):
            try:
                control_number = json['control_number']
            except KeyError:
                control_number = json['recid']
            control_number = int(control_number)

            # Searches if record already exists.
            record = Record.get_record(control_number)
            if record is None:
                # Adds the record to the db session.
                rec = RecordModel(id=control_number)
                db.session.merge(rec)
                record = Record.create(json)
            else:
                record = Record(json, model=record.model)
                record.commit()

            if prod_record is not None:
                prod_record.successful = True
                db.session.merge(prod_record)
            logger.info("Elaborated record {}".format(control_number))
            return control_number, dict(record)
    except Exception:
        if prod_record is not None:
            prod_record.successful = False
            db.session.merge(prod_record)
        logger.exception("Error in elaborating record ID {}".format(recid))
        raise
def test_name_from_110__b_t_u():
    """With ``b``, ``t`` and ``u`` present, names are ``u`` then ``t``."""
    marcxml = (
        '<datafield tag="110" ind1=" " ind2=" ">'
        ' <subfield code="b">Institute of Physics</subfield>'
        ' <subfield code="t">Inst. Phys., Belgrade</subfield>'
        ' <subfield code="u">Belgrade, Inst. Phys.</subfield>'
        '</datafield>'
    )  # record/903416

    # The "u" value is expected before the "t" value; "b" is not a name.
    names_of_field = ['Belgrade, Inst. Phys.', 'Inst. Phys., Belgrade']
    wanted = [names_of_field]

    converted = institutions.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['name']
def test_publisher_from_643__b():
    """The 643 ``b`` subfield is exposed as a one-item publisher list."""
    marcxml = (
        '<datafield tag="643" ind1=" " ind2=" ">'
        ' <subfield code="b">ANITA PUBLICATIONS, INDIA</subfield>'
        '</datafield>'
    )  # record/1211888
    wanted = ['ANITA PUBLICATIONS, INDIA']

    converted = journals.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['publisher']
def test_public_notes_from_680__a():
    """A 680 note with accented text comes back as a unicode public note.

    Despite the test name, the fixture stores the note in subfield ``i``.
    """
    marcxml = (
        '<datafield tag="680" ind1=" " ind2=" ">'
        ' <subfield code="i">2nd address: Organisation Européenne pour la Recherche Nucléaire (CERN), F-01631 Prévessin Cedex, France</subfield>'
        '</datafield>'
    )  # record/902725

    wanted = [
        u'2nd address: Organisation Européenne pour la Recherche Nucléaire (CERN), F-01631 Prévessin Cedex, France',
    ]

    converted = institutions.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['public_notes']
def test_issn_from_marcxml_022_with_b_no_a():
    """An 022 field with ``b`` but no ``a`` must not produce an ISSN."""
    marcxml = (
        '<record>'
        ' <datafield tag="022" ind1=" " ind2=" ">'
        ' <subfield code="b">9780486632827</subfield>'
        ' </datafield> '
        '</record>'
    )

    converted = journals.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert 'issn' not in cleaned
def test_coden_from_030__a_2():
    """The 030 ``a`` subfield is exposed as a one-item CODEN list."""
    marcxml = (
        '<datafield tag="030" ind1=" " ind2=" ">'
        ' <subfield code="2">CODEN</subfield>'
        ' <subfield code="a">HERAS</subfield>'
        '</datafield>'
    )  # record/1211568
    wanted = ['HERAS']

    converted = journals.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert wanted == cleaned['coden']
def references(self, key, value):
    """Produce list of references.

    Builds one reference dictionary per incoming field and appends it to
    the references already stored on ``self``. Subfields ``0`` (record id),
    ``o`` (number) and ``y`` (year) are converted to integers; a missing or
    unparsable value falls back to ``None`` for the record id and to ``''``
    for number and year.

    :param key: the matched field key (unused here).
    :param value: a single field or a sequence of fields; normalised with
        ``utils.force_list``.
    :returns: the accumulated references after ``strip_empty_values`` and
        ``remove_duplicates_from_list``.
    """
    def _int_or(raw, fallback):
        # int() raises TypeError for None or non-scalar values and
        # ValueError for non-numeric text; anything else should propagate
        # instead of being swallowed by a bare ``except``.
        try:
            return int(raw)
        except (TypeError, ValueError):
            return fallback

    def get_value(field):
        recid = _int_or(field.get('0'), None)
        return {
            'record': inspire_dojson_utils.get_record_ref(recid, 'literature'),
            'texkey': field.get('1'),
            'doi': field.get('a'),
            'collaboration': utils.force_list(field.get('c')),
            'editors': field.get('e'),
            'authors': utils.force_list(field.get('h')),
            'misc': utils.force_list(field.get('m')),
            'number': _int_or(field.get('o'), ''),
            'isbn': field.get('i'),
            'publisher': utils.force_list(field.get('p')),
            'maintitle': field.get('q'),
            'report_number': utils.force_list(field.get('r')),
            'title': utils.force_list(field.get('t')),
            'url': utils.force_list(field.get('u')),
            'journal_pubnote': utils.force_list(field.get('s')),
            'raw_reference': utils.force_list(field.get('x')),
            'year': _int_or(field.get('y'), ''),
        }

    references = self.get('references', [])
    references.extend(get_value(field) for field in utils.force_list(value))

    return inspire_dojson_utils.remove_duplicates_from_list(
        strip_empty_values(references))
def test_experiment_names_and_affiliation_from_marcxml_119():
    """119 ``u`` feeds the affiliation and 119 ``a`` the experiment title."""
    marcxml = (
        '<record>'
        ' <datafield tag="119" ind1=" " ind2=" ">'
        ' <subfield code="a">CERN-ALPHA</subfield>'
        ' <subfield code="u">CERN</subfield>'
        ' </datafield>'
        '</record>'
    )

    converted = experiments.do(create_record(marcxml))
    cleaned = strip_empty_values(converted)

    assert cleaned['affiliation'][0] == 'CERN'
    assert cleaned['experiment_names'][0]['title'] == 'CERN-ALPHA'
def references(self, key, value):
    """Produce list of references.

    Builds one reference dictionary per incoming field and appends it to
    the references already stored on ``self``. Subfields ``0`` (record id),
    ``o`` (number) and ``y`` (year) are converted to integers; a missing or
    unparsable value falls back to ``''``.

    :param key: the matched field key (unused here).
    :param value: a single field or a sequence of fields; normalised with
        ``utils.force_list``.
    :returns: the accumulated references after ``strip_empty_values`` and
        ``remove_duplicates_from_list``.
    """
    def _int_or_blank(raw):
        # int() raises TypeError for None or non-scalar values and
        # ValueError for non-numeric text; anything else should propagate
        # instead of being swallowed by a bare ``except``.
        try:
            return int(raw)
        except (TypeError, ValueError):
            return ''

    def get_value(field):
        return {
            'recid': _int_or_blank(field.get('0')),
            'texkey': field.get('1'),
            'doi': field.get('a'),
            'collaboration': utils.force_list(field.get('c')),
            'editors': field.get('e'),
            'authors': utils.force_list(field.get('h')),
            'misc': utils.force_list(field.get('m')),
            'number': _int_or_blank(field.get('o')),
            'isbn': field.get('i'),
            'publisher': utils.force_list(field.get('p')),
            'maintitle': field.get('q'),
            'report_number': utils.force_list(field.get('r')),
            'title': utils.force_list(field.get('t')),
            'url': utils.force_list(field.get('u')),
            'journal_pubnote': utils.force_list(field.get('s')),
            'raw_reference': utils.force_list(field.get('x')),
            'year': _int_or_blank(field.get('y')),
        }

    references = self.get('references', [])
    references.extend(get_value(field) for field in utils.force_list(value))

    return inspire_dojson_utils.remove_duplicates_from_list(
        strip_empty_values(references))
def test_duplicate_doi():
    """The same DOI with and without a source is kept as two entries."""
    marcxml = (
        '<record><datafield tag="024" ind1="7" ind2=" ">'
        '<subfield code="2">DOI</subfield>'
        '<subfield code="9">bibmatch</subfield>'
        '<subfield code="a">10.1088/1475-7516/2015/03/044</subfield>'
        '</datafield>'
        '<datafield tag="024" ind1="7" ind2=" ">'
        '<subfield code="2">DOI</subfield>'
        '<subfield code="a">10.1088/1475-7516/2015/03/044</subfield>'
        '</datafield></record>'
    )

    # Same value twice: the entries differ only by the presence of a source.
    sourced_doi = {
        'value': '10.1088/1475-7516/2015/03/044',
        'source': 'bibmatch',
    }
    plain_doi = {'value': '10.1088/1475-7516/2015/03/044'}

    marc_record = create_record(marcxml)
    cleaned = strip_empty_values(hep.do(marc_record))

    assert cleaned['dois'] == [sourced_doi, plain_doi]
def test_multiple_dois():
    """Two different DOI fields yield two entries, in document order."""
    marcxml = (
        '<record><datafield tag="024" ind1="7" ind2=" ">'
        '<subfield code="2">DOI</subfield>'
        '<subfield code="a">10.1103/PhysRevD.89.072002</subfield>'
        '</datafield>'
        '<datafield tag="024" ind1="7" ind2=" ">'
        '<subfield code="2">DOI</subfield>'
        '<subfield code="9">bibmatch</subfield>'
        '<subfield code="a">10.1103/PhysRevD.91.019903</subfield>'
        '</datafield></record>'
    )

    plain_doi = {'value': '10.1103/PhysRevD.89.072002'}
    sourced_doi = {
        'value': '10.1103/PhysRevD.91.019903',
        'source': 'bibmatch',
    }

    marc_record = create_record(marcxml)
    cleaned = strip_empty_values(hep.do(marc_record))

    assert cleaned['dois'] == [plain_doi, sourced_doi]
def test_strip_empty_values():
    """Empty containers are dropped; ``False`` and ``0`` are kept."""
    # The underscore-prefixed keys hold the empty tuple, list and set.
    with_empties = {
        '_foo': (),
        'foo': (1, 2, 3),
        '_bar': [],
        'bar': [1, 2, 3],
        '_baz': set(),
        'baz': set([1, 2, 3]),
        'qux': True,
        'quux': False,
        'plugh': 0,
    }
    wanted = {
        'foo': (1, 2, 3),
        'bar': [1, 2, 3],
        'baz': set([1, 2, 3]),
        'qux': True,
        'quux': False,
        'plugh': 0,
    }

    cleaned = strip_empty_values(with_empties)

    assert wanted == cleaned
def create_record(record, force=True, dry_run=False):
    """Create record from marc21 model.

    Converts ``record`` with the dojson model of the first collection (in
    the order listed below) found in it, falling back to ``hep``, and strips
    empty values from the result. ``force`` is not used by this function.

    :returns: ``(errors, json)`` with an empty ``errors`` string when
        ``dry_run`` is true, otherwise just ``json``.
    """
    # Ordered: the first matching collection decides the model.
    models_by_collection = (
        ('institution', institutions),
        ('experiment', experiments),
        ('journals', journals),
        ('hepnames', hepnames),
        ('job', jobs),
        ('jobhidden', jobs),
        ('conferences', conferences),
    )

    model = hep
    for collection, candidate in models_by_collection:
        if _collection_in_record(record, collection):
            model = candidate
            break

    json = strip_empty_values(model.do(record))

    if not dry_run:
        return json
    errors = ""
    return errors, json
def test_strip_empty_values_returns_none_on_none():
    """``None`` passes through ``strip_empty_values`` unchanged."""
    outcome = strip_empty_values(None)

    assert outcome is None