def transpose_to_marc21(record):
    """Convert a dict-based record into binary MARC21 via pymarc.

    `record` maps tag strings to values; "_LEADER" holds the leader.
    Control fields (tag < 010) carry plain string data (or a list of
    strings for repeated fields); data fields are lists of
    {indicator-string: [{code: value-or-list-of-values}, ...]} mappings.
    Returns the bytes produced by Record.as_marc().
    """
    Mrecord = Record(force_utf8=True)
    Mrecord.leader = record["_LEADER"]
    for field in record:
        # non-numeric keys (e.g. "_LEADER") are skipped entirely
        if isint(field):
            if int(field) < 10:
                # control field: single occurrence (str) or repeated (list)
                if isinstance(record[field], list):
                    for elem in record[field]:
                        Mrecord.add_field(Field(tag=field, data=elem))
                elif isinstance(record[field], str):
                    Mrecord.add_field(Field(tag=field, data=record[field]))
            else:
                # data field: each occurrence maps an indicator string to
                # its list of subfield dicts
                for subfield in record[field]:
                    for ind, values in subfield.items():
                        indicators = []
                        subfields = []
                        for elem in values:
                            for k, v in elem.items():
                                # flatten {code: value} pairs into pymarc's
                                # [code, value, code, value, ...] layout
                                if isinstance(v, str):
                                    subfields.append(k)
                                    subfields.append(v)
                                elif isinstance(v, list):
                                    # repeated subfield code: one pair per value
                                    for subfield_elem in v:
                                        subfields.append(k)
                                        subfields.append(subfield_elem)
                        # the indicator string is split character by character
                        for elem in ind:
                            indicators.append(elem)
                        Mrecord.add_field(Field(tag=str(field),
                                                indicators=indicators,
                                                subfields=subfields))
    return Mrecord.as_marc()
def element(self, element_dict, name=None):
    """Recursively convert a MARC-in-JSON dict into self._record.

    Entered with no `name` to start a new Record; re-enters itself for
    the 'leader', 'fields' and 'subfields' sections, then hands the
    finished record to self.process_record().
    """
    if not name:
        self._record = Record()
        self.element(element_dict, 'leader')
    elif name == 'leader':
        self._record.leader = element_dict[name]
        self.element(element_dict, 'fields')
    elif name == 'fields':
        fields = iter(element_dict[name])
        for field in fields:
            # each field dict holds a single {tag: content} pair;
            # popitem() extracts it (and mutates the input dict)
            tag, remaining = field.popitem()
            self._field = Field(tag)
            if self._field.is_control_field():
                self._field.data = remaining
            else:
                self.element(remaining, 'subfields')
                self._field.indicators.extend(
                    [remaining['ind1'], remaining['ind2']])
            self._record.add_field(self._field)
        self.process_record(self._record)
    elif name == 'subfields':
        subfields = iter(element_dict[name])
        for subfield in subfields:
            # single {code: text} pair per subfield dict
            code, text = subfield.popitem()
            self._field.add_subfield(code, text)
def test_960_items_nonrepeatable_subfields(self):
    """Each doubled non-repeatable 960 subfield must be reported."""
    bib = Record()
    # Build each code twice (value '9.99' for the price subfield);
    # note 'v' is deliberately doubled twice, as in the original data.
    pairs = []
    for code in ('i', 'l', 'p', 'q', 'o', 't', 'r', 's', 'v', 'n', 'v'):
        value = '9.99' if code == 'p' else 'TEST'
        pairs.extend([code, value, code, value])
    bib.add_field(Field(tag='960', indicators=[' ', ' '], subfields=pairs))
    bibs.write_marc21('specs_test.mrc', bib)
    passed, report = local_specs.local_specs_validation(
        'bpl', ['specs_test.mrc'], self.bcl)
    self.assertFalse(passed)
    for code in ('i', 'l', 'p', 'q', 'o', 't', 'r', 's', 'v', 'n'):
        self.assertIn('"%s" subfield is not repeatable.' % code, report)
def test_add_title(self):
    """add_title builds a 245 with $a/$b/$c, dropping missing parts."""
    edition = self._edition()
    edition.title = "The Good Soldier"
    edition.sort_title = "Good Soldier, The"
    edition.subtitle = "A Tale of Passion"

    # Full case: title, subtitle and author all present.
    record = Record()
    Annotator.add_title(record, edition)
    [field] = record.get_fields("245")
    expected = {
        "a": edition.title,
        "b": edition.subtitle,
        "c": edition.author,
    }
    self._check_field(record, "245", expected, ["0", "4"])

    # If there's no subtitle or no author, those subfields are left out.
    edition.subtitle = None
    edition.author = None
    record = Record()
    Annotator.add_title(record, edition)
    [field] = record.get_fields("245")
    self._check_field(record, "245", {"a": edition.title}, ["0", "4"])
    eq_([], field.get_subfields("b"))
    eq_([], field.get_subfields("c"))
def startElementNS(self, name, qname, attrs):  # NO Stricts
    """SAX start-element handler for RUSMARC-flavoured XML.

    Element names look like "TAG.PARAMETER" (e.g. "FIELD.245"); the
    split below separates the two halves when a parameter is present.
    """
    try:
        element, parameter = name[1].split(".")
    except ValueError:
        # no "." in the name: element only, no parameter
        element = name[1]
    if element == "rusmarc":
        self._record = Record()
    elif element == "mrk":
        self._record.leader = ""
    elif element.startswith("m_"):
        pass  # See endElementNS for implementation
    elif element == "IND":
        # indicators encode blanks as underscores
        self._indicators = parameter.replace("_", " ")
        self._field.subfields = []
    elif element == "FIELD":
        self._field = Field(parameter, [" ", " "])
    elif element == "SUBFIELD":
        self._subfield_code = parameter
    elif element == "RECORDS":
        pass
    else:
        raise RuntimeError("cannot process tag %s" % element)
    # reset the character-data accumulator for this element
    self._text = []
def test_add_simplified_genres(self):
    """Genres appear as 650 fields, for works and materialized works."""
    work = self._work(with_license_pool=True)
    fantasy, ignore = Genre.lookup(self._db, "Fantasy", autocreate=True)
    romance, ignore = Genre.lookup(self._db, "Romance", autocreate=True)
    work.genres = [fantasy, romance]

    def assert_genre_fields(record):
        # Sort by $a so the two 650s come out in a stable order.
        fields = record.get_fields("650")
        fantasy_field, romance_field = sorted(
            fields, key=lambda x: x.get_subfields("a")[0])
        for field, genre_name in ((fantasy_field, "Fantasy"),
                                  (romance_field, "Romance")):
            eq_(["0", "7"], field.indicators)
            eq_(genre_name, field.get_subfields("a")[0])
            eq_("Library Simplified", field.get_subfields("2")[0])

    record = Record()
    Annotator.add_simplified_genres(record, work)
    assert_genre_fields(record)

    # It also works with a materialized work.
    self.add_to_materialized_view([work])
    # The work is in the materialized view twice since it has two genres,
    # but we can use either one.
    [mw, ignore] = self._db.query(MaterializedWorkWithGenre).all()
    record = Record()
    Annotator.add_simplified_genres(record, mw)
    assert_genre_fields(record)
def filter_subject_headings(record: Record, librarySystemId: int) -> List[Field]:
    """Remove subject heading tags that are not supported by our systems.

    Every subject field is removed from `record`; the supported ones are
    returned so the caller can re-add or inspect them.

    Args:
        record: pymarc.record.Record object
        librarySystemId: numeric library system id (1 additionally
            enables Children's LCSH headings)

    Returns:
        list of approved subject Field objects (note: ALL subject
        fields, approved or not, are removed from the record)
    """
    approved_tags = []
    for tag in record.subjects():
        if tag.indicator2 == "0":
            # LCSH
            approved_tags.append(tag)
        elif tag.indicator2 == "1" and librarySystemId == 1:
            # Children's LCSH
            approved_tags.append(tag)
        elif tag.indicator2 == "7":
            # source specified in $2
            src_vocab = tag["2"]
            if src_vocab and is_approved_vacabulary(src_vocab, librarySystemId):
                approved_tags.append(tag)
        record.remove_field(tag)
    return approved_tags
def test_add_series(self):
    """add_series builds a 490 with $a/$v, dropping what is missing."""
    edition = self._edition()
    edition.series = self._str
    edition.series_position = 5

    # With a position both $a and $v are present.
    record = Record()
    Annotator.add_series(record, edition)
    expected = {
        "a": edition.series,
        "v": str(edition.series_position),
    }
    self._check_field(record, "490", expected, ["0", " "])

    # If there's no series position, the same field is used without
    # the v subfield.
    edition.series_position = None
    record = Record()
    Annotator.add_series(record, edition)
    self._check_field(record, "490", {"a": edition.series}, ["0", " "])
    [field] = record.get_fields("490")
    eq_([], field.get_subfields("v"))

    # If there's no series, the field is left out.
    edition.series = None
    record = Record()
    Annotator.add_series(record, edition)
    eq_([], record.get_fields("490"))
def element(self, element_dict, name=None):
    """Converts a JSON `element_dict` to pymarc fields.

    Entered with no `name` to start a new Record; re-enters itself for
    the "leader", "fields" and "subfields" sections, then hands the
    finished record to self.process_record().
    """
    if not name:
        self._record = Record()
        self.element(element_dict, "leader")
    elif name == "leader":
        self._record.leader = element_dict[name]
        self.element(element_dict, "fields")
    elif name == "fields":
        fields = iter(element_dict[name])
        for field in fields:
            # each field dict holds a single {tag: content} pair;
            # popitem() extracts it (and mutates the input dict)
            tag, remaining = field.popitem()
            self._field = Field(tag)
            if self._field.is_control_field():
                self._field.data = remaining
            else:
                self.element(remaining, "subfields")
                self._field.indicators.extend(
                    [remaining["ind1"], remaining["ind2"]])
            self._record.add_field(self._field)
        self.process_record(self._record)
    elif name == "subfields":
        subfields = iter(element_dict[name])
        for subfield in subfields:
            # single {code: text} pair per subfield dict
            code, text = subfield.popitem()
            self._field.add_subfield(code, text)
def test_nypl_branches_BT_SERIES_YA_graphic_novel_compound_name(self):
    """A GRAPHIC call number is split into $f/$a/$c by the patch."""
    bib = Record()
    bib.leader = "00000nam a2200000u 4500"
    for field in (
        Field(tag="001", data="0001"),
        Field(tag="245", indicators=["0", "0"],
              subfields=["a", "Test title"]),
        Field(tag="091", indicators=[" ", " "],
              subfields=["a", "GRAPHIC GN FIC COMPOUND NAME"]),
    ):
        bib.add_ordered_field(field)
    mod_bib = patches.bib_patches("nypl", "branches", "cat", "BT SERIES",
                                  bib)
    field_091 = mod_bib.get_fields("091")[0]
    self.assertEqual([" ", " "], field_091.indicators)
    self.assertEqual(
        ["f", "GRAPHIC", "a", "GN FIC", "c", "COMPOUND NAME"],
        field_091.subfields)
def test_add_contributors(self):
    """Single author -> 100 field; multiple contributors -> 700s."""
    author = "a"
    author2 = "b"
    translator = "c"

    # Edition with one author gets a 100 field and no 700 fields.
    edition = self._edition(authors=[author])
    edition.sort_author = "sorted"
    record = Record()
    Annotator.add_contributors(record, edition)
    eq_([], record.get_fields("700"))
    self._check_field(record, "100", {"a": edition.sort_author}, ["1", " "])

    # Edition with two authors and a translator gets three 700 fields
    # and no 100 fields.
    edition = self._edition(authors=[author, author2])
    edition.add_contributor(translator, Contributor.TRANSLATOR_ROLE)
    record = Record()
    Annotator.add_contributors(record, edition)
    eq_([], record.get_fields("100"))
    fields = record.get_fields("700")
    for field in fields:
        eq_(["1", " "], field.indicators)
    author_field, author2_field, translator_field = sorted(
        fields, key=lambda x: x.get_subfields("a")[0])
    expectations = (
        (author_field, author, Contributor.PRIMARY_AUTHOR_ROLE),
        (author2_field, author2, Contributor.AUTHOR_ROLE),
        (translator_field, translator, Contributor.TRANSLATOR_ROLE),
    )
    for field, name, role in expectations:
        eq_(name, field.get_subfields("a")[0])
        eq_(role, field.get_subfields("e")[0])
def test_nypl_branch_BT_SERIES_Spanish_prefix(self):
    """A 'J SPA' prefixed call number is split into $p/$a/$c."""
    bib = Record()
    bib.leader = "00000nam a2200000u 4500"
    for field in (
        Field(tag="001", data="0001"),
        Field(tag="245", indicators=["0", "0"],
              subfields=["a", "Test title"]),
        Field(tag="091", indicators=[" ", " "],
              subfields=["a", "J SPA E COMPOUND NAME"]),
    ):
        bib.add_ordered_field(field)
    mod_bib = patches.bib_patches("nypl", "branches", "cat", "BT SERIES",
                                  bib)
    field_091 = mod_bib.get_fields("091")[0]
    self.assertEqual([" ", " "], field_091.indicators)
    self.assertEqual(["p", "J SPA", "a", "E", "c", "COMPOUND NAME"],
                     field_091.subfields)
def test_add_physical_description(self):
    """Books and audiobooks get the proper RDA 3xx field sets."""
    book = self._edition()
    book.medium = Edition.BOOK_MEDIUM
    audio = self._edition()
    audio.medium = Edition.AUDIO_MEDIUM

    book_expectations = [
        ("300", {"a": "1 online resource"}),
        ("336", {"a": "text", "b": "txt", "2": "rdacontent"}),
        ("337", {"a": "computer", "b": "c", "2": "rdamedia"}),
        ("338", {"a": "online resource", "b": "cr", "2": "rdacarrier"}),
        ("347", {"a": "text file", "2": "rda"}),
        ("380", {"a": "eBook", "2": "tlcgt"}),
    ]
    record = Record()
    Annotator.add_physical_description(record, book)
    for tag, subfields in book_expectations:
        self._check_field(record, tag, subfields)

    audio_expectations = [
        ("300", {"a": "1 sound file", "b": "digital"}),
        ("336", {"a": "spoken word", "b": "spw", "2": "rdacontent"}),
        ("337", {"a": "computer", "b": "c", "2": "rdamedia"}),
        ("338", {"a": "online resource", "b": "cr", "2": "rdacarrier"}),
        ("347", {"a": "audio file", "2": "rda"}),
    ]
    record = Record()
    Annotator.add_physical_description(record, audio)
    for tag, subfields in audio_expectations:
        self._check_field(record, tag, subfields)
    # Audiobooks get no 380 field.
    eq_([], record.get_fields("380"))
def tearDown(self):
    # Reset the shared test MARC records to fresh, empty Records.
    # NOTE(review): constructing fixtures in tearDown (rather than
    # setUp) is unusual — confirm whether later tests depend on these
    # being rebuilt here.
    # Test MARC record
    # NYPL bib
    self.n_marc = Record()
    # BPL bib
    self.b_marc = Record()
def test_949_items_stat_code_incorrect(self):
    """An invalid stat code in 949$t is flagged by validation."""
    bib = Record()
    field = Field(tag='949', indicators=[' ', '1'],
                  subfields=['t', '600'])
    bib.add_field(field)
    bibs.write_marc21('specs_test.mrc', bib)
    passed, report = local_specs.local_specs_validation(
        'nypl', ['specs_test.mrc'], self.ncl)
    self.assertIn('"t" subfield has incorrect value.', report)
def test_949_items_empty_price_subfield(self):
    """An empty 949$p is reported as an incorrect price format."""
    bib = Record()
    field = Field(tag='949', indicators=[' ', '1'],
                  subfields=['p', ''])
    bib.add_field(field)
    bibs.write_marc21('specs_test.mrc', bib)
    passed, report = local_specs.local_specs_validation(
        'nypl', ['specs_test.mrc'], self.ncl)
    self.assertIn('"p" subfield has incorrect price format.', report)
def test_960_items_incorrect_format(self):
    """An invalid format code in 960$r is flagged by validation."""
    bib = Record()
    field = Field(tag='960', indicators=[' ', ' '],
                  subfields=['r', 'z'])
    bib.add_field(field)
    bibs.write_marc21('specs_test.mrc', bib)
    passed, report = local_specs.local_specs_validation(
        'bpl', ['specs_test.mrc'], self.bcl)
    self.assertIn('"r" subfield has incorrect value.', report)
def test_949_subfield_a_mandatory(self):
    """A 949 without $a fails validation as missing a mandatory subfield."""
    bib = Record()
    bib.add_field(Field(tag='949', indicators=[' ', ' '], subfields=[]))
    bibs.write_marc21('specs_test.mrc', bib)
    passed, report = local_specs.local_specs_validation(
        'nypl', ['specs_test.mrc'], self.ncl)
    self.assertFalse(passed)
    self.assertIn('"a" subfield is mandatory.', report)
def test_960_items_correct_price_format(self):
    """A well-formed price in 960$p produces no price-format error."""
    bib = Record()
    field = Field(tag='960', indicators=[' ', '1'],
                  subfields=['p', '9.99'])
    bib.add_field(field)
    bibs.write_marc21('specs_test.mrc', bib)
    passed, report = local_specs.local_specs_validation(
        'bpl', ['specs_test.mrc'], self.bcl)
    self.assertNotIn('"p" subfield has incorrect price format.', report)
def test_949_items_barcode_not_digits(self):
    """A non-numeric 949$i barcode fails validation."""
    bib = Record()
    field = Field(tag='949', indicators=[' ', '1'],
                  subfields=['i', 'TEST'])
    bib.add_field(field)
    bibs.write_marc21('specs_test.mrc', bib)
    passed, report = local_specs.local_specs_validation(
        'nypl', ['specs_test.mrc'], self.ncl)
    self.assertFalse(passed)
    self.assertIn('"i" subfield has incorrect barcode.', report)
def test_947_incorrect_subfield_a_value(self):
    """An invalid 947$a value fails validation."""
    bib = Record()
    field = Field(tag='947', indicators=[' ', ' '],
                  subfields=['a', 'TEST'])
    bib.add_field(field)
    bibs.write_marc21('specs_test.mrc', bib)
    passed, report = local_specs.local_specs_validation(
        'bpl', ['specs_test.mrc'], self.bcl)
    self.assertFalse(passed)
    self.assertIn('"a" subfield has incorrect value', report)
def test_960_items_mandatory(self):
    """A 960 with the wrong indicators does not satisfy the mandatory-tag rule."""
    bib = Record()
    field = Field(tag='960', indicators=[' ', '1'],
                  subfields=['a', 'TEST'])
    bib.add_field(field)
    bibs.write_marc21('specs_test.mrc', bib)
    passed, report = local_specs.local_specs_validation(
        'bpl', ['specs_test.mrc'], self.bcl)
    self.assertFalse(passed)
    self.assertIn('"960 " mandatory tag not found.', report)
def test_bib_with_vendor_910_tag(self):
    """The Amalivre patch replaces an existing 910 with $a RL."""
    bib = Record()
    vendor_field = Field(tag="910", indicators=[" ", " "],
                         subfields=["a", "foo"])
    bib.add_field(vendor_field)
    patches.bib_patches("nypl", "research", "acq", "Amalivre", bib)
    fields_910 = bib.get_fields("910")
    self.assertEqual(len(fields_910), 1)
    self.assertEqual(str(bib["910"]), "=910 \\\\$aRL")
def test_091_no_subfield_a(self):
    """A call number tag lacking $a fails the mandatory-subfield rule."""
    bib = Record()
    field = Field(tag='099', indicators=[' ', ' '],
                  subfields=['p', 'TEST'])
    bib.add_field(field)
    bibs.write_marc21('specs_test.mrc', bib)
    passed, report = local_specs.local_specs_validation(
        'bpl', ['specs_test.mrc'], self.bcl)
    self.assertFalse(passed)
    self.assertIn('"a" subfield is mandatory.', report)
def test_960_items_incorrect_location(self):
    """An invalid location code in 960$l fails validation."""
    bib = Record()
    field = Field(tag='960', indicators=[' ', ' '],
                  subfields=['l', 'mma0l'])
    bib.add_field(field)
    bibs.write_marc21('specs_test.mrc', bib)
    passed, report = local_specs.local_specs_validation(
        'bpl', ['specs_test.mrc'], self.bcl)
    self.assertFalse(passed)
    self.assertIn('"l" subfield has incorrect location code.', report)
def create_record(cls, work, annotator, force_create=False, integration=None):
    """Build a complete MARC record for a given work.

    Reuses the cached MARC blob stored on the work (under
    annotator.marc_cache_field) unless `force_create` is set; otherwise
    builds a new record and caches it. Fields that must stay fresh are
    added after caching via annotate_work_record(). Returns a pymarc
    Record, or None when the work has no active license pool.
    """
    if callable(annotator):
        annotator = annotator()
    if isinstance(work, BaseMaterializedWork):
        pool = work.license_pool
    else:
        pool = work.active_license_pool()
    if not pool:
        return None
    edition = pool.presentation_edition
    identifier = pool.identifier
    _db = Session.object_session(work)
    record = None
    existing_record = getattr(work, annotator.marc_cache_field)
    if existing_record and not force_create:
        # rehydrate the cached serialized record
        record = Record(data=existing_record.encode('utf-8'),
                        force_utf8=True)
    if not record:
        record = Record(leader=annotator.leader(work), force_utf8=True)
        annotator.add_control_fields(record, identifier, pool, edition)
        annotator.add_isbn(record, identifier)
        # TODO: The 240 and 130 fields are for translated works, so they
        # can be grouped even though they have different titles. We do
        # not group editions of the same work in different languages, so
        # we can't use those yet.
        annotator.add_title(record, edition)
        annotator.add_contributors(record, edition)
        annotator.add_publisher(record, edition)
        annotator.add_physical_description(record, edition)
        annotator.add_audience(record, work)
        annotator.add_series(record, edition)
        annotator.add_system_details(record)
        annotator.add_ebooks_subject(record)
        data = record.as_marc()
        if isinstance(work, BaseMaterializedWork):
            setattr(pool.work, annotator.marc_cache_field, data)
        else:
            setattr(work, annotator.marc_cache_field, data)
    # Add additional fields that should not be cached.
    annotator.annotate_work_record(work, pool, edition, identifier,
                                   record, integration)
    return record
def test_949_subfield_a_incorrect_value(self):
    """A malformed 949$a command string fails validation."""
    bib = Record()
    # missing * in the begining
    field = Field(tag='949', indicators=[' ', ' '],
                  subfields=['a', 'b2=a;'])
    bib.add_field(field)
    bibs.write_marc21('specs_test.mrc', bib)
    passed, report = local_specs.local_specs_validation(
        'nypl', ['specs_test.mrc'], self.ncl)
    self.assertFalse(passed)
    self.assertIn('"a" subfield has incorrect value', report)
def test_960_items_good_barcode(self):
    """A well-formed 14-digit barcode in 960$i produces no error."""
    bib = Record()
    field = Field(tag='960', indicators=[' ', '1'],
                  subfields=['i', '34444987954328'])
    bib.add_field(field)
    bibs.write_marc21('specs_test.mrc', bib)
    passed, report = local_specs.local_specs_validation(
        'bpl', ['specs_test.mrc'], self.bcl)
    self.assertNotIn('"i" subfield has incorrect barcode.', report)
def test_unicode(self):
    """Round-trip a non-ASCII subfield through the writer and reader.

    Fix: MARC transmission files are binary, so the file is now opened
    in binary mode ('wb'/'rb'); text mode corrupts the payload on
    platforms that translate line endings. Also uses the builtin
    next() instead of the iterator's .next() method.
    """
    record = Record()
    record.add_field(Field(245, ['1', '0'], ['a', unichr(0x1234)]))
    writer = MARCWriter(open('test/foo', 'wb'))
    writer.write(record)
    writer.close()
    reader = MARCReader(open('test/foo', 'rb'))
    record = next(reader)
    self.assertEqual(record['245']['a'], unichr(0x1234))
def callZ3950(search_id, target, depth=0):
    # Query a Z39.50 server for a MARC record by identifier.
    # target 'UIU' -> UIUC/CARLI Voyager (PQF attr 1=12 = local id);
    # anything else -> Library of Congress authority files
    # (PQF attr 1=9 = LCCN; names go to NAF, subjects to SAF).
    # Retries up to 20 times with a wait between attempts.
    # Returns (Record, None) on success, (False, msg) on decode/leader
    # errors, or (None, 'RECORD NOT FOUND') after exhausting retries.
    if target == 'UIU':
        print "UIUC NUMBER: ", search_id
        query = zoom.Query('PQF', '@attr 1=12 %s' % str(search_id))
        database_address = 'z3950.carli.illinois.edu'
        username = '******'
        database_name = 'voyager'
    else:
        print "LC NUMBER: ", search_id
        query = zoom.Query('PQF', '@attr 1=9 %s' % str(formatLCCN(search_id)))
        database_address = 'lx2.loc.gov'
        username = ''
        # 'n' prefixed ids are name authorities, others subject authorities
        if 'n' in search_id:
            database_name = 'NAF'
        else:
            database_name = 'SAF'
    # conn = establishZ3950Connection(database_address,210,username,database_name)
    res = queryZ3950(database_address, username, database_name, query)
    print len(res)
    print res
    if len(res) > 0:
        for r in res:
            # first 24 bytes of a MARC record are the leader
            valid_leader = checkLeader(r.data[:24])
            if valid_leader:
                if len(res) > 1:
                    try:
                        new_record = Record(data=r.data)
                    except UnicodeDecodeError:
                        return (False, 'BROKEN CHARACTER IN RECORD')
                    # multiple hits: pick the one whose 001 control
                    # number matches the id we searched for
                    lccn = new_record.get_fields('001')[0].data.replace(
                        " ", "")
                    if lccn == search_id:
                        marc_record = new_record
                        fixNames(marc_record)
                else:
                    try:
                        marc_record = Record(data=r.data)
                    except UnicodeDecodeError:
                        return (False, 'BROKEN CHARACTER IN RECORD')
                    fixNames(marc_record)
            else:
                return (False, 'BROKEN LEADER')
        return (marc_record, None)
    elif depth < 20:
        # nothing found yet: throttle, then retry recursively
        waitSixSeconds(datetime.datetime.now().time())
        return callZ3950(search_id, target, depth=depth + 1)
    else:
        return (None, 'RECORD NOT FOUND')
def setUp(self):
    """Create a minimal bib record shared by the tests."""
    self.bib = Record()
    self.bib.leader = "00000nam a2200000u 4500"
    for field in (
        Field(tag="001", data="0001"),
        Field(tag="245", indicators=["0", "0"],
              subfields=["a", "Test title"]),
    ):
        self.bib.add_ordered_field(field)
def test_947_nonrepeatable_subfield_a(self):
    """A doubled 947$a is reported as non-repeatable."""
    bib = Record()
    field = Field(tag='947', indicators=[' ', ' '],
                  subfields=['a', 'TEST', 'a', 'TEST1'])
    bib.add_field(field)
    bibs.write_marc21('specs_test.mrc', bib)
    passed, report = local_specs.local_specs_validation(
        'bpl', ['specs_test.mrc'], self.bcl)
    self.assertFalse(passed)
    self.assertIn('"a" subfield is not repeatable.', report)
class TestFindMatches(unittest.TestCase):
    """Tests for vendors.find_matches condition counting."""

    def setUp(self):
        def make_bib(with_001=True):
            # Shared skeleton: 245 + 901; 001 is optional.
            bib = Record()
            bib.add_field(Field(tag='245', indicators=['0', '0'],
                                subfields=['a', 'Test ']))
            bib.add_field(Field(tag='901', indicators=[' ', ' '],
                                subfields=['a', 'abcd']))
            if with_001:
                bib.add_field(Field(tag='001', data='1234'))
            return bib

        self.bib1 = make_bib()
        self.bib2 = make_bib(with_001=False)

    def test_2_matches(self):
        conditions = [('901', 'a', 'abcd'), ('001', None, '1234')]
        self.assertEqual(vendors.find_matches(self.bib1, conditions), 2)

    def test_only_1_match(self):
        conditions = [('901', 'a', 'abcd'), ('001', None, '12345')]
        self.assertEqual(vendors.find_matches(self.bib1, conditions), 1)

    def test_bib_missing_tag(self):
        conditions = [('901', 'a', 'abcd'), ('001', None, '1234')]
        self.assertEqual(vendors.find_matches(self.bib2, conditions), 1)
def test_writing_unicode(self):
    """Round-trip a non-ASCII subfield, cleaning up the temp file.

    Fix: MARC transmission files are binary, so open them in binary
    mode ('wb'/'rb'); text mode corrupts records on platforms that
    translate line endings. Also uses the builtin next() instead of
    the iterator's .next() method.
    """
    record = Record()
    record.add_field(Field(245, ['1', '0'], ['a', unichr(0x1234)]))
    record.leader = ' a '
    writer = MARCWriter(open('test/foo', 'wb'))
    writer.write(record)
    writer.close()
    reader = MARCReader(open('test/foo', 'rb'), to_unicode=True)
    record = next(reader)
    self.assertEqual(record['245']['a'], unichr(0x1234))
    os.remove('test/foo')
def test_091_subfields(self):
    """A repeated $a must not be misreported as a repeated $p."""
    bib = Record()
    field = Field(tag='099', indicators=[' ', ' '],
                  subfields=['a', 'TEST', 'a', 'TEST2'])
    bib.add_field(field)
    bibs.write_marc21('specs_test.mrc', bib)
    passed, report = local_specs.local_specs_validation(
        'bpl', ['specs_test.mrc'], self.bcl)
    self.assertFalse(passed)
    self.assertNotIn(
        '"099 ": tag occurance 1:\n\t"p" subfield is not repeatable.',
        report)
def test_writing_unicode(self):
    """A non-ASCII subfield survives a write/read round trip."""
    outgoing = Record()
    outgoing.add_field(Field(245, ['1', '0'], ['a', unichr(0x1234)]))
    outgoing.leader = ' a '
    writer = MARCWriter(open('test/foo', 'wb'))
    writer.write(outgoing)
    writer.close()
    reader = MARCReader(open('test/foo', 'rb'), to_unicode=True)
    restored = next(reader)
    self.assertEqual(restored['245']['a'], unichr(0x1234))
    reader.close()
    os.remove('test/foo')
def sort_6_subs(rec):
    # Return a copy of `rec` in which any data field containing a $6
    # linkage subfield has that $6 moved to the first position; fields
    # without $6 (and control fields) are carried over unchanged.
    msg = ''
    new_rec = Record(to_unicode=True, force_utf8=True)
    new_rec_fields = []
    rec_fields = rec.get_fields()
    for field in rec_fields:
        script_field = False
        if not field.is_control_field() and (len(field.get_subfields('6')) > 0):
            # the field contains a subfield $6
            script_field = True
            ind1 = field.indicator1
            ind2 = field.indicator2
            tag = field.tag
            first_sub = True  # variable to keep track of whether you're on the first subfield in the field
            needs_sorted = True  # variable to keep track of whether the field needs sorted or if the $6 is already correctly the first subfield
            field_subs = []  # list variable to capture all the subfields in the field *except* for the subfield $6
            for subfield in field:
                # check if $6 is the first subfield - if so, the field is OK and does *not* need to be sorted
                if needs_sorted and first_sub and subfield[0] == '6':
                    needs_sorted = False
                elif needs_sorted:
                    if first_sub:
                        # this is the first subfield and is *not* $6, so the field needs sorted - creates one instance of a new_field object only when the 1st subfield is encountered
                        new_field = Field(tag=tag, indicators=[ind1,ind2], subfields=[])
                    # when subfield $6 is finally encountered in the field (not the 1st), add it to the new_field object now so it becomes the first subfield
                    # Note: subfield[0] is the subfield code and subfield[1] is the subfield content for this subfield
                    if subfield[0]=='6':
                        new_field.add_subfield(subfield[0],subfield[1])
                    # if the subfield is *not* $6, add it to the list of subfields to be added later to the new_field
                    else:
                        field_subs.append([subfield[0],subfield[1]])
                first_sub = False
            if needs_sorted:
                # then the $6 was *not* the 1st subfield and we need to now add the remaining subfields to the new_field object
                for sub in field_subs:
                    # add the remaining subfields to the new_field object
                    new_field.add_subfield(sub[0],sub[1])
                new_rec_fields.append(new_field)  # add the new field to the record
        if not script_field or not needs_sorted:
            # field was untouched ($6 already first, or no $6 at all):
            # carry the original field over as-is
            new_rec_fields.append(field)
    for new_f in new_rec_fields:
        new_rec.add_field(new_f)
    return new_rec
def writeMetadataToMarc(data, MARCMapping, saveLocation):
    """Build a MARC record from `data` via MARCMapping and save it.

    MARCMapping maps metadata keys either to a 3-char control-field tag
    (for the 'UUID' key) or to a 4-char string of tag + subfield code.

    Bug fix: the record is now written to the `saveLocation` argument;
    previously it was passed an undefined name `filename`.
    """
    record = Record()
    for key in data:
        if key in MARCMapping:
            if key == u'UUID':
                # control field: tag only, raw data
                field = Field(
                    tag = MARCMapping[key],
                    data = data[key])
            else:
                # data field: chars 0-2 are the tag, char 3 the
                # subfield code
                field = Field(
                    tag = MARCMapping[key][:3],
                    subfields = [MARCMapping[key][3], data[key]],
                    indicators=['0', '0'])
            record.add_field(field)
    writeRecordToFile(record, saveLocation)
def faulty015(record: Record) -> bool:
    """Return True when any 015$a contains more than one
    space-separated part."""
    for field in record.get_fields("015"):
        if "a" in field and len(field["a"].split(' ')) > 1:
            return True
    return False
def __next__(self):
    """Build the next pymarc Record from the underlying JSON iterator."""
    obj = next(self.iter)
    record = Record()
    record.leader = obj['leader']
    for entry in obj['fields']:
        tag, content = list(entry.items())[0]
        if 'subfields' in content and hasattr(content, 'update'):
            # flatten m-i-j dict to list in pymarc
            flattened = []
            for pair in content['subfields']:
                for code, value in pair.items():
                    flattened.extend((code, value))
            new_field = Field(tag=tag, subfields=flattened,
                              indicators=[content['ind1'], content['ind2']])
        else:
            new_field = Field(tag=tag, data=content)
        record.add_field(new_field)
    return record
def empty020a(record: Record) -> bool:
    """Return True when the record's 020$a content is exactly one
    empty string."""
    values = []
    if "020" in record:
        for field in record.get_fields("020"):
            if "a" in field:
                # take the last $a occurrence of this field
                values.append(field.get_subfields("a")[-1])
    return values == [""]
def phrasesInFields(record: Record, phrases: list, fields: list) -> bool:
    '''Takes a record object, a list of strings and a list of fields
    (as strings). Returns True when any listed field's value contains
    any of the phrases (field values lowercased first).'''
    return any(
        phrase in candidate.value().lower()
        for tag in fields
        for candidate in record.get_fields(tag)
        for phrase in phrases)
def test_add_formats(self):
    """Each delivery mechanism yields its own 538 format field."""
    edition, pool = self._edition(with_license_pool=True)
    epub_no_drm, ignore = DeliveryMechanism.lookup(
        self._db, Representation.EPUB_MEDIA_TYPE, DeliveryMechanism.NO_DRM)
    pool.delivery_mechanisms[0].delivery_mechanism = epub_no_drm
    LicensePoolDeliveryMechanism.set(
        pool.data_source, pool.identifier, Representation.PDF_MEDIA_TYPE,
        DeliveryMechanism.ADOBE_DRM, RightsStatus.IN_COPYRIGHT)

    record = Record()
    Annotator.add_formats(record, pool)
    fields = record.get_fields("538")
    eq_(2, len(fields))
    # Sorting by $a puts "Adobe PDF eBook" before "EPUB eBook".
    pdf, epub = sorted(fields, key=lambda x: x.get_subfields("a")[0])
    for field, label in ((pdf, "Adobe PDF eBook"), (epub, "EPUB eBook")):
        eq_(label, field.get_subfields("a")[0])
        eq_([" ", " "], field.indicators)
def periodsMissing(record: Record) -> bool:
    """Return True when any relator term in $e of a name heading
    (100/110/700/710) ends with a letter (i.e. lacks a final period)."""
    for field in record.get_fields("100", "110", "700", "710"):
        if "e" in field:
            if any(term[-1].isalpha() for term in field.get_subfields("e")):
                return True
    return False
def decode_record(self, record):
    r"""Parse a screen-scraped pseudo-MARC text block into a pymarc Record.

    Returns None when the input has no LEADER line or the parsed record
    lacks a 245 title field.

    >>> reader = Reader('http://opac.uthsc.edu', 2)
    >>> raw = "\nLEADER 00000cas 2200517 a 4500 \n001 1481253 \n003 OCoLC \n005 19951109120000.0 \n008 750727c19589999fr qrzp b 0 b0fre d \n010 sn 86012727 \n022 0003-3995 \n030 AGTQAH \n035 0062827|bMULS|aPITT NO. 0639600000|asa64872000|bFULS \n040 MUL|cMUL|dFUL|dOCL|dCOO|dNYG|dHUL|dSER|dAIP|dNST|dAGL|dDLC\n |dTUM \n041 0 engfre|bgeritaspa \n042 nsdp \n049 TUMS \n069 1 A32025000 \n210 0 Ann. genet. \n222 0 Annales de genetique \n229 00 Annales de genetique \n229 Ann Genet \n242 00 Annals on genetics \n245 00 Annales de genetique. \n260 Paris :|bExpansion scientifique,|c1958-2004. \n300 v. :|bill. ;|c28 cm. \n310 Quarterly \n321 Two no. a year \n362 0 1,1958-47,2004. \n510 1 Excerpta medica \n510 1 Index medicus|x0019-3879 \n510 2 Biological abstracts|x0006-3169 \n510 2 Chemical abstracts|x0009-2258 \n510 2 Life sciences collection \n510 0 Bulletin signaletique \n510 0 Current contents \n546 French and English, with summaries in German, Italian, and\n Spanish. \n550 Journal of the Societe francaise de genetique. \n650 2 Genetics|vPeriodicals. \n710 2 Societ\xe9 fran\xe7aise de genetique. \n785 00 |tEuropean journal of medical genetics. \n856 41 |uhttp://library.uthsc.edu/ems/eresource/3581|zFull text \n at ScienceDirect: 43(1) Jan 2000 - 47(4) Dec 2004 \n936 Unknown|ajuin 1977 \n"
    >>> record = reader.decode_record(raw)
    >>> print record.title
    Annales de genetique
    """
    pseudo_marc = record.strip().split('\n')
    raw_fields = []
    if pseudo_marc[0][0:6] == 'LEADER':
        record = Record()
        record.leader = pseudo_marc[0][7:].strip()
    else:
        return None
    # First pass: group continuation lines with their parent field.
    for field in pseudo_marc[1:]:
        tag = field[:3]
        data = unescape_entities(field[6:].decode('latin1')).encode('utf8')
        if tag.startswith(' '):
            # continuation line: append to the previous field's data
            # Additional field data needs to be prepended with an extra space
            # for certain fields ...
            #for special_tag in ('55','260'):
            #    data = " %s" % (data,) if tag.startswith(special_tag) else data
            data = " %s" % (data.strip(),)
            raw_fields[-1]['value'] = "%s%s" % (raw_fields[-1]['value'], data)
            raw_fields[-1]['raw'] = "%s%s" % (raw_fields[-1]['raw'], field.strip())
        else:
            # data fields get an implicit leading $a; control fields
            # (numeric tags below 010) are kept verbatim
            data = data if (tag < '010' and tag.isdigit()) else "a%s" % (data,)
            raw_fields.append({
                'tag': tag,
                'indicator1': field[3],
                'indicator2': field[4],
                'value': data.strip(),
                'raw': field.strip()
            })
    # Second pass: build pymarc Field objects, splitting "|" subfields.
    for raw in raw_fields:
        tag = raw['tag']
        data = raw['value'].strip()
        field = Field(tag=tag,
                      indicators=[raw['indicator1'], raw['indicator2']],
                      data=data)
        if not field.is_control_field():
            for sub in data.split('|'):
                try:
                    field.add_subfield(sub[0].strip(), sub[1:].strip())
                except Exception:
                    # Skip blank/empty subfields
                    continue
        record.add_field(field)
    record.parse_leader()
    # Disregard record if no title present
    if not record.get_fields('245'):
        return None
    else:
        return record
def test_add_publisher(self):
    """add_publisher builds a 264 from publisher and issue year."""
    edition = self._edition()
    edition.publisher = self._str
    edition.issued = datetime.datetime(1894, 4, 5)

    record = Record()
    Annotator.add_publisher(record, edition)
    expected = {
        "a": "[Place of publication not identified]",
        "b": edition.publisher,
        "c": "1894",
    }
    self._check_field(record, "264", expected, [" ", "1"])

    # If there's no publisher, the field is left out.
    edition.publisher = None
    record = Record()
    Annotator.add_publisher(record, edition)
    eq_([], record.get_fields("264"))
def test_add_isbn(self):
    """add_isbn finds an ISBN directly, via equivalence, or not at all."""
    isbn = self._identifier(identifier_type=Identifier.ISBN)
    record = Record()
    Annotator.add_isbn(record, isbn)
    self._check_field(record, "020", {"a": isbn.identifier})

    # If the identifier isn't an ISBN, but has an equivalent that is,
    # it still works.
    equivalent = self._identifier()
    data_source = DataSource.lookup(self._db, DataSource.OCLC)
    equivalent.equivalent_to(data_source, isbn, 1)
    record = Record()
    Annotator.add_isbn(record, equivalent)
    self._check_field(record, "020", {"a": isbn.identifier})

    # If there is no ISBN, the field is left out.
    non_isbn = self._identifier()
    record = Record()
    Annotator.add_isbn(record, non_isbn)
    eq_([], record.get_fields("020"))
def textIn020z(record: Record) -> bool:
    """Return True when the record's 020$z subfields collectively
    contain more than one alphabetic character."""
    if "020" not in record:
        return False
    # last $z occurrence of each 020 field that has one
    values = [field.get_subfields("z")[-1]
              for field in record.get_fields("020") if "z" in field]
    if not values:
        return False
    letter_count = sum(1 for v in values for ch in v if ch.isalpha())
    return letter_count > 1
def test_add_control_fields(self):
    """Control fields reflect identifier, formats, date and language."""

    def check_control_fields(record, identifier, f007, f008):
        self._check_control_field(record, "001", identifier.urn)
        assert now.strftime("%Y%m%d") in record.get_fields("005")[0].value()
        self._check_control_field(record, "006", "m d ")
        self._check_control_field(record, "007", f007)
        self._check_control_field(record, "008", f008)

    # This edition has one format and was published before 1900.
    edition, pool = self._edition(with_license_pool=True)
    identifier = pool.identifier
    edition.issued = datetime.datetime(956, 1, 1)
    now = datetime.datetime.now()
    record = Record()
    Annotator.add_control_fields(record, identifier, pool, edition)
    check_control_fields(
        record, identifier, "cr cn ---anuuu",
        now.strftime("%y%m%d") + "s0956 xxu eng ")

    # This French edition has two formats and was published in 2018.
    edition2, pool2 = self._edition(with_license_pool=True)
    identifier2 = pool2.identifier
    edition2.issued = datetime.datetime(2018, 2, 3)
    edition2.language = "fre"
    LicensePoolDeliveryMechanism.set(
        pool2.data_source, identifier2, Representation.PDF_MEDIA_TYPE,
        DeliveryMechanism.ADOBE_DRM, RightsStatus.IN_COPYRIGHT)
    record = Record()
    Annotator.add_control_fields(record, identifier2, pool2, edition2)
    check_control_fields(
        record, identifier2, "cr cn ---mnuuu",
        now.strftime("%y%m%d") + "s2018 xxu fre ")
def startElementNS(self, name, qname, attrs):
    """SAX start-element hook: open a Record, Field, or subfield
    accumulator depending on the MARCXML element encountered.
    """
    # In strict mode, ignore elements outside the MARCXML namespace.
    if self._strict and name[0] != MARC_XML_NS:
        return
    node = name[1]
    # Each new element resets the character-data buffer.
    self._text = []
    if node == 'record':
        self._record = Record()
    elif node == 'controlfield':
        self._field = Field(attrs.getValue((None, u'tag')))
    elif node == 'datafield':
        self._field = Field(
            attrs.getValue((None, u'tag')),
            [attrs.get((None, u'ind1'), u' '),
             attrs.get((None, u'ind2'), u' ')])
    elif node == 'subfield':
        self._subfield_code = attrs[(None, 'code')]
def test_add_web_client_urls(self):
    """856 fields come from a library-registry web-client URL and/or a
    URL configured on the MARC export integration.
    """
    annotator = LibraryAnnotator(self._default_library)
    ident = self._identifier(foreign_id="identifier")

    # With no web catalog URL configured, no 856 is produced.
    marc = Record()
    annotator.add_web_client_urls(marc, self._default_library, ident)
    eq_([], marc.get_fields("856"))

    # Configure a URL through a library registry integration.
    registry = self._external_integration(
        ExternalIntegration.OPDS_REGISTRATION,
        ExternalIntegration.DISCOVERY_GOAL,
        libraries=[self._default_library])
    ConfigurationSetting.for_library_and_externalintegration(
        self._db, Registration.LIBRARY_REGISTRATION_WEB_CLIENT,
        self._default_library, registry).value = "http://web_catalog"
    marc = Record()
    annotator.add_web_client_urls(marc, self._default_library, ident)
    [registry_field] = marc.get_fields("856")
    eq_(["4", "0"], registry_field.indicators)
    eq_("http://web_catalog/book/Gutenberg%20ID%2Fidentifier",
        registry_field.get_subfields("u")[0])

    # Additionally configure a URL on a MARC export integration; the
    # manually configured URL appears first, the registry one second.
    integration = self._external_integration(
        ExternalIntegration.MARC_EXPORT, ExternalIntegration.CATALOG_GOAL,
        libraries=[self._default_library])
    ConfigurationSetting.for_library_and_externalintegration(
        self._db, MARCExporter.WEB_CLIENT_URL, self._default_library,
        integration).value = "http://another_web_catalog"
    marc = Record()
    annotator.add_web_client_urls(
        marc, self._default_library, ident, integration)
    [manual_field, registry_field] = marc.get_fields("856")
    eq_(["4", "0"], manual_field.indicators)
    eq_("http://another_web_catalog/book/Gutenberg%20ID%2Fidentifier",
        manual_field.get_subfields("u")[0])
    eq_(["4", "0"], registry_field.indicators)
    eq_("http://web_catalog/book/Gutenberg%20ID%2Fidentifier",
        registry_field.get_subfields("u")[0])
def handleXLSX(mapping, sheet, outputFolder, rowsToIgnore): # mapping as defined in mapping function loadMapping # sheet is the specific excel sheet that contains the data we're extracting # rowsToIgnore is the number of rows at top of sheet that we don't care about for row in sheet.rows[rowsToIgnore:]: record = Record() # for every key in our map for i, key_entry in enumerate(mapping): key = key_entry["field"] # if the key isn't empty (i.e. we're not mapping that column) if key: addField(record, key_entry, [key[3], unicode(row[i].value)]) if DEBUG: print record.title() if record.isbn() <> None: # Create unique UUID & add to record bookUUID = str(uuid.uuid1()) record.add_field(Field(tag='001', data=bookUUID)) # Write out our marcxml for each row writeMARCXML(record, os.path.join(outputFolder, record.isbn() + '.xml'))
''' script to migrate book information to koha database ''' from pymarc import Record,Field,record_to_xml import MySQLdb record=Record() print dir(record) dbLo=MySQLdb.connect("localhost","shailesh","123","shailesh") dbKoha=MySQLdb.connect("localhost","root","","koha1") curaKoha=dbKoha.cursor() curaLocl = dbLo.cursor() curaLocl.execute("select BookNo,BookName,authorName from book_info group by BookNo;") dat=curaLocl.fetchall() curaLocl.execute("select accession,BookNo,callNo from book_info;") datIte=curaLocl.fetchall() for i in dat: record=Record() record.add_field(Field(tag='040',indicators=['0','1'],subfields=['c','LIBRARY OF CONGRESS'])) record.add_field(Field(tag='245',indicators=['0','1'],subfields=['a',i[1]])) record.add_field(Field(tag='942',indicators=['0','1'],subfields=['2','book of parag','c','BOOK'])) record.add_field(Field(tag='100',indicators=['0','1'],subfields=['a',i[2]])) record.add_field(Field(tag='999',indicators=['0','1'],subfields=['c','8','d','8'])) marcI=record_to_xml(record) #print i[0],i[1],i[2] curaKoha.execute("insert into biblio(biblionumber,title,author) values(%s,%s,%s);",(i[0],i[1],i[2])) curaKoha.execute("insert into biblioitems(biblionumber,biblioitemnumber,marcxml) values(%s,%s,%s);",(i[0],i[0],marcI)) for i in datIte: barcode='1111'+str(i[0]) curaKoha.execute("insert into items(itemnumber,biblionumber,biblioitemnumber,barcode,itemcallnumber) values(%s,%s,%s,%s,%s);",(i[0],i[1],i[1],barcode,i[2]))
def format_language(langcode):
    # Render a language code as a '<LNG>'-prefixed string; unknown
    # codes fall back to PRIMARYLANG via the LANGUAGES lookup table.
    return '<LNG>' + LANGUAGES.get(langcode, PRIMARYLANG)


def preflabel_of(conc):
    # Return the preferred label of a SKOS concept in PRIMARYLANG, or
    # '' (after a warning on stderr) when the concept has no such label.
    labels = g.preferredLabel(conc, lang=PRIMARYLANG)
    try:
        return labels[0][1]
    except IndexError:
        print >>sys.stderr, "WARNING: couldn't find label of %s, result: %s" % (conc, labels)
        return ''


# Build one MARC record per SKOS concept in the graph, in sorted order,
# skipping concepts explicitly marked owl:deprecated.
for conc in sorted(g.subjects(RDF.type, SKOS.Concept)):
    if (conc, OWL.deprecated, Literal(True)) in g:
        continue
    rec = Record(leader=LEADER)
    # URI -> 001
    rec.add_field(Field(tag='001', data=conc))
    # dct:modified -> 005
    mod = g.value(conc, DCT.modified, None)
    if mod is None:
        # No modification date in the graph; use a fixed default date.
        modified = datetime.date(2000, 1, 1)
    else:
        modified = mod.toPython()  # datetime.date or datetime.datetime object
parsed_html = getParsedHTML(url) # Get raw publication metadata trList = parsed_html.find(class_="table itemDisplayTable").find_all('tr') # Sort metadata for tr in trList: data[tr.find(class_="metadataFieldLabel").string[:-2].lower()] = tr.find(class_="metadataFieldValue").string # Record where all this data came from data[u'originURL'] = url # If the pdf link exists, download book and print metadata aPdf = parsed_html.find('td', headers='t0', class_="standard").a if (aPdf): # Get PDF pdfUrl = 'http://apps.who.int' + aPdf['href'] urllib.urlretrieve(pdfUrl, SAVE_LOCATION + data['UUID'] + '.' + url.rsplit('.')[-1]) # Print metadata in MARCXML record = Record() for key in data: if key in MARCMapping: if(key == u'UUID'): field = Field( tag = MARCMapping[key], data = data[key]) else: field = Field( tag = MARCMapping[key][:3], subfields = [MARCMapping[key][3], data[key]], indicators=['0', '0']) record.add_field(field) writer = XMLWriter(open(SAVE_LOCATION + data[u'UUID'] + '.xml', 'wb')) writer.write(record) writer.close()
#------------------------------------------------- f = open('ostinos.csv') csv_f = csv.reader(f) out = open ('osti_recs.csv', 'w') data = csv.writer(out) data.writerow(['Title', 'Author', 'Date', 'Subjects', 'Description', 'OstiID', 'DOI', 'Report Number', 'DOE Number', 'URL', '']) marcOut = open('ostimarc.mrc', 'w') dc = '{http://purl.org/dc/elements/1.1/}' dcq = '{http://purl.org/dc/terms/}' for number in csv_f: ostiId = number[0] marc = Record() # Create a new record for each loop. tree = etree.parse('http://www.osti.gov/scitech/scitechxml?Identifier='+ ostiId+ '.xml') for node in tree.iter(): if node.tag == dc + 'ostiId': if node.text == ostiId: o = node.getparent() osti = o.getchildren() getRecs(osti, data) getMarc(osti, marc) marcOut.write(marc.as_marc()) # Write each new record.
class XmlHandler(ContentHandler):
    """SAX content handler that assembles pymarc.Record objects from
    MARCXML.

    Subclass XmlHandler and override ``process_record`` to stream each
    record elsewhere (like to a rdbms) as it becomes available, instead
    of accumulating them all in memory in ``self.records``.
    """

    def __init__(self, strict=False, normalize_form=None):
        # Parser state: the record/field/subfield currently being built
        # and the character-data buffer.
        self._record = None
        self._field = None
        self._subfield_code = None
        self._text = []
        # Completed records and configuration.
        self.records = []
        self._strict = strict
        self.normalize_form = normalize_form

    def startElementNS(self, name, qname, attrs):
        """Open the appropriate accumulator for the element just begun."""
        # In strict mode, ignore anything outside the MARCXML namespace.
        if self._strict and name[0] != MARC_XML_NS:
            return
        node = name[1]
        self._text = []
        if node == 'record':
            self._record = Record()
        elif node == 'controlfield':
            self._field = Field(attrs.getValue((None, u'tag')))
        elif node == 'datafield':
            self._field = Field(
                attrs.getValue((None, u'tag')),
                [attrs.get((None, u'ind1'), u' '),
                 attrs.get((None, u'ind2'), u' ')])
        elif node == 'subfield':
            self._subfield_code = attrs[(None, 'code')]

    def endElementNS(self, name, qname):
        """Attach accumulated text to the element just closed."""
        if self._strict and name[0] != MARC_XML_NS:
            return
        node = name[1]
        # Join buffered character data, optionally Unicode-normalizing it.
        text = u''.join(self._text)
        if self.normalize_form is not None:
            text = unicodedata.normalize(self.normalize_form, text)
        if node == 'record':
            self.process_record(self._record)
            self._record = None
        elif node == 'leader':
            self._record.leader = text
        elif node == 'controlfield':
            self._field.data = text
            self._record.add_field(self._field)
            self._field = None
        elif node == 'datafield':
            self._record.add_field(self._field)
            self._field = None
        elif node == 'subfield':
            # Subfields are stored as a flat [code, value, ...] list.
            self._field.subfields.extend([self._subfield_code, text])
            self._subfield_code = None
        # Clear the buffer regardless of which element closed.
        self._text = []

    def characters(self, chars):
        """Buffer character data until the enclosing element closes."""
        self._text.append(chars)

    def process_record(self, record):
        """Default sink: collect finished records in memory."""
        self.records.append(record)
def epub_to_marc(fname, conf_file=None):
    """Build a MARC21 Record from the metadata of an EPUB file.

    Args:
        fname: path to the .epub file.
        conf_file: optional path to an INI configuration file supplying
            leader/006/007/008/040/264 defaults; DEFAULT_CONF is used
            when omitted.

    Returns:
        A pymarc.Record with leader and fields 005-264 populated from
        the EPUB's OPF metadata plus the configuration.
    """
    ns = {
        'n': 'urn:oasis:names:tc:opendocument:xmlns:container',
        'pkg': 'http://www.idpf.org/2007/opf',
        'dc': 'http://purl.org/dc/elements/1.1/',
    }

    # prepare to read from the .epub file (renamed from `zip`, which
    # shadowed the builtin)
    epub_zip = zipfile.ZipFile(fname)

    # find the contents metafile
    txt = epub_zip.read('META-INF/container.xml')
    tree = etree.fromstring(txt)
    cfname = None
    for el in tree:
        for elel in el:
            for attr_name, attr_value in elel.items():
                if attr_name == 'full-path':
                    cfname = attr_value

    # grab the metadata block from the contents metafile
    cf = epub_zip.read(cfname)
    tree = etree.fromstring(cf)
    p = tree.xpath('/pkg:package/pkg:metadata', namespaces=ns)[0]

    # Read from the config file
    conf = configparser.ConfigParser()
    if conf_file:
        conf.read(conf_file)
    else:
        conf.read_string(DEFAULT_CONF)

    # Collect the options of each recognized section into its own dict.
    leader_dict = {}
    tag_006_dict = {}
    tag_007_dict = {}
    tag_008_dict = {}
    tag_040_dict = {}
    tag_264_dict = {}
    section_targets = {
        'leader': leader_dict,
        '006': tag_006_dict,
        '007': tag_007_dict,
        '008': tag_008_dict,
        '040': tag_040_dict,
        '264': tag_264_dict,
    }
    for section in conf.sections():
        target = section_targets.get(section)
        if target is not None:
            for option in conf.options(section):
                target[option] = conf.get(section, option)

    record = Record(force_utf8=True)
    # set the leader
    record.leader = build_leader(leader_dict)
    # I *think* it's updating the 'Base Address of Data' position when
    # it is written to file, so I have kept characters 12-16 blank.

    # Fields 005-008 (control fields).
    record.add_field(Field(tag='005', data=build_tag_005()))
    record.add_field(Field(tag='006', data=build_tag_006(tag_006_dict, tag_008_dict)))
    record.add_field(Field(tag='007', data=build_tag_007(tag_007_dict)))
    record.add_field(Field(tag='008', data=build_tag_008(tag_008_dict, p, ns)))

    # Field 020: the ISBN may be flagged by id or by pkg:scheme.
    epub_isbn = None
    isbn_by_id = p.xpath('dc:identifier[@id="ISBN"]/text()', namespaces=ns)
    isbn_by_scheme = p.xpath('dc:identifier[@pkg:scheme="ISBN"]/text()', namespaces=ns)
    if isbn_by_id:
        epub_isbn = isbn_by_id[0].strip()
    elif isbn_by_scheme:
        epub_isbn = isbn_by_scheme[0].strip()
    if epub_isbn:
        # BUG FIX: the 020 field was previously built but never added
        # to the record.
        record.add_field(Field(
            tag='020',
            indicators=[' ', ' '],
            subfields=['a', epub_isbn, 'q', 'epub'],
        ))

    # Field 040
    # First, check if the indicators are empty and if they are,
    # turn them into single spaces.
    for value in ('indicator_1', 'indicator_2'):
        if tag_040_dict[value] == '':
            tag_040_dict[value] = ' '
    record.add_field(Field(
        tag='040',
        indicators=[tag_040_dict['indicator_1'], tag_040_dict['indicator_2']],
        subfields=['a', tag_040_dict['subfield_a'],
                   'b', tag_040_dict['subfield_b'],
                   'e', tag_040_dict['subfield_e'],
                   'c', tag_040_dict['subfield_c']],
    ))

    # Field 245: title (split on the first ':' into title/subtitle) and
    # statement of responsibility from dc:creator.
    # BUG FIX: title/subtitle/creator_statement are initialized so a
    # missing dc:title or dc:creator no longer raises NameError, and
    # the creator lookup is guarded against an empty node-set (was an
    # unguarded [0] -> IndexError).
    title = None
    subtitle = None
    creator_statement = None
    title_nodes = p.xpath('dc:title/text()', namespaces=ns)
    if title_nodes:
        full_title = title_nodes[0]
        if ":" in full_title:
            title = full_title[:full_title.index(':')].strip()
            subtitle = full_title[full_title.index(':') + 1:].strip()
        else:
            title = full_title
            subtitle = None
    creator_nodes = p.xpath('dc:creator/text()', namespaces=ns)
    if creator_nodes and creator_nodes[0]:
        creator_statement = creator_nodes[0]

    # Non-filing character offset (second indicator): skip a leading
    # article plus its trailing space.
    offset = 0
    if title and ' ' in title:
        title_words = title.split(' ')
        if title_words[0].lower() in NON_FILING_WORDS:
            offset = len(title_words[0]) + 1

    if title and subtitle and creator_statement:
        # BUG FIX: the indicator is now str(offset) — pymarc indicators
        # are strings; an int here broke serialization.
        record.add_field(
            Field('245', ['0', str(offset)],
                  ['a', title + " :",
                   'b', subtitle + " /",
                   'c', creator_statement]))
    elif title and creator_statement:
        record.add_field(
            Field('245', ['0', str(offset)],
                  ['a', title + " /",
                   'c', creator_statement]))

    # Field 264: publication statement (needs both publisher and date).
    if p.xpath('dc:publisher/text()', namespaces=ns) \
            and p.xpath('dc:date/text()', namespaces=ns):
        record.add_field(Field(
            '264', [' ', '1'],
            ['a', tag_264_dict['subfield_a'] + ' :',
             'b', p.xpath('dc:publisher/text()', namespaces=ns)[0] + ", ",
             'c', p.xpath('dc:date/text()', namespaces=ns)[0]]))

    # Field 264/4: copyright notice built from the last year-like word
    # in dc:rights (membership test against copyright_year_range).
    if p.xpath('dc:rights/text()', namespaces=ns):
        copyright_statement = ""
        copyright_symbol = "©"
        rights_words_array = p.xpath('dc:rights/text()', namespaces=ns)[0].split()
        for word in rights_words_array:
            if word in copyright_year_range:
                copyright_statement = copyright_symbol + word
        if len(copyright_statement) > 4:
            record.add_field(Field('264', [' ', '4'],
                                   ['c', copyright_statement]))

    return record