def test_ia_charset():
    """Check that a corrupted-unicode MARC record is read back corrected.

    Reads the ``histoirereligieu05cr_meta.mrc`` fixture, takes the first
    ``100`` field and asserts that its first subfield value is the properly
    accented author name.

    Open question carried over from the original comment: does code exist
    to fix this?
    """
    # Use a context manager so the fixture handle is closed deterministically
    # instead of being left for the garbage collector.
    # NOTE(review): the file is opened in the default (text) mode exactly as
    # before; MARC is binary data, so 'rb' may be wanted on Python 3 - confirm.
    with open(test_data + 'histoirereligieu05cr_meta.mrc') as marc_file:
        data = marc_file.read()

    # First matching (tag, line) pair -> keep only the raw field line.
    line = list(get_tag_lines(data, set(['100'])))[0][1]
    # First (code, value) subfield pair -> keep only the value.
    a = list(get_all_subfields(line))[0][1]
    expect = u'Crétineau-Joly, J.'
    assert a == expect
def _strip_dot_and_tidy(value):
    """Strip *value*, drop a trailing dot, then run ``tidy_subject`` on it.

    Each step only runs while the value is still non-empty, so an empty
    (falsy) result means "nothing worth counting".
    """
    value = value.strip()
    if value:
        value = remove_trailing_dot(value).strip()
    if value:
        value = tidy_subject(value)
    return value


def find_subjects(marc_subjects):
    """Count the subject headings found in MARC 6xx fields, by category.

    :param marc_subjects: iterable of records, each an iterable of
        ``(tag, line)`` pairs where ``tag`` is the MARC field tag as a
        string (e.g. ``'650'``) and ``line`` is the raw field data.
    :return: dict with any of the keys ``'person'``, ``'time'``, ``'place'``,
        ``'subject'``, ``'event'``, ``'org'`` and ``'work'``.  Each value is
        a plain dict mapping a heading to the number of times it was seen.
        Categories with no headings are omitted, so empty input gives ``{}``.
    """
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    time = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)

    for lines in marc_subjects:
        for tag, line in lines:
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1
            # Lines matching re_large_book contribute only their aspects.
            if re_large_book.search(line):
                continue

            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    # Dates ($d) are wrapped in parentheses; every other
                    # part just loses its trailing punctuation.
                    v = '(' + v.strip('.() ') + ')' if k == 'd' else v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            # Special case: this name is not run through
                            # flip_name; the comma is simply dropped.
                            v = 'Mao Zedong'
                        elif re_flip_name.match(v):
                            v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    person[name] += 1
            elif tag == '610':  # org
                # Counted once as the full $a$b$c$d heading ...
                v = _strip_dot_and_tidy(' '.join(get_subfield_values(line, 'abcd')))
                if v:
                    org[v] += 1
                # ... and once more for each bare $a value.
                for v in get_subfield_values(line, 'a'):
                    v = _strip_dot_and_tidy(v)
                    if v:
                        org[v] += 1
            elif tag == '611':  # event
                v = ' '.join(j.strip() for i, j in get_all_subfields(line) if i not in 'vxyz')
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    event[v] += 1
            elif tag == '630':  # work
                for v in get_subfield_values(line, ['a']):
                    v = _strip_dot_and_tidy(v)
                    if v:
                        work[v] += 1
            elif tag == '650':  # topical
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651':  # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1
            else:
                # Diagnostic for unexpected tags.  A single pre-formatted
                # argument prints the same text on Python 2 and Python 3.
                print('other %s %s' % (tag, list(get_all_subfields(line))))

            # "<heading> ... Fiction" becomes the subject "<heading> in fiction".
            cur = [v for k, v in get_all_subfields(line)
                   if k == 'a' or v.strip('. ').lower() == 'fiction']

            # skip: 'Good, Sally (Fictitious character) in fiction'
            if len(cur) > 1 and cur[-1].strip('. ').lower() == 'fiction' and ')' not in cur[-2]:
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            # The subdivision subfields are harvested for every tag.
            for v in get_subfield_values(line, ['y']):  # counted as time
                v = v.strip()
                if v:
                    time[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):  # counted as subject
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                v = tidy_subject(v)
                if v:
                    subject[v] += 1
            for v in get_subfield_values(line, ['z']):  # counted as place
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):  # counted as subject
                v = v.strip()
                if not v:
                    continue
                # Already counted above through find_aspects().
                if aspects and re_aspects.search(v):
                    continue
                v = tidy_subject(v)
                if v:
                    subject[v] += 1

    # Only report the categories that actually collected something.
    ret = {}
    if person:
        ret['person'] = dict(person)
    if time:
        ret['time'] = dict(time)
    if place:
        ret['place'] = dict(place)
    if subject:
        ret['subject'] = dict(subject)
    if event:
        ret['event'] = dict(event)
    if org:
        ret['org'] = dict(org)
    if work:
        ret['work'] = dict(work)
    return ret
def get_all_subfields(self):
    """Return the subfields of this field's raw line via ``fast_parse``.

    The second argument handed to ``fast_parse.get_all_subfields`` is True
    whenever byte 9 of the record leader is anything other than ``'a'``.
    NOTE(review): leader/09 == 'a' marks a Unicode record in MARC 21, so the
    flag presumably selects MARC-8 decoding - confirm against fast_parse.
    """
    parse = fast_parse.get_all_subfields
    raw_line = self.line
    leader_is_not_a = self.rec.leader()[9] != 'a'
    return parse(raw_line, leader_is_not_a)
def read_people(people):
    """Count the distinct person headings in *people*, merging variants.

    :param people: iterable of records, each an iterable of raw MARC field
        lines holding a person heading.
    :return: ``(found, marc_alt)``.  ``found`` is a plain dict mapping each
        surviving heading to the number of times it (or a variant merged
        into it) occurred.  ``marc_alt`` maps each variant merged by this
        function to the heading it was merged into; the helper passes
        receive the same dict.  Headings are tuples of ``(subfield code,
        value)`` pairs until ``just_abcdq`` re-keys them.
    :raises AssertionError: if no heading is left at one of the checkpoints,
        if a candidate merge target is ambiguous, or when re-raised from a
        helper (after printing *people* for diagnosis).
    """
    found = defaultdict(int)
    marc_alt = {}
    # Materialise once: the input is iterated here and printed on failure.
    people = list(people)

    for lines in people:
        for line in lines:
            p = tuple(clean_subfield(k, v) for k, v in get_all_subfields(line))
            found[date_field_missing(p)] += 1

    # Fold a heading whose $c is "family" into the equivalent heading that
    # has " family" appended to $a and no $c.  Iterate over a snapshot of the
    # keys because entries are popped inside the loop, which would raise
    # RuntimeError on Python 3 if the live dict were being iterated.
    for p in list(found):
        c = next((v for k, v in p if k == 'c'), None)
        if not c or c.lower() != 'family':
            continue
        new = tuple((k, v + ' family' if k == 'a' else v) for k, v in p if k != 'c')
        if new in found:
            found[new] += found.pop(p)
            marc_alt[p] = new

    fix_bad_subtags(found, marc_alt)
    wrong_subtag_on_date(found, marc_alt)

    try:
        missing_subtag(found, marc_alt)
    except AssertionError:
        print(people)
        raise

    # Re-key every heading through just_abcdq(), summing the counts of
    # headings that collapse to the same key.
    found_name = defaultdict(int)
    for p, num in found.items():
        found_name[just_abcdq(p)] += num
    found = found_name

    assert found
    if len(found) == 1:
        return dict(found), marc_alt

    # Run the merge passes, stopping as soon as one heading remains.
    # NOTE(review): the early return is reconstructed as being inside this
    # loop; the collapsed original does not show its indentation - confirm.
    for func in subtag_should_be_c, merge_question_date, bad_char_name_match, similar_dates:
        func(found, marc_alt)
        if len(found) == 1:
            return dict(found), marc_alt

    assert found

    # one author missing death date
    name_and_birth = build_name_and_birth(found)
    assert found

    try:
        if authority_lookup(name_and_birth, found, marc_alt):
            if len(found) == 1:
                return dict(found), marc_alt
            name_and_birth = build_name_and_birth(found)  # rebuild
    except AssertionError:
        print(people)
        raise

    assert found

    # Snapshot the keys: entries are popped and added inside the loop.
    for p in list(found):
        if p not in name_and_birth:
            continue
        assert len(name_and_birth[p]) == 1
        new_name = list(name_and_birth[p])[0]
        found[new_name] += found.pop(p)
        marc_alt[p] = new_name

    assert found
    if len(found) == 1:
        return dict(found), marc_alt

    # match up authors with the same name
    # where one has dates and the other doesn't
    by_name = build_by_name(found)
    try:
        if authority_lookup(by_name, found, marc_alt):
            if len(found) == 1:
                return dict(found), marc_alt
            by_name = build_by_name(found)  # rebuild
    except AssertionError:
        print(people)
        raise

    for p in list(found):
        if p not in by_name:
            continue
        if len(by_name[p]) != 1:
            # Dump the ambiguous candidates before the assertion fires.
            for i in by_name[p]:
                print(i)
            print(people)
        assert len(by_name[p]) == 1
        new_name = list(by_name[p])[0]
        found[new_name] += found.pop(p)
        marc_alt[p] = new_name

    assert found
    return dict(found), marc_alt
from openlibrary.catalog.marc.fast_parse import get_all_subfields import re samples = [ "00\x1faDi 1 juan.Guo se tian xiang /Wu Jingsuo bian.Ba duan jin /Xingshizhushi bian ji --di 2 juan.Wu sheng xi ;Shi er lou /Li Yu --di 3 juan.Jin xiang ting /Su'anzhuren bian.\x1ftFen zhuang lou /Zhuxishanren --\x1fgdi 4 juan.Wu se shi /Bilian'gezhuren.Ba dong tian /Wuseshizhuren.Wu feng yin /Chichi dao ren bian zhu --di 5 juan.Shuang feng qi yuan /Xueqiaozhuren zi ding.Jin shi yuan.Qing meng tuo /Anyangjiumin --di 6 juan.Wu mei yuan.Xiu qiu yuan.Yuan yang ying /Qiaoyunshanren bian --di 7 juan.Mei ren shu /Xu Zhen.Wan hua lou /Li Yutang --di 8 juan.Bei shi yan yi /Du Gang.Kong kong huan /Wugangzhuren bian ci.Chun qiu pei --di 9 juan.Qian Qi guo zhi /Wumenxiaoke.Hou Qi guo zhi /Yanshuisanren.Qiao shi yan yi /Lu Yingyang --di 10 juan.Liaohai dan zhong lu /Lu Renlong.Tian bao tu.Jin xiu yi --di 11 juan.Shi mei tu.Huan xi yuan jia /Xihuyuyinzhuren.Feng liu he shang.Liang jiao hun /Tianhuazangzhuren --di 12 juan.Ge lian hua ying.Qi lou chong meng /Wang Lanzhi.\x1e", '00\x1ftManierismus als Artistik : systematische Aspekte einer \xe8asthetischen Kategorie / R\xe8udiger Zymner -- "Stil" und "Manier" in der Alltagskultur / Kaspar Maase -- Die Wortfamilie von it. 
"Maniera" zwischen Literatur, bildender Kunst und Psychologie / Margarete Lindemann -- Der Manierismus : zur Problematik einer kunsthistorischen Erfindung / Horst Bredekamp -- Inszenierte K\xe8unstlichkeit : Musik als manieristisches Dispositiv / Hermann Danuser -- Manierismus als Stilbegriff in der Architekturgeschichte / Hermann Hipp -- "Raffael ohne H\xe8ande," oder, Das Kunstwerk zwischen Sch\xe8opfung und Fabrikation : Konzepte der "maniera" bei Vasari und seinen Zeitgenossen / Ursula Link-Heer -- "Sprezzatura" : Pontormos Portraits und das h\xe8ofische Ideal des Manierismus / Axel Christoph Gampp -- Maniera and the grotesque / Maria Fabricius Hansen -- Neulateinisches Figurengedicht und manieristische Poetik : zum "Poematum liber" (1573) des Richard Willis / Ulrich Ernst -- Manierismus als Selbstbehauptung, Jean Paul / Wolfgang Braungart -- Artistische Erkenntnis : (Sprach-)Alchimie und Manierismus in der Romantik / Axel Dunker -- "Als lebeten sie" / Holk Cruse.\x1e', ] re_gt = re.compile('^(gt)+$') re_gtr = re.compile('^(gtr)+$') re_at = re.compile('^at+$') re_end_num = re.compile('\d[]. ]*$') for line in open('test_data/marc_toc'): (loc, line) = eval(line) #print loc subfields = list(get_all_subfields(line)) if subfields[0][0] == '6': subfields.pop(0) subtags = ''.join(k for k, v in subfields) if re_at.match(subtags): a = subfields[0][1] m = re_end_num.search(a) print((bool(m), repr(a))) continue if not m: for k, v in subfields: print((k, repr(v))) assert m continue if re_gtr.match(subtags):
def _strip_dot_and_tidy(value):
    """Strip *value*, drop a trailing dot, then run ``tidy_subject`` on it.

    Each step only runs while the value is still non-empty, so an empty
    (falsy) result means "nothing worth counting".
    """
    value = value.strip()
    if value:
        value = remove_trailing_dot(value).strip()
    if value:
        value = tidy_subject(value)
    return value


def find_subjects(marc_subjects):
    """Count the subject headings found in MARC 6xx fields, by category.

    :param marc_subjects: iterable of records, each an iterable of
        ``(tag, line)`` pairs where ``tag`` is the MARC field tag as a
        string (e.g. ``'650'``) and ``line`` is the raw field data.
    :return: dict with any of the keys ``'person'``, ``'time'``, ``'place'``,
        ``'subject'``, ``'event'``, ``'org'`` and ``'work'``.  Each value is
        a plain dict mapping a heading to the number of times it was seen.
        Categories with no headings are omitted, so empty input gives ``{}``.
    """
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    time = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)

    for lines in marc_subjects:
        for tag, line in lines:
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1
            # Lines matching re_large_book contribute only their aspects.
            if re_large_book.search(line):
                continue

            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    # Dates ($d) are wrapped in parentheses; every other
                    # part just loses its trailing punctuation.
                    v = '(' + v.strip('.() ') + ')' if k == 'd' else v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            # Special case: this name is not run through
                            # flip_name; the comma is simply dropped.
                            v = 'Mao Zedong'
                        elif re_flip_name.match(v):
                            v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    person[name] += 1
            elif tag == '610':  # org
                # Counted once as the full $a$b$c$d heading ...
                v = _strip_dot_and_tidy(' '.join(get_subfield_values(line, 'abcd')))
                if v:
                    org[v] += 1
                # ... and once more for each bare $a value.
                for v in get_subfield_values(line, 'a'):
                    v = _strip_dot_and_tidy(v)
                    if v:
                        org[v] += 1
            elif tag == '611':  # event
                v = ' '.join(j.strip() for i, j in get_all_subfields(line) if i not in 'vxyz')
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    event[v] += 1
            elif tag == '630':  # work
                for v in get_subfield_values(line, ['a']):
                    v = _strip_dot_and_tidy(v)
                    if v:
                        work[v] += 1
            elif tag == '650':  # topical
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651':  # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1
            else:
                # Diagnostic for unexpected tags.  A single pre-formatted
                # argument prints the same text on Python 2 and Python 3.
                print('other %s %s' % (tag, list(get_all_subfields(line))))

            # "<heading> ... Fiction" becomes the subject "<heading> in fiction".
            cur = [v for k, v in get_all_subfields(line)
                   if k == 'a' or v.strip('. ').lower() == 'fiction']

            # skip: 'Good, Sally (Fictitious character) in fiction'
            if len(cur) > 1 and cur[-1].strip('. ').lower() == 'fiction' and ')' not in cur[-2]:
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            # The subdivision subfields are harvested for every tag.
            for v in get_subfield_values(line, ['y']):  # counted as time
                v = v.strip()
                if v:
                    time[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):  # counted as subject
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                v = tidy_subject(v)
                if v:
                    subject[v] += 1
            for v in get_subfield_values(line, ['z']):  # counted as place
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):  # counted as subject
                v = v.strip()
                if not v:
                    continue
                # Already counted above through find_aspects().
                if aspects and re_aspects.search(v):
                    continue
                v = tidy_subject(v)
                if v:
                    subject[v] += 1

    # Only report the categories that actually collected something.
    ret = {}
    if person:
        ret['person'] = dict(person)
    if time:
        ret['time'] = dict(time)
    if place:
        ret['place'] = dict(place)
    if subject:
        ret['subject'] = dict(subject)
    if event:
        ret['event'] = dict(event)
    if org:
        ret['org'] = dict(org)
    if work:
        ret['work'] = dict(work)
    return ret
def get_all_subfields(self):
    """Return the subfields of this field's raw line via ``fast_parse``.

    The second argument handed to ``fast_parse.get_all_subfields`` is True
    whenever byte 9 of the record leader is anything other than ``'a'``.
    NOTE(review): leader/09 == 'a' marks a Unicode record in MARC 21, so the
    flag presumably selects MARC-8 decoding - confirm against fast_parse.
    """
    parse = fast_parse.get_all_subfields
    raw_line = self.line
    leader_is_not_a = self.rec.leader()[9] != 'a'
    return parse(raw_line, leader_is_not_a)
if tag == '300': if 'accompanying_material' in rec: continue subtag_e = ' '.join(i.strip('. ') for i in get_subfield_values(line, set(['e']))) if subtag_e: if subtag_e.lower() in ('list', 'notes', 'book'): continue rec['accompanying_material'] = subtag_e continue fields.setdefault(tag, []).append(line) for line in fields.get('041', []): found = [] marc_h = list(get_subfield_values(line, 'h')) if not marc_h: continue for h in marc_h: if len(h) % 3 != 0: print 'bad:', list(get_all_subfields(line)) continue found += ['/l/' + i for i in (h[i * 3:(i+1) * 3].lower() for i in range(len(h) / 3)) if i in langs] if found: rec.setdefault('translated_from', []).extend(found) rec.update(read_oclc(fields)) if rec: rec['source_record'] = loc print >> out, rec out.close()
def test_bad_marc_line():
    """A field line with a malformed subfield code still parses.

    The byte after the subfield delimiter (``\\x1f``) is ``\\xe2`` followed
    by ``a``.  With ``True`` as the second argument, ``get_all_subfields``
    is expected to yield one pair whose code is ``u'á'`` and whose value has
    its accented characters decoded.
    NOTE(review): ``True`` presumably requests MARC-8 decoding - confirm.
    """
    raw_field = '0 \x1f\xe2aEtude objective des ph\xe2enom\xe1enes neuro-psychiques;\x1e'
    wanted = [
        (u'á', u'Etude objective des phénomènes neuro-psychiques;'),
    ]
    parsed = list(get_all_subfields(raw_field, True))
    assert parsed == wanted
i.strip('. ') for i in get_subfield_values(line, set(['e']))) if subtag_e: if subtag_e.lower() in ('list', 'notes', 'book'): continue rec['accompanying_material'] = subtag_e continue fields.setdefault(tag, []).append(line) for line in fields.get('041', []): found = [] marc_h = list(get_subfield_values(line, 'h')) if not marc_h: continue for h in marc_h: if len(h) % 3 != 0: print 'bad:', list(get_all_subfields(line)) continue found += [ '/l/' + i for i in (h[i * 3:(i + 1) * 3].lower() for i in range(len(h) / 3)) if i in langs ] if found: rec.setdefault('translated_from', []).extend(found) rec.update(read_oclc(fields)) if rec: rec['source_record'] = loc print >> out, rec out.close()
def get_all_subfields(self):
    """Return the subfields of this field's raw line via ``fast_parse``.

    Only ``self.line`` is passed on; any further parameters of
    ``fast_parse.get_all_subfields`` keep their defaults.
    """
    raw_line = self.line
    subfields = fast_parse.get_all_subfields(raw_line)
    return subfields