Exemplo n.º 1
0
def test_ia_charset():
    """Check the first subfield of the first 100 line of a sample record.

    Reads ``histoirereligieu05cr_meta.mrc`` from the ``test_data`` directory,
    extracts the first tag line returned for tag 100, takes the value of its
    first subfield and compares it with the expected unicode name.
    """
    # Tests a corrupted unicode MARC record is corrected, does code exist to fix this?
    # A context manager guarantees the file handle is released even if
    # reading fails; the original left the handle for the GC to close.
    with open(test_data + 'histoirereligieu05cr_meta.mrc') as f:
        data = f.read()
    line = list(get_tag_lines(data, set(['100'])))[0][1]
    a = list(get_all_subfields(line))[0][1]
    expect = u'Crétineau-Joly, J.'
    assert a == expect
Exemplo n.º 2
0
def find_subjects(marc_subjects):
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    time = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)
    #fiction = False
    for lines in marc_subjects:
        for tag, line in lines:
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1
            if re_large_book.search(line):
                continue
            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    v = '(' + v.strip('.() ') + ')' if k == 'd' else v.strip(
                        ' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        else:
                            m = re_flip_name.match(v)
                            if m:
                                v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    person[name] += 1
            elif tag == '610':  # org
                v = ' '.join(get_subfield_values(line, 'abcd'))
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    org[v] += 1

                for v in get_subfield_values(line, 'a'):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        org[v] += 1
            elif tag == '611':  # event
                v = ' '.join(j.strip() for i, j in get_all_subfields(line)
                             if i not in 'vxyz')
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    event[v] += 1
            elif tag == '630':  # work
                for v in get_subfield_values(line, ['a']):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        work[v] += 1
            elif tag == '650':  # topical
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651':  # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1
            else:
                print 'other', tag, list(get_all_subfields(line))

            cur = [
                v for k, v in get_all_subfields(line)
                if k == 'a' or v.strip('. ').lower() == 'fiction'
            ]

            # skip: 'Good, Sally (Fictitious character) in fiction'
            if len(cur) > 1 and cur[-1].strip(
                    '. ').lower() == 'fiction' and ')' not in cur[-2]:
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    time[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                v = tidy_subject(v)
                if v:
                    subject[v] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if not v:
                    continue
                if aspects and re_aspects.search(v):
                    continue
                v = tidy_subject(v)
                if v:
                    subject[v] += 1

            v_and_x = get_subfield_values(line, ['v', 'x'])
            #if 'Fiction' in v_and_x or 'Fiction.' in v_and_x:
            #    fiction = True
    #if 'Fiction' in subject:
    #    del subject['Fiction']
    ret = {}
    if person:
        ret['person'] = dict(person)
    if time:
        ret['time'] = dict(time)
    if place:
        ret['place'] = dict(place)
    if subject:
        ret['subject'] = dict(subject)
    if event:
        ret['event'] = dict(event)
    if org:
        ret['org'] = dict(org)
    if work:
        ret['work'] = dict(work)
    return ret
Exemplo n.º 3
0
 def get_all_subfields(self):
     """Return fast_parse.get_all_subfields() for this object's line.

     The second argument passed on is True when character 9 of the
     record leader is anything other than 'a' (presumably the leader's
     character-coding flag -- confirm against fast_parse).
     """
     leader = self.rec.leader()
     flag = leader[9] != 'a'
     return fast_parse.get_all_subfields(self.line, flag)
Exemplo n.º 4
0
def read_people(people):
    found = defaultdict(int)
    marc_alt = {}
    people = list(people)

    for lines in people:
        for line in lines:
            p = tuple(clean_subfield(k, v) for k, v in get_all_subfields(line))
            #check_for_dup_a(p)
            found[date_field_missing(p)]+=1

    for p in found.keys():
        c = None
        for k, v in p:
            if k == 'c':
                c = v
                break
        if not c or c.lower() != 'family':
            continue
        new = tuple((k, v + ' family' if k == 'a' else v) for k, v in p if k != 'c') 
        if new in found:
            found[new] += found.pop(p)
            marc_alt[p] = new

    fix_bad_subtags(found, marc_alt)

    wrong_subtag_on_date(found, marc_alt)

    try:
        missing_subtag(found, marc_alt)
    except AssertionError:
        print people
        raise

    found_name = defaultdict(int)
    for p, num in found.items():
        found_name[just_abcdq(p)] += num
    found = found_name

    assert found

    if len(found) == 1:
        return dict(found), marc_alt

    #for func in subtag_should_be_c, merge_question_date:
    #for func in subtag_should_be_c, merge_question_date, missing_subtag, bad_char_name_match:
    for func in subtag_should_be_c, merge_question_date, bad_char_name_match, similar_dates:
        func(found, marc_alt)

        if len(found) == 1:
            return dict(found), marc_alt

    assert found

    # one author missing death date
    name_and_birth = build_name_and_birth(found)

    assert found

    try:
        if authority_lookup(name_and_birth, found, marc_alt):
            if len(found) == 1:
                return dict(found), marc_alt

            name_and_birth = build_name_and_birth(found)
    except AssertionError:
        print people
        raise

    assert found

    for p, num in found.items():
        if p not in name_and_birth:
            continue
        assert len(name_and_birth[p]) == 1
        new_name = list(name_and_birth[p])[0]
        found[new_name] += found.pop(p)
        marc_alt[p] = new_name

    assert found

    if len(found) == 1:
        return dict(found), marc_alt

    # match up authors with the same name
    # where one has dates and the other doesn't
    by_name = build_by_name(found)

    try:
        if authority_lookup(by_name, found, marc_alt):
            if len(found) == 1:
                return dict(found), marc_alt
            by_name = build_by_name(found) # rebuild
    except AssertionError:
        print people
        raise

    for p, num in found.items():
        if p not in by_name:
            continue
        if len(by_name[p]) != 1:
            for i in by_name[p]:
                print i
            print people
        assert len(by_name[p]) == 1
        new_name = list(by_name[p])[0]
        found[new_name] += found.pop(p)
        marc_alt[p] = new_name
    assert found

    if len(found) == 1:
        return dict(found), marc_alt

    by_date = defaultdict(set)
    for p in found:
        if not has_subtag('d', p):
            continue
        d = tuple(v for k, v in p if k=='d')
        by_date[d].add(p)
#    for k, v in by_date.iteritems():
#        print len(v), k, v

    return dict(found), marc_alt
Exemplo n.º 5
0
from openlibrary.catalog.marc.fast_parse import get_all_subfields
import re

samples = [
    "00\x1faDi 1 juan.Guo se tian xiang /Wu Jingsuo bian.Ba duan jin /Xingshizhushi bian ji --di 2 juan.Wu sheng xi ;Shi er lou /Li Yu --di 3 juan.Jin xiang ting /Su'anzhuren bian.\x1ftFen zhuang lou /Zhuxishanren --\x1fgdi 4 juan.Wu se shi /Bilian'gezhuren.Ba dong tian /Wuseshizhuren.Wu feng yin /Chichi dao ren bian zhu --di 5 juan.Shuang feng qi yuan /Xueqiaozhuren zi ding.Jin shi yuan.Qing meng tuo /Anyangjiumin --di 6 juan.Wu mei yuan.Xiu qiu yuan.Yuan yang ying /Qiaoyunshanren bian --di 7 juan.Mei ren shu /Xu Zhen.Wan hua lou /Li Yutang --di 8 juan.Bei shi yan yi /Du Gang.Kong kong huan /Wugangzhuren bian ci.Chun qiu pei --di 9 juan.Qian Qi guo zhi /Wumenxiaoke.Hou Qi guo zhi /Yanshuisanren.Qiao shi yan yi /Lu Yingyang --di 10 juan.Liaohai dan zhong lu /Lu Renlong.Tian bao tu.Jin xiu yi --di 11 juan.Shi mei tu.Huan xi yuan jia /Xihuyuyinzhuren.Feng liu he shang.Liang jiao hun /Tianhuazangzhuren --di 12 juan.Ge lian hua ying.Qi lou chong meng /Wang Lanzhi.\x1e",
    '00\x1ftManierismus als Artistik : systematische Aspekte einer \xe8asthetischen Kategorie / R\xe8udiger Zymner -- "Stil" und "Manier" in der Alltagskultur / Kaspar Maase -- Die Wortfamilie von it. "Maniera" zwischen Literatur, bildender Kunst und Psychologie / Margarete Lindemann -- Der Manierismus : zur Problematik einer kunsthistorischen Erfindung / Horst Bredekamp -- Inszenierte K\xe8unstlichkeit : Musik als manieristisches Dispositiv / Hermann Danuser -- Manierismus als Stilbegriff in der Architekturgeschichte / Hermann Hipp -- "Raffael ohne H\xe8ande," oder, Das Kunstwerk zwischen Sch\xe8opfung und Fabrikation : Konzepte der "maniera" bei Vasari und seinen Zeitgenossen / Ursula Link-Heer -- "Sprezzatura" : Pontormos Portraits und das h\xe8ofische Ideal des Manierismus / Axel Christoph Gampp -- Maniera and the grotesque / Maria Fabricius Hansen -- Neulateinisches Figurengedicht und manieristische Poetik : zum "Poematum liber" (1573) des Richard Willis / Ulrich Ernst -- Manierismus als Selbstbehauptung, Jean Paul / Wolfgang Braungart --  Artistische Erkenntnis : (Sprach-)Alchimie und Manierismus in der Romantik / Axel Dunker -- "Als lebeten sie" / Holk Cruse.\x1e',
]

# Patterns for the string of subfield codes of a line (re_at and re_gtr are
# matched against that string in the loop that follows).  Raw strings keep
# the regex escapes away from Python's own string-escape processing.
re_gt = re.compile(r'^(gt)+$')
re_gtr = re.compile(r'^(gtr)+$')
re_at = re.compile(r'^at+$')
# A digit followed only by ']', '.' or spaces up to the end of the value.
re_end_num = re.compile(r'\d[]. ]*$')
for line in open('test_data/marc_toc'):
    (loc, line) = eval(line)
    #print loc
    subfields = list(get_all_subfields(line))
    if subfields[0][0] == '6':
        subfields.pop(0)
    subtags = ''.join(k for k, v in subfields)
    if re_at.match(subtags):
        a = subfields[0][1]
        m = re_end_num.search(a)
        print((bool(m), repr(a)))
        continue

        if not m:
            for k, v in subfields:
                print((k, repr(v)))
        assert m
    continue
    if re_gtr.match(subtags):
Exemplo n.º 6
0
def find_subjects(marc_subjects):
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    time = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)
    #fiction = False
    for lines in marc_subjects:
        for tag, line in lines:
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1
            if re_large_book.search(line):
                continue
            if tag == '600': # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    v = '(' + v.strip('.() ') + ')' if k == 'd' else v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        else:
                            m = re_flip_name.match(v)
                            if m:
                                v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    person[name] += 1
            elif tag == '610': # org
                v = ' '.join(get_subfield_values(line, 'abcd'))
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    org[v] += 1

                for v in get_subfield_values(line, 'a'):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        org[v] += 1
            elif tag == '611': # event
                v = ' '.join(j.strip() for i, j in get_all_subfields(line) if i not in 'vxyz')
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    event[v] += 1
            elif tag == '630': # work
                for v in get_subfield_values(line, ['a']):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        work[v] += 1
            elif tag == '650': # topical
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651': # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1
            else:
                print 'other', tag, list(get_all_subfields(line))

            cur = [v for k, v in get_all_subfields(line) if k=='a' or v.strip('. ').lower() == 'fiction']

            # skip: 'Good, Sally (Fictitious character) in fiction'
            if len(cur) > 1 and cur[-1].strip('. ').lower() == 'fiction' and ')' not in cur[-2]:
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    time[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                v = tidy_subject(v)
                if v:
                    subject[v] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if not v:
                    continue
                if aspects and re_aspects.search(v):
                    continue
                v = tidy_subject(v)
                if v:
                    subject[v] += 1

            v_and_x = get_subfield_values(line, ['v', 'x'])
            #if 'Fiction' in v_and_x or 'Fiction.' in v_and_x:
            #    fiction = True
    #if 'Fiction' in subject:
    #    del subject['Fiction']
    ret = {}
    if person:
        ret['person'] = dict(person)
    if time:
        ret['time'] = dict(time)
    if place:
        ret['place'] = dict(place)
    if subject:
        ret['subject'] = dict(subject)
    if event:
        ret['event'] = dict(event)
    if org:
        ret['org'] = dict(org)
    if work:
        ret['work'] = dict(work)
    return ret
Exemplo n.º 7
0
 def get_all_subfields(self):
     """Return fast_parse.get_all_subfields() for this object's line.

     The second argument passed on is True when character 9 of the
     record leader is anything other than 'a' (presumably the leader's
     character-coding flag -- confirm against fast_parse).
     """
     leader = self.rec.leader()
     flag = leader[9] != 'a'
     return fast_parse.get_all_subfields(self.line, flag)
Exemplo n.º 8
0
        if tag == '300':
            if 'accompanying_material' in rec:
                continue
            subtag_e = ' '.join(i.strip('. ') for i in get_subfield_values(line, set(['e'])))
            if subtag_e:
                if subtag_e.lower() in ('list', 'notes', 'book'):
                    continue
                rec['accompanying_material'] = subtag_e
            continue
        fields.setdefault(tag, []).append(line)

    for line in fields.get('041', []):
        found = []
        marc_h = list(get_subfield_values(line, 'h'))
        if not marc_h:
            continue
        for h in marc_h:
            if len(h) % 3 != 0:
                print 'bad:', list(get_all_subfields(line))
                continue
            found += ['/l/' + i for i in (h[i * 3:(i+1) * 3].lower() for i in range(len(h) / 3)) if i in langs]
        if found:
            rec.setdefault('translated_from', []).extend(found)

    rec.update(read_oclc(fields))

    if rec:
        rec['source_record'] = loc
        print >> out, rec
out.close()
Exemplo n.º 9
0
def test_bad_marc_line():
    """Parsing this line with the second argument True yields one pair.

    The expected pair has u'á' as its first element and the accented title
    text as its second.
    """
    raw = '0 \x1f\xe2aEtude objective des ph\xe2enom\xe1enes neuro-psychiques;\x1e'
    expected = [
        (u'á', u'Etude objective des phénomènes neuro-psychiques;'),
    ]
    parsed = list(get_all_subfields(raw, True))
    assert parsed == expected
Exemplo n.º 10
0
                i.strip('. ') for i in get_subfield_values(line, set(['e'])))
            if subtag_e:
                if subtag_e.lower() in ('list', 'notes', 'book'):
                    continue
                rec['accompanying_material'] = subtag_e
            continue
        fields.setdefault(tag, []).append(line)

    for line in fields.get('041', []):
        found = []
        marc_h = list(get_subfield_values(line, 'h'))
        if not marc_h:
            continue
        for h in marc_h:
            if len(h) % 3 != 0:
                print 'bad:', list(get_all_subfields(line))
                continue
            found += [
                '/l/' + i for i in (h[i * 3:(i + 1) * 3].lower()
                                    for i in range(len(h) / 3)) if i in langs
            ]
        if found:
            rec.setdefault('translated_from', []).extend(found)

    rec.update(read_oclc(fields))

    if rec:
        rec['source_record'] = loc
        print >> out, rec
out.close()
Exemplo n.º 11
0
 def get_all_subfields(self):
     """Return fast_parse.get_all_subfields() applied to this object's line."""
     raw_line = self.line
     return fast_parse.get_all_subfields(raw_line)