예제 #1
0
def get_work_title(e):
    # use first work title we find in source MARC records
    wt = None
    for src_type, src in get_marc_src(e):
        if src_type == 'ia':
            wt = get_ia_work_title(src)
            if wt:
                break
            continue
        assert src_type == 'marc'
        try:
            data = get_from_archive(src)
        except ValueError:
            print 'bad record source:', src
            print 'http://openlibrary.org' + e['key']
            continue
        if not data:
            continue
        try:
            line = get_first_tag(data, set(['240']))
        except BadDictionary:
            print 'bad dictionary:', src
            print 'http://openlibrary.org' + e['key']
            continue
        if line:
            wt = ' '.join(get_subfield_values(line, ['a'])).strip('. ')
            break
    if wt:
        return wt
    if not e.get('work_titles', []):
        return
    print 'work title in MARC, but not in OL'
    print 'http://openlibrary.org' + e['key']
    return e['work_titles'][0]
예제 #2
0
def get_work_title(e, mc):
    """Look for a work title in the source records of edition ``e``.

    Walks the ``(src_type, src)`` pairs from ``get_marc_src(e, mc)`` (``mc``
    is only passed through to that call).  For an 'ia' source the title
    comes from ``get_ia_work_title``; for a 'marc' source it is subfield
    ``a`` of the first 240 field, joined with spaces.  In both cases
    surrounding dots and spaces are stripped.  The loop stops at the first
    'ia' source with a non-empty title or the first MARC record that has a
    240 field.

    NOTE(review): this listing ends with the loop -- no ``return`` is
    visible, so as shown the function always returns None.  The excerpt
    looks cut off; confirm against the full source.
    """
    # use first work title we find in source MARC records
    wt = None
    for src_type, src in get_marc_src(e, mc):
        if src_type == 'ia':
            wt = get_ia_work_title(src)
            if wt:
                wt = wt.strip('. ')
            # re-check: stripping may have left an empty string
            if wt:
                break
            continue
        assert src_type == 'marc'
        # stays None if get_data raises HTTPError, so the source is skipped below
        data = None
        try:
            data = get_data(src)
        except ValueError:
            print 'bad record source:', src
            print 'http://openlibrary.org' + e['key']
            continue
        except urllib2.HTTPError, error:
            # no explicit continue here: data is still None, so the
            # `if not data` check right below moves on to the next source
            print 'HTTP error:', error.code, error.msg
            print e['key']
        if not data:
            continue
        # NOTE(review): presumably byte 9 of the MARC leader is the character
        # coding scheme ('a' = Unicode), anything else being MARC-8 -- confirm
        is_marc8 = data[9] != 'a'
        try:
            line = get_first_tag(data, set(['240']))
        except BadDictionary:
            print 'bad dictionary:', src
            print 'http://openlibrary.org' + e['key']
            continue
        if line:
            # a 240 field ends the scan even when the resulting title is empty
            wt = ' '.join(get_subfield_values(line, ['a'], is_marc8)).strip('. ')
            break
예제 #3
0
def get_work_title(e, mc):
    """Look for a work title in the source records of edition ``e``.

    Iterates over the ``(src_type, src)`` pairs produced by
    ``get_marc_src(e, mc)``; ``mc`` is only forwarded to that call.  An 'ia'
    source is asked for its title via ``get_ia_work_title``; a 'marc' source
    is loaded with ``get_data`` and subfield ``a`` of its first 240 field is
    joined with spaces.  Surrounding dots and spaces are stripped either
    way.  Scanning stops at the first 'ia' source with a non-empty title or
    the first MARC record containing a 240 field.

    NOTE(review): no ``return`` statement is visible after the loop, so as
    shown the function always returns None.  The excerpt appears to be
    truncated; confirm against the full source.
    """
    # use first work title we find in source MARC records
    wt = None
    for src_type, src in get_marc_src(e, mc):
        if src_type == 'ia':
            wt = get_ia_work_title(src)
            if wt:
                wt = wt.strip('. ')
            # second check because stripping can leave an empty string
            if wt:
                break
            continue
        assert src_type == 'marc'
        # pre-set so the HTTPError path below leaves a falsy value behind
        data = None
        try:
            data = get_data(src)
        except ValueError:
            print 'bad record source:', src
            print 'http://openlibrary.org' + e['key']
            continue
        except urllib2.HTTPError, error:
            # falls through on purpose: data is still None and the
            # `if not data` test below skips this source
            print 'HTTP error:', error.code, error.msg
            print e['key']
        if not data:
            continue
        # NOTE(review): presumably leader byte 9 is the MARC character coding
        # scheme ('a' = Unicode); other values are treated as MARC-8 -- confirm
        is_marc8 = data[9] != 'a'
        try:
            line = get_first_tag(data, set(['240']))
        except BadDictionary:
            print 'bad dictionary:', src
            print 'http://openlibrary.org' + e['key']
            continue
        if line:
            # the first 240 field found ends the scan, even if its title is empty
            wt = ' '.join(get_subfield_values(line, ['a'],
                                              is_marc8)).strip('. ')
            break
예제 #4
0
def get_work_title(e):
    """Return the work title from the cached MARC record of edition ``e``.

    The record is looked up in the module-level ``marc`` mapping under
    ``e['key']``; element 1 of the stored entry is the record data.  The
    title is subfield ``a`` of the first 240 field, joined with spaces and
    with surrounding dots/spaces stripped.  Returns None when there is no
    cached record or no 240 field; in that case the edition is asserted to
    carry no ``work_titles`` of its own.
    """
    key = e['key']
    field = get_first_tag(marc[key][1], set(['240'])) if key in marc else None
    if not field:
        # Nothing in MARC, so nothing is expected on the edition either.
        assert not e.get('work_titles', [])
        return
    parts = get_subfield_values(field, ['a'])
    return ' '.join(parts).strip('. ')
예제 #5
0
def get_work_title(e):
    """Return the 240 $a work title for edition ``e``, or None.

    Looks the edition up by ``e['key']`` in the module-level ``marc``
    mapping and reads the record data from element 1 of the entry.  When
    the record has a 240 field, its ``a`` subfield values are joined with
    spaces and surrounding dots/spaces are stripped.  Otherwise the edition
    is asserted to have no ``work_titles`` and None is returned.
    """
    edition_key = e['key']
    if edition_key in marc:
        record = marc[edition_key][1]
        title_field = get_first_tag(record, set(['240']))
        if title_field:
            subfield_a = get_subfield_values(title_field, ['a'])
            return ' '.join(subfield_a).strip('. ')
    # No cached record, or no 240 field in it: the edition must not claim
    # a work title either.
    assert not e.get('work_titles', [])
    return None
예제 #6
0
def get_work_title(e, mc):
    """Return a work title for edition ``e``, or None when none is found.

    Sources from ``get_marc_src(e, mc)`` are tried in order: an 'ia' source
    via ``get_ia_work_title``, a 'marc' source via subfield ``a`` of the
    first 240 field of the record loaded with ``get_data``.  Titles have
    surrounding dots and spaces stripped.  If the sources yield nothing,
    the first entry of the edition's own ``work_titles`` (or, failing that,
    ``work_title``) list is returned, stripped the same way.
    """
    title = None
    for kind, source in get_marc_src(e, mc):
        if kind == 'ia':
            title = get_ia_work_title(source)
            title = title.strip('. ') if title else title
            if title:
                break
            continue

        assert kind == 'marc'
        try:
            record = get_data(source)
        except ValueError:
            print('bad record source:', source)
            print('http://openlibrary.org' + e['key'])
            continue
        except urllib.error.HTTPError as exc:
            print('HTTP error:', exc.code, exc.msg)
            print(e['key'])
            continue
        if not record:
            continue

        # Computed before parsing, exactly as the record is about to be read.
        marc8 = record[9] != 'a'
        try:
            field = get_first_tag(record, set(['240']))
        except BadDictionary:
            print('bad dictionary:', source)
            print('http://openlibrary.org' + e['key'])
            continue
        if not field:
            continue
        # The first 240 field ends the scan, even if its title is empty.
        values = get_subfield_values(field, ['a'], marc8)
        title = ' '.join(values).strip('. ')
        break

    if title:
        return title
    for field_name in ('work_titles', 'work_title'):
        stored = e.get(field_name, [])
        if not stored:
            continue
        assert isinstance(stored, list)
        return stored[0].strip('. ')
예제 #7
0
def get_work_title(e, mc):
    """Return a work title for edition ``e``, or None when none is found.

    The title is taken from the first usable source yielded by
    ``get_marc_src(e, mc)``; when the sources give nothing, the first item
    of the edition's ``work_titles`` or ``work_title`` list is used.  All
    titles are returned with surrounding dots and spaces stripped.
    """
    def title_from_sources():
        # Walk the sources in order and stop at the first decisive one.
        for kind, source in get_marc_src(e, mc):
            if kind == 'ia':
                found = get_ia_work_title(source)
                if found:
                    found = found.strip('. ')
                if found:
                    return found
                continue
            assert kind == 'marc'
            try:
                record = get_data(source)
            except ValueError:
                print('bad record source:', source)
                print('http://openlibrary.org' + e['key'])
                continue
            except urllib2.HTTPError as exc:
                print('HTTP error:', exc.code, exc.msg)
                print(e['key'])
                continue
            if not record:
                continue
            marc8 = record[9] != 'a'
            try:
                field = get_first_tag(record, set(['240']))
            except BadDictionary:
                print('bad dictionary:', source)
                print('http://openlibrary.org' + e['key'])
                continue
            if field:
                # A 240 field is final even when its title comes out empty.
                values = get_subfield_values(field, ['a'], marc8)
                return ' '.join(values).strip('. ')
        return None

    from_sources = title_from_sources()
    if from_sources:
        return from_sources
    for field_name in ('work_titles', 'work_title'):
        stored = e.get(field_name, [])
        if stored:
            assert isinstance(stored, list)
            return stored[0].strip('. ')
예제 #8
0
def get_work_title(e):
    """Return a work title for edition ``e``, or None when none is known.

    Sources from ``get_marc_src(e)`` are tried in order.  An 'ia' source is
    queried with ``get_ia_work_title``; a 'marc' source is loaded with
    ``get_data`` and subfield ``a`` of its first 240 field is joined with
    spaces and stripped of surrounding dots/spaces.  If no source gives a
    title, the first entry of ``e['work_titles']`` is returned after a
    notice is printed.
    """
    title = None
    for kind, source in get_marc_src(e):
        if kind == 'ia':
            title = get_ia_work_title(source)
            if title:
                break
        else:
            assert kind == 'marc'
            try:
                record = get_data(source)
            except ValueError:
                print('bad record source:', source)
                print('http://openlibrary.org' + e['key'])
                continue
            except urllib2.HTTPError as exc:
                print('HTTP error:', exc.code, exc.msg)
                print(e['key'])
                continue
            if not record:
                continue
            try:
                field = get_first_tag(record, set(['240']))
            except BadDictionary:
                print('bad dictionary:', source)
                print('http://openlibrary.org' + e['key'])
                continue
            if field:
                # The first 240 field ends the scan, empty title or not.
                values = get_subfield_values(field, ['a'])
                title = ' '.join(values).strip('. ')
                break

    if title:
        return title
    existing = e.get('work_titles', [])
    if not existing:
        return
    print('work title in MARC, but not in OL')
    print('http://openlibrary.org' + e['key'])
    return existing[0]
예제 #9
0
def find_subjects(marc_subjects):
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    time = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)
    #fiction = False
    for lines in marc_subjects:
        for tag, line in lines:
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1
            if re_large_book.search(line):
                continue
            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    v = '(' + v.strip('.() ') + ')' if k == 'd' else v.strip(
                        ' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        else:
                            m = re_flip_name.match(v)
                            if m:
                                v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    person[name] += 1
            elif tag == '610':  # org
                v = ' '.join(get_subfield_values(line, 'abcd'))
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    org[v] += 1

                for v in get_subfield_values(line, 'a'):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        org[v] += 1
            elif tag == '611':  # event
                v = ' '.join(j.strip() for i, j in get_all_subfields(line)
                             if i not in 'vxyz')
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    event[v] += 1
            elif tag == '630':  # work
                for v in get_subfield_values(line, ['a']):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        work[v] += 1
            elif tag == '650':  # topical
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651':  # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1
            else:
                print 'other', tag, list(get_all_subfields(line))

            cur = [
                v for k, v in get_all_subfields(line)
                if k == 'a' or v.strip('. ').lower() == 'fiction'
            ]

            # skip: 'Good, Sally (Fictitious character) in fiction'
            if len(cur) > 1 and cur[-1].strip(
                    '. ').lower() == 'fiction' and ')' not in cur[-2]:
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    time[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                v = tidy_subject(v)
                if v:
                    subject[v] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if not v:
                    continue
                if aspects and re_aspects.search(v):
                    continue
                v = tidy_subject(v)
                if v:
                    subject[v] += 1

            v_and_x = get_subfield_values(line, ['v', 'x'])
            #if 'Fiction' in v_and_x or 'Fiction.' in v_and_x:
            #    fiction = True
    #if 'Fiction' in subject:
    #    del subject['Fiction']
    ret = {}
    if person:
        ret['person'] = dict(person)
    if time:
        ret['time'] = dict(time)
    if place:
        ret['place'] = dict(place)
    if subject:
        ret['subject'] = dict(subject)
    if event:
        ret['event'] = dict(event)
    if org:
        ret['org'] = dict(org)
    if work:
        ret['work'] = dict(work)
    return ret
예제 #10
0
def find_subjects(marc_subjects):
    """Tally the subject headings found in MARC subject fields, by kind.

    ``marc_subjects`` is an iterable of records, each an iterable of
    ``(tag, line)`` pairs.  Tags 600, 610, 611, 630, 650 and 651 feed the
    person, org, event, work, subject and place counters respectively; any
    other tag is printed to stdout.  Independently of the tag, subfields
    ``y``, ``v``, ``z`` and ``x`` of every line feed the time, subject,
    place and subject counters.

    Returns a dict with any of the keys 'person', 'time', 'place',
    'subject', 'event', 'org' and 'work'; a key is present only when its
    counter is non-empty, and maps each heading to its occurrence count.
    """
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    time = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)
    #fiction = False
    for lines in marc_subjects:
        for tag, line in lines:
            # aspects are counted before the large-book check, so they are
            # kept even for lines that are skipped right after
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1
            if re_large_book.search(line):
                continue
            if tag == '600': # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    # $d is wrapped in parentheses (presumably the dates part
                    # of the name -- confirm); other subfields only lose
                    # surrounding punctuation
                    v = '(' + v.strip('.() ') + ')' if k == 'd' else v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        else:
                            m = re_flip_name.match(v)
                            if m:
                                v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    person[name] += 1
            elif tag == '610': # org
                # counted once for the joined a/b/c/d value ...
                v = ' '.join(get_subfield_values(line, 'abcd'))
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    org[v] += 1

                # ... and once more for each $a value on its own.
                # NOTE(review): a heading with only $a is therefore counted
                # twice -- confirm this is intended
                for v in get_subfield_values(line, 'a'):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        org[v] += 1
            elif tag == '611': # event
                # every subfield except v, x, y and z, joined with spaces
                v = ' '.join(j.strip() for i, j in get_all_subfields(line) if i not in 'vxyz')
                if v:
                    v = v.strip()
                # unlike the 610/630 branches, tidy_subject is called here
                # even when v is empty
                v = tidy_subject(v)
                if v:
                    event[v] += 1
            elif tag == '630': # work
                for v in get_subfield_values(line, ['a']):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        work[v] += 1
            elif tag == '650': # topical
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651': # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1
            else:
                print 'other', tag, list(get_all_subfields(line))

            # collect the $a values plus any subfield that reads "fiction";
            # "<heading> ... Fiction" becomes the subject "<heading> in fiction"
            cur = [v for k, v in get_all_subfields(line) if k=='a' or v.strip('. ').lower() == 'fiction']

            # skip: 'Good, Sally (Fictitious character) in fiction'
            if len(cur) > 1 and cur[-1].strip('. ').lower() == 'fiction' and ')' not in cur[-2]:
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            # the subdivision subfields below are processed for every tag
            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    time[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                v = tidy_subject(v)
                if v:
                    subject[v] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if not v:
                    continue
                # already counted as part of the aspects heading above
                if aspects and re_aspects.search(v):
                    continue
                v = tidy_subject(v)
                if v:
                    subject[v] += 1

            # NOTE(review): v_and_x is never read; it looks like a leftover
            # from the commented-out fiction check below
            v_and_x = get_subfield_values(line, ['v', 'x'])
            #if 'Fiction' in v_and_x or 'Fiction.' in v_and_x:
            #    fiction = True
    #if 'Fiction' in subject:
    #    del subject['Fiction']
    # only non-empty counters make it into the result
    ret = {}
    if person:
        ret['person'] = dict(person)
    if time:
        ret['time'] = dict(time)
    if place:
        ret['place'] = dict(place)
    if subject:
        ret['subject'] = dict(subject)
    if event:
        ret['event'] = dict(event)
    if org:
        ret['org'] = dict(org)
    if work:
        ret['work'] = dict(work)
    return ret
예제 #11
0
                continue
            title_seen = True
            if line[1] == '0': # no prefix
                continue
            contents = get_contents(line, ['a', 'b'])
            if 'a' in contents:
                rec['title'] = ' '.join(x.strip(' /,;:') for x in contents['a'])
            elif 'b' in contents:
                rec['title'] = contents['b'][0].strip(' /,;:')
            if 'title' in rec and has_dot(rec['title']):
                rec['title'] = rec['title'][:-1]
            continue
        if tag == '300':
            if 'accompanying_material' in rec:
                continue
            subtag_e = ' '.join(i.strip('. ') for i in get_subfield_values(line, set(['e'])))
            if subtag_e:
                if subtag_e.lower() in ('list', 'notes', 'book'):
                    continue
                rec['accompanying_material'] = subtag_e
            continue
        fields.setdefault(tag, []).append(line)

    for line in fields.get('041', []):
        found = []
        marc_h = list(get_subfield_values(line, 'h'))
        if not marc_h:
            continue
        for h in marc_h:
            if len(h) % 3 != 0:
                print 'bad:', list(get_all_subfields(line))
예제 #12
0
            if line[1] == '0':  # no prefix
                continue
            contents = get_contents(line, ['a', 'b'])
            if 'a' in contents:
                rec['title'] = ' '.join(
                    x.strip(' /,;:') for x in contents['a'])
            elif 'b' in contents:
                rec['title'] = contents['b'][0].strip(' /,;:')
            if 'title' in rec and has_dot(rec['title']):
                rec['title'] = rec['title'][:-1]
            continue
        if tag == '300':
            if 'accompanying_material' in rec:
                continue
            subtag_e = ' '.join(
                i.strip('. ') for i in get_subfield_values(line, set(['e'])))
            if subtag_e:
                if subtag_e.lower() in ('list', 'notes', 'book'):
                    continue
                rec['accompanying_material'] = subtag_e
            continue
        fields.setdefault(tag, []).append(line)

    for line in fields.get('041', []):
        found = []
        marc_h = list(get_subfield_values(line, 'h'))
        if not marc_h:
            continue
        for h in marc_h:
            if len(h) % 3 != 0:
                print 'bad:', list(get_all_subfields(line))
예제 #13
0
def find_subjects(w, marc_subjects=None):
    """Tally people, times, places and subjects from MARC subject fields.

    ``marc_subjects`` is an iterable of records, each an iterable of
    ``(tag, line)`` pairs; when it is None or empty the records are taken
    from ``get_marc_subjects(w)`` instead.  Tag 600 feeds the people
    counter, 650 $a the subject counter and 651 $a the place counter.
    Independently of the tag, subfields ``y``, ``v``, ``z`` and ``x`` of
    every line feed the time, subject, place and subject counters.

    Returns a dict with any of the keys 'people', 'times', 'places' and
    'subjects'; a key is present only when its counter is non-empty, and
    maps each heading to its occurrence count.
    """
    people = defaultdict(int)
    when = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)
    for lines in marc_subjects or get_marc_subjects(w):
        for tag, line in lines:
            if re_large_book.match(line):
                continue
            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    if k == 'd':
                        v = '(' + v.strip('.() ') + ')'
                    else:
                        v = v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        elif re_flip_name.match(v):
                            v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name:
                    people[name] += 1
            elif tag == '650':  # topical
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651':  # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1

            # The subdivision subfields are processed for every tag.
            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    when[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    subject[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    subject[v] += 1

    ret = {}
    if people:
        ret['people'] = dict(people)
    if when:
        ret['times'] = dict(when)
    if place:
        ret['places'] = dict(place)
    if subject:
        ret['subjects'] = dict(subject)
    return ret