Пример #1
0
def test_lookup3():
    """Run the name lookup for a fixed "John, of Paris" MARC line and show the result.

    Manual smoke test: prints the formatted subfields and the stripped
    (code, value) pairs, feeds them to ``name_lookup``, narrows the candidates
    with ``look_for_match`` and ``pick_from_match`` and pretty-prints whatever
    is left.  Nothing is asserted and nothing is returned.
    """
    line = '00\x1faJohn,\x1fcof Paris,\x1fd1240?-1306.\x1e'
    # Single-argument print(...) behaves the same as a print statement on
    # Python 2 and as the print function on Python 3.
    print(fmt_line(get_subfields(line, 'abcd')))
    fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd'))
    print(fields)
    found = name_lookup(fields)
    # Hard-coded candidate list that was left commented out, kept for reference:
    # found = [(u'John of Paris', [u'Christian philosophers', u'Dominicans', u'Roman Catholic theologians', u'13th-century Latin writers', u'1255 births', u'1306 deaths'], u'john of paris', None)]
    # Only the 'd' subfield values are offered to the date picker.
    dates = pick_first_date(v for k, v in fields if k == 'd')
    match = look_for_match(found, dates, False)
    match = pick_from_match(match)
    pprint(match)
Пример #2
0
def test_lookup():
    """Run the name lookup for a fixed "Egeria" MARC line and print each step.

    Manual smoke test: prints the formatted subfields, the stripped
    (code, value) pairs, the raw lookup result, the picked dates and finally
    the value returned by ``look_for_match``.  The only check is that the first
    item of the picked dates is not the ``('date', '')`` placeholder.
    """
    line = '00\x1faEgeria,\x1fd4th/5th cent.\x1e' # count=3
    # The original kept the string 'Egeria (pilgrim)' in an unused local named
    # ``wiki`` -- presumably the Wikipedia article this line should resolve to.
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(fmt_line(get_subfields(line, 'abcd')))
    fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd'))
    print(fields)
    found = name_lookup(fields)
    print(found)
    dates = pick_first_date(v for k, v in fields if k == 'd')
    # list(...) is needed because dict views cannot be indexed on Python 3.
    assert list(dates.items())[0] != ('date', '')
    print(dates)
    # print('') emits the blank line on both Python 2 and 3; a bare ``print``
    # is a silent no-op on Python 3.
    print('')
    print(look_for_match(found, dates, True))
Пример #3
0
def test_lookup2():
    """Run the name lookup for a fixed "Richard, of St. Victor" MARC line.

    Manual smoke test: prints the formatted subfields, the stripped
    (code, value) pairs and the picked dates, then pretty-prints the result of
    ``look_for_match`` both before and after ``pick_from_match`` narrows it.
    The only check is that the first item of the picked dates is not the
    ``('date', '')`` placeholder.
    """
    line = '00\x1faRichard,\x1fcof St. Victor,\x1fdd. 1173.\x1e'
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(fmt_line(get_subfields(line, 'abcd')))
    fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd'))
    print(fields)
    found = name_lookup(fields)
    dates = pick_first_date(v for k, v in fields if k == 'd')
    # list(...) is needed because dict views cannot be indexed on Python 3.
    assert list(dates.items())[0] != ('date', '')
    print(dates)
    # print('') emits the blank line on both Python 2 and 3; a bare ``print``
    # is a silent no-op on Python 3.
    print('')
    match = look_for_match(found, dates, False)
    pprint(match)
    print('')
    match = pick_from_match(match)
    pprint(match)
Пример #4
0
def read_isbn(fields):
    """Collect the ISBNs found in the '020' lines of *fields*.

    *fields* maps a tag to a list of raw lines.  Lines containing the
    ``\\x1f`` subfield delimiter are read through their 'a' and 'z' subfields;
    other lines are matched as a whole after dropping the first three and the
    last character.  The matches are passed through ``tidy_isbn`` and
    de-duplicated in order.

    Returns a dict with optional 'isbn_13' and 'isbn_10' lists, or an empty
    dict when there is no '020' entry or nothing matched.
    """
    if '020' not in fields:
        return {}

    candidates = []
    for line in fields['020']:
        if '\x1f' not in line:
            # No subfield delimiter: match the line minus its first three and
            # last characters.
            m = re_isbn.match(line[3:-1])
            if m:
                candidates.append(m.group(1))
            continue
        for _code, value in get_subfields(line, ['a', 'z']):
            # Try the "ISBN plus price" pattern first, then the plain one.
            m = re_isbn_and_price.match(value) or re_isbn.match(value)
            if m:
                candidates.append(m.group(1))

    result = {}
    emitted = set()
    for isbn in tidy_isbn(candidates):
        if isbn in emitted:  # keep only the first occurrence
            continue
        emitted.add(isbn)
        size = len(isbn)
        if size == 13:
            key = 'isbn_13'
        elif size <= 16:
            # NOTE(review): every other value up to 16 characters long is
            # filed under isbn_10 -- confirm that bound is intended.
            key = 'isbn_10'
        else:
            continue
        result.setdefault(key, []).append(isbn)
    return result
Пример #5
0
def read_marc():
    """Print the formatted a/b/c/d subfields of every line in ``marc_authors.bz2``.

    Each line of the archive is evaluated as a Python expression; results
    containing '[Sound recording]' are skipped, the rest are passed through
    ``strip_brackets`` and printed via ``fmt_line``.  Returns nothing.
    """
    # The context manager closes the archive even when a line fails to parse.
    with bz2.BZ2File('marc_authors.bz2') as marc_file:
        for line in marc_file:
            # NOTE(review): eval() executes whatever expression the archive
            # contains.  If the lines are plain literals, ast.literal_eval
            # would be the safe replacement -- confirm the file format first.
            line = eval(line)
            if '[Sound recording]' in line:
                continue
            line = strip_brackets(line)
            print(fmt_line(get_subfields(line, 'abcd')))
Пример #6
0
def read_line(line, name):
    """Return name and date details for *line* if its name matches *name*.

    The line must be non-empty and contain a 'd' subfield marker
    (``\\x1fd``).  The a/b/c subfield values, stripped of surrounding
    `` /,;:`` characters, are joined with spaces to form the MARC name; that
    name or its ``flip_name`` form has to equal *name*.

    Returns ``(marc_name, flipped, (birth_date, death_date, date))`` where
    absent date keys are ``None``, or ``None`` when the line is rejected.
    """
    if not (line and '\x1fd' in line):
        return None

    subfields = tuple(
        (code, value.strip(' /,;:'))
        for code, value in get_subfields(line, 'abcd')
    )
    name_parts = [value for code, value in subfields if code in 'abc']
    marc_name = ' '.join(name_parts)
    flipped = flip_name(marc_name)
    if not (marc_name == name or flipped == name):
        return None

    # Every subfield value is offered to the date picker, not only 'd'.
    picked = pick_first_date(
        value for code, value in subfields if code in 'abcd'
    )
    dates = tuple(
        picked.get(key, None) for key in ['birth_date', 'death_date', 'date']
    )
    return (marc_name, flipped, dates)
Пример #7
0
def read_line(line, name):
    """Extract ``(marc_name, flipped, dates)`` from *line* when it names *name*.

    Lines that are empty or lack the ``\\x1fd`` subfield marker are ignored.
    The MARC name is the space-joined a/b/c subfield values (each stripped of
    `` /,;:``); the line is accepted only if that name, or the result of
    ``flip_name`` on it, equals *name*.  ``dates`` is the tuple
    ``(birth_date, death_date, date)`` taken from ``pick_first_date``, with
    ``None`` for keys it did not return.

    Returns ``None`` for ignored or non-matching lines.
    """
    has_date_subfield = bool(line) and '\x1fd' in line
    if not has_date_subfield:
        return None

    pairs = tuple(
        (code, text.strip(' /,;:')) for code, text in get_subfields(line, 'abcd')
    )
    marc_name = ' '.join(text for code, text in pairs if code in 'abc')
    flipped = flip_name(marc_name)

    matches_name = marc_name == name or flipped == name
    if not matches_name:
        return None

    # All four subfields are passed to the date picker, not only 'd'.
    date_info = pick_first_date(text for code, text in pairs if code in 'abcd')
    wanted_keys = ['birth_date', 'death_date', 'date']
    dates = tuple(date_info.get(key, None) for key in wanted_keys)
    return (marc_name, flipped, dates)
Пример #8
0
def read_oclc(fields):
    """Collect OCLC numbers from the 001/003 and 035 entries of *fields*.

    When the first '003' value is 'OCoLC', the first '001' value is taken as
    an OCLC number (and asserted to be all digits).  In addition, every 'a'
    subfield of each '035' line that matches ``re_oclc`` contributes its
    first group, skipping values already collected.

    Returns ``{'oclc_number': [...]}`` (passed through ``remove_duplicates``)
    or an empty dict when nothing was found.
    """
    numbers = []

    is_oclc_record = (
        '003' in fields
        and '001' in fields
        and fields['003'][0] == 'OCoLC'
    )
    if is_oclc_record:
        control_number = fields['001'][0]
        assert control_number.isdigit()
        numbers.append(control_number)

    for line in fields.get('035', []):
        for _code, value in get_subfields(line, ['a']):
            m = re_oclc.match(value)
            if not m:
                continue
            number = m.group(1)
            if number not in numbers:
                numbers.append(number)

    if not numbers:
        return {}
    return {'oclc_number': remove_duplicates(numbers) }
Пример #9
0
def read_series(fields):
    """Build series strings from the '440', '490' and '830' lines of *fields*.

    For every line, the non-empty 'a' and 'v' subfield values are joined with
    ' -- '.  A non-empty 'v' value is used verbatim; any other value first
    has trailing ``.,; `` characters removed and is dropped if nothing is
    left.

    Returns ``{'series': [...]}`` or an empty dict when no line produced text.
    """
    series = []
    for tag in ('440', '490', '830'):
        if tag in fields:
            for line in fields[tag]:
                parts = []
                for code, value in get_subfields(line, ['a', 'v']):
                    # Only a non-empty 'v' value escapes the trailing
                    # punctuation clean-up.
                    if not (code == 'v' and value):
                        value = value.rstrip('.,; ')
                    if value:
                        parts.append(value)
                if parts:
                    series.append(' -- '.join(parts))

    if series:
        return {'series': series}
    return {}
Пример #10
0
def read_lccn(fields):
    """Collect LCCN values from the 'a' subfields of the '010' lines in *fields*.

    Each subfield value is stripped; values matching ``re_question`` are
    skipped, as are values in which ``re_lccn`` finds nothing.  From the first
    group of the ``re_lccn`` match, everything matched by ``re_letters`` is
    removed and the remainder is kept if it is non-empty.

    Returns ``{'lccn': [...]}``, or an empty dict when there is no '010' entry
    or no usable value was found.
    """
    if '010' not in fields:
        return {}

    found = []
    for line in fields['010']:
        for k, v in get_subfields(line, ['a']):
            lccn = v.strip()
            if re_question.match(lccn):
                continue
            m = re_lccn.search(lccn)
            if not m:
                continue
            lccn = re_letters.sub('', m.group(1)).strip()
            if lccn:
                found.append(lccn)

    # Omit the key entirely when nothing was found, so that "no 010 lines" and
    # "010 lines without a usable LCCN" give the same result.
    return {'lccn': found} if found else {}
Пример #11
0
def db_marc_lookup():
    """Match the author lines of ``marc_authors.bz2`` against the name database.

    Every line of the archive is evaluated, passed through ``strip_brackets``
    and split into its a/b/c/d subfields.  Lines whose picked dates start with
    the ``('date', '')`` placeholder, or for which ``name_lookup`` returns
    nothing, are skipped.  A candidate counts as a match when at least one of
    its categories ends in ' births' or ' deaths' and ``date_match`` accepts
    it.

    Progress is printed every 1000 lines, lines with more than one matching
    article are listed with their Wikipedia URLs, and the number of matched
    lines is printed at the end.  Returns nothing.
    """
    c = get_cursor()
    count = 0
    t0 = time()
    match_count = 0
    # Hard-coded total used only for the time-left estimate; presumably the
    # number of lines in the archive.
    total = 3596802
    # The context manager closes the archive even if a line raises.
    with bz2.BZ2File('marc_authors.bz2') as marc_file:
        for line in marc_file:
            count += 1
            if count % 1000 == 0:
                t1 = time() - t0
                rec_per_sec = count / t1
                time_left = (total - count) / rec_per_sec
                # 100.0 keeps the percentage fractional under integer division.
                print(
                    count, match_count, "%.2f%% %.2f mins left" %
                    ((match_count * 100.0) / count, time_left / 60))
            # NOTE(review): eval() executes whatever expression the archive
            # contains; ast.literal_eval would be safer if the lines are plain
            # literals -- confirm the file format before switching.
            line = eval(line)
            line = strip_brackets(line)
            fields = [(k, v.strip(' /,;:'))
                      for k, v in get_subfields(line, 'abcd')]
            dates = pick_first_date(v for k, v in fields if k == 'd')
            # list(...) is needed because dict views cannot be indexed on
            # Python 3.
            if list(dates.items())[0] == ('date', ''):
                continue
            found = name_lookup(c, fields)
            if not found:
                continue
            match = {}
            seen = set()
            for name, cats, match_name, _pd in found:
                if name in seen:
                    continue
                seen.add(name)
                # NOTE(review): the category list is stored as a Python
                # expression and evaluated here; same caveat as above.
                cats = eval(cats)
                if not any(
                        cat.endswith((' births', ' deaths')) for cat in cats):
                    continue
                if date_match(dates, cats):
                    match[name] = (cats, match_name)

            if not match:
                continue
            match_count += 1
            if len(match) != 1:
                # Ambiguous line: list every matching article for inspection.
                print(count, match_count)
                print(fmt_line(get_subfields(line, 'abcd')))
                for name, (cats, match_name) in match.items():
                    print(name, cats, match_name)
                    print("http://en.wikipedia.org/wiki/" + name.replace(' ', '_'))
                print()
    print(match_count)
Пример #12
0
def marc_title(data):
    """Render the 'a' and 'b' subfields of the first '245' line of *data*.

    Each subfield becomes ``<b>$code</b>value`` with both parts passed through
    ``esc``; the pieces are concatenated without a separator.  Returns ``None``
    when ``get_first_tag`` yields no line.
    """
    line = get_first_tag(data, {'245'})
    if not line:
        return None
    pieces = []
    for code, value in get_subfields(line, {'a', 'b'}):
        pieces.append("<b>$%s</b>%s" % (esc(code), esc(value)))
    return ''.join(pieces)
Пример #13
0
def db_marc_lookup():
    """Match the author lines of ``marc_authors.bz2`` against the name database.

    Every line of the archive is evaluated, passed through ``strip_brackets``
    and split into its a/b/c/d subfields.  Lines whose picked dates start with
    the ``('date', '')`` placeholder, or for which ``name_lookup`` returns
    nothing, are skipped.  A candidate counts as a match when at least one of
    its categories ends in ' births' or ' deaths' and ``date_match`` accepts
    it.

    Progress is printed every 1000 lines, lines with more than one matching
    article are listed with their Wikipedia URLs, and the number of matched
    lines is printed at the end.  Returns nothing.

    All output goes through single-argument ``print(...)`` calls, which behave
    the same as a print statement on Python 2 and as the print function on
    Python 3.
    """
    c = get_cursor()
    count = 0
    t0 = time()
    match_count = 0
    # Hard-coded total used only for the time-left estimate; presumably the
    # number of lines in the archive.
    total = 3596802
    # The context manager closes the archive even if a line raises.
    with bz2.BZ2File('marc_authors.bz2') as marc_file:
        for line in marc_file:
            count += 1
            if count % 1000 == 0:
                t1 = time() - t0
                rec_per_sec = count / t1
                time_left = (total - count) / rec_per_sec
                # 100.0 keeps the percentage fractional under integer division.
                print("%s %s %.2f%% %.2f mins left" % (
                    count, match_count,
                    (match_count * 100.0) / count, time_left / 60))
            # NOTE(review): eval() executes whatever expression the archive
            # contains; ast.literal_eval would be safer if the lines are plain
            # literals -- confirm the file format before switching.
            line = eval(line)
            line = strip_brackets(line)
            fields = [(k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd')]
            dates = pick_first_date(v for k, v in fields if k == 'd')
            # list(...) is needed because dict views cannot be indexed on
            # Python 3.
            if list(dates.items())[0] == ('date', ''):
                continue
            found = name_lookup(c, fields)
            if not found:
                continue
            match = {}
            seen = set()
            for name, cats, match_name, _pd in found:
                if name in seen:
                    continue
                seen.add(name)
                # NOTE(review): the category list is stored as a Python
                # expression and evaluated here; same caveat as above.
                cats = eval(cats)
                if not any(cat.endswith((' births', ' deaths')) for cat in cats):
                    continue
                if date_match(dates, cats):
                    match[name] = (cats, match_name)

            if not match:
                continue
            match_count += 1
            if len(match) != 1:
                # Ambiguous line: list every matching article for inspection.
                print("%s %s" % (count, match_count))
                print(fmt_line(get_subfields(line, 'abcd')))
                for name, (cats, match_name) in match.items():
                    print("%s %s %s" % (name, cats, match_name))
                    print("http://en.wikipedia.org/wiki/" + name.replace(' ', '_'))
                print('')
    print(match_count)
Пример #14
0
def marc_title(data):
    """Format the "a" and "b" subfields of the first "245" line in *data*.

    Every subfield is written as ``<b>$code</b>value``, with the code and the
    value each passed through ``esc``, and the results are joined with no
    separator.  ``None`` is returned when ``get_first_tag`` finds no line.
    """
    line = get_first_tag(data, {"245"})
    if line:
        return "".join(
            "<b>$%s</b>%s" % (esc(code), esc(text))
            for code, text in get_subfields(line, {"a", "b"})
        )
    return None
Пример #15
0
def get_subfield_values(line, want):
    """Return the values of the *want* subfields of *line*, in the order yielded.

    The subfield codes produced by ``get_subfields`` are discarded; only the
    values are kept, as a list.
    """
    values = []
    for _code, value in get_subfields(line, want):
        values.append(value)
    return values
Пример #16
0
def marc_title(data):
    """Return the 'a'/'b' subfields of the first '245' line of *data* as markup.

    The output is the concatenation of ``<b>$code</b>value`` for each subfield,
    where code and value are both run through ``esc``.  If ``get_first_tag``
    returns nothing, the function returns ``None``.
    """
    wanted_tags = {'245'}
    wanted_subfields = {'a', 'b'}

    line = get_first_tag(data, wanted_tags)
    if not line:
        return None

    rendered = [
        "<b>$%s</b>%s" % (esc(code), esc(value))
        for code, value in get_subfields(line, wanted_subfields)
    ]
    return ''.join(rendered)
Пример #17
0
def db_marc_lookup():
    """Match the author lines of ``marc_authors.bz2`` and record unique matches.

    Every line of the archive is evaluated, passed through ``strip_brackets``
    and split into its a/b/c/d subfields.  A line is skipped when its fields
    equal those of the previous line, when its picked dates start with the
    ``('date', '')`` placeholder, or when ``name_lookup`` returns nothing.  A
    candidate counts as a match when at least one of its categories ends in
    ' births' or ' deaths' and ``date_match`` accepts it.

    If several candidates match, ``pick_from_match`` is asked to narrow them
    down.  A single remaining match is written to the file ``matches3`` as a
    ``(name, fields)`` tuple; anything else is reported through
    ``more_than_one_match``.  Progress is printed every 5000 lines and the
    number of matched lines at the end.  Returns nothing.

    Setting the local ``verbose`` flag to True prints per-line and
    per-candidate details.
    """
    verbose = False
    c = get_cursor()
    count = 0
    count_with_date = 0
    t0 = time()
    match_count = 0
    # Hard-coded total used only for the time-left estimate; presumably the
    # number of lines in the archive.
    total = 3596802
    prev_fields = None
    # Both files are closed by the context manager, even if a line raises.
    with open('matches3', 'w') as fh, bz2.BZ2File('marc_authors.bz2') as marc_file:
        for line in marc_file:
            count += 1
            # NOTE(review): eval() executes whatever expression the archive
            # contains; ast.literal_eval would be safer if the lines are plain
            # literals -- confirm the file format before switching.
            line = eval(line)
            line = strip_brackets(line)
            if count % 5000 == 0:
                t1 = time() - t0
                rec_per_sec = count / t1
                time_left = (total - count) / rec_per_sec
                # No dated line may have been seen yet; avoid dividing by zero.
                if count_with_date:
                    percent = match_count * 100.0 / count_with_date
                else:
                    percent = 0.0
                print(fmt_line(get_subfields(line, 'abcd')))
                print(count, count_with_date, match_count, "%.2f%% %.2f mins left" % (percent, time_left / 60))
            fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd'))
            # Skip a line that repeats the previous line's fields.
            if prev_fields == fields:
                continue
            prev_fields = fields
            dates = pick_first_date(v for k, v in fields if k == 'd')
            # list(...) is needed because dict views cannot be indexed on
            # Python 3.
            if list(dates.items())[0] == ('date', ''):
                continue
            count_with_date += 1
            if verbose:
                print(fmt_line(get_subfields(line, 'abcd')))
                print(dates)
            found = name_lookup(c, fields)
            if not found:
                continue
            match = {}
            seen = set()
            for name, cats, match_name, _pd in found:
                if name in seen:
                    continue
                seen.add(name)
                # NOTE(review): the category list is stored as a Python
                # expression and evaluated here; same caveat as above.
                cats = eval(cats)
                if not any(cat.endswith((' births', ' deaths')) for cat in cats):
                    continue
                dm = date_match(dates, cats)
                if dm:
                    match[name] = (cats, match_name)
                if not verbose:
                    continue
                print((name, match_name))
                print("cats =", cats)
                print(('match' if dm else 'no match'))
                for field in ['birth', 'death']:
                    # Slice off the trailing " births" / " deaths" suffix.
                    print(field + 's:', [i[:-(len(field)+2)] for i in cats if i.endswith(' %ss' % field)], end=' ')
                print()
            if verbose:
                print('---')

            if not match:
                continue
            match_count += 1
            if len(match) != 1:
                match = pick_from_match(match)
            if len(match) != 1:
                print(count, match_count)
                print(fmt_line(get_subfields(line, 'abcd')))
                more_than_one_match(match)
            else:
                # list(...) is needed because dict views cannot be indexed on
                # Python 3.
                print((list(match.keys())[0], fields), file=fh)
    print(match_count)
Пример #18
0
def get_contents(line, want):
    """Group the *want* subfield values of *line* by subfield code.

    Returns a dict mapping each code yielded by ``get_subfields`` to the list
    of its values, in the order they were yielded.
    """
    grouped = {}
    for code, value in get_subfields(line, want):
        if code not in grouped:
            grouped[code] = []
        grouped[code].append(value)
    return grouped