示例#1
0
def read_author_person(line):
    """Build an author record for a person from one MARC name field.

    Subfields a, b, c and d of ``line`` are read.  Returns ``None`` when
    the field has neither an 'a' nor a 'c' subfield.  Otherwise returns a
    dict that starts from the result of ``pick_first_date`` on the 'd'
    values (or from an empty dict when there is no 'd') and gains the keys
    'name', 'entity_type' and one entry per a/b/c subfield present.
    """
    contents = get_contents(line, ['a', 'b', 'c', 'd'])
    if not ('a' in contents or 'c' in contents):
        return None  # should at least be a name or title

    name_parts = [
        value.strip(' /,;:')
        for value in get_subfield_values(line, ['a', 'b', 'c'])
    ]

    if 'd' in contents:
        author = pick_first_date(contents['d'])
        death_date = author['death_date'] if 'death_date' in author else None
        if death_date and re_number_dot.search(death_date):
            # drop the final character when re_number_dot matches
            author['death_date'] = death_date[:-1]
    else:
        author = {}

    author['name'] = ' '.join(name_parts)
    author['entity_type'] = 'person'

    field_names = {'a': 'personal_name', 'b': 'numeration', 'c': 'title'}
    for code in ('a', 'b', 'c'):
        if code not in contents:
            continue
        cleaned = [part.strip(' /,;:') for part in contents[code]]
        author[field_names[code]] = ' '.join(cleaned)

    # NOTE(review): 'q' is not among the subfields requested from
    # get_contents above -- confirm this branch can ever be taken.
    if 'q' in contents:
        author['fuller_name'] = ' '.join(contents['q'])
    return author
示例#2
0
def test_lookup4():
    """Run the lookup pipeline for 'Forbes, George' (1849-1936) and print it.

    Prints every candidate returned by ``look_for_match`` and then
    pretty-prints what ``pick_from_match`` keeps.
    """
    fields = (('a', 'Forbes, George'), ('d', '1849-1936.'))
    found = name_lookup(fields)
    dates = pick_first_date(v for k, v in fields if k == 'd')
    match = look_for_match(found, dates, False)
    for k, v in match.items():
        print(k, v)
    match = pick_from_match(match)
    pprint(match)
示例#3
0
def test_lookup4():
    """Run the lookup pipeline for 'Forbes, George' (1849-1936) and print it.

    Prints every candidate returned by ``look_for_match`` and then
    pretty-prints what ``pick_from_match`` keeps.
    """
    fields = (('a', 'Forbes, George'), ('d', '1849-1936.'))
    found = name_lookup(fields)
    dates = pick_first_date(v for k, v in fields if k == 'd')
    match = look_for_match(found, dates, False)
    for k, v in match.items():
        print(k, v)
    match = pick_from_match(match)
    pprint(match)
示例#4
0
def read_line(line, name):
    """Return ``(marc_name, flipped, dates)`` if a MARC line names ``name``.

    ``line`` must be non-empty and contain a 'd' subfield marker
    (``'\\x1fd'``); otherwise ``None`` is returned.  The a/b/c subfield
    values are joined into ``marc_name``; the line is accepted when
    ``name`` equals either ``marc_name`` or ``flip_name(marc_name)``.
    ``dates`` is the ``(birth_date, death_date, date)`` triple taken from
    the ``pick_first_date`` result, with ``None`` for missing keys.
    """
    if not line:
        return None
    if '\x1fd' not in line:
        return None

    subfields = tuple(
        (code, value.strip(' /,;:'))
        for code, value in get_subfields(line, 'abcd')
    )
    marc_name = ' '.join(value for code, value in subfields if code in 'abc')
    flipped = flip_name(marc_name)
    if not (marc_name == name or flipped == name):
        return None

    # NOTE(review): this passes the a/b/c values to pick_first_date as
    # well as 'd'; other callers filter on k == 'd' -- confirm intent.
    parsed = pick_first_date(
        value for code, value in subfields if code in 'abcd'
    )
    dates = tuple(
        parsed.get(key, None) for key in ('birth_date', 'death_date', 'date')
    )
    return (marc_name, flipped, dates)
示例#5
0
def read_line(line, name):
    """Return ``(marc_name, flipped, dates)`` if a MARC line names ``name``.

    ``line`` must be non-empty and contain a 'd' subfield marker
    (``'\\x1fd'``); otherwise ``None`` is returned.  The a/b/c subfield
    values are joined into ``marc_name``; the line is accepted when
    ``name`` equals either ``marc_name`` or ``flip_name(marc_name)``.
    ``dates`` is the ``(birth_date, death_date, date)`` triple taken from
    the ``pick_first_date`` result, with ``None`` for missing keys.
    """
    if not line:
        return None
    if '\x1fd' not in line:
        return None

    subfields = tuple(
        (code, value.strip(' /,;:'))
        for code, value in get_subfields(line, 'abcd')
    )
    marc_name = ' '.join(value for code, value in subfields if code in 'abc')
    flipped = flip_name(marc_name)
    if not (marc_name == name or flipped == name):
        return None

    # NOTE(review): this passes the a/b/c values to pick_first_date as
    # well as 'd'; other callers filter on k == 'd' -- confirm intent.
    parsed = pick_first_date(
        value for code, value in subfields if code in 'abcd'
    )
    dates = tuple(
        parsed.get(key, None) for key in ('birth_date', 'death_date', 'date')
    )
    return (marc_name, flipped, dates)
示例#6
0
def test_lookup3():
    """Run the lookup pipeline on a 'John, of Paris, 1240?-1306.' record.

    Prints the formatted subfields and the cleaned field tuple, then
    pretty-prints what ``pick_from_match`` keeps of the candidates.
    """
    line = '00\x1faJohn,\x1fcof Paris,\x1fd1240?-1306.\x1e'
    print(fmt_line(get_subfields(line, 'abcd')))
    fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd'))
    print(fields)
    found = name_lookup(fields)
#    print [i for i in found if 'Paris' in i[0]]
#    found = [(u'John of Paris', [u'Christian philosophers', u'Dominicans', u'Roman Catholic theologians', u'13th-century Latin writers', u'1255 births', u'1306 deaths'], u'john of paris', None)]
    dates = pick_first_date(v for k, v in fields if k == 'd')
    match = look_for_match(found, dates, False)
    match = pick_from_match(match)
    pprint(match)
示例#7
0
def test_lookup():
    """Run the lookup pipeline on an 'Egeria, 4th/5th cent.' record.

    Prints each intermediate result (formatted subfields, cleaned fields,
    lookup hits, parsed dates) and finally the verbose ``look_for_match``
    result.  Asserts that the first parsed date item is not the empty
    ``('date', '')`` pair.
    """
    line = '00\x1faEgeria,\x1fd4th/5th cent.\x1e' # count=3
    # not used below; presumably the article this record should match
    wiki = 'Egeria (pilgrim)'
    print(fmt_line(get_subfields(line, 'abcd')))
    fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd'))
    print(fields)
    found = name_lookup(fields)
    print(found)
    dates = pick_first_date(v for k, v in fields if k == 'd')
    # dict views cannot be indexed on Python 3, so materialise the items
    assert list(dates.items())[0] != ('date', '')
    print(dates)
    print()
    print(look_for_match(found, dates, True))
示例#8
0
def test_lookup2():
    """Run the lookup pipeline on a 'Richard, of St. Victor, d. 1173.' record.

    Prints the formatted subfields, cleaned fields and parsed dates, then
    pretty-prints the ``look_for_match`` candidates and what
    ``pick_from_match`` keeps of them.  Asserts that the first parsed date
    item is not the empty ``('date', '')`` pair.
    """
    line = '00\x1faRichard,\x1fcof St. Victor,\x1fdd. 1173.\x1e'
    print(fmt_line(get_subfields(line, 'abcd')))
    fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd'))
    print(fields)
    found = name_lookup(fields)
    dates = pick_first_date(v for k, v in fields if k == 'd')
    # dict views cannot be indexed on Python 3, so materialise the items
    assert list(dates.items())[0] != ('date', '')
    print(dates)
    print()
    match = look_for_match(found, dates, False)
    pprint(match)
    print()
    match = pick_from_match(match)
    pprint(match)
示例#9
0
def parse_person(line):
    """Parse a MARC personal-name field into an author dict.

    Returns ``(marc_orig, author)``.  ``author`` is empty when the field
    has neither an 'a' nor a 'c' subfield.  Otherwise it starts from the
    ``pick_first_date`` result for the 'd' values (if any) and may gain
    'numeration' ('b'), 'title' ('c'), 'name', 'family_name' and
    'given_names' (from 'a'), and always 'sort' (the a/b/c values joined).
    A single 'q' subfield, cleaned with ``strip_q``, replaces
    'given_names'.

    Raises AssertionError when there is more than one 'q' subfield, or
    when 'q' agrees with the existing given names neither by initials nor
    by prefix.
    """
    contents = get_person_content(line)
    # NOTE(review): the trailing comma makes marc_orig a 1-tuple wrapping
    # the list; kept because it is part of the return value -- confirm
    # whether callers rely on that shape.
    marc_orig = list(get_all_subfields(line)),
    if not ('a' in contents or 'c' in contents):
        return marc_orig, {}

    if 'd' in contents:
        author = pick_first_date(contents['d'])
    else:
        author = {}
    for tag, f in [('b', 'numeration'), ('c', 'title')]:
        if tag in contents:
            author[f] = ' '.join(x.strip(' /,;:') for x in contents[tag])

    if 'a' in contents:
        name = ' '.join(x.strip(' /,;:') for x in contents['a'])
        name = remove_trailing_dot(name)
        m = re_marc_name.match(name)
        if m:
            # group 1 becomes the family name, group 2 the given names
            author['family_name'] = m.group(1)
            author['given_names'] = m.group(2)
            author['name'] = m.group(2) + ' ' + m.group(1)
        else:
            author['name'] = name
    name_subfields = get_subfield_values(line, ['a', 'b', 'c'])
    author['sort'] = ' '.join(v.strip(' /,;:') for v in name_subfields)

    if 'q' in contents:
        if len(contents['q']) != 1:
            # show the offending record before the assertion fires
            print(marc_orig)
        assert len(contents['q']) == 1
        q = strip_q(contents['q'][0])
        # was "authors", an undefined name that raised NameError here
        if 'given_names' in author:
            assert initials(q) == initials(author['given_names']) \
                    or q.startswith(author['given_names'])
        author['given_names'] = q
    return marc_orig, author
示例#10
0
def db_marc_lookup():
    """Match MARC author records against Wikipedia articles and log the hits.

    Reads ``marc_authors.bz2`` line by line (each line is passed to
    ``eval``), skips records identical to the previous one and records
    whose first parsed date item is ``('date', '')``, looks the remaining
    names up with ``name_lookup`` and keeps the candidates for which
    ``date_match`` succeeds.  When more than one candidate remains,
    ``pick_from_match`` is applied; a single survivor is written to
    ``matches3`` as ``(article, fields)``, anything else is handed to
    ``more_than_one_match``.  Progress is printed every 5000 records and
    the number of matched records at the end.
    """
    verbose = False
    c = get_cursor()
    count = 0
    count_with_date = 0
    t0 = time()
    match_count = 0
    total = 3596802  # only used for the time-left estimate
    prev_fields = None
    # context managers close both files even if a record raises
    with open('matches3', 'w') as fh, bz2.BZ2File('marc_authors.bz2') as marc_file:
        for line in marc_file:
            count += 1
            # SECURITY: eval() executes whatever the input file contains;
            # only run this against trusted dumps.
            line = eval(line)
            line = strip_brackets(line)
            if count % 5000 == 0:
                t1 = time() - t0
                rec_per_sec = count / t1
                time_left = (total - count) / rec_per_sec
                # no dated record may have been seen yet: avoid dividing by zero
                if count_with_date:
                    percent = match_count * 100.0 / count_with_date
                else:
                    percent = 0.0
                print(fmt_line(get_subfields(line, 'abcd')))
                print(count, count_with_date, match_count, "%.2f%% %.2f mins left" % (percent, time_left / 60))
            fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd'))
            if prev_fields == fields:
                continue
            prev_fields = fields
            dates = pick_first_date(v for k, v in fields if k == 'd')
            # dict views cannot be indexed on Python 3, so materialise the items
            if list(dates.items())[0] == ('date', ''):
                continue
            count_with_date += 1
            if verbose:
                print(fmt_line(get_subfields(line, 'abcd')))
                print(dates)
            found = name_lookup(c, fields)
            if not found:
                continue
            match = {}
            seen = set()
            for name, cats, match_name, pd in found:
                if name in seen:
                    continue
                seen.add(name)
                # SECURITY: cats comes from name_lookup as a string and is eval()'d
                cats = eval(cats)
                if not any(cat.endswith((' births', ' deaths')) for cat in cats):
                    continue
                dm = date_match(dates, cats)
                if dm:
                    match[name] = (cats, match_name)
                if not verbose:
                    continue
                print((name, match_name))
                print("cats =", cats)
                print(('match' if dm else 'no match'))
                for field in ['birth', 'death']:
                    print(field + 's:', [i[:-(len(field)+2)] for i in cats if i.endswith(' %ss' % field)], end=' ')
                print()
            if verbose:
                print('---')

            if not match:
                continue
            match_count += 1
            if len(match) != 1:
                match = pick_from_match(match)
            if len(match) != 1:
                print(count, match_count)
                print(fmt_line(get_subfields(line, 'abcd')))
                more_than_one_match(match)
            else:
                # the single remaining key is the matched article name
                print((next(iter(match)), fields), file=fh)
    print(match_count)
示例#11
0
def db_marc_lookup():
    """Match MARC author records against Wikipedia articles and report clashes.

    Reads ``marc_authors.bz2`` line by line (each line is passed to
    ``eval``), skips records whose first parsed date item is
    ``('date', '')``, looks the remaining names up with ``name_lookup``
    and keeps the candidates for which ``date_match`` succeeds.  Records
    with more than one surviving candidate are printed together with the
    Wikipedia URL of each candidate.  Progress is printed every 1000
    records and the number of matched records at the end.
    """
    c = get_cursor()
    count = 0
    t0 = time()
    match_count = 0
    total = 3596802  # only used for the time-left estimate
    # the context manager closes the archive even if a record raises
    with bz2.BZ2File('marc_authors.bz2') as marc_file:
        for line in marc_file:
            count += 1
            if count % 1000 == 0:
                t1 = time() - t0
                rec_per_sec = count / t1
                time_left = (total - count) / rec_per_sec
                # 100.0 keeps the percentage fractional under integer division
                print(
                    count, match_count, "%.2f%% %.2f mins left" %
                    (match_count * 100.0 / count, time_left / 60))
            # SECURITY: eval() executes whatever the input file contains;
            # only run this against trusted dumps.
            line = eval(line)
            line = strip_brackets(line)
            fields = [(k, v.strip(' /,;:'))
                      for k, v in get_subfields(line, 'abcd')]
            dates = pick_first_date(v for k, v in fields if k == 'd')
            # dict views cannot be indexed on Python 3, so materialise the items
            if list(dates.items())[0] == ('date', ''):
                continue
            found = name_lookup(c, fields)
            if not found:
                continue
            match = {}
            seen = set()
            for name, cats, match_name, pd in found:
                if name in seen:
                    continue
                seen.add(name)
                # SECURITY: cats comes from name_lookup as a string and is eval()'d
                cats = eval(cats)
                if not any(cat.endswith((' births', ' deaths')) for cat in cats):
                    continue
                if date_match(dates, cats):
                    match[name] = (cats, match_name)

            if not match:
                continue
            match_count += 1
            if len(match) != 1:
                print(count, match_count)
                print(fmt_line(get_subfields(line, 'abcd')))
                for name, (cats, match_name) in match.items():
                    print(name, cats, match_name)
                    print("http://en.wikipedia.org/wiki/" + name.replace(' ', '_'))
                print()
    print(match_count)
示例#12
0
def db_marc_lookup():
    """Match MARC author records against Wikipedia articles and report clashes.

    Reads ``marc_authors.bz2`` line by line (each line is passed to
    ``eval``), skips records whose first parsed date item is
    ``('date', '')``, looks the remaining names up with ``name_lookup``
    and keeps the candidates for which ``date_match`` succeeds.  Records
    with more than one surviving candidate are printed together with the
    Wikipedia URL of each candidate.  Progress is printed every 1000
    records and the number of matched records at the end.
    """
    c = get_cursor()
    count = 0
    t0 = time()
    match_count = 0
    total = 3596802  # only used for the time-left estimate
    # the context manager closes the archive even if a record raises
    with bz2.BZ2File('marc_authors.bz2') as marc_file:
        for line in marc_file:
            count += 1
            if count % 1000 == 0:
                t1 = time() - t0
                rec_per_sec = count / t1
                time_left = (total - count) / rec_per_sec
                # 100.0 keeps the percentage fractional under integer division
                print(count, match_count, "%.2f%% %.2f mins left" % (match_count * 100.0 / count, time_left / 60))
            # SECURITY: eval() executes whatever the input file contains;
            # only run this against trusted dumps.
            line = eval(line)
            line = strip_brackets(line)
            fields = [(k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd')]
            dates = pick_first_date(v for k, v in fields if k == 'd')
            # dict views cannot be indexed on Python 3, so materialise the items
            if list(dates.items())[0] == ('date', ''):
                continue
            found = name_lookup(c, fields)
            if not found:
                continue
            match = {}
            seen = set()
            for name, cats, match_name, pd in found:
                if name in seen:
                    continue
                seen.add(name)
                # SECURITY: cats comes from name_lookup as a string and is eval()'d
                cats = eval(cats)
                if not any(cat.endswith((' births', ' deaths')) for cat in cats):
                    continue
                if date_match(dates, cats):
                    match[name] = (cats, match_name)

            if not match:
                continue
            match_count += 1
            if len(match) != 1:
                print(count, match_count)
                print(fmt_line(get_subfields(line, 'abcd')))
                for name, (cats, match_name) in match.items():
                    print(name, cats, match_name)
                    print("http://en.wikipedia.org/wiki/" + name.replace(' ', '_'))
                print()
    print(match_count)
示例#13
0
def db_marc_lookup():
    """Match MARC author records against Wikipedia articles and log the hits.

    Reads ``/1/edward/wikipedia/marc_authors2`` line by line; each line is
    passed to ``eval`` and unpacked as ``(fields, author_count)``.
    Records identical to the previous one, and records whose first parsed
    date item is ``('date', '')``, are skipped.  The rest are looked up
    with ``name_lookup`` and filtered by ``look_for_match``; when more
    than one candidate remains, ``pick_from_match`` is applied.  A single
    survivor is written to ``matches`` as a tuple of (article, fields,
    author count, date categories, exact-dates flag, living-person flag);
    anything else is written to ``more_than_one_match``.  Progress is
    printed every 5000 records and the number of matched records at the
    end.
    """
    verbose = False
    count = 0
    count_with_date = 0
    t0 = time()
    match_count = 0
    total = 3596802  # only used for the time-left estimate
    prev_fields = None
    # context managers close all three files even if a record raises
    with open('matches', 'w') as fh, \
            codecs.open('more_than_one_match', 'w', 'utf8') as bad, \
            open('/1/edward/wikipedia/marc_authors2') as marc_file:
        for line in marc_file:
            count += 1
            # SECURITY: eval() executes whatever the input file contains;
            # only run this against trusted dumps.
            (line, author_count) = eval(line)
            if count % 5000 == 0:
                t1 = time() - t0
                rec_per_sec = count / t1
                time_left = (total - count) / rec_per_sec
                # no dated record may have been seen yet: avoid dividing by zero
                if count_with_date:
                    percent = match_count * 100.0 / count_with_date
                else:
                    percent = 0.0
                print(line)
                print(count, count_with_date, match_count, "%.2f%% %.2f mins left" % (
                    percent, time_left / 60))
            fields = tuple((k, v.strip(' /,;:')) for k, v in line)
            if prev_fields == fields:
                continue
            prev_fields = fields
            dates = pick_first_date(v for k, v in fields if k == 'd')
            # dict views cannot be indexed on Python 3, so materialise the items
            if list(dates.items())[0] == ('date', ''):
                continue
            count_with_date += 1
            if verbose:
                print(line)
                print(dates)
            found = name_lookup(fields)
            if not found:
                continue
            match = look_for_match(found, dates, verbose)

            if not match:
                continue
            match_count += 1
            if len(match) != 1:
                match = pick_from_match(match)
            if len(match) != 1:
                print("\n" + fmt_line(line), file=bad)
                for i in more_than_one_match(match):
                    print(i, file=bad)
            else:
                # exactly one entry is left: unpack its key and value together
                article, info = next(iter(match.items()))
                cats = info['cats']
                exact = info['exact_dates']
                dc = [i for i in cats if any(i.endswith(j) for j in date_cats)]
                print((article, fields, author_count, dc, exact,
                       'Living people' in cats), file=fh)
    print(match_count)
示例#14
0
def db_marc_lookup():
    """Match MARC author records against Wikipedia articles and log the hits.

    Reads ``/1/edward/wikipedia/marc_authors2`` line by line; each line is
    passed to ``eval`` and unpacked as ``(fields, author_count)``.
    Records identical to the previous one, and records whose first parsed
    date item is ``('date', '')``, are skipped.  The rest are looked up
    with ``name_lookup`` and filtered by ``look_for_match``; when more
    than one candidate remains, ``pick_from_match`` is applied.  A single
    survivor is written to ``matches`` as a tuple of (article, fields,
    author count, date categories, exact-dates flag, living-person flag);
    anything else is written to ``more_than_one_match``.  Progress is
    printed every 5000 records and the number of matched records at the
    end.
    """
    verbose = False
    count = 0
    count_with_date = 0
    t0 = time()
    match_count = 0
    total = 3596802  # only used for the time-left estimate
    prev_fields = None
    # context managers close all three files even if a record raises
    with open('matches', 'w') as fh, \
            codecs.open('more_than_one_match', 'w', 'utf8') as bad, \
            open('/1/edward/wikipedia/marc_authors2') as marc_file:
        for line in marc_file:
            count += 1
            # SECURITY: eval() executes whatever the input file contains;
            # only run this against trusted dumps.
            (line, author_count) = eval(line)
            if count % 5000 == 0:
                t1 = time() - t0
                rec_per_sec = count / t1
                time_left = (total - count) / rec_per_sec
                # no dated record may have been seen yet: avoid dividing by zero
                if count_with_date:
                    percent = match_count * 100.0 / count_with_date
                else:
                    percent = 0.0
                print(line)
                print(count, count_with_date, match_count, "%.2f%% %.2f mins left" % (percent, time_left / 60))
            fields = tuple((k, v.strip(' /,;:')) for k, v in line)
            if prev_fields == fields:
                continue
            prev_fields = fields
            dates = pick_first_date(v for k, v in fields if k == 'd')
            # dict views cannot be indexed on Python 3, so materialise the items
            if list(dates.items())[0] == ('date', ''):
                continue
            count_with_date += 1
            if verbose:
                print(line)
                print(dates)
            found = name_lookup(fields)
            if not found:
                continue
            match = look_for_match(found, dates, verbose)

            if not match:
                continue
            match_count += 1
            if len(match) != 1:
                match = pick_from_match(match)
            if len(match) != 1:
                print("\n" + fmt_line(line), file=bad)
                for i in more_than_one_match(match):
                    print(i, file=bad)
            else:
                # exactly one entry is left: unpack its key and value together
                article, info = next(iter(match.items()))
                cats = info['cats']
                exact = info['exact_dates']
                dc = [i for i in cats if any(i.endswith(j) for j in date_cats)]
                print((article, fields, author_count, dc, exact, 'Living people' in cats), file=fh)
    print(match_count)
from __future__ import print_function
from catalog.utils import pick_first_date
import web
import re
import sys
import codecs
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

# Raw strings so that the backslash in "\." reaches the regex engine
# without relying on Python passing an unknown string escape through.
re_marc_name = re.compile(r'^(.*), (.*)$')
re_end_dot = re.compile(r'[^ ][^ ]\.$', re.UNICODE)


def flip_name(name):
    """Turn a "Surname, Forename" name into "Forename Surname".

    A final dot is removed first when it follows two non-space
    characters.  The name is split at the last ", " and the two halves
    are swapped.  A name that does not match the "X, Y" pattern is
    returned as it stands after the dot removal, instead of raising
    AttributeError.
    """
    # strip end dots like this: "Smith, John." but not like this: "Smith, J."
    if re_end_dot.search(name):
        name = name[:-1]

    m = re_marc_name.match(name)
    if m is None:
        # nothing to flip
        return name
    return m.group(2) + ' ' + m.group(1)

# Print, for every (wikipedia, marc) pair stored in "matches4", the MARC
# name, its flipped form when it contains ", ", and the parsed dates.
# SECURITY: eval() runs each line of the file as Python code; only use
# this on files you produced yourself.
with open("matches4") as matches:  # closed automatically, even on error
    for wikipedia, marc in (eval(i) for i in matches):
        dates = pick_first_date(v for k, v in marc if k == 'd')
        name = ' '.join(v for k, v in marc if k in 'abc')
        print(name)
        if ', ' in name:
            print(flip_name(name))
        print(dates)

示例#16
0
from catalog.utils import pick_first_date
import web, re, sys, codecs
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

# Raw strings so that the backslash in "\." reaches the regex engine
# without relying on Python passing an unknown string escape through.
re_marc_name = re.compile(r'^(.*), (.*)$')
re_end_dot = re.compile(r'[^ ][^ ]\.$', re.UNICODE)


def flip_name(name):
    """Turn a "Surname, Forename" name into "Forename Surname".

    A final dot is removed first when it follows two non-space
    characters.  The name is split at the last ", " and the two halves
    are swapped.  A name that does not match the "X, Y" pattern is
    returned as it stands after the dot removal, instead of raising
    AttributeError.
    """
    # strip end dots like this: "Smith, John." but not like this: "Smith, J."
    if re_end_dot.search(name):
        name = name[:-1]

    m = re_marc_name.match(name)
    if m is None:
        # nothing to flip
        return name
    return m.group(2) + ' ' + m.group(1)


# Print, for every (wikipedia, marc) pair stored in "matches4", the MARC
# name, its flipped form when it contains ", ", and the parsed dates.
# SECURITY: eval() runs each line of the file as Python code; only use
# this on files you produced yourself.
with open("matches4") as matches:  # closed automatically, even on error
    for wikipedia, marc in (eval(i) for i in matches):
        dates = pick_first_date(v for k, v in marc if k == 'd')
        name = ' '.join(v for k, v in marc if k in 'abc')
        print(name)
        if ', ' in name:
            print(flip_name(name))
        print(dates)