Python get_subfields示例，openlibrary.catalog.marc.fast_parse.get_subfields Python示例

示例#1

0

显示文件

文件： new_parser.py 项目： sribanta/openlibrary

def read_isbn(fields):
    """Collect ISBN values from the '020' entries of ``fields``.

    ``fields`` is indexed by tag; ``fields['020']`` is iterated as a
    sequence of raw lines.  Lines containing ``'\\x1f'`` (presumably the
    subfield delimiter -- confirm against get_subfields) have their 'a'
    and 'z' subfields matched against ``re_isbn_and_price`` and then
    ``re_isbn``; any other line has ``line[3:-1]`` matched against
    ``re_isbn`` directly.

    Returns a dict with optional 'isbn_13' and 'isbn_10' lists.  Values
    come from ``tidy_isbn`` with duplicates dropped: those of length 13 go
    to 'isbn_13', other values of length <= 16 go to 'isbn_10', and longer
    ones are discarded.  Returns ``{}`` when there is no '020' tag.
    """
    if '020' not in fields:
        return {}

    candidates = []
    for raw in fields['020']:
        if '\x1f' not in raw:
            # No subfield marker: match the slice of the line directly.
            plain = re_isbn.match(raw[3:-1])
            if plain:
                candidates.append(plain.group(1))
            continue
        for _code, value in get_subfields(raw, ['a', 'z']):
            # Try the "ISBN plus price" pattern first, then the bare one.
            hit = re_isbn_and_price.match(value) or re_isbn.match(value)
            if hit:
                candidates.append(hit.group(1))

    result = {}
    already = set()
    for isbn in tidy_isbn(candidates):
        if isbn in already:  # avoid dups
            continue
        already.add(isbn)
        if len(isbn) == 13:
            key = 'isbn_13'
        elif len(isbn) <= 16:
            key = 'isbn_10'
        else:
            continue
        result.setdefault(key, []).append(isbn)
    return result

示例#2

0

显示文件

def find_aspects(line):
    """Build an "<x> of <a>" phrase from the 'a' and 'x' subfields of ``line``.

    Only the first two subfields returned by ``get_subfields(line, 'ax')``
    are considered; they must be an 'a' followed by an 'x'.  Both values
    have surrounding dots and spaces stripped.  Returns ``None`` when the
    subfields do not have that shape or when ``re_aspects`` does not match
    the 'x' value.
    """
    pairs = [(code, value) for code, value in get_subfields(line, 'ax')]
    if len(pairs) < 2:
        return
    (first_code, heading), (second_code, aspect) = pairs[0], pairs[1]
    if first_code != 'a' or second_code != 'x':
        return

    aspect = aspect.strip('. ')
    heading = heading.strip('. ')
    if not re_aspects.search(aspect):
        return

    # Special-cased heading that gets a fixed replacement before flipping.
    if heading == 'Body, Human':
        heading = 'the Human body'
    return ' of '.join((aspect, flip_subject(heading)))

示例#3

0

显示文件

文件： work_subject.py 项目： lukasklein/openlibrary

def find_aspects(line):
    """Return "<x> of <a>" for a line whose first subfields are 'a' then 'x'.

    The 'a' and 'x' subfields are read with ``get_subfields(line, 'ax')``.
    The result is ``None`` unless the first two of them are an 'a'
    followed by an 'x' and the stripped 'x' value is matched by
    ``re_aspects``.
    """
    subfields = [(code, text) for code, text in get_subfields(line, 'ax')]
    shape_ok = (
        len(subfields) >= 2
        and subfields[0][0] == 'a'
        and subfields[1][0] == 'x'
    )
    if not shape_ok:
        return

    topic = subfields[0][1]
    facet = subfields[1][1]
    facet = facet.strip('. ')
    topic = topic.strip('. ')
    if not re_aspects.search(facet):
        return

    # One heading is rewritten by hand before being passed to flip_subject.
    if topic == 'Body, Human':
        topic = 'the Human body'
    flipped = flip_subject(topic)
    return facet + ' of ' + flipped

示例#4

0

显示文件

文件： new_parser.py 项目： sribanta/openlibrary

def read_oclc(fields):
    """Collect OCLC numbers from the '001'/'003' and '035' entries of ``fields``.

    When both '003' and '001' are present and the first '003' value is
    ``'OCoLC'``, the first '001' value is taken (it is asserted to be all
    digits).  Every 'a' subfield of each '035' line that ``re_oclc``
    matches contributes its first group, skipping values already seen.

    Returns ``{'oclc_number': [...]}`` (passed through
    ``remove_duplicates``), or ``{}`` when nothing was found.
    """
    numbers = []

    has_control_fields = '003' in fields and '001' in fields
    if has_control_fields and fields['003'][0] == 'OCoLC':
        control_number = fields['001'][0]
        assert control_number.isdigit()
        numbers.append(control_number)

    for raw in fields.get('035', []):
        for _code, value in get_subfields(raw, ['a']):
            hit = re_oclc.match(value)
            if not hit:
                continue
            number = hit.group(1)
            if number not in numbers:
                numbers.append(number)

    if not numbers:
        return {}
    return {'oclc_number': remove_duplicates(numbers)}

示例#5

0

显示文件

文件： new_parser.py 项目： sribanta/openlibrary

def read_series(fields):
    """Build series strings from the '440', '490' and '830' entries of ``fields``.

    For each line under those tags, the 'a' and 'v' subfields are read in
    order.  A non-empty 'v' value is kept verbatim; every other value has
    trailing ``'.,; '`` characters removed and is kept only if something
    remains.  The kept parts of one line are joined with ``' -- '``.

    Returns ``{'series': [...]}``, or ``{}`` when no line produced a value.
    """
    series = []
    present_tags = [tag for tag in ('440', '490', '830') if tag in fields]
    for tag in present_tags:
        for raw in fields[tag]:
            parts = []
            for code, value in get_subfields(raw, ['a', 'v']):
                keep_verbatim = code == 'v' and value
                if not keep_verbatim:
                    value = value.rstrip('.,; ')
                if value:
                    parts.append(value)
            if parts:
                series.append(' -- '.join(parts))

    if not series:
        return {}
    return {'series': series}

示例#6

0

显示文件

文件： new_parser.py 项目： sribanta/openlibrary

def read_lccn(fields):
    """Collect LCCN values from the 'a' subfields of the '010' entries.

    Each stripped 'a' value is skipped if ``re_question`` matches it or if
    ``re_lccn`` finds nothing in it.  Otherwise the first group of the
    ``re_lccn`` match has ``re_letters`` matches removed and is stripped
    again; non-empty results are kept.

    Returns ``{}`` when there is no '010' tag.  Otherwise returns
    ``{'lccn': [...]}``, and the list may be empty.
    """
    if '010' not in fields:
        return {}

    numbers = []
    for raw in fields['010']:
        for _code, value in get_subfields(raw, ['a']):
            candidate = value.strip()
            if re_question.match(candidate):
                continue
            hit = re_lccn.search(candidate)
            if hit:
                cleaned = re_letters.sub('', hit.group(1)).strip()
                if cleaned:
                    numbers.append(cleaned)

    return {'lccn': numbers}

示例#7

0

显示文件

文件： Abraham_ibn_Daud.py 项目： RaceList/openlibrary

def test_lookup():
    """Run the name-lookup pipeline over every line in ``marc`` and print each stage.

    For each line this prints, in order: every candidate returned by
    ``name_lookup``, the value returned by ``pick_first_date`` for the 'd'
    subfields, the number of matches from ``look_for_match``, each match,
    and (when narrowing with ``pick_from_match`` still leaves something
    other than exactly one match) every item from ``more_than_one_match``.
    A blank line separates the output for consecutive input lines.

    The print calls use the single-argument parenthesised form so the
    function parses on Python 3 while printing the same text on Python 2.
    """
    for line in marc:
        # Subfields a-d with surrounding punctuation removed from each value.
        fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd'))
        found = name_lookup(fields)
        for i in found:
            print(i)
        dates = pick_first_date(v for k, v in fields if k == 'd')
        print(dates)
        match = look_for_match(found, dates, False)
        print(len(match))
        for i in match:
            print(i)
        if len(match) != 1:
            match = pick_from_match(match)
        # Re-check: pick_from_match may or may not have narrowed to one.
        if len(match) != 1:
            for i in more_than_one_match(match):
                print(i)
        print('')

示例#8

0

显示文件

文件： Abraham_ibn_Daud.py 项目： Arpanray01/Open-Library

def test_lookup():
    """Print the intermediate results of name matching for each line in ``marc``.

    Per input line the output is: the candidates from ``name_lookup``, the
    result of ``pick_first_date`` over the 'd' subfield values, the match
    count from ``look_for_match``, the matches themselves, and -- if
    ``pick_from_match`` does not leave exactly one match -- the items
    produced by ``more_than_one_match``.  A blank line ends each group.

    Print statements were replaced by single-argument ``print(...)`` calls,
    which behave the same on Python 2 and are valid syntax on Python 3.
    """
    for line in marc:
        # Subfields a-d, each value trimmed of surrounding punctuation.
        fields = tuple(
            (k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd'))
        found = name_lookup(fields)
        for i in found:
            print(i)
        dates = pick_first_date(v for k, v in fields if k == 'd')
        print(dates)
        match = look_for_match(found, dates, False)
        print(len(match))
        for i in match:
            print(i)
        if len(match) != 1:
            match = pick_from_match(match)
        # Checked again because match may have been replaced just above.
        if len(match) != 1:
            for i in more_than_one_match(match):
                print(i)
        print('')

示例#9

0

显示文件

def find_subjects(marc_subjects):
    """Tally the headings found in a collection of subject field lines.

    ``marc_subjects`` is an iterable of records; each record is an
    iterable of ``(tag, line)`` pairs.  Tags '600', '610', '611', '630',
    '650' and '651' are handled specifically; any other tag is reported on
    stdout.  Regardless of tag, the 'y', 'v', 'z' and 'x' subfields of
    every line are tallied as times, subjects, places and subjects
    respectively.  Lines matched by ``re_large_book`` are skipped after
    their aspect (see ``find_aspects``) has been counted.

    Returns a dict with any of the keys 'person', 'time', 'place',
    'subject', 'event', 'org' and 'work'.  Each maps heading -> number of
    occurrences, and categories with no headings are omitted.
    """
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    times = defaultdict(int)  # local name avoids shadowing the time module
    place = defaultdict(int)
    subject = defaultdict(int)

    def _clean(value):
        """Strip, apply remove_trailing_dot, then tidy_subject.

        Each later step is skipped once the value has become empty.
        """
        value = value.strip()
        if value:
            value = remove_trailing_dot(value).strip()
        if value:
            value = tidy_subject(value)
        return value

    for lines in marc_subjects:
        for tag, line in lines:
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1
            if re_large_book.search(line):
                continue

            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    if k == 'd':
                        # The 'd' value is re-wrapped in one pair of parens.
                        v = '(' + v.strip('.() ') + ')'
                    else:
                        v = v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        elif re_flip_name.match(v):
                            v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name:
                    person[name] += 1
            elif tag == '610':  # org
                # NOTE(review): the joined a-d form and each 'a' value are
                # both counted, so a line with only an 'a' subfield appears
                # to increment the same key twice -- confirm this is intended.
                v = _clean(' '.join(get_subfield_values(line, 'abcd')))
                if v:
                    org[v] += 1
                for v in get_subfield_values(line, 'a'):
                    v = _clean(v)
                    if v:
                        org[v] += 1
            elif tag == '611':  # event
                v = ' '.join(j.strip() for i, j in get_all_subfields(line)
                             if i not in 'vxyz')
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    event[v] += 1
            elif tag == '630':  # work
                for v in get_subfield_values(line, ['a']):
                    v = _clean(v)
                    if v:
                        work[v] += 1
            elif tag == '650':  # topical
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651':  # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1
            else:
                # Single formatted argument: prints the same text as the old
                # Python 2 statement and is valid Python 3 syntax.
                print('other %s %s' % (tag, list(get_all_subfields(line))))

            cur = [
                v for k, v in get_all_subfields(line)
                if k == 'a' or v.strip('. ').lower() == 'fiction'
            ]

            # skip: 'Good, Sally (Fictitious character) in fiction'
            if (len(cur) > 1 and cur[-1].strip('. ').lower() == 'fiction'
                    and ')' not in cur[-2]):
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    times[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                # tidy_subject is applied even when v is already empty.
                v = tidy_subject(v)
                if v:
                    subject[v] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if not v:
                    continue
                # Already counted through the aspect phrase above.
                if aspects and re_aspects.search(v):
                    continue
                v = tidy_subject(v)
                if v:
                    subject[v] += 1

    categories = (
        ('person', person),
        ('time', times),
        ('place', place),
        ('subject', subject),
        ('event', event),
        ('org', org),
        ('work', work),
    )
    return dict((key, dict(counts)) for key, counts in categories if counts)

示例#10

0

显示文件

文件： work_subject.py 项目： lukasklein/openlibrary

def find_subjects(marc_subjects):
    """Count subject headings per category across subject field lines.

    ``marc_subjects`` is an iterable whose items are iterables of
    ``(tag, line)`` pairs.  Tag-specific handling exists for '600'
    (people), '610' (org), '611' (event), '630' (work), '650' (topical)
    and '651' (geo); other tags are printed to stdout.  For every line
    not matched by ``re_large_book``, the 'y', 'v', 'z' and 'x' subfields
    are also counted (time, subject, place, subject).

    Returns a dict holding only the non-empty categories among 'person',
    'time', 'place', 'subject', 'event', 'org' and 'work'; each value is a
    plain dict of heading -> count.
    """
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    times = defaultdict(int)  # local name avoids shadowing the time module
    place = defaultdict(int)
    subject = defaultdict(int)

    def _clean(value):
        """Strip, apply remove_trailing_dot, then tidy_subject.

        Processing stops as soon as the value becomes empty.
        """
        value = value.strip()
        if value:
            value = remove_trailing_dot(value).strip()
        if value:
            value = tidy_subject(value)
        return value

    for lines in marc_subjects:
        for tag, line in lines:
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1
            if re_large_book.search(line):
                continue

            if tag == '600': # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    if k == 'd':
                        # The 'd' value is re-wrapped in one pair of parens.
                        v = '(' + v.strip('.() ') + ')'
                    else:
                        v = v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        elif re_flip_name.match(v):
                            v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name:
                    person[name] += 1
            elif tag == '610': # org
                # NOTE(review): both the joined a-d form and each 'a' value
                # are counted; with only an 'a' subfield present this looks
                # like a double count -- confirm this is intended.
                v = _clean(' '.join(get_subfield_values(line, 'abcd')))
                if v:
                    org[v] += 1
                for v in get_subfield_values(line, 'a'):
                    v = _clean(v)
                    if v:
                        org[v] += 1
            elif tag == '611': # event
                v = ' '.join(j.strip() for i, j in get_all_subfields(line) if i not in 'vxyz')
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    event[v] += 1
            elif tag == '630': # work
                for v in get_subfield_values(line, ['a']):
                    v = _clean(v)
                    if v:
                        work[v] += 1
            elif tag == '650': # topical
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651': # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1
            else:
                # One pre-formatted argument keeps the Python 2 output text
                # and parses on Python 3.
                print('other %s %s' % (tag, list(get_all_subfields(line))))

            cur = [v for k, v in get_all_subfields(line) if k == 'a' or v.strip('. ').lower() == 'fiction']

            # skip: 'Good, Sally (Fictitious character) in fiction'
            if len(cur) > 1 and cur[-1].strip('. ').lower() == 'fiction' and ')' not in cur[-2]:
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    times[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                # tidy_subject is applied even when v is already empty.
                v = tidy_subject(v)
                if v:
                    subject[v] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if not v:
                    continue
                # Already counted through the aspect phrase above.
                if aspects and re_aspects.search(v):
                    continue
                v = tidy_subject(v)
                if v:
                    subject[v] += 1

    ret = {}
    for key, counts in (('person', person), ('time', times),
                        ('place', place), ('subject', subject),
                        ('event', event), ('org', org), ('work', work)):
        if counts:
            ret[key] = dict(counts)
    return ret

示例#11

0

显示文件

文件： new_parser.py 项目： sribanta/openlibrary

def get_subfield_values(line, want):
    """Return the values (without their codes) that ``get_subfields`` yields.

    ``line`` and ``want`` are passed to ``get_subfields`` unchanged; the
    result is a list in the order the pairs were produced.
    """
    values = []
    for _code, value in get_subfields(line, want):
        values.append(value)
    return values

示例#12

0

显示文件

文件： new_parser.py 项目： sribanta/openlibrary

def get_contents(line, want):
    """Group the values from ``get_subfields(line, want)`` by subfield code.

    Returns a dict mapping each code to the list of its values, in the
    order they were produced.
    """
    grouped = {}
    for code, value in get_subfields(line, want):
        if code not in grouped:
            grouped[code] = []
        grouped[code].append(value)
    return grouped

示例#13

0

显示文件

文件： work_subject.py 项目： sribanta/openlibrary

def find_subjects(w, marc_subjects=None):
    """Count people, times, places and subjects in subject field lines.

    ``marc_subjects`` is an iterable of records, each an iterable of
    ``(tag, line)`` pairs.  When it is falsy (``None`` or empty) the
    records come from ``get_marc_subjects(w)`` instead.  Lines matched at
    their start by ``re_large_book`` are skipped.  Tag '600' feeds the
    people tally and the 'a' subfields of '650' and '651' feed subjects
    and places.  For every line, 'y' subfields are counted as times, 'v'
    and 'x' as subjects, and 'z' as places.

    Returns a dict holding only the non-empty categories among 'people',
    'times', 'places' and 'subjects'; each value is a plain dict of
    heading -> count.
    """
    people = defaultdict(int)
    when = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)

    for lines in marc_subjects or get_marc_subjects(w):
        for tag, line in lines:
            if re_large_book.match(line):
                continue

            if tag == '600': # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    if k == 'd':
                        # The 'd' value is re-wrapped in one pair of parens.
                        v = '(' + v.strip('.() ') + ')'
                    else:
                        v = v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        elif re_flip_name.match(v):
                            v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name:
                    people[name] += 1
            elif tag == '650':
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    # tidy_subject is applied even when v is empty.
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651':
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1

            # The remaining subfields are tallied whatever the tag is.
            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    when[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    subject[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    subject[v] += 1

    ret = {}
    if people:
        ret['people'] = dict(people)
    if when:
        ret['times'] = dict(when)
    if place:
        ret['places'] = dict(place)
    if subject:
        ret['subjects'] = dict(subject)
    return ret