def missing_subtag(found, marc_alt):
    """Fold together ``found`` keys that carry the same name with different subtags.

    Each key of ``found`` is iterated as (subtag code, value) pairs. Two keys
    are compared only when their lists of 'abcdq' subtag codes differ. Their
    stripped values are joined with spaces and the pair is treated as the
    same name when any of these holds:

    * ``match_with_bad_chars`` accepts the two names;
    * the names are equal after ``normalize``;
    * they are equal after ``remove_bad_marc_subtag`` and ``normalize``;
    * they are equal after lowercasing, dropping ' the' and ``normalize``.

    For a matching pair, the key with fewer (or equally many) 'abcdq'
    subtags is scheduled for replacement by ``just_abcdq`` of the other key.
    Each scheduled key is then replaced by its candidate with the longest
    space-joined text: its ``found`` value is added to the replacement's
    value and ``marc_alt`` maps the old key to the replacement.

    :param found: mapping keyed by sequences of (subtag, value) pairs;
        modified in place.
    :param marc_alt: mapping that receives ``old key -> replacement key``.
    :raises AssertionError: if the two longest candidates for a key have the
        same joined length.
    """
    wanted = 'abcdq'
    replacements = defaultdict(set)

    for left, right in combinations(found, 2):
        left_tags = [code for code, _ in left if code in wanted]
        right_tags = [code for code, _ in right if code in wanted]
        if left_tags == right_tags:
            continue

        left_name = ' '.join(value.strip() for _, value in left)
        right_name = ' '.join(value.strip() for _, value in right)

        # Cheapest-first chain; evaluation stops at the first comparison that matches.
        same_name = (
            match_with_bad_chars(left_name, right_name)
            or normalize(left_name) == normalize(right_name)
            or normalize(remove_bad_marc_subtag(left_name))
            == normalize(remove_bad_marc_subtag(right_name))
            or normalize(left_name.lower().replace(' the', ''))
            == normalize(right_name.lower().replace(' the', ''))
        )
        if not same_name:
            continue

        # The key with fewer subtags is the one that gets replaced.
        if len(left_tags) > len(right_tags):
            replacements[right].add(just_abcdq(left))
        else:
            replacements[left].add(just_abcdq(right))

    for old, candidates in replacements.items():
        ranked = sorted(
            (len(' '.join(value for _, value in cand)), cand)
            for cand in candidates
        )
        if len(ranked) > 1:
            # The longest candidate must be strictly longer than the runner-up.
            assert ranked[-1][0] > ranked[-2][0]
        new_marc = ranked[-1][1]

        found[new_marc] += found.pop(old)
        marc_alt[old] = new_marc
def test_match_with_bad_chars():
    """Every pair of spellings inside a group must be accepted by match_with_bad_chars.

    Each group lists variants of one text: plain ASCII, '?' standing where a
    diacritic would be, combining marks, or precomposed characters.
    """
    spelling_groups = (
        (
            'Machiavelli, Niccolo, 1469-1527',
            'Machiavelli, Niccol\xf2 1469-1527',
        ),
        (
            'Humanitas Publica\xe7\xf5es',
            'Humanitas Publicac?o?es',
        ),
        (
            'A pesquisa ling\xfc\xedstica no Brasil',
            'A pesquisa lingu?i?stica no Brasil',
        ),
        (
            'S\xe3o Paulo',
            'Sa?o Paulo',
        ),
        (
            'Diccionario espa\xf1ol-ingl\xe9s de bienes ra\xedces',
            'Diccionario Espan\u0303ol-Ingle\u0301s de bienes rai\u0301ces',
        ),
        (
            'Konfliktunterdru?ckung in O?sterreich seit 1918',
            'Konfliktunterdru\u0308ckung in O\u0308sterreich seit 1918',
            'Konfliktunterdr\xfcckung in \xd6sterreich seit 1918',
        ),
        (
            'Soi\ufe20u\ufe21z khudozhnikov SSSR.',
            'Soi?u?z khudozhnikov SSSR.',
            'Soi\u0361uz khudozhnikov SSSR.',
        ),
        (
            'Andrzej Weronski',
            'Andrzej Wero\u0144ski',
            'Andrzej Weron\u0301ski',
        ),
    )
    for group in spelling_groups:
        for first, second in combinations(group, 2):
            assert match_with_bad_chars(first, second)
def merge_authors(ol, keys, debug=False):
    """Merge the author records named by ``keys`` into one surviving author.

    Records are loaded with ``withKey``; those of type '/type/redirect' are
    left out of the checks. The surviving key is chosen as follows:

    * exactly one author has an image (per ``has_image``, never counting
      '/a/OL2688880A'): that author's key;
    * no author has an image: "/a/OL%dA" built from the smallest
      ``key_int`` value among the authors;
    * several authors have images: the merge is skipped, nothing is changed.

    Every other key in ``keys`` is switched to the surviving key with
    ``switch_author`` and, when it was loaded as a non-redirect record,
    turned into a redirect with ``make_redirect``.

    :param ol: passed through to ``switch_author`` and ``make_redirect``.
    :param keys: author keys to merge.
    :param debug: when true, print each loaded author record first.
    :raises IndexError: if no non-redirect author is left after loading.
    :raises AssertionError: if a loaded record is not of type '/type/author'
        or its name does not match the first author's name according to
        ``match_with_bad_chars``.
    """
    authors = [
        a for a in (withKey(k) for k in keys)
        if a['type']['key'] != '/type/redirect'
    ]
    not_redirect = {a['key'] for a in authors}
    if debug:
        for a in authors:
            print(a)

    assert all(a['type']['key'] == '/type/author' for a in authors)
    name1 = authors[0]['name']
    for a in authors:
        # repr() takes exactly one argument; passing key and name together
        # raised TypeError, so each value is formatted on its own.
        print(repr(a['key']), repr(a['name']))
    assert all(match_with_bad_chars(a['name'], name1) for a in authors[1:])

    best_key = pick_best_author(authors)['key']

    imgs = [
        a['key'] for a in authors
        if a['key'] != '/a/OL2688880A' and has_image(a['key'])
    ]
    if len(imgs) == 1:
        new_key = imgs[0]
    else:
        new_key = "/a/OL%dA" % min(key_int(a) for a in authors)
        # Molière and O. J. O. Ferreira
        if len(imgs) != 0:
            print('imgs:', imgs)
            return  # skip
        # NOTE(review): because of the skip above, imgs is always empty here,
        # so the whitelist never matches and this always prints [] and passes.
        if not (imgs == [u'/a/OL21848A', u'/a/OL4280680A']
                or imgs == [u'/a/OL325189A', u'/a/OL266422A']
                or imgs == [u'/a/OL5160945A', u'/a/OL5776228A']):
            print(imgs)
            assert len(imgs) == 0

    print(new_key)
    print(best_key)

    do_normalize(new_key, best_key, authors)
    old_keys = {k for k in keys if k != new_key}
    print('old keys:', old_keys)
    for old in old_keys:
        # /b/OL21291659M
        switch_author(ol, old, new_key, old_keys, debug=True)
        if old in not_redirect:
            make_redirect(ol, old, new_key)
        q = {
            'authors': old,
            'type': '/type/edition',
        }
        # Run the switch a second time if editions still reference the old key.
        if list(get_things(q)) != []:
            switch_author(ol, old, new_key, old_keys, debug=True)
def merge_authors(ol, keys, debug=False):
    """Merge the author records named by ``keys`` into one surviving author.

    Records are loaded with ``withKey``; those of type '/type/redirect' are
    left out of the checks. The surviving key is chosen as follows:

    * exactly one author has an image (per ``has_image``, never counting
      '/a/OL2688880A'): that author's key;
    * no author has an image: "/a/OL%dA" built from the smallest
      ``key_int`` value among the authors;
    * several authors have images: the merge is skipped, nothing is changed.

    Every other key in ``keys`` is switched to the surviving key with
    ``switch_author`` and, when it was loaded as a non-redirect record,
    turned into a redirect with ``make_redirect``.

    :param ol: passed through to ``switch_author`` and ``make_redirect``.
    :param keys: author keys to merge.
    :param debug: when true, print each loaded author record first.
    :raises IndexError: if no non-redirect author is left after loading.
    :raises AssertionError: if a loaded record is not of type '/type/author'
        or its name does not match the first author's name according to
        ``match_with_bad_chars``.
    """
    authors = [
        a for a in (withKey(k) for k in keys)
        if a['type']['key'] != '/type/redirect'
    ]
    not_redirect = {a['key'] for a in authors}
    if debug:
        for a in authors:
            print(a)

    assert all(a['type']['key'] == '/type/author' for a in authors)
    name1 = authors[0]['name']
    for a in authors:
        # repr() takes exactly one argument; passing key and name together
        # raised TypeError, so each value is formatted on its own.
        print(repr(a['key']), repr(a['name']))
    assert all(match_with_bad_chars(a['name'], name1) for a in authors[1:])

    best_key = pick_best_author(authors)['key']

    imgs = [
        a['key'] for a in authors
        if a['key'] != '/a/OL2688880A' and has_image(a['key'])
    ]
    if len(imgs) == 1:
        new_key = imgs[0]
    else:
        new_key = "/a/OL%dA" % min(key_int(a) for a in authors)
        # Molière and O. J. O. Ferreira
        if len(imgs) != 0:
            print('imgs:', imgs)
            return  # skip
        # NOTE(review): because of the skip above, imgs is always empty here,
        # so the whitelist never matches and this always prints [] and passes.
        if not (imgs == [u'/a/OL21848A', u'/a/OL4280680A']
                or imgs == [u'/a/OL325189A', u'/a/OL266422A']
                or imgs == [u'/a/OL5160945A', u'/a/OL5776228A']):
            print(imgs)
            assert len(imgs) == 0

    print(new_key)
    print(best_key)

    do_normalize(new_key, best_key, authors)
    old_keys = {k for k in keys if k != new_key}
    print('old keys:', old_keys)
    for old in old_keys:
        # /b/OL21291659M
        switch_author(ol, old, new_key, old_keys, debug=True)
        if old in not_redirect:
            make_redirect(ol, old, new_key)
        q = {
            'authors': old,
            'type': '/type/edition',
        }
        # Run the switch a second time if editions still reference the old key.
        if list(get_things(q)) != []:
            switch_author(ol, old, new_key, old_keys, debug=True)
def bad_char_name_match(found, marc_alt):
    """Fold together ``found`` keys whose names match via ``match_with_bad_chars``.

    Every pair of keys with equal ``get_marc_date`` results and different
    ``found`` values is compared on the space-joined values of its 'a', 'b'
    and 'c' subtags. When ``match_with_bad_chars`` accepts the two names, the
    key with the smaller ``found`` value is removed, its value is added to
    the other key's value, and ``marc_alt`` maps the removed key to the
    surviving one.

    :param found: mapping keyed by sequences of (subtag, value) pairs;
        modified in place.
    :param marc_alt: mapping that receives ``removed key -> surviving key``.
    """
    for first, second in combinations(found, 2):
        # combinations() took a snapshot of the keys before any merge, so a
        # pair may mention a key that an earlier iteration already removed.
        # Looking such a key up would raise KeyError on a dict (or silently
        # re-create it on a defaultdict), so those pairs are skipped.
        if first not in found or second not in found:
            continue
        if get_marc_date(first) != get_marc_date(second):
            continue
        # Equal values give no way to pick a survivor; leave both alone.
        if found[first] == found[second]:
            continue

        minor, major = sorted([first, second], key=lambda key: found[key])
        minor_name = ' '.join(v for k, v in minor if k in 'abc')
        major_name = ' '.join(v for k, v in major if k in 'abc')
        if match_with_bad_chars(minor_name, major_name):
            found[major] += found.pop(minor)
            marc_alt[minor] = major
def test_match_with_bad_chars():
    """Assert that match_with_bad_chars accepts each pair of variants in a sample.

    A sample holds alternative spellings of one text; some use '?' where a
    diacritic would be, some use combining marks, some precomposed characters.
    """
    samples = [
        [
            u'Machiavelli, Niccolo, 1469-1527',
            u'Machiavelli, Niccol\xf2 1469-1527',
        ],
        [
            u'Humanitas Publica\xe7\xf5es',
            'Humanitas Publicac?o?es',
        ],
        [
            u'A pesquisa ling\xfc\xedstica no Brasil',
            'A pesquisa lingu?i?stica no Brasil',
        ],
        [
            u'S\xe3o Paulo',
            'Sa?o Paulo',
        ],
        [
            u'Diccionario espa\xf1ol-ingl\xe9s de bienes ra\xedces',
            u'Diccionario Espan\u0303ol-Ingle\u0301s de bienes rai\u0301ces',
        ],
        [
            u'Konfliktunterdru?ckung in O?sterreich seit 1918',
            u'Konfliktunterdru\u0308ckung in O\u0308sterreich seit 1918',
            u'Konfliktunterdr\xfcckung in \xd6sterreich seit 1918',
        ],
        [
            u'Soi\ufe20u\ufe21z khudozhnikov SSSR.',
            u'Soi?u?z khudozhnikov SSSR.',
            u'Soi\u0361uz khudozhnikov SSSR.',
        ],
        [
            u'Andrzej Weronski',
            u'Andrzej Wero\u0144ski',
            u'Andrzej Weron\u0301ski',
        ],
    ]
    # Expand every sample into its unordered pairs, keeping sample order.
    pairs = [
        pair
        for variants in samples
        for pair in combinations(variants, 2)
    ]
    for left, right in pairs:
        assert match_with_bad_chars(left, right)