def test_merge():
    """Check one Amazon/MARC pair against two merge thresholds.

    The pair is expected to be accepted by ``attempt_merge`` at threshold
    650 and rejected at threshold 735.
    """
    amazon_rec = {
        'publishers': [u'Collins'],
        'isbn': ['0002167360'],
        'number_of_pages': 120,
        'short_title': u'souvenirs',
        'normalized_title': u'souvenirs',
        'full_title': u'Souvenirs',
        'titles': [u'Souvenirs', u'souvenirs'],
        'publish_date': u'1975',
        'authors': [u'David Hamilton', u'Photographer'],
    }
    marc_rec = {
        'publisher': [u'Collins'],
        'isbn': [u'0002167360'],
        'short_title': u'souvenirs',
        'normalized_title': u'souvenirs',
        'full_title': u'Souvenirs',
        'titles': [u'Souvenirs', u'souvenirs'],
        'publish_date': '1978',
        'authors': [
            {
                'birth_date': u'1933',
                'db_name': u'Hamilton, David 1933-',
                'entity_type': 'person',
                'name': u'Hamilton, David',
                'personal_name': u'Hamilton, David',
            }
        ],
        'source_record_loc': 'marc_records_scriblio_net/part11.dat:155728070:617',
        'number_of_pages': 120,
    }

    # These records match with threshold = 650, but do not with
    # threshold = 735.
    loose_threshold = 650
    strict_threshold = 735
    assert attempt_merge(amazon_rec, marc_rec, loose_threshold)
    assert not attempt_merge(amazon_rec, marc_rec, strict_threshold)
def test_merge6():
    """Check that this Amazon/MARC pair is accepted at threshold 735.

    The two records share an ISBN-10 and publish date but differ in
    publisher, page count and title length; the MARC side carries the
    editor only in ``by_statement``.
    """
    amazon_rec = {
        'publishers': ['Fount'],
        'isbn_10': ['0002176157'],
        'number_of_pages': 224,
        'short_title': 'basil hume',
        'normalized_title': 'basil hume',
        'full_title': 'Basil Hume',
        'titles': ['Basil Hume', 'basil hume'],
        'publish_date': '1986',
        'authors': [('Tony Castle', 'Editor')],
    }
    marc_rec = {
        'publisher': ['Collins'],
        'isbn_10': ['0002176157'],
        'short_title': 'basil hume a portrait',
        'normalized_title': 'basil hume a portrait',
        'full_title': 'Basil Hume : a portrait',
        'titles': ['Basil Hume : a portrait', 'basil hume a portrait'],
        'number_of_pages': 158,
        'publish_date': '1986',
        'by_statement': 'edited by Tony Castle.',
        'source_record_loc': 'marc_records_scriblio_net/part19.dat:39883132:951',
    }

    merge_threshold = 735
    assert attempt_merge(amazon_rec, marc_rec, merge_threshold)
def try_merge(edition, ekey, thing):
    """Decide whether an Amazon ``edition`` matches the existing ``thing``.

    Returns True when the edition's ``amazon:<asin>`` tag is already listed
    in ``thing['source_records']`` or equals the value returned by
    ``get_mc(ekey)``. When ``thing`` has source records but not this tag,
    the result of ``source_records_match`` is returned. When ``get_mc``
    yields nothing the result is False. Otherwise the record located by
    that value is loaded, converted with ``build_marc`` and compared via
    ``amazon_merge.attempt_merge``.

    Raises AssertionError if the identifier is not a ``str`` or if
    ``thing`` is not of type ``/type/edition``.
    """
    kind = thing['type']['key']
    if 'isbn_10' not in edition:
        # Dump editions without an ISBN-10; the ASIN is used for them below.
        print(edition)
    asin = edition.get('isbn_10', None) or edition['asin']
    author_names = (
        [author['name'] for author in edition['authors']]
        if 'authors' in edition
        else []
    )
    amazon_rec = amazon_merge.build_amazon(edition, author_names)
    assert isinstance(asin, str)
    assert kind == '/type/edition'

    amazon_tag = 'amazon:' + asin
    if 'source_records' in thing:
        if amazon_tag in thing['source_records']:
            return True
        return source_records_match(amazon_rec, thing)

    # No source records on the thing: fall back to whatever get_mc reports.
    origin = get_mc(ekey)
    if origin == amazon_tag:
        return True
    if not origin:
        return False
    raw = get_from_local(origin)
    marc_rec = build_marc(fast_parse.read_edition(raw))
    return amazon_merge.attempt_merge(amazon_rec, marc_rec, threshold, debug=False)
def ia_match(a, ia):
    """Compare record ``a`` with the record fetched for identifier ``ia``.

    Returns False when ``get_ia`` raises ``urllib.error.HTTPError``, when
    it yields no record, or when the record has no ``'full_title'`` key.
    Otherwise returns the result of ``amazon_merge.attempt_merge`` against
    the ``build_marc`` form of the fetched record.

    A ``TypeError`` from ``build_marc`` is re-raised after printing the
    offending record.
    """
    try:
        _, fetched = get_ia(ia)
    except urllib.error.HTTPError:
        return False

    if fetched is None or 'full_title' not in fetched:
        return False

    try:
        marc_rec = build_marc(fetched)
    except TypeError:
        # Show the record that could not be converted before failing.
        print(fetched)
        raise
    return amazon_merge.attempt_merge(a, marc_rec, threshold, debug=False)
def test_merge5():
    """Check that this Amazon/MARC pair is accepted at threshold 735.

    The records share an ISBN and publish date; titles, publishers and
    page counts differ, and the Amazon author appears on the MARC side
    only in ``by_statement``.
    """
    amazon_rec = {
        'publishers': ['HarperCollins Publishers (Australia) Pty Ltd'],
        'isbn': ['0002174049'],
        'number_of_pages': 120,
        'short_title': 'netherlandish and german ',
        'normalized_title': 'netherlandish and german paintings national gallery schools of painting',
        'full_title': 'Netherlandish and German Paintings (National Gallery Schools of Painting)',
        'titles': [
            'Netherlandish and German Paintings (National Gallery Schools of Painting)',
            'netherlandish and german paintings national gallery schools of painting',
            'Netherlandish and German Paintings',
            'netherlandish and german paintings',
        ],
        'publish_date': '1985',
        'authors': ['Alistair Smith'],
    }
    marc_rec = {
        'publisher': ['National Gallery in association with W. Collins'],
        'isbn': ['0002174049'],
        'short_title': 'early netherlandish and g',
        'normalized_title': 'early netherlandish and german paintings',
        'full_title': 'Early Netherlandish and German paintings',
        'titles': [
            'Early Netherlandish and German paintings',
            'early netherlandish and german paintings',
        ],
        'publish_date': '1985',
        'authors': [
            {
                'db_name': 'National Gallery (Great Britain)',
                'name': 'National Gallery (Great Britain)',
                'entity_type': 'org',
            }
        ],
        'number_of_pages': 116,
        'by_statement': 'Alistair Smith.',
        'source_record_loc': 'marc_records_scriblio_net/part17.dat:170029527:1210',
    }

    merge_threshold = 735
    assert attempt_merge(amazon_rec, marc_rec, merge_threshold)
def test_merge8():
    """Check that this Amazon/MARC pair is rejected at threshold 735.

    Both records have the same ``short_title`` but different full titles,
    authors, page counts and publish dates, and the MARC record has no
    ISBN.
    """
    amazon_rec = {
        'publishers': ['Shambhala'],
        'isbn': ['1590301390'],
        'number_of_pages': 144,
        'short_title': 'the spiritual teaching of',
        'normalized_title': 'the spiritual teaching of ramana maharshi',
        'full_title': 'The Spiritual Teaching of Ramana Maharshi',
        'titles': [
            'The Spiritual Teaching of Ramana Maharshi',
            'the spiritualteaching of ramana maharshi',
            'Spiritual Teaching of Ramana Maharshi',
            'spiritual teaching of ramana maharshi',
        ],
        'publish_date': '2004',
        'authors': ['Ramana Maharshi.'],
    }
    marc_rec = {
        'isbn': [],
        'number_of_pages': 180,
        'short_title': 'the spiritual teaching of',
        'normalized_title': 'the spiritual teaching of mary of the incarnation',
        'full_title': 'The spiritual teaching of Mary of the Incarnation',
        'titles': [
            'The spiritual teaching of Mary of the Incarnation',
            'the spiritual teaching of mary of the incarnation',
            'spiritual teaching of Mary of the Incarnation',
            'spiritual teaching of mary of the incarnation',
        ],
        'publish_date': '1963',
        'publish_country': 'nyu',
        'authors': [
            {
                'db_name': 'Jett\u00e9, Fernand.',
                'name': 'Jett\u00e9, Fernand.',
            }
        ],
    }

    merge_threshold = 735
    assert not attempt_merge(amazon_rec, marc_rec, merge_threshold)
def amazon_match(e1, thing):
    """Compare ``e1`` with the Amazon record obtained for ``thing``.

    Returns False when ``try_amazon`` raises ``AttributeError`` or yields a
    falsy record; otherwise returns the result of ``amazon.attempt_merge``.

    An ``IndexError`` from ``try_amazon`` and any exception from the merge
    attempt are re-raised after printing identifying context.
    """
    try:
        amazon_rec = try_amazon(thing)
    except AttributeError:
        return False
    except IndexError:
        # Identify the offending thing before propagating the error.
        print(thing['key'])
        raise

    if not amazon_rec:
        return False

    try:
        outcome = amazon.attempt_merge(amazon_rec, e1, threshold, debug=False)
    except:  # noqa: E722 - dump both records for any failure, then re-raise
        print(amazon_rec)
        print(e1)
        print(thing['key'])
        raise
    else:
        return outcome
def amazon_match(e1, thing):
    """Compare ``e1`` with the Amazon record obtained for ``thing``.

    Returns False when ``try_amazon`` raises ``AttributeError`` or yields a
    falsy record; otherwise returns the result of ``amazon.attempt_merge``.

    An ``IndexError`` from ``try_amazon`` and any ``Exception`` from the
    merge attempt are re-raised after printing identifying context.
    """
    # Diagnostics use single-argument print(...) calls: the former
    # ``print x`` statements are a SyntaxError on Python 3, while this form
    # produces the same output on both Python 2 and Python 3.
    try:
        a = try_amazon(thing)
    except IndexError:
        print(thing['key'])
        raise
    except AttributeError:
        return False

    if not a:
        return False

    try:
        return amazon.attempt_merge(a, e1, threshold, debug=False)
    except Exception:
        # Dump both records and the key for debugging, then propagate.
        print(a)
        print(e1)
        print(thing['key'])
        raise
def test_merge7():
    """Check that this Amazon/MARC pair is accepted at threshold 735.

    The records share an ISBN and publish date. The MARC title is a much
    longer form of the Amazon title, the listed authors differ, and the
    Amazon author appears on the MARC side only in ``by_statement``.
    """
    amazon_rec = {
        'publishers': ['HarperCollins Publishers Ltd'],
        'isbn': ['0002176319'],
        'number_of_pages': 256,
        'short_title': 'pucklers progress',
        'normalized_title': 'pucklers progress',
        'full_title': "Puckler's Progress",
        'titles': ["Puckler's Progress", 'pucklers progress'],
        'publish_date': '1987',
        'authors': ['Flora Brennan'],
    }
    marc_rec = {
        'publisher': ['Collins'],
        'isbn': ['0002176319'],
        'short_title': 'pucklers progress the ad',
        'normalized_title': 'pucklers progress the adventures of prince puckler muskau in england wales and ireland as told in letters to his former wife 1826 9',
        'full_title': "Puckler's progress : the adventures of Prince Pu\u0308ckler-Muskau in England, Wales, and Ireland as told in letters to his former wife, 1826-9",
        'titles': [
            "Puckler's progress : the adventures of Prince Pu\u0308ckler-Muskau in England, Wales, and Ireland as told in letters to his former wife, 1826-9",
            'pucklers progress the adventures of prince puckler muskau in england wales and ireland as told in letters to his former wife 1826 9',
        ],
        'publish_date': '1987',
        'authors': [
            {
                'name': 'Pu\u0308ckler-Muskau, Hermann Furst von',
                'title': 'Furst von',
                'death_date': '1871.',
                'db_name': 'Pu\u0308ckler-Muskau, Hermann Furst von 1785-1871.',
                'birth_date': '1785',
                'personal_name': 'Pu\u0308ckler-Muskau, Hermann',
                'entity_type': 'person',
            }
        ],
        'number_of_pages': 254,
        'by_statement': 'translated by Flora Brennan.',
        'source_record_loc': 'marc_records_scriblio_net/part19.dat:148554594:1050',
    }

    merge_threshold = 735
    assert attempt_merge(amazon_rec, marc_rec, merge_threshold)
def test_merge2():
    """Check that this Amazon/MARC pair is accepted at threshold 735.

    The records share an ISBN and publisher; the titles are worded
    differently, the publish dates are one year apart, and the MARC record
    has no page count.
    """
    amazon_rec = {
        'publishers': ['Collins'],
        'isbn': ['0002167530'],
        'number_of_pages': 287,
        'short_title': 'sea birds britain ireland',
        'normalized_title': 'sea birds britain ireland',
        'full_title': 'Sea Birds Britain Ireland',
        'titles': ['Sea Birds Britain Ireland', 'sea birds britain ireland'],
        'publish_date': '1975',
        'authors': ['Stanley Cramp'],
    }
    marc_rec = {
        'publisher': ['Collins'],
        'isbn': ['0002167530'],
        'short_title': 'seabirds of britain and i',
        'normalized_title': 'seabirds of britain and ireland',
        'full_title': 'seabirds of Britain and Ireland',
        'titles': [
            'seabirds of Britain and Ireland',
            'seabirds of britain and ireland',
        ],
        'publish_date': '1974',
        'authors': [
            {
                'db_name': 'Cramp, Stanley.',
                'entity_type': 'person',
                'name': 'Cramp, Stanley.',
                'personal_name': 'Cramp, Stanley.',
            }
        ],
        'source_record_loc': 'marc_records_scriblio_net/part08.dat:61449973:855',
    }

    merge_threshold = 735
    assert attempt_merge(amazon_rec, marc_rec, merge_threshold)
def test_merge3():
    """Check that this Amazon/MARC pair is accepted at threshold 735.

    The records share an ISBN-10, title and page count; the publishers
    differ and the publish dates are 1985 versus 1981.
    """
    amazon_rec = {
        'publishers': ['Intl Specialized Book Service Inc'],
        'isbn_10': ['0002169770'],
        'number_of_pages': 207,
        'short_title': 'women of the north',
        'normalized_title': 'women of the north',
        'full_title': 'Women of the North',
        'titles': ['Women of the North', 'women of the north'],
        'publish_date': '1985',
        'authors': [('Jane Wordsworth', 'Author')],
    }
    marc_rec = {
        'publisher': ['Collins', 'Exclusive distributor ISBS'],
        'isbn_10': ['0002169770'],
        'short_title': 'women of the north',
        'normalized_title': 'women of the north',
        'full_title': 'Women of the North',
        'titles': ['Women of the North', 'women of the north'],
        'publish_date': '1981',
        'number_of_pages': 207,
        'authors': [
            {
                'db_name': 'Wordsworth, Jane.',
                'entity_type': 'person',
                'name': 'Wordsworth, Jane.',
                'personal_name': 'Wordsworth, Jane.',
            }
        ],
        'source_record_loc': 'marc_records_scriblio_net/part17.dat:110989084:798',
    }

    merge_threshold = 735
    assert attempt_merge(amazon_rec, marc_rec, merge_threshold)
def test_merge4():
    """Check that this Amazon/MARC pair is accepted at threshold 735.

    The records share an ISBN-10, publish date and page count. The MARC
    title has extra words before and after the Amazon title, and the MARC
    record names the contributor only in ``by_statement``.
    """
    amazon_rec = {
        'publishers': ['HarperCollins Publishers Ltd'],
        'isbn_10': ['0002173433'],
        'number_of_pages': 128,
        'short_title': 'd day to victory',
        'normalized_title': 'd day to victory',
        'full_title': 'D-Day to Victory',
        'titles': ['D-Day to Victory', 'd day to victory'],
        'publish_date': '1984',
        'authors': [('Wynfod Vaughan-Thomas', 'Editor, Introduction')],
    }
    marc_rec = {
        'publisher': ['Collins'],
        'isbn_10': ['0002173433'],
        'short_title': 'great front pages d day ',
        'normalized_title': 'great front pages d day to victory 1944 1945',
        'full_title': 'Great front pages : D-Day to victory 1944-1945',
        'titles': [
            'Great front pages : D-Day to victory 1944-1945',
            'great front pages dday to victory 1944 1945',
        ],
        'publish_date': '1984',
        'number_of_pages': 128,
        'by_statement': 'introduced by Wynford Vaughan-Thomas.',
        'source_record_loc': 'marc_records_scriblio_net/part17.dat:102360356:983',
    }

    merge_threshold = 735
    assert attempt_merge(amazon_rec, marc_rec, merge_threshold)
if not mc: mc = get_mc(thing['key']) if not mc or mc == 'initial import': return False if mc.startswith('amazon:'): try: a = try_amazon(thing) except IndexError: print thing['key'] raise except AttributeError: return False if not a: return False try: return amazon.attempt_merge(a, e1, threshold, debug=False) except: print a print e1 print thing['key'] raise print 'mc:', mc try: assert not mc.startswith('ia:') data = get_from_archive(mc) if not data: return True rec2 = fast_parse.read_edition(data) except (fast_parse.SoundRecording, IndexError, AssertionError): print mc print edition_key
def marc_match(a, loc):
    """Compare record ``a`` with the locally stored record at ``loc``.

    The data returned by ``get_from_local(loc)`` is parsed with
    ``fast_parse.read_edition``, converted with ``build_marc`` and passed
    to ``amazon_merge.attempt_merge``, whose result is returned.

    Raises AssertionError when ``loc`` is falsy.
    """
    assert loc
    raw = get_from_local(loc)
    marc_rec = build_marc(fast_parse.read_edition(raw))
    return amazon_merge.attempt_merge(a, marc_rec, threshold, debug=False)
def try_merge(e1, edition_key, thing):
    """Decide whether record ``e1`` matches the existing edition ``thing``.

    The decision is taken in stages:

    1. A ``/type/delete`` thing never matches; any type other than
       ``/type/edition`` fails an assertion.
    2. If ``thing`` has ``source_records`` they are (possibly) fixed up and
       ``source_records_match`` gives the answer.
    3. Otherwise an archive identifier is derived from ``thing['ocaid']``
       and/or the value returned by ``get_mc``, and ``get_ia`` is asked
       for a record. A dark/bad identifier or an XML parse error yields
       False; a missing record yields True.
    4. With no identifier, the ``get_mc`` value selects the source: an
       ``amazon:`` value is handled through ``try_amazon`` and
       ``amazon.attempt_merge``; anything else is loaded with
       ``get_from_archive`` (empty data yields True).
    5. The record found is converted with ``build_marc`` and compared with
       ``e1`` via ``attempt_merge``.

    Diagnostic values are printed along the way; unexpected exceptions are
    re-raised after printing context.
    """
    kind = thing['type']
    edition_type = Reference('/type/edition')
    if kind != edition_type:
        print(thing['key'], 'is', str(thing['type']))
    if kind == Reference('/type/delete'):
        return False
    assert kind == edition_type

    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key)  # reload
        return source_records_match(e1, thing)

    ocaid = thing.get('ocaid', None)
    print(edition_key)
    mc = get_mc(edition_key)
    print(mc)
    if mc:
        if mc.startswith('ia:'):
            ocaid = mc[3:]
        elif mc.endswith(('.xml', '.mrc')):
            # Keep everything before the first '/' of the locator.
            ocaid = mc[:mc.find('/')]
        if '_meta.mrc:' in mc:
            print(thing)
            if 'ocaid' not in thing:
                return False
            ocaid = thing['ocaid']

    marc_rec = None
    if ocaid:
        if is_dark_or_bad(ocaid):
            return False
        try:
            marc_rec = get_ia(ocaid)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print('no MARCXML')
        except urllib2.HTTPError as err:
            print(err.code)
            assert err.code in (404, 403)
        if not marc_rec:
            return True

    if not marc_rec:
        if not mc:
            mc = get_mc(thing['key'])
        if not mc or mc == 'initial import':
            return False

        if mc.startswith('amazon:'):
            try:
                amazon_rec = try_amazon(thing)
            except IndexError:
                print(thing['key'])
                raise
            except AttributeError:
                return False
            if not amazon_rec:
                return False
            try:
                return amazon.attempt_merge(amazon_rec, e1, threshold, debug=False)
            except:  # noqa: E722 - dump context for any failure, then re-raise
                print(amazon_rec)
                print(e1)
                print(thing['key'])
                raise

        print('mc:', mc)
        try:
            assert not mc.startswith('ia:')
            raw = get_from_archive(mc)
            if not raw:
                return True
            marc_rec = fast_parse.read_edition(raw)
        except (fast_parse.SoundRecording, IndexError, AssertionError):
            # These failures are treated as "no match" rather than errors.
            print(mc)
            print(edition_key)
            return False
        except:  # noqa: E722 - dump context for any other failure, then re-raise
            print(mc)
            print(edition_key)
            raise

    if not marc_rec:
        return False
    try:
        e2 = build_marc(marc_rec)
    except TypeError:
        print(marc_rec)
        raise
    return attempt_merge(e1, e2, threshold, debug=False)
if not mc: mc = get_mc(thing['key']) if not mc or mc == 'initial import': return False if mc.startswith('amazon:'): try: a = try_amazon(thing) except IndexError: print thing['key'] raise except AttributeError: return False if not a: return False try: return amazon.attempt_merge(a, e1, threshold, debug=False) except: print a print e1 print thing['key'] raise print 'mc:', mc try: assert not mc.startswith('ia:') data = get_from_local(mc) if not data: return True rec2 = fast_parse.read_edition(data) except (fast_parse.SoundRecording, IndexError, AssertionError): print mc print edition_key