def test_index(self):
    """Check that an indexed work is only returned by Solr queries once
    ``commitWithin`` is passed to ``index``."""
    source_id = 'njp.32101013082597'

    with open(self.bibdata_brief) as fixture:
        record = hathi.HathiBibliographicRecord(json.load(fixture))

    work = DigitizedWork(source_id=source_id)
    work.populate_from_bibdata(record)
    work.save()

    solr, solr_collection = get_solr_connection()

    def match_all():
        # query for every document in the test collection; a fresh
        # query dict is built for each request
        return solr.query(solr_collection, {'q': '*:*'})

    # nothing is expected in the index after save
    assert match_all().get_results_count() == 0

    # index() must work on a saved object, but without commitWithin
    # the document should not be visible to a query yet
    work.index()
    assert match_all().get_results_count() == 0

    # ask Solr to commit within 500 and wait a second before checking
    work.index(params={'commitWithin': 500})
    sleep(1)

    # the work should now be the single document returned
    response = match_all()
    assert response.get_results_count() == 1
    assert response.docs[0]['id'] == source_id
def test_populate_from_bibdata(self):
    """Exercise ``populate_from_bibdata`` with brief and full records.

    Covers fields set from a brief record, fields set from MARC in a
    full record, cleanup of title / author / publication values, unset
    subfields, and protected fields. The MARC data of the loaded record
    is edited in place between calls, so the order of the steps matters.
    """
    def load_record(path):
        # parse a JSON fixture into a HathiTrust bibliographic record
        with open(path) as fixture:
            return hathi.HathiBibliographicRecord(json.load(fixture))

    brief_id = 'njp.32101013082597'
    full2_id = 'aeu.ark:/13960/t1pg22p71'

    full_record = load_record(self.bibdata_full)
    brief_record = load_record(self.bibdata_brief)

    # --- brief record ---
    work = DigitizedWork(source_id=brief_id)
    work.populate_from_bibdata(brief_record)
    assert work.record_id == brief_record.record_id
    assert work.title == brief_record.title
    assert work.pub_date == brief_record.pub_dates[0]
    # no enumcron in this record
    assert work.enumcron == ''
    # fields that come from marc are not set
    for marc_only in ('author', 'pub_place', 'publisher'):
        assert not getattr(work, marc_only)

    # no publication dates in the record -> no pub date on the work
    brief_record.info['publishDates'] = []
    work = DigitizedWork(source_id=brief_id)
    work.populate_from_bibdata(brief_record)
    assert not work.pub_date

    # TODO: test enumcron from copy details

    # --- full record: values come from marc ---
    work.populate_from_bibdata(full_record)
    marc = full_record.marcxml
    title_field, pub_field = marc['245'], marc['260']
    # title and subtitle set from marc
    assert work.title == title_field['a']
    assert work.subtitle == title_field['b']
    # fixture has indicator 0, no non-sort characters
    assert work.sort_title == ' '.join([work.title, work.subtitle])
    # trailing period is removed from the author
    assert work.author == marc.author().rstrip('.')
    # comma is stripped from publication place and publisher
    assert work.pub_place == pub_field['a'].strip(',')
    assert work.publisher == pub_field['b'].strip(',')

    # --- second full record: title with non-sort characters ---
    full_record = load_record(self.bibdata_full2)
    work = DigitizedWork(source_id=full2_id)
    work.populate_from_bibdata(full_record)
    title_field = full_record.marcxml['245']
    assert work.title == title_field['a']
    # subtitle omits the last two characters (trailing space and slash)
    assert work.subtitle == title_field['b'][:-2]
    # sort title skips the leading title characters counted by the
    # second indicator of the 245 field
    nonsort_count = int(title_field.indicators[1])
    assert work.sort_title == ' '.join([
        work.title[nonsort_count:],
        title_field['b']
    ])
    # remember the title before it is modified below
    orig_title = title_field['a']

    # --- error in record: non-numeric non-sort indicator ---
    full_record = load_record(self.bibdata_full2)
    marc = full_record.marcxml
    # keep handles on the marc fields that are edited below
    title_field, author_field, pub_field = \
        marc['245'], marc['100'], marc['260']
    title_field.indicators[1] = ' '
    work.populate_from_bibdata(full_record)
    assert work.sort_title == ' '.join([work.title, title_field['b']])

    # error in title sort (count doesn't include the space after the
    # definite article)
    title_field.indicators[1] = 3
    work.populate_from_bibdata(full_record)
    assert not work.sort_title.startswith(' ')

    # leading punctuation is cleaned up in the sort title
    title_field.indicators[1] = 0
    punctuated_titles = (
        ('"Elocutionary Language."', '"'),
        ("[Pamphlets on Language.]", '['),
    )
    for raw_title, leading_char in punctuated_titles:
        title_field['a'] = raw_title
        work.populate_from_bibdata(full_record)
        assert not work.sort_title.startswith(leading_char)

    # --- title cleanup ---
    # - trailing slash is removed (title omits the last two characters)
    title_field['a'] = orig_title
    title_field['a'] += ' /'
    work.populate_from_bibdata(full_record)
    assert work.title == orig_title
    # - initial open bracket is removed
    title_field['a'] = '[{}'.format(orig_title)
    work.populate_from_bibdata(full_record)
    assert work.title == orig_title
    # - internal brackets are unchanged
    title_field['a'] = 'A third[-fourth] class reader.'
    work.populate_from_bibdata(full_record)
    assert work.title == title_field['a']

    # --- author cleanup ---
    # trailing period is kept in these cases; date subfield is blanked
    author_field['d'] = ''
    unchanged_authors = (
        'Mitchell, M. S.',        # name with initials
        'Mitchell, M.S.',         # initials with no space
        'Wilson, Richard, Esq.',  # esquire
    )
    for name in unchanged_authors:
        author_field['a'] = name
        work.populate_from_bibdata(full_record)
        assert work.author == author_field['a']
    # '[from old catalog]' is removed
    author_field['a'] = 'Thurber, Samuel. [from old catalog]'
    work.populate_from_bibdata(full_record)
    assert work.author == 'Thurber, Samuel'

    # --- publication place / publisher cleanup ---
    # sine loco/nomine are cleared out
    pub_field['a'] = '[S.l.]'
    pub_field['b'] = '[s.n.]'
    work.populate_from_bibdata(full_record)
    assert not work.pub_place
    assert not work.publisher

    # surrounding brackets are removed
    pub_field['a'] = '[London]'
    pub_field['b'] = '[Faber]'
    work.populate_from_bibdata(full_record)
    assert work.pub_place == pub_field['a'].strip('[]')
    assert work.publisher == pub_field['b'].strip('[]')
    # brackets around only part of the value are kept
    pub_field['a'] = 'New Brunswick [N.J.]'
    work.populate_from_bibdata(full_record)
    assert work.pub_place == pub_field['a']

    # preliminary text before the publisher name is removed
    publisher = 'James Humphreys'
    prefixes = (
        'Printed at',
        'Printed and sold by',
        'Printed and published by',
        'Pub. for',
        'Published for the',
        'Publisht for the',
    )
    for prefix in prefixes:
        pub_field['b'] = ' '.join([prefix, publisher])
        work.populate_from_bibdata(full_record)
        assert work.publisher == publisher

    # subtitle, place of publication and publisher unset
    title_field['b'] = None
    pub_field['a'] = None
    pub_field['b'] = None
    work.populate_from_bibdata(full_record)
    for blank_field in ('subtitle', 'pub_place', 'publisher'):
        assert getattr(work, blank_field) == ''

    # NOTE: not currently testing publication info unavailable

    # --- protected fields ---
    full_record = load_record(self.bibdata_full2)
    work = DigitizedWork(source_id=full2_id)
    # local values for each of the protected fields
    local_values = {
        'title': 'Fake title',
        'subtitle': 'Silly subtitle',
        'sort_title': 'Sort title fake',
        'enumcron': '0001',
        'author': 'Not an author',
        'pub_place': 'Nowhere',
        'publisher': 'Not a publisher',
        'pub_date': 2200,
    }
    for field_name, value in local_values.items():
        setattr(work, field_name, value)
    # NOTE(review): author is set here but not included in the
    # assertions below -- confirm whether it should be checked too
    checked_fields = (
        'title', 'subtitle', 'sort_title', 'enumcron',
        'pub_place', 'publisher', 'pub_date',
    )
    # set all fields as protected
    work.protected_fields = ProtectedWorkFieldFlags.all_flags
    # fake copy details so bibdata has values for otherwise empty fields
    full_record.copy_details = Mock(return_value={
        'enumcron': '0002',
        'itemURL': 'http://example.com',
    })

    # fields have not changed
    work.populate_from_bibdata(full_record)
    for field_name in checked_fields:
        assert getattr(work, field_name) == local_values[field_name]

    # check fallbacks for title: populate again, title fields still kept
    work.populate_from_bibdata(full_record)
    for field_name in ('title', 'subtitle', 'sort_title'):
        assert getattr(work, field_name) == local_values[field_name]

    # no protected fields: all checked fields are overwritten
    work.protected_fields = ProtectedWorkFieldFlags.no_flags
    work.populate_from_bibdata(full_record)
    for field_name in checked_fields:
        assert getattr(work, field_name) != local_values[field_name]