Пример #1
0
    def test_index(self):
        """Check that a saved work only appears in Solr after an index call
        that passes ``commitWithin``."""
        work_id = 'njp.32101013082597'

        with open(self.bibdata_brief) as fixture:
            brief_record = hathi.HathiBibliographicRecord(json.load(fixture))

        work = DigitizedWork(source_id=work_id)
        work.populate_from_bibdata(brief_record)
        work.save()
        solr, solr_collection = get_solr_connection()

        def find_everything():
            # match-all query; a fresh query dict is built on every call
            return solr.query(solr_collection, {'q': '*:*'})

        # nothing is expected in the index right after saving
        assert find_everything().get_results_count() == 0

        # index the saved object without any params: no commitWithin is
        # given, so the work is still not expected in query results
        work.index()
        assert find_everything().get_results_count() == 0

        # index again with commitWithin, then wait before querying
        work.index(params={'commitWithin': 500})
        sleep(1)
        results = find_everything()
        # now the work should be the single document returned
        assert results.get_results_count() == 1
        assert results.docs[0]['id'] == work_id
Пример #2
0
    def test_populate_from_bibdata(self):
        """Exercise ``DigitizedWork.populate_from_bibdata`` against the brief
        and full bibliographic fixtures.

        Sections, in order: brief record, missing publication date, full
        record, sort title handling, title cleanup, author cleanup,
        publication place / publisher cleanup, unset subfields, and
        protected fields.
        """

        def load_record(path):
            # read a JSON fixture and wrap it in a bibliographic record
            with open(path) as fixture:
                return hathi.HathiBibliographicRecord(json.load(fixture))

        full_record = load_record(self.bibdata_full)
        brief_record = load_record(self.bibdata_brief)

        # --- brief record ---
        work = DigitizedWork(source_id='njp.32101013082597')
        work.populate_from_bibdata(brief_record)
        assert work.record_id == brief_record.record_id
        assert work.title == brief_record.title
        assert work.pub_date == brief_record.pub_dates[0]
        # no enumcron in this record
        assert work.enumcron == ''
        # fields that come from marc are not set
        for marc_only_field in ('author', 'pub_place', 'publisher'):
            assert not getattr(work, marc_only_field), marc_only_field

        # --- no publication date ---
        brief_record.info['publishDates'] = []
        work = DigitizedWork(source_id='njp.32101013082597')
        work.populate_from_bibdata(brief_record)
        assert not work.pub_date

        # TODO: test enumcron from copy details

        # --- full record, applied to the same work ---
        work.populate_from_bibdata(full_record)
        marc = full_record.marcxml
        # title and subtitle set from marc
        assert work.title == marc['245']['a']
        assert work.subtitle == marc['245']['b']
        # fixture has indicator 0, no non-sort characters
        assert work.sort_title == ' '.join((work.title, work.subtitle))
        # authors should have trailing period removed
        assert work.author == marc.author().rstrip('.')
        # comma should be stripped from publication place and publisher
        assert work.pub_place == marc['260']['a'].strip(',')
        assert work.publisher == marc['260']['b'].strip(',')

        # --- second full record, which has a sort title ---
        full_record = load_record(self.bibdata_full2)
        marc = full_record.marcxml

        work = DigitizedWork(source_id='aeu.ark:/13960/t1pg22p71')
        work.populate_from_bibdata(full_record)
        assert work.title == marc['245']['a']
        # subtitle should omit last two characters (trailing space and slash)
        assert work.subtitle == marc['245']['b'][:-2]
        # fixture has title with non-sort characters; the second indicator
        # of the 245 field gives how many to skip
        nonsort_chars = int(marc['245'].indicators[1])
        expected_sort_title = ' '.join(
            (work.title[nonsort_chars:], marc['245']['b']))
        assert work.sort_title == expected_sort_title
        # keep the unmodified title; later sections overwrite and restore it
        original_title = marc['245']['a']

        # --- errors in the record: start again from a fresh copy ---
        full_record = load_record(self.bibdata_full2)
        marc = full_record.marcxml

        # non-numeric value for the non-sort character indicator
        marc['245'].indicators[1] = ' '
        work.populate_from_bibdata(full_record)
        assert work.sort_title == ' '.join((work.title, marc['245']['b']))

        # error in title sort (doesn't include space after definite article)
        marc['245'].indicators[1] = 3
        work.populate_from_bibdata(full_record)
        assert not work.sort_title.startswith(' ')

        # leading punctuation is cleaned up in the sort title
        marc['245'].indicators[1] = 0
        marc['245']['a'] = '"Elocutionary Language."'
        work.populate_from_bibdata(full_record)
        assert not work.sort_title.startswith('"')

        marc['245']['a'] = "[Pamphlets on Language.]"
        work.populate_from_bibdata(full_record)
        assert not work.sort_title.startswith('[')

        # --- title cleanup ---
        marc['245']['a'] = original_title
        # - trailing slash is removed from the title
        marc['245']['a'] += ' /'
        work.populate_from_bibdata(full_record)
        # title should omit the last two characters
        assert work.title == original_title
        # - initial open bracket is removed
        marc['245']['a'] = '[{}'.format(original_title)
        work.populate_from_bibdata(full_record)
        assert work.title == original_title
        # - internal brackets should be unchanged
        marc['245']['a'] = 'A third[-fourth] class reader.'
        work.populate_from_bibdata(full_record)
        assert work.title == marc['245']['a']

        # --- author cleanup ---
        # trailing period is not removed for single initials
        # - name with initials, no date
        marc['100']['a'] = 'Mitchell, M. S.'
        marc['100']['d'] = ''
        work.populate_from_bibdata(full_record)
        assert work.author == marc['100']['a']
        # - initials with no space, then esquire
        for unchanged_name in ('Mitchell, M.S.', 'Wilson, Richard, Esq.'):
            marc['100']['a'] = unchanged_name
            work.populate_from_bibdata(full_record)
            assert work.author == marc['100']['a']
        # - '[from old catalog]' is removed
        marc['100']['a'] = 'Thurber, Samuel. [from old catalog]'
        work.populate_from_bibdata(full_record)
        assert work.author == 'Thurber, Samuel'

        # --- publication place and publisher cleanup ---
        # sine loco/nomine should be cleared out
        marc['260']['a'] = '[S.l.]'
        marc['260']['b'] = '[s.n.]'
        work.populate_from_bibdata(full_record)
        assert not work.pub_place
        assert not work.publisher

        # brackets around publisher and pub place should be removed
        marc['260']['a'] = '[London]'
        marc['260']['b'] = '[Faber]'
        work.populate_from_bibdata(full_record)
        assert work.pub_place == marc['260']['a'].strip('[]')
        assert work.publisher == marc['260']['b'].strip('[]')
        # brackets that do not wrap the whole value are kept
        marc['260']['a'] = 'New Brunswick [N.J.]'
        work.populate_from_bibdata(full_record)
        assert work.pub_place == marc['260']['a']

        # preliminary text before the publisher name is cleaned up
        expected_publisher = 'James Humphreys'
        publisher_prefixes = (
            'Printed at',
            'Printed and sold by',
            'Printed and published by',
            'Pub. for',
            'Published for the',
            'Publisht for the',
        )
        for prefix in publisher_prefixes:
            marc['260']['b'] = ' '.join((prefix, expected_publisher))
            work.populate_from_bibdata(full_record)
            assert work.publisher == expected_publisher

        # --- subtitle, publisher, place of publication unset ---
        marc['245']['b'] = None
        marc['260']['a'] = None
        marc['260']['b'] = None
        work.populate_from_bibdata(full_record)
        assert work.subtitle == ''
        assert work.pub_place == ''
        assert work.publisher == ''

        # NOTE: not currently testing publication info unavailable

        # --- protected fields are respected ---
        full_record = load_record(self.bibdata_full2)

        work = DigitizedWork(source_id='aeu.ark:/13960/t1pg22p71')
        # placeholder value for each of the fields set on the work
        placeholders = (
            ('title', 'Fake title'),
            ('subtitle', 'Silly subtitle'),
            ('sort_title', 'Sort title fake'),
            ('enumcron', '0001'),
            ('author', 'Not an author'),
            ('pub_place', 'Nowhere'),
            ('publisher', 'Not a publisher'),
            ('pub_date', 2200),
        )
        for field_name, placeholder in placeholders:
            setattr(work, field_name, placeholder)
        # author is assigned above but is not among the fields compared below
        compared = [pair for pair in placeholders if pair[0] != 'author']

        # set all fields as protected
        work.protected_fields = ProtectedWorkFieldFlags.all_flags
        # fake copy details so there is data for otherwise empty fields
        full_record.copy_details = Mock(return_value={
            'enumcron': '0002',
            'itemURL': 'http://example.com',
        })

        # fields have not changed
        work.populate_from_bibdata(full_record)
        for field_name, placeholder in compared:
            assert getattr(work, field_name) == placeholder, field_name
        # check fallbacks for title
        work.populate_from_bibdata(full_record)
        assert work.title == 'Fake title'
        assert work.subtitle == 'Silly subtitle'
        assert work.sort_title == 'Sort title fake'

        # no protected fields: every compared field is overwritten
        work.protected_fields = ProtectedWorkFieldFlags.no_flags
        work.populate_from_bibdata(full_record)
        for field_name, placeholder in compared:
            assert getattr(work, field_name) != placeholder, field_name