def test_initialization(self):
    with pytest.raises(ValueError) as excinfo:
        ContributorLane(self._default_library, None)
    assert "ContributorLane can't be created without contributor" in str(
        excinfo.value
    )

    parent = WorkList()
    parent.initialize(self._default_library)

    lane = ContributorLane(
        self._default_library,
        self.contributor,
        parent,
        languages=["a"],
        audiences=["b"],
    )
    assert self.contributor == lane.contributor
    assert ["a"] == lane.languages
    assert ["b"] == lane.audiences
    assert [lane] == parent.children

    # The contributor_key will be used in links to other pages
    # of this Lane and so on.
    assert "Lois Lane" == lane.contributor_key

    # If the contributor used to create a ContributorLane has no
    # display name, their sort name is used as the
    # contributor_key.
    contributor = ContributorData(sort_name="Lane, Lois")
    lane = ContributorLane(self._default_library, contributor)
    assert contributor == lane.contributor
    assert "Lane, Lois" == lane.contributor_key
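# Hedged sketch of the contributor_key fallback the assertions above rely
# on: prefer the contributor's display name and fall back to the sort name.
# This is an illustration, not the actual ContributorLane implementation.
def contributor_key_for(contributor):
    # Hypothetical helper; ContributorLane is assumed to do the equivalent.
    return contributor.display_name or contributor.sort_name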
def test_lifecycle_with_worklist(self):
    facets = Facets.default(self._default_library)
    pagination = Pagination.default()
    lane = WorkList()
    lane.initialize(self._default_library)

    # Fetch a cached feed from the database. It comes out updated.
    refresher = MockFeedGenerator()
    args = (self._db, lane, facets, pagination, refresher)
    feed = CachedFeed.fetch(*args, max_age=0, raw=True)
    assert "This is feed #1" == feed.content

    assert pagination.query_string == feed.pagination
    assert facets.query_string == feed.facets
    assert None == feed.lane_id
    assert lane.unique_key == feed.unique_key

    # Fetch it again, with a high max_age, and it's cached!
    feed = CachedFeed.fetch(*args, max_age=1000, raw=True)
    assert "This is feed #1" == feed.content

    # Fetch it with a low max_age, and it gets updated again.
    feed = CachedFeed.fetch(*args, max_age=0, raw=True)
    assert "This is feed #2" == feed.content

    # The special constant CACHE_FOREVER means it's always cached.
    feed = CachedFeed.fetch(*args, max_age=CachedFeed.CACHE_FOREVER, raw=True)
    assert "This is feed #2" == feed.content
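# A minimal sketch of the refresher used above, assuming MockFeedGenerator
# lives in the shared test fixtures: each call returns a newly numbered feed
# string, which is how the test tells cache hits from regenerated feeds.
class MockFeedGenerator(object):
    # Sketch only; the real fixture may differ in detail.
    def __init__(self):
        self.calls = 0

    def __call__(self):
        self.calls += 1
        return "This is feed #%d" % self.calls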
def test_initialization(self):
    assert_raises_regexp(
        ValueError,
        "ContributorLane can't be created without contributor",
        ContributorLane,
        self._default_library,
        None,
    )

    parent = WorkList()
    parent.initialize(self._default_library)

    lane = ContributorLane(
        self._default_library,
        self.contributor,
        parent,
        languages=['a'],
        audiences=['b'],
    )
    eq_(self.contributor, lane.contributor)
    eq_(['a'], lane.languages)
    eq_(['b'], lane.audiences)
    eq_([lane], parent.children)

    # The contributor_key will be used in links to other pages
    # of this Lane and so on.
    eq_("Lois Lane", lane.contributor_key)

    # If the contributor used to create a ContributorLane has no
    # display name, their sort name is used as the
    # contributor_key.
    contributor = ContributorData(sort_name="Lane, Lois")
    lane = ContributorLane(self._default_library, contributor)
    eq_(contributor, lane.contributor)
    eq_("Lane, Lois", lane.contributor_key)
def make_child():
    # Set up a WorkList with settings that contradict the
    # settings of the work we'll be using as the basis for our
    # WorkBasedLane.
    child = WorkList()
    child.initialize(
        self._default_library,
        'sublane',
        languages=['eng'],
        audiences=[Classifier.AUDIENCE_ADULT],
    )
    return child
def test_load_facets_from_request(self):
    # The library has two EntryPoints enabled.
    self._default_library.setting(EntryPoint.ENABLED_SETTING).value = json.dumps(
        [EbooksEntryPoint.INTERNAL_NAME, AudiobooksEntryPoint.INTERNAL_NAME]
    )

    with self.app.test_request_context("/?order=%s" % Facets.ORDER_TITLE):
        flask.request.library = self._default_library
        facets = load_facets_from_request()
        assert Facets.ORDER_TITLE == facets.order
        # Enabled facets are passed in to the newly created Facets,
        # in case the load method received a custom config.
        assert facets.facets_enabled_at_init != None

    with self.app.test_request_context("/?order=bad_facet"):
        flask.request.library = self._default_library
        problemdetail = load_facets_from_request()
        assert INVALID_INPUT.uri == problemdetail.uri

    # An EntryPoint will be picked up from the request and passed
    # into the Facets object, assuming the EntryPoint is
    # configured on the present library.
    worklist = WorkList()
    worklist.initialize(self._default_library)
    with self.app.test_request_context("/?entrypoint=Audio"):
        flask.request.library = self._default_library
        facets = load_facets_from_request(worklist=worklist)
        assert AudiobooksEntryPoint == facets.entrypoint
        assert False == facets.entrypoint_is_default

    # If the requested EntryPoint is not configured, the default
    # EntryPoint is used.
    with self.app.test_request_context("/?entrypoint=NoSuchEntryPoint"):
        flask.request.library = self._default_library
        default_entrypoint = object()
        facets = load_facets_from_request(
            worklist=worklist, default_entrypoint=default_entrypoint
        )
        assert default_entrypoint == facets.entrypoint
        assert True == facets.entrypoint_is_default

    # Load a SearchFacets object that pulls information from an
    # HTTP header.
    with self.app.test_request_context("/", headers={"Accept-Language": "ja"}):
        flask.request.library = self._default_library
        facets = load_facets_from_request(base_class=SearchFacets)
        assert ["jpn"] == facets.languages
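# Hedged sketch (not the real SearchFacets code) of how an Accept-Language
# header value like "ja" might be normalized to the ISO 639-2 code "jpn"
# that the final assertion above expects. The mapping here is illustrative.
_ISO_639_2_SKETCH = {"ja": "jpn", "en": "eng", "es": "spa"}

def languages_from_accept_language(header_value):
    # Take the primary language tags, ignore quality values, and map
    # two-letter codes to three-letter equivalents when known.
    languages = []
    for part in header_value.split(","):
        tag = part.split(";")[0].strip().split("-")[0].lower()
        if tag:
            languages.append(_ISO_639_2_SKETCH.get(tag, tag))
    return languages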
def load_lanes(_db, library):
    """Return a WorkList that reflects the current lane structure of the
    Library.

    If no top-level visible lanes are configured, the WorkList will be
    configured to show every book in the collection.

    If a single top-level Lane is configured, it will be returned as the
    WorkList.

    Otherwise, a WorkList containing the visible top-level lanes is
    returned.
    """
    top_level = WorkList.top_level_for_library(_db, library)

    # It's likely this WorkList will be used across sessions, so
    # expunge any data model objects from the database session.
    #
    # TODO: This is the cause of a lot of problems in the cached OPDS
    # feed generator. There, these Lanes are used in a normal database
    # session and we end up needing hacks to merge them back into the
    # session.
    if isinstance(top_level, Lane):
        to_expunge = [top_level]
    else:
        to_expunge = [x for x in top_level.children if isinstance(x, Lane)]
    # Use an explicit loop rather than map(), which is lazy in Python 3
    # and would never actually expunge anything.
    for lane in to_expunge:
        _db.expunge(lane)
    return top_level
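# Hedged usage sketch: how load_lanes might be called when assembling a
# library's top-level catalog. `session` and `library` are placeholders for
# the application's own database session and Library object.
#
#   top_level = load_lanes(session, library)
#   for child in top_level.children:
#       print(child.display_name)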
def test_should_process_lane(self):
    parent = self._lane()
    parent.size = 100
    child = self._lane(parent=parent)
    child.size = 10
    grandchild = self._lane(parent=child)
    grandchild.size = 1
    wl = WorkList()
    empty = self._lane(fiction=False)
    empty.size = 0

    script = CacheMARCFiles(self._db, cmd_args=[])
    script.max_depth = 1
    eq_(True, script.should_process_lane(parent))
    eq_(True, script.should_process_lane(child))
    eq_(False, script.should_process_lane(grandchild))
    eq_(True, script.should_process_lane(wl))
    eq_(False, script.should_process_lane(empty))

    script.max_depth = 0
    eq_(True, script.should_process_lane(parent))
    eq_(False, script.should_process_lane(child))
    eq_(False, script.should_process_lane(grandchild))
    eq_(True, script.should_process_lane(wl))
    eq_(False, script.should_process_lane(empty))
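# Hedged sketch of the policy the assertions above imply (not the actual
# CacheMARCFiles.should_process_lane code): plain WorkLists are always
# processed, empty Lanes never are, and a Lane is skipped once its depth in
# the lane hierarchy exceeds max_depth.
def should_process_lane_sketch(lane, max_depth):
    if isinstance(lane, Lane):
        if lane.size == 0:
            return False
        # Compute depth by walking the parent chain.
        depth = 0
        ancestor = lane.parent
        while ancestor is not None:
            depth += 1
            ancestor = ancestor.parent
        if depth > max_depth:
            return False
    return True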
def test_do_generate_handles_all_entrypoints(self):
    self.called_with = []

    @classmethod
    def mock_groups(cls, *args, **kwargs):
        self.called_with.append((args, kwargs))

    old_groups = AcquisitionFeed.groups
    AcquisitionFeed.groups = mock_groups

    # Here's a normal WorkList with no EntryPoints.
    worklist = WorkList()
    library = self._default_library
    worklist.initialize(library)
    script = CacheOPDSGroupFeedPerLane(self._db, cmd_args=[])
    with script.app.test_request_context("/"):
        list(script.do_generate(worklist))

    # AcquisitionFeed.groups was called once, with a FeaturedFacets
    # object that did not include an EntryPoint.
    args, kwargs = self.called_with.pop()
    facets = kwargs['facets']
    assert isinstance(facets, FeaturedFacets)
    eq_(library.minimum_featured_quality, facets.minimum_featured_quality)
    eq_(worklist.uses_customlists, facets.uses_customlists)
    eq_(None, facets.entrypoint)

    # Now give the WorkList some EntryPoints.
    worklist.initialize(
        library, entrypoints=[AudiobooksEntryPoint, EbooksEntryPoint]
    )
    with script.app.test_request_context("/"):
        list(script.do_generate(worklist))

    # AcquisitionFeed.groups was called once for each
    # EntryPoint available to the WorkList.
    eq_(
        [AudiobooksEntryPoint, EbooksEntryPoint],
        [kwargs['facets'].entrypoint for (args, kwargs) in self.called_with],
    )

    AcquisitionFeed.groups = old_groups
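# Hedged sketch of the behavior the test above verifies: do_generate is
# expected to build one FeaturedFacets per configured EntryPoint, or a
# single EntryPoint-less FeaturedFacets when the WorkList has none. The
# constructor arguments shown are assumptions drawn from the assertions,
# not the actual CacheOPDSGroupFeedPerLane implementation.
def facets_for_worklist(library, worklist):
    for entrypoint in (list(worklist.entrypoints) or [None]):
        yield FeaturedFacets(
            library.minimum_featured_quality,
            uses_customlists=worklist.uses_customlists,
            entrypoint=entrypoint,
        )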
def test__prepare_keys(self):
    # Verify the method that turns WorkList, Facets, and Pagination
    # into a unique set of values for CachedFeed fields.

    # First, prepare some mock classes.
    class MockCachedFeed(CachedFeed):
        feed_type_called_with = None

        @classmethod
        def feed_type(cls, worklist, facets):
            cls.feed_type_called_with = (worklist, facets)
            return "mock type"

    class MockFacets(object):
        query_string = b"facets query string"

    class MockPagination(object):
        query_string = b"pagination query string"

    m = MockCachedFeed._prepare_keys
    # A WorkList of some kind is required.
    with pytest.raises(ValueError) as excinfo:
        m(self._db, None, MockFacets, MockPagination)
    assert "Cannot prepare a CachedFeed without a WorkList." in str(excinfo.value)

    # Basic Lane case, no facets or pagination.
    lane = self._lane()

    # The response object is a named tuple. feed_type, library and
    # lane_id are the only members set.
    keys = m(self._db, lane, None, None)
    assert "mock type" == keys.feed_type
    assert lane.library == keys.library
    assert None == keys.work
    assert lane.id == keys.lane_id
    assert None == keys.unique_key
    assert "" == keys.facets_key
    assert "" == keys.pagination_key

    # When pagination and/or facets are available, facets_key and
    # pagination_key are set appropriately.
    keys = m(self._db, lane, MockFacets, MockPagination)
    assert "facets query string" == keys.facets_key
    assert "pagination query string" == keys.pagination_key

    # Now we can check that feed_type was obtained by passing
    # `worklist` and `facets` into MockCachedFeed.feed_type.
    assert "mock type" == keys.feed_type
    assert (lane, MockFacets) == MockCachedFeed.feed_type_called_with

    # When a WorkList is used instead of a Lane, keys.lane_id is None
    # but keys.unique_key is set to worklist.unique_key.
    worklist = WorkList()
    worklist.initialize(
        library=self._default_library,
        display_name="wl",
        languages=["eng", "spa"],
        audiences=[Classifier.AUDIENCE_CHILDREN],
    )
    keys = m(self._db, worklist, None, None)
    assert "mock type" == keys.feed_type
    assert worklist.get_library(self._db) == keys.library
    assert None == keys.work
    assert None == keys.lane_id
    assert "wl-eng,spa-Children" == keys.unique_key
    assert keys.unique_key == worklist.unique_key
    assert "" == keys.facets_key
    assert "" == keys.pagination_key

    # When a WorkList is associated with a specific .work,
    # that information is included as keys.work.
    work = object()
    worklist.work = work
    keys = m(self._db, worklist, None, None)
    assert work == keys.work
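# Hedged sketch of the value _prepare_keys is assumed to return, based
# purely on the fields asserted above; the real named tuple is defined
# alongside CachedFeed and may differ in name.
from collections import namedtuple

CachedFeedKeys = namedtuple(
    "CachedFeedKeys",
    [
        "feed_type",
        "library",
        "work",
        "lane_id",
        "unique_key",
        "facets_key",
        "pagination_key",
    ],
)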
def test_response_format(self):
    # Verify that fetch() can be told to return an appropriate
    # OPDSFeedResponse object. This is the default behavior, since
    # it preserves some useful information that would otherwise be
    # lost.
    facets = Facets.default(self._default_library)
    pagination = Pagination.default()
    wl = WorkList()
    wl.initialize(self._default_library)

    def refresh():
        return "Here's a feed."

    private = object()
    r = CachedFeed.fetch(
        self._db, wl, facets, pagination, refresh, max_age=102, private=private
    )
    assert isinstance(r, OPDSFeedResponse)
    assert 200 == r.status_code
    assert OPDSFeed.ACQUISITION_FEED_TYPE == r.content_type
    assert 102 == r.max_age
    assert "Here's a feed." == str(r)

    # The extra argument `private`, not used by CachedFeed.fetch, was
    # passed on to the OPDSFeedResponse constructor.
    assert private == r.private

    # The CachedFeed was created; just not returned.
    cf = self._db.query(CachedFeed).one()
    assert "Here's a feed." == cf.content

    # Try it again as a cache hit.
    r = CachedFeed.fetch(
        self._db, wl, facets, pagination, refresh, max_age=102, private=private
    )
    assert isinstance(r, OPDSFeedResponse)
    assert 200 == r.status_code
    assert OPDSFeed.ACQUISITION_FEED_TYPE == r.content_type
    assert 102 == r.max_age
    assert "Here's a feed." == str(r)

    # If we tell CachedFeed to cache its feed 'forever', that only
    # applies to the _database_ cache. The client is told to cache
    # the feed for the default period.
    r = CachedFeed.fetch(
        self._db,
        wl,
        facets,
        pagination,
        refresh,
        max_age=CachedFeed.CACHE_FOREVER,
        private=private,
    )
    assert isinstance(r, OPDSFeedResponse)
    assert OPDSFeed.DEFAULT_MAX_AGE == r.max_age

    # If the Library associated with the WorkList used in the feed
    # has root lanes, `private` is always set to True, even if we
    # asked for the opposite.
    from core.model import Library

    Library._has_root_lane_cache[self._default_library.id] = True
    r = CachedFeed.fetch(self._db, wl, facets, pagination, refresh, private=False)
    assert isinstance(r, OPDSFeedResponse)
    assert True == r.private
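# Hedged sketch of the response-cache policy the assertions above describe
# (not the real CachedFeed code): CACHE_FOREVER applies only to the database
# cache, so clients fall back to the default max-age, and a library with
# root lanes always gets a private response.
def client_cache_settings(max_age, private, library_has_root_lanes,
                          default_max_age=OPDSFeed.DEFAULT_MAX_AGE):
    if max_age == CachedFeed.CACHE_FOREVER:
        max_age = default_max_age
    if library_has_root_lanes:
        private = True
    return max_age, private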
def test_no_race_conditions(self):
    # Why do we look up a CachedFeed again after feed generation?
    # Well, let's see what happens if someone else messes around
    # with the CachedFeed object _while the refresher is running_.
    #
    # This is a race condition that happens in real life. Rather
    # than setting up a multi-threaded test, we can have the
    # refresher itself simulate a background modification by
    # messing around with the CachedFeed object we know will
    # eventually be returned.
    #
    # The most up-to-date feed always wins, so background
    # modifications will take effect only if they made the
    # CachedFeed look _newer_ than the foreground process does.
    facets = Facets.default(self._default_library)
    pagination = Pagination.default()
    wl = WorkList()
    wl.initialize(self._default_library)

    m = CachedFeed.fetch

    # In this case, two simulated threads try to create the same
    # CachedFeed at the same time. We end up with a single
    # CachedFeed containing the result of the last code that ran.
    def simultaneous_refresher():
        # This refresher method simulates another thread creating
        # a CachedFeed for this feed while this thread's
        # refresher is running.
        def other_thread_refresher():
            return "Another thread made a feed."

        m(self._db, wl, facets, pagination, other_thread_refresher, 0, raw=True)

        return "Then this thread made a feed."

    # This will call simultaneous_refresher(), which will call
    # CachedFeed.fetch() _again_, which will call
    # other_thread_refresher().
    result = m(
        self._db, wl, facets, pagination, simultaneous_refresher, 0, raw=True
    )

    # We ended up with a single CachedFeed containing the
    # latest information.
    assert [result] == self._db.query(CachedFeed).all()
    assert "Then this thread made a feed." == result.content

    # If two threads contend for an existing CachedFeed, the one that
    # sets CachedFeed.timestamp to the later value wins.
    #
    # Here, the other thread wins by setting .timestamp on the
    # existing CachedFeed to a date in the future.
    now = utc_now()
    tomorrow = now + datetime.timedelta(days=1)
    yesterday = now - datetime.timedelta(days=1)

    def tomorrow_vs_now():
        result.content = "Someone in the background set tomorrow's content."
        result.timestamp = tomorrow
        return "Today's content can't compete."

    tomorrow_result = m(
        self._db, wl, facets, pagination, tomorrow_vs_now, 0, raw=True
    )
    assert tomorrow_result == result
    assert (
        "Someone in the background set tomorrow's content."
        == tomorrow_result.content
    )
    assert tomorrow_result.timestamp == tomorrow

    # Here, the other thread sets .timestamp to a date in the past, and
    # it loses out to the (apparently) newer feed.
    def yesterday_vs_now():
        result.content = "Someone in the background set yesterday's content."
        result.timestamp = yesterday
        return "Today's content is fresher."

    now_result = m(self._db, wl, facets, pagination, yesterday_vs_now, 0, raw=True)

    # We got the same CachedFeed we've been getting this whole
    # time, but the outdated data set by the 'background thread'
    # has been fixed.
    assert result == now_result
    assert "Today's content is fresher." == result.content
    assert result.timestamp > yesterday

    # This shouldn't happen, but if the CachedFeed's timestamp or
    # content are *cleared out* in the background, between the
    # time the CachedFeed is fetched and the time the refresher
    # finishes, then we don't know what's going on and we don't
    # take chances. We create a whole new CachedFeed object for
    # the updated version of the feed.

    # First, try the situation where .timestamp is cleared out in
    # the background.
    def timestamp_cleared_in_background():
        result.content = "Someone else sets content and clears timestamp."
        result.timestamp = None
        return "Non-weird content."

    result2 = m(
        self._db,
        wl,
        facets,
        pagination,
        timestamp_cleared_in_background,
        0,
        raw=True,
    )
    now = utc_now()

    # result2 is a brand new CachedFeed.
    assert result2 != result
    assert "Non-weird content." == result2.content
    assert (now - result2.timestamp).total_seconds() < 2

    # We let the background process do whatever it wants to do
    # with the old one.
    assert "Someone else sets content and clears timestamp." == result.content
    assert None == result.timestamp

    # Next, test the situation where .content is cleared out.
    def content_cleared_in_background():
        result2.content = None
        result2.timestamp = tomorrow
        return "Non-weird content."

    result3 = m(
        self._db, wl, facets, pagination, content_cleared_in_background, 0, raw=True
    )
    now = utc_now()

    # Again, a brand new CachedFeed.
    assert result3 != result2
    assert result3 != result
    assert "Non-weird content." == result3.content
    assert (now - result3.timestamp).total_seconds() < 2

    # Again, we let the background process have the old one for
    # whatever weird thing it wants to do.
    assert None == result2.content
    assert tomorrow == result2.timestamp
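# Hedged sketch of the conflict-resolution rule the test above exercises
# (the real logic lives inside CachedFeed.fetch): after the refresher runs,
# the row is re-read and the version with the newer timestamp wins; if the
# row was gutted in the background, a brand-new CachedFeed is created.
def resolve_refresh_conflict(existing, new_timestamp):
    # Names and return values are illustrative only.
    if existing.timestamp is None or existing.content is None:
        return "create a new CachedFeed"
    if existing.timestamp > new_timestamp:
        return "keep the background version"
    return "overwrite with the new content"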
def test_records(self):
    integration = self._integration()
    now = utc_now()
    exporter = MARCExporter.from_config(self._default_library)
    annotator = Annotator()
    lane = self._lane("Test Lane", genres=["Mystery"])
    w1 = self._work(genre="Mystery", with_open_access_download=True)
    w2 = self._work(genre="Mystery", with_open_access_download=True)

    search_engine = MockExternalSearchIndex()
    search_engine.bulk_update([w1, w2])

    # If there's a storage protocol but no corresponding storage
    # integration, it raises an exception.
    pytest.raises(Exception, exporter.records, lane, annotator)

    # If there is a storage integration, the output file is mirrored.
    mirror_integration = self._external_integration(
        ExternalIntegration.S3,
        ExternalIntegration.STORAGE_GOAL,
        username="******",
        password="******",
    )

    mirror = MockS3Uploader()
    exporter.records(
        lane,
        annotator,
        mirror_integration,
        mirror=mirror,
        query_batch_size=1,
        upload_batch_size=1,
        search_engine=search_engine,
    )

    # The file was mirrored and a CachedMARCFile was created to
    # track the mirrored file.
    assert 1 == len(mirror.uploaded)
    [cache] = self._db.query(CachedMARCFile).all()
    assert self._default_library == cache.library
    assert lane == cache.lane
    assert mirror.uploaded[0] == cache.representation
    assert None == cache.representation.content
    assert (
        "https://test-marc-bucket.s3.amazonaws.com/%s/%s/%s.mrc"
        % (
            self._default_library.short_name,
            quote(str(cache.representation.fetched_at)),
            quote(lane.display_name),
        )
        == mirror.uploaded[0].mirror_url
    )
    assert None == cache.start_time
    assert cache.end_time > now

    # The content was uploaded in two parts.
    assert 2 == len(mirror.content[0])
    complete_file = b"".join(mirror.content[0])
    records = list(MARCReader(complete_file))
    assert 2 == len(records)

    title_fields = [record.get_fields("245") for record in records]
    titles = [fields[0].get_subfields("a")[0] for fields in title_fields]
    assert set([w1.title, w2.title]) == set(titles)

    assert w1.title in w1.marc_record
    assert w2.title in w2.marc_record

    self._db.delete(cache)

    # It also works with a WorkList instead of a Lane, in which case
    # there will be no lane in the CachedMARCFile.
    worklist = WorkList()
    worklist.initialize(self._default_library, display_name="All Books")

    mirror = MockS3Uploader()
    exporter.records(
        worklist,
        annotator,
        mirror_integration,
        mirror=mirror,
        query_batch_size=1,
        upload_batch_size=1,
        search_engine=search_engine,
    )

    assert 1 == len(mirror.uploaded)
    [cache] = self._db.query(CachedMARCFile).all()
    assert self._default_library == cache.library
    assert None == cache.lane
    assert mirror.uploaded[0] == cache.representation
    assert None == cache.representation.content
    assert (
        "https://test-marc-bucket.s3.amazonaws.com/%s/%s/%s.mrc"
        % (
            self._default_library.short_name,
            quote(str(cache.representation.fetched_at)),
            quote(worklist.display_name),
        )
        == mirror.uploaded[0].mirror_url
    )
    assert None == cache.start_time
    assert cache.end_time > now

    assert 2 == len(mirror.content[0])
    complete_file = b"".join(mirror.content[0])
    records = list(MARCReader(complete_file))
    assert 2 == len(records)

    self._db.delete(cache)

    # If a start time is set, it's used in the mirror url.
    #
    # (Our mock search engine returns everything in its 'index',
    # so this doesn't test that the start time is actually used to
    # find works -- that's in the search index tests and the
    # tests of MARCExporterFacets.)
    start_time = now - datetime.timedelta(days=3)

    mirror = MockS3Uploader()
    exporter.records(
        lane,
        annotator,
        mirror_integration,
        start_time=start_time,
        mirror=mirror,
        query_batch_size=2,
        upload_batch_size=2,
        search_engine=search_engine,
    )
    [cache] = self._db.query(CachedMARCFile).all()

    assert self._default_library == cache.library
    assert lane == cache.lane
    assert mirror.uploaded[0] == cache.representation
    assert None == cache.representation.content
    assert (
        "https://test-marc-bucket.s3.amazonaws.com/%s/%s-%s/%s.mrc"
        % (
            self._default_library.short_name,
            quote(str(start_time)),
            quote(str(cache.representation.fetched_at)),
            quote(lane.display_name),
        )
        == mirror.uploaded[0].mirror_url
    )
    assert start_time == cache.start_time
    assert cache.end_time > now
    self._db.delete(cache)

    # If the search engine returns no contents for the lane,
    # nothing will be mirrored, but a CachedMARCFile is still
    # created to track that we checked for updates.
    empty_search_engine = MockExternalSearchIndex()

    mirror = MockS3Uploader()
    exporter.records(
        lane,
        annotator,
        mirror_integration,
        mirror=mirror,
        search_engine=empty_search_engine,
    )

    assert [] == mirror.content[0]
    [cache] = self._db.query(CachedMARCFile).all()
    assert cache.representation == mirror.uploaded[0]
    assert self._default_library == cache.library
    assert lane == cache.lane
    assert None == cache.representation.content
    assert None == cache.start_time
    assert cache.end_time > now

    self._db.delete(cache)
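# Hedged sketch of how the mirrored file's S3 key appears to be composed,
# based only on the URLs asserted above (the bucket name is a test fixture;
# this is not the exporter's actual key-building code).
def marc_mirror_key(library, lane_or_worklist, fetched_at, start_time=None):
    if start_time is not None:
        timestamp = "%s-%s" % (quote(str(start_time)), quote(str(fetched_at)))
    else:
        timestamp = quote(str(fetched_at))
    return "%s/%s/%s.mrc" % (
        library.short_name,
        timestamp,
        quote(lane_or_worklist.display_name),
    )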