forked from NYPL-Simplified/metadata_wrangler
/
coverage.py
430 lines (363 loc) · 16.4 KB
/
coverage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
from nose.tools import set_trace
from core.coverage import (
CoverageFailure,
CatalogCoverageProvider,
)
from core.metadata_layer import (
ReplacementPolicy,
)
from core.model import (
Collection,
CoverageRecord,
DataSource,
Edition,
ExternalIntegration,
get_one_or_create,
Identifier,
PresentationCalculationPolicy,
)
from core.overdrive import OverdriveAPI
from core.s3 import (
S3Uploader,
)
from core.util import fast_query_count
from overdrive import (
OverdriveBibliographicCoverageProvider,
OverdriveCoverImageMirror,
)
from content_cafe import (
ContentCafeCoverageProvider,
)
from content_server import (
LookupClientCoverageProvider,
)
from oclc_classify import (
OCLCClassifyCoverageProvider,
)
from mirror import ImageScaler
from oclc import (
LinkedDataCoverageProvider,
)
from viaf import (
VIAFClient,
)
class IdentifierResolutionCoverageProvider(CatalogCoverageProvider):
    """Make sure all Identifiers registered as needing coverage by this
    CoverageProvider become Works with Editions and (probably dummy)
    LicensePools.

    Coverage happens by running the Identifier through _other_
    CoverageProviders, filling in the blanks with additional data from
    third-party entities.

    For ISBNs, we end up with a bunch of Resources, rather than
    Works. TODO: This needs to change.
    """

    SERVICE_NAME = "Identifier Resolution Coverage Provider"
    DATA_SOURCE_NAME = DataSource.INTERNAL_PROCESSING

    # Only Identifiers of these types are eligible for resolution.
    INPUT_IDENTIFIER_TYPES = [
        Identifier.OVERDRIVE_ID, Identifier.ISBN, Identifier.URI,
        Identifier.GUTENBERG_ID
    ]
    OPERATION = CoverageRecord.RESOLVE_IDENTIFIER_OPERATION

    # Canned failure messages.
    LICENSE_SOURCE_NOT_ACCESSIBLE = (
        "Could not access underlying license source over the network.")
    UNKNOWN_FAILURE = "Unknown failure."

    # Name of the Collection used for Overdrive API access. It must
    # already be configured -- see create_overdrive_api().
    DEFAULT_OVERDRIVE_COLLECTION_NAME = u'Default Overdrive'

    def __init__(
            self, collection, uploader=None, viaf_client=None,
            linked_data_coverage_provider=None, content_cafe_api=None,
            overdrive_api_class=OverdriveAPI, **kwargs
    ):
        """Constructor.

        :param collection: Cover the Identifiers in this Collection's
            catalog.
        :param uploader: An S3Uploader for mirroring resources. If not
            provided, one is created from site configuration.
        :param viaf_client: Optional VIAFClient, for tests.
        :param linked_data_coverage_provider: Optional
            LinkedDataCoverageProvider, for tests.
        :param content_cafe_api: Optional Content Cafe API object,
            passed through to ContentCafeCoverageProvider.
        :param overdrive_api_class: Class (or mock) used to build the
            Overdrive API client.
        """
        super(IdentifierResolutionCoverageProvider, self).__init__(
            collection, **kwargs
        )

        # Since we are the metadata wrangler, any resources we find,
        # we mirror to S3.
        if not uploader:
            uploader = S3Uploader.from_config(self._db)
        self.uploader = uploader

        # We're going to be aggressive about recalculating the presentation
        # for this work because either the work is currently not set up
        # at all, or something went wrong trying to set it up.
        self.policy = PresentationCalculationPolicy(
            regenerate_opds_entries=True
        )

        self.overdrive_api = self.create_overdrive_api(overdrive_api_class)
        self.content_cafe_api = content_cafe_api

        # Determine the optional and required coverage providers.
        # Each Identifier in this Collection's catalog will be run
        # through all relevant providers.
        #
        # NOTE: providers() returns (optional, required) -- unpack in
        # that order. The original code unpacked the tuple in the
        # opposite order, which made all required providers optional
        # and vice versa.
        (self.optional_coverage_providers,
         self.required_coverage_providers) = self.providers()

        # When we need to look up a contributor via VIAF we will use this
        # client.
        self.viaf_client = viaf_client or VIAFClient(self._db)

        # Books are not looked up in OCLC Linked Data directly, since
        # there is no Collection that identifies a book by its OCLC Number.
        # However, when a book is looked up through OCLC Classify, some
        # OCLC Numbers may be associated with it, and _those_ numbers
        # can be run through OCLC Linked Data.
        #
        # TODO: We get many books identified by ISBN, and those books
        # _could_ be run through a LinkedDataCoverageProvider if it
        # worked a little differently. However, I don't think this
        # would be very useful, since those books will get looked up
        # through OCLC Classify, which will probably result in us
        # finding that same ISBN via OCLC Number.
        self.oclc_linked_data = (
            linked_data_coverage_provider or
            LinkedDataCoverageProvider(self._db, viaf_api=self.viaf_client)
        )

        # The ordinary OverdriveBibliographicCoverageProvider
        # doesn't upload images, so we need to create our own
        # mirror and scaler.
        #
        # TODO: This class would be neater if we were to subclass
        # OverdriveBibliographicCoverageProvider to do the scaling and
        # uploading.
        self.image_mirrors = {
            DataSource.OVERDRIVE : OverdriveCoverImageMirror(
                self._db, uploader=uploader
            )
        }
        self.image_scaler = ImageScaler(
            self._db, self.image_mirrors.values(), uploader=uploader
        )

    def create_overdrive_api(self, overdrive_api_class):
        """Build an Overdrive API client from the preconfigured default
        Overdrive Collection.

        :param overdrive_api_class: Class (or mock) to instantiate.
        :raise ValueError: If the default Overdrive Collection does not
            already exist -- it must be configured out of band.
        :return: An instance of `overdrive_api_class`.
        """
        collection, is_new = Collection.by_name_and_protocol(
            self._db, self.DEFAULT_OVERDRIVE_COLLECTION_NAME,
            ExternalIntegration.OVERDRIVE
        )
        if is_new:
            # by_name_and_create just created a stub Collection, which
            # means there was no preexisting configuration to use.
            raise ValueError('Default Overdrive collection has not been configured.')
        return overdrive_api_class(self._db, collection)

    def providers(self):
        """Instantiate required and optional CoverageProviders.

        All Identifiers in this Collection's catalog will be run
        through each provider. If an optional provider fails, nothing
        will happen.  If a required provider fails, the coverage
        operation as a whole will fail.

        NOTE: This method creates CoverageProviders that go against
        real servers. Because of this, tests must use a subclass that
        mocks providers(), such as
        MockIdentifierResolutionCoverageProvider.

        :return: A 2-tuple (optional, required) of lists of
            CoverageProviders.
        """
        # All books must be run through Content Cafe and OCLC
        # Classify, assuming their identifiers are of the right
        # type.
        content_cafe = ContentCafeCoverageProvider(
            self._db, api=self.content_cafe_api, uploader=self.uploader
        )
        oclc_classify = OCLCClassifyCoverageProvider(self._db)

        optional = []
        required = [content_cafe, oclc_classify]

        # All books derived from OPDS import against the open-access
        # content server must be looked up in that server.
        #
        # TODO: This could stand some generalization. Any OPDS server
        # that also supports the lookup protocol can be used here.
        if (self.collection.protocol == ExternalIntegration.OPDS_IMPORT
            and self.collection.data_source
            and self.collection.data_source.name == DataSource.OA_CONTENT_SERVER):
            required.append(LookupClientCoverageProvider(self.collection))

        # All books obtained from Overdrive must be looked up via the
        # Overdrive API.
        if self.collection.protocol == ExternalIntegration.OVERDRIVE:
            required.append(
                OverdriveBibliographicCoverageProvider(
                    self.collection, api_class=self.overdrive_api
                )
            )
        return optional, required

    def items_that_need_coverage(self, identifiers=None, **kwargs):
        """Find all identifiers lacking coverage from this CoverageProvider.

        Only identifiers that have CoverageRecords in the 'transient
        failure' state will be returned. Unlike with other
        CoverageProviders, Identifiers that have no CoverageRecord at
        all will not be processed.

        :param identifiers: Restrict the query to these Identifiers,
            if given.
        :return: A query of Identifiers.
        """
        qu = super(IdentifierResolutionCoverageProvider, self).items_that_need_coverage(
            identifiers=identifiers, **kwargs
        )
        # Requiring a CoverageRecord to exist excludes Identifiers
        # that were never registered for coverage.
        qu = qu.filter(CoverageRecord.id != None)
        return qu

    def process_item(self, identifier):
        """For this identifier, checks that it has all of the available
        3rd party metadata, and if not, obtains it.

        If metadata failed to be obtained, and the coverage was deemed
        required, then returns a CoverageFailure.

        :param identifier: The Identifier to resolve.
        :return: The Identifier on success, or a CoverageFailure.
        """
        self.log.info("Ensuring coverage for %r", identifier)

        # Make sure there's a LicensePool for this Identifier in this
        # Collection. Since we're the metadata wrangler, the
        # LicensePool will probably be a stub that doesn't actually
        # represent the right to loan the book, but that's okay.
        license_pool = self.license_pool(identifier)
        if not license_pool.licenses_owned:
            license_pool.update_availability(1, 1, 0, 0)

        # Go through all relevant providers and try to ensure coverage.
        failure = self.run_through_relevant_providers(
            identifier, self.required_coverage_providers,
            fail_on_any_failure=True
        )
        if failure:
            return failure

        # Now go through relevant optional providers and try to ensure
        # coverage.
        failure = self.run_through_relevant_providers(
            identifier, self.optional_coverage_providers,
            fail_on_any_failure=False
        )
        if failure:
            return failure

        # We got coverage from all the required coverage providers,
        # and none of the optional coverage providers raised an exception,
        # so we're ready.
        try:
            self.finalize(identifier)
        except Exception as e:
            return self.transform_exception_into_failure(e, identifier)

        return identifier

    def run_through_relevant_providers(self, identifier, providers,
                                       fail_on_any_failure):
        """Run the given Identifier through a set of CoverageProviders.

        :param identifier: Process this Identifier.
        :param providers: Run `identifier` through every relevant
            CoverageProvider in this list.
        :param fail_on_any_failure: True means that each
            CoverageProvider must succeed or the whole operation
            fails. False means that if a CoverageProvider fails it's
            not a deal-breaker.
        :return: A CoverageFailure if there was an unrecoverable failure,
            None if everything went okay.
        """
        for provider in providers:
            if (provider.input_identifier_types
                and not identifier.type in provider.input_identifier_types):
                # The CoverageProvider under consideration doesn't
                # handle Identifiers of this type.
                continue
            try:
                record = provider.ensure_coverage(identifier, force=True)
                if fail_on_any_failure and record.exception:
                    # As the CoverageProvider under consideration has
                    # fallen, so must this CoverageProvider also fall.
                    error_msg = "500: " + record.exception
                    transient = (
                        record.status == CoverageRecord.TRANSIENT_FAILURE
                    )
                    return self.failure(
                        identifier, error_msg, transient=transient
                    )
            except Exception as e:
                # An uncaught exception becomes a CoverageFailure no
                # matter what.
                return self.transform_exception_into_failure(e, identifier)

        # Return None to indicate success.
        return None

    def transform_exception_into_failure(self, error, identifier):
        """Log the given exception and convert it into a transient
        CoverageFailure for the given Identifier.

        :param error: The exception that was raised.
        :param identifier: The Identifier being processed when it was raised.
        :return: A transient CoverageFailure.
        """
        self.log.warn(
            "Error completing coverage for %r: %r", identifier, error,
            exc_info=error
        )
        return self.failure(identifier, repr(error), transient=True)

    def finalize(self, identifier):
        """Sets equivalent identifiers from OCLC and processes the work."""
        self.resolve_equivalent_oclc_identifiers(identifier)
        if identifier.type==Identifier.ISBN:
            # In order to create Works for ISBNs, we first have to
            # create an edition associated with the ISBN as a primary
            # identifier. At the moment, this is achieved via OCLC
            # Linked Data.
            self.generate_edition(identifier)
        self.process_work(identifier)

    def generate_edition(self, identifier):
        """Utilizes an ISBN's equivalent identifiers (OCLC Number or Work IDs)
        to set an appropriate LicensePool presentation edition so a Work can
        later be created.

        :param identifier: An ISBN-type Identifier.
        """
        equivalent_ids = identifier.equivalent_identifier_ids()[identifier.id]

        # Get the editions of equivalent identifiers (OCLC Number or Work IDs)
        # to set as a presentation edition. These editions can be lower quality,
        # and it's important that they have a title.
        titled_equivalent_editions = self._db.query(Edition).\
            join(Edition.primary_identifier).\
            filter(Identifier.id.in_(equivalent_ids)).\
            filter(Edition.title!=None)

        # It's preferable that they have an author, too.
        authored_equivalent_editions = titled_equivalent_editions.filter(
            Edition.author!=None, Edition.author!=Edition.UNKNOWN_AUTHOR
        )

        if fast_query_count(authored_equivalent_editions):
            # Prioritize editions with both a title and an author if available.
            equivalent_editions = authored_equivalent_editions.all()
        else:
            equivalent_editions = titled_equivalent_editions.all()

        if equivalent_editions:
            # Set the presentation edition.
            pool = identifier.licensed_through[0]
            pool.set_presentation_edition(
                equivalent_editions=equivalent_editions
            )

    def process_work(self, identifier):
        """Fill in VIAF data and cover images where possible before setting
        a previously-unresolved identifier's work as presentation ready.

        TODO: I think this should be split into a separate
        WorkCoverageProvider which runs last. That way we have a record
        of which Works have had this service.

        :param identifier: The Identifier whose Work is to be calculated.
        :raise RuntimeError: If no Work could be calculated.
        """
        work = None
        license_pools = identifier.licensed_through
        if license_pools:
            pool = license_pools[0]
            work, created = pool.calculate_work(
                even_if_no_author=True, exclude_search=True
            )
        if work:
            self.resolve_viaf(work)
            self.resolve_cover_image(work)

            work.calculate_presentation(
                policy=self.policy, exclude_search=True
            )
            work.set_presentation_ready(exclude_search=True)
        else:
            # Use the same "500: " status prefix as
            # run_through_relevant_providers. (The original code used
            # an inconsistent "500; " here.)
            error_msg = "500: " + "Work could not be calculated for %r" % identifier
            raise RuntimeError(error_msg)

    def resolve_equivalent_oclc_identifiers(self, identifier):
        """Ensures OCLC coverage for an identifier.

        This has to be called after the OCLCClassify coverage is run to confirm
        that equivalent OCLC identifiers are available.

        :param identifier: Run this Identifier's OCLC-equivalent
            identifiers through OCLC Linked Data.
        """
        oclc_ids = set()
        if identifier.type == Identifier.ISBN:
            # ISBNs won't have editions, so they should be run through OCLC
            # to retrieve basic edition data (title, author).
            oclc_ids.add(identifier)

        types = [Identifier.OCLC_WORK, Identifier.OCLC_NUMBER, Identifier.ISBN]
        for edition in identifier.primarily_identifies:
            oclc_ids = oclc_ids.union(
                edition.equivalent_identifiers(type=types)
            )
        for oclc_id in oclc_ids:
            self.log.info("Currently processing equivalent identifier: %r", oclc_id)
            self.oclc_linked_data.ensure_coverage(oclc_id)

    def resolve_viaf(self, work):
        """Get VIAF data on all contributors.

        :param work: Look up contributors for this Work's presentation
            editions.
        """
        for pool in work.license_pools:
            edition = pool.presentation_edition
            if not edition:
                continue
            for contributor in edition.contributors:
                self.viaf_client.process_contributor(contributor)
                if not contributor.display_name:
                    # Fall back to names derived from the sort name.
                    contributor.family_name, contributor.display_name = (
                        contributor.default_names())

    def resolve_cover_image(self, work):
        """Make sure we have the cover for all editions.

        Mirrors and scales cover images for any edition whose data
        source has a configured image mirror (currently Overdrive only).

        :param work: Process this Work's presentation editions.
        """
        for pool in work.license_pools:
            edition = pool.presentation_edition
            data_source_name = pool.data_source.name
            if data_source_name in self.image_mirrors:
                self.image_mirrors[data_source_name].mirror_edition(edition)
            self.image_scaler.scale_edition(edition)