def sample_results(self): m1 = Metadata('The Great Gatsby', ['Francis Scott Fitzgerald']) m2 = Metadata('The Great Gatsby', ['F. Scott Fitzgerald']) m1.has_cached_cover_url = True m2.has_cached_cover_url = False m1.comments = 'Some comments '*10 m1.tags = ['tag%d'%i for i in range(20)] m1.rating = 4.4 m1.language = 'en' m2.language = 'fr' m1.pubdate = utcnow() m2.pubdate = fromordinal(1000000) m1.publisher = 'Publisher 1' m2.publisher = 'Publisher 2' return [m1, m2]
def sample_results(self): m1 = Metadata('The Great Gatsby', ['Francis Scott Fitzgerald']) m2 = Metadata('The Great Gatsby - An extra long title to test resizing', ['F. Scott Fitzgerald']) m1.has_cached_cover_url = True m2.has_cached_cover_url = False m1.comments = 'Some comments '*10 m1.tags = ['tag%d'%i for i in range(20)] m1.rating = 4.4 m1.language = 'en' m2.language = 'fr' m1.pubdate = utcnow() m2.pubdate = fromordinal(1000000) m1.publisher = 'Publisher 1' m2.publisher = 'Publisher 2' return [m1, m2]
def sample_results(self): m1 = Metadata("The Great Gatsby", ["Francis Scott Fitzgerald"]) m2 = Metadata("The Great Gatsby - An extra long title to test resizing", ["F. Scott Fitzgerald"]) m1.has_cached_cover_url = True m2.has_cached_cover_url = False m1.comments = "Some comments " * 10 m1.tags = ["tag%d" % i for i in range(20)] m1.rating = 4.4 m1.language = "en" m2.language = "fr" m1.pubdate = utcnow() m2.pubdate = fromordinal(1000000) m1.publisher = "Publisher 1" m2.publisher = "Publisher 2" return [m1, m2]
def merge(self, results, min_year, do_asr=True): ans = Metadata(_('Unknown')) # We assume the shortest title has the least cruft in it ans.title = self.length_merge('title', results, null_value=ans.title) # No harm in having extra authors, maybe something useful like an # editor or translator ans.authors = self.length_merge('authors', results, null_value=ans.authors, shortest=False) # We assume the shortest publisher has the least cruft in it ans.publisher = self.length_merge('publisher', results, null_value=ans.publisher) # We assume the smallest set of tags has the least cruft in it ans.tags = self.length_merge('tags', results, null_value=ans.tags, shortest=msprefs['fewer_tags']) # We assume the longest series has the most info in it ans.series = self.length_merge('series', results, null_value=ans.series, shortest=False) for r in results: if r.series and r.series == ans.series: ans.series_index = r.series_index break # Average the rating over all sources ratings = [] for r in results: rating = r.rating if rating and rating > 0 and rating <= 5: ratings.append(rating) if ratings: ans.rating = int(round(sum(ratings)/len(ratings))) # Smallest language is likely to be valid ans.language = self.length_merge('language', results, null_value=ans.language) # Choose longest comments ans.comments = self.length_merge('comments', results, null_value=ans.comments, shortest=False) # Published date if min_year: for r in results: year = getattr(r.pubdate, 'year', None) if year == min_year: ans.pubdate = r.pubdate break if getattr(ans.pubdate, 'year', None) == min_year: min_date = datetime(min_year, ans.pubdate.month, ans.pubdate.day, tzinfo=utc_tz) else: min_date = datetime(min_year, 1, 2, tzinfo=utc_tz) ans.pubdate = min_date else: min_date = datetime(3001, 1, 1, tzinfo=utc_tz) for r in results: if r.pubdate is not None: candidate = as_utc(r.pubdate) if candidate < min_date: min_date = candidate if min_date.year < 3000: ans.pubdate = min_date # Identifiers for r in results: ans.identifiers.update(r.identifiers) # Cover URL ans.has_cached_cover_url = bool([r for r in results if getattr(r, 'has_cached_cover_url', False)]) # Merge any other fields with no special handling (random merge) touched_fields = set() for r in results: if hasattr(r, 'identify_plugin'): touched_fields |= r.identify_plugin.touched_fields for f in touched_fields: if f.startswith('identifier:') or not ans.is_null(f): continue setattr(ans, f, self.random_merge(f, results, null_value=getattr(ans, f))) if do_asr: avg = [x.relevance_in_source for x in results] avg = sum(avg)/len(avg) ans.average_source_relevance = avg return ans