def test_identity(self):
    """Titles that differ only in word order, case or punctuation
    must be reported as a perfect match.
    """
    equivalent_pairs = [
        # Exactly the same string.
        ("foo bar", "foo bar"),
        # Same words, reordered, with extra punctuation.
        ("foo bar", "bar, foo"),
        # Same words, different case, trailing punctuation.
        ("foo bar.", "FOO BAR"),
    ]
    for first, second in equivalent_pairs:
        similarity = MetadataSimilarity.title_similarity(first, second)
        assert similarity == 1
def set_equivalence(self, identifier, metadata):
    """Identify the OCLC Number with the OCLC Work.

    Records that ``identifier`` is equivalent to the primary identifier
    of ``metadata``, attributed to ``self.output_source``.

    The strength of the equivalency is the best score found across the
    editions ``identifier`` primarily identifies, where each score is
    80% title similarity plus 20% overlap between the editions' and the
    metadata's contributor VIAF sets (as computed by
    ``MetadataSimilarity._proportion``). When there are no such editions
    the strength is 1. Nothing is recorded unless the strength is
    greater than zero.
    """
    editions = identifier.primarily_identifies
    if not editions:
        # There is nothing to compare against, so take the
        # equivalency at face value.
        best = 1
    else:
        best = 0
        for edition in editions:
            title_score = (
                MetadataSimilarity.title_similarity(
                    metadata.title, edition.title
                )
                if metadata.title
                else 0
            )
            author_score = MetadataSimilarity._proportion(
                {c.viaf for c in edition.contributors if c.viaf},
                {c.viaf for c in metadata.contributors if c.viaf},
            )
            # Keep the highest combined score seen so far.
            best = max(best, (title_score * 0.8) + (author_score * 0.2))

    if not best > 0:
        return

    primary_identifier, _ = metadata.primary_identifier.load(self._db)
    identifier.equivalent_to(self.output_source, primary_identifier, best)
def set_equivalence(self, identifier, metadata):
    """Identify the OCLC Number with the OCLC Work.

    Links ``identifier`` to the primary identifier of ``metadata``,
    crediting ``self.data_source``. Each edition that ``identifier``
    primarily identifies is scored as 0.8 * title similarity +
    0.2 * the ``MetadataSimilarity._proportion`` of the two contributor
    VIAF sets; the highest score is used as the link strength. With no
    such editions the strength defaults to 1. A strength of zero means
    no link is created.
    """
    candidates = identifier.primarily_identifies
    # No editions to compare against means full confidence; otherwise
    # start from zero and look for the best-scoring edition.
    strength = 0 if candidates else 1

    for candidate in candidates or ():
        if metadata.title:
            title_part = MetadataSimilarity.title_similarity(
                metadata.title, candidate.title)
        else:
            title_part = 0

        candidate_viafs = {
            contributor.viaf
            for contributor in candidate.contributors
            if contributor.viaf
        }
        metadata_viafs = {
            contributor.viaf
            for contributor in metadata.contributors
            if contributor.viaf
        }
        author_part = MetadataSimilarity._proportion(
            candidate_viafs, metadata_viafs)

        combined = (title_part * 0.8) + (author_part * 0.2)
        strength = max(strength, combined)

    if strength > 0:
        loaded = metadata.primary_identifier.load(self._db)
        primary_identifier, _is_new = loaded
        identifier.equivalent_to(
            self.data_source, primary_identifier, strength)
def _arrange_by_confidence_level(self, title, *other_titles):
    """Group ``other_titles`` by how closely each one resembles ``title``.

    Similarity is ``1 - histogram_distance`` with the articles "the",
    "a" and "an" passed as stopwords. Each title is filed under the
    highest of the levels 1, 0.8, 0.5, 0.25, 0 that its similarity
    reaches; a title that reaches none of them is left out.

    :return: a ``defaultdict(list)`` mapping confidence level to the
        titles filed under it, in the order they were given.
    """
    ignored_words = {"the", "a", "an"}
    # Ordered from strictest to loosest so the first hit is the best one.
    levels = (1, 0.8, 0.5, 0.25, 0)

    by_level = defaultdict(list)
    for candidate in other_titles:
        similarity = 1 - MetadataSimilarity.histogram_distance(
            [title], [candidate], ignored_words
        )
        best_level = next(
            (level for level in levels if similarity >= level), None
        )
        if best_level is not None:
            by_level[best_level].append(candidate)
    return by_level
def _extract_basic_info(cls, _db, tag, existing_authors=None, **restrictions):
    """Extract information common to work tag and edition tag.

    :param _db: Passed through to ``cls.parse_author_string``.
    :param tag: Object exposing ``get()`` and ``keys()``; its 'title',
        'author' and 'language' values are read.
    :param existing_authors: Passed through to
        ``cls.parse_author_string``.
    :param restrictions: Optional filters:

        * ``title`` -- the record's title must resemble this title.
        * ``title_similarity`` -- minimum acceptable title similarity
          (default 0.25).
        * ``language`` -- a single language, or an iterable of
          languages, the record's language must be one of. Ignored
          when the record does not state a language.
        * ``authors`` -- names or ``Contributor`` objects, one of which
          must be the record's primary author.

    :return: A 3-tuple ``(title, authors_and_roles, language)``, or
        None if the record fails any of the restrictions.
    """
    title = tag.get('title')
    author_string = tag.get('author')
    authors_and_roles = cls.parse_author_string(
        _db, author_string, existing_authors)
    if 'language' in tag.keys():
        language = tag.get('language')
    else:
        language = None

    if title and 'title' in restrictions:
        must_resemble_title = restrictions['title']
        threshold = restrictions.get('title_similarity', 0.25)
        similarity = MetadataSimilarity.title_similarity(
            must_resemble_title, title)
        if similarity < threshold:
            # The title of the book under consideration is not
            # similar enough to the given title.
            cls.log.debug(
                "FAILURE TO RESEMBLE: %s vs %s (%.2f)",
                title, must_resemble_title, similarity
            )
            return None

        # The semicolon is frequently used to separate multiple
        # works in an anthology. If there is no semicolon in the
        # original title, do not consider titles that contain
        # semicolons.
        if (' ; ' not in must_resemble_title
                and ' ; ' in title
                and threshold > 0):
            cls.log.debug(
                "SEMICOLON DISQUALIFICATION: %s", title
            )
            return None

    # Apply restrictions. If they're not met, return None.
    if 'language' in restrictions and language:
        # We know which language this record is for. Match it
        # against the language used in the Edition we're
        # matching against.
        restrict_to_language = restrictions['language']
        if isinstance(restrict_to_language, str):
            # A bare string is one language, not an iterable of
            # single-character languages.
            restrict_to_language = {restrict_to_language}
        else:
            restrict_to_language = set(restrict_to_language)
        # Membership test: comparing the language string to the set
        # with != would be true for every record.
        if language not in restrict_to_language:
            # This record is for a book in a different language
            cls.log.debug(
                "WRONG LANGUAGE: %s", language
            )
            return None

    if 'authors' in restrictions:
        restrict_to_authors = restrictions['authors']
        if restrict_to_authors and isinstance(
                restrict_to_authors[0], Contributor):
            restrict_to_authors = [
                x.sort_name for x in restrict_to_authors]
        primary_author = None

        for a, roles in authors_and_roles:
            if Contributor.PRIMARY_AUTHOR_ROLE in roles:
                primary_author = a
                break
        if (not primary_author
            or (primary_author not in restrict_to_authors
                and primary_author.sort_name not in restrict_to_authors)):
            # None of the given authors showed up as the
            # primary author of this book. They may have had
            # some other role in it, or the book may be about
            # them, or incorporate their work, but this book
            # is not *by* them.
            return None

    return title, authors_and_roles, language
def test_histogram_distance(self):
    """Exercise histogram_distance on trivial and real-world title sets."""
    # Same words overall, regardless of order, case and punctuation:
    # the histograms are identical, so the distance is 0.
    same_words_a = ["The First Title", "The Second Title"]
    same_words_b = ["title the second", "FIRST, THE TITLE"]
    assert 0 == MetadataSimilarity.histogram_distance(
        same_words_a, same_words_b
    )

    # No word in common at all: the distance is the maximum, 1.
    disjoint_a = ["These Words Have Absolutely"]
    disjoint_b = ["Nothing In Common, Really"]
    assert 1 == MetadataSimilarity.histogram_distance(
        disjoint_a, disjoint_b
    )

    # A hard real-world case. "Tom Sawyer Abroad" and
    # "Tom Sawyer, Detective" are different books by the same
    # author whose titles differ by a single word. They are often
    # anthologized together (so OCLC maps them to many of the same
    # titles) and often bundled with other stories (which adds
    # unrelated words to the titles).
    abroad = [
        "Tom Sawyer abroad",
        "The adventures of Tom Sawyer, Tom Sawyer abroad [and] Tom Sawyer, detective",
        "Tom Sawyer abroad",
        "Tom Sawyer abroad",
        "Tom Sawyer Abroad",
        "Tom Sawyer Abroad",
        "Tom Sawyer Abroad",
        "Tom Sawyer abroad : and other stories",
        "Tom Sawyer abroad Tom Sawyer, detective : and other stories, etc. etc.",
        "Tom Sawyer abroad",
        "Tom Sawyer abroad",
        "Tom Sawyer abroad",
        "Tom Sawyer abroad and other stories",
        "Tom Sawyer abroad and other stories",
        "Tom Sawyer abroad and the American claimant,",
        "Tom Sawyer abroad and the American claimant",
        "Tom Sawyer abroad : and The American claimant: novels.",
        "Tom Sawyer abroad : and The American claimant: novels.",
        "Tom Sawyer Abroad - Tom Sawyer, Detective",
    ]

    detective = [
        "Tom Sawyer, Detective",
        "Tom Sawyer Abroad - Tom Sawyer, Detective",
        "Tom Sawyer Detective : As Told by Huck Finn : And Other Tales.",
        "Tom Sawyer, Detective",
        "Tom Sawyer, Detective.",
        "The adventures of Tom Sawyer, Tom Sawyer abroad [and] Tom Sawyer, detective",
        "Tom Sawyer detective : and other stories every child should know",
        "Tom Sawyer, detective : as told by Huck Finn and other tales",
        "Tom Sawyer, detective, as told by Huck Finn and other tales...",
        "The adventures of Tom Sawyer, Tom Sawyer abroad [and] Tom Sawyer, detective,",
        "Tom Sawyer abroad, Tom Sawyer, detective, and other stories",
        "Tom Sawyer, detective",
        "Tom Sawyer, detective",
        "Tom Sawyer, detective",
        "Tom Sawyer, detective",
        "Tom Sawyer, detective",
        "Tom Sawyer, detective",
        "Tom Sawyer abroad Tom Sawyer detective",
        "Tom Sawyer, detective : as told by Huck Finn",
        "Tom Sawyer : detective",
    ]

    # The distance must not depend on argument order, allowing a
    # small tolerance for floating-point rounding.
    forward = MetadataSimilarity.histogram_distance(abroad, detective)
    backward = MetadataSimilarity.histogram_distance(detective, abroad)
    assert abs(forward - backward) < 0.000001

    # A book's own title compared with the set of titles collected
    # for that same book: the distance stays below 0.5.
    ab_ab = MetadataSimilarity.histogram_distance(
        ["Tom Sawyer Abroad"], abroad
    )
    de_de = MetadataSimilarity.histogram_distance(
        ["Tom Sawyer, Detective"], detective
    )
    assert ab_ab < 0.5
    assert de_de < 0.5

    # A book's title compared with the set of titles collected for
    # the *other* book: the distance rises above 0.5.
    ab_de = MetadataSimilarity.histogram_distance(
        ["Tom Sawyer Abroad"], detective
    )
    de_ab = MetadataSimilarity.histogram_distance(
        ["Tom Sawyer, Detective"], abroad
    )
    assert ab_de > 0.5
    assert de_ab > 0.5
def test_author_similarity(self):
    """Two empty author lists are treated as a perfect match."""
    no_authors_score = MetadataSimilarity.author_similarity([], [])
    assert no_authors_score == 1
def test_identical_titles_are_identical(self):
    """A title compared with itself scores 1, even when it is mostly
    punctuation, stopwords and non-ASCII characters.
    """
    messy_title = (
        "a !@#$@#%& the #FDUSG($E% N%SDAMF_) and #$MI# asdff \N{SNOWMAN}"
    )
    self_similarity = MetadataSimilarity.title_similarity(
        messy_title, messy_title
    )
    assert self_similarity == 1
def _extract_basic_info(cls, _db, tag, existing_authors=None, **restrictions):
    """Extract information common to work tag and edition tag.

    :param _db: Passed through to ``cls.parse_author_string``.
    :param tag: Object exposing ``get()`` and ``keys()``; its 'title',
        'author' and 'language' values are read.
    :param existing_authors: Passed through to
        ``cls.parse_author_string``.
    :param restrictions: Optional filters:

        * ``title`` -- the record's title must resemble this title.
        * ``title_similarity`` -- minimum acceptable title similarity
          (default 0.25).
        * ``language`` -- a single language, or an iterable of
          languages, the record's language must be one of. Ignored
          when the record does not state a language.
        * ``authors`` -- names or ``Contributor`` objects, one of which
          must be the record's primary author.

    :return: A 3-tuple ``(title, authors_and_roles, language)``, or
        None if the record fails any of the restrictions.
    """
    title = tag.get('title')
    author_string = tag.get('author')
    authors_and_roles = cls.parse_author_string(_db, author_string,
                                                existing_authors)
    if 'language' in tag.keys():
        language = tag.get('language')
    else:
        language = None

    if title and 'title' in restrictions:
        must_resemble_title = restrictions['title']
        threshold = restrictions.get('title_similarity', 0.25)
        similarity = MetadataSimilarity.title_similarity(
            must_resemble_title, title)
        if similarity < threshold:
            # The title of the book under consideration is not
            # similar enough to the given title.
            cls.log.debug("FAILURE TO RESEMBLE: %s vs %s (%.2f)", title,
                          must_resemble_title, similarity)
            return None

        # The semicolon is frequently used to separate multiple
        # works in an anthology. If there is no semicolon in the
        # original title, do not consider titles that contain
        # semicolons.
        if (' ; ' not in must_resemble_title
                and ' ; ' in title
                and threshold > 0):
            cls.log.debug("SEMICOLON DISQUALIFICATION: %s", title)
            return None

    # Apply restrictions. If they're not met, return None.
    if 'language' in restrictions and language:
        # We know which language this record is for. Match it
        # against the language used in the Edition we're
        # matching against.
        restrict_to_language = restrictions['language']
        if isinstance(restrict_to_language, str):
            # A bare string is one language, not an iterable of
            # single-character languages.
            restrict_to_language = {restrict_to_language}
        else:
            restrict_to_language = set(restrict_to_language)
        # Membership test: comparing the language string to the set
        # with != would be true for every record.
        if language not in restrict_to_language:
            # This record is for a book in a different language
            cls.log.debug("WRONG LANGUAGE: %s", language)
            return None

    if 'authors' in restrictions:
        restrict_to_authors = restrictions['authors']
        if restrict_to_authors and isinstance(restrict_to_authors[0],
                                              Contributor):
            restrict_to_authors = [
                x.sort_name for x in restrict_to_authors
            ]
        primary_author = None

        for a, roles in authors_and_roles:
            if Contributor.PRIMARY_AUTHOR_ROLE in roles:
                primary_author = a
                break
        if (not primary_author
            or (primary_author not in restrict_to_authors
                and primary_author.sort_name not in restrict_to_authors)):
            # None of the given authors showed up as the
            # primary author of this book. They may have had
            # some other role in it, or the book may be about
            # them, or incorporate their work, but this book
            # is not *by* them.
            return None

    return title, authors_and_roles, language