def test_identity(self):
    """Titles that differ only in word order, letter case or
    non-alphanumeric characters must get a similarity of exactly 1.
    """
    equivalent_pairs = (
        # Exactly the same string.
        ("foo bar", "foo bar"),
        # Words reordered, with a comma thrown in.
        ("foo bar", "bar, foo"),
        # Trailing punctuation on one side, different case on the other.
        ("foo bar.", "FOO BAR"),
    )
    for left, right in equivalent_pairs:
        assert 1 == MetadataSimilarity.title_similarity(left, right)
def set_equivalence(self, identifier, metadata):
    """Identify the OCLC Number with the OCLC Work.

    Records, via ``identifier.equivalent_to``, that ``identifier`` is
    equivalent to the primary identifier of ``metadata``, attributed to
    ``self.output_source``.

    The strength of the equivalence is the best score found among the
    editions in ``identifier.primarily_identifies``. Each edition's score
    is 80% title similarity plus 20% of the value returned by
    ``MetadataSimilarity._proportion`` for the two sets of contributor
    VIAF ids. When there are no such editions the strength is 1. Nothing
    is recorded unless the final strength is greater than zero.

    :param identifier: The identifier being linked.
    :param metadata: Object exposing ``title``, ``contributors`` and
        ``primary_identifier``.
    """
    editions = identifier.primarily_identifies
    if not editions:
        # Nothing to compare against, so take the equivalence at face value.
        best = 1
    else:
        best = 0
        for edition in editions:
            # Without a title on the metadata side there is nothing to compare.
            title_score = 0
            if metadata.title:
                title_score = MetadataSimilarity.title_similarity(
                    metadata.title, edition.title
                )

            their_viafs = {c.viaf for c in edition.contributors if c.viaf}
            our_viafs = {c.viaf for c in metadata.contributors if c.viaf}
            author_score = MetadataSimilarity._proportion(their_viafs, our_viafs)

            combined = (title_score * 0.8) + (author_score * 0.2)
            # Keep the highest score seen so far.
            best = max(best, combined)

    if best > 0:
        target, _is_new = metadata.primary_identifier.load(self._db)
        identifier.equivalent_to(self.output_source, target, best)
def set_equivalence(self, identifier, metadata):
    """Identify the OCLC Number with the OCLC Work.

    Calls ``identifier.equivalent_to`` to link ``identifier`` with the
    primary identifier loaded from ``metadata``, crediting
    ``self.data_source``. The link is only made when the computed
    strength is greater than zero.

    Strength is 1 when ``identifier.primarily_identifies`` is empty.
    Otherwise it is the highest per-edition score, where a score is
    ``0.8 * title similarity + 0.2 * MetadataSimilarity._proportion(...)``
    over the VIAF ids of the edition's and the metadata's contributors.

    :param identifier: The identifier being linked.
    :param metadata: Object exposing ``title``, ``contributors`` and
        ``primary_identifier``.
    """

    def viafs_of(contributors):
        # Only contributors that actually carry a VIAF id are counted.
        return set(c.viaf for c in contributors if c.viaf)

    def score(edition):
        # Title similarity counts for nothing when the metadata has no title.
        if metadata.title:
            title_part = MetadataSimilarity.title_similarity(
                metadata.title, edition.title)
        else:
            title_part = 0
        edition_ids = viafs_of(edition.contributors)
        metadata_ids = viafs_of(metadata.contributors)
        author_part = MetadataSimilarity._proportion(edition_ids, metadata_ids)
        return (title_part * 0.8) + (author_part * 0.2)

    candidates = identifier.primarily_identifies
    if candidates:
        # Seeding with 0 means a score only counts if it is strictly
        # greater than zero.
        strength = max([0] + [score(edition) for edition in candidates])
    else:
        strength = 1

    if strength > 0:
        loaded = metadata.primary_identifier.load(self._db)
        primary_identifier = loaded[0]
        identifier.equivalent_to(self.data_source, primary_identifier, strength)
def _extract_basic_info(cls, _db, tag, existing_authors=None, **restrictions):
    """Extract information common to work tag and edition tag.

    :param _db: Database handle, passed through to
        ``cls.parse_author_string``.
    :param tag: Object whose ``title``, ``author`` and ``language``
        values are read via ``tag.get()`` / ``tag.keys()``.
    :param existing_authors: Passed through to
        ``cls.parse_author_string``.
    :param restrictions: Optional filters the record must satisfy:

        * ``title`` -- a title the record's title must resemble.
        * ``title_similarity`` -- minimum similarity for ``title``
          (default 0.25).
        * ``language`` -- a language code or a collection of language
          codes; a record with a known language must match one of them.
          ``None`` means no language restriction.
        * ``authors`` -- sort names or ``Contributor`` objects; the
          record's primary author must be among them.

    :return: A ``(title, authors_and_roles, language)`` tuple, or
        ``None`` if any restriction is not met.
    """
    title = tag.get('title')
    author_string = tag.get('author')
    authors_and_roles = cls.parse_author_string(
        _db, author_string, existing_authors)
    language = tag.get('language') if 'language' in tag.keys() else None

    if title and 'title' in restrictions:
        must_resemble_title = restrictions['title']
        threshold = restrictions.get('title_similarity', 0.25)
        similarity = MetadataSimilarity.title_similarity(
            must_resemble_title, title)
        if similarity < threshold:
            # The title of the book under consideration is not
            # similar enough to the given title.
            cls.log.debug(
                "FAILURE TO RESEMBLE: %s vs %s (%.2f)",
                title, must_resemble_title, similarity
            )
            return None

        # The semicolon is frequently used to separate multiple
        # works in an anthology. If there is no semicolon in the
        # original title, do not consider titles that contain
        # semicolons.
        if (' ; ' not in must_resemble_title
                and ' ; ' in title
                and threshold > 0):
            cls.log.debug(
                "SEMICOLON DISQUALIFICATION: %s", title
            )
            return None

    # Apply restrictions. If they're not met, return None.
    if 'language' in restrictions and language:
        # We know which language this record is for. Match it
        # against the language(s) used in the Edition we're
        # matching against.
        allowed = restrictions['language']
        if isinstance(allowed, str):
            # A bare string is one language code, not an iterable of
            # characters.
            allowed = [allowed]
        # Use a membership test: comparing the string to the set itself
        # is always unequal and would reject every record.
        if allowed is not None and language not in set(allowed):
            # This record is for a book in a different language
            cls.log.debug(
                "WRONG LANGUAGE: %s", language
            )
            return None

    if 'authors' in restrictions:
        restrict_to_authors = restrictions['authors']
        if restrict_to_authors and isinstance(
                restrict_to_authors[0], Contributor):
            # Compare by sort name when Contributor objects are given.
            restrict_to_authors = [x.sort_name for x in restrict_to_authors]
        primary_author = None

        for a, roles in authors_and_roles:
            if Contributor.PRIMARY_AUTHOR_ROLE in roles:
                primary_author = a
                break
        if (not primary_author
                or (primary_author not in restrict_to_authors
                    and primary_author.sort_name not in restrict_to_authors)):
            # None of the given authors showed up as the
            # primary author of this book. They may have had
            # some other role in it, or the book may be about
            # them, or incorporate their work, but this book
            # is not *by* them.
            return None

    return title, authors_and_roles, language
def test_identical_titles_are_identical(self):
    """A title compared with itself scores exactly 1, even when it is
    mostly punctuation and contains a non-ASCII character.
    """
    title = "a !@#$@#%& the #FDUSG($E% N%SDAMF_) and #$MI# asdff \N{SNOWMAN}"
    similarity = MetadataSimilarity.title_similarity(title, title)
    assert 1 == similarity
def _extract_basic_info(cls, _db, tag, existing_authors=None, **restrictions):
    """Extract information common to work tag and edition tag.

    :param _db: Database handle, passed through to
        ``cls.parse_author_string``.
    :param tag: Object whose ``title``, ``author`` and ``language``
        values are read via ``tag.get()`` / ``tag.keys()``.
    :param existing_authors: Passed through to
        ``cls.parse_author_string``.
    :param restrictions: Optional filters the record must satisfy:

        * ``title`` -- a title the record's title must resemble.
        * ``title_similarity`` -- minimum similarity for ``title``
          (default 0.25).
        * ``language`` -- a language code or a collection of language
          codes; a record with a known language must match one of them.
          ``None`` means no language restriction.
        * ``authors`` -- sort names or ``Contributor`` objects; the
          record's primary author must be among them.

    :return: A ``(title, authors_and_roles, language)`` tuple, or
        ``None`` if any restriction is not met.
    """
    title = tag.get('title')
    author_string = tag.get('author')
    authors_and_roles = cls.parse_author_string(_db, author_string,
                                                existing_authors)
    language = tag.get('language') if 'language' in tag.keys() else None

    if title and 'title' in restrictions:
        must_resemble_title = restrictions['title']
        threshold = restrictions.get('title_similarity', 0.25)
        similarity = MetadataSimilarity.title_similarity(
            must_resemble_title, title)
        if similarity < threshold:
            # The title of the book under consideration is not
            # similar enough to the given title.
            cls.log.debug("FAILURE TO RESEMBLE: %s vs %s (%.2f)", title,
                          must_resemble_title, similarity)
            return None

        # The semicolon is frequently used to separate multiple
        # works in an anthology. If there is no semicolon in the
        # original title, do not consider titles that contain
        # semicolons.
        if (' ; ' not in must_resemble_title and ' ; ' in title
                and threshold > 0):
            cls.log.debug("SEMICOLON DISQUALIFICATION: %s", title)
            return None

    # Apply restrictions. If they're not met, return None.
    if 'language' in restrictions and language:
        # We know which language this record is for. Match it
        # against the language(s) used in the Edition we're
        # matching against.
        allowed = restrictions['language']
        if isinstance(allowed, str):
            # A bare string is one language code, not an iterable of
            # characters.
            allowed = [allowed]
        # Use a membership test: comparing the string to the set itself
        # is always unequal and would reject every record.
        if allowed is not None and language not in set(allowed):
            # This record is for a book in a different language
            cls.log.debug("WRONG LANGUAGE: %s", language)
            return None

    if 'authors' in restrictions:
        restrict_to_authors = restrictions['authors']
        if restrict_to_authors and isinstance(restrict_to_authors[0],
                                              Contributor):
            # Compare by sort name when Contributor objects are given.
            restrict_to_authors = [
                x.sort_name for x in restrict_to_authors
            ]
        primary_author = None

        for a, roles in authors_and_roles:
            if Contributor.PRIMARY_AUTHOR_ROLE in roles:
                primary_author = a
                break
        if (not primary_author
                or (primary_author not in restrict_to_authors
                    and primary_author.sort_name not in restrict_to_authors)):
            # None of the given authors showed up as the
            # primary author of this book. They may have had
            # some other role in it, or the book may be about
            # them, or incorporate their work, but this book
            # is not *by* them.
            return None

    return title, authors_and_roles, language