Пример #1
0
    def test_identity(self):
        """Titles that differ only in word order, punctuation or case
        must get a similarity score of exactly 1."""

        equivalent_pairs = [
            ("foo bar", "foo bar"),
            ("foo bar", "bar, foo"),
            ("foo bar.", "FOO BAR"),
        ]
        for left, right in equivalent_pairs:
            assert 1 == MetadataSimilarity.title_similarity(left, right)
Пример #2
0
    def set_equivalence(self, identifier, metadata):
        """Identify the OCLC Number with the OCLC Work.

        The equivalence strength is the best score over all editions
        primarily identified by `identifier`; each score is 80% title
        similarity plus 20% overlap of contributor VIAF IDs. When the
        identifier has no primary editions the strength is 1. Nothing
        is recorded unless the strength is greater than zero.
        """

        editions = identifier.primarily_identifies
        if not editions:
            # Nothing to compare against.
            best = 1
        else:
            best = 0
            for edition in editions:
                title_score = (
                    MetadataSimilarity.title_similarity(
                        metadata.title, edition.title
                    )
                    if metadata.title
                    else 0
                )
                viafs_for_edition = {
                    c.viaf for c in edition.contributors if c.viaf
                }
                viafs_for_metadata = {
                    c.viaf for c in metadata.contributors if c.viaf
                }
                author_score = MetadataSimilarity._proportion(
                    viafs_for_edition, viafs_for_metadata
                )
                combined = (title_score * 0.8) + (author_score * 0.2)
                # Keep the running best; ties leave the current value.
                best = max(best, combined)

        if not best > 0:
            return

        primary_identifier, _ = metadata.primary_identifier.load(self._db)
        identifier.equivalent_to(
            self.output_source, primary_identifier, best
        )
Пример #3
0
    def set_equivalence(self, identifier, metadata):
        """Identify the OCLC Number with the OCLC Work.

        Each edition primarily identified by `identifier` is scored as
        0.8 * title similarity + 0.2 * proportion of shared contributor
        VIAF IDs, and the highest score becomes the equivalence
        strength. With no primary editions the strength is 1. An
        equivalence is recorded only for a strength above zero.
        """

        candidates = identifier.primarily_identifies
        strongest = 0 if candidates else 1

        if candidates:
            for candidate in candidates:
                title_part = 0
                if metadata.title:
                    title_part = MetadataSimilarity.title_similarity(
                        metadata.title, candidate.title)

                candidate_viafs = {
                    person.viaf for person in candidate.contributors
                    if person.viaf
                }
                known_viafs = {
                    person.viaf for person in metadata.contributors
                    if person.viaf
                }
                author_part = MetadataSimilarity._proportion(
                    candidate_viafs, known_viafs)

                score = (title_part * 0.8) + (author_part * 0.2)
                # Ties keep the existing value.
                strongest = max(strongest, score)

        if not strongest > 0:
            return

        primary_identifier, _ = metadata.primary_identifier.load(self._db)
        identifier.equivalent_to(self.data_source, primary_identifier,
                                 strongest)
Пример #4
0
    def _extract_basic_info(cls, _db, tag, existing_authors=None,
                            **restrictions):
        """Extract information common to work tag and edition tag.

        :param _db: Passed through to ``cls.parse_author_string``.
        :param tag: Source of the 'title', 'author' and 'language'
            values; must support ``get()`` and ``keys()``.
        :param existing_authors: Passed through to
            ``cls.parse_author_string``.
        :param restrictions: Optional filters. Recognized keys:

            * ``title``: a title the record's title must resemble.
            * ``title_similarity``: minimum similarity score for the
              title check (default 0.25).
            * ``language``: a single language or an iterable of
              acceptable languages.
            * ``authors``: a sequence of sort names, or of Contributor
              objects, one of which must be the primary author.

        :return: A ``(title, authors_and_roles, language)`` tuple, or
            None if any restriction is not met.
        """
        title = tag.get('title')
        author_string = tag.get('author')
        authors_and_roles = cls.parse_author_string(
            _db, author_string, existing_authors)
        if 'language' in tag.keys():
            language = tag.get('language')
        else:
            language = None

        if title and 'title' in restrictions:
            must_resemble_title = restrictions['title']
            threshold = restrictions.get('title_similarity', 0.25)
            similarity = MetadataSimilarity.title_similarity(
                must_resemble_title, title)
            if similarity < threshold:
                # The title of the book under consideration is not
                # similar enough to the given title.
                cls.log.debug(
                    "FAILURE TO RESEMBLE: %s vs %s (%.2f)",
                    title, must_resemble_title, similarity
                )
                return None

            # The semicolon is frequently used to separate multiple
            # works in an anthology. If there is no semicolon in the
            # original title, do not consider titles that contain
            # semicolons.
            if (' ; ' not in must_resemble_title
                    and ' ; ' in title and threshold > 0):
                cls.log.debug(
                    "SEMICOLON DISQUALIFICATION: %s", title
                )
                return None

        # Apply restrictions. If they're not met, return None.
        if 'language' in restrictions and language:
            # We know which language this record is for. Match it
            # against the language(s) used in the Edition we're
            # matching against.
            restrict_to_language = restrictions['language']
            if isinstance(restrict_to_language, str):
                # A bare string is one language, not an iterable of
                # single-character languages.
                restrict_to_language = [restrict_to_language]
            # Membership test: comparing the string to the collection
            # with != would always be true and reject every record.
            if language not in set(restrict_to_language):
                # This record is for a book in a different language
                cls.log.debug(
                    "WRONG LANGUAGE: %s", language
                )
                return None

        if 'authors' in restrictions:
            restrict_to_authors = restrictions['authors']
            if (restrict_to_authors
                    and isinstance(restrict_to_authors[0], Contributor)):
                restrict_to_authors = [
                    x.sort_name for x in restrict_to_authors
                ]

            # The first contributor carrying the primary-author role.
            primary_author = None
            for a, roles in authors_and_roles:
                if Contributor.PRIMARY_AUTHOR_ROLE in roles:
                    primary_author = a
                    break

            if (not primary_author
                or (primary_author not in restrict_to_authors
                    and primary_author.sort_name not in restrict_to_authors)):
                # None of the given authors showed up as the
                # primary author of this book. They may have had
                # some other role in it, or the book may be about
                # them, or incorporate their work, but this book
                # is not *by* them.
                return None

        return title, authors_and_roles, language
Пример #5
0
 def test_identical_titles_are_identical(self):
     """A title compared with itself scores exactly 1, even when it is
     full of punctuation and non-ASCII characters."""
     title = "a !@#$@#%& the #FDUSG($E% N%SDAMF_) and #$MI# asdff \N{SNOWMAN}"
     similarity = MetadataSimilarity.title_similarity(title, title)
     assert 1 == similarity
Пример #6
0
    def _extract_basic_info(cls,
                            _db,
                            tag,
                            existing_authors=None,
                            **restrictions):
        """Extract information common to work tag and edition tag.

        :param _db: Passed through to ``cls.parse_author_string``.
        :param tag: Source of the 'title', 'author' and 'language'
            values; must support ``get()`` and ``keys()``.
        :param existing_authors: Passed through to
            ``cls.parse_author_string``.
        :param restrictions: Optional filters. Recognized keys:

            * ``title``: a title the record's title must resemble.
            * ``title_similarity``: minimum similarity score for the
              title check (default 0.25).
            * ``language``: a single language or an iterable of
              acceptable languages.
            * ``authors``: a sequence of sort names, or of Contributor
              objects, one of which must be the primary author.

        :return: A ``(title, authors_and_roles, language)`` tuple, or
            None if any restriction is not met.
        """
        title = tag.get('title')
        author_string = tag.get('author')
        authors_and_roles = cls.parse_author_string(_db, author_string,
                                                    existing_authors)
        if 'language' in tag.keys():
            language = tag.get('language')
        else:
            language = None

        if title and 'title' in restrictions:
            must_resemble_title = restrictions['title']
            threshold = restrictions.get('title_similarity', 0.25)
            similarity = MetadataSimilarity.title_similarity(
                must_resemble_title, title)
            if similarity < threshold:
                # The title of the book under consideration is not
                # similar enough to the given title.
                cls.log.debug("FAILURE TO RESEMBLE: %s vs %s (%.2f)", title,
                              must_resemble_title, similarity)
                return None

            # The semicolon is frequently used to separate multiple
            # works in an anthology. If there is no semicolon in the
            # original title, do not consider titles that contain
            # semicolons.
            if (' ; ' not in must_resemble_title and ' ; ' in title
                    and threshold > 0):
                cls.log.debug("SEMICOLON DISQUALIFICATION: %s", title)
                return None

        # Apply restrictions. If they're not met, return None.
        if 'language' in restrictions and language:
            # We know which language this record is for. Match it
            # against the language(s) used in the Edition we're
            # matching against.
            restrict_to_language = restrictions['language']
            if isinstance(restrict_to_language, str):
                # A bare string is one language, not an iterable of
                # single-character languages.
                restrict_to_language = [restrict_to_language]
            # Membership test: comparing the string to the collection
            # with != would always be true and reject every record.
            if language not in set(restrict_to_language):
                # This record is for a book in a different language
                cls.log.debug("WRONG LANGUAGE: %s", language)
                return None

        if 'authors' in restrictions:
            restrict_to_authors = restrictions['authors']
            if restrict_to_authors and isinstance(restrict_to_authors[0],
                                                  Contributor):
                restrict_to_authors = [
                    x.sort_name for x in restrict_to_authors
                ]

            # The first contributor carrying the primary-author role.
            primary_author = None
            for a, roles in authors_and_roles:
                if Contributor.PRIMARY_AUTHOR_ROLE in roles:
                    primary_author = a
                    break

            if (not primary_author or
                (primary_author not in restrict_to_authors
                 and primary_author.sort_name not in restrict_to_authors)):
                # None of the given authors showed up as the
                # primary author of this book. They may have had
                # some other role in it, or the book may be about
                # them, or incorporate their work, but this book
                # is not *by* them.
                return None

        return title, authors_and_roles, language