def test_identity(self):
    """Check that titles differing only in word order, punctuation or
    letter case get a title similarity of exactly 1.
    """
    equivalent_pairs = [
        ("foo bar", "foo bar"),    # literally the same title
        ("foo bar", "bar, foo"),   # words reordered, comma added
        ("foo bar.", "FOO BAR"),   # trailing period, different case
    ]
    for first, second in equivalent_pairs:
        eq_(1, MetadataSimilarity.title_similarity(first, second))
def _arrange_by_confidence_level(self, title, *other_titles):
    """Bucket each of `other_titles` by how similar it is to `title`.

    Similarity is one minus the histogram distance between `title` and
    the candidate, with "the", "a" and "an" passed along as stopwords.
    A candidate is filed under the highest of the levels 1, 0.8, 0.5,
    0.25 and 0 that its similarity reaches; a candidate that reaches
    none of them is left out.

    Returns a defaultdict(list) mapping confidence level to the titles
    filed under it, in the order they were passed in.
    """
    ignored_words = {"the", "a", "an"}
    levels = (1, 0.8, 0.5, 0.25, 0)
    by_level = defaultdict(list)
    for candidate in other_titles:
        similarity = 1 - MetadataSimilarity.histogram_distance(
            [title], [candidate], ignored_words)
        # Levels run from strictest to loosest, so the first one the
        # similarity reaches is the bucket this candidate belongs in.
        best_level = next(
            (level for level in levels if similarity >= level), None)
        if best_level is not None:
            by_level[best_level].append(candidate)
    return by_level
def test_author_similarity(self):
    """Two empty author lists are expected to have a similarity of 1."""
    similarity = MetadataSimilarity.author_similarity([], [])
    eq_(1, similarity)
def test_identical_titles_are_identical(self):
    """A title packed with symbols and a non-ASCII character must still
    have a title similarity of 1 when compared with itself.
    """
    title = u"a !@#$@#%& the #FDUSG($E% N%SDAMF_) and #$MI# asdff \N{SNOWMAN}"
    similarity = MetadataSimilarity.title_similarity(title, title)
    eq_(1, similarity)
def test_histogram_distance(self):
    """Exercise histogram_distance on identical, disjoint and
    real-world sets of titles.
    """
    # Same words, same counts -- only order, case and punctuation
    # differ -- so the two histograms coincide and the distance is 0.
    first_set = ["The First Title", "The Second Title"]
    second_set = ["title the second", "FIRST, THE TITLE"]
    eq_(0, MetadataSimilarity.histogram_distance(first_set, second_set))

    # No word is shared between these two sets, which is as far
    # apart as two sets can get: the distance is 1.
    first_set = ["These Words Have Absolutely"]
    second_set = ["Nothing In Common, Really"]
    eq_(1, MetadataSimilarity.histogram_distance(first_set, second_set))

    # A hard real-world case: "Tom Sawyer Abroad" and "Tom Sawyer,
    # Detective" are entirely different books by one author whose
    # titles differ by a single word. They are often anthologized
    # together, so OCLC maps both to many of the same titles, and
    # they are often bundled with other stories, which sprinkles
    # unrelated words into those titles.
    abroad = [
        "Tom Sawyer abroad",
        "The adventures of Tom Sawyer, Tom Sawyer abroad [and] Tom Sawyer, detective",
        "Tom Sawyer abroad",
        "Tom Sawyer abroad",
        "Tom Sawyer Abroad",
        "Tom Sawyer Abroad",
        "Tom Sawyer Abroad",
        "Tom Sawyer abroad : and other stories",
        "Tom Sawyer abroad Tom Sawyer, detective : and other stories, etc. etc.",
        "Tom Sawyer abroad",
        "Tom Sawyer abroad",
        "Tom Sawyer abroad",
        "Tom Sawyer abroad and other stories",
        "Tom Sawyer abroad and other stories",
        "Tom Sawyer abroad and the American claimant,",
        "Tom Sawyer abroad and the American claimant",
        "Tom Sawyer abroad : and The American claimant: novels.",
        "Tom Sawyer abroad : and The American claimant: novels.",
        "Tom Sawyer Abroad - Tom Sawyer, Detective",
    ]

    detective = [
        "Tom Sawyer, Detective",
        "Tom Sawyer Abroad - Tom Sawyer, Detective",
        "Tom Sawyer Detective : As Told by Huck Finn : And Other Tales.",
        "Tom Sawyer, Detective",
        "Tom Sawyer, Detective.",
        "The adventures of Tom Sawyer, Tom Sawyer abroad [and] Tom Sawyer, detective",
        "Tom Sawyer detective : and other stories every child should know",
        "Tom Sawyer, detective : as told by Huck Finn and other tales",
        "Tom Sawyer, detective, as told by Huck Finn and other tales...",
        "The adventures of Tom Sawyer, Tom Sawyer abroad [and] Tom Sawyer, detective,",
        "Tom Sawyer abroad, Tom Sawyer, detective, and other stories",
        "Tom Sawyer, detective",
        "Tom Sawyer, detective",
        "Tom Sawyer, detective",
        "Tom Sawyer, detective",
        "Tom Sawyer, detective",
        "Tom Sawyer, detective",
        "Tom Sawyer abroad Tom Sawyer detective",
        "Tom Sawyer, detective : as told by Huck Finn",
        "Tom Sawyer : detective",
    ]

    # Measuring from either side must give the same distance, allowing
    # a small margin for floating-point rounding.
    forward = MetadataSimilarity.histogram_distance(abroad, detective)
    backward = MetadataSimilarity.histogram_distance(detective, abroad)
    assert abs(forward - backward) < 0.000001

    # A book's Gutenberg title compared against the OCLC Classify
    # titles for that same book: the distance should be fairly small.
    abroad_vs_own = MetadataSimilarity.histogram_distance(
        ["Tom Sawyer Abroad"], abroad)
    detective_vs_own = MetadataSimilarity.histogram_distance(
        ["Tom Sawyer, Detective"], detective)
    assert abroad_vs_own < 0.5
    assert detective_vs_own < 0.5

    # A book's Gutenberg title compared against the OCLC Classify
    # titles for the *other* book: the distance should be larger.
    abroad_vs_other = MetadataSimilarity.histogram_distance(
        ["Tom Sawyer Abroad"], detective)
    detective_vs_other = MetadataSimilarity.histogram_distance(
        ["Tom Sawyer, Detective"], abroad)
    assert abroad_vs_other > 0.5
    assert detective_vs_other > 0.5