# Request five documents from the model by name.
d1 = m.document(name="lion")
d2 = m.document(name="tiger")
d3 = m.document(name="dolphin")
d4 = m.document(name="shark")
d5 = m.document(name="parakeet")

# Report the similarity score for a handful of document pairs.
for label, left, right in (
    ("lion-tiger:", d1, d2),
    ("lion-dolphin:", d1, d3),
    ("dolphin-shark:", d3, d4),
    ("dolphin-parakeet:", d3, d5),
):
    print(label, m.similarity(left, right))
print()

print("Related to tiger:")
print(m.neighbors(d2, top=3))  # The three most similar documents.
print()

print("Related to a search query ('water'):")
print(m.search("water", top=10))

# To recap:
# A Document:
# - takes a string of text,
# - counts the words in the text,
# - constructs a vector of words (features) and normalized word count (weight).
# A Model:
# - groups multiple vectors in a matrix,
# - tweaks the weight with TF-IDF to find "unique" words in each document,
# - computes cosine similarity (= distance between vectors),
# - compares documents using cosine similarity.
# is called cosine similarity. This is what a Model uses:

# Request five documents from the model by name.
d1 = m.document(name="lion")
d2 = m.document(name="tiger")
d3 = m.document(name="dolphin")
d4 = m.document(name="shark")
d5 = m.document(name="parakeet")

# print() is called as a function so the script also parses on Python 3;
# the Python 2 print statement is a SyntaxError there.
print("lion-tiger:", m.similarity(d1, d2))
print("lion-dolphin:", m.similarity(d1, d3))
print("dolphin-shark:", m.similarity(d3, d4))
print("dolphin-parakeet:", m.similarity(d3, d5))
print()

print("Related to tiger:")
print(m.neighbors(d2, top=3))  # Top three most similar.
print()

print("Related to a search query ('water'):")
print(m.search("water", top=10))

# In summary:
# A Document:
# - takes a string of text,
# - counts the words in the text,
# - constructs a vector of words (features) and normalized word count (weight).
# A Model:
# - groups multiple vectors in a matrix,
# - tweaks the weight with TF-IDF to find "unique" words in each document,
# - computes cosine similarity (= distance between vectors),
# - compares documents using cosine similarity.