def test_vectorizing_and_similar_terms():
    """Exercise term vectorization and similar-term lookup on a small space."""
    # Build a rank-3 space and vectorize a weighted bag of terms, one of
    # which ('not a term') is not expected to be a known label.
    space = AssocSpace.from_entries(ENTRIES, k=3)
    term_weights = [('apple', 5), ('banana', 22), ('not a term', 17)]
    apple_row = space.row_named('apple')
    banana_row = space.row_named('banana')
    combined = space.vector_from_terms(term_weights)

    # A term compared with itself should score approximately 1.
    self_similarity = space.assoc_between_two_terms('apple', 'apple')
    assert abs(self_similarity - 1.0) < 1e-3

    # Two distinct terms should score at least 10% lower than a term
    # compared with itself.
    cross_similarity = space.assoc_between_two_terms('apple', 'banana')
    assert cross_similarity < 0.9

    # The combined vector should lie in the span of the apple and banana
    # rows. Build two orthogonal unit directions covering that span, project
    # them out of the combined vector, and check that nothing remains.
    unit_apple = normalize(apple_row)
    banana_off_apple = banana_row - unit_apple * unit_apple.dot(banana_row)
    unit_banana_perp = normalize(banana_off_apple)
    leftover = combined - unit_apple * unit_apple.dot(combined)
    leftover -= unit_banana_perp * unit_banana_perp.dot(leftover)
    assert norm(leftover) < 1e-3

    # Similar-term lookup: scores come back in descending order, and the
    # single best match agrees with the head of the ranked list.
    ranked_labels, ranked_scores = zip(*space.terms_similar_to_vector(combined))
    eq_(list(ranked_scores), sorted(ranked_scores, reverse=True))

    best_match = space.most_similar_to_vector(combined)
    eq_(best_match[0], ranked_labels[0])
    eq_(best_match[1], ranked_scores[0])

    # 'banana' carries the larger weight so it should outrank 'apple', which
    # in turn should outrank terms that were not part of the query.
    banana_rank = ranked_labels.index('banana')
    apple_rank = ranked_labels.index('apple')
    assert banana_rank < apple_rank
    assert apple_rank < ranked_labels.index('green')
    assert apple_rank < ranked_labels.index('celery')
def vector_from_terms(self, terms):
    """
    Build a single vector from a weighted collection of terms.

    `terms` is an iterable of (term, weight) pairs. Each term found in
    `self.labels` contributes its row of `self.u`, multiplied by its weight,
    to a running sum; terms that are not in `self.labels` are skipped.

    The rows of U e^(S/2) are deliberately not normalized before being
    summed, which applies a natural penalty to low-quality terms. The sum
    is scaled by `exp(self.sigma / 2)` and passed through
    `eigenmath.normalize` before being returned.
    """
    total = np.zeros((self.k,))
    for label, multiplier in terms:
        if label not in self.labels:
            continue

        if label not in self._row_cache:
            # Keep the cache bounded: once it passes 15000 entries, throw
            # the whole thing away and start again.
            if len(self._row_cache) > 15000:
                self._row_cache = {}

            # Store a copy rather than a slice of a memmap object; Numpy
            # handles large numbers of such slices inefficiently
            # (especially in 1.7, but even in 1.6 or 1.8).
            row_index = self.labels.index(label)
            self._row_cache[label] = np.copy(self.u[row_index])

        total += self._row_cache[label] * multiplier

    scaled = total * np.exp(self.sigma / 2)
    return eigenmath.normalize(scaled)
def test_norm_and_normalize():
    """Check `norm` and `normalize` on a known vector and on the zero vector."""
    sample = np.asarray([8.0, 9.0, 12.0])
    # sqrt(8^2 + 9^2 + 12^2) = sqrt(289) = 17
    expected_length = 17.0

    measured_length = norm(sample)
    assert np.allclose(measured_length, expected_length)

    rescaled = normalize(sample)
    assert np.allclose(rescaled, sample / expected_length)

    # The zero vector is expected to normalize to itself rather than
    # raising an error.
    normalized_zero = normalize(np.zeros(5))
    assert (np.zeros(5) == normalized_zero).all()
def cos_diff(a, b):
    """
    Return the dot product of the normalized forms of `a` and `b`.

    Both inputs are passed through `normalize` first, so the result is the
    cosine of the angle between them whenever `normalize` yields unit-length
    vectors.
    """
    unit_a = normalize(a)
    unit_b = normalize(b)
    return unit_a.dot(unit_b)