def test_normalize_csr(self):
    """produce a unit-norm vector from csr"""
    # pairs of (raw input, expected unit-norm output)
    cases = [
        ([0.0, 2.0, 0.0], [0.0, 1.0, 0.0]),
        ([4.0, 0.0], [1.0, 0.0]),
    ]
    for raw, expected in cases:
        normalized = normalize_csr(csr_matrix(raw))
        self.assertListEqual(list(sparse_to_dense(normalized)), expected)
def test_encode_separate_together(self):
    """encoding using separated tokens and concatenated tokens"""
    encoder = self.crossmap.encoder
    # the same token content, given as three words or one concatenated word
    as_separate = sparse_to_dense(encoder.document(dict(data="bcd cde def")))
    as_together = sparse_to_dense(encoder.document(dict(data="bcdef")))
    self.assertListEqual(round_list(as_separate), round_list(as_together))
def test_diffusion_shifts_away_from_a_target(self):
    """example using diffusion unaffected by longword"""
    vec = self.encoder.document(self.gh)
    dense = sparse_to_dense(vec)
    # before diffusion gh should be roughly equally distant to L3 and L4
    self.assertAlmostEqual(euc_dist(dense, self.data["L3"]),
                           euc_dist(dense, self.data["L4"]))
    # after diffusion, L3 should be clearly preferred; the strength of
    # diffusion should not matter here - any diffusion should break the
    # tie in favor of L3
    diffused = sparse_to_dense(self.diffuser.diffuse(vec, dict(documents=1)))
    self.assertLess(euc_dist(diffused, self.data["L3"]),
                    euc_dist(diffused, self.data["L4"]))
def test_threshold_to_zeros(self):
    """threshold and all values are set to zero"""
    vec = csr_matrix([0.0, 0.0, 0.0, 0.4, 0.2, 0.0, -0.8, 0.1])
    # a threshold of 2 exceeds every magnitude in the input
    thresholded = threshold_csr(vec, 2)
    self.assertEqual(sum(sparse_to_dense(thresholded)), 0)
    # shape must be preserved even when all values are removed
    self.assertEqual(thresholded.shape, (1, 8))
def delta(request):
    """suggest features for user-driven learning

    :param request: object with data_pos, data_neg, expected_id
    :return: list of dicts, one per feature with a non-zero value in at
        least one vector; each dict holds the feature name and its value
        in the query/diffused/expected/hit/delta vectors, sorted by
        decr_by_query
    """
    doc = parse_request(request)
    # process a query
    query_id = doc["query_id"]
    dataset = doc["dataset"]
    query_raw = find_vector(query_id, dataset)
    if query_raw is None:
        return {"error": "invalid item id: " + query_id}
    query_diffused = crossmap.diffuser.diffuse(query_raw, doc["diffusion"])
    diffused = sparse_to_dense(query_diffused)
    targets, _ = crossmap.indexer.suggest(diffused, dataset=dataset,
                                          n=doc["n"])
    # get feature vector for the expected result
    expected_raw = find_vector(doc["expected_id"], dataset)
    if expected_raw is None:
        return {"error": "invalid item id: " + doc["expected_id"]}
    expected = sparse_to_dense(expected_raw)
    # prepare a collection of dense vectors
    vectors = {
        "query": sparse_to_dense(query_raw),
        "diffused": diffused,
        "expected": expected,
        "error": diffused - expected,
    }
    for i, hit_id in enumerate(targets, start=1):
        hit_vector = sparse_to_dense(get_vector(dataset, hit_id))
        vectors["hit_" + str(i)] = hit_vector
        vectors["delta_" + str(i)] = hit_vector - expected
    # prepare a smaller table that includes only features with non-zero values
    inv_feature_map = crossmap.indexer.encoder.inv_feature_map
    result = []
    for i in range(len(inv_feature_map)):
        # short-circuit: skip features that are zero in every vector
        if all(v[i] == 0 for v in vectors.values()):
            continue
        i_result = dict(feature=inv_feature_map[i])
        for vector_id, vector_values in vectors.items():
            i_result[vector_id] = vector_values[i]
        result.append(i_result)
    return sorted(result, key=decr_by_query)
def test_threshold_w_negatives(self):
    """threshold preserves very negative values"""
    vec = csr_matrix([0.0, 0.1, -0.5, 0.35, 0.9, -0.2, 0.0, -0.4])
    thresholded = threshold_csr(vec, 0.3)
    # values with magnitude at or above 0.3 survive, including negatives
    self.assertListEqual(list(sparse_to_dense(thresholded)),
                         [0.0, 0.0, -0.5, 0.35, 0.9, 0.0, 0.0, -0.4])
    self.assertEqual(thresholded.shape, (1, 8))
def test_threshold_positives(self):
    """using a simple vector with positive values"""
    vec = csr_matrix([0.0, 0.1, 0.5, 0.35, 0.9, 0.0, 0.0, 0.4])
    thresholded = threshold_csr(vec, 0.3)
    # values at or above the 0.3 threshold survive, the rest become zero
    self.assertListEqual(list(sparse_to_dense(thresholded)),
                         [0.0, 0.0, 0.5, 0.35, 0.9, 0.0, 0.0, 0.4])
    self.assertEqual(thresholded.shape, (1, 8))
def test_normalized_collapse(self):
    """collapse to dimension, with global rescaling/normalization"""
    vec = csr_matrix([0.0, 0.1, 0.5, 0.35, 0.9, 0.0, 0.0, 0.4])
    collapsed = dimcollapse_csr(vec, {0, 1, 2, 3}, normalize=True)
    dense = sparse_to_dense(collapsed)
    # dimensions outside the kept set are zeroed out
    self.assertEqual(dense[4], 0.0)
    # normalization rescales surviving values upward
    self.assertGreater(dense[1], 0.1)
def test_raw_collapse(self):
    """raw collapse, setting values to zero without normalization"""
    vec = csr_matrix([0.0, 0.1, 0.5, 0.35, 0.9, 0.0, 0.0, 0.4])
    collapsed = dimcollapse_csr(vec, {0, 1, 2, 3}, normalize=False)
    # kept dimensions retain their original values, the rest become zero
    self.assertListEqual(list(sparse_to_dense(collapsed)),
                         [0.0, 0.1, 0.5, 0.35, 0.0, 0.0, 0.0, 0.0])
    self.assertEqual(collapsed.shape, (1, 8))
def test_diffusion_keeps_original_feature_strong(self):
    """diffusing from one feature should maintain that feature strong"""
    c_index = self.feature_map["c"][0]
    vec = self.encoder.document(dict(data="C"))
    # diffuse at several strengths; in every case feature "C" must
    # remain the single largest component of the result
    for strength in [1, 2, 4, 8, 20]:
        diffused = self.diffuser.diffuse(vec, dict(targets=strength))
        dense = sparse_to_dense(diffused)
        self.assertEqual(max(dense), dense[c_index])
def test_longword_document_before_diffusion(self):
    """encoding before diffusion accounts for overlapping tokens"""
    encoding = self.encoder.document(self.long_b)
    self.assertGreater(len(encoding.data), 4)
    dense = sparse_to_dense(encoding)
    fm = self.feature_map
    # overlapping tokens from "longword" should be weighted lower than
    # tokens from "B" or "C" that are stand-alone
    self.assertGreater(dense[fm["b"][0]], dense[fm["ngwor"][0]])
    # document should be closer to L0 than to L1
    self.assertLess(euc_dist(dense, self.data["L0"]),
                    euc_dist(dense, self.data["L1"]))
def setUpClass(cls):
    settings = CrossmapSettings(config_longword, create_dir=True)
    cls.indexer = CrossmapIndexer(settings)
    cls.indexer.build()
    cls.diffuser = CrossmapDiffuser(settings)
    cls.diffuser.build()
    cls.feature_map = cls.diffuser.feature_map
    cls.db = cls.diffuser.db
    cls.encoder = cls.indexer.encoder
    cls.plain_tokenizer = CrossmapTokenizer(settings)
    cls.diff_tokenizer = CrossmapDiffusionTokenizer(settings)
    # extract dense data vectors for the target items used in the tests
    rows = cls.db.get_data(dataset="targets",
                           ids=["L0", "L1", "L2", "L3", "L4"])
    cls.data = {row["id"]: sparse_to_dense(row["data"]) for row in rows}
def test_normalize_csr_in_place(self):
    """produce a unit-norm vector from csr (in place)"""
    data = csr_matrix([0.0, 2.0, 0.0])
    # normalize_csr modifies its argument; the return value is not used
    normalize_csr(data)
    self.assertListEqual(list(sparse_to_dense(data)), [0.0, 1.0, 0.0])
def round_list(x, digits=6):
    """helper for approximate tests, round the elements of a list

    :param x: iterable of numbers, or a csr_matrix (converted to dense)
    :param digits: number of decimal digits to keep
    :return: new list with every element rounded
    """
    if isinstance(x, csr_matrix):
        x = sparse_to_dense(x)
    # iterate x directly - no need to materialize an intermediate list
    return [round(value, digits) for value in x]