Example #1
    def test_normalize_csr(self):
        """produce a unit-norm vector from csr"""

        n_csr = normalize_csr
        result1 = sparse_to_dense(n_csr(csr_matrix([0.0, 2.0, 0.0])))
        self.assertListEqual(list(result1), [0.0, 1.0, 0.0])
        result2 = sparse_to_dense(n_csr(csr_matrix([4.0, 0.0])))
        self.assertListEqual(list(result2), [1.0, 0.0])

    def test_encode_separate_together(self):
        """encoding using separated tokens and concatenated tokens"""

        cm = self.crossmap
        separate = dict(data="bcd cde def")
        together = dict(data="bcdef")
        v_separate = sparse_to_dense(cm.encoder.document(separate))
        v_together = sparse_to_dense(cm.encoder.document(together))
        self.assertListEqual(round_list(v_separate), round_list(v_together))
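The helpers sparse_to_dense and normalize_csr are used throughout these examples but are not shown. A minimal sketch consistent with Example #1 (return value used) and Example #13 (in-place modification) might look as follows; the project's actual implementations may differ.

import numpy as np
from scipy.sparse import csr_matrix


def sparse_to_dense(x):
    """convert a one-row csr matrix into a flat numpy array (assumed helper)"""
    return np.asarray(x.todense()).flatten()


def normalize_csr(x):
    """scale a csr vector to unit norm, in place, and return it (sketch)"""
    norm = np.sqrt(np.sum(np.square(x.data)))
    if norm > 0:
        x.data /= norm
    return x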
Example #3
    def test_diffusion_shifts_away_from_a_target(self):
        """example using diffusion unaffected by longword"""

        # before diffusion, gh should be roughly equidistant from L3 and L4
        v = self.encoder.document(self.gh)
        v_dense = sparse_to_dense(v)
        self.assertAlmostEqual(euc_dist(v_dense, self.data["L3"]),
                               euc_dist(v_dense, self.data["L4"]))

        # after diffusion, L3 should be clearly preferred
        # the strength of the diffusion should not matter here
        # (any diffusion should break the tie in favor of L3)
        vd = self.diffuser.diffuse(v, dict(documents=1))
        vd_dense = sparse_to_dense(vd)
        self.assertLess(euc_dist(vd_dense, self.data["L3"]),
                        euc_dist(vd_dense, self.data["L4"]))
Example #4
    def test_threshold_to_zeros(self):
        """threshold and all values are set to zero"""

        x = csr_matrix([0.0, 0.0, 0.0, 0.4, 0.2, 0.0, -0.8, 0.1])
        result = threshold_csr(x, 2)
        self.assertEqual(sum(sparse_to_dense(result)), 0)
        self.assertEqual(result.shape, (1, 8))
Example #5
def delta(request):
    """suggest features for user-driven learning

    :param request: object with query_id, dataset, expected_id, diffusion, n
    :return: list of per-feature dictionaries comparing the query, diffused,
        expected, hit, and delta vectors (or an error dictionary)
    """
    doc = parse_request(request)
    # process a query
    query_id = doc["query_id"]
    dataset = doc["dataset"]
    query_raw = find_vector(query_id, dataset)
    if query_raw is None:
        return {"error": "invalid item id: " + query_id}
    query_diffused = crossmap.diffuser.diffuse(query_raw, doc["diffusion"])
    diffused = sparse_to_dense(query_diffused)
    targets, _ = crossmap.indexer.suggest(diffused,
                                          dataset=dataset,
                                          n=doc["n"])
    # get feature vector for the expected result
    expected_raw = find_vector(doc["expected_id"], dataset)
    if expected_raw is None:
        return {"error": "invalid item id: " + doc["expected_id"]}
    expected = sparse_to_dense(expected_raw)
    # prepare a collection of dense vectors
    vectors = {
        "query": sparse_to_dense(query_raw),
        "diffused": diffused,
        "expected": expected,
        "error": diffused - expected
    }
    for i, hit_id in enumerate(targets):
        i_vector = sparse_to_dense(get_vector(dataset, hit_id))
        vectors["hit_" + str(i + 1)] = i_vector
        vectors["delta_" + str(i + 1)] = i_vector - expected
    # prepare a smaller table that includes only features with non-zero values
    inv_feature_map = crossmap.indexer.encoder.inv_feature_map
    result = []
    for i in range(len(inv_feature_map)):
        i_data = [abs(_[i]) for _ in vectors.values()]
        if sum(i_data) == 0:
            continue
        i_result = dict(feature=inv_feature_map[i])
        for vector_id, vector_values in vectors.items():
            i_result[vector_id] = vector_values[i]
        result.append(i_result)
    result = sorted(result, key=decr_by_query)
    return result
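The sort key decr_by_query is not defined in this snippet. Given how it is used in the final sorted() call, it is presumably a small helper that orders rows by decreasing magnitude in the query column; a hypothetical version is sketched below.

def decr_by_query(row):
    """sort key: rows with larger absolute query values first (hypothetical)"""
    return -abs(row["query"])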
Example #6
    def test_threshold_w_negatives(self):
        """threshold preserves very negative values"""

        b = csr_matrix([0.0, 0.1, -0.5, 0.35, 0.9, -0.2, 0.0, -0.4])
        result = threshold_csr(b, 0.3)
        expected = [0.0, 0.0, -0.5, 0.35, 0.9, 0.0, 0.0, -0.4]
        self.assertListEqual(list(sparse_to_dense(result)), expected)
        self.assertEqual(result.shape, (1, 8))
Example #7
    def test_threshold_positives(self):
        """using a simple vector with positive values"""

        a = csr_matrix([0.0, 0.1, 0.5, 0.35, 0.9, 0.0, 0.0, 0.4])
        result = threshold_csr(a, 0.3)
        expected = [0.0, 0.0, 0.5, 0.35, 0.9, 0.0, 0.0, 0.4]
        self.assertListEqual(list(sparse_to_dense(result)), expected)
        self.assertEqual(result.shape, (1, 8))
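Taken together, Examples #4, #6, and #7 pin down the behavior of threshold_csr: entries whose absolute value falls below the threshold are zeroed, entries at or above it (positive or negative) survive, and the shape of the input is preserved. A sketch consistent with these tests, assuming scipy's csr_matrix API:

import numpy as np


def threshold_csr(x, threshold):
    """set entries with absolute value below the threshold to zero (sketch)"""
    result = x.copy()
    result.data[np.abs(result.data) < threshold] = 0.0
    result.eliminate_zeros()
    return result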
Example #8
    def test_normalized_collapse(self):
        """collapse to dimension, with global rescaling/normalization"""

        a = csr_matrix([0.0, 0.1, 0.5, 0.35, 0.9, 0.0, 0.0, 0.4])
        result = dimcollapse_csr(a, set([0, 1, 2, 3]), normalize=True)
        result_dense = sparse_to_dense(result)
        self.assertEqual(result_dense[4], 0.0)
        self.assertGreater(result_dense[1], 0.1)
Example #9
    def test_raw_collapse(self):
        """raw collapse, setting values to zero without normalization"""

        a = csr_matrix([0.0, 0.1, 0.5, 0.35, 0.9, 0.0, 0.0, 0.4])
        result = dimcollapse_csr(a, set([0, 1, 2, 3]), normalize=False)
        expected = [0.0, 0.1, 0.5, 0.35, 0.0, 0.0, 0.0, 0.0]
        self.assertListEqual(list(sparse_to_dense(result)), expected)
        self.assertEqual(result.shape, (1, 8))
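Examples #8 and #9 suggest that dimcollapse_csr keeps only the values at the requested column indices, zeroes everything else, and optionally rescales the surviving values to unit norm. A sketch under those assumptions:

import numpy as np


def dimcollapse_csr(x, dims, normalize=False):
    """keep only the columns listed in dims, zero the rest (sketch)

    when normalize is True, rescale the surviving values to unit norm
    """
    result = x.copy()
    keep = np.isin(result.indices, list(dims))
    result.data[~keep] = 0.0
    result.eliminate_zeros()
    if normalize:
        norm = np.sqrt(np.sum(np.square(result.data)))
        if norm > 0:
            result.data /= norm
    return result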
Example #10
    def test_diffusion_keeps_original_feature_strong(self):
        """diffusing from one feature should mantain that feature strong"""

        doc = dict(data="C")
        c_index = self.feature_map["c"][0]
        v = self.encoder.document(doc)
        # diffuse at different strengths
        # all should maintain feature "C" as the most important feature
        for w in [1, 2, 4, 8, 20]:
            result = self.diffuser.diffuse(v, dict(targets=w))
            result_dense = sparse_to_dense(result)
            result_c = result_dense[c_index]
            result_max = max(result_dense)
            self.assertEqual(result_max, result_c)
Example #11
    def test_longword_document_before_diffusion(self):
        """encoding before diffusion accounts for overlapping tokens"""

        v = self.encoder.document(self.long_b)
        self.assertGreater(len(v.data), 4)
        # overlapping tokens from "longword" should be weighted lower than
        # tokens from "B" or "C" that are stand-alone
        v_dense = sparse_to_dense(v)
        fm = self.feature_map
        self.assertGreater(v_dense[fm["b"][0]], v_dense[fm["ngwor"][0]])
        # document should be closer to L0 than to L1
        d0 = euc_dist(v_dense, self.data["L0"])
        d1 = euc_dist(v_dense, self.data["L1"])
        self.assertLess(d0, d1)
Example #12
    @classmethod
    def setUpClass(cls):
        settings = CrossmapSettings(config_longword, create_dir=True)
        cls.indexer = CrossmapIndexer(settings)
        cls.indexer.build()
        cls.diffuser = CrossmapDiffuser(settings)
        cls.diffuser.build()
        cls.feature_map = cls.diffuser.feature_map
        cls.db = cls.diffuser.db
        cls.encoder = cls.indexer.encoder
        cls.plain_tokenizer = CrossmapTokenizer(settings)
        cls.diff_tokenizer = CrossmapDiffusionTokenizer(settings)
        # extract data vectors for the targets dataset
        cls.data = dict()
        temp = cls.db.get_data(dataset="targets",
                               ids=["L0", "L1", "L2", "L3", "L4"])
        for _ in temp:
            cls.data[_["id"]] = sparse_to_dense(_["data"])
Example #13
    def test_normalize_csr_in_place(self):
        """produce a unit-norm vector from csr (in place)"""

        vec = csr_matrix([0.0, 2.0, 0.0])
        normalize_csr(vec)
        self.assertListEqual(list(sparse_to_dense(vec)), [0.0, 1.0, 0.0])

def round_list(x, digits=6):
    """helper for approximate tests, round a list"""
    if isinstance(x, csr_matrix):
        x = sparse_to_dense(x)
    return [round(_, digits) for _ in list(x)]
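A quick usage note: round_list accepts either a plain list or a csr_matrix, which keeps approximate comparisons in the tests above concise.

round_list([0.123456789, 1.0])             # [0.123457, 1.0]
round_list(csr_matrix([0.0, 2.0 / 3]), 3)  # [0.0, 0.667]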