Пример #1
0
def test_cluster_scoring_and_weights():
    """Log expected vs. actual values for the scoring and weight-list helpers.

    Exercises ct.cid_list_score, ct.clustering_score and ct.get_weight_lists
    on the graph returned by ex_graph_fig1().  Nothing is asserted: each log
    line carries the value hard-coded in the message next to the value that
    was actually computed, so the output has to be checked by reading it.
    """
    G = ex_graph_fig1()

    logger.info('=====================')
    logger.info('Testing cid_list_score')
    cids = list(ct.cids_from_range(4))
    n2c_random = {
        'a': cids[0],
        'b': cids[0],
        'f': cids[0],
        'c': cids[1],
        'g': cids[1],
        'd': cids[2],
        'e': cids[2],
        'i': cids[2],
        'h': cids[3],
        'j': cids[3],
        'k': cids[3],
    }
    clustering_random = ct.build_clustering(n2c_random)
    score = ct.cid_list_score(G, clustering_random, n2c_random,
                              [cids[0], cids[2], cids[3]])
    logger.info('Score between clusters [c0, c2, c3] should be -5 and is %s' %
                (score, ))

    logger.info('=====================')
    logger.info('Testing clustering_score')

    # First clustering: all nodes together in one cluster.
    n2c_single_cluster = {n: 'c0' for n in G.nodes}
    logger.info('Score with all together should be 21.  Score = %s' %
                (ct.clustering_score(G, n2c_single_cluster), ))

    # Second clustering: every node in its own cluster.  (The message used
    # to say "all together" here, which mislabelled the check.)
    n2c_all_separate = {n: 'c' + str(i) for i, n in enumerate(G.nodes)}
    logger.info('Score with all separate should be -21.  Score = %s' %
                (ct.clustering_score(G, n2c_all_separate), ))

    # Third clustering: the optimal one, worked out by hand.
    cids = list(ct.cids_from_range(4))
    n2c_optimal = {
        'a': cids[0],
        'b': cids[0],
        'd': cids[0],
        'e': cids[0],
        'c': cids[1],
        'h': cids[2],
        'i': cids[2],
        'f': cids[3],
        'g': cids[3],
        'j': cids[3],
        'k': cids[3],
    }
    logger.info('Optimal score should be 49. Score = %s' %
                (ct.clustering_score(G, n2c_optimal), ))

    # Weight lists, requested with sort_positive=True.
    negatives, positives = ct.get_weight_lists(G, sort_positive=True)
    logger.info('Length of negatives should be 10.  It is %s' %
                (len(negatives), ))
    logger.info('Length of positives should be 11.  It is %s' %
                (len(positives), ))
    logger.info('0th positive should be 8.  It is %s' % (positives[0], ))
    logger.info('Last positive should be 2.  It is %s' % (positives[-1], ))
Пример #2
0
def test_build_clustering_from_clusters():
    """Log checks of ct.build_clustering_from_clusters on valid and bad input.

    First builds a clustering from three disjoint node lists and logs whether
    the cluster count and contents match.  Then feeds two malformed inputs
    (a repeated node inside one list, and a node shared by two lists) and
    logs whether the call raised AssertionError.  A missing AssertionError is
    now reported explicitly instead of passing silently.
    """
    logger.info('================================')
    logger.info('test_build_clustering_from_clusters')
    clist = [['h', 'i', 'j'], ['k', 'm'], ['p']]
    n = len(clist)
    cids = list(ct.cids_from_range(n))
    clustering = ct.build_clustering_from_clusters(cids, clist)
    logger.info('Returned clustering:')
    logger.info(clustering)
    correct = len(clustering) == 3
    logger.info('Correct number of clusters %s' % (correct, ))
    correct = (set(clist[0]) == clustering[cids[0]]
               and set(clist[1]) == clustering[cids[1]]
               and set(clist[2]) == clustering[cids[2]])
    logger.info('Clusters are correct: %s' % (correct, ))

    #  Catching error from repeated entry
    clist = [['h', 'i', 'j'], ['k', 'm'], ['p', 'p']]
    n = len(clist)
    try:
        ct.build_clustering_from_clusters(ct.cids_from_range(n), clist)
    except AssertionError:
        logger.info('Caught error from having repeated entry in one cluster')
    else:
        # Without this branch a missing check would go unnoticed.
        logger.info('Failed to catch error from having repeated entry in '
                    'one cluster')

    #  Catching error from intersecting lists
    clist = [['h', 'i', 'k'], ['k', 'm'], ['p', 'q']]
    n = len(clist)
    try:
        ct.build_clustering_from_clusters(ct.cids_from_range(n), clist)
    except AssertionError:
        logger.info('Caught error from having intersecting lists')
    else:
        logger.info('Failed to catch error from having intersecting lists')
Пример #3
0
def test_count_equal():
    """Log the result of ct.count_equal_clustering for a fixed gt/est pair.

    Five of the ground-truth clusters ({a,b}, {c}, {d,e}, {f,g,h} and
    {y,z,aa}) appear with identical membership in the estimate, which is the
    value quoted in the log message.
    """
    cids = list(ct.cids_from_range(99))

    # (index into cids, members) for the ground-truth clustering.
    gt_spec = [
        (0, {'a', 'b'}),
        (3, {'c'}),
        (4, {'d', 'e'}),
        (6, {'f', 'g', 'h'}),
        (8, {'i', 'j', 'k', 'l', 'm'}),
        (10, {'o'}),
        (13, {'p', 'q'}),
        (15, {'r', 's', 't'}),
        (16, {'u', 'v', 'w'}),
        (19, {'y', 'z', 'aa'}),
    ]
    gt = {cids[idx]: members for idx, members in gt_spec}

    # (index into cids, members) for the estimated clustering.
    est_spec = [
        (25, {'y', 'z', 'aa'}),
        (29, {'u', 'v'}),
        (31, {'w', 'r', 's', 't'}),
        (37, {'p'}),
        (41, {'q', 'o', 'm'}),
        (43, {'i', 'j', 'k', 'l'}),
        (47, {'a', 'b'}),
        (53, {'c'}),
        (59, {'d', 'e'}),
        (61, {'f', 'g', 'h'}),
    ]
    est = {cids[idx]: members for idx, members in est_spec}

    est_n2c = ct.build_node_to_cluster_mapping(est)
    num_equal = ct.count_equal_clustering(gt, est, est_n2c)
    message = 'test_count_equal: should be 5 and is %s' % (num_equal, )
    logger.info(message)
Пример #4
0
def test_replace_clusters():
    """Log the state of a clustering after ct.replace_clusters is applied.

    Clusters c2 and c4 are replaced by two new clusters (c5 and c7); the
    resulting cluster ids, the new clusters' contents and the node-to-cluster
    entries for 'h' and 'j' are logged next to the values expected.
    """
    logger.info('===========================')
    logger.info('test replace_clusters')
    cids = list(ct.cids_from_range(8))

    # (index into cids, nodes) -- every node is a single character.
    groups = [(0, 'abde'), (1, 'c'), (2, 'hi'), (3, 'fg'), (4, 'jk')]
    n2c = {node: cids[idx] for idx, members in groups for node in members}
    clustering = ct.build_clustering(n2c)

    added_clusters = {cids[5]: set(['j']), cids[7]: set(['h', 'i', 'k'])}
    old_cids = [cids[2], cids[4]]
    ct.replace_clusters(old_cids, added_clusters, clustering, n2c)

    remaining_cids = list(clustering.keys())
    logger.info('Cluster ids, should be c0, c1, c3, c5, c7.  Are: %s' %
                (remaining_cids, ))
    logger.info("clustering[c5] should be {'j'}!! and is %s" %
                (clustering[cids[5]], ))
    logger.info("clustering[c7] should be {'h', 'i', 'k'} and is %s" %
                (clustering[cids[7]], ))
    logger.info("n2c['h'] should be c7 and is %s" % (n2c['h'], ))
    logger.info("n2c['j'] should be c5 and is %s" % (n2c['j'], ))
Пример #5
0
def test_comparisons():
    """Log ct.compare_by_lengths and ct.percent_and_PR against expectations.

    Uses a fixed ground-truth / estimated clustering pair.  The expected
    table for compare_by_lengths is logged verbatim, and the expected
    fraction-correct / precision / recall triple is derived from hand-counted
    values (5 correct clusters, tp=18, fp=6, fn=7).
    """
    cids = list(ct.cids_from_range(99))

    # (index into cids, members) for the ground-truth clustering.
    gt_spec = [
        (0, {'a', 'b'}),
        (3, {'c'}),
        (4, {'d', 'e'}),
        (6, {'f', 'g', 'h'}),
        (8, {'i', 'j', 'k', 'l', 'm'}),
        (10, {'o'}),
        (13, {'p', 'q'}),
        (15, {'r', 's', 't'}),
        (16, {'u', 'v', 'w'}),
        (19, {'y', 'z', 'aa'}),
    ]
    gt = {cids[idx]: members for idx, members in gt_spec}
    gt_n2c = ct.build_node_to_cluster_mapping(gt)

    # (index into cids, members) for the estimated clustering.
    est_spec = [
        (25, {'y', 'z', 'aa'}),
        (29, {'u', 'v'}),
        (31, {'w', 'r', 's', 't'}),
        (37, {'p'}),
        (41, {'q', 'o', 'm'}),
        (43, {'i', 'j', 'k', 'l'}),
        (47, {'a', 'b'}),
        (53, {'c'}),
        (59, {'d', 'e'}),
        (61, {'f', 'g', 'h'}),
    ]
    est = {cids[idx]: members for idx, members in est_spec}
    est_n2c = ct.build_node_to_cluster_mapping(est)

    for banner in ('================', 'test_comparisons',
                   'ct.compare_by_lengths'):
        logger.info(banner)

    ct.compare_by_lengths(est, est_n2c, gt)

    expected_table = '\n'.join([
        'Output for this example should be:',
        '1, 2, 1, 0.50, 0.667',
        '2, 3, 2, 0.67, 0.833',
        '3, 4, 2, 0.50, 0.854',
        '5, 1, 0, 0.00, 0.800',
    ])
    logger.info(expected_table)

    logger.info('------')
    logger.info('ct.pairwise_eval')
    result = ct.percent_and_PR(est, est_n2c, gt, gt_n2c)
    logger.info('Result is [%1.3f, %1.3f, %1.3f]' % tuple(result))

    # Hand-counted reference values for this example.
    tp, fp, fn = 18, 6, 7
    num_correct = 5
    frac_correct = num_correct / len(est)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    logger.info('Should be [%1.3f, %1.3f, %1.3f]' %
                (frac_correct, precision, recall))
Пример #6
0
def test_form_connected_cluster_pairs():
    """Log ct.form_connected_cluster_pairs results next to the expected pairs.

    Runs the function twice on the ex_graph_fig1() graph with a five-cluster
    assignment: once without a cid restriction and once passing the list
    [c1, c4] as the fourth argument.
    """
    logger.info('=================================')
    logger.info('test form_connected_cluster_pairs')
    G = ex_graph_fig1()
    cids = list(ct.cids_from_range(5))

    # (index into cids, nodes) -- every node is a single character.
    groups = [(0, 'abde'), (1, 'c'), (2, 'hi'), (3, 'fg'), (4, 'jk')]
    n2c = {node: cids[idx] for idx, members in groups for node in members}
    clustering = ct.build_clustering(n2c)

    def as_cid_pairs(index_pairs):
        # Translate (i, j) index pairs into (cid, cid) tuples.
        return [(cids[p], cids[q]) for p, q in index_pairs]

    # Unrestricted call.
    cid_pairs = ct.form_connected_cluster_pairs(G, clustering, n2c)
    logger.info('form_connected_cluster_pairs(G, clustering, n2c)')
    logger.info('result:  %s' % (cid_pairs, ))
    expected_all = as_cid_pairs(
        [(0, 1), (0, 2), (0, 3), (1, 3), (2, 3), (2, 4), (3, 4)])
    logger.info('expecting: %s' % (expected_all, ))

    # Call restricted by new_cids.
    new_cids = [cids[1], cids[4]]
    cid_pairs = ct.form_connected_cluster_pairs(G, clustering, n2c, new_cids)
    logger.info('form_connected_cluster_pairs(G, clustering, n2c, new_cids)')
    logger.info('result:  %s' % (cid_pairs, ))
    expected_new = as_cid_pairs([(0, 1), (1, 3), (2, 4), (3, 4)])
    logger.info('expecting: %s' % (expected_new, ))
Пример #7
0
def test_same_clustering():
    """Log the outcome of ct.same_clustering for three comparisons.

    clustering1 holds the same node sets as clustering0 under different cids;
    clustering2 merges {'c'} and {'d', 'e'} into one cluster.  The third
    positional argument is passed as True, False and True respectively, and
    the log messages state what output each call is expected to produce.
    """
    cids = list(ct.cids_from_range(99))

    spec0 = [
        (0, {'a', 'b'}),
        (3, {'c'}),
        (4, {'d', 'e'}),
        (6, {'f', 'g', 'h'}),
        (8, {'i', 'j', 'k', 'l', 'm'}),
    ]
    spec1 = [
        (6, {'d', 'e'}),
        (8, {'c'}),
        (16, {'f', 'g', 'h'}),
        (19, {'i', 'k', 'l', 'm', 'j'}),
        (20, {'b', 'a'}),
    ]
    spec2 = [
        (6, {'d', 'c', 'e'}),
        (16, {'f', 'g', 'h'}),
        (22, {'i', 'j', 'k', 'l', 'm'}),
        (25, {'b', 'a'}),
    ]
    clustering0 = {cids[idx]: members for idx, members in spec0}
    clustering1 = {cids[idx]: members for idx, members in spec1}
    clustering2 = {cids[idx]: members for idx, members in spec2}

    logger.info('====================')
    logger.info('test_same_clustering')

    logger.info('first test should generate no output and then return True')
    first = ct.same_clustering(clustering0, clustering1, True)
    logger.info(first)

    logger.info('second test should generate no output and then return False')
    second = ct.same_clustering(clustering0, clustering2, False)
    logger.info(second)

    logger.info(
        'third test should generate mismatch output and then return False')
    for expected_line in ('Expected:',
                          "['c'] not in 2nd",
                          "['d', 'e'] not in 2nd",
                          "['c', 'd', 'e'] not in 1st"):
        logger.info(expected_line)
    result = ct.same_clustering(clustering0, clustering2, True)
    logger.info('It returned %s' % (result, ))
Пример #8
0
def test_shift_between_clusters():
    """Log cluster contents before and after ct.shift_between_clusters.

    Moves nodes 'f' and 'j' from cluster c3 to cluster c2, then logs both
    clusters and the node-to-cluster entries of the six nodes involved.
    """
    logger.info('===========================')
    logger.info('test_shift_between_clusters')
    cids = list(ct.cids_from_range(4))

    # (index into cids, nodes) -- every node is a single character.
    groups = [(0, 'abde'), (1, 'c'), (2, 'hi'), (3, 'fgjk')]
    n2c_optimal = {
        node: cids[idx] for idx, members in groups for node in members
    }
    clustering = ct.build_clustering(n2c_optimal)

    n0_cid = cids[3]
    n1_cid = cids[2]
    n0_nodes_to_move = {'f', 'j'}

    logger.info('Shifting from cluster %s to %s:' % (n0_cid, n1_cid))
    logger.info('Nodes to move: %s' % (sorted(n0_nodes_to_move), ))
    for cid in (n0_cid, n1_cid):
        logger.info('Cluster %s: %s' % (cid, sorted(clustering[cid])))

    ct.shift_between_clusters(n0_cid, n0_nodes_to_move, n1_cid, clustering,
                              n2c_optimal)

    for cid in (n0_cid, n1_cid):
        logger.info('After shift, cluster %s: %s' %
                    (cid, sorted(clustering[cid])))

    logger.info("n2c['f'] = %s" % (n2c_optimal['f'], ))
    logger.info("n2c['j'] = %s" % (n2c_optimal['j'], ))
    logger.info("n2c['h'] = %s" % (n2c_optimal['h'], ))
    logger.info("n2c['i'] = %s" % (n2c_optimal['i'], ))
    logger.info("n2c['g'] = %s" % (n2c_optimal['g'], ))
    logger.info("n2c['k'] = %s" % (n2c_optimal['k'], ))
Пример #9
0
    def generate(self):
        """Generate a simulated ground-truth clustering and its match edges.

        Cluster sizes are drawn as ``1 + round(gamma(gamma_shape,
        gamma_scale))``, one per ``params['num_clusters']``.  Within each
        cluster, node pairs are linked with probability
        ``params['p_ranker_correct']`` ("correct" matches); each node is then
        topped up with randomly chosen nodes from other clusters ("incorrect"
        matches) until it has ``params['num_from_ranker']`` matches.  Edge
        weights come from ``self.wgtr.random_wgt``.

        If fewer nodes exist outside a node's cluster than are needed to
        reach ``num_from_ranker``, the node simply ends up with fewer
        matches (previously this case looped forever).

        Side effects: fills ``self.gt_clustering``, ``self.gt_node2cid`` and
        ``self.ranker_matches``; adds the edges to ``self.G`` and
        ``self.G_orig``; sets ``self.r_clustering`` and ``self.r_node2cid``.

        Returns:
            A pair ``(edges, aug_names)`` where ``edges`` is a list of
            ``(n0, n1, weight, aug_name)`` tuples, all tagged with
            ``aug_names[0]``, and ``aug_names`` is ``['verifier', 'human']``.

        Raises:
            AssertionError: if ``params['num_from_ranker']`` is not positive.
        """
        # Validate before any state is touched or random numbers are drawn.
        num_from_ranker = self.params['num_from_ranker']
        assert num_from_ranker > 0

        expected_nodes = 1 + self.params['gamma_shape'] * self.params['gamma_scale']

        # Width used to zero-pad the numeric part of node ids.
        digits_per_node = 2 + int(m.log10(expected_nodes))
        next_index = 0
        nodes = []  # list of node ids
        edges = []  # list of edge 3-tuples

        num_correct_positive = 0
        num_correct_negative = 0
        num_correct_zero = 0
        num_incorrect_positive = 0
        num_incorrect_negative = 0
        num_incorrect_zero = 0

        # Step 0: draw the size of every cluster.
        samples = np.random.gamma(
            self.params['gamma_shape'],
            self.params['gamma_scale'],
            self.params['num_clusters'],
        )
        samples = 1 + np.round(samples)
        samples = samples.astype(int)

        # Step 1:
        # Generate the clusters, the nodes within the cluster, and the
        # "correct" inter-cluster edges.  Note that since we are
        # assuming an imperfect ranking algorithm, this does not ensure
        # that each cluster is connected.
        cids = ct.cids_from_range(len(samples), prefix='ct')
        for cluster_idx, cid in enumerate(cids):
            members = []
            self.gt_clustering[cid] = members

            # Create the nodes in the cluster
            for _ in range(samples[cluster_idx]):
                node_id = 'n' + str(next_index).zfill(digits_per_node)
                next_index += 1
                nodes.append(node_id)
                members.append(node_id)
                self.gt_node2cid[node_id] = cid
                self.ranker_matches[node_id] = set()

            #  Create the positive edges between nodes in a cluster.
            #  These are symmetric. Don't allow more than num_from_ranker
            #  matches / edges for any node.
            for i, ith_node in enumerate(members):
                for j in range(i + 1, len(members)):
                    # The random draw happens for every pair, accepted or
                    # not, so the random stream matches the original code.
                    prob = random.uniform(0, 1)
                    jth_node = members[j]
                    if prob < self.params['p_ranker_correct'] and \
                       len(self.ranker_matches[ith_node]) < num_from_ranker and \
                       len(self.ranker_matches[jth_node]) < num_from_ranker:
                        self.ranker_matches[ith_node].add(jth_node)
                        self.ranker_matches[jth_node].add(ith_node)
                        is_match_correct = True
                        wgt = self.wgtr.random_wgt(is_match_correct)
                        if wgt > 0:
                            num_correct_positive += 1
                        elif wgt == 0:
                            num_correct_zero += 1
                        else:
                            num_correct_negative += 1

                        e = (ith_node, jth_node, wgt)
                        edges.append(e)

        num_nodes = len(nodes)

        # Change the list to a set
        self.gt_clustering = {
            cid: set(cluster) for cid, cluster in self.gt_clustering.items()
        }

        # Step 2:
        # Generate "incorrect" match edges, sufficient to have the required
        # number of edges generated by the ranking algorithm.
        for ith_node in nodes:
            matches = self.ranker_matches[ith_node]
            cluster = self.gt_clustering[self.gt_node2cid[ith_node]]

            # Every match recorded so far for this node lies inside its own
            # cluster (Step 1 only links cluster mates), so the only
            # candidates left are the nodes outside the cluster.  Cap the
            # target at what is reachable; otherwise the rejection-sampling
            # loop below would never terminate when too few outside nodes
            # exist (e.g. a single cluster).
            num_outside = num_nodes - len(cluster)
            target = min(num_from_ranker, len(matches) + num_outside)

            # Generate (incorrect) edges between clusters
            is_match_correct = False
            while len(matches) < target:
                j = random.randint(0, num_nodes - 1)
                jth_node = nodes[j]
                if jth_node not in matches and jth_node not in cluster:
                    matches.add(jth_node)
                    wgt = self.wgtr.random_wgt(is_match_correct)
                    if wgt > 0:
                        num_incorrect_positive += 1
                    elif wgt == 0:
                        num_incorrect_zero += 1
                    else:
                        num_incorrect_negative += 1

                    # Store the edge with its endpoints in sorted order.
                    if ith_node < jth_node:
                        e = (ith_node, jth_node, wgt)
                    else:
                        e = (jth_node, ith_node, wgt)
                    edges.append(e)

        self.G.add_weighted_edges_from(edges)
        logging.info('simulator::generate: adding %d edges' % len(edges))
        logging.info('%d correct match edges have positive weight' % num_correct_positive)
        logging.info('%d correct match edges have zero weight' % num_correct_zero)
        logging.info('%d correct match edges have negative weight' % num_correct_negative)
        logging.info(
            '%d incorrect match edges have positive weight' % num_incorrect_positive
        )
        logging.info('%d incorrect match edges have zero weight' % num_incorrect_zero)
        logging.info(
            '%d incorrect match edges have negative weight' % num_incorrect_negative
        )

        self.G_orig.add_nodes_from(self.G)
        self.G_orig.add_weighted_edges_from(edges)

        # Step 3: Generate the "reachable" ground truth, the obtainable
        # result given simulated failures to match that could disconnect
        # a correct match.
        self.r_clustering = dict()
        k = 0
        for cc in self.gt_clustering.values():
            H = self.G.subgraph(cc)
            prev_k = k
            # Each connected component of the cluster's induced subgraph
            # becomes its own "reachable" cluster, keyed by a running integer.
            for new_cc in nx.connected_components(H):
                self.r_clustering[k] = new_cc
                k += 1
            if k - prev_k > 1:
                logger.info('GT cluster %a split into %a ...' % (cc, k - prev_k))
                for piece_idx in range(prev_k, k):
                    logger.info('   %a' % self.r_clustering[piece_idx])
            else:
                logger.info('GT cluster %a is intact' % cc)
        self.r_node2cid = ct.build_node_to_cluster_mapping(self.r_clustering)

        # Step 4: Reconfigure edges to make the expected input to the
        # graph algorithm weight manager.
        aug_names = ['verifier', 'human']
        edges = [(n0, n1, w, aug_names[0]) for n0, n1, w in edges]

        return edges, aug_names
Пример #10
0
def test_merge():
    """Log checks of ct.score_delta_after_merge and ct.merge_clusters.

    The same hand-built four-cluster assignment on ex_graph_fig1() is used
    twice.  Each pass logs the score delta of a hypothetical merge of c2 and
    c3, then actually merges c0 and c2 and logs the returned delta next to
    the difference in ct.clustering_score, followed by the resulting
    clustering and node-to-cluster mapping.  The second pass passes the
    cluster ids in reversed order.
    """
    logger.info('===========================')
    logger.info('test_merge')
    G = ex_graph_fig1()
    cids = list(ct.cids_from_range(4))
    logger.info(cids)

    def fresh_state():
        # Rebuild the starting assignment; merge_clusters is handed both
        # structures, so each pass starts from new objects.
        groups = [(0, 'abde'), (1, 'c'), (2, 'hi'), (3, 'fgjk')]
        node2cid = {
            node: cids[idx] for idx, members in groups for node in members
        }
        return node2cid, ct.build_clustering(node2cid)

    def run_pass(delta_pair, delta_msg, merge_pair, node2cid, clusters):
        # One round: hypothetical-merge delta, then a real merge plus dumps.
        logger.info('-------------')
        logger.info('score_delta_after_merge')
        delta = ct.score_delta_after_merge(delta_pair[0], delta_pair[1], G,
                                           clusters)
        logger.info(delta_msg % (delta, ))

        logger.info('-------------')
        logger.info('merge_clusters')
        score_before = ct.clustering_score(G, node2cid)
        delta = ct.merge_clusters(merge_pair[0], merge_pair[1], G, clusters,
                                  node2cid)
        score_after = ct.clustering_score(G, node2cid)
        logger.info('delta = %s should be %s' %
                    (delta, score_after - score_before))
        logger.info('---')
        for cid in clusters:
            logger.info('%s: %s' % (cid, clusters[cid]))
        logger.info('---')
        for node in G.nodes:
            logger.info('%s: %s' % (node, node2cid[node]))

    n2c_optimal, clustering = fresh_state()
    run_pass(
        (cids[2], cids[3]),
        'possible merge of 2, 3; delta should be -4, and is %s',
        (cids[0], cids[2]),
        n2c_optimal,
        clustering,
    )

    logger.info('--------')
    logger.info('Retesting merge with order of clusters reversed')
    n2c_optimal, clustering = fresh_state()
    run_pass(
        (cids[3], cids[2]),
        'possible merge of 3, 2; delta should be -4, and is %s',
        (cids[2], cids[0]),
        n2c_optimal,
        clustering,
    )