def pprint_short(self, initial_str='', stop_after_from=False):
    """Log a one-line summary of this LCA's "from" and "to" clusterings.

    The summary is emitted through ``logger.info``.  When the logger's
    effective level is DEBUG or lower, the stored scores are re-checked
    against ``ct.clustering_score`` and any mismatch is appended to the
    summary.

    Args:
        initial_str: Text placed in front of the summary.
        stop_after_from: When true, only the "from" clusters are reported.

    Returns:
        None.
    """
    # Both score checks share one tolerance so that floating-point noise
    # is never reported as a scoring error.
    score_tol = 1e-7
    verify_scores = logger.getEffectiveLevel() <= logging.DEBUG

    parts = [initial_str + 'From cids:']
    for cid in sorted(self.from_clusters.keys()):
        parts.append(' %s: %a' % (cid, sorted(self.from_clusters[cid])))

    if verify_scores:
        check_score = ct.clustering_score(self.subgraph, self.from_n2c)
        if abs(check_score - self.from_score) > score_tol:
            # %s (not %d) so that sub-integer discrepancies stay visible.
            parts.append(
                '\nfrom score error: should be %s, but is %s\n'
                % (check_score, self.from_score)
            )

    if stop_after_from or self.to_clusters is None:
        logger.info(''.join(parts))
        return

    parts.append('; to:')
    for cid in sorted(self.to_clusters.keys()):
        parts.append(' %a' % sorted(self.to_clusters[cid]))

    if verify_scores:
        check_score = ct.clustering_score(self.subgraph, self.to_n2c)
        if abs(check_score - self.to_score) > score_tol:
            parts.append(
                '\nto score error: should be %s, but is %s\n'
                % (check_score, self.to_score)
            )

    parts.append('; delta %d' % self.delta_score())
    logger.info(''.join(parts))
def test_cluster_scoring_and_weights():
    """Log the results of the scoring helpers on the example graph.

    Exercises ``ct.cid_list_score``, ``ct.clustering_score`` and
    ``ct.get_weight_lists`` on the graph returned by ``ex_graph_fig1`` and
    logs each result next to the value it is expected to have.  Nothing is
    asserted; the log output is meant to be inspected.
    """
    G = ex_graph_fig1()

    logger.info('=====================')
    logger.info('Testing cid_list_score')
    cids = list(ct.cids_from_range(4))
    n2c_random = {
        'a': cids[0],
        'b': cids[0],
        'f': cids[0],
        'c': cids[1],
        'g': cids[1],
        'd': cids[2],
        'e': cids[2],
        'i': cids[2],
        'h': cids[3],
        'j': cids[3],
        'k': cids[3],
    }
    clustering_random = ct.build_clustering(n2c_random)
    score = ct.cid_list_score(
        G, clustering_random, n2c_random, [cids[0], cids[2], cids[3]]
    )
    logger.info(
        'Score between clusters [c0, c2, c3] should be -5 and is %s' % (score, )
    )

    logger.info('=====================')
    logger.info('Testing clustering_score')

    # First clustering: all nodes together in one cluster.
    n2c_single_cluster = {n: 'c0' for n in G.nodes}
    logger.info(
        'Score with all together should be 21.  Score = %s'
        % (ct.clustering_score(G, n2c_single_cluster), )
    )

    # Second clustering: every node in its own cluster.
    n2c_all_separate = {n: 'c' + str(i) for i, n in enumerate(G.nodes)}
    logger.info(
        'Score with all separate should be -21.  Score = %s'
        % (ct.clustering_score(G, n2c_all_separate), )
    )

    # Third clustering: the optimum, constructed by hand.
    cids = list(ct.cids_from_range(4))
    n2c_optimal = {
        'a': cids[0],
        'b': cids[0],
        'd': cids[0],
        'e': cids[0],
        'c': cids[1],
        'h': cids[2],
        'i': cids[2],
        'f': cids[3],
        'g': cids[3],
        'j': cids[3],
        'k': cids[3],
    }
    logger.info(
        'Optimal score should be 49. Score = %s'
        % (ct.clustering_score(G, n2c_optimal), )
    )

    negatives, positives = ct.get_weight_lists(G, sort_positive=True)
    logger.info('Length of negatives should be 10.  It is %s' % (len(negatives), ))
    logger.info('Length of positives should be 11.  It is %s' % (len(positives), ))
    logger.info('0th positive should be 8.  It is %s' % (positives[0], ))
    logger.info('Last positive should be 2.  It is %s' % (positives[-1], ))
def run_lca_alg2(G, best_clustering, exp_alt_clustering, msg, trace_on=False):
    """Run lca_alg2 and log whether it produced the expected alternative.

    The expected score is computed from ``exp_alt_clustering``.  Both the
    clustering and the score returned by ``lca_alg2`` are compared against
    the expectation; on any mismatch the resulting structures are printed.
    """
    exp_alt_score = ct.clustering_score(
        G, ct.build_node_to_cluster_mapping(exp_alt_clustering)
    )
    best_node2cid = ct.build_node_to_cluster_mapping(best_clustering)
    alt_clustering, alt_score = lca_alg2(
        G, best_clustering, best_node2cid, trace_on=trace_on
    )

    clusters_match = ct.same_clustering(alt_clustering, exp_alt_clustering)
    if clusters_match:
        logger.info('%s success' % (msg, ))
    else:
        logger.info('%s FAILED' % (msg, ))

    scores_match = alt_score == exp_alt_score
    if not scores_match:
        logger.info(
            'score %d, expected_score %d. FAILED' % (alt_score, exp_alt_score)
        )

    if clusters_match and scores_match:
        return

    # Something went wrong: dump what the algorithm actually produced.
    logger.info('current structures with failure:')
    alt_node2cid = ct.build_node_to_cluster_mapping(alt_clustering)
    ct.print_structures(G, alt_clustering, alt_node2cid, alt_score)
def test_lca_alg1_constrained():
    """Run lca_alg1_constrained on the example graph and log the outcome.

    Forces ('f', 'i') into the same cluster and ('d', 'e') into different
    clusters, then logs whether the clustering matches the expected one and
    whether the returned score agrees with ``ct.clustering_score``.
    """
    logger.info('\n=========================\n'
                'Test lca_alg1_constrained\n'
                '=========================')
    graph = tct.ex_graph_fig1()
    # A little larger than the original weight, to break a tie.
    graph['g']['j']['weight'] = -4

    must_join = [('f', 'i')]
    must_split = [('d', 'e')]
    clustering, score = lca_alg1_constrained(graph, must_join, must_split)
    correct_score = ct.clustering_score(
        graph, ct.build_node_to_cluster_mapping(clustering)
    )

    expected = {
        0: {'a', 'b', 'd'},
        1: {'f', 'g', 'h', 'i', 'k'},
        2: {'c'},
        3: {'e'},
        4: {'j'},
    }
    if not ct.same_clustering(clustering, expected, output_differences=True):
        logger.info('constrained (d,e) different and (f,i) same: FAIL')
    else:
        logger.info('constrained (d,e) different and (f,i) same: success')

    if score == correct_score:
        template = 'scoring correct: actual %a, correct %a'
    else:
        template = 'scoring error: actual %a, correct %a'
    logger.info(template % (score, correct_score))
def build_example_LCA():
    """Build an example LCA over clusters 2 and 3 of the example graph.

    Returns:
        A pair ``(lca, G)``: the LCA object, with its "to" clusters already
        set, and the full example graph it was built from.
    """
    G = tct.ex_graph_fig1()

    # Hand-built clustering of the example graph: node -> cluster id.
    members_by_cid = (
        (0, 'abde'),
        (1, 'c'),
        (2, 'hi'),
        (3, 'fgjk'),
    )
    n2c_optimal = {}
    for cluster_id, members in members_by_cid:
        for node in members:
            n2c_optimal[node] = cluster_id
    clustering_opt = ct.build_clustering(n2c_optimal)

    cid0, cid1 = 2, 3
    nodes_in_clusters = list(clustering_opt[2] | clustering_opt[3])
    from_subG = G.subgraph(nodes_in_clusters)
    from_score = ct.cid_list_score(
        from_subG, clustering_opt, n2c_optimal, [cid0, cid1]
    )
    a = LCA(from_subG, clustering_opt, [cid0, cid1], from_score)

    # Alternative ("to") clustering of the same nodes.
    to_clusters = {0: {'f', 'h', 'i', 'j'}, 1: {'g', 'k'}}
    to_subG = G.subgraph(nodes_in_clusters)
    to_node2cid = {}
    for cid in range(len(to_clusters)):
        for n in to_clusters[cid]:
            to_node2cid[n] = cid
    a.set_to_clusters(to_clusters, ct.clustering_score(to_subG, to_node2cid))

    return a, G
def best_alternative_len2(G, clustering, node2cid):
    """Return the best alternative clustering for a two-node graph.

    With two current clusters the alternative is a single cluster holding
    every node; otherwise each node gets a cluster of its own.  The
    ``node2cid`` argument is accepted but not used.

    Returns:
        A pair ``(alt_clustering, alt_score)``.
    """
    if len(clustering) != 2:
        # Currently together: the alternative is to split them apart.
        alt_clustering = {idx: {node} for idx, node in enumerate(G.nodes())}
    else:
        # Currently apart: the alternative is to put them together.
        alt_clustering = {0: set(G.nodes())}

    alt_score = ct.clustering_score(
        G, ct.build_node_to_cluster_mapping(alt_clustering)
    )
    return alt_clustering, alt_score
def __init__(self, edges, clusters, aug_names, params, aug_request_cb, aug_result_cb):
    """Build the graph, the initial clustering and the initial LCA queues.

    Args:
        edges: Initial edges; converted to weighted edges by the weight
            manager's ``get_initial_edges``.
        clusters: Initial clusters, handed to ``self.build_clustering``.
        aug_names: Passed through to ``wm.weight_manager``.
        params: Parameter mapping.  This method reads the keys
            'tries_before_edge_done', 'draw_iterations' and
            'drawing_prefix'.
        aug_request_cb: Passed through to ``wm.weight_manager``.
        aug_result_cb: Passed through to ``wm.weight_manager``.
    """
    self.params = params
    logger.info('======================================')
    logger.info('Construction of graph_algorithm object')
    logger.info(dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    # Weight manager and the weighted graph built from the initial edges.
    self.weight_mgr = wm.weight_manager(aug_names, params['tries_before_edge_done'], aug_request_cb, aug_result_cb)
    self.G = nx.Graph()
    weighted_edges = self.weight_mgr.get_initial_edges(edges)
    self.G.add_weighted_edges_from(weighted_edges)
    logger.info('Initial graph has %d nodes and %d edges' % (len(self.G.nodes), len(self.G.edges)))

    # Initial clustering and its score.
    # NOTE(review): build_clustering presumably sets self.clustering (it is
    # read on the next line) and consumes self._next_cid -- confirm.
    self._next_cid = 0
    self.build_clustering(clusters)
    self.node2cid = ct.build_node_to_cluster_mapping(self.clustering)
    self.score = ct.clustering_score(self.G, self.node2cid)

    # Start in the 'scoring' phase with LCAs built from cluster pairs.  If
    # that yields no LCAs at all, switch straight to 'splitting' and build
    # LCAs from single clusters instead.
    self.phase = 'scoring'
    self.cid2lca = cid_to_lca.CID2LCA()
    self.queues = lca_queues.lca_queues()
    self.new_lcas(self.clustering.keys(), use_pairs=True, use_singles=False)
    if self.queues.num_lcas() == 0:
        logger.info("Phase shift immediately into 'splitting'")
        self.phase = 'splitting'
        self.new_lcas(self.clustering.keys(), use_pairs=False, use_singles=True)
    self.queues.info_long(max_entries=10)

    # Bookkeeping counters and the set of removed nodes.
    self.num_verifier_results = 0
    self.num_human_results = 0
    self.removed_nodes = set()

    # Optional drawing object, created only when requested in params.
    self.draw_obj = None
    if self.params['draw_iterations']:
        self.draw_obj = draw_lca.draw_lca(self.params['drawing_prefix'])

    # The callbacks below all start as None and have to be assigned
    # after construction.
    """ Need to set these callbacks to request and receive information from the verfication algorithm and to do the same from human reviewers. """
    self.remove_nodes_cb = None
    self.status_request_cb = None
    self.status_return_cb = None
    self.results_request_cb = None
    self.results_return_cb = None
    self.log_request_cb = None
    self.log_return_cb = None
    self.trace_start_human_gt_cb = None
    self.trace_iter_compare_to_gt_cb = None
    self.should_stop_cb = None
    logger.info('Completed graph algorithm initialization')
def pprint(self, stop_after_from=False):
    """Log a multi-line description of this LCA.

    Reports the "from" node-to-cluster mapping, the subgraph nodes and the
    "from" clusters, followed (unless suppressed) by the "to" clusters, the
    score difference and the inconsistent pairs.  Stored scores are
    re-checked against ``ct.clustering_score`` and mismatches are logged.

    Args:
        stop_after_from: When true, stop after the "from" clusters.

    Returns:
        None.
    """
    # Tolerance for stored-vs-recomputed score comparison, so that
    # floating-point noise is not reported as a scoring error.
    score_tol = 1e-7

    logger.info('from_n2c: %s' % (self.from_n2c, ))
    logger.info('subgraph nodes %s' % (self.subgraph.nodes(), ))
    check_score = ct.clustering_score(self.subgraph, self.from_n2c)
    logger.info(
        'from clusters (score = %a, checking %a):'
        % (self.from_score, check_score)
    )
    if abs(self.from_score - check_score) > score_tol:
        logger.info('lca: SCORING ERROR in from')
    for cid in sorted(self.from_clusters.keys()):
        logger.info('    %s: %a' % (cid, self.from_clusters[cid]))

    # No "to" clustering has been set yet (or the caller does not want it):
    # there is nothing further that can be reported.
    if stop_after_from or self.to_clusters is None:
        return

    check_score = ct.clustering_score(self.subgraph, self.to_n2c)
    logger.info(
        'to clusters (score = %a, checking = %a):'
        % (self.to_score, check_score)
    )
    if abs(self.to_score - check_score) > score_tol:
        logger.info('SCORING ERROR in to')
    for cid in sorted(self.to_clusters.keys()):
        # %s rather than %d so that non-integer cluster ids also print.
        logger.info('    %s: %a' % (cid, self.to_clusters[cid]))
    logger.info('score_difference %a' % self.delta_score())
    logger.info('inconsistent_pairs: %s' % (self.inconsistent, ))
def run_lca_alg1(G, expected_clustering, msg, stop_at_two=False, trace_on=False):
    """Run lca_alg1 and log whether it produced the expected clustering.

    The expected score is computed from ``expected_clustering``.  Both the
    clustering and the score returned by ``lca_alg1`` are compared against
    the expectation; on any mismatch the resulting structures are printed.
    """
    expected_score = ct.clustering_score(
        G, ct.build_node_to_cluster_mapping(expected_clustering)
    )
    clustering, score = lca_alg1(G, stop_at_two=stop_at_two, trace_on=trace_on)

    clusters_match = ct.same_clustering(clustering, expected_clustering)
    if clusters_match:
        logger.info('%s success' % (msg, ))
    else:
        logger.info('%s FAILED' % (msg, ))

    scores_match = score == expected_score
    if not scores_match:
        logger.info('score %d, expected_score %d. FAILED' % (score, expected_score))

    if clusters_match and scores_match:
        return

    # Something went wrong: dump what the algorithm actually produced.
    logger.info('current structures with failure:')
    actual_node2cid = ct.build_node_to_cluster_mapping(clustering)
    ct.print_structures(G, clustering, actual_node2cid, score)
def lca_alg1(curr_G, stop_at_two=False, trace_on=False):
    """Greedily cluster curr_G by adding its positive edges one at a time.

    Every node starts in its own cluster and the working graph ``G_prime``
    starts with only the negative edges.  Each positive edge (in the order
    returned by ``ct.get_weight_lists(..., sort_positive=True)``) is then
    considered in turn; the clusters of its endpoints are merged, left
    alone, or have nodes shifted between them, whichever scores best.  The
    edge is added to ``G_prime`` afterwards.

    Args:
        curr_G: Graph to cluster; edges carry a weight.
        stop_at_two: When true, merges are only allowed while more than
            two clusters remain.
        trace_on: When true, log the decisions and print the structures
            after every step.

    Returns:
        A pair ``(clustering, score)``.  An empty graph gives ``({}, 0)``
        and a one-node graph gives a single cluster with score 0.
    """
    # Trivial graphs need no work.
    if len(curr_G) == 0:
        return {}, 0
    elif len(curr_G) == 1:
        clustering = {0: set(curr_G.nodes())}
        return clustering, 0

    neg_edges, pos_edges = ct.get_weight_lists(curr_G, sort_positive=True)

    # Initial state: one singleton cluster per node (nodes in sorted order).
    clustering = {c: {n} for c, n in enumerate(sorted(curr_G.nodes()))}
    node2cid = ct.build_node_to_cluster_mapping(clustering)

    # Working graph: all nodes, but only the negative edges to begin with.
    G_prime = nx.Graph()
    G_prime.add_nodes_from(curr_G)
    G_prime.add_weighted_edges_from(neg_edges)
    score = ct.clustering_score(G_prime, node2cid)

    if trace_on:
        logger.info('====================')
        logger.info('==== lca_alg1 ====')
        logger.info('====================')
        ct.print_structures(G_prime, clustering, node2cid, score)

    for e in pos_edges:
        if trace_on:
            logger.info('=======================')
            logger.info('Start of next iteration')
            logger.info('=======================')

        # Order the endpoints so that n0 <= n1.
        if e[0] < e[1]:
            n0, n1 = e[0], e[1]
        else:
            n1, n0 = e[0], e[1]
        wgt = e[2]
        n0_cid, n1_cid = node2cid[n0], node2cid[n1]
        if trace_on:
            logger.info('n0=%s, n1=%s, wgt=%a, n0_cid=%a, n1_cid=%a' % (n0, n1, wgt, n0_cid, n1_cid))

        # With stop_at_two set, never merge below two clusters.
        is_merge_allowed = not stop_at_two or len(clustering) > 2
        if trace_on:
            logger.info('is_merge_allowed %s' % (is_merge_allowed, ))

        if n0_cid == n1_cid:
            # Endpoints already share a cluster: the edge only adds score.
            if trace_on:
                logger.info('In the same cluster')
            score += wgt
        elif is_merge_allowed and not ct.has_edges_between_them(
                G_prime, clustering[n0_cid], clustering[n1_cid]):
            # No edges join the two clusters in G_prime, so merge them.
            if trace_on:
                logger.info('Merging disjoint clusters')
            sc_delta = ct.merge_clusters(n0_cid, n1_cid, G_prime, clustering, node2cid)
            assert sc_delta == 0
            # NOTE: the original author questioned why sc_delta could ever
            # be non-zero here; the assert above enforces that it is not.
            score += sc_delta + wgt  # why might sc_delta be non-zero here???
        else:
            # Compare four options: merge, leave unmerged, or shift nodes
            # in either direction between the two clusters.
            sc_merged = (ct.score_delta_after_merge(n0_cid, n1_cid, G_prime, clustering) + wgt)
            if trace_on:
                logger.info('sc_merged=%a' % sc_merged)
            sc_unmerged = -wgt
            if trace_on:
                logger.info('sc_unmerged=%a' % sc_unmerged)

            if len(clustering[n0_cid]) == 1 or len(clustering[n1_cid]) == 1:
                # Shifting is not evaluated when either cluster is a
                # singleton; give both shift options a score that cannot
                # win against merged/unmerged.
                sc_n0_to_n1 = sc_n1_to_n0 = min(sc_merged, sc_unmerged) - 9999
                n0_to_move = n1_to_move = []
                if trace_on:
                    logger.info('not checking moving nodes because '
                                'at least one cluster is length 1')
            else:
                sc_n0_to_n1, n0_to_move = best_shift(n0, n1, G_prime, clustering, node2cid, trace_on=trace_on)
                sc_n0_to_n1 += wgt
                if trace_on:
                    logger.info('sc_n0_to_n1=%a, n0_to_move=%a' % (sc_n0_to_n1, n0_to_move))
                sc_n1_to_n0, n1_to_move = best_shift(n1, n0, G_prime, clustering, node2cid, trace_on=trace_on)
                sc_n1_to_n0 += wgt
                if trace_on:
                    logger.info('sc_n1_to_n0=%a, n1_to_move=%a' % (sc_n1_to_n0, n1_to_move))

            # Ties are resolved in this order: merge, unmerged, shift
            # n0 -> n1, shift n1 -> n0.
            if is_merge_allowed and sc_merged >= max(sc_unmerged, sc_n0_to_n1, sc_n1_to_n0):
                ct.merge_clusters(n0_cid, n1_cid, G_prime, clustering, node2cid)
                score += sc_merged
                if trace_on:
                    logger.info('Choose merge')
            elif sc_unmerged >= max(sc_n0_to_n1, sc_n1_to_n0):
                score += sc_unmerged
                if trace_on:
                    logger.info('Choose unmerged - unchanged')
            elif sc_n0_to_n1 >= sc_n1_to_n0:
                ct.shift_between_clusters(n0_cid, n0_to_move, n1_cid, clustering, node2cid)
                score += sc_n0_to_n1
                if trace_on:
                    logger.info('Choose to shift from cluster %a to %a' % (n0_cid, n1_cid))
            else:
                ct.shift_between_clusters(n1_cid, n1_to_move, n0_cid, clustering, node2cid)
                score += sc_n1_to_n0
                if trace_on:
                    logger.info('Choose to shift from cluster %a to %a' % (n1_cid, n0_cid))

        # The edge becomes part of the working graph only after the
        # decision for it has been made.
        G_prime.add_weighted_edges_from([e])
        if trace_on:
            ct.print_structures(G_prime, clustering, node2cid, score)

    return clustering, score
def lca_alg1_constrained(curr_G, in_same=None, in_different=None, trace_on=False):
    """Cluster curr_G with algorithm 1 subject to pairwise constraints.

    All pairs of nodes from in_same must end up in the same cluster and
    all pairs of nodes from in_different must end up in different
    clusters.

    This does not check that the constraints from in_same and
    in_different can all be satisfied.  In implementation the in_same
    constraints take precedence, but in use, one of the two lists will be
    empty.

    Args:
        curr_G: Graph to cluster; edges carry a 'weight' attribute.  Every
            pair in in_same must be an edge of curr_G.
        in_same: Pairs of nodes that must share a cluster.  The order of
            the two nodes inside a pair does not matter.  Defaults to no
            constraints.
        in_different: Pairs of nodes that must be kept apart.  Defaults to
            no constraints.
        trace_on: When true, log the decisions and print the structures
            after every step.

    Returns:
        A pair ``(clustering, score)``.
    """
    # None defaults avoid sharing one mutable list between calls.
    in_same = [] if in_same is None else in_same
    in_different = [] if in_different is None else in_different

    # Normalise the in_same pairs to the same (smaller, larger) order that
    # the loop below uses for edge endpoints.  Otherwise a pair supplied
    # as (n1, n0) is not recognised, and its weight is added to the score a
    # second time even though the initial score already includes it.
    same_pairs = {
        (p[0], p[1]) if p[0] < p[1] else (p[1], p[0]) for p in in_same
    }

    clustering = build_initial_from_constraints(curr_G, in_same)
    node2cid = ct.build_node_to_cluster_mapping(clustering)

    neg_edges, pos_edges = ct.get_weight_lists(curr_G, sort_positive=True)

    # Working graph: all nodes, the negative edges, and the edges that the
    # in_same constraints force into a cluster from the start.
    G_prime = nx.Graph()
    G_prime.add_nodes_from(curr_G)
    G_prime.add_weighted_edges_from(neg_edges)
    edges = [(p[0], p[1], curr_G[p[0]][p[1]]['weight']) for p in in_same]
    G_prime.add_weighted_edges_from(edges)
    score = ct.clustering_score(G_prime, node2cid)

    if trace_on:
        logger.info('=================================')
        logger.info('===== lca_alg1_constrained ====')
        logger.info('=================================')
        ct.print_structures(G_prime, clustering, node2cid, score)

    for e in pos_edges:
        if trace_on:
            logger.info('=======================')
            logger.info('Start of next iteration')
            logger.info('=======================')

        # Order the endpoints so that n0 <= n1.
        if e[0] < e[1]:
            n0, n1 = e[0], e[1]
        else:
            n1, n0 = e[0], e[1]

        if (n0, n1) in same_pairs:
            # Already in G_prime and already counted in the score.
            if trace_on:
                logger.info('Skipping (%a, %a) because already in graph' % (n0, n1))
            continue

        wgt = e[2]
        n0_cid, n1_cid = node2cid[n0], node2cid[n1]
        if trace_on:
            logger.info('n0=%s, n1=%s, wgt=%a, n0_cid=%a, n1_cid=%a'
                        % (n0, n1, wgt, n0_cid, n1_cid))

        if n0_cid == n1_cid:
            if trace_on:
                logger.info('Already in the same cluster')
            score += wgt
        elif keep_separate(clustering[n0_cid], clustering[n1_cid], in_different):
            # An in_different constraint forbids merging these clusters,
            # so the positive edge is cut and costs its weight.
            if trace_on:
                logger.info('Must be kept separate')
            score -= wgt
        elif not ct.has_edges_between_them(G_prime, clustering[n0_cid],
                                           clustering[n1_cid]):
            if trace_on:
                logger.info('Merging disjoint clusters')
            sc_delta = ct.merge_clusters(n0_cid, n1_cid, G_prime, clustering,
                                         node2cid)
            assert sc_delta == 0
            score += sc_delta + wgt
        else:
            # Edges already join the clusters: merge only if that scores
            # strictly better than leaving them apart.
            sc_merged = (ct.score_delta_after_merge(n0_cid, n1_cid, G_prime,
                                                    clustering) + wgt)
            if trace_on:
                logger.info('sc_merged=%a' % sc_merged)
            sc_unmerged = -wgt
            if trace_on:
                logger.info('sc_unmerged=%a' % sc_unmerged)

            if sc_merged > sc_unmerged:
                ct.merge_clusters(n0_cid, n1_cid, G_prime, clustering, node2cid)
                score += sc_merged
                if trace_on:
                    logger.info('Merging clusters with edges between')
            else:
                score += sc_unmerged
                if trace_on:
                    logger.info('No merge of clusters with edges between ')

        # The edge joins the working graph only after its decision is made.
        G_prime.add_weighted_edges_from([e])
        if trace_on:
            ct.print_structures(G_prime, clustering, node2cid, score)

    return clustering, score
def lca_alg2(G, clustering, node2cid, trace_on=False):
    """Find the best alternative to the given clustering of G.

    G must have at least two nodes (asserted).  A two-node graph is
    delegated to ``best_alternative_len2``.

    Otherwise a first estimate is formed: if the current clustering is a
    single cluster, algorithm 1 is rerun with ``stop_at_two=True``; if it
    has several clusters, the estimate is the single cluster holding all
    nodes.  Then, for every inconsistent edge of the current clustering
    (largest absolute weight first), ``lca_alg1_constrained`` is run with
    that edge forced into different clusters (negative weight) or into the
    same cluster (otherwise).  Any result scoring strictly higher than the
    best so far replaces it.

    Args:
        G: Graph being clustered.
        clustering: Current clustering, cluster id -> set of nodes.
        node2cid: Node -> cluster id mapping matching ``clustering``.
        trace_on: When true, log the intermediate decisions.

    Returns:
        A pair ``(best_clustering, best_score)`` for the alternative.
    """
    assert len(G) >= 2

    if len(G) == 2:
        return best_alternative_len2(G, clustering, node2cid)

    # Form the first estimate of the best alternative.
    if len(clustering) == 1:
        best_clustering, best_score = a1.lca_alg1(G, stop_at_two=True)
        best_node2cid = ct.build_node_to_cluster_mapping(best_clustering)
    else:
        best_clustering = {0: set(G.nodes())}
        best_node2cid = {n: 0 for n in G.nodes()}
        best_score = ct.clustering_score(G, best_node2cid)

    if trace_on:
        logger.info('In lca_alg2, before checking inconsistent\n'
                    'best_clustering %a, best_score %d, checking %d'
                    % (best_clustering, best_score,
                       ct.clustering_score(G, best_node2cid)))

    # Try the inconsistent edges, strongest (by absolute weight) first.
    inconsistent = inconsistent_edges(G, clustering, node2cid)
    inconsistent.sort(key=lambda e: abs(e[2]), reverse=True)
    if trace_on:
        logger.info('In lca_alg2: clustering %s' % (clustering, ))
        logger.info('In lca_alg2: inconsistent edges %s' % (inconsistent, ))
        logger.info('Starting inconsistent edge loop')

    for e in inconsistent:
        if trace_on:
            logger.info('e = %s' % (e, ))

        if e[2] < 0:
            if trace_on:
                logger.info('Forcing edge into different clusters')
            new_clustering, new_score = lca_alg1_constrained(
                G, in_same=[], in_different=[(e[0], e[1])])
        else:
            if trace_on:
                logger.info('Forcing edge into same cluster')
            new_clustering, new_score = lca_alg1_constrained(
                G, in_same=[(e[0], e[1])], in_different=[])

        if trace_on:
            logger.info('Best score returned by lca_alg1_constrained is %s'
                        % (new_score, ))
            # The value has to be formatted into the message: an extra
            # positional argument with no placeholder makes logging report
            # a formatting error instead of emitting the record.
            logger.info('Checking %s' % (
                ct.clustering_score(
                    G, ct.build_node_to_cluster_mapping(new_clustering)),
            ))

        if new_score > best_score:
            if trace_on:
                logger.info('New best')
            best_score = new_score
            best_clustering = new_clustering

    return best_clustering, best_score
def test_merge():
    """Log the results of the merge helpers on the example graph.

    Two passes are made over a freshly built clustering.  Each pass logs
    the value of ``ct.score_delta_after_merge`` for clusters 2 and 3, then
    merges clusters 0 and 2 with ``ct.merge_clusters`` and logs the
    reported delta beside the actual change in ``ct.clustering_score``.
    The second pass gives the cluster ids in the opposite order.
    """
    logger.info('===========================')
    logger.info('test_merge')
    G = ex_graph_fig1()
    cids = list(ct.cids_from_range(4))
    logger.info(cids)

    # (cluster indices to probe, cluster indices to merge, probe message)
    passes = (
        ((2, 3), (0, 2),
         'possible merge of 2, 3; delta should be -4, and is %s'),
        ((3, 2), (2, 0),
         'possible merge of 3, 2; delta should be -4, and is %s'),
    )

    for pass_no, (probe, merge, probe_msg) in enumerate(passes):
        if pass_no > 0:
            logger.info('--------')
            logger.info('Retesting merge with order of clusters reversed')

        # Fresh mapping for every pass: merge_clusters is given this
        # mapping and the clustering, so neither is reused.
        n2c_optimal = {
            'a': cids[0],
            'b': cids[0],
            'd': cids[0],
            'e': cids[0],
            'c': cids[1],
            'h': cids[2],
            'i': cids[2],
            'f': cids[3],
            'g': cids[3],
            'j': cids[3],
            'k': cids[3],
        }
        clustering = ct.build_clustering(n2c_optimal)

        logger.info('-------------')
        logger.info('score_delta_after_merge')
        delta = ct.score_delta_after_merge(
            cids[probe[0]], cids[probe[1]], G, clustering
        )
        logger.info(probe_msg % (delta, ))

        logger.info('-------------')
        logger.info('merge_clusters')
        score_before = ct.clustering_score(G, n2c_optimal)
        delta = ct.merge_clusters(
            cids[merge[0]], cids[merge[1]], G, clustering, n2c_optimal
        )
        score_after = ct.clustering_score(G, n2c_optimal)
        logger.info('delta = %s should be %s' % (
            delta,
            score_after - score_before,
        ))

        logger.info('---')
        for c in clustering:
            logger.info('%s: %s' % (c, clustering[c]))
        logger.info('---')
        for n in G.nodes:
            logger.info('%s: %s' % (n, n2c_optimal[n]))