def __init__(self, size):
    """Set up the state for a ``size`` x ``size`` grid.

    Creates a zero-filled NumPy grid, a union-find structure built for
    ``size * size`` entries, and the bookkeeping fields: the open-site
    counter, the grid size and the percolation flag.
    """
    # A NumPy array is used here; it took the place of a nested-list grid.
    blank_grid = numpy.zeros((size, size))
    site_index = union_find.UnionFind(size * size)

    self.grid = blank_grid
    self.open_sites = 0.0
    self.index = site_index
    self.size = size
    self.percolate = False
def persistence(im):
    """Compute persistence groups of the 2-D image ``im``.

    Pixels are visited from the highest value to the lowest.  Each pixel
    joins the neighbouring component with the highest birth value; every
    other neighbouring component is merged into it, and a record is
    written for a merged component that does not have one yet.

    Relies on the helpers ``get(im, p)`` and ``iter_neighbors(p, w, h)``
    defined elsewhere in this file.

    Returns:
        A list of tuples ``(root, birth, persistence, merge_pixel)`` sorted
        by ``persistence`` in descending order.  The component of the very
        first (highest) pixel is recorded as ``(p, v, v, None)``.

    Side effects:
        Prints debug output (neighbour roots per pixel and the final
        result) to stdout.
    """
    h, w = im.shape

    # Pixel coordinates ordered by value, from high to low.
    indices = [(i, j) for i in range(h) for j in range(w)]
    indices.sort(key=lambda p: get(im, p), reverse=True)

    # Maintains the growing sets.
    uf = union_find.UnionFind()

    groups0 = {}

    def get_comp_birth(p):
        # Birth value of a component = image value at its representative.
        return get(im, uf[p])

    # Process pixels from high to low.
    for i, p in enumerate(indices):
        v = get(im, p)

        # Collect the representatives of already-processed neighbours.
        # print(...) with a single argument prints the same text under
        # Python 2 and Python 3.
        ni = []
        print("------")
        for q in iter_neighbors(p, w, h):
            if q in uf:
                print(q)
                ni.append(uf[q])
        print("------")

        # Distinct neighbouring components, highest birth value first.
        nc = sorted([(get_comp_birth(q), q) for q in set(ni)], reverse=True)

        if i == 0:
            groups0[p] = (v, v, None)

        # NOTE(review): the weight -i presumably lets earlier (higher)
        # pixels stay representatives; confirm against UnionFind.add.
        uf.add(p, -i)

        if nc:
            oldp = nc[0][1]
            uf.union(oldp, p)

            # Merge all other components with oldp.  A record is written
            # only for components that have none yet, but the union is
            # performed for every one of them.
            for bl, q in nc[1:]:
                if uf[q] not in groups0:
                    groups0[uf[q]] = (bl, bl - v, p)
                uf.union(oldp, q)

    groups1 = [(k, rec[0], rec[1], rec[2]) for k, rec in groups0.items()]
    groups1.sort(key=lambda g: g[2], reverse=True)
    print(groups1)
    return groups1
def kruskalMST(self):
    """Fill ``self.mst`` with tree edges chosen by Kruskal's algorithm.

    ``self.graph`` is replaced by a copy sorted on the third item of each
    edge (index 2) and printed.  Edges are then taken in that order; an
    edge is appended to ``self.mst`` unless the union-find reports that
    its endpoints could not be united.  The loop stops once
    ``self.nodes - 1`` edges have been accepted.
    """
    self.graph = sorted(self.graph, key=lambda edge: edge[2])
    print("sorted graph", self.graph)

    forest = union_find.UnionFind()
    accepted = 0
    for edge in self.graph:
        # An MST has at most nodes - 1 edges, so stop once that many are in.
        if accepted == self.nodes - 1:
            break

        # Register each endpoint with the union-find the first time it is seen.
        for endpoint in (edge[0], edge[1]):
            if endpoint not in forest.sets:
                forest.makeSet(endpoint)

        merged = forest.union(forest.sets[edge[0]], forest.sets[edge[1]])
        # The explicit comparison with False is deliberate: union()'s full
        # return contract is not visible here, so it is not replaced by `not`.
        if False == merged:
            # The union failed because this edge would form a cycle.
            continue

        self.mst.append(edge)
        accepted += 1
def kruskal(self, graph):
    """
    Compute a minimum spanning tree of ``graph`` with Kruskal's algorithm.

    Given a connected graph G = (V, E) with positive edge weights, the
    result is a graph on the same nodes whose edges are a subset E' of E
    of minimum total weight that keeps (V, E') connected.

    Greedy strategy: repeatedly take the lightest remaining edge and add
    it unless its endpoints are already in the same set (which would
    produce a cycle).

    Note: The graph does not have to be undirected.
    """
    spanning_tree = Graph()
    disjoint_sets = union_find.UnionFind()
    wrappers = {}
    edge_queue = heap.BinHeap(heap.HeapMode.min)

    # Every node goes into the result graph and starts in its own set.
    for vertex in graph.nodes():
        spanning_tree.add_node(vertex)
        wrapped = union_find.Node(vertex)
        wrappers[vertex] = wrapped
        disjoint_sets.make_set(wrapped)

    # Queue every edge, keyed by its weight, as a pair of wrapper nodes.
    for source, target in graph.edges():
        endpoints = (wrappers[source], wrappers[target])
        edge_queue.insert(
            heap.HeapItem(graph.weight((source, target)), endpoints))

    # Drain the queue from lightest to heaviest edge.
    while edge_queue.size > 0:
        lightest = edge_queue.extract()
        first, second = lightest.datum
        if disjoint_sets.find(first) != disjoint_sets.find(second):
            spanning_tree.add_undirected_edge(
                first.value, second.value, lightest.priority)
            disjoint_sets.union(first, second)

    return spanning_tree
def C3W2_2():
    """Print the largest k for which a k-clustering has spacing at least 3.

    Input nodes are 24-bit labels and the edge cost is the Hamming
    distance.  Nodes are read from ``data/clustering_big.txt``; the first
    line of the file is skipped and every other non-blank line is a
    whitespace-separated bit string.  All nodes at Hamming distance 1 or 2
    are unioned, and the number of unions left is printed.
    """
    import os
    from itertools import combinations

    n_bits = 24

    # os.path.join yields the same 'data\\clustering_big.txt' on Windows and
    # a path that also resolves on other platforms.
    path = os.path.join('data', 'clustering_big.txt')
    with open(path) as file:
        next(file, None)  # skip the header line
        # A set merges duplicate labels, i.e. nodes at distance 0 (only
        # distinct nodes matter here).  Blank lines are ignored.
        nodes = {
            int(''.join(line.split()), 2)  # bit string -> integer
            for line in file
            if line.strip()
        }

    # XOR masks that flip exactly one bit (distance 1) or two bits (distance 2).
    one_bit_masks = [1 << bit for bit in range(n_bits)]
    two_bit_masks = [
        (1 << low) | (1 << high)
        for low, high in combinations(range(n_bits), 2)
    ]
    masks = one_bit_masks + two_bit_masks

    # clustering
    union = UF.UnionFind(nodes)
    for node in nodes:
        for mask in masks:
            neighbor = node ^ mask
            if neighbor in nodes and not union.inSameUnion(node, neighbor):
                union.union(node, neighbor)

    # After all distance-1 and distance-2 nodes are unioned, the current
    # number of unions is the largest K with spacing at least 3: any
    # further union would join nodes at distance 3 and decrease K.
    print(f'current largest K with spacing at least 3 is {union.n_of_union}')
def groupTPL(TPL, distance=1):
    """Group points that are chained together by closeness.

    Two points are linked when the larger of their absolute differences
    in coordinate 0 and coordinate 1 (Chebyshev distance) is at most
    ``distance``.  Linked points, directly or through other points, end up
    in the same group.

    Args:
        TPL: sequence of hashable points indexable at 0 and 1.
        distance: maximum per-coordinate difference for a link.

    Returns:
        A list of groups, each a list of points.

    Side effects:
        Prints ``Inside groupTPL()`` to stdout.
    """
    from itertools import combinations

    # TODO: rethink how points are clustered; a k-d tree may be an
    # alternative.  The pairwise comparison below is O(n^2).
    # print(...) with one argument prints the same text on Python 2 and 3.
    print('Inside groupTPL()')

    U = union_find.UnionFind()
    # combinations() yields each pair (TPL[i], TPL[j]) with i < j once.
    for x, y in combinations(TPL, 2):
        if max(abs(x[0] - y[0]), abs(x[1] - y[1])) <= distance:
            U.union(x, y)

    # Bucket every point under its set representative.
    disjSets = {}
    for x in TPL:
        disjSets.setdefault(U[x], set()).add(x)
    return [list(members) for members in disjSets.values()]
def dump_unions(bug_list):
    """Group crash-dump paths of duplicate bugs and write them to JSON.

    For every bug id in ``bug_list`` the bug and the bug it duplicates
    (``bug.dupe_of``) are fetched through ``b.get_bug``.  When the
    duplicate target has a crash-dump location, both bugs are recorded and
    linked.  Linked bugs are merged with a union-find, the dump paths of
    each resulting group are collected with ``split_paths``, and groups
    with more than one path are written to ``<cwd>/json/dump_unions.json``.

    Args:
        bug_list: iterable of bug ids.
    """
    path_map = {}
    pair_list = []
    for bug_id in bug_list:
        bug = b.get_bug(bug_id)
        child = b.get_bug(bug.dupe_of)
        # Record the pair only if the duplicate target has a crash dump.
        if child.cf_crashdump_location:
            path_map[bug_id] = bug.cf_crashdump_location
            path_map[bug.dupe_of] = child.cf_crashdump_location
            pair_list.append([bug_id, bug.dupe_of])

    # Only bugs that take part in a pair are kept.
    ex_bugs = list(path_map)
    # Position of each bug id in ex_bugs; replaces repeated O(n) list.index().
    position = {bug_id: idx for idx, bug_id in enumerate(ex_bugs)}

    # Apply union-find over the bug positions.
    uf = union_find.UnionFind(len(ex_bugs))
    for first, second in pair_list:
        uf.unite(position[first], position[second])
    uf.id = [uf.find(i) for i in uf.id]

    # Map each dump location to the group id of its bug.
    ex_dumps = [path_map[bug_id] for bug_id in ex_bugs]
    dump_dict = dict(zip(ex_dumps, uf.id))

    # Collect the dump paths of every group.
    result = []
    for group_id in set(uf.id):
        res = []
        for dump, root in dump_dict.items():
            if root == group_id:
                # Accumulate: plain assignment here would discard the paths
                # of every dump in the group except the last one matched.
                res.extend(split_paths(dump))
        # Drop groups that hold a single path.
        if len(res) > 1:
            result.append(res)

    tgt_path = os.path.join(os.getcwd(), "json", "dump_unions.json")
    with open(tgt_path, "w") as fp:
        json.dump(result, fp, indent=4, sort_keys=True)
def setUp(self):
    """Create a new ``UnionFind`` and store it as ``self.structure``."""
    fresh_structure = union_find.UnionFind()
    self.structure = fresh_structure
def __init__(self, fragments_fn, bcs_to_use, bam_fn):
    ''' Compute a sparse barcode x genome-bin coverage matrix.

    fragments_fn -- either a filename (str/unicode), loaded through
        self.load_fragments_filtered(fragments_fn, bcs_to_use), or a pandas
        DataFrame that is used as-is.  Anything else raises Exception.
    bcs_to_use   -- barcodes to use; its length sets the number of matrix rows.
    bam_fn       -- BAM file opened with tk_bam.create_bam_infile; its
        reference names and lengths define the genome bins.

    Sets self.uf, self.bcs_to_use, self.effective_genome_bins and self.mat.

    NOTE(review): the earlier description said "each row is normalized to 1,
    so that the expected overlap for uncorrelated barcodes is 1".  The code
    below does not normalize rows; it caps every stored entry at 1 and sets
    entries in high-coverage bins to a small eps.  Confirm which is intended.
    '''
    self.uf = union_find.UnionFind()
    bam_in = tk_bam.create_bam_infile(bam_fn)

    # Choose the set of fragments to use
    if type(fragments_fn) is str or type(fragments_fn) is unicode:
        fragments = self.load_fragments_filtered(fragments_fn, bcs_to_use)
    elif type(fragments_fn) is p.DataFrame:
        fragments = fragments_fn
    else:
        raise Exception(
            "unrecognized fragments_fn argument type: %s, must be filename or pandas.DataFrame"
            % str(type(fragments_fn)))

    # Setup genome bins
    genome_length = sum(bam_in.lengths)
    # NOTE(review): `unicode`/`has_key` indicate Python 2, where `/` on ints
    # floors -- confirm before porting.
    bin_size = max(1, genome_length / GENOME_BINS)
    # Number of bins per chromosome (rounded up), kept as floats by np.ceil.
    chrom_bins = np.ceil(
        np.array([float(l) / bin_size for l in bam_in.lengths]))
    total_bins = chrom_bins.sum()
    # Offset of each chromosome's first bin in the concatenated bin axis.
    start_bin = np.concatenate([[0], np.cumsum(chrom_bins)[:-1]])
    chrom_map = {c: idx for (idx, c) in enumerate(bam_in.references)}

    npartitions = len(bcs_to_use)

    # Number the selected barcodes -- the assigned number is their row in the BC-bin matrix.
    # Barcodes are numbered in order of first appearance in `fragments`.
    bcs = fragments.bc.values
    bc_ids = {}
    self.bcs_to_use = []
    c = 0
    for bc in bcs:
        if bc_ids.has_key(bc):
            continue
        else:
            self.bcs_to_use.append(bc)
            bc_ids[bc] = c
            c += 1

    martian.log_info("making sparse matrix")
    # Row 0 holds the barcode (row) index, row 1 the genome-bin (column) index.
    indexes = np.empty((2, len(fragments)), dtype=np.int32)
    data = np.ones((len(fragments), ), dtype=np.float32)
    chroms = fragments.chrom.values
    # Bin offset of each fragment start within its chromosome.
    pos_bin = fragments.start_pos.values / bin_size

    for fidx in range(len(fragments)):
        chrom_id = chrom_map[chroms[fidx]]
        which_bin = start_bin[chrom_id] + pos_bin[fidx]
        which_bc = bc_ids[bcs[fidx]]
        indexes[0, fidx] = which_bc
        indexes[1, fidx] = which_bin

    mat = scipy.sparse.csr_matrix((data, indexes), shape=(npartitions, total_bins), dtype=np.float32)
    # If there are multiple fragments for the same BC in the same bin, the csr_matrix constructor above will sum them up, leading
    # to entries greater than 1. Cap everything at 1.
    mat.data = np.ones(mat.data.shape, dtype=mat.dtype)

    # The bare string below is a disabled, older lil_matrix-based way of
    # building the matrix; it is an expression statement with no effect.
    ''' mat1 = scipy.sparse.lil_matrix((npartitions, total_bins), dtype=np.float32) bc_grps = fragments.groupby(["bc"]) bc_count = 0 # For each barcode, mark the genome bins covered by a fragment for (bc, bc_grp) in bc_grps: # Track the reads per fragment in tested partitions for reporting l = len(bc_grp) bins = np.zeros(l, dtype=np.int32) chroms = bc_grp.chrom.values starts = bc_grp.start_pos.values pos_bin = starts / bin_size for i in range(l): chrom_id = chrom_map[chroms[i]] which_bin = start_bin[chrom_id] + pos_bin[i] bins[i] = which_bin mat1[bc_count, bins] = 1.0 bc_count += 1 if bc_count % 1000 == 0: print bc_count '''

    eps = 0.0001

    # Get the genome bin occupancy
    genome_bin_counts = np.array(
        (mat > np.float32(0)).sum(axis=0)).flatten().astype(
            'float')  # total BC counts per bin
    high_cov_threshold = np.percentile(genome_bin_counts, 99.5)

    # switch off high-coverage bins -- set them to eps (a small nonzero number so we can distinguish them)
    high_cov_bins = np.where(genome_bin_counts > high_cov_threshold)[0]
    # NOTE(review): the masks below index mat.data with `c`, which assumes
    # nonzero() returns entries in the same order as mat.data -- confirm.
    # From here on `c` is the column-index array, not the barcode counter.
    (r, c) = mat.nonzero()
    martian.log_info("removing %d bins" % len(high_cov_bins))
    for hc_bin in high_cov_bins:
        mat.data[c == hc_bin] = eps

    # Recalculate the genome bins distribution.
    # The 2 * eps threshold excludes the entries just set to eps.
    genome_bin_counts = np.array(
        (mat > (2 * eps)).sum(axis=0)).flatten().astype('float')
    # NOTE(review): the message says "99.95th percentile" but the threshold
    # above is computed with np.percentile(..., 99.5).
    martian.log_info(
        "Genome Bin Coverage mean: %f 99.95th percentile: %f" %
        (genome_bin_counts.mean(), high_cov_threshold))

    # Adjust for 'effective genome size' based on the distribution over bins
    # i.e. more skewed distribution -> fewer effective bins
    # (inverse of the sum of squared bin fractions).
    effective_bins_factor = ((genome_bin_counts / genome_bin_counts.sum())**2).sum()
    self.effective_genome_bins = 1.0 / effective_bins_factor
    martian.log_info("Effective Number of Genome Bins = %f" % self.effective_genome_bins)
    self.mat = mat
    martian.log_info("done __init__")
class Node:
    """Minimal element for the union-find demo; it only carries a label."""

    def __init__(self, label):
        self.label = label

    def __str__(self):
        return str(self.label)


def print_sets(nodes):
    """Print each node's set representative and the number of disjoint sets.

    Uses the module-level ``union_find`` instance created below.
    """
    sets = [str(union_find.find(x)) for x in nodes]
    print('set representatives: %s' % (sets))
    # Count distinct representatives.  itertools.groupby only merges
    # adjacent equal items, so on this unsorted list it would count runs
    # rather than sets (e.g. after uniting the first and third node).
    print('number of disjoint sets: %s' % (len(set(sets))))
    print()


# NOTE: from here on the name ``union_find`` refers to this UnionFind
# instance, no longer to the module it was created from.
union_find = union_find.UnionFind()

nodes = [Node(ch) for ch in 'abcdefg']
print('labels: %s' % ([str(i) for i in nodes]))

# Every node starts in its own singleton set.
for node in nodes:
    union_find.make_set(node)
print_sets(nodes)

# Uniting two nodes must give them the same representative.
assert(union_find.find(nodes[0]) != union_find.find(nodes[2]))
union_find.union(nodes[0], nodes[2])
assert(union_find.find(nodes[0]) == union_find.find(nodes[2]))
print_sets(nodes)