def expand_from_node(cur, a, c, lvl, distances, sample_id=None):
    """
    From a seed sample a, get all samples that can be connected to it
    with steps of <= lvl over multiple nodes.

    Parameters
    ----------
    cur: obj
        database cursor
    a: int
        seed sample id
    c: int
        cluster name
    lvl: int
        cluster level; also the maximum distance allowed for a single step
    distances: dict
        distances[a][b] = d
        distances[b][a] = d
    sample_id: int (optional)
        sample to remove from the cluster before expanding

    Returns
    -------
    with_a: list
        all samples connected to a (over multiple nodes), in the order
        they were discovered; a itself is always the first element
    """

    logging.info("Expanding from sample %s.", a)

    t_lvl = 't%i' % (lvl)
    mems = get_all_cluster_members(cur, c, t_lvl)

    # optionally we can remove one
    # NOTE(review): assumes mems is a list, in which case remove() raises
    # ValueError when sample_id is not a member - confirm against the helper
    if sample_id is not None:
        mems.remove(sample_id)

    # with_a keeps discovery order (it is the return value and the work
    # queue at the same time); seen mirrors it for O(1) membership tests
    with_a = [a]
    seen = set(with_a)

    idx = 0
    # every sample that gets appended is later used as a pivot itself, so
    # running off the end of the list means nothing new can be reached
    while idx < len(with_a):
        pivot = with_a[idx]
        dis = get_distances_from_memory(cur, distances, pivot, mems)
        # add all nodes which are connected to the current pivot and
        # which we have not considered yet
        for (sa, di) in dis:
            if di <= lvl and sa not in seen:
                seen.add(sa)
                with_a.append(sa)
        # consider the next one
        idx += 1

    logging.info("Samples connected to sample %s: %s.", a, with_a)

    return with_a
def split_clusters(cur, sample_id, problems, lvl, distances):
    """
    Split cluster.

    For every no-longer-connected pair, expand from both of its samples
    and collect the group of samples each one can still reach.

    Parameters
    ----------
    cur: obj
        database cursor
    sample_id: int
        id of sample to remove
    problems: list of tuples
        [(c, a, b), ...] <- no longer connected pairs in cluster c
    lvl: int
        cluster level the problems were found on
    distances: dict
        distances[a][b] = d
        distances[b][a] = d

    Returns
    -------
    groups: dict
        groups[a] = [list of others]
        groups[b] = [list of others]
        (empty dict when problems is empty)
    """

    groups = {}
    # all samples that are already in one of the groups
    visited = set()

    for (c, a, b) in problems:
        mems = get_all_cluster_members(cur, c, 't' + str(lvl))
        for node in (a, b):
            # if we already expanded from that node or went past it in a
            # previous group, don't go
            if node in groups or node in visited:
                continue

            groups[node] = expand_from_node(cur, node, c, lvl, distances, sample_id)
            visited.update(groups[node])

            # if the combined length of all groups covers the whole cluster
            # (without the removee), we're done
            # NOTE: this only leaves the inner loop; the remaining problems
            # are still looked at
            if sum(len(x) for x in groups.values()) == len(mems) - 1:
                break
            # else there must be another 'broken link'

    return groups
def _has_alternative_path(cur, distances, mems, a, b, lvl):
    """Return True if b can be reached from a through mems using only steps <= lvl."""

    # samples connectable to a (directly or over multiple nodes), in
    # discovery order; seen mirrors the list for fast membership tests
    reachable = [a]
    seen = set(reachable)

    idx = 0
    # when b is connected to a we're done
    while b not in seen:
        # every reachable sample has been used as pivot and b was still
        # not found => there is no path
        if idx == len(reachable):
            return False

        # pivot is the one currently investigated
        pivot = reachable[idx]
        # get all the members of the current cluster except the pivot
        all_mems_but_pivot = [x for x in mems if x != pivot]
        # get all the distances from the pivot to those members
        d = get_distances_from_memory(cur, distances, pivot, all_mems_but_pivot)

        # add all new samples that are connectable to the pivot
        # two conditions: a) the sample is connected to the pivot with d<=t
        #                 b) we don't have this sample yet
        for (sa, di) in d:
            if di <= lvl and sa not in seen:
                seen.add(sa)
                reachable.append(sa)

        idx += 1

    return True


def check_cluster_integrity(cur, sample_id, snad, distances, levels=(0, 2, 5, 10, 25, 50, 100, 250)):
    """
    Check whether the removal of sample_id from any of its clusters
    necessitates the split of the cluster.

    Parameters
    ----------
    cur: obj
        database cursor
    sample_id: int
        id of sample to remove
    snad: list of int
        snip address, one cluster name per level
    distances: dict
        distances[a][b] = d
        distances[b][a] = d
        distances looked up here are remembered in this dict
    levels: sequence of int
        distance thresholds, better not change this
        (0, 2, 5, 10, 25, 50, 100, 250)
        snad and levels are paired by position with zip(), so surplus
        entries in the longer of the two are ignored

    Returns
    -------
    splits: dict
        one entry per checked level:
        splits[level] = None if no split is required on that level, else
        splits[level] = [(c, a, b), ...] <- no longer connected pairs in cluster c
    """

    splits = {}

    for clu, lvl in zip(snad, levels):

        t_lvl = 't%i' % (lvl)

        logging.info("Checking cluster integrity for cluster %s on level %s.", clu, t_lvl)

        # get all other members of the cluster apart from the removee
        mems = get_all_cluster_members(cur, clu, t_lvl)
        mems.remove(sample_id)

        # get distances of the removee to them
        d = get_distances(cur, sample_id, mems)
        connected_mems = []
        for (sa, di) in d:
            # get all samples that are connected to the removee with d <= t
            if di <= lvl:
                connected_mems.append(sa)
                remember_distance(distances, sample_id, sa, di)

        logging.debug("Samples connected via removee: %s", sorted(connected_mems))

        # investigate all pw distances between connected members
        potentially_broken_pairs = []
        for i, a in enumerate(connected_mems):
            # only look at each unordered pair once
            for b in connected_mems[i + 1:]:
                try:
                    pwd = distances[a][b]
                except KeyError:
                    pwd = get_all_pw_dists(cur, [a, b])[0]
                    remember_distance(distances, a, b, pwd)
                # if pw distance between the two samples is bigger than the
                # threshold, the link between the samples might be broken,
                # unless there is another sample (or chain of samples)
                # connecting them
                if pwd > lvl:
                    potentially_broken_pairs.append((a, b))

        # all pairs that were connected through the removee are also
        # directly connected, happy days
        if not potentially_broken_pairs:
            splits[lvl] = None
            continue

        logging.debug("Samples potentially no longer connected via removee: %s",
                      potentially_broken_pairs)

        # check if there is another path to get from a to b with only steps <= t
        # we need to remember everything that is broken for updating later,
        # there might be more than one pair
        broken_pairs = []
        for a, b in potentially_broken_pairs:
            logging.debug("Checking if there is another way to connect %s and %s.", a, b)
            if not _has_alternative_path(cur, distances, mems, a, b, lvl):
                broken_pairs.append((clu, a, b))

        # if we checked all pairs and always found b somehow, cluster is fine
        splits[lvl] = broken_pairs if broken_pairs else None

    return splits