# Example #1
0
def expand_from_node(cur, a, c, lvl, distances, sample_id=None):
    """
    From a seed sample a, get all samples that can be connected to it with
    steps of distance <= lvl, possibly over multiple intermediate nodes.

    The members of cluster c are explored breadth-first: a member joins the
    result as soon as it lies within lvl of a sample already in the result.

    Parameters
    ----------
    cur: obj
        database cursor
    a: int
        seed sample id
    c: int
        cluster name
    lvl: int
        cluster level; used both as the distance threshold and to build the
        level name 't<lvl>' passed to get_all_cluster_members
    distances: dict
        distances[a][b] = d
        distances[b][a] = d
        (handed through to get_distances_from_memory)
    sample_id: int
        (optional) sample to leave out of the cluster before expanding

    Returns
    -------
    with_a: list
        all samples connected to a (over multiple nodes), starting with a
        itself and followed by the others in the order they were discovered
    """

    logging.info("Expanding from sample %s.", a)
    t_lvl = 't%i' % (lvl)
    mems = get_all_cluster_members(cur, c, t_lvl)
    # Optionally leave one sample out. Build a new list rather than calling
    # mems.remove(): this does not mutate the list handed back by the helper
    # and does not raise if sample_id is not a member in the first place.
    if sample_id is not None:
        mems = [m for m in mems if m != sample_id]

    with_a = [a]
    # Mirror of with_a for O(1) membership tests; with_a keeps the order.
    seen = {a}

    # with_a doubles as the work queue: it grows while we walk it, and we are
    # done once every entry has been used as a pivot.
    idx = 0
    while idx < len(with_a):
        pivot = with_a[idx]
        dis = get_distances_from_memory(cur, distances, pivot, mems)
        # add all nodes connected to the current pivot that we have not
        # considered yet
        for (sa, di) in dis:
            if di <= lvl and sa not in seen:
                seen.add(sa)
                with_a.append(sa)
        # consider the next one
        idx += 1

    logging.info("Samples connected to sample %s: %s.", a, with_a)

    return with_a
# Example #2
0
def split_clusters(cur, sample_id, problems, lvl, distances):
    """
    Work out the groups a cluster falls apart into when sample_id is removed.

    For every no-longer-connected pair (a, b), each of the two samples is
    expanded with expand_from_node (with sample_id excluded) unless it already
    belongs to a group found earlier.

    Parameters
    ----------
    cur: obj
        database cursor
    sample_id: int
        id of sample to remove
    problems: list of tuples
        [(c, a, b), ...] <- no longer connected pairs in cluster c
    lvl: int
        cluster level; 't' + str(lvl) is the level name used to look up the
        cluster members
    distances: dict
        distances[a][b] = d
        distances[b][a] = d

    Returns
    -------
    groups: dict
        groups[a] = [list of samples connected to a, as returned by expand_from_node]
        groups[b] = [list of samples connected to b, as returned by expand_from_node]

    """

    groups = {}
    # every sample that is already part of one of the groups
    visited = set()
    for (c, a, b) in problems:
        mems = get_all_cluster_members(cur, c, 't' + str(lvl))
        for node in (a, b):
            # if we already expanded from that node or went past it in a
            # previous group, don't go again
            if node in groups or node in visited:
                continue
            groups[node] = expand_from_node(cur, node, c, lvl, distances,
                                            sample_id)
            # keep the visited set in step with the groups as they are added
            visited.update(groups[node])

        # if the combined length of all groups covers the whole cluster
        # (without the removee, hence the -1), we're done
        if sum(len(x) for x in groups.values()) == len(mems) - 1:
            break
        # else there must be another 'broken link'

    return groups
# Example #3
0
def _still_connected(cur, distances, mems, a, b, lvl):
    """
    Tell whether b can be reached from a through mems using only steps <= lvl.

    Parameters
    ----------
    cur: obj
        database cursor
    distances: dict
        distances[a][b] = d (handed through to get_distances_from_memory)
    mems: list
        samples that may be used as stepping stones
    a: int
        start sample
    b: int
        target sample
    lvl: int
        distance threshold

    Returns
    -------
    bool
        True as soon as b is found among the samples connectable to a,
        False once every connectable sample has been tried without finding b
    """
    # samples connectable to a (directly or over multiple nodes), in
    # discovery order; doubles as the work queue
    reached = [a]
    # same content as reached, for O(1) membership tests
    seen = {a}
    idx = 0
    while idx < len(reached):
        # when b is connected to a we're done
        if b in seen:
            return True
        # pivot is the one currently investigated
        pivot = reached[idx]
        # get all the members of the current cluster except the pivot
        all_mems_but_pivot = [x for x in mems if x != pivot]
        # get all the distances from the pivot to those members
        d = get_distances_from_memory(cur, distances, pivot,
                                      all_mems_but_pivot)
        # keep samples that a) are within lvl of the pivot and
        # b) are not yet known to be connected to a
        for (sa, di) in d:
            if di <= lvl and sa not in seen:
                seen.add(sa)
                reached.append(sa)
        idx += 1
    # every reachable sample has been used as pivot and b never showed up
    return False


def check_cluster_integrity(cur,
                            sample_id,
                            snad,
                            distances,
                            levels=(0, 2, 5, 10, 25, 50, 100, 250)):
    """
    Check whether the removal of sample_id from any of its clusters
    necessitates the split of the cluster.

    For each (cluster, level) pair, the members directly connected to the
    removee are collected. Pairs among them that are not directly connected to
    each other are then checked for an alternative path that avoids the
    removee.

    Parameters
    ----------
    cur: obj
        database cursor
    sample_id: int
        id of sample to remove
    snad: list of int
        snip address; paired positionally with levels
    distances: dict
        distances[a][b] = d
        distances[b][a] = d
        Updated through remember_distance with the distances looked up here.
    levels: sequence of int
        better not change this
        Paired with snad via zip, so surplus entries on either side are
        silently ignored.
        NOTE(review): the default has 8 entries (it includes 2) while this
        docstring used to describe 7 levels [0, 5, 10, 25, 50, 100, 250] -
        confirm that snad and levels line up as intended.

    Returns
    -------
    splits: dict
        One entry per checked level:
        splits[level] = None if no split is required on that level, else
        splits[level] = [(c, a, b), ...] <- no longer connected pairs in cluster c
    """

    splits = {}

    for clu, lvl in zip(snad, levels):

        t_lvl = 't%i' % (lvl)

        logging.info("Checking cluster integrity for cluster %s on level %s.",
                     clu, t_lvl)

        # get all other members of the cluster apart from the removee
        mems = get_all_cluster_members(cur, clu, t_lvl)
        mems.remove(sample_id)

        # get distances of the removee to them
        d = get_distances(cur, sample_id, mems)
        connected_mems = []
        for (sa, di) in d:
            # get all samples that are connected to the removee with d <= t
            if di <= lvl:
                connected_mems.append(sa)
            remember_distance(distances, sample_id, sa, di)

        logging.debug("Samples connected via removee: %s",
                      sorted(connected_mems))

        # investigate all pw distances between connected members
        # (each unordered pair once)
        potentially_broken_pairs = []
        for i, a in enumerate(connected_mems):
            for b in connected_mems[i + 1:]:
                try:
                    pwd = distances[a][b]
                except KeyError:
                    # not in memory yet: look it up and remember it
                    pwd = get_all_pw_dists(cur, [a, b])[0]
                    remember_distance(distances, a, b, pwd)
                # if pw distance between the two samples is bigger than the
                # threshold, the link between the samples might be broken,
                # unless there is another sample (or chain of samples)
                # connecting them
                if pwd > lvl:
                    potentially_broken_pairs.append((a, b))

        # all pairs that were connected through the removee are also directly
        # connected, happy days
        if not potentially_broken_pairs:
            splits[lvl] = None
            continue

        logging.debug(
            "Samples potentially no longer connected via removee: %s",
            potentially_broken_pairs)

        # check if there is another path to get from a to b with only
        # steps <= t; remember every broken pair, there might be more than
        # one and we want to know for updating later
        broken_pairs = []
        for a, b in potentially_broken_pairs:
            logging.debug(
                "Checking if there is another way to connect %s and %s.", a, b)
            if not _still_connected(cur, distances, mems, a, b, lvl):
                broken_pairs.append((clu, a, b))

        # if we checked all pairs and always found b somehow, cluster is fine
        splits[lvl] = broken_pairs if broken_pairs else None

    return splits