Example #1
def binarise(U, technique, param=0.0):
    K = np.shape(U)[1]
    allZerosInd = np.sum(U, axis=1) == 0
    technique = technique.lower()
    if technique in ['union', 'ub']:
        B = U > 0
    elif technique in ['intersection', 'ib']:
        B = U == 1
    elif technique in ['max', 'mvb']:
        B = nu.isequaltoaxis(U, np.max(U, axis=1), axis=1)
    elif technique in ['valuethreshold', 'value', 'vtb']:
        B = U >= param
    elif technique in ['stdthresh', 'std']:
        B = (nu.isequaltoaxis(U, np.max(U, axis=1), axis=1)) & \
            (np.tile(np.std(U, axis=1), [K, 1]).transpose() > param)
    elif technique in ['difference', 'diff', 'dtb']:
        Usorted = np.sort(U, axis=1)
        diff = Usorted[:, -1] - Usorted[:, -2]
        B = (nu.isequaltoaxis(U, np.max(U, axis=1), axis=1)) & \
            (np.tile(diff, [K, 1]).transpose() > param)
    elif technique in ['top', 'tb']:
        B = nu.subtractaxis(U, np.max(U, axis=1), axis=1) <= param
    else:
        raise ValueError('The given technique is invalid.')
    B[allZerosInd] = 0
    return np.array(B, dtype='bool')
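
A minimal usage sketch (the membership matrix U below is hypothetical; it assumes numpy is imported as np and the module's nu helpers are importable alongside binarise):

import numpy as np

# Hypothetical fuzzy membership matrix U: 4 genes x 3 clusters
U = np.array([[0.7, 0.2, 0.1],
              [0.0, 0.0, 0.0],   # a gene assigned to no cluster ends up all-False
              [0.4, 0.4, 0.2],
              [0.1, 0.3, 0.6]])

B_union = binarise(U, 'union')     # True wherever membership > 0
B_max = binarise(U, 'max')         # True only at each gene's maximum membership
B_val = binarise(U, 'value', 0.5)  # True wherever membership >= 0.5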
Example #2
def mseclusters(X, B, donormalise=True, GDM=None):
    Xloc = np.array(X)
    Bloc = np.array(B)

    if ds.maxDepthOfArray(Xloc) == 2:
        Xloc = np.expand_dims(Xloc, axis=0)
    Nx = len(Xloc)  # Number of datasets
    if len(Bloc.shape) == 1:
        Bloc = Bloc.reshape(-1, 1)
    M = Bloc.shape[0]  # Number of genes
    K = Bloc.shape[1]  # Number of clusters

    if GDM is None:
        GDMloc = np.ones([Bloc.shape[0], Nx], dtype=bool)
    else:
        GDMloc = np.array(GDM)

    # I commented out these two lines after adding GDM
    #if any([True if x.shape[0] != M else False for x in Xloc]):
    #    raise ValueError('Unequal number of genes in datasets and partitions')

    mseC = np.zeros([Nx, K], dtype=float)

    Nk = [np.sum(b) for b in Bloc.transpose()]  # Number of genes per cluster
    Nd = [x.shape[1] for x in Xloc]  # Number of dimensions per dataset

    # Normalise if needed
    if donormalise:
        Xloc = [pp.normaliseSampleFeatureMat(x, 4) for x in Xloc]

    # Calculations
    for nx in range(Nx):
        reportedprogress = 0
        for k in range(K):
            # Report progress
            if (k - reportedprogress == 100):
                io.updateparallelprogress(100)
                reportedprogress = k
            # WORK
            if not any(Bloc[:, k]):
                mseC[nx, k] = float('nan')
            else:
                Xlocloc = Xloc[nx][Bloc[GDMloc[:, nx], k], :]
                tmp = nu.subtractaxis(Xlocloc,
                                      np.mean(Xlocloc, axis=0),
                                      axis=0)
                tmp = np.sum(np.power(tmp, 2))
                mseC[nx, k] = tmp / Nd[nx] / Nk[k]
        # Report progress
        if (K > reportedprogress):
            io.updateparallelprogress(K - reportedprogress)

    return np.mean(mseC, axis=0)
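
A usage sketch with hypothetical random data (it assumes the module's ds, pp, nu and io helpers are importable; X may be a single genes-by-samples matrix or a list of such matrices, in which case the returned MSEs are averaged over datasets):

import numpy as np

X = np.random.rand(100, 6)                   # hypothetical dataset: 100 genes x 6 samples
B = binarise(np.random.rand(100, 5), 'max')  # hypothetical partition: 100 genes x 5 clusters

mse_per_cluster = mseclusters(X, B)          # one MSE value per cluster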
Example #3
def mseclustersfuzzy(X, B, donormalise=True, GDM=None):
    Xloc = np.array(X)
    Bloc = np.array(B)

    if ds.maxDepthOfArray(Xloc) == 2:
        Xloc = np.expand_dims(Xloc, axis=0)
    Nx = len(Xloc)  # Number of datasets
    if len(Bloc.shape) == 1:
        Bloc = Bloc.reshape(-1, 1)
    M = Bloc.shape[0]  # Number of genes
    K = Bloc.shape[1]  # Number of clusters

    if GDM is None:
        GDMloc = np.ones([Bloc.shape[0], Nx], dtype=bool)
    else:
        GDMloc = np.array(GDM)

    # I commented out these two lines after adding GDM
    #if any([True if x.shape[0] != M else False for x in Xloc]):
    #    raise ValueError('Unequal number of genes in datasets and partitions')

    mseC = np.zeros([Nx, K], dtype=float)

    Nk = [np.sum(b) for b in Bloc.transpose()]  # Number of genes per cluster
    Nd = [x.shape[1] for x in Xloc]  # Number of dimensions per dataset

    # Normalise if needed
    if donormalise:
        Xloc = [pp.normaliseSampleFeatureMat(x, 4) for x in Xloc]

    # Calculations
    for nx in range(Nx):
        for k in range(K):
            if Nk[k] == 0:
                mseC[nx, k] = float('nan')
            else:
                Cmeanloc = np.sum(nu.multiplyaxis(
                    Xloc[nx], Bloc[GDMloc[:, nx], k],
                    axis=1), axis=0) / Nk[k]  # Weighted mean of the cluster (a D-vector)
                tmp = nu.subtractaxis(Xloc[nx], Cmeanloc, axis=0)  # Errors
                tmp = nu.multiplyaxis(tmp, Bloc[GDMloc[:, nx], k],
                                      axis=1)  # Weighted errors
                tmp = np.sum(np.power(tmp, 2))  # Squared weighted errors
                mseC[nx, k] = tmp / Nd[nx] / Nk[k]  # Weighted MSE

    return np.mean(mseC, axis=0)
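
The same sketch with fuzzy memberships instead of a boolean partition (again hypothetical data, and the same helper-module assumptions as above):

import numpy as np

X = np.random.rand(100, 6)             # hypothetical dataset: 100 genes x 6 samples
U = np.random.rand(100, 4)
U = U / U.sum(axis=1, keepdims=True)   # rows sum to 1: fuzzy membership over 4 clusters

mse_per_cluster = mseclustersfuzzy(X, U)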
Example #4
def normaliseSampleFeatureMat(X, type):
    """
    X = normalizeSampleFeatureMat(X, type)

    type: 0 (none), 1 (divide by mean), 2 (divide by the first),
        3 (take log2), 31 (take log2 after setting all values < 1.0 to 1.0, i.e. guarantee positive log),
        4 (subtract the mean and divide by the std),
        5 (divide by the sum), 6 (subtract the mean),
        7 (divide by the max), 8 (2 to the power X), 9 (subtract the min),
        10 (rank: 1 for lowest, then 2, 3, ...; average on ties),
        11 (rank, like 10 but order arbitrarly on ties),
        12 (normalise to the [0 1] range),
        13 (Genes with low values everywhere are set to zeros; bimodel distribution is fit to maxima of rows)

        101 (quantile), 102 (subtract columns (samples) means),
        103 (subtract global mean)

        1000 (Automatically detect normalisation)

    If (type) was a vector like [3 1], this means to apply normalisation
    type (3) over (X) then to apply type (1) over the result. And so on.

    :param X:
    :param type:
    :return:
    """
    Xout = np.array(X)
    codes = np.array(type)  # stays as the input types unless auto-normalisation (type 1000) changes it

    if isinstance(type, (list, tuple, np.ndarray)):
        # Recurse over the vector of types; if an element is (1000), it is
        # replaced by the actual codes that the auto-detection expands it to
        j = 0
        for i in range(len(type)):
            Xout, codesi = normaliseSampleFeatureMat(Xout, type[i])
            if isinstance(codesi, (list, tuple, np.ndarray)) and np.ndim(codesi) > 0:
                codes[j] = codesi[0]
                codes = np.insert(codes, j + 1, codesi[1:])
                j = j + len(codesi)
            else:
                j = j + 1
        return Xout, codes

    if type == 1:
        # 1: Divide by the mean
        Xout = nu.divideaxis(Xout, np.mean(Xout, axis=1), 1)

    if type == 2:
        # 2: Divide by the first value
        Xout = nu.divideaxis(Xout, Xout[:, 0], 1)

    if type == 3:
        # 3: Take log2
        Xout[Xout <= 0] = float('nan')
        Xout = np.log2(Xout)
        ind1 = np.any(np.isnan(Xout), axis=1)
        Xout[ind1] = fixnans(Xout[ind1])

    if type == 31:
        # 31: Set all values < 1 to 1 then take log (guarantee a positive log)
        Xout[Xout <= 1] = 1
        Xout = np.log2(Xout)

    if type == 4:
        # 4: Subtract the mean and divide by the std
        Xout = nu.subtractaxis(Xout, np.mean(Xout, axis=1), axis=1)
        ConstGenesIndices = np.std(Xout, axis=1) == 0
        Xout = nu.divideaxis(Xout, np.std(Xout, axis=1), axis=1)
        Xout[ConstGenesIndices] = 0

    if type == 5:
        # 5: Divide by the sum
        Xout = nu.divideaxis(Xout, np.sum(Xout, axis=1), axis=1)

    if type == 6:
        # 6: Subtract the mean
        Xout = nu.subtractaxis(Xout, np.mean(Xout, axis=1), axis=1)

    if type == 7:
        # 7: Divide by the maximum
        Xout = nu.divideaxis(Xout, np.max(Xout, axis=1), axis=1)

    if type == 8:
        # 8: (2 to the power X)
        Xout = np.power(2, Xout)

    if type == 9:
        # 9: Subtract the min
        Xout = nu.subtractaxis(Xout, np.min(Xout, axis=1), axis=1)

    if type == 10:
        # 10: Rank: 0 for lowest, then 1, 2, ...; average on ties
        Xout = spmstats.rankdata(Xout, axis=0) - 1

    if type == 11:
        # 11: Rank: 0 for lowest, then 1, 2, ...; arbitrary order on ties
        Xout = np.argsort(np.argsort(Xout, axis=0), axis=0)

    if type == 12:
        # 12: Normalise to the [0 1] range
        Xout = nu.subtractaxis(Xout, np.min(Xout, axis=1), axis=1)
        Xout = nu.divideaxis(Xout, np.max(Xout, axis=1), axis=1)

    if type == 13:
        # 13: Genes with low values everywhere are set to zeros; a bimodal distribution is fit to the row maxima
        Xout = filterBimodal(X)

    # 100s
    if type == 101:
        # 101: quantile
        av = np.mean(np.sort(Xout, axis=0), axis=1)
        II = np.argsort(np.argsort(Xout, axis=0), axis=0)
        Xout = av[II]

    if type == 102:
        # 102: subtract the mean of each sample (column) from it
        Xout = nu.subtractaxis(Xout, np.mean(Xout, axis=0), axis=0)

    if type == 103:
        # 103: subtract the global mean of the data
        Xout -= np.mean(Xout)

    if type == 1000:
        # 1000: automatically detect normalisation
        codes = autoNormalise(Xout)
        Xout = normaliseSampleFeatureMat(Xout, codes)[0]
        codes = np.append([101], codes)

    return Xout, codes
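
A short usage sketch (hypothetical matrix; it assumes the nu helpers and, for type 1000, the autoNormalise helper are importable):

import numpy as np

X = np.random.rand(5, 4) * 100                         # hypothetical 5 genes x 4 samples

Xz, codes = normaliseSampleFeatureMat(X, 4)            # z-score each gene (row)
Xchain, codes = normaliseSampleFeatureMat(X, [31, 4])  # log2 (values < 1 set to 1), then z-score
Xauto, codes = normaliseSampleFeatureMat(X, 1000)      # auto-detect; codes records what was applied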
Example #5
def correcterrors_withinworse(B, X, GDM, falsepositivestrimmed=0.01):
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)

    [Ng, K] = Bloc.shape  # Ng genes and K clusters
    L = Xloc.shape[0]  # L datasets

    # Find clusters' means (Cmeans), absolute shifted cluster genes (SCG),
    # and the empirical CDF functions for them (cdfs)
    Cmeans = np.array([None] * L, dtype=object)
    SCG = np.array([None] * L, dtype=object)
    for l in range(L):
        Cmeans[l] = np.zeros([K,
                              Xloc[l].shape[1]])  # K clusters x D dimensions
        SCG[l] = np.zeros(
            [np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)),
             Xloc[l].shape[1]])  # M* genes x D dimensions ...
        # (M* is the total number of genes assigned to any cluster)

        gi = 0
        for k in range(K):
            Cmeans[l][k] = np.median(Xloc[l][Bloc[GDM[:, l], k], :], axis=0)
            csize = np.sum(Bloc[GDM[:, l], k])
            tmpSCG = nu.subtractaxis(Xloc[l][Bloc[GDM[:, l], k], :],
                                     Cmeans[l][k],
                                     axis=0)
            SCG[l][gi:(gi + csize), :] = np.abs(tmpSCG)
            gi += csize
        SCG[l] = SCG[l][np.any(
            SCG[l], axis=1)]  # Remove all zeros genes (rows of SCG[l])
        SCG[l] = np.sort(SCG[l], axis=0)
        if falsepositivestrimmed > 0:
            trimmed = int(falsepositivestrimmed * SCG[l].shape[0])
            if trimmed > 0:
                SCG[l] = SCG[l][
                    0:-trimmed]  # trim the lowest (trimmed) rows in SCG

    # Helping function
    def iswithinworse(ref, x):
        return x <= np.max(ref)

    # Find who belongs
    belongs = np.ones([Ng, K, L],
                      dtype=bool)  # Ng genes x K clusters x L datasets
    for l in range(L):
        for k in range(K):
            for d in range(Xloc[l].shape[1]):
                tmpX = np.abs(Xloc[l][:, d] - Cmeans[l][k, d])
                belongs[GDM[:, l], k, l] &= iswithinworse(SCG[l][:, d], tmpX)

    # Include in clusters the genes which belong everywhere
    B_out = np.all(belongs, axis=2)

    # For genes included in more than one cluster, keep each in the cluster that is closest in terms of the gene's
    # worst distance over datasets (guarantees that the worst belongingness of a gene to a cluster is optimised)
    f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
    for fi in f:
        ficlusts = np.nonzero(B_out[fi])[0]  # Clusters competing over gene fi
        fidatasets = np.nonzero(GDM[fi])[0]  # Datasets that have gene fi
        localdists = np.zeros(
            [len(ficlusts),
             len(fidatasets)])  # (Clusts competing) x (datasets that have fi)
        for l in range(len(fidatasets)):
            ll = fidatasets[l]  # Actual dataset index
            fi_ll = np.sum(GDM[:fi, ll])  # Index of fi in this Xloc[ll]
            localdists[:, l] = nu.dist_matrices(
                Cmeans[ll][ficlusts], Xloc[ll][fi_ll]).reshape([len(ficlusts)])
        localdists = np.max(localdists, axis=1)  # (Clusts competing) x 1
        ficlosest = np.argmin(localdists)  # Closest cluster
        B_out[fi] = False
        B_out[fi, ficlusts[ficlosest]] = True

    return B_out
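
A usage sketch with hypothetical inputs (B, X and GDM below are illustrative; it assumes the ds and nu helpers from the package are importable):

import numpy as np

X = [np.random.rand(100, 6), np.random.rand(100, 8)]  # two hypothetical datasets over the same 100 genes
B = binarise(np.random.rand(100, 5), 'max')           # initial boolean partition: 100 genes x 5 clusters
GDM = np.ones((100, 2), dtype=bool)                   # every gene is measured in both datasets

B_corrected = correcterrors_withinworse(B, X, GDM, falsepositivestrimmed=0.01)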
Example #6
def optimise_tukey_sqrtSCG(B,
                           X,
                           GDM,
                           clustdists=None,
                           smallestClusterSize=11,
                           tails=1,
                           Q3s=2):
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)

    [Ng, K] = Bloc.shape  # Ng genes and K clusters
    L = Xloc.shape[0]  # L datasets

    # Use the given clustdists as weights; if not provided, make them unity for all clusters
    if clustdists is None:
        clustdistsloc = np.ones(K)
    else:
        clustdistsloc = [c for c in clustdists]

    # Find clusters' means (Cmeans), absolute shifted cluster genes (SCG),
    # and the empirical CDF functions for them (cdfs)
    Cmeans = np.array([None] * L, dtype=object)
    SCG = np.array([None] * L, dtype=object)

    Cgood = mnplotsdistancethreshold(clustdistsloc, method='largestgap')
    for l in range(L):
        Cmeans[l] = np.zeros([K,
                              Xloc[l].shape[1]])  # K clusters x D dimensions
        SCG[l] = np.zeros(
            [np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)),
             Xloc[l].shape[1]])  # M* genes x D dimensions ...
        w = np.zeros(np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)))  # M* genes
        # (M* is the total number of genes assigned to any cluster)

        gi = 0
        for k in range(K):
            Cmeans[l][k] = np.median(Xloc[l][Bloc[GDM[:, l], k], :], axis=0)
            if k in Cgood:
                csize = np.sum(Bloc[GDM[:, l], k])
                tmpSCG = nu.subtractaxis(Xloc[l][Bloc[GDM[:, l], k], :],
                                         Cmeans[l][k],
                                         axis=0)
                SCG[l][gi:(gi + csize), :] = np.abs(tmpSCG)
                gi += csize
        SCG[l] = SCG[l][np.any(
            SCG[l], axis=1)]  # Remove all zeros genes (rows of SCG[l])

        if ds.numel(SCG[l]) > 0:
            if tails == 1:
                Q3 = np.percentile(SCG[l], q=75, axis=0)
                thresh = Q3s * Q3
                SCGouts = SCG[l] > np.array(
                    [thresh for ii in range(0, SCG[l].shape[0])])
                SCG[l][
                    SCGouts] = 0.0  # Set the outlier values to zeros so they do not affect decisions later on
            elif tails == 2:
                Q1 = np.percentile(np.sqrt(SCG[l]), q=25, axis=0)
                Q3 = np.percentile(np.sqrt(SCG[l]), q=75, axis=0)
                IQR = np.subtract(Q3, Q1)
                thresh = np.add(Q3, 1.5 * IQR)
                SCGouts = np.sqrt(SCG[l]) > np.array(
                    [thresh for ii in range(0, SCG[l].shape[0])])
                SCG[l][
                    SCGouts] = 0.0  # Set the outlier values to zeros so they do not affect decisions later on
            else:
                raise ValueError(
                    'Invalid number of tails. It should be either 1 or 2.')
        else:
            SCG[l] = np.zeros((1, SCG[l].shape[1]))

    # Clusters mins and maxes (NEW)
    Cmins = np.array([None] * L, dtype=object)
    Cmaxes = np.array([None] * L, dtype=object)
    for l in range(L):
        Cmins[l] = np.zeros([K, Xloc[l].shape[1]])  # K clusters x D dimensions
        Cmaxes[l] = np.zeros([K,
                              Xloc[l].shape[1]])  # K clusters x D dimensions
        for k in range(K):
            Cmins[l][k] = Cmeans[l][k] - np.max(SCG[l], axis=0)
            Cmaxes[l][k] = Cmeans[l][k] + np.max(SCG[l], axis=0)

    # Resolve overlaps between clusters (NEW)
    for k1 in range(K):
        for k2 in range(K):
            # Compare the pair of clusters only once, and don't compare a cluster with itself. This if statement
            # guarantees that k2 will always be a later cluster than k1.
            if (k1 >= k2):
                continue
            # Value of the smallest overlap between the ranges of clusters k1 and k2, and
            # the dataset (l) and the dimension (d) at which this overlap is found.
            # t_smallestoverlap is the type of the overlap (-1, 0, 1, or 2). Type (-1) means that the entire (min
            # to max) range of one cluster is within the range of the other cluster; this is the worst overlap.
            # Type (0) means that the max of (k1) is within the (min to max) range of (k2), and type (1) is the other
            # way around. Type (2) means there is no overlap at all; this is the best case and needs no resolution
            v_smallestoverlap = 0
            l_smallestoverlap = -1
            d_smallestoverlap = -1
            t_smallestoverlap = -1  # Overlap type, read above
            for l in range(L):
                Nd = len(Cmins[l][k1])  # Dimensions in this dataset
                for d in range(Nd):
                    x1 = Cmaxes[l][k1][d]
                    x2 = Cmaxes[l][k2][d]
                    n1 = Cmins[l][k1][d]
                    n2 = Cmins[l][k2][d]
                    if (x1 > n2 and x1 <= x2):
                        if (n1 < n2):
                            ov = x1 - n2
                            if (t_smallestoverlap == -1
                                    or ov < v_smallestoverlap):
                                t_smallestoverlap = 0
                                v_smallestoverlap = ov
                                l_smallestoverlap = l
                                d_smallestoverlap = d
                    elif (x2 > n1 and x2 <= x1):
                        if (n2 < n1):
                            ov = x2 - n1
                            if (t_smallestoverlap == -1
                                    or ov < v_smallestoverlap):
                                t_smallestoverlap = 1
                                v_smallestoverlap = ov
                                l_smallestoverlap = l
                                d_smallestoverlap = d
                    else:
                        t_smallestoverlap = 2
                        continue  # Absolutely no overlap at this point, so k1 and k2 are distinct, so continue
                if (t_smallestoverlap == 2):
                    continue  # Absolutely no overlap at some point, so k1 and k2 are distinct, so continue

            # Sort out the overlap if exists between k1 and k2
            if (t_smallestoverlap == -1):
                # Here one of the two clusters always swallows the other one. So effectively remove the later one (k2).
                # Cluster removal is by making its minimum larger than its maximum at a single point (at l=0, d=0),
                # so effectively no gene will ever be mapped to it!
                Cmins[0][k2][0] = 1
                Cmaxes[0][k2][0] = 0
            elif (t_smallestoverlap == 0):
                Cmins[l_smallestoverlap][k2][d_smallestoverlap] = \
                    Cmaxes[l_smallestoverlap][k1][d_smallestoverlap] + sys.float_info.epsilon
            elif (t_smallestoverlap == 1):
                Cmaxes[l_smallestoverlap][k2][d_smallestoverlap] = \
                    Cmins[l_smallestoverlap][k1][d_smallestoverlap] - sys.float_info.epsilon

    # Find who belongs (NEW)
    belongs = np.ones([Ng, K, L],
                      dtype=bool)  # Ng genes x K clusters x L datasets
    for l in range(L):
        for k in range(K):
            tmp1 = nu.largerthanaxis(Xloc[l],
                                     Cmins[l][k],
                                     axis=0,
                                     orequal=True)
            tmp2 = nu.lessthanaxis(Xloc[l], Cmaxes[l][k], axis=0, orequal=True)
            belongs[GDM[:, l], k, l] = np.all(np.logical_and(tmp1, tmp2),
                                              axis=1)

    # # Helping function (OLD - to be removed)
    # def iswithinworse(ref, x):
    #     return x <= np.max(ref)
    #
    # # Find who belongs (OLD - to be removed)
    # belongs = np.ones([Ng, K, L], dtype=bool)  # Ng genes x K clusters x L datasets
    # for l in range(L):
    #     for k in range(K):
    #         for d in range(Xloc[l].shape[1]):
    #             tmpX = np.abs(Xloc[l][:, d] - Cmeans[l][k, d])
    #             belongs[GDM[:, l], k, l] &= iswithinworse(SCG[l][:, d], tmpX)

    # Include in clusters the genes which belong everywhere (OLD - to be removed)
    B_out = np.all(belongs, axis=2)

    # Solve genes included in two clusters (OLD - should not be needed now - TO BE REMOVED)
    solution = 2
    if solution == 1:
        # For genes included in more than one cluster, keep each in the cluster that is closest in terms of the gene's
        # worst distance over datasets (guarantees that the worst belongingness of a gene to a cluster is optimised)
        f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
        for fi in f:
            ficlusts = np.nonzero(
                B_out[fi])[0]  # Clusters competing over gene fi
            fidatasets = np.nonzero(GDM[fi])[0]  # Datasets that have gene fi
            localdists = np.zeros([
                len(ficlusts), len(fidatasets)
            ])  # (Clusts competing) x (datasets that have fi)
            for l in range(len(fidatasets)):
                ll = fidatasets[l]  # Actual dataset index
                fi_ll = np.sum(GDM[:fi, ll])  # Index of fi in this Xloc[ll]
                localdists[:, l] = nu.dist_matrices(Cmeans[ll][ficlusts],
                                                    Xloc[ll][fi_ll]).reshape(
                                                        [len(ficlusts)])
            localdists = np.max(localdists, axis=1)  # (Clusts competing) x 1
            ficlosest = np.argmin(localdists)  # Closest cluster
            B_out[fi] = False
            B_out[fi, ficlusts[ficlosest]] = True
    elif solution == 2:
        # Genes included in two clusters, include them in the earlier cluster (smallest k)
        f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
        for fi in f:
            ficlusts = np.nonzero(
                B_out[fi])[0]  # Clusters competing over gene fi
            ficlosest = np.argmin(ficlusts)  # earliest cluster (smallest k)
            B_out[fi] = False
            B_out[fi, ficlusts[ficlosest]] = True

    # Remove clusters smaller than minimum cluster size
    ClusterSizes = np.sum(B_out, axis=0)
    B_out = B_out[:, ClusterSizes >= smallestClusterSize]

    return B_out
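
A usage sketch reusing the hypothetical B, X and GDM from the previous example (it also assumes mnplotsdistancethreshold and the nu/ds helpers are importable; clustdists would normally come from the M-N plots step, so unity weights are assumed here):

B_opt = optimise_tukey_sqrtSCG(B, X, GDM, clustdists=None,
                               smallestClusterSize=5, tails=1, Q3s=2)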
Example #7
def correcterrors_weighted_outliers2(B,
                                     X,
                                     GDM,
                                     clustdists=None,
                                     stds=3,
                                     smallestClusterSize=11):
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)

    [Ng, K] = Bloc.shape  # Ng genes and K clusters
    L = Xloc.shape[0]  # L datasets

    # Normalise clustdists to provide weights. If not provided, make it unity for all
    if clustdists is None:
        clustweights = np.ones(K)
    else:
        clustweights = np.min(clustdists) / clustdists

    # Find clusters' means (Cmeans), absolute shifted cluster genes (SCG),
    # and the empirical CDF functions for them (cdfs)
    Cmeans = np.array([None] * L, dtype=object)
    SCG = np.array([None] * L, dtype=object)
    for l in range(L):
        Cmeans[l] = np.zeros([K,
                              Xloc[l].shape[1]])  # K clusters x D dimensions
        SCG[l] = np.zeros(
            [np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)),
             Xloc[l].shape[1]])  # M* genes x D dimensions ...
        w = np.zeros(np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)))  # M* genes
        # (M* is the total number of genes assigned to any cluster)

        gi = 0
        for k in range(K):
            Cmeans[l][k] = np.median(Xloc[l][Bloc[GDM[:, l], k], :], axis=0)
            csize = np.sum(Bloc[GDM[:, l], k])
            tmpSCG = nu.subtractaxis(Xloc[l][Bloc[GDM[:, l], k], :],
                                     Cmeans[l][k],
                                     axis=0)
            SCG[l][gi:(gi + csize), :] = np.abs(tmpSCG)
            # Added this in this version
            w[gi:(gi + csize)] = clustweights[k]
            gi += csize
        SCG[l] = SCG[l][np.any(
            SCG[l], axis=1)]  # Remove all zeros genes (rows of SCG[l])
        SCG[l] = np.sort(SCG[l], axis=0)
        SCGmeans = np.average(SCG[l], weights=w, axis=0)
        SCGstds = st.weighted_std_axis(SCG[l], weights=w, axis=0)
        SCGouts = nu.divideaxis(nu.subtractaxis(SCG[l], SCGmeans, axis=0),
                                SCGstds,
                                axis=0)  # No. of stds away
        SCGouts = SCGouts > stds  # TRUE for outliers and FALSE for others (bool: M* genes x D dimensions)
        SCG[l][
            SCGouts] = 0.0  # Set the outlier values to zeros so they do not affect decisions later on

    # Helping function
    def iswithinworse(ref, x):
        return x <= np.max(ref)

    # Find who belongs
    belongs = np.ones([Ng, K, L],
                      dtype=bool)  # Ng genes x K clusters x L datasets
    for l in range(L):
        for k in range(K):
            for d in range(Xloc[l].shape[1]):
                tmpX = np.abs(Xloc[l][:, d] - Cmeans[l][k, d])
                belongs[GDM[:, l], k, l] &= iswithinworse(SCG[l][:, d], tmpX)

    # Include in clusters the genes which belong everywhere
    B_out = np.all(belongs, axis=2)

    # Solve genes included in two clusters:
    solution = 2
    if solution == 1:
        # For genes included in more than one cluster, keep each in the cluster that is closest in terms of the gene's
        # worst distance over datasets (guarantees that the worst belongingness of a gene to a cluster is optimised)
        f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
        for fi in f:
            ficlusts = np.nonzero(
                B_out[fi])[0]  # Clusters competing over gene fi
            fidatasets = np.nonzero(GDM[fi])[0]  # Datasets that have gene fi
            localdists = np.zeros([
                len(ficlusts), len(fidatasets)
            ])  # (Clusts competing) x (datasets that have fi)
            for l in range(len(fidatasets)):
                ll = fidatasets[l]  # Actual dataset index
                fi_ll = np.sum(GDM[:fi, ll])  # Index of fi in this Xloc[ll]
                localdists[:, l] = nu.dist_matrices(Cmeans[ll][ficlusts],
                                                    Xloc[ll][fi_ll]).reshape(
                                                        [len(ficlusts)])
            localdists = np.max(localdists, axis=1)  # (Clusts competing) x 1
            ficlosest = np.argmin(localdists)  # Closest cluster
            B_out[fi] = False
            B_out[fi, ficlusts[ficlosest]] = True
    elif solution == 2:
        # Genes included in two clusters, include them in the earlier cluster (smallest k)
        f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
        for fi in f:
            ficlusts = np.nonzero(
                B_out[fi])[0]  # Clusters competing over gene fi
            ficlosest = np.argmin(ficlusts)  # earliest cluster (smallest k)
            B_out[fi] = False
            B_out[fi, ficlusts[ficlosest]] = True

    # Remove clusters smaller than minimum cluster size
    ClusterSizes = np.sum(B_out, axis=0)
    B_out = B_out[:, ClusterSizes >= smallestClusterSize]

    return B_out
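
A usage sketch with the same hypothetical B, X and GDM as above (it additionally assumes the st.weighted_std_axis helper is importable; stds controls how aggressively outlying within-cluster deviations are zeroed before the belongingness test):

B_corrected2 = correcterrors_weighted_outliers2(B, X, GDM, clustdists=None,
                                                stds=3, smallestClusterSize=5)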