Example #1
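The snippet relies on the imports below. The package-internal helpers referenced in the body (ds, io, op, cl, generateCoPaM, generateCoPaMfromidx, binarise, sortclusters, fuzzystretch, clustDataset, and the constant maxgenesinsetforpdist) come from the enclosing clust-style package, so no import paths are shown for them; any path you add for those names is an assumption about that package's layout.

import collections
import gc
import math
import sys
import warnings

import numpy as np
from joblib import Parallel, delayed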
def uncles(X,
           type='A',
           Ks=[n for n in range(4, 21, 4)],
           params=None,
           methods=None,
           methodsDetailed=None,
           U=None,
           Utype='PM',
           relabel_technique='minmin',
           setsP=None,
           setsN=None,
           dofuzzystretch=False,
           wsets=None,
           wmethods=None,
           GDM=None,
           smallestClusterSize=11,
           CoPaMfinetrials=1,
           CoPaMfinaltrials=1,
           binarise_techniqueP='DTB',
           binarise_paramP=np.arange(0.0, 1.1, 0.1, dtype='float'),
           binarise_techniqueN='DTB',
           binarise_paramN=np.concatenate(([sys.float_info.epsilon],
                                           np.arange(0.1,
                                                     1.1,
                                                     0.1,
                                                     dtype='float'))),
           Xnames=None,
           deterministic=False,
           ncores=1):
    Xloc = ds.listofarrays2arrayofarrays(X)
    L = len(Xloc)  # Number of datasets

    # Fix parameters
    if params is None: params = {}
    if setsP is None: setsP = [x for x in range(int(math.floor(L / 2)))]
    if setsN is None: setsN = [x for x in range(int(math.floor(L / 2)), L)]
    setsPN = np.array(np.concatenate((setsP, setsN), axis=0), dtype=int)
    Xloc = Xloc[setsPN]
    L = np.shape(Xloc)[0]  # Number of datasets
    if wsets is None:
        wsets = np.array([1 for x in range(L)])
    else:
        wsets = np.array(wsets)[setsPN]
    if GDM is None:
        Ng = np.shape(Xloc[0])[0]
        GDMloc = np.ones([Ng, L], dtype='bool')
    else:
        GDMloc = GDM[:, setsPN]
        Ng = GDMloc.shape[0]
    if Xnames is None:
        Xnames = ['X{0}'.format(l) for l in range(L)]

    if methods is None:
        methods = [['k-means']]
        # largest_DS = np.max([x.shape[0] for x in Xloc])
        # if (largest_DS <= maxgenesinsetforpdist):
        #    methods = [['k-means'], ['HC']]
        # else:
        #    methods = [['k-means']]
    else:
        largest_DS = np.max([x.shape[0] for x in Xloc])
        if (largest_DS > maxgenesinsetforpdist):
            methods = [
                m for m in methods
                if 'hc' not in [entry.lower() for entry in m]
            ]
            if not methods:
                io.log('No valid base clustering method can be used. Note that clust does not use '
                       'HC clustering on datasets with more than {0} genes, and your largest '
                       'dataset has {1} genes.'.format(maxgenesinsetforpdist, largest_DS))
                io.log('Clust will terminate here.')
                io.log(op.bottomline(), addextrastick=False)
                sys.exit()
    if methodsDetailed is None:
        methodsDetailedloc = np.array([methods for l in range(L)])
    else:
        methodsDetailedloc = methodsDetailed[setsPN]
    if wmethods is None:
        wmethods = [[1 for x in m] for m in methodsDetailedloc]
    elif not isinstance(wmethods[0], (list, tuple, np.ndarray)):
        # A single flat list of method weights applies to every dataset
        wmethods = np.tile(wmethods, [L, 1])
    else:
        wmethods = np.array(wmethods)[setsPN]

    setsPloc = [ii for ii in range(len(setsP))]
    setsNloc = [ii for ii in range(len(setsPloc), L)]  # empty when there are no negative sets

    Ks = np.array(Ks)
    Ks = Ks[Ks <= Ng]  # Remove Ks that are larger than the number of genes Ng
    Ks = Ks.tolist()
    NKs = len(Ks)  # Number of K values

    # If the dataset is empty, return basic output
    if Ng == 0:
        NPp = len(binarise_paramP)  # Number of P params
        NNp = len(binarise_paramN)  # Number of N params
        if type == 'A':
            B = np.zeros([CoPaMfinaltrials, NPp, 1, NKs], dtype=object)
        elif type == 'B':
            B = np.zeros([CoPaMfinaltrials, NPp, NNp, NKs], dtype=object)
        else:
            raise ValueError('Invalid UNCLES type. It has to be either A or B')
        Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)

        params = dict(
            params, **{
                'methods': methods,
                'setsP': setsPloc,
                'setsN': setsNloc,
                'dofuzzystretch': dofuzzystretch,
                'type': type,
                'Ks': Ks,
                'NKs': NKs,
                'wsets': wsets,
                'wmethods': wmethods,
                'L': L,
                'CoPaMs': np.array([None] * (CoPaMfinaltrials * NKs)).reshape(
                    [CoPaMfinaltrials, NKs]),
                'smallestclustersize': smallestClusterSize,
                'GDM': GDMloc
            })

        Uloc = np.array([None] * (L * NKs)).reshape([L, NKs])

        UnclesRes = collections.namedtuple('UnclesRes',
                                           ['B', 'Mc', 'params', 'X', 'U'])
        return UnclesRes(B, Mc, params, Xloc, Uloc)

    # Clustering
    if U is None:
        Utype = 'PM'
        Uloc = np.array([None] * (L * NKs)).reshape([L, NKs])
        totalparallel = np.sum(Ks) * np.sum(
            [len(meths) for meths in methodsDetailedloc])
        for meths in methodsDetailedloc:
            for meth in meths:
                if 'k-means' in meth:
                    totalparallel += np.max(Ks) * np.max(Ks)
        io.resetparallelprogress(totalparallel)

        for l in range(L):
            # Cache kmeans initialisations for the dataset once to save time:
            cl.cache_kmeans_init(Xloc[l],
                                 Ks,
                                 methodsDetailedloc[l],
                                 datasetID=l)

            # Now go to parallel clustering
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                Utmp = Parallel(n_jobs=ncores)(
                    delayed(clustDataset)(Xloc[l], Ks[ki], methodsDetailedloc[l],
                                          GDMloc[:, l], Ng, l)
                    for ki in range(NKs))
                for ki in range(NKs):
                    Uloc[l, ki] = Utmp[ki]

                gc.collect()
                #io.updateparallelprogress(np.sum(Ks) * len(methodsDetailedloc))

    else:
        Uloc = ds.listofarrays2arrayofarrays(U)[setsPN]

    # Calculate a CoPaM for each dataset at each K
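    # (CoPaM: Consensus Partition Matrix, a fuzzy consensus built from
    # repeated relabelled base clustering results)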
    CoPaMsFine = np.array([None] * (L * NKs)).reshape([L, NKs])
    for l in range(L):
        for ki in range(NKs):
            if Utype.lower() == 'pm':
                CoPaMsFineTmp = [
                    generateCoPaM(Uloc[l, ki],
                                  relabel_technique=relabel_technique,
                                  X=[Xloc[l]],
                                  w=wmethods[l],
                                  K=Ks[ki],
                                  GDM=GDMloc[:, l].reshape([-1, 1]))
                    for i in range(CoPaMfinetrials)
                ]
            elif Utype.lower() == 'idx':
                CoPaMsFineTmp = \
                    [generateCoPaMfromidx(Uloc[l, ki], relabel_technique=relabel_technique, X=Xloc,
                                          w=wmethods[l], K=Ks[ki])
                     for i in range(CoPaMfinetrials)]
            else:
                raise ValueError('Invalid Utype')
            CoPaMsFine[l, ki] = generateCoPaM(CoPaMsFineTmp,
                                              relabel_technique=relabel_technique,
                                              X=[Xloc[l]],
                                              GDM=GDMloc[:, l].reshape([-1, 1]))

            if dofuzzystretch:
                CoPaMsFine[l, ki] = fuzzystretch(CoPaMsFine[l, ki])

    # Calculate the final CoPaM for each K
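    # For type 'A', one consensus is built over all datasets; for type 'B',
    # separate consensus matrices are built for the positive (setsPloc) and
    # negative (setsNloc) dataset groups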
    CoPaMs = np.array([None] * (CoPaMfinaltrials * NKs)).reshape(
        [CoPaMfinaltrials, NKs])
    CoPaMsP = np.array([None] * (CoPaMfinaltrials * NKs)).reshape(
        [CoPaMfinaltrials, NKs])
    CoPaMsN = np.array([None] * (CoPaMfinaltrials * NKs)).reshape(
        [CoPaMfinaltrials, NKs])
    for t in range(CoPaMfinaltrials):
        for ki in range(NKs):
            if type == 'A':
                if Utype.lower() == 'pm':
                    CoPaMs[t, ki] = generateCoPaM(
                        CoPaMsFine[:, ki],
                        relabel_technique=relabel_technique,
                        w=wsets,
                        X=Xloc,
                        GDM=GDMloc)
                elif Utype.lower() == 'idx':
                    CoPaMs[t, ki] = generateCoPaMfromidx(
                        CoPaMsFine[:, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets,
                        GDM=GDMloc)
                else:
                    raise ValueError('Invalid Utype')
            elif type == 'B':
                if Utype.lower() == 'pm':
                    CoPaMsP[t, ki] = generateCoPaM(
                        CoPaMsFine[setsPloc, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets[setsPloc],
                        GDM=GDMloc[:, setsPloc])
                    CoPaMsN[t, ki] = generateCoPaM(
                        CoPaMsFine[setsNloc, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets[setsNloc],
                        GDM=GDMloc[:, setsNloc])
                elif Utype.lower() == 'idx':
                    CoPaMsP[t, ki] = generateCoPaMfromidx(
                        CoPaMsFine[setsPloc, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets[setsPloc],
                        GDM=GDMloc[:, setsPloc])
                    CoPaMsN[t, ki] = generateCoPaMfromidx(
                        CoPaMsFine[setsNloc, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets[setsNloc],
                        GDM=GDMloc[:, setsNloc])
                else:
                    raise ValueError('Invalid Utype')
            else:
                raise ValueError(
                    'Invalid UNCLES type. It has to be either A or B')

    # Binarise
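    # Each fuzzy CoPaM is binarised into hard cluster memberships once per
    # threshold parameter, with clusters sorted by size between the pre- and
    # post-sorting passes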
    NPp = len(binarise_paramP)  # Number of P params
    NNp = len(binarise_paramN)  # Number of N params
    if type == 'A':
        B = np.zeros([CoPaMfinaltrials, NPp, 1, NKs], dtype=object)
    elif type == 'B':
        B = np.zeros([CoPaMfinaltrials, NPp, NNp, NKs], dtype=object)
    else:
        raise ValueError('Invalid UNCLES type. It has to be either A or B')
    Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)

    for t in range(CoPaMfinaltrials):
        for ki in range(NKs):
            if type == 'A':
                # Pre-sorting binarisation
                for p in range(NPp):
                    B[t, p, 0, ki] = binarise(CoPaMs[t, ki], binarise_techniqueP,
                                              binarise_paramP[p])
                Mc[t, ki] = [np.sum(Bp, axis=0) for Bp in B[t, :, 0, ki]]

                # Sorting
                CoPaMs[t, ki] = sortclusters(CoPaMs[t, ki], Mc[t, ki],
                                             smallestClusterSize)

                # Post-sorting binarisation
                for p in range(NPp):
                    B[t, p, 0, ki] = binarise(CoPaMs[t, ki], binarise_techniqueP,
                                              binarise_paramP[p])
                Mc[t, ki] = [np.sum(Bp, axis=0) for Bp in B[t, :, 0, ki]]
            elif type == 'B':
                # Pre-sorting binarisation
                BP = [
                    binarise(CoPaMsP[t, ki], binarise_techniqueP,
                             binarise_paramP[p]) for p in range(NPp)
                ]
                McP = [np.sum(BPp, axis=0) for BPp in BP]

                BN = [
                    binarise(CoPaMsN[t, ki], binarise_techniqueN,
                             binarise_paramN[p]) for p in range(NNp)
                ]
                McN = [np.sum(BNp, axis=0) for BNp in BN]

                # Sorting
                CoPaMsP[t, ki] = sortclusters(CoPaMsP[t, ki], McP,
                                              smallestClusterSize)
                CoPaMsN[t, ki] = sortclusters(CoPaMsN[t, ki], McN,
                                              smallestClusterSize)

                # Post-sorting binarisation
                BP = [
                    binarise(CoPaMsP[t, ki], binarise_techniqueP,
                             binarise_paramP[p]) for p in range(NPp)
                ]
                McP = [np.sum(BPp, axis=0) for BPp in BP]

                BN = [
                    binarise(CoPaMsN[t, ki], binarise_techniqueN,
                             binarise_paramN[p]) for p in range(NNp)
                ]
                McN = [np.sum(BNp, axis=0) for BNp in BN]

                # UNCLES B logic: keep the genes assigned by the positive-set
                # clusters (BP) and knock out any gene that also appears in the
                # corresponding negative-set clusters (BN)
                for pp in range(NPp):
                    for pn in range(NNp):
                        B[t, pp, pn, ki] = np.array(BP[pp])  # copy so the in-place edit below cannot leak back into BP[pp]
                        B[t, pp, pn, ki][np.any(BN[pn], axis=1)] = False

                # Fill Mc
                Mc[t, ki] = [None] * Ks[ki]
                for k in range(Ks[ki]):
                    Mc[t, ki][k] = np.zeros([NPp, NNp])
                    for pp in range(NPp):
                        for pn in range(NNp):
                            Mc[t, ki][k][pp, pn] = np.sum(B[t, pp, pn, ki][:, k])

    # Prepare and return the results:
    params = dict(
        params, **{
            'methods': methods,
            'setsP': setsPloc,
            'setsN': setsNloc,
            'dofuzzystretch': dofuzzystretch,
            'type': type,
            'Ks': Ks,
            'NKs': NKs,
            'wsets': wsets,
            'wmethods': wmethods,
            'L': L,
            'CoPaMs': CoPaMs,
            'smallestclustersize': smallestClusterSize,
            'GDM': GDMloc
        })

    UnclesRes = collections.namedtuple('UnclesRes',
                                       ['B', 'Mc', 'params', 'X', 'U'])
    return UnclesRes(B, Mc, params, Xloc, Uloc)
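A minimal usage sketch for the function above, with synthetic data (the array shapes and parameter values here are illustrative assumptions, not part of the original example):

# Hypothetical demo: two synthetic "expression" datasets over the same 200
# genes, consensus-clustered with UNCLES type 'A' at K = 4 and K = 8
np.random.seed(0)
X1 = np.random.rand(200, 6)   # 200 genes x 6 conditions
X2 = np.random.rand(200, 10)  # 200 genes x 10 conditions
result = uncles([X1, X2], type='A', Ks=[4, 8], ncores=1)
# result is a namedtuple: result.B (binarised partitions), result.Mc (cluster
# sizes), result.params, result.X (the datasets), and result.U (base results)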
Example #2
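As in Example #1, the standard imports are listed below; ds, io, and mseclusters are helpers from the enclosing package whose import paths are not shown (any path you add for them is an assumption):

import collections
import gc
import math
import warnings

import numpy as np
import scipy as sp
import scipy.spatial.distance  # required so that sp.spatial.distance is available
from joblib import Parallel, delayed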
def mnplotsgreedy(X, B, type='A', params=None, allMSE=None, tightnessweight=1, setsP=None, setsN=None, Xtype='data',
                  mseCache=None, wsets=None, GDM=None, msesummary='average', percentageOfClustersKept=100,
                  smallestClusterSize=11, Xnames=None, ncores=1):
    Xloc = ds.listofarrays2arrayofarrays(X)
    Bloc = ds.reduceToArrayOfNDArraysAsObjects(B, 2)
    L = Xloc.shape[0]  # Number of datasets

    # Fix parameters
    if params is None: params = {}
    if setsP is None: setsP = [x for x in range(int(math.floor(L / 2)))]
    if setsN is None: setsN = [x for x in range(int(math.floor(L / 2)), L)]
    setsPN = np.array(np.concatenate((setsP, setsN), axis=0), dtype=int)
    Xloc = Xloc[setsPN]
    L = Xloc.shape[0]
    if wsets is None:
        wsets = np.array([1 for x in range(L)])
    if GDM is None:
        Ng = np.shape(Xloc[0])[0]
        GDMloc = np.ones([Ng, L], dtype='bool')
    else:
        Ng = np.shape(GDM)[0]
        GDMloc = GDM[:, setsPN]
    if Xnames is None:
        Xnames = ['X{0}'.format(l) for l in range(L)]

    # Put all clusters in one matrix
    N = Bloc.shape[0]  # Number of partitions
    K = [Bloc[i].shape[1] for i in range(N)]  # Number of clusters in each partition

    # One big matrix for all clusters
    BB = Bloc[0]
    for n in range(1, N):
        BB = np.append(BB, Bloc[n], axis=1)
    VMc = np.sum(BB, axis=0)
    NN = len(VMc)  # Total number of clusters

    # Fill the mseCache if not provided
    if mseCache is None and allMSE is None:
        # Cache all mse values
        mseCache = np.zeros([NN, L])
        io.resetparallelprogress(NN * L)
        for l in range(L):
            if Xtype == 'files':
                # load files here
                raise NotImplementedError('Xtype "files" has not been implemented yet.')
            elif Xtype == 'data':
                Xtmp = Xloc[l]
            else:
                raise ValueError('Xtype has to be "files" or "data". The given Xtype is invalid.')

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                mseCachetmp = Parallel(n_jobs=ncores)(
                    delayed(mseclusters)(Xtmp, ds.matlablike_index2D(BB, GDMloc[:, l], nn), 0)
                    for nn in range(NN))
                mseCachetmp = [mm[0] for mm in mseCachetmp]
                for nn in range(NN):
                    mseCache[nn, l] = mseCachetmp[nn]

                gc.collect()

                #io.updateparallelprogress(NN)

            # Serial equivalent of the parallel loop above, kept for reference:
            # for nn in range(NN):
            #     mseCache[nn, l] = mseclusters(Xtmp, ds.matlablike_index2D(BB, GDMloc[:, l], nn), 0)[0]
            # io.log('Cluster evaluation for {0} is done.'.format(Xnames[l]))

    # Calculate allMSE if needed (NN x 1)
    if allMSE is None:
        if type == 'A':
            wsetsloc = wsets[setsPN]
            wsetsloc = [float(n)/sum(wsetsloc) for n in wsetsloc]
            if msesummary == 'average' or msesummary == 'mean':
                allMSE = np.dot(mseCache[:, setsPN], wsetsloc)
            elif msesummary == 'worse' or msesummary == 'max':
                allMSE = np.max(np.multiply(mseCache[:, setsPN], wsetsloc), axis=1)
            else:
                raise ValueError('msesummary has to be "average", "mean", "worse", or "max". '
                                 '"average" and "mean" behave identically, as do "worse" and "max".')
        elif type == 'B':
            wsetsP = wsets[setsP]
            wsetsP = [n/sum(wsetsP) for n in wsetsP]
            wsetsN = wsets[setsN]
            wsetsN = [n / sum(wsetsN) for n in wsetsN]
            if msesummary == 'average' or msesummary == 'mean':
                allMSE = np.dot(mseCache[:, setsP] , wsetsP) - np.dot(mseCache[:, setsN] , wsetsN)
            elif msesummary == 'worse' or msesummary == 'max':
                allMSE = np.max(np.multiply(mseCache[:, setsP], wsetsP), axis=1) \
                         - np.max(np.multiply(mseCache[:, setsN], wsetsN), axis=1)
            else:
                raise ValueError('msesummary has to be "average", "mean", "worse", or "max". '
                                 '"average" and "mean" behave identically, as do "worse" and "max".')
        else:
            raise ValueError('Type should be either A or B; given type is invalid.')

    # Find the distances
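    # Each cluster becomes a 2-D point: x is its scaled MSE (weighted by
    # tightnessweight) and y is the scaled log10 of its size. Tight, large
    # clusters sit near the ideal point (0, 1), so the Euclidean distance to
    # (0, 1) ranks them; clusters with NaN coordinates get a worst-case value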
    maxx = np.max(allMSE[~np.isnan(allMSE)])
    minx = np.min(allMSE[~np.isnan(allMSE)])
    maxy = np.log10(np.max(VMc))
    miny = 0
    with np.errstate(divide='ignore'):
        allVecs = np.concatenate(([(allMSE - minx) / (maxx - minx)],
                                  [(np.log10(VMc) - miny) / (maxy - miny)]), axis=0).transpose()
    allVecs[:, 0] *= tightnessweight
    allDists = np.array([np.sqrt(1.1 + np.power(tightnessweight, 2)) if np.any(np.isnan(n))
                         else sp.spatial.distance.euclidean(n, [0, 1]) for n in allVecs])
    # Break ties between identical distances with a tiny random jitter so that
    # every cluster gets a unique rank
    alpha = 0.0001
    tmp, uVdsI = np.unique(allDists, return_index=True)
    while len(uVdsI) != len(allDists):
        for n in range(len(allDists)):
            if n not in uVdsI:
                allDists[n] += alpha * np.random.normal()
        tmp, uVdsI = np.unique(allDists, return_index=True)

    # Helper function for the greedy solution below: repeatedly pick the
    # remaining cluster closest to the ideal point, then drop every cluster
    # that overlaps with it (shares at least one gene)
    def mngreedy(Bloc, I, Vds, max_iter=float('inf')):
        Vdsloc = np.array(Vds)
        res = np.array([False for n in Vdsloc])
        if max_iter == 0 or not any(I):
            return res
        for n in range(len(I)):
            if not I[n]:
                Vdsloc[n] = float('inf')
        p = np.argmin(Vdsloc)
        res[p] = True
        overlaps = np.dot(ds.matlablike_index2D(Bloc, 'all', p).transpose(), Bloc) > 0
        I &= ~overlaps
        return res | mngreedy(Bloc, I, Vdsloc, max_iter - 1)

    # ** Find greedy solution **
    # Sort clusters by distance (not required for correctness, but it gives a
    # tidier output ordering)
    II = np.argsort(allDists)
    allDists = allDists[II]
    BB = ds.matlablike_index2D(BB, 'a', II)
    allVecs = ds.matlablike_index2D(allVecs, II, 'a')
    allMSE = allMSE[II]
    mseCache = ds.matlablike_index2D(mseCache, II, 'a')
    VMc = VMc[II]

    # Include the top percentageOfClustersKept% of clusters among those with at
    # least smallestClusterSize genes; NaN distances are pushed to the worst value
    Ismall = VMc < smallestClusterSize
    Inans = np.isnan(allDists)
    tmpDists = np.array([np.max(allDists) if Inans[n] | Ismall[n] else allDists[n]
                         for n in range(len(allDists))])
    percentageOfClustersKept *= float(np.sum(~Ismall)) / len(allDists)
    Iincluded = (tmpDists <= np.percentile(tmpDists, percentageOfClustersKept)) & ~Ismall
    I = mngreedy(BB, Iincluded, allDists)
    B_out = ds.matlablike_index2D(BB, 'a', I)

    # Prepare and return the results:
    params = dict(params, **{
        'tightnessweight': tightnessweight,
        'msesummary': msesummary,
        'percentageofclusterskept': percentageOfClustersKept,
        'smallestclustersize': smallestClusterSize
    })

    MNResults = collections.namedtuple('MNResults',
                                       ['B', 'I', 'allVecs', 'allDists', 'allMSE', 'mseCache', 'Ball', 'params'])
    return MNResults(B_out, I, allVecs, allDists, allMSE, mseCache, BB, params)
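A matching sketch for mnplotsgreedy, again with synthetic inputs (shapes and values are illustrative assumptions): X is a list of datasets and B a list of binary partition matrices over the same genes.

# Hypothetical demo: greedily select non-overlapping clusters from two random
# hard partitions of 100 genes
np.random.seed(0)
X1 = np.random.rand(100, 5)
X2 = np.random.rand(100, 8)
B1 = np.zeros((100, 4), dtype=bool)
B1[np.arange(100), np.random.randint(0, 4, 100)] = True  # one cluster per gene
B2 = np.zeros((100, 6), dtype=bool)
B2[np.arange(100), np.random.randint(0, 6, 100)] = True
res = mnplotsgreedy([X1, X2], [B1, B2], smallestClusterSize=5)
# res.B holds the selected clusters, res.I flags which of the pooled clusters
# were kept, and res.allDists gives each cluster's distance to the ideal (0, 1)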