# Example 1
def uncles(X,
           type='A',
           Ks=[n for n in range(4, 21, 4)],
           params=None,
           methods=None,
           methodsDetailed=None,
           U=None,
           Utype='PM',
           relabel_technique='minmin',
           setsP=None,
           setsN=None,
           dofuzzystretch=False,
           wsets=None,
           wmethods=None,
           GDM=None,
           smallestClusterSize=11,
           CoPaMfinetrials=1,
           CoPaMfinaltrials=1,
           binarise_techniqueP='DTB',
           binarise_paramP=np.arange(0.0, 1.1, 0.1, dtype='float'),
           binarise_techniqueN='DTB',
           binarise_paramN=np.concatenate(([sys.float_info.epsilon],
                                           np.arange(0.1,
                                                     1.1,
                                                     0.1,
                                                     dtype='float'))),
           Xnames=None,
           deterministic=False,
           ncores=1):
    """UNCLES: UNification of CLustering results from multiple datasets.

    Clusters each dataset in `X` with the base clustering methods at each
    K, fuses the partitions into consensus partition matrices (CoPaMs),
    and binarises them over the given parameter ranges.

    Parameters (non-obvious ones only):
        X: list of datasets (each a genes x samples array), row-aligned
            through `GDM`.
        type: 'A' = clusters consistent across all datasets;
            'B' = clusters present in the P-sets but absent from the N-sets.
        U: pre-computed base clustering results ([dataset, K] layout);
            generated here when None.
        Utype: 'PM' (partition matrices) or 'idx' (label vectors),
            describing the entries of `U`.
        setsP, setsN: indices of the positive/negative datasets (used by
            type 'B'); by default X is split in half.
        GDM: boolean gene-dataset presence matrix (genes x datasets).
        binarise_paramP, binarise_paramN: binarisation threshold values
            for the positive and negative CoPaMs respectively.

    Returns:
        UnclesRes namedtuple with fields (B, Mc, params, X, U), where
        B[trial, pParam, nParam, kIndex] holds binarised cluster
        membership and Mc holds the corresponding cluster sizes.
    """
    Xloc = ds.listofarrays2arrayofarrays(X)
    L = len(Xloc)  # Number of datasets

    # Fix parameters
    if params is None: params = {}
    if setsP is None: setsP = [x for x in range(int(math.floor(L / 2)))]
    if setsN is None: setsN = [x for x in range(int(math.floor(L / 2)), L)]
    # Reorder the datasets so that the P-sets come first, then the N-sets
    setsPN = np.array(np.concatenate((setsP, setsN), axis=0), dtype=int)
    Xloc = Xloc[setsPN]
    L = np.shape(Xloc)[0]  # Number of datasets after subsetting
    if wsets is None:
        wsets = np.array([1 for x in range(L)])
    else:
        wsets = np.array(wsets)[setsPN]
    if GDM is None:
        Ng = np.shape(Xloc[0])[0]
        GDMloc = np.ones([Ng, L], dtype='bool')
    else:
        GDMloc = GDM[:, setsPN]
        Ng = GDMloc.shape[0]
    if Xnames is None:
        Xnames = ['X{0}'.format(l) for l in range(L)]

    if methods is None:
        methods = [['k-means']]
        # largest_DS = np.max([x.shape[0] for x in Xloc])
        # if (largest_DS <= maxgenesinsetforpdist):
        #    methods = [['k-means'], ['HC']]
        # else:
        #    methods = [['k-means']]
    else:
        # HC needs a pairwise distance matrix, which is not feasible for
        # very large datasets; drop it (and abort if nothing remains).
        largest_DS = np.max([x.shape[0] for x in Xloc])
        if (largest_DS > maxgenesinsetforpdist):
            methods = [
                m for m in methods
                if 'hc' not in [entry.lower() for entry in m]
            ]
            if not methods:
                io.log('No valid base clustering can be used. Please note that clust would not use HC clustering ' \
                       'on datasets with more than {0} genes. You have a dataset with {1} genes.' \
                       ''.format(maxgenesinsetforpdist, largest_DS))
                io.log('Clust will terminate here.')
                io.log(op.bottomline(), addextrastick=False)
                sys.exit()
    if methodsDetailed is None:
        methodsDetailedloc = np.array([methods for l in range(L)])
    else:
        methodsDetailedloc = methodsDetailed[setsPN]
    if wmethods is None:
        wmethods = [[1 for x in m] for m in methodsDetailedloc]
    elif not isinstance(wmethods[0], (list, tuple, np.ndarray)):
        # A flat list of per-method weights: replicate it for every dataset.
        # BUGFIX: this used to tile `methods` (the method *names*) instead
        # of the weights themselves.
        wmethods = np.tile(wmethods, [L, 1])
    else:
        wmethods = np.array(wmethods)[setsPN]

    setsPloc = [ii for ii in range(len(setsP))]
    # BUGFIX: always define setsNloc (it is read below even when empty);
    # previously it was only assigned when L > len(setsPloc), raising a
    # NameError whenever there are no negative sets.
    setsNloc = [ii for ii in range(len(setsPloc), L)]

    Ks = np.array(Ks)
    Ks = Ks[Ks <= Ng]  # Remove Ks that are larger than the number of genes Ng
    Ks = Ks.tolist()
    NKs = len(Ks)  # Number of K values

    # If the dataset is empty, return basic output
    if Ng == 0:
        NPp = len(binarise_paramP)  # Number of P params
        NNp = len(binarise_paramN)  # Number of N params
        if type == 'A':
            B = np.zeros([CoPaMfinaltrials, NPp, 1, NKs], dtype=object)
            Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)
        elif type == 'B':
            B = np.zeros([CoPaMfinaltrials, NPp, NNp, NKs], dtype=object)
            Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)
        else:
            # BUGFIX: fail explicitly instead of a later UnboundLocalError on B
            raise ValueError('Invalid UNCLES type. It has to be either A or B')

        # BUGFIX: removed the 'Ds' entry, which referenced an undefined name
        # and is not part of the params returned on the non-empty path either.
        params = dict(
            params, **{
                'methods': methods,
                'setsP': setsPloc,
                'setsN': setsNloc,
                'dofuzzystretch': dofuzzystretch,
                'type': type,
                'Ks': Ks,
                'NKs': NKs,
                'wsets': wsets,
                'wmethods': wmethods,
                'L': L,
                'CoPaMs': np.array([None] * (CoPaMfinaltrials * NKs)).reshape(
                    [CoPaMfinaltrials, NKs]),
                'smallestclustersize': smallestClusterSize,
                'GDM': GDMloc
            })

        Uloc = np.array([None] * (L * NKs)).reshape([L, NKs])

        UnclesRes = collections.namedtuple('UnclesRes',
                                           ['B', 'Mc', 'params', 'X', 'U'])
        return UnclesRes(B, Mc, params, Xloc, Uloc)

    # Clustering: generate the base partitions unless provided via U
    if U is None:
        Utype = 'PM'
        Uloc = np.array([None] * (L * NKs)).reshape([L, NKs])
        totalparallel = np.sum(Ks) * np.sum(
            [len(meths) for meths in methodsDetailedloc])
        for meths in methodsDetailedloc:
            for meth in meths:
                if 'k-means' in meth:
                    totalparallel += np.max(Ks) * np.max(Ks)
                    continue
        io.resetparallelprogress(totalparallel)

        for l in range(L):
            # Cache kmeans initialisations for the dataset once to save time:
            cl.cache_kmeans_init(Xloc[l],
                                 Ks,
                                 methodsDetailedloc[l],
                                 datasetID=l)

            # Now go to parallel clustering
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                Utmp = Parallel(n_jobs=ncores)\
                    (delayed(clustDataset)
                     (Xloc[l], Ks[ki], methodsDetailedloc[l], GDMloc[:, l], Ng, l) for ki in range(NKs))

                Utmp = [u for u in Utmp]
                for ki in range(NKs):
                    Uloc[l, ki] = Utmp[ki]

                gc.collect()
                #io.updateparallelprogress(np.sum(Ks) * len(methodsDetailedloc))

    else:
        Uloc = ds.listofarrays2arrayofarrays(U)[setsPN]

    # Calculate a CoPaM for each dataset at each K
    CoPaMsFine = np.array([None] * (L * NKs)).reshape([L, NKs])
    for l in range(L):
        for ki in range(NKs):
            if Utype.lower() == 'pm':
                CoPaMsFineTmp = [
                    generateCoPaM(Uloc[l, ki],
                                  relabel_technique=relabel_technique,
                                  X=[Xloc[l]],
                                  w=wmethods[l],
                                  K=Ks[ki],
                                  GDM=GDMloc[:, l].reshape([-1, 1]))
                    for i in range(CoPaMfinetrials)
                ]
            elif Utype.lower() == 'idx':
                CoPaMsFineTmp = \
                    [generateCoPaMfromidx(Uloc[l, ki], relabel_technique=relabel_technique, X=Xloc,
                                          w=wmethods[l], K=Ks[ki])
                     for i in range(CoPaMfinetrials)]
            else:
                raise ValueError('Invalid Utype')
            # Fuse the fine trials into a single per-dataset CoPaM
            CoPaMsFine[l,
                       ki] = generateCoPaM(CoPaMsFineTmp,
                                           relabel_technique=relabel_technique,
                                           X=[Xloc[l]],
                                           GDM=GDMloc[:, l].reshape([-1, 1]))

            if dofuzzystretch:
                CoPaMsFine[l, ki] = fuzzystretch(CoPaMsFine[l, ki])

    # Calculate the final CoPaM for each K (type A: one CoPaM over all
    # datasets; type B: separate CoPaMs over the P-sets and the N-sets)
    CoPaMs = np.array([None] * (CoPaMfinaltrials * NKs)).reshape(
        [CoPaMfinaltrials, NKs])
    CoPaMsP = np.array([None] * (CoPaMfinaltrials * NKs)).reshape(
        [CoPaMfinaltrials, NKs])
    CoPaMsN = np.array([None] * (CoPaMfinaltrials * NKs)).reshape(
        [CoPaMfinaltrials, NKs])
    for t in range(CoPaMfinaltrials):
        for ki in range(NKs):
            if type == 'A':
                if Utype.lower() == 'pm':
                    CoPaMs[t, ki] = generateCoPaM(
                        CoPaMsFine[:, ki],
                        relabel_technique=relabel_technique,
                        w=wsets,
                        X=Xloc,
                        GDM=GDMloc)
                elif Utype.lower() == 'idx':
                    CoPaMs[t, ki] = generateCoPaMfromidx(
                        CoPaMsFine[:, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets,
                        GDM=GDMloc)
                else:
                    raise ValueError('Invalid Utype')
            elif type == 'B':
                if Utype.lower() == 'pm':
                    CoPaMsP[t, ki] = generateCoPaM(
                        CoPaMsFine[setsPloc, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets[setsPloc],
                        GDM=GDMloc[:, setsPloc])
                    CoPaMsN[t, ki] = generateCoPaM(
                        CoPaMsFine[setsNloc, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets[setsNloc],
                        GDM=GDMloc[:, setsNloc])
                elif Utype.lower() == 'idx':
                    CoPaMsP[t, ki] = generateCoPaMfromidx(
                        CoPaMsFine[setsPloc, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets[setsPloc],
                        GDM=GDMloc[:, setsPloc])
                    CoPaMsN[t, ki] = generateCoPaMfromidx(
                        CoPaMsFine[setsNloc, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets[setsNloc],
                        GDM=GDMloc[:, setsNloc])
                else:
                    raise ValueError('Invalid Utype')
            else:
                raise ValueError(
                    'Invalid UNCLES type. It has to be either A or B')

    # Binarise
    NPp = len(binarise_paramP)  # Number of P params
    NNp = len(binarise_paramN)  # Number of N params
    if type == 'A':
        B = np.zeros([CoPaMfinaltrials, NPp, 1, NKs], dtype=object)
        Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)
    elif type == 'B':
        B = np.zeros([CoPaMfinaltrials, NPp, NNp, NKs], dtype=object)
        Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)

    for t in range(CoPaMfinaltrials):
        for ki in range(NKs):
            if type == 'A':
                # Pre-sorting binarisation
                for p in range(NPp):
                    B[t, p, 0, ki] = binarise(CoPaMs[t,
                                                     ki], binarise_techniqueP,
                                              binarise_paramP[p])
                Mc[t, ki] = [np.sum(Bp, axis=0) for Bp in B[t, :, 0, ki]]

                # Sorting
                CoPaMs[t, ki] = sortclusters(CoPaMs[t, ki], Mc[t, ki],
                                             smallestClusterSize)

                # Post-sorting binarisation
                for p in range(NPp):
                    B[t, p, 0, ki] = binarise(CoPaMs[t,
                                                     ki], binarise_techniqueP,
                                              binarise_paramP[p])
                Mc[t, ki] = [np.sum(Bp, axis=0) for Bp in B[t, :, 0, ki]]
            elif type == 'B':
                # Pre-sorting binarisation
                BP = [
                    binarise(CoPaMsP[t, ki], binarise_techniqueP,
                             binarise_paramP[p]) for p in range(NPp)
                ]
                McP = [np.sum(BPp, axis=0) for BPp in BP]

                BN = [
                    binarise(CoPaMsN[t, ki], binarise_techniqueN,
                             binarise_paramN[p]) for p in range(NNp)
                ]
                McN = [np.sum(BNp, axis=0) for BNp in BN]

                # Sorting
                CoPaMsP[t, ki] = sortclusters(CoPaMsP[t, ki], McP,
                                              smallestClusterSize)
                CoPaMsN[t, ki] = sortclusters(CoPaMsN[t, ki], McN,
                                              smallestClusterSize)

                # Post-sorting binarisation
                BP = [
                    binarise(CoPaMsP[t, ki], binarise_techniqueP,
                             binarise_paramP[p]) for p in range(NPp)
                ]
                McP = [np.sum(BPp, axis=0) for BPp in BP]

                BN = [
                    binarise(CoPaMsN[t, ki], binarise_techniqueN,
                             binarise_paramN[p]) for p in range(NNp)
                ]
                McN = [np.sum(BNp, axis=0) for BNp in BN]

                # UNCLES B logic: keep a gene's positive membership only if
                # it belongs to no negative cluster.
                for pp in range(NPp):
                    for pn in range(NNp):
                        # BUGFIX: copy BP[pp] before zeroing rows; assigning
                        # it directly aliased the same array for every pn, so
                        # zeroing for one pn corrupted all subsequent ones.
                        Bpn = np.array(BP[pp])
                        Bpn[np.any(BN[pn], axis=1)] = False
                        B[t, pp, pn, ki] = Bpn

                # Fill Mc with per-cluster sizes over the (P, N) param grid
                Mc[t, ki] = [None] * Ks[ki]
                for k in range(Ks[ki]):
                    Mc[t, ki][k] = np.zeros([NPp, NNp])
                    for pp in range(NPp):
                        for pn in range(NNp):
                            Mc[t, ki][k][pp, pn] = np.sum(B[t, pp, pn, ki][:,
                                                                           k])

    # Prepare and return the results:
    params = dict(
        params, **{
            'methods': methods,
            'setsP': setsPloc,
            'setsN': setsNloc,
            'dofuzzystretch': dofuzzystretch,
            'type': type,
            'Ks': Ks,
            'NKs': NKs,
            'wsets': wsets,
            'wmethods': wmethods,
            'L': L,
            'CoPaMs': CoPaMs,
            'smallestclustersize': smallestClusterSize,
            'GDM': GDMloc
        })

    UnclesRes = collections.namedtuple('UnclesRes',
                                       ['B', 'Mc', 'params', 'X', 'U'])
    return UnclesRes(B, Mc, params, Xloc, Uloc)
# Example 2
def main(args=None):
    """Command-line entry point for Clust.

    Builds the argument parser, parses `args` (defaults to sys.argv[1:],
    printing help and exiting when empty), and forwards the options to
    clustpipeline.clustpipeline.
    """
    if args is None:
        args = sys.argv[1:]

    # Parse arguments; the header text is shown at the top of --help output
    headertxt = op.topline()
    headertxt += op.msgformated(
        'Clust\n'
        'Optimised consensus clustering of multiple heterogeneous datasets\n'
        'Version {0}\n'
        '\n'
        'By Basel Abu-Jamous\n'
        'Department of Plant Sciences\n'
        'The University of Oxford\n'
        '*****@*****.**'.format(version), '^')
    headertxt += op.midline()
    headertxt += op.msgformated('Citation\n' '~~~~~~~~', '^')
    citationtxt = 'When publishing work that uses Clust, please cite:\n' \
                  'Basel Abu-Jamous and Steven Kelly (2018) Clust: automatic extraction of optimal co-expressed ' \
                  'gene clusters from gene expression data. Genome Biology 19:172; ' \
                  'doi: https://doi.org/10.1186/s13059-018-1536-8.'
    # TODO: citation
    headertxt += op.msgformated(citationtxt, '<')
    headertxt += op.midline()
    headertxt += op.msgformated(
        'Full description of usage can be found at:\n'
        'https://github.com/BaselAbujamous/clust', '<')
    headertxt += op.bottomline()

    parser = argparse.ArgumentParser(description=headertxt,
                                     formatter_class=RawTextHelpFormatter)
    parser.add_argument('datapath',
                        help='Data file path or directory with data file(s).',
                        default=None)
    parser.add_argument(
        '-n',
        metavar='<file or int>',
        help='Normalisation file or list of codes (default: 1000)',
        default=['1000'],
        nargs='+')
    parser.add_argument('-r',
                        metavar='<file>',
                        help='Replicates structure file',
                        default=None)
    parser.add_argument('-m',
                        metavar='<file>',
                        help='OrthoGroups (OGs) mapping file',
                        default=None)
    parser.add_argument('-o',
                        metavar='<directory>',
                        help='Output directory',
                        default=None)
    parser.add_argument('-t',
                        metavar='<real number>',
                        type=float,
                        help='Cluster tightness (default: 1.0).',
                        default=1.0)
    parser.add_argument(
        '-basemethods',
        metavar='<string>',
        nargs='+',
        help='One or more base clustering methods (default: k-means)',
        default=None)
    parser.add_argument(
        '-K',
        metavar='<integer>',
        type=int,
        nargs='+',
        help='K values, e.g. 2 4 6 10 ... (default: 4 to 20 (step=4))',
        default=[n for n in range(4, 21, 4)])
    parser.add_argument('-s',
                        metavar='<real number>',
                        type=float,
                        help='Outlier standard deviations (default: 3.0)',
                        default=3.0)
    parser.add_argument(
        '-d',
        metavar='<integer>',
        type=int,
        help='Min datasets in which a gene must exist (default: 1)',
        default=1)
    parser.add_argument(
        '-fil-v',
        metavar='<real number>',
        dest='filv',
        type=float,
        help='Filtering: gene expression threshold (default: -inf)',
        default=-float("inf"))
    parser.add_argument('-fil-c',
                        metavar='<integer>',
                        dest='filc',
                        type=int,
                        help='Filtering: number of conditions (default: 0)',
                        default=0)
    # BUGFIX: the help text used to claim "(default: 1)" while the actual
    # default below is 0; the text now matches the behaviour.
    parser.add_argument('-fil-d',
                        metavar='<integer>',
                        dest='fild',
                        type=int,
                        help='Filtering: number of datasets (default: 0)',
                        default=0)
    parser.add_argument('--fil-abs',
                        dest='absval',
                        action='store_true',
                        help='Filter using absolute values of expression')
    parser.add_argument(
        '--fil-perc',
        dest='filperc',
        action='store_true',
        help='-fil-v is a percentile of genes rather than raw value')
    parser.add_argument(
        '--fil-flat',
        dest='filflat',
        action='store_true',
        help='Filter out genes with flat expression profiles (default)')
    parser.add_argument('--no-fil-flat',
                        dest='filflat',
                        action='store_false',
                        help='Cancels the default --fil-flat option')
    parser.add_argument('-cs',
                        metavar='<integer>',
                        type=int,
                        help='Smallest cluster size (default: 11)',
                        default=11)
    # BUGFIX: the old implicit concatenation 'Q3' 's ...' produced
    # "Q3s defining..."; the intended apostrophe is restored.
    parser.add_argument('-q3s',
                        metavar='<real number>',
                        type=float,
                        help="Q3's defining outliers (default: 2.0)",
                        default=2.0)
    parser.add_argument('--no-optimisation',
                        dest='optimisation',
                        action='store_false',
                        help='Skip cluster optimsation & completion')
    parser.add_argument(
        '--deterministic',
        dest='deterministic',
        action='store_true',
        help='Obsolete. All steps are already deterministic (v1.7.4+)')
    parser.add_argument('-np',
                        metavar='<integer>',
                        type=int,
                        help='Number of parallel processes (default: 1)',
                        default=1)
    parser.set_defaults(optimisation=True,
                        deterministic=False,
                        absval=False,
                        filperc=False,
                        filflat=True)
    # parser.add_argument('-ec', type=int, help='Perform error correction, 1 or 0 (default: 1)', default=1)

    # With no arguments at all, show the help message and exit
    if len(args) == 0:
        parser.parse_args(['-h'])

    args = parser.parse_args(args)

    # -fil-v is interpreted as a percentile when --fil-perc is given
    if args.filperc:
        filtype = 'perc'
    else:
        filtype = 'raw'

    # Each base method becomes its own single-element method list
    if args.basemethods is not None:
        args.basemethods = [[m] for m in args.basemethods]

    # Call the clust function
    clustpipeline.clustpipeline(args.datapath, args.m, args.r, args.n, args.o,
                                args.K, args.t, args.s, args.d, args.filv,
                                args.filc, args.fild, args.absval, filtype,
                                args.filflat, args.cs, args.np,
                                args.optimisation, args.q3s, args.basemethods,
                                args.deterministic)