Пример #1
0
 def name4param(self, keys=None):
     """Derive this object's name from its parameter dict and return it.

     When *keys* is given, only the matching parameter entries are
     kept (via pyutil.dictFilter) before flattening into a name.
     """
     params = self.param if keys is None else pyutil.dictFilter(self.param, keys)
     flat = pyutil.dict2flat(params)
     self.set_name(flat)
     return self.name
Пример #2
0
def fit_KMEANS(
    C,
    ALI='Test',
    maxIt=1000,
    nClu=30,
    DIR='.',
    model_only=0,
    random_state=None,
    reorder=0,
):
    X = C
    algo = 'KMEANS'
    param = {
        'genre': algo,
        'nClu': nClu,
        'maxIt': maxIt,
        'randomState': random_state,
    }
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    param.update(getattr(X, 'param', {}))
    X, rowName, colName = X.values, X.index, X.columns

    if ALI == 'Test':
        ALI = getattr(X, 'name', 'Test')

    mdl = skclu.KMeans(n_clusters=nClu,
                       n_init=1,
                       max_iter=maxIt,
                       random_state=random_state)
    NAME = '%s_%s' % (ALI, pyutil.dict2flat(param))

    print '[MSG] Now Fitting Model:%s' % NAME
    d = {
        'name': NAME,
        'train_data': X,
        'colName': colName,
        'rowName': rowName,
        'param': param,
    }

    try:
        logFile = open('%s/%s.log' % (DIR, NAME), 'w', 0)
        with pyutil.RedirectStdStreams(logFile):
            mdl.fit(X)
            d.update({'suc': 1, 'model': mdl})
        print "[SUCC] to fit Model:%s" % (NAME, )
    except Exception as e:
        print "[FAIL] to fit Model:%s due to :'%s'" % (NAME, e)
        d.update({'suc': 0})
    if model_only:
        d['train_data'] = None
        d['rowName'] = None
        d['colName'] = None

    np.save('%s/%s' % (DIR, NAME), d)
    d = scount.countMatrix.from_dict(d)
    return d
Пример #3
0
def main(f, dbg=0, reCallPeak=0, gPar=None):
    '''Run the ChIP peak-calling / summary pipeline described by config *f*.

    dbg=1: pretty-print the parsed parameters and return them early,
        before any filesystem side effects.
    reCallPeak: when truthy, re-run peak calling for every condition
        (after verifying each CHIP/INPUT file exists).
    gPar: pre-parsed global parameters; parsed from *f* when None.

    NOTE(review): this body references module-level names not visible in
    this chunk (SUMMARY_DIR, PEAK_CALL_PIPELINE_TEMPLATE,
    make_peak_call_script, peakSummary, get_global_parameters,
    get_conditions) -- confirm they are defined elsewhere in the module.
    '''
    global shellexec

    # Defined inside main() but promoted to module scope via `global`,
    # so other routines can call it after main() has run.
    def shellexec(cmd, dbg=0):
        # dbg mode echoes the command instead of executing it.
        if dbg:
            print cmd
            res = 'dbg'
        else:
            res = subprocess.check_output(cmd, shell=1)
        return res

#     #############################################################################################

#     DEPENDENT_FILES_PATH        = '/media/pw_synology3/Software/chip-summary/'  # [path of chip-summary.py]
#     DEFAULT_TARGET_RANGE        = '3000' # [change]  a string, not a number
#     SUMMARY_FILE_NAME           = 'summary.html'
#     SUMMARY_DIR                 = 'summary'
#     PEAK_CALL_PIPELINE_TEMPLATE = os.path.join(DEPENDENT_FILES_PATH, 'depend/script/pipeline724-t.sh')
#     PEAK_SELECT_SCRIPT          = os.path.join(DEPENDENT_FILES_PATH, 'depend/script/select_peaks.py')
#     GENELOCUS_TO_GENENAME_SCRIPT= os.path.join(DEPENDENT_FILES_PATH, 'depend/script/genelocus2genename.py')

#     #### Slowest part to be refactored???
#     EXTRACT_AGI_CODE_AND_FC     = os.path.join(DEPENDENT_FILES_PATH, 'depend/script/extract_AGI_code_and_fold_change.py')
#     GO_ENRICHMENT_SCRIPT        = os.path.join(DEPENDENT_FILES_PATH, 'depend/script/fe.sh')  # install goatools (GO enrichment) and edit fe.sh
#     GO_ENRICHMENT_DIFF_SCRIPT   = os.path.join(DEPENDENT_FILES_PATH, 'depend/script/goterm-matrix.py')
#     AGI_TO_GENE_NAMES           = os.path.join(DEPENDENT_FILES_PATH, 'depend/data/AGI-to-gene-names.txt')
#     ANNOTATION_FILE             = os.path.join(DEPENDENT_FILES_PATH, 'depend/data/genesTAIR10.bed') # for bedmap
#     GENE_DESCRIPTION            = os.path.join(DEPENDENT_FILES_PATH, 'depend/data/gene_description_20140101.txt')
#     MAX_FOLD_CHANGE             = 10  # for number of peaks versus fold-change plot

#     #############################################################################################

    gPar = gPar or get_global_parameters(f)
    condDict = get_conditions(f, gPar)
    # Flattened parameter string doubles as a (currently unused) work dir.
    DIR = pyutil.dict2flat(gPar)
    #     os.system('mkdir -p ' + DIR); os.chdir(DIR)
    if dbg == 1:
        # Debug mode: dump the parsed parameters and stop here.
        d = gPar, condDict
        for dd in d:
            print pyutil.ppJson(dd)

        return d


#     try:
    if 1:

        # Collect results
        os.system('mkdir -p %s' % SUMMARY_DIR)

        # make pipeline files for peak calling
        def getPeak(k):
            # Render the pipeline script for condition k, then run it.
            sname = make_peak_call_script(k, condDict,
                                          PEAK_CALL_PIPELINE_TEMPLATE)
            print('Run %s ...' % (sname))
            res = subprocess.call(['bash', sname])
            #             return '%s_peaks.narrowPeak'%k
            return res

        if reCallPeak:
            # check that every ChIP file is present
            for k in condDict.keys():
                chip_file = condDict[k]['CHIP']
                input_file = condDict[k]['INPUT']
                if not os.path.exists(chip_file):
                    print('%s dose not exist. STOP' % (chip_file))
                    sys.exit()
                if not os.path.exists(input_file):
                    print('%s dose not exist. STOP' % (input_file))
                    sys.exit()
            [getPeak(k) for k in condDict.keys()]

        # Peak callers are assumed to emit <condition>_peaks.narrowPeak.
        npkFS = ['%s_peaks.narrowPeak' % k for k in condDict.keys()]
        peakSummary(npkFS)

        gene_lists = {
        }  # a dictionary of form d  = {'condition1': {'AT1G12345':'2.3', 'AT1G12346':'1.2'} }
Пример #4
0
def fit_BGM(
    C,
    ALI='Test',
    #             DIR = ''
    #             normF = identityNorm,
    stdPer=0,
    rowName=None,
    colName=None,
    nClu=25,
    maxIt=1000,
    algo='DPGMM',
    DIR='.',
    #            algoLst = ['DPGMM'],
    alpha=.1,
    covariance_type='diag',
    fixMean=0,
    reorder=1,
    model_only=0,
    random_state=None,
    dbg=0,
    #             covariance_type = None,
    #             **kwargs
):
    '''
Fit an BayesianGaussianMixture() model from sklearn
'''
    #     if algoLst is None:
    #         algoLst = ['DPGMM','DDGMM','GMM',]
    try:
        DIR, ALI = ALI.rsplit('/', 1)
    except:
        DIR = DIR
    os.system('mkdir -p %s' % (DIR))

    ###### Manage meta attributes of the model ########
    param = {
        'fixMean': fixMean,
        'stdPer': stdPer,
        'nClu': nClu,
        'genre': algo,
        'covarianceType': covariance_type,
        'maxIt': maxIt,
        'randomState': random_state,
    }
    param.update(getattr(C, 'param', {}))

    ####### Convert to numpy arrary ######
    if isinstance(C, pd.DataFrame):
        if ALI == 'Test':
            ALI = getattr(C, 'name', 'Test')

        rowName, colName, C = C.index.values, C.columns, C.values
        pass

    ##### Old routine that filter by STD ###########
    if stdPer > 0:
        assert stdPer < 100, 'Percentile must < 100, Got %d instead' % stdPer
        (MEAN, STD, CV), _ = qc_Avg(C)
        pIdx = STD > np.percentile(STD, stdPer)
        rowName = np.array(rowName)[pIdx]
        C = C[pIdx]
    print '[ALI]=', ALI
    nFeat = C.shape[-1]

    #####====== Defnitions of fitters=========#######

    ###### Arguments shared among fitters ######
    common = {
        'n_components': nClu,
        'verbose': 2,
        'max_iter': maxIt,
        'covariance_type': covariance_type,
        'random_state': random_state,
    }
    if fixMean:
        mean_precision_prior = 1E-128
        mean_prior = [0.] * nFeat
    else:
        mean_precision_prior = None
        mean_prior = None

    ####### List of fitters ######
    mdlLst = {
        'DPGMM':
        skmix.BayesianGaussianMixture(
            weight_concentration_prior_type='dirichlet_process',
            weight_concentration_prior=alpha,
            mean_precision_prior=mean_precision_prior,
            mean_prior=mean_prior,
            **common),
        'GMM':
        skmix.GaussianMixture(**common),
        'DDGMM':
        skmix.BayesianGaussianMixture(
            weight_concentration_prior_type='dirichlet_distribution',
            weight_concentration_prior=alpha,
            mean_precision_prior=mean_precision_prior,
            mean_prior=mean_prior,
            **common),
    }

    ############# Select model by "algo"####
    X = C
    #     if dbg >= 2:
    #         qcplots.qc_Avg(C,silent=0)
    print pyutil.qc_matrix(X)
    mdl = mdlLst.get(algo, None)
    assert mdl is not None, 'Algorithm %s not found ' % algo

    NAME = '%s_%s' % (ALI, pyutil.dict2flat(param))
    print '[MSG] Now Fitting Model:%s' % NAME

    ####### Meta data of the training Data #######
    d = {
        'name': NAME,
        'train_data': X,
        'colName': colName,
        'rowName': rowName,
        'param': param,
    }

    ##### Fitting model and caching the result to specified DIR/NAME ####
    try:
        logFile = open('%s/%s.log' % (DIR, NAME), 'w', 0)
        with pyutil.RedirectStdStreams(logFile):
            mdl.fixMean = fixMean
            mdl.fit(X)
            #             reorderByMSQ(mdl)
            if reorder:
                mdl.reorderByMSQ()
            d.update({'suc': 1, 'model': mdl})
#             logFile.close()
        print "[SUCC] to fit Model:%s" % (NAME, )
        print qcmsg.msgGMM(mdl)
    except Exception as e:
        print "[FAIL] to fit Model:%s due to :'%s'" % (NAME, e)
        d.update({'suc': 0})
    if model_only:
        d['train_data'] = None
        d['rowName'] = None
        d['colName'] = None
    np.save('%s/%s' % (DIR.rstrip('/'), NAME), d)
    d = scount.countMatrix.from_dict(d)
    return d