def name4param(self, keys=None):
    """Build this object's name by flattening its ``param`` dict.

    If ``keys`` is given, only those entries of ``self.param`` are used
    (via ``pyutil.dictFilter``); the flattened string is stored with
    ``self.set_name()`` and the resulting ``self.name`` is returned.
    """
    selected = self.param if keys is None else pyutil.dictFilter(self.param, keys)
    self.set_name(pyutil.dict2flat(selected))
    return self.name
def fit_KMEANS(
        C,
        ALI='Test',
        maxIt=1000,
        nClu=30,
        DIR='.',
        model_only=0,
        random_state=None,
        reorder=0,  # accepted for signature parity with fit_BGM; KMeans result is not reordered
):
    '''Fit a sklearn KMeans model on matrix C and cache the result.

    Parameters
    ----------
    C : pd.DataFrame or array-like; rows are samples, columns features.
    ALI : alias used in the cached file name; may contain a directory
        prefix "subdir/alias" (same convention as fit_BGM). When left as
        'Test', the DataFrame's .name attribute is used if present.
    maxIt, nClu, random_state : forwarded to skclu.KMeans.
    DIR : output directory for the .log and .npy cache files.
    model_only : if truthy, drop train_data/rowName/colName before saving.

    Returns
    -------
    scount.countMatrix built from a dict with keys
    name/train_data/colName/rowName/param/suc[/model].
    '''
    X = C
    algo = 'KMEANS'
    param = {
        'genre': algo,
        'nClu': nClu,
        'maxIt': maxIt,
        'randomState': random_state,
    }
    # Split an optional "subdir/alias" prefix off ALI and make sure the
    # output directory exists -- same convention as fit_BGM.
    try:
        DIR, ALI = ALI.rsplit('/', 1)
    except (ValueError, AttributeError):
        pass  # no '/' in ALI: keep the DIR argument as given
    os.system('mkdir -p %s' % (DIR,))

    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    param.update(getattr(X, 'param', {}))
    # BUG FIX: fetch the alias from the DataFrame *before* unpacking it to
    # .values -- the old order queried an ndarray (no .name attribute) and
    # therefore always fell back to 'Test'. Matches fit_BGM's behaviour.
    if ALI == 'Test':
        ALI = getattr(X, 'name', 'Test')
    X, rowName, colName = X.values, X.index, X.columns

    mdl = skclu.KMeans(n_clusters=nClu, n_init=1, max_iter=maxIt,
                       random_state=random_state)
    NAME = '%s_%s' % (ALI, pyutil.dict2flat(param))
    print('[MSG] Now Fitting Model:%s' % NAME)

    ####### Meta data of the training data #######
    d = {
        'name': NAME,
        'train_data': X,
        'colName': colName,
        'rowName': rowName,
        'param': param,
    }
    ##### Fit the model, redirecting fitter output to DIR/NAME.log #####
    try:
        # 'with' closes the log file even on failure (the old code leaked
        # the handle); buffering=0 keeps the Python-2 unbuffered behaviour.
        with open('%s/%s.log' % (DIR, NAME), 'w', 0) as logFile:
            with pyutil.RedirectStdStreams(logFile):
                mdl.fit(X)
        d.update({'suc': 1, 'model': mdl})
        print("[SUCC] to fit Model:%s" % (NAME,))
    except Exception as e:
        print("[FAIL] to fit Model:%s due to :'%s'" % (NAME, e))
        d.update({'suc': 0})

    if model_only:
        # Drop the (potentially large) training matrix before caching.
        d['train_data'] = None
        d['rowName'] = None
        d['colName'] = None
    np.save('%s/%s' % (DIR, NAME), d)
    d = scount.countMatrix.from_dict(d)
    return d
def main(f, dbg=0, reCallPeak=0, gPar=None):
    """Drive the ChIP summary pipeline described by config file `f`.

    Parameters (as used by the visible code):
    f          -- configuration source handed to get_global_parameters() /
                  get_conditions() (both defined elsewhere in this module).
    dbg        -- if 1, pretty-print the parsed parameters and return them
                  without running anything; also the default echo-only flag
                  of the shellexec() helper defined below.
    reCallPeak -- if truthy, verify every CHIP/INPUT file exists and re-run
                  peak calling for each condition.
    gPar       -- optional pre-parsed global-parameter dict; parsed from
                  `f` when None.
    """
    global shellexec

    def shellexec(cmd, dbg=0):
        # Thin wrapper over subprocess.check_output(shell=True); in debug
        # mode it only echoes the command and returns the string 'dbg'.
        if dbg:
            print cmd
            res = 'dbg'
        else:
            res = subprocess.check_output(cmd, shell=1)
        return res

    # #############################################################################################
    # DEPENDENT_FILES_PATH = '/media/pw_synology3/Software/chip-summary/'  # [path of chip-summary.py]
    # DEFAULT_TARGET_RANGE = '3000'  # [change] a string, not a number
    # SUMMARY_FILE_NAME = 'summary.html'
    # SUMMARY_DIR = 'summary'
    # PEAK_CALL_PIPELINE_TEMPLATE = os.path.join(DEPENDENT_FILES_PATH, 'depend/script/pipeline724-t.sh')
    # PEAK_SELECT_SCRIPT = os.path.join(DEPENDENT_FILES_PATH, 'depend/script/select_peaks.py')
    # GENELOCUS_TO_GENENAME_SCRIPT= os.path.join(DEPENDENT_FILES_PATH, 'depend/script/genelocus2genename.py')
    # #### Slowest part to be refactored???
    # EXTRACT_AGI_CODE_AND_FC = os.path.join(DEPENDENT_FILES_PATH, 'depend/script/extract_AGI_code_and_fold_change.py')
    # GO_ENRICHMENT_SCRIPT = os.path.join(DEPENDENT_FILES_PATH, 'depend/script/fe.sh')  # install goatools (GO enrichment) and edit fe.sh
    # GO_ENRICHMENT_DIFF_SCRIPT = os.path.join(DEPENDENT_FILES_PATH, 'depend/script/goterm-matrix.py')
    # AGI_TO_GENE_NAMES = os.path.join(DEPENDENT_FILES_PATH, 'depend/data/AGI-to-gene-names.txt')
    # ANNOTATION_FILE = os.path.join(DEPENDENT_FILES_PATH, 'depend/data/genesTAIR10.bed')  # for bedmap
    # GENE_DESCRIPTION = os.path.join(DEPENDENT_FILES_PATH, 'depend/data/gene_description_20140101.txt')
    # MAX_FOLD_CHANGE = 10  # for number of peaks versus fold-change plot
    # #############################################################################################

    # Parse the configuration unless a pre-parsed dict was supplied.
    gPar = gPar or get_global_parameters(f)
    condDict = get_conditions(f, gPar)
    # The working-directory name is the flattened global-parameter dict.
    DIR = pyutil.dict2flat(gPar)
    # os.system('mkdir -p ' + DIR); os.chdir(DIR)
    if dbg == 1:
        # Debug mode: show the parsed structures and stop.
        d = gPar, condDict
        for dd in d:
            print pyutil.ppJson(dd)
        return d
    # try:
    if 1:
        # Collect results
        os.system('mkdir -p %s' % SUMMARY_DIR)

        # make pipeline files for peak calling
        def getPeak(k):
            # Write the peak-calling script for condition `k` and run it
            # with bash, returning the subprocess exit code.
            sname = make_peak_call_script(k, condDict, PEAK_CALL_PIPELINE_TEMPLATE)
            print('Run %s ...' % (sname))
            res = subprocess.call(['bash', sname])
            # return '%s_peaks.narrowPeak'%k
            return res

        if reCallPeak:
            # check that every ChIP file is present
            for k in condDict.keys():
                chip_file = condDict[k]['CHIP']
                input_file = condDict[k]['INPUT']
                # NOTE(review): "dose" is a typo for "does" in the two
                # messages below (runtime strings, left untouched here).
                if not os.path.exists(chip_file):
                    print('%s dose not exist. STOP' % (chip_file))
                    sys.exit()
                if not os.path.exists(input_file):
                    print('%s dose not exist. STOP' % (input_file))
                    sys.exit()
            # Run peak calling for every condition (list comp used for its
            # side effect), then summarise the produced narrowPeak files.
            [getPeak(k) for k in condDict.keys()]
            npkFS = ['%s_peaks.narrowPeak' % k for k in condDict.keys()]
            peakSummary(npkFS)

        gene_lists = {
        }  # a dictionary of form d = {'condition1': {'AT1G12345':'2.3', 'AT1G12346':'1.2'} }
def fit_BGM(
        C,
        ALI='Test',
        # normF = identityNorm,
        stdPer=0,
        rowName=None,
        colName=None,
        nClu=25,
        maxIt=1000,
        algo='DPGMM',
        DIR='.',
        alpha=.1,
        covariance_type='diag',
        fixMean=0,
        reorder=1,
        model_only=0,
        random_state=None,
        dbg=0,
):
    '''Fit a BayesianGaussianMixture() model from sklearn.

    Parameters
    ----------
    C : pd.DataFrame or ndarray; rows are samples, columns features.
    ALI : alias used in the cached file name; may contain a directory
        prefix "subdir/alias". When left as 'Test', the DataFrame's
        .name attribute is used if present.
    stdPer : if > 0, keep only rows whose STD exceeds this percentile
        (legacy filter; requires qc_Avg).
    rowName, colName : row/column labels; overwritten from C when C is a
        DataFrame.
    nClu, maxIt, covariance_type, random_state : forwarded to the fitter.
    algo : one of 'DPGMM', 'DDGMM', 'GMM' selecting the fitter below.
    alpha : weight_concentration_prior for the Bayesian fitters.
    fixMean : if truthy, effectively pin the component means at zero via
        a tiny mean_precision_prior.
    reorder : if truthy, call mdl.reorderByMSQ() after fitting.
    model_only : if truthy, drop train_data/rowName/colName before saving.

    Returns
    -------
    scount.countMatrix built from a dict with keys
    name/train_data/colName/rowName/param/suc[/model].
    '''
    # Split an optional "subdir/alias" prefix off ALI; otherwise keep the
    # DIR argument. (Was a bare `except:` -- narrowed to the exceptions
    # rsplit/unpacking can actually raise.)
    try:
        DIR, ALI = ALI.rsplit('/', 1)
    except (ValueError, AttributeError):
        pass
    os.system('mkdir -p %s' % (DIR,))

    ###### Manage meta attributes of the model ########
    param = {
        'fixMean': fixMean,
        'stdPer': stdPer,
        'nClu': nClu,
        'genre': algo,
        'covarianceType': covariance_type,
        'maxIt': maxIt,
        'randomState': random_state,
    }
    param.update(getattr(C, 'param', {}))

    ####### Convert to numpy array ######
    if isinstance(C, pd.DataFrame):
        if ALI == 'Test':
            ALI = getattr(C, 'name', 'Test')
        rowName, colName, C = C.index.values, C.columns, C.values

    ##### Old routine that filters rows by STD percentile ###########
    if stdPer > 0:
        assert stdPer < 100, 'Percentile must < 100, Got %d instead' % stdPer
        (MEAN, STD, CV), _ = qc_Avg(C)
        pIdx = STD > np.percentile(STD, stdPer)
        rowName = np.array(rowName)[pIdx]
        C = C[pIdx]

    print('[ALI]= %s' % (ALI,))
    nFeat = C.shape[-1]

    #####====== Definitions of fitters =========#######
    ###### Arguments shared among fitters ######
    common = {
        'n_components': nClu,
        'verbose': 2,
        'max_iter': maxIt,
        'covariance_type': covariance_type,
        'random_state': random_state,
    }
    if fixMean:
        # A near-zero precision prior pins the means at the prior (zeros).
        mean_precision_prior = 1E-128
        mean_prior = [0.] * nFeat
    else:
        mean_precision_prior = None
        mean_prior = None

    ####### Fitter factories: only the selected estimator is built ######
    # (the old code instantiated all three estimators up front)
    mdlFactory = {
        'DPGMM': lambda: skmix.BayesianGaussianMixture(
            weight_concentration_prior_type='dirichlet_process',
            weight_concentration_prior=alpha,
            mean_precision_prior=mean_precision_prior,
            mean_prior=mean_prior,
            **common),
        'GMM': lambda: skmix.GaussianMixture(**common),
        'DDGMM': lambda: skmix.BayesianGaussianMixture(
            weight_concentration_prior_type='dirichlet_distribution',
            weight_concentration_prior=alpha,
            mean_precision_prior=mean_precision_prior,
            mean_prior=mean_prior,
            **common),
    }

    ############# Select model by "algo" ####
    X = C
    # if dbg >= 2:
    #     qcplots.qc_Avg(C,silent=0)
    print(pyutil.qc_matrix(X))
    factory = mdlFactory.get(algo, None)
    assert factory is not None, 'Algorithm %s not found ' % algo
    mdl = factory()

    NAME = '%s_%s' % (ALI, pyutil.dict2flat(param))
    print('[MSG] Now Fitting Model:%s' % NAME)

    ####### Meta data of the training data #######
    d = {
        'name': NAME,
        'train_data': X,
        'colName': colName,
        'rowName': rowName,
        'param': param,
    }
    ##### Fit the model and cache the result to DIR/NAME ####
    try:
        # 'with' closes the log file even on failure (the old code leaked
        # the handle; note its commented-out logFile.close());
        # buffering=0 keeps the Python-2 unbuffered behaviour.
        with open('%s/%s.log' % (DIR, NAME), 'w', 0) as logFile:
            with pyutil.RedirectStdStreams(logFile):
                mdl.fixMean = fixMean
                mdl.fit(X)
                if reorder:
                    mdl.reorderByMSQ()
        d.update({'suc': 1, 'model': mdl})
        print("[SUCC] to fit Model:%s" % (NAME,))
        print(qcmsg.msgGMM(mdl))
    except Exception as e:
        print("[FAIL] to fit Model:%s due to :'%s'" % (NAME, e))
        d.update({'suc': 0})

    if model_only:
        # Drop the (potentially large) training matrix before caching.
        d['train_data'] = None
        d['rowName'] = None
        d['colName'] = None
    np.save('%s/%s' % (DIR.rstrip('/'), NAME), d)
    d = scount.countMatrix.from_dict(d)
    return d