def make_cls_input(data_combo, y_coln_cls, middle_percentile_skipped):
    """
    Make input for classifier

    The target column is converted to classes with ``y2classes`` and the raw
    target column is dropped. Predictors are passed to ``X_cols2binary`` and
    the whole table to ``rescalecols``. Rows kept by ``denan`` form the
    training table; all remaining rows form the test table.

    :param data_combo: pandas dataframe containing training data
    :param y_coln_cls: name of column containing target values
    :param middle_percentile_skipped: skip middle percentile at boundary while creating classes [0,..,1]
    :returns: tuple ``(data_combo, data_ml, data_cls_train, data_cls_tests)``
    """
    data_ml = y2classes(data_combo, y_coln_cls,
                        middle_percentile_skipped=middle_percentile_skipped)
    data_ml = data_ml.drop(y_coln_cls, axis=1)
    # From here on the target is the class label column
    # (presumably added by y2classes -- confirm).
    y_coln_cls = "classes"
    # NOTE(review): row labels are captured *before* set_index(); this only
    # matches the later .loc lookup if the table is already indexed by mutids.
    data_ml_mutids = list(data_ml.index)
    data_combo = set_index(data_combo, "mutids")
    data_ml = set_index(data_ml, "mutids")

    # Keep the class labels aside so that only predictors are binarised.
    y = data_ml.loc[:, y_coln_cls]
    data_ml = X_cols2binary(data_ml.drop(y_coln_cls, axis=1))
    data_ml.loc[:, y_coln_cls] = y
    data_ml = rescalecols(data_ml)
    data_cls_train = denan(data_ml, axis='both', condi='all any')

    # Set membership keeps the split linear in the number of rows.
    train_mutids = set(data_cls_train.index.values)
    data_cls_tests_mutids = [mutid for mutid in data_ml_mutids
                             if mutid not in train_mutids]
    data_cls_tests = data_ml.loc[data_cls_tests_mutids, :]
    return data_combo, data_ml, data_cls_train, data_cls_tests
def concat_feats(data_feats_all, data_feats, col_index):
    """
    Concatenates tables containing individual mutation wise and position wise features.

    Both tables are passed through ``set_index`` with ``col_index`` and the
    position wise table is then joined onto the mutation wise one.

    :param data_feats_all: mutation wise features
    :param data_feats: position wise features
    :param col_index: column handed to ``set_index`` for both tables before joining
    :returns: the joined table
    """
    mutation_wise = set_index(data_feats_all, col_index)
    position_wise = set_index(data_feats, col_index)
    return mutation_wise.join(position_wise)
def plot_submap(info, data_fit_fhs=None, plot_type="submap",
                feats_tup=["Mutant amino acid's Solvent Accessible Surface Area",
                           'Solvent Accessible Surface Area', ],
                feats_labels=['SASA', 'SASA'],
                ):
    """
    Call ``make_plot_cluster_sub_matrix`` once per fitness table.

    For every file in ``data_fit_fhs`` the target path
    ``<prj_dh>/plots/aas/<file name>.<plot_type>.pdf`` is built; the plotting
    helper is only called when that path does not exist yet.

    :param info: object exposing ``prj_dh`` (project directory)
    :param data_fit_fhs: paths of fitness tables; if None they are collected
        with ``get_fhs`` from ``<prj_dh>/data_fit/aas/``
    :param plot_type: tag inserted in the name of the plot file
    :param feats_tup: not used by this function
    :param feats_labels: forwarded to ``make_plot_cluster_sub_matrix``
    """
    type_form = 'aas'
    feats_fh = '%s/data_feats/aas/data_feats_all' % info.prj_dh
    data_feats_all = set_index(pd.read_csv(feats_fh).set_index('mutids'),
                               col_index='mutids')
    if data_fit_fhs is None:
        data_fit_fhs = get_fhs('%s/data_fit/aas/' % info.prj_dh,
                               include='_WRT_', exclude='_inferred')
    for data_fit_fh in data_fit_fhs:
        data_fit_fn = basename(data_fit_fh)
        # NOTE(review): the fitness table is joined on its default index while
        # the features are indexed by mutids -- confirm this is intended.
        data_plot = pd.read_csv(data_fit_fh).join(data_feats_all)
        plot_fh = "%s/plots/%s/%s.%s.pdf" % (info.prj_dh, type_form,
                                             data_fit_fn, plot_type)
        if exists(plot_fh):
            continue
        # Both annotation tracks are copies of FiA with gaps filled by 0.
        for track in ('FiAcol', 'FiArow'):
            data_plot.loc[:, track] = data_plot.loc[:, 'FiA']
            data_plot.loc[:, track] = data_plot.loc[:, track].fillna(0)
        make_plot_cluster_sub_matrix(
            data_plot, 'FiA', [[0, 1]],
            feats=['FiAcol', 'FiArow'],
            feats_labels=feats_labels,
            row_cluster=False,
            col_cluster=False,
            plot_fh=plot_fh,
            test=True,
        )
def rescale_fitnessbysynonymous(data_fit, col_fit="FCA_norm", col_fit_rescaled="FiA", syn2nan=True):
    """
    Rescale fold changes by the fold change of synonymous mutations at that position

    For every position (``refrefi``) the value of ``col_fit`` of the
    synonymous mutation (``mut == ref``) is subtracted from ``col_fit`` of all
    mutations at that position. Positions without a synonymous mutation get NaN.

    :param data_fit: pandas table with fold change values; needs the columns
        ``mut``, ``ref`` and ``col_fit``
    :param col_fit: name of the column holding the values to rescale
    :param col_fit_rescaled: name of the column receiving the rescaled values
        (overwritten if it already exists)
    :param syn2nan: if True the rescaled value of synonymous mutations is set to NaN
    :returns: ``data_fit`` with ``col_fit_rescaled`` added/updated; if no
        synonymous mutation has a value the column is all NaN
    :raises TypeError: if a position has more than one synonymous mutation
    """
    is_syn = data_fit.loc[:, 'mut'] == data_fit.loc[:, 'ref']
    if not data_fit.loc[is_syn, col_fit].notnull().any():
        logging.info('no synonymous mutations available')
        data_fit.loc[:, col_fit_rescaled] = np.nan
        return data_fit

    data_fit = set_index(data_fit, 'mutids')
    if "refrefi" not in data_fit:
        data_fit.loc[:, 'refrefi'] = mutids_converter(
            data_fit.reset_index().loc[:, 'mutids'], 'refrefi', 'aas')

    # Recompute the mask: set_index() may have changed the row labels.
    is_syn = data_fit.loc[:, 'mut'] == data_fit.loc[:, 'ref']
    FiS_by_posi = data_fit.loc[is_syn, :].set_index('refrefi').loc[:, col_fit]
    if FiS_by_posi.index.duplicated().any():
        # TypeError is what float(<multi-element Series>) raised previously.
        raise TypeError('more than one synonymous mutation at a position; '
                        'can not rescale %s' % col_fit)
    # Positions missing from FiS_by_posi map to NaN, hence rescale to NaN.
    FiS = data_fit.loc[:, 'refrefi'].map(FiS_by_posi)
    # Written straight to the target column: no temporary column is needed
    # even when col_fit_rescaled already exists.
    data_fit.loc[:, col_fit_rescaled] = data_fit.loc[:, col_fit] - FiS
    if syn2nan:
        data_fit.loc[is_syn, col_fit_rescaled] = np.nan
    return data_fit
def make_GLM_norm(data_lbl_ref_fn, data_lbl_sel_fn, data_fit, info):
    """
    Wrapper for DESeq2 mediated GLM normalization

    Builds the DESeq2 annotation and count tables, runs ``deseq2.R`` through
    ``info.rscript_fh`` unless the result file already exists, and joins the
    DESeq2 results onto ``data_fit``.

    :param data_lbl_ref_fn: reference condition, forwarded to ``make_deseq2_annot``/``make_deseq2_count``
    :param data_lbl_sel_fn: selected condition, forwarded to ``make_deseq2_annot``/``make_deseq2_count``
    :param data_fit: pandas table with fold change values; must contain the
        columns ``pval``, ``stat`` and ``padj`` (they are replaced)
    :param info: object exposing ``prj_dh`` and ``rscript_fh``
    :returns: ``data_fit`` joined with the DESeq2 results
    :raises IOError, KeyError, ValueError: if the DESeq2 result file can not
        be read (re-raised after logging)
    """
    data_lbl_col = 'NiA_norm'
    data_deseq2_annot, data_deseq2_annot_fh = make_deseq2_annot(
        data_lbl_ref_fn, data_lbl_sel_fn, data_lbl_col, info.prj_dh)
    data_deseq2_count, data_deseq2_count_fh = make_deseq2_count(
        data_lbl_ref_fn, data_lbl_sel_fn, data_deseq2_annot, data_lbl_col,
        info.prj_dh)
    data_deseq2_count = set_index(data_deseq2_count, 'mutids')
    if len(data_deseq2_count.columns) == 2:
        logging.error('transform_type can not be GLM: no replicates found')
        sys.exit()

    log_fh = "%s.log" % data_deseq2_annot_fh
    data_deseq2_res_fh = "%s.deseq2_res.csv" % data_deseq2_annot_fh
    if not exists(data_deseq2_res_fh):
        deseq_fh = "%s/deseq2.R" % (abspath(dirname(__file__)))
        with open(log_fh, 'a') as log_f:
            # NOTE(review): the command is a shell string; paths containing
            # spaces or shell metacharacters will break it.
            com = '%s %s %s %s 2' % (info.rscript_fh, deseq_fh,
                                     data_deseq2_count_fh,
                                     data_deseq2_annot_fh)
            subprocess.call(com, shell=True, stdout=log_f,
                            stderr=subprocess.STDOUT)
    try:
        data_deseq2_res = pd.read_csv(data_deseq2_res_fh).set_index('Unnamed: 0')
    except (IOError, KeyError, ValueError):
        # Missing, empty or malformed result file: report where to look and
        # re-raise instead of failing later on an unbound variable.
        logging.error('check deseq2 log for more info: %s' % basename(log_fh))
        logging.error("check if deseq2 is installed.")
        raise
    data_deseq2_res.index.name = 'mutids'

    # DESeq2 columns: baseMean log2FoldChange lfcSE stat pvalue padj
    test = 'Waldtest'
    multitest = 'fdr_bh'
    col_test_pval = "pval %s" % test
    col_test_stat = "stat %s" % test
    col_multitest_pval = "padj %s %s" % (test, multitest)
    data_deseq2_res = data_deseq2_res.rename(columns={
        'pvalue': col_test_pval,
        'stat': col_test_stat,
        'padj': col_multitest_pval,
    })
    # set wald as default stat
    data_deseq2_res.loc[:, 'pval'] = data_deseq2_res.loc[:, col_test_pval]
    data_deseq2_res.loc[:, 'stat'] = data_deseq2_res.loc[:, col_test_stat]
    data_deseq2_res.loc[:, 'padj'] = data_deseq2_res.loc[:, col_multitest_pval]
    data_deseq2_res.loc[:, 'FCA_norm'] = data_deseq2_res.loc[:, 'log2FoldChange']

    data_fit = data_fit.drop(['pval', 'stat', 'padj'], axis=1)
    data_fit = data_fit.join(data_deseq2_res)
    return data_fit
def class_comparison(dA, dB):
    """
    Classify the difference in fitness (``FiA`` of ``dA`` minus ``FiA`` of
    ``dB``) of each mutation as "positive" or "negative".

    The cutoff comes from ``data_fit2cutoffs`` applied to the joined
    replicate tables; differences above it are "positive", differences below
    its negative are "negative". Other rows are not assigned a class here.

    :param dA: table of the test sample, with ``mutids`` and ``FiA``
    :param dB: table of the control sample, with ``mutids`` and ``FiA``
    :returns: the ``class_comparison`` column of the joined replicate table
    """
    dA = set_index(dA, 'mutids')
    dB = set_index(dB, 'mutids')
    reps_test = get_repli_FiA(dA)
    reps_ctrl = get_repli_FiA(dB)
    dc = reps_test.join(reps_ctrl, lsuffix='_test', rsuffix='_ctrl')
    up = data_fit2cutoffs(dc, sA='_reps_test', sB='_reps_ctrl', N=False)
    dw = -1 * up

    diff = dA.loc[:, 'FiA'] - dB.loc[:, 'FiA']
    diff.index.name = 'mutids'
    diff = diff.reset_index()
    is_up = diff.loc[:, 'FiA'] > up
    is_dw = diff.loc[:, 'FiA'] < dw
    for label, mask in (("positive", is_up), ("negative", is_dw)):
        mutids = diff.loc[mask, 'mutids'].tolist()
        dc.loc[mutids, 'class_comparison'] = label
    return dc.loc[:, 'class_comparison']
def data_comparison2scatter_mutilayered(data, data_label, color_dots=None,
                                        mutids_heads=None, mutids_tails=None,
                                        col_filter=None,
                                        note_text=None,
                                        col_pvals=None,
                                        repel=0.045,
                                        figsize=None,
                                        plot_fh=None):
    """
    Wrapper to plot multi layered scatter plot

    Copies ``Fi_test``/``Fi_ctrl`` into columns named after the two labels
    parsed from ``data_label``, log10-transforms ``col_pvals`` (if given),
    drops rows with missing values and calls ``plot_scatter_mutilayered``.

    :param data: pandas dataframe with ``mut``, ``ref``, ``Fi_test`` and ``Fi_ctrl``
    :param data_label: label of the data, split on ``' versus '``
    :param color_dots: forwarded to ``plot_scatter_mutilayered``
    :param mutids_heads: mutids forwarded as ``mutids_heads`` (default: empty list)
    :param mutids_tails: mutids forwarded as ``mutids_tails`` (default: empty list)
    :param col_filter: name of a boolean column; currently has no effect on the plot
    :param note_text: if given, appended in brackets to both labels
    :param col_pvals: name of the column with p-values; transformed to log10
    :param repel: forwarded to ``plot_scatter_mutilayered``
    :param figsize: figure size (default: ``[15, 5]``)
    :param plot_fh: forwarded to ``plot_scatter_mutilayered``
    """
    from dms2dfe.lib.io_strs import splitlabel
    # Fresh containers per call: the defaults are handed to the plotting
    # helper, so shared module-level lists must not be used.
    mutids_heads = [] if mutids_heads is None else mutids_heads
    mutids_tails = [] if mutids_tails is None else mutids_tails
    figsize = [15, 5] if figsize is None else figsize

    data = set_index(data, 'mutids')
    labels = splitlabel(data_label, splitby=' versus ', ctrl='$37^{0}$C')
    if note_text is not None:
        labels = ["%s (%s)" % (l, note_text) for l in labels]
    data.loc[:, labels[0]] = data.loc[:, 'Fi_test']
    data.loc[:, labels[1]] = data.loc[:, 'Fi_ctrl']
    if col_pvals is not None:
        data.loc[:, col_pvals] = np.log10(data.loc[:, col_pvals])
    if not data.index.name == 'mutids':
        data.index.name = 'mutids'
    if col_filter is not None:
        # NOTE(review): the selection below is discarded, so col_filter does
        # not filter anything; intended behaviour to be confirmed.
        data.loc[data.loc[:, col_filter], labels]
    cols = ['mut', 'ref'] + labels
    if col_pvals is not None:
        cols = cols + [col_pvals]
    data = denanrows(data.loc[:, cols])
    # NOTE(review): the p-values were log10-transformed above but the
    # threshold is passed as 0.05 (the original computed an unused
    # np.log10(0.01)); confirm which scale the plotting helper expects.
    plot_scatter_mutilayered(data, labels[1], labels[0],
                             plot_fh=plot_fh,
                             color_dots=color_dots,
                             mutids_heads=mutids_heads,
                             mutids_tails=mutids_tails,
                             color_heads='b', color_tails='b',
                             col_z_mutations=col_pvals,
                             zcol_threshold=0.05,
                             repel=repel,
                             figsize=figsize,
                             )
def make_cls_input(data_combo, y_coln_cls, middle_percentile_skipped):
    """
    Make input for classifier

    The target column is converted to classes with ``y2classes`` and the raw
    target column is dropped. Predictors are passed to ``X_cols2binary`` and
    the whole table to ``rescalecols``. Rows kept by ``denan`` form the
    training table; all remaining rows form the test table.

    :param data_combo: pandas dataframe containing training data
    :param y_coln_cls: name of column containing target values
    :param middle_percentile_skipped: skip middle percentile at boundary while creating classes [0,..,1]
    :returns: tuple ``(data_combo, data_ml, data_cls_train, data_cls_tests)``
    """
    data_ml = y2classes(data_combo, y_coln_cls,
                        middle_percentile_skipped=middle_percentile_skipped)
    data_ml = data_ml.drop(y_coln_cls, axis=1)
    # From here on the target is the class label column
    # (presumably added by y2classes -- confirm).
    y_coln_cls = "classes"
    # NOTE(review): row labels are captured *before* set_index(); this only
    # matches the later .loc lookup if the table is already indexed by mutids.
    data_ml_mutids = list(data_ml.index)
    data_combo = set_index(data_combo, "mutids")
    data_ml = set_index(data_ml, "mutids")

    # Keep the class labels aside so that only predictors are binarised.
    y = data_ml.loc[:, y_coln_cls]
    data_ml = X_cols2binary(data_ml.drop(y_coln_cls, axis=1))
    data_ml.loc[:, y_coln_cls] = y
    data_ml = rescalecols(data_ml)
    data_cls_train = denan(data_ml, axis='both', condi='all any')

    # Set membership keeps the split linear in the number of rows.
    train_mutids = set(data_cls_train.index.values)
    data_cls_tests_mutids = [mutid for mutid in data_ml_mutids
                             if mutid not in train_mutids]
    data_cls_tests = data_ml.loc[data_cls_tests_mutids, :]
    return data_combo, data_ml, data_cls_train, data_cls_tests
def get_data_lbl_reps(data_lbl_fn, data_lbl_type, repli, info, data_fit=None,
                      data_lbl_col='NiA_tran', type_form='aas', col_sep='.'):
    """
    Gets the replicates for a given filename of data_lbl

    If ``data_lbl_fn`` is a row label of ``repli``, every non-null entry of
    that row is treated as a replicate file; otherwise ``data_lbl_fn`` itself
    is the only file. For each existing file the column ``data_lbl_col`` is
    copied into ``data_fit`` under the name
    ``<file><col_sep><data_lbl_col><col_sep><data_lbl_type>``.

    :param data_lbl_fn: filename of data_lbl
    :param data_lbl_type: tag used as the last part of the new column names
    :param repli: table of replicates, indexed by data_lbl filename
    :param info: object exposing ``prj_dh`` (project directory)
    :param data_fit: table to extend; if None it is started from the
        ``ref``, ``refi`` and ``mut`` columns of the first file read
    :param data_lbl_col: column of data_lbl to copy
    :param type_form: sub-directory of ``data_lbl`` to read from
    :param col_sep: separator used in the new column names
    :returns: ``data_fit``, or None when no replicates are listed or the
        single file does not exist
    """
    def _add_counts(data_fit, data_lbl, rep):
        # Copy one replicate's counts into data_fit (created on first use).
        data_fit_col = "%s%s%s%s%s" % (rep, col_sep, data_lbl_col, col_sep,
                                       data_lbl_type)
        src_col = "%s" % (data_lbl_col)
        if data_fit is None:
            data_fit = data_lbl.loc[:, ['ref', 'refi', 'mut']].copy()
        data_fit.loc[:, data_fit_col] = data_lbl.loc[:, src_col]
        return data_fit

    data_lbl_dh = "%s/data_lbl/%s" % (info.prj_dh, type_form)

    if data_lbl_fn not in repli.index:
        # No replicates configured: the file name itself is the sample.
        data_lbl_fh = "%s/%s" % (data_lbl_dh, data_lbl_fn)
        if not exists(data_lbl_fh):
            logging.warning('does not exists: %s' % data_lbl_fn)
            return None
        data_lbl = pd.read_csv(data_lbl_fh).set_index('mutids')
        return _add_counts(data_fit, data_lbl, data_lbl_fn)

    reps = repli.loc[data_lbl_fn, :].dropna()
    for rep in reps:
        data_lbl_fh = "%s/%s" % (data_lbl_dh, rep)
        if not exists(data_lbl_fh):
            logging.warning('%s does not exist' % basename(data_lbl_fh))
            continue
        data_lbl = set_index(pd.read_csv(data_lbl_fh), 'mutids')
        data_fit = _add_counts(data_fit, data_lbl, rep)
    if len(reps) == 0:
        logging.warning("no replicates found in cfg: %s" % data_lbl_fn)
        return None
    return data_fit
def make_dXy(dXy, ycol, unique_quantile=0.25, index="mutids", if_rescalecols=True):
    """
    Create a pandas table with target and predictor data

    Columns reported by ``get_cols_del`` are dropped. Predictors whose number
    of unique values is not above the ``unique_quantile`` quantile (taken
    over all predictors) are dropped as well, followed by all-NaN columns and
    rows containing any NaN.

    :param dXy: pandas dataframe with target(y) and predictor(X) data
    :param ycol: column name of target values
    :param unique_quantile: quantile of the per-column unique-value counts
        used as the lower cutoff for keeping a predictor
    :param index: column handed to ``set_index``
    :param if_rescalecols: if True the predictors are passed through ``rescalecols``
    :returns: tuple ``(dXy, Xcols, ycol)``; ``Xcols`` is a list when
        ``if_rescalecols`` is True and a pandas Index otherwise, and only
        names predictors present in the returned ``dXy``
    """
    dXy = set_index(dXy, index)
    dXy = dXy.drop(get_cols_del(dXy), axis=1)
    Xcols = [c for c in dXy.columns.tolist() if c != ycol]
    Xunique = pd.DataFrame({'unique': [len(np.unique(dXy[c])) for c in Xcols]},
                           index=[c for c in Xcols])
    Xcols = Xunique.index[Xunique['unique'] > Xunique['unique'].quantile(unique_quantile)]
    dXy = dXy.loc[:, Xcols.tolist() + [ycol]]
    dXy = dXy.dropna(axis=1, how='all').dropna(axis=0, how='any')
    if if_rescalecols:
        Xcols = [c for c in dXy.columns.tolist() if c != ycol]
        dXy.loc[:, Xcols] = rescalecols(dXy.loc[:, Xcols])
    else:
        # dropna(axis=1) above may have removed predictors; do not report them.
        Xcols = Xcols[Xcols.isin(dXy.columns)]
    return dXy, Xcols, ycol
def get_data_lbl_reps(data_lbl_fn, data_lbl_type, repli, info, data_fit=None,
                      data_lbl_col='NiA_tran', type_form='aas', col_sep='.'):
    """
    Gets the replicates for a given filename of data_lbl

    If ``data_lbl_fn`` is a row label of ``repli``, every non-null entry of
    that row is treated as a replicate file; otherwise ``data_lbl_fn`` itself
    is the only file. For each existing file the column ``data_lbl_col`` is
    copied into ``data_fit`` under the name
    ``<file><col_sep><data_lbl_col><col_sep><data_lbl_type>``.

    :param data_lbl_fn: filename of data_lbl
    :param data_lbl_type: tag used as the last part of the new column names
    :param repli: table of replicates, indexed by data_lbl filename
    :param info: object exposing ``prj_dh`` (project directory)
    :param data_fit: table to extend; if None it is started from the
        ``ref``, ``refi`` and ``mut`` columns of the first file read
    :param data_lbl_col: column of data_lbl to copy
    :param type_form: sub-directory of ``data_lbl`` to read from
    :param col_sep: separator used in the new column names
    :returns: ``data_fit``, or None when no replicates are listed or the
        single file does not exist
    """
    def _add_counts(data_fit, data_lbl, rep):
        # Copy one replicate's counts into data_fit (created on first use).
        data_fit_col = "%s%s%s%s%s" % (rep, col_sep, data_lbl_col, col_sep,
                                       data_lbl_type)
        src_col = "%s" % (data_lbl_col)
        if data_fit is None:
            data_fit = data_lbl.loc[:, ['ref', 'refi', 'mut']].copy()
        data_fit.loc[:, data_fit_col] = data_lbl.loc[:, src_col]
        return data_fit

    data_lbl_dh = "%s/data_lbl/%s" % (info.prj_dh, type_form)

    if data_lbl_fn not in repli.index:
        # No replicates configured: the file name itself is the sample.
        data_lbl_fh = "%s/%s" % (data_lbl_dh, data_lbl_fn)
        if not exists(data_lbl_fh):
            logging.warning('does not exists: %s' % data_lbl_fn)
            return None
        data_lbl = pd.read_csv(data_lbl_fh).set_index('mutids')
        return _add_counts(data_fit, data_lbl, data_lbl_fn)

    reps = repli.loc[data_lbl_fn, :].dropna()
    for rep in reps:
        data_lbl_fh = "%s/%s" % (data_lbl_dh, rep)
        if not exists(data_lbl_fh):
            logging.warning('%s does not exist' % basename(data_lbl_fh))
            continue
        data_lbl = set_index(pd.read_csv(data_lbl_fh), 'mutids')
        data_fit = _add_counts(data_fit, data_lbl, rep)
    if len(reps) == 0:
        logging.warning("no replicates found in cfg: %s" % data_lbl_fn)
        return None
    return data_fit
def rescale_fitnessbysynonymous(data_fit,
                                col_fit="FCA_norm",
                                col_fit_rescaled="FiA",
                                syn2nan=True):
    """
    Rescale fold changes by the fold change of synonymous mutations at that position

    For every position (``refrefi``) the value of ``col_fit`` of the
    synonymous mutation (``mut == ref``) is subtracted from ``col_fit`` of all
    mutations at that position. Positions without a synonymous mutation get NaN.

    :param data_fit: pandas table with fold change values; needs the columns
        ``mut``, ``ref`` and ``col_fit``
    :param col_fit: name of the column holding the values to rescale
    :param col_fit_rescaled: name of the column receiving the rescaled values
        (overwritten if it already exists)
    :param syn2nan: if True the rescaled value of synonymous mutations is set to NaN
    :returns: ``data_fit`` with ``col_fit_rescaled`` added/updated; if no
        synonymous mutation has a value the column is all NaN
    :raises TypeError: if a position has more than one synonymous mutation
    """
    is_syn = data_fit.loc[:, 'mut'] == data_fit.loc[:, 'ref']
    if not data_fit.loc[is_syn, col_fit].notnull().any():
        logging.info('no synonymous mutations available')
        data_fit.loc[:, col_fit_rescaled] = np.nan
        return data_fit

    data_fit = set_index(data_fit, 'mutids')
    if "refrefi" not in data_fit:
        data_fit.loc[:, 'refrefi'] = mutids_converter(
            data_fit.reset_index().loc[:, 'mutids'], 'refrefi', 'aas')

    # Recompute the mask: set_index() may have changed the row labels.
    is_syn = data_fit.loc[:, 'mut'] == data_fit.loc[:, 'ref']
    FiS_by_posi = data_fit.loc[is_syn, :].set_index('refrefi').loc[:, col_fit]
    if FiS_by_posi.index.duplicated().any():
        # TypeError is what float(<multi-element Series>) raised previously.
        raise TypeError('more than one synonymous mutation at a position; '
                        'can not rescale %s' % col_fit)
    # Positions missing from FiS_by_posi map to NaN, hence rescale to NaN.
    FiS = data_fit.loc[:, 'refrefi'].map(FiS_by_posi)
    # Written straight to the target column: no temporary column is needed
    # even when col_fit_rescaled already exists.
    data_fit.loc[:, col_fit_rescaled] = data_fit.loc[:, col_fit] - FiS
    if syn2nan:
        data_fit.loc[is_syn, col_fit_rescaled] = np.nan
    return data_fit
def plot_submap(
        info,
        data_fit_fhs=None,
        plot_type="submap",
        feats_tup=[
            "Mutant amino acid's Solvent Accessible Surface Area",
            'Solvent Accessible Surface Area',
        ],
        feats_labels=['SASA', 'SASA'],
):
    """
    Call ``make_plot_cluster_sub_matrix`` once per fitness table.

    For every file in ``data_fit_fhs`` the target path
    ``<prj_dh>/plots/aas/<file name>.<plot_type>.pdf`` is built; the plotting
    helper is only called when that path does not exist yet.

    :param info: object exposing ``prj_dh`` (project directory)
    :param data_fit_fhs: paths of fitness tables; if None they are collected
        with ``get_fhs`` from ``<prj_dh>/data_fit/aas/``
    :param plot_type: tag inserted in the name of the plot file
    :param feats_tup: not used by this function
    :param feats_labels: forwarded to ``make_plot_cluster_sub_matrix``
    """
    type_form = 'aas'
    feats_fh = '%s/data_feats/aas/data_feats_all' % info.prj_dh
    data_feats_all = set_index(pd.read_csv(feats_fh).set_index('mutids'),
                               col_index='mutids')
    if data_fit_fhs is None:
        data_fit_fhs = get_fhs('%s/data_fit/aas/' % info.prj_dh,
                               include='_WRT_', exclude='_inferred')
    for data_fit_fh in data_fit_fhs:
        data_fit_fn = basename(data_fit_fh)
        # NOTE(review): the fitness table is joined on its default index while
        # the features are indexed by mutids -- confirm this is intended.
        data_plot = pd.read_csv(data_fit_fh).join(data_feats_all)
        plot_fh = "%s/plots/%s/%s.%s.pdf" % (info.prj_dh, type_form,
                                             data_fit_fn, plot_type)
        if exists(plot_fh):
            continue
        # Both annotation tracks are copies of FiA with gaps filled by 0.
        for track in ('FiAcol', 'FiArow'):
            data_plot.loc[:, track] = data_plot.loc[:, 'FiA']
            data_plot.loc[:, track] = data_plot.loc[:, track].fillna(0)
        make_plot_cluster_sub_matrix(
            data_plot, 'FiA', [[0, 1]],
            feats=['FiAcol', 'FiArow'],
            feats_labels=feats_labels,
            row_cluster=False,
            col_cluster=False,
            plot_fh=plot_fh,
            test=True,
        )
def make_GLM_norm(data_lbl_ref_fn, data_lbl_sel_fn, data_fit, info):
    """
    Wrapper for DESeq2 mediated GLM normalization

    Builds the DESeq2 annotation and count tables, runs ``deseq2.R`` through
    ``info.rscript_fh`` unless the result file already exists, and joins the
    DESeq2 results onto ``data_fit``.

    :param data_lbl_ref_fn: reference condition, forwarded to ``make_deseq2_annot``/``make_deseq2_count``
    :param data_lbl_sel_fn: selected condition, forwarded to ``make_deseq2_annot``/``make_deseq2_count``
    :param data_fit: pandas table with fold change values; must contain the
        columns ``pval``, ``stat`` and ``padj`` (they are replaced)
    :param info: object exposing ``prj_dh`` and ``rscript_fh``
    :returns: ``data_fit`` joined with the DESeq2 results
    :raises IOError, KeyError, ValueError: if the DESeq2 result file can not
        be read (re-raised after logging)
    """
    data_lbl_col = 'NiA_norm'
    data_deseq2_annot, data_deseq2_annot_fh = make_deseq2_annot(
        data_lbl_ref_fn, data_lbl_sel_fn, data_lbl_col, info.prj_dh)
    data_deseq2_count, data_deseq2_count_fh = make_deseq2_count(
        data_lbl_ref_fn, data_lbl_sel_fn, data_deseq2_annot, data_lbl_col,
        info.prj_dh)
    data_deseq2_count = set_index(data_deseq2_count, 'mutids')
    if len(data_deseq2_count.columns) == 2:
        logging.error('transform_type can not be GLM: no replicates found')
        sys.exit()

    log_fh = "%s.log" % data_deseq2_annot_fh
    data_deseq2_res_fh = "%s.deseq2_res.csv" % data_deseq2_annot_fh
    if not exists(data_deseq2_res_fh):
        deseq_fh = "%s/deseq2.R" % (abspath(dirname(__file__)))
        with open(log_fh, 'a') as log_f:
            # NOTE(review): the command is a shell string; paths containing
            # spaces or shell metacharacters will break it.
            com = '%s %s %s %s 2' % (info.rscript_fh, deseq_fh,
                                     data_deseq2_count_fh,
                                     data_deseq2_annot_fh)
            subprocess.call(com, shell=True, stdout=log_f,
                            stderr=subprocess.STDOUT)
    try:
        data_deseq2_res = pd.read_csv(data_deseq2_res_fh).set_index('Unnamed: 0')
    except (IOError, KeyError, ValueError):
        # Missing, empty or malformed result file: report where to look and
        # re-raise instead of failing later on an unbound variable.
        logging.error('check deseq2 log for more info: %s' % basename(log_fh))
        logging.error("check if deseq2 is installed.")
        raise
    data_deseq2_res.index.name = 'mutids'

    # DESeq2 columns: baseMean log2FoldChange lfcSE stat pvalue padj
    test = 'Waldtest'
    multitest = 'fdr_bh'
    col_test_pval = "pval %s" % test
    col_test_stat = "stat %s" % test
    col_multitest_pval = "padj %s %s" % (test, multitest)
    data_deseq2_res = data_deseq2_res.rename(columns={
        'pvalue': col_test_pval,
        'stat': col_test_stat,
        'padj': col_multitest_pval,
    })
    # set wald as default stat
    data_deseq2_res.loc[:, 'pval'] = data_deseq2_res.loc[:, col_test_pval]
    data_deseq2_res.loc[:, 'stat'] = data_deseq2_res.loc[:, col_test_stat]
    data_deseq2_res.loc[:, 'padj'] = data_deseq2_res.loc[:, col_multitest_pval]
    data_deseq2_res.loc[:, 'FCA_norm'] = data_deseq2_res.loc[:, 'log2FoldChange']

    data_fit = data_fit.drop(['pval', 'stat', 'padj'], axis=1)
    data_fit = data_fit.join(data_deseq2_res)
    return data_fit
def main(prj_dh, test=False, ml=False):
    """
    **--step 3**. Identifies molecular features that may determine fitness scores.

    This plots the results in following visualisations.

    .. code-block:: text

        ROC plots
        Relative importances of features

    :param prj_dh: path to project directory.
    :param test: if True only the first fitness table is processed
        (``mut_type`` ``single`` only).
    :param ml: if True the classification models are run after the
        correlation plots; otherwise only ``corrplot`` is called.
    """
    logging.info("start")
    if not exists(prj_dh):
        logging.error("Could not find '%s'" % prj_dh)
        sys.exit()
    configure.main(prj_dh)
    # Imported only after configure.main() has run, as in the original flow.
    from dms2dfe.tmp import info
    from dms2dfe.lib.io_ml import corrplot
    corrplot(info)
    if not ml:
        return

    from dms2dfe.lib.io_dfs import set_index

    mut_type = getattr(info, 'mut_type', 'single')
    # Missing or unrecognised settings fall back to 'FCA_norm' so that
    # ml_input is always bound.
    ml_input = {'FC': 'FCA_norm', 'Fi': 'FiA'}.get(
        getattr(info, 'ml_input', None), 'FCA_norm')
    type_form = "aas"
    for dh in ("%s/plots/%s" % (prj_dh, type_form),
               "%s/data_ml/%s" % (prj_dh, type_form)):
        if not exists(dh):
            makedirs(dh)

    if mut_type == 'single':
        data_fit_keys = ["data_fit/%s/%s" % (type_form, basename(fh))
                         for fh in glob("%s/data_fit/aas/*" % prj_dh)
                         if ("inferred" not in basename(fh))
                         and ("_WRT_" in basename(fh))]
        data_fit_keys = np.unique(data_fit_keys)
        if len(data_fit_keys) != 0:
            if test:
                pooled_io_ml(data_fit_keys[0])
            else:
                for data_fit_key in data_fit_keys:
                    pooled_io_ml(data_fit_key)
        else:
            logging.info("already processed")
    elif mut_type == 'double':
        # The features table is only needed by this branch.
        data_feats = pd.read_csv("%s/data_feats/aas/data_feats_all" % (prj_dh))
        data_feats = set_index(data_feats, 'mutids')
        data_fit_dh = 'data_fit_dm'
        data_fit_keys = ["%s/%s/%s" % (data_fit_dh, type_form, basename(fh))
                         for fh in glob("%s/%s/aas/*" % (prj_dh, data_fit_dh))
                         if ("inferred" not in basename(fh))
                         and ("_WRT_" in basename(fh))]
        data_fit_keys = np.unique(data_fit_keys)
        ycol = ml_input
        Xcols = data_feats.columns
        # NOTE(review): make_data_combo and data_combo2ml are not imported in
        # this function; they must be available at module level -- confirm.
        for data_fit_key in data_fit_keys:
            data_fit_dm_fh = '%s/%s' % (prj_dh, data_fit_key)
            data_combo_fh = '%s/data_ml/aas/%s.combo' % (
                prj_dh, basename(data_fit_dm_fh))
            force = False
            if not exists(data_combo_fh) or force:
                data_fit_dm = pd.read_csv(data_fit_dm_fh).set_index('mutids')
                data_combo = make_data_combo(data_fit_dm, data_feats,
                                             ycol, Xcols)
                if not exists(dirname(data_combo_fh)):
                    makedirs(dirname(data_combo_fh))
                data_combo.to_csv(data_combo_fh)
            else:
                data_combo = pd.read_csv(data_combo_fh).set_index('mutids')
            logging.info('ml: start')
            data_combo2ml(
                data_combo,
                basename(data_fit_dm_fh),
                dirname(data_combo_fh),
                dirname(data_combo_fh),
                ycoln=ycol,
                col_idx='mutids',
                ml_type='cls',
                middle_percentile_skipped=0.1,
                force=False,
            )


def pooled_io_ml(data_fit_key):
    """
    Run `dms2dfe.lib.io_ml.data_fit2ml` for one fitness table.

    Written as a single-argument function so it can be mapped over a pool of
    workers; `main` currently calls it serially.

    :param data_fit_key: in the form <data_fit>/<aas/cds>/<name of file>.
    """
    from dms2dfe.tmp import info
    # Imported here: the import inside main() is local to main() and is not
    # visible from this function.
    from dms2dfe.lib.io_ml import data_fit2ml
    dX_fh = "%s/data_feats/aas/data_feats_all" % (info.prj_dh)
    dy_fh = '%s/%s' % (info.prj_dh, data_fit_key)
    logging.info('processing: %s' % basename(dy_fh))
    data_fit2ml(dX_fh, dy_fh, info, regORcls='cls')
    # NOTE(review): placement inferred from statement order in the source.
    logging.shutdown()