def make_deseq2_count(unsel, sel, data_deseq2_annot, data_lbl_col, prj_dh,
                      type_form='aas'):
    """
    Makes (or reloads a cached) DESeq2 count file for a sel-vs-unsel pair.

    :param unsel: unselected condition
    :param sel: selected condition
    :param data_deseq2_annot: pandas table with annotation information
    :param data_lbl_col: column of the data_lbl pandas table to be used
    :param prj_dh: path to the project directory
    :param type_form: mutation format ('aas' for amino acids)
    :returns: tuple of (counts dataframe indexed by 'mutids', path to the csv)
    """
    count_dh = '%s/data_fit/%s_all' % (prj_dh, type_form)
    count_fh = '%s/%s_WRT_%s.deseq2_count.csv' % (count_dh, sel, unsel)
    if exists(count_fh):
        # cached: reload the previously written count table
        counts = pd.read_csv(count_fh).set_index('mutids')
    else:
        # one data_lbl file per annotated sample (sample name is the part
        # of the annotation index before the first '.')
        lbl_fhs = []
        for name in data_deseq2_annot.index:
            lbl_fhs.append("%s/data_lbl/%s/%s" %
                           (prj_dh, type_form, name.split('.')[0]))
        counts = fhs2data_combo(lbl_fhs,
                                cols=[data_lbl_col],
                                index='mutids',
                                col_sep=".")
        # DESeq2 cannot use missing counts: drop rows with any NaN
        counts = debad(counts, axis=0, condi='any', bad='nan')
        counts.to_csv(count_fh)
    return counts, count_fh
def transform_data_lbl(
        prj_dh,
        transform_type,
        type_form='aas',
        data_lbl_col='NiA_norm',
):
    """
    Transformation of counts of mutants in data_lbl table.

    :param prj_dh: path to the project directory
    :param transform_type: type of transformation: 'log'/'log2' or 'plog'
    :param type_form: mutation format ('aas' for amino acids)
    :param data_lbl_col: column of the data_lbl table holding the counts
    :returns: None; writes the combined csv and appends a 'NiA_tran'
        column to each per-sample data_lbl csv that lacks one.
    """
    data_lbl_fhs = glob("%s/data_lbl/aas/*" % prj_dh)
    if len(data_lbl_fhs) > 0:
        col_sep = "."
        # combine the chosen column from all per-sample tables into one frame
        data_lbl_all = fhs2data_combo(data_lbl_fhs,
                                      cols=[data_lbl_col],
                                      index='mutids',
                                      col_sep=col_sep)
        data_lbl_all_dh = '%s/data_lbl/%s_all' % (prj_dh, type_form)
        if not exists(data_lbl_all_dh):
            makedirs(data_lbl_all_dh)
        data_lbl_all_fh = '%s/%s.csv' % (data_lbl_all_dh, data_lbl_col)
        data_lbl_all.to_csv(data_lbl_all_fh)
        if (transform_type == 'log2') or (transform_type == 'log'):
            data_lbl_all = data_lbl_all.apply(np.log2)
        elif transform_type == 'plog':
            data_lbl_all = data_lbl_all.apply(plog)
        else:
            # FIX: previously logged a misspelled message ("trnaform_type")
            # and called the non-existent sys.exist(), which raised
            # AttributeError instead of exiting cleanly.
            logging.error("transform_type not valid: %s" % transform_type)
            sys.exit()
        data_lbl_col = 'NiA_tran'
        data_lbl_all_fh = '%s/%s.csv' % (data_lbl_all_dh, data_lbl_col)
        data_lbl_all.to_csv(data_lbl_all_fh)
        # write the transformed column back into each per-sample table
        # (column name format: "<sample>.<data_lbl_col>")
        for col in data_lbl_all:
            data_lbl_fn, tmp = col.split('.')
            data_lbl_fh = '%s/data_lbl/%s/%s' % (prj_dh, type_form,
                                                 data_lbl_fn)
            data_lbl = pd.read_csv(data_lbl_fh).set_index('mutids')
            if not data_lbl_col in data_lbl:
                data_lbl_cols = data_lbl.columns.tolist()
                data_lbl = pd.concat([data_lbl, data_lbl_all.loc[:, col]],
                                     axis=1)
                data_lbl.columns = data_lbl_cols + [data_lbl_col]
                data_lbl.index.name = 'mutids'
                data_lbl.to_csv(data_lbl_fh)
def transform_data_lbl(prj_dh, transform_type, type_form='aas',
                       data_lbl_col='NiA_norm',):
    """
    Transformation of counts of mutants in data_lbl table.

    NOTE(review): this is a duplicate definition of transform_data_lbl;
    another copy appears earlier in this file. The last definition wins
    at import time — confirm which copy is intended and remove the other.

    :param prj_dh: path to the project directory
    :param transform_type: type of transformation: 'log'/'log2' or 'plog'
    :param type_form: mutation format ('aas' for amino acids)
    :param data_lbl_col: column of the data_lbl table holding the counts
    :returns: None; writes the combined csv and appends a 'NiA_tran'
        column to each per-sample data_lbl csv that lacks one.
    """
    data_lbl_fhs = glob("%s/data_lbl/aas/*" % prj_dh)
    if len(data_lbl_fhs) > 0:
        col_sep = "."
        data_lbl_all = fhs2data_combo(data_lbl_fhs,
                                      cols=[data_lbl_col],
                                      index='mutids',
                                      col_sep=col_sep)
        data_lbl_all_dh = '%s/data_lbl/%s_all' % (prj_dh, type_form)
        if not exists(data_lbl_all_dh):
            makedirs(data_lbl_all_dh)
        data_lbl_all_fh = '%s/%s.csv' % (data_lbl_all_dh, data_lbl_col)
        data_lbl_all.to_csv(data_lbl_all_fh)
        if (transform_type == 'log2') or (transform_type == 'log'):
            data_lbl_all = data_lbl_all.apply(np.log2)
        elif transform_type == 'plog':
            data_lbl_all = data_lbl_all.apply(plog)
        else:
            # FIX: previously logged a misspelled message ("trnaform_type")
            # and called the non-existent sys.exist() (AttributeError);
            # use sys.exit() to terminate cleanly.
            logging.error("transform_type not valid: %s" % transform_type)
            sys.exit()
        data_lbl_col = 'NiA_tran'
        data_lbl_all_fh = '%s/%s.csv' % (data_lbl_all_dh, data_lbl_col)
        data_lbl_all.to_csv(data_lbl_all_fh)
        # write the transformed column back into each per-sample table
        for col in data_lbl_all:
            data_lbl_fn, tmp = col.split('.')
            data_lbl_fh = '%s/data_lbl/%s/%s' % (prj_dh, type_form,
                                                 data_lbl_fn)
            data_lbl = pd.read_csv(data_lbl_fh).set_index('mutids')
            if not data_lbl_col in data_lbl:
                data_lbl_cols = data_lbl.columns.tolist()
                data_lbl = pd.concat([data_lbl, data_lbl_all.loc[:, col]],
                                     axis=1)
                data_lbl.columns = data_lbl_cols + [data_lbl_col]
                data_lbl.index.name = 'mutids'
                data_lbl.to_csv(data_lbl_fh)
def make_deseq2_count(unsel, sel, data_deseq2_annot, data_lbl_col, prj_dh,
                      type_form='aas'):
    """
    Makes DESeq2 count file.

    NOTE(review): duplicate definition — another copy of this function
    appears earlier in the file; the last one defined wins.

    :param unsel: unselected condition
    :param sel: selected condition
    :param data_deseq2_annot: pandas table with annotation information
    :param data_lbl_col: column of the data_lbl pandas table to be used
    :param prj_dh: path to the project directory
    :param type_form: mutation format ('aas' for amino acids)
    :returns: tuple of (counts dataframe indexed by 'mutids', path to csv)
    """
    out_dh = '%s/data_fit/%s_all' % (prj_dh, type_form)
    out_fh = '%s/%s_WRT_%s.deseq2_count.csv' % (out_dh, sel, unsel)
    if not exists(out_fh):
        src_fhs = [
            "%s/data_lbl/%s/%s" % (prj_dh, type_form, s.split('.')[0])
            for s in data_deseq2_annot.index
        ]
        table = fhs2data_combo(src_fhs,
                               cols=[data_lbl_col],
                               index='mutids',
                               col_sep=".")
        # drop mutids with missing counts in any sample
        table = debad(table, axis=0, condi='any', bad='nan')
        table.to_csv(out_fh)
        return table, out_fh
    # cached result from a previous run
    return pd.read_csv(out_fh).set_index('mutids'), out_fh
def data_lbl2data_fit_lite(fits_pairs, prj_dh, data_lbl_dh, data_fit_dh,
                           force=False):
    """
    Short wrapper for conversion of mutation counts to fold changes.

    :param fits_pairs: list with pair of [reference, selected] conditions
    :param prj_dh: path to the project directory
    :param data_lbl_dh: directory containing data_lbl csv tables
    :param data_fit_dh: directory of data_fit csvs
    :param force: recompute even if the output file already exists
    """
    out_fh = '%s/%s/aas/%s_WRT_%s' % (prj_dh, data_fit_dh, fits_pairs[1],
                                      fits_pairs[0])
    if force or not exists(out_fh):
        in_fhs = ['%s/%s/aas/%s' % (prj_dh, data_lbl_dh, name)
                  for name in fits_pairs]
        data_fit = fhs2data_combo(in_fhs,
                                  cols=['NiA_tran'],
                                  index='mutids',
                                  col_sep='.')
        data_fit.columns = ['NiA_tran.ref', 'NiA_tran.sel']
        # fold change = difference of the (already log-transformed) counts
        data_fit.loc[:, 'FCA'] = (data_fit.loc[:, 'NiA_tran.sel'] -
                                  data_fit.loc[:, 'NiA_tran.ref'])
        # NOTE(review): FCA_norm is computed identically to FCA here —
        # confirm whether a normalisation step is missing.
        data_fit.loc[:, 'FCA_norm'] = (data_fit.loc[:, 'NiA_tran.sel'] -
                                       data_fit.loc[:, 'NiA_tran.ref'])
        if not exists(dirname(out_fh)):
            makedirs(dirname(out_fh))
        data_fit.to_csv(out_fh)
def corrplot(info):
    """
    Plots a correlation matrix heatmap between range of features and
    fold change values, one pdf per data_fit table.

    :param info: dict, with the information of the experiment
        (attributes used: ``ml_input``, ``prj_dh``)
    """
    from dms2dfe.lib.io_dfs import fhs2data_combo
    from glob import glob
    from dms2dfe.lib.plot_mut_data_heatmaps import clustermap
    from dms2dfe.lib.io_ml_data import make_dXy
    ml_input = info.ml_input
    prj_dh = info.prj_dh
    fit_fhs = glob('%s/data_fit/aas/*' % prj_dh)
    feats_fh = '%s/data_feats/aas/data_feats_all' % prj_dh
    feats = pd.read_csv(feats_fh).set_index('mutids')
    fits = fhs2data_combo(fit_fhs, ['%sA' % ml_input], 'mutids')
    # keep only the condition name (text before ': ') as the column label
    fits.columns = [c.split(': ')[0] for c in fits]
    for c in fits:
        plot_fh = '%s/plots/aas/%s.corr.pdf' % (prj_dh, c)
        if exists(plot_fh):
            continue
        if not exists(dirname(plot_fh)):
            makedirs(dirname(plot_fh))
        # join features with this condition's fold changes, then prune
        # low-information and highly inter-correlated feature columns
        dXy = feats.join(fits[c])
        dXy, Xcols, ycol = make_dXy(dXy,
                                    ycol=c,
                                    if_rescalecols=False,
                                    unique_quantile=0.25)
        dXy, Xcols, ycol = feats_sel_corr(dXy, ycol, range_coef=[0.9, 0.8])
        g, ax = clustermap(dXy.corr(method='spearman'),
                           highlight_col=c,
                           vlim=[-0.5, 0.5],
                           figsize=[10, 10],
                           plot_fh=plot_fh)
def data_lbl2data_fit_lite(fits_pairs, prj_dh, data_lbl_dh, data_fit_dh,
                           force=False):
    """
    Short wrapper for conversion of mutation counts to fold changes.

    NOTE(review): duplicate definition — another copy of this function
    exists earlier in the file; the last one defined wins.

    :param fits_pairs: list with pair of [reference, selected] conditions
    :param prj_dh: path to the project directory
    :param data_lbl_dh: directory containing data_lbl csv tables
    :param data_fit_dh: directory of data_fit csvs
    :param force: recompute even if the output file already exists
    """
    ref_name = fits_pairs[0]
    sel_name = fits_pairs[1]
    data_fit_fh = '%s/%s/aas/%s_WRT_%s' % (prj_dh, data_fit_dh, sel_name,
                                           ref_name)
    if not exists(data_fit_fh) or force:
        data_lbl_fhs = []
        for name in fits_pairs:
            data_lbl_fhs.append('%s/%s/aas/%s' % (prj_dh, data_lbl_dh, name))
        data_fit = fhs2data_combo(data_lbl_fhs,
                                  cols=['NiA_tran'],
                                  index='mutids',
                                  col_sep='.')
        data_fit.columns = ['NiA_tran.ref', 'NiA_tran.sel']
        sel_counts = data_fit.loc[:, 'NiA_tran.sel']
        ref_counts = data_fit.loc[:, 'NiA_tran.ref']
        # fold change = difference of the transformed counts
        data_fit.loc[:, 'FCA'] = sel_counts - ref_counts
        # NOTE(review): identical to FCA — confirm whether a
        # normalisation step is missing here.
        data_fit.loc[:, 'FCA_norm'] = sel_counts - ref_counts
        if not exists(dirname(data_fit_fh)):
            makedirs(dirname(data_fit_fh))
        data_fit.to_csv(data_fit_fh)
def plot_data_comparison_multiviolin(
        prj_dh,
        data_fits,
        col,
        data_fiti_ctrl=0,
        aasORcds="aas",
        ns=True,
        numeric=False,
        color_test=(0.6, 0.8, 1),  #'mediumpurple'#"lime"
        color_ctrl="lightgray",
        data_fits_labels=None,
        pval=True,
        stars=True,
        violin_width=0.9,
        color_xticks=None,
        force=False,
        ylims=None,
        col_hue='Conditions',
        label_test='Test',
        label_ctrl='Control',
        ylabel=None,
        figsize=[4, 3],
        plot_fh=None):
    """
    Plotting distributions of comparison of fold change data

    Draws split violins (test vs control) per condition and optionally
    annotates each with a Wilcoxon p-value.

    :param prj_dh: path to project directory
    :param data_fits: list of data_fit table names; element at
        data_fiti_ctrl is the control, the rest are tests
    :param col: column with the fold change data
    :param data_fiti_ctrl: index of the control entry in data_fits
    :param aasORcds: mutation format subdirectory ('aas' or 'cds')
    :param ns: if True, non-significant p-values are blanked out
    :param pval: if True, annotate violins with Wilcoxon p-values
        (NOTE: this parameter is shadowed by the p-value local below)
    :param stars: if True, convert p-values to star notation
    :param plot_fh: path to save the pdf; also used as a prefix for the
        cached comparison csv files
    :returns: (ax, data_comparison, data_all)
    """
    data_fit_fhs = [
        "%s/data_fit/%s/%s" % (prj_dh, aasORcds, s) for s in data_fits
    ]
    # cache file paths: prefixed by plot_fh if given, else cwd-relative
    if plot_fh != None:
        data_comparison_ctrl_fh = plot_fh + "data_comparison_ctrl.csv"
        data_comparison_test_fh = plot_fh + "data_comparison_test.csv"
    else:
        data_comparison_ctrl_fh = "data_comparison_ctrl.csv"
        data_comparison_test_fh = "data_comparison_test.csv"
    if (not exists(data_comparison_ctrl_fh)) or\
       (not exists(data_comparison_test_fh)) or force:
        # first entry is the control, the rest are tests
        data_fit_test_fhs = [fh for i, fh in enumerate(data_fit_fhs) if i != 0]
        data_fit_ctrl_fhs = [fh for i, fh in enumerate(data_fit_fhs) if i == 0]
        data_comparison_test = fhs2data_combo(data_fit_test_fhs, [col],
                                              index='mutids')
        data_comparison_ctrl = fhs2data_combo(data_fit_ctrl_fhs, [col],
                                              index='mutids')
        data_comparison_test.columns = data_fits[1:]
        # NOTE(review): ctrl has a single column but is labelled with
        # data_fits[1:] — this only works when len(data_fits) == 2; confirm.
        data_comparison_ctrl.columns = data_fits[1:]
        #data_comparison_test.to_csv(data_comparison_test_fh)
        #data_comparison_ctrl.to_csv(data_comparison_ctrl_fh)
    else:
        data_comparison_test = pd.read_csv(data_comparison_test_fh).set_index(
            'mutids')
        data_comparison_ctrl = pd.read_csv(data_comparison_ctrl_fh).set_index(
            'mutids')
    # wide table: control column first, then all test columns
    data_all = pd.concat([
        data_comparison_ctrl.loc[:, data_comparison_ctrl.columns.tolist()[0]],
        data_comparison_test
    ],
                         axis=1)
    data_all.columns = data_fits
    # long format for seaborn: (condition, mutid, value)
    data_comparison_test = data_comparison_test.unstack().reset_index()
    data_comparison_ctrl = data_comparison_ctrl.unstack().reset_index()
    data_comparison_test.columns = ["condi", "mutids", col]
    data_comparison_ctrl.columns = ["condi", "mutids", col]
    data_comparison_test.loc[:, col_hue] = label_test
    data_comparison_ctrl.loc[:, col_hue] = label_ctrl
    data_comparison = data_comparison_test.append(data_comparison_ctrl)
    y_max = data_comparison.loc[:, col].max()
    y_min = data_comparison.loc[:, col].min()
    data_fit_ctrl = data_fits[data_fiti_ctrl]
    data_fit_tests = [
        s for s in data_fits if data_fits.index(s) != data_fiti_ctrl
    ]
    # fig=
    plt.figure(figsize=figsize, dpi=300)
    ax = plt.subplot(111)
    import seaborn as sns
    from scipy.stats import mannwhitneyu, wilcoxon
    # plt.style.use('seaborn-whitegrid')
    # plt.style.use('seaborn-white')
    sns.violinplot(
        x="condi",
        y=col,
        hue=col_hue,
        split=True,
        data=data_comparison,
        # color='m',
        palette={
            label_test: color_test,
            label_ctrl: color_ctrl
        },
        inner="quartile",
        width=violin_width,
        cut=0,
        # bw=0.1,
        scale="width",
        ax=ax)
    plt.legend(
        title=col_hue,
        loc='upper center',
        bbox_to_anchor=(0.5, 1.1),  #, 1., .102),
        ncol=2,
        borderaxespad=0.1,
        frameon=True,
    )
    ax.set_xlabel("")
    if not ylabel is None:
        ax.set_ylabel(ylabel)
    # ax.grid(b=True)
    ax.yaxis.grid(True)
    if data_fits_labels != None:
        ax.set_xticklabels(data_fits_labels[1:])
    if ylabel != None:
        ax.set_ylabel(ylabel)
    plt.tight_layout()
    if ylims is None:
        ax.set_ylim([y_min, y_max])
    else:
        ax.set_ylim([ylims[0], ylims[1]])
    # return data_comparison,data_all
    if pval:
        col_ctrl = data_all.columns.tolist()[0]
        col_tests = data_all.columns.tolist()[1:]
        for col_testi in range(len(col_tests)):
            col_test = col_tests[col_testi]
            # NOTE: the boolean parameter `pval` is shadowed here by the
            # numeric p-value returned from get_wilcoxon
            pval, side = get_wilcoxon(data_all,
                                      col_ctrl,
                                      col_test,
                                      side='one',
                                      denan=False)
            side = ''  #stitch
            # print pval
            if stars:
                # pval becomes a string (star notation) after this call;
                # NOTE(review): the `pval > 0.05` comparison below then
                # relies on Py2 str/float ordering — confirm on Py3.
                pval = pval2stars(pval, ns=ns, numeric=False)
            if ns and (pval > 0.05):
                side = ''
                pval = ''
                result = "%s\n%s" % (side, pval)
            else:
                result = "%s\n%s" % (side, pval)
            # print result
            # print ax.get_ylim()[0]+(ax.get_ylim()[1]-ax.get_ylim()[0])*0.05
            # annotate just above the bottom of the y-range
            ax.text(
                col_testi,
                ax.get_ylim()[0] +
                (ax.get_ylim()[1] - ax.get_ylim()[0]) * 0.05,
                result,
                ha='center',
                color='b',
                bbox=dict(
                    facecolor='w',
                    edgecolor='none',
                    # boxstyle='round',
                    alpha=0.6,
                ))
    # data_all.to_csv('test1.csv')
    if plot_fh != None:
        #plt.tight_layout()
        plt.savefig(plot_fh, format='pdf')
        plt.clf()
        plt.close()
    return ax, data_comparison, data_all
def transform_data_lbl_deseq(prj_dh, transform_type, rscript_fh,
                             type_form='aas'):
    """
    Transforamtion of counts of mutants in data_lbl table using DESeq2

    Runs the bundled deseq2.R script (via Rscript) per replicate group to
    produce rlog/VST transformed counts, then writes a 'NiA_tran' column
    back into each per-sample data_lbl csv.

    :param prj_dh: path to the project directory
    :param transform_type: type of transformation rlog or VST
    :param rscript_fh: path to the Rscript executable
    :param type_form: mutation format ('aas' for amino acids)
    :returns data_lbl: data_lbl with transformed counts
    """
    data_lbl_fhs = glob("%s/data_lbl/aas/*" % prj_dh)
    data_lbl_col = 'NiA_norm'
    col_sep = "."
    # combine the normalised-count column from all per-sample tables
    data_lbl_all = fhs2data_combo(data_lbl_fhs,
                                  cols=[data_lbl_col],
                                  index='mutids',
                                  col_sep=col_sep)
    data_lbl_all_dh = '%s/data_lbl/%s_all' % (prj_dh, type_form)
    if not exists(data_lbl_all_dh):
        makedirs(data_lbl_all_dh)
    data_lbl_all_fh = '%s/%s.csv' % (data_lbl_all_dh, data_lbl_col)
    # data_lbl_all=debad(data_lbl_all,axis=0,condi='any',bad='nan')
    # #psudocount to avoid all zero error
    # data_lbl_all=data_lbl_all.fillna(0)
    # data_lbl_all=data_lbl_all+0.5
    data_lbl_all.to_csv(data_lbl_all_fh)
    data_lbl_tran_col = 'NiA_tran'
    data_lbl_all_tran_fh = '%s/%s.csv' % (data_lbl_all_dh, data_lbl_tran_col)
    if not exists(data_lbl_all_tran_fh):
        data_lbl_all_tran = pd.DataFrame(index=data_lbl_all.index)
        data_lbl_all_tran.index.name = 'mutids'
        # replicate map: one row per averaged condition, replicate names
        # in the columns
        repli = pd.read_csv('%s/cfg/repli' % prj_dh).set_index('varname')
        for avg in repli.index:
            avg_col = "%s%s%s" % (avg, col_sep, data_lbl_col)
            # output file name follows the R script's naming convention
            # NOTE(review): data_lbl_tran_fh stays unbound if
            # transform_type is neither 'rlog' nor 'vst' -> NameError.
            if transform_type == 'rlog':
                data_lbl_tran_fh = "%s/%s.deseq2_annot.csv.deseq2_rld.csv" % (
                    data_lbl_all_dh, avg_col)
            elif transform_type == 'vst':
                data_lbl_tran_fh = "%s/%s.deseq2_annot.csv.deseq2_vsd.csv" % (
                    data_lbl_all_dh, avg_col)
            if not exists(data_lbl_tran_fh):
                # build DESeq2 annotation (sample sheet) and count matrix
                data_deseq2_annot = pd.DataFrame(columns=[
                    'condition', 'type', 'number of lanes',
                    'total number of reads', 'exon counts'
                ])
                data_deseq2_annot.index.name = 'file'
                data_deseq2_annot_fh = '%s/%s.deseq2_annot.csv' % (
                    data_lbl_all_dh, avg_col)
                data_deseq2_count = pd.DataFrame()
                data_deseq2_count_fh = '%s/%s.deseq2_count.csv' % (
                    data_lbl_all_dh, avg_col)
                for rep in repli.loc[avg, :]:
                    if not pd.isnull(rep):
                        rep_col = "%s%s%s" % (rep, col_sep, data_lbl_col)
                        data_deseq2_annot.loc[rep_col, 'condition'] = rep
                        if len(data_deseq2_count) == 0:
                            data_deseq2_count = pd.DataFrame(
                                data_lbl_all.loc[:, rep_col].copy())
                        else:
                            data_deseq2_count.loc[:, rep_col] = \
                                data_lbl_all.loc[:, rep_col]
                # data_deseq2_annot_index_all=["%s%s%s" % (basename(fh),col_sep,data_lbl_col) \
                # for fh in data_lbl_fhs if (basename(fh) not in repli.index)]
                # data_deseq2_annot_index_no_reps=[i for i in data_deseq2_annot_index_all if (i not in data_deseq2_annot.index)]
                # for idx in data_deseq2_annot_index_no_reps:
                #     data_deseq2_annot.loc[idx,'condition']=idx
                data_deseq2_annot.to_csv(data_deseq2_annot_fh)
                # DESeq2 cannot use rows with missing counts
                data_deseq2_count = debad(data_deseq2_count,
                                          axis=0,
                                          condi='any',
                                          bad='nan')
                data_deseq2_count.to_csv(data_deseq2_count_fh)
                # run the bundled R script; output appended to a log file
                deseq_fh = "%s/deseq2.R" % (abspath(dirname(__file__)))
                log_fh = "%s.log" % data_deseq2_annot_fh
                with open(log_fh, 'a') as log_f:
                    deseq2_com = '%s %s %s %s 1' % (rscript_fh, deseq_fh,
                                                    data_deseq2_count_fh,
                                                    data_deseq2_annot_fh)
                    # print deseq2_com
                    subprocess.call(deseq2_com,
                                    shell=True,
                                    stdout=log_f,
                                    stderr=subprocess.STDOUT)
            # read the transformed counts written by the R script
            data_lbl_tran = pd.read_csv(data_lbl_tran_fh).set_index(
                'Unnamed: 0')
            data_lbl_tran.index.name = 'mutids'
            if len(data_lbl_all_tran) == 0:
                data_lbl_all_tran = data_lbl_tran.copy()
            else:
                data_lbl_all_tran = data_lbl_all_tran.join(data_lbl_tran)
        # print data_lbl_all_tran.columns.tolist()
        # print len(data_lbl_all_tran)
        if len(data_lbl_all_tran.columns.tolist()) > 0:
            data_lbl_all_tran.to_csv(data_lbl_all_tran_fh)
        else:
            logging.error('transform_type can not be %s: no replicates found'
                          % transform_type)
            sys.exit()
    else:
        # cached combined transformed counts from a previous run
        data_lbl_all_tran = pd.read_csv(data_lbl_all_tran_fh).set_index(
            'mutids')
    data_lbl_tran_col = 'NiA_tran'
    # write the transformed column back into each per-sample table
    for col in data_lbl_all_tran:
        data_lbl_fn, tmp = col.split('.')
        data_lbl_fh = '%s/data_lbl/%s/%s' % (prj_dh, type_form, data_lbl_fn)
        data_lbl = pd.read_csv(data_lbl_fh).set_index('mutids')
        if not data_lbl_tran_col in data_lbl:
            data_lbl_cols = data_lbl.columns.tolist()
            data_lbl = data_lbl.join(data_lbl_all_tran.loc[:, col])
            data_lbl.columns = data_lbl_cols + [data_lbl_tran_col]
            data_lbl.index.name = 'mutids'
            data_lbl.to_csv(data_lbl_fh)
def transform_data_lbl_deseq(prj_dh, transform_type, rscript_fh,
                             type_form='aas'):
    """
    Transforamtion of counts of mutants in data_lbl table using DESeq2

    NOTE(review): duplicate definition of transform_data_lbl_deseq — an
    identical copy appears earlier in this file; the last definition wins.

    Runs the bundled deseq2.R script (via Rscript) per replicate group to
    produce rlog/VST transformed counts, then writes a 'NiA_tran' column
    back into each per-sample data_lbl csv.

    :param prj_dh: path to the project directory
    :param transform_type: type of transformation rlog or VST
    :param rscript_fh: path to the Rscript executable
    :param type_form: mutation format ('aas' for amino acids)
    :returns data_lbl: data_lbl with transformed counts
    """
    data_lbl_fhs = glob("%s/data_lbl/aas/*" % prj_dh)
    data_lbl_col = 'NiA_norm'
    col_sep = "."
    # combine the normalised-count column from all per-sample tables
    data_lbl_all = fhs2data_combo(data_lbl_fhs,
                                  cols=[data_lbl_col],
                                  index='mutids',
                                  col_sep=col_sep)
    data_lbl_all_dh = '%s/data_lbl/%s_all' % (prj_dh, type_form)
    if not exists(data_lbl_all_dh):
        makedirs(data_lbl_all_dh)
    data_lbl_all_fh = '%s/%s.csv' % (data_lbl_all_dh, data_lbl_col)
    # data_lbl_all=debad(data_lbl_all,axis=0,condi='any',bad='nan')
    # #psudocount to avoid all zero error
    # data_lbl_all=data_lbl_all.fillna(0)
    # data_lbl_all=data_lbl_all+0.5
    data_lbl_all.to_csv(data_lbl_all_fh)
    data_lbl_tran_col = 'NiA_tran'
    data_lbl_all_tran_fh = '%s/%s.csv' % (data_lbl_all_dh, data_lbl_tran_col)
    if not exists(data_lbl_all_tran_fh):
        data_lbl_all_tran = pd.DataFrame(index=data_lbl_all.index)
        data_lbl_all_tran.index.name = 'mutids'
        # replicate map: one row per averaged condition
        repli = pd.read_csv('%s/cfg/repli' % prj_dh).set_index('varname')
        for avg in repli.index:
            avg_col = "%s%s%s" % (avg, col_sep, data_lbl_col)
            # output name follows the R script's convention
            # NOTE(review): data_lbl_tran_fh stays unbound if
            # transform_type is neither 'rlog' nor 'vst' -> NameError.
            if transform_type == 'rlog':
                data_lbl_tran_fh = "%s/%s.deseq2_annot.csv.deseq2_rld.csv" % (
                    data_lbl_all_dh, avg_col)
            elif transform_type == 'vst':
                data_lbl_tran_fh = "%s/%s.deseq2_annot.csv.deseq2_vsd.csv" % (
                    data_lbl_all_dh, avg_col)
            if not exists(data_lbl_tran_fh):
                # build DESeq2 annotation (sample sheet) and count matrix
                data_deseq2_annot = pd.DataFrame(columns=[
                    'condition', 'type', 'number of lanes',
                    'total number of reads', 'exon counts'
                ])
                data_deseq2_annot.index.name = 'file'
                data_deseq2_annot_fh = '%s/%s.deseq2_annot.csv' % (
                    data_lbl_all_dh, avg_col)
                data_deseq2_count = pd.DataFrame()
                data_deseq2_count_fh = '%s/%s.deseq2_count.csv' % (
                    data_lbl_all_dh, avg_col)
                for rep in repli.loc[avg, :]:
                    if not pd.isnull(rep):
                        rep_col = "%s%s%s" % (rep, col_sep, data_lbl_col)
                        data_deseq2_annot.loc[rep_col, 'condition'] = rep
                        if len(data_deseq2_count) == 0:
                            data_deseq2_count = pd.DataFrame(
                                data_lbl_all.loc[:, rep_col].copy())
                        else:
                            data_deseq2_count.loc[:, rep_col] = \
                                data_lbl_all.loc[:, rep_col]
                # data_deseq2_annot_index_all=["%s%s%s" % (basename(fh),col_sep,data_lbl_col) \
                # for fh in data_lbl_fhs if (basename(fh) not in repli.index)]
                # data_deseq2_annot_index_no_reps=[i for i in data_deseq2_annot_index_all if (i not in data_deseq2_annot.index)]
                # for idx in data_deseq2_annot_index_no_reps:
                #     data_deseq2_annot.loc[idx,'condition']=idx
                data_deseq2_annot.to_csv(data_deseq2_annot_fh)
                # DESeq2 cannot use rows with missing counts
                data_deseq2_count = debad(data_deseq2_count,
                                          axis=0,
                                          condi='any',
                                          bad='nan')
                data_deseq2_count.to_csv(data_deseq2_count_fh)
                # run the bundled R script; output appended to a log file
                deseq_fh = "%s/deseq2.R" % (abspath(dirname(__file__)))
                log_fh = "%s.log" % data_deseq2_annot_fh
                with open(log_fh, 'a') as log_f:
                    deseq2_com = '%s %s %s %s 1' % (rscript_fh, deseq_fh,
                                                    data_deseq2_count_fh,
                                                    data_deseq2_annot_fh)
                    # print deseq2_com
                    subprocess.call(deseq2_com,
                                    shell=True,
                                    stdout=log_f,
                                    stderr=subprocess.STDOUT)
            # read the transformed counts written by the R script
            data_lbl_tran = pd.read_csv(data_lbl_tran_fh).set_index(
                'Unnamed: 0')
            data_lbl_tran.index.name = 'mutids'
            if len(data_lbl_all_tran) == 0:
                data_lbl_all_tran = data_lbl_tran.copy()
            else:
                data_lbl_all_tran = data_lbl_all_tran.join(data_lbl_tran)
        # print data_lbl_all_tran.columns.tolist()
        # print len(data_lbl_all_tran)
        if len(data_lbl_all_tran.columns.tolist()) > 0:
            data_lbl_all_tran.to_csv(data_lbl_all_tran_fh)
        else:
            logging.error('transform_type can not be %s: no replicates found'
                          % transform_type)
            sys.exit()
    else:
        # cached combined transformed counts from a previous run
        data_lbl_all_tran = pd.read_csv(data_lbl_all_tran_fh).set_index(
            'mutids')
    data_lbl_tran_col = 'NiA_tran'
    # write the transformed column back into each per-sample table
    for col in data_lbl_all_tran:
        data_lbl_fn, tmp = col.split('.')
        data_lbl_fh = '%s/data_lbl/%s/%s' % (prj_dh, type_form, data_lbl_fn)
        data_lbl = pd.read_csv(data_lbl_fh).set_index('mutids')
        if not data_lbl_tran_col in data_lbl:
            data_lbl_cols = data_lbl.columns.tolist()
            data_lbl = data_lbl.join(data_lbl_all_tran.loc[:, col])
            data_lbl.columns = data_lbl_cols + [data_lbl_tran_col]
            data_lbl.index.name = 'mutids'
            data_lbl.to_csv(data_lbl_fh)
def plot_data_comparison_multiviolin(prj_dh, data_fits, col,
                                     data_fiti_ctrl=0,
                                     aasORcds="aas",
                                     ns=True, numeric=False,
                                     color_test=(0.6, 0.8, 1),  #'mediumpurple'#"lime"
                                     color_ctrl="lightgray",
                                     data_fits_labels=None,
                                     pval=True,
                                     stars=True,
                                     violin_width=0.9,
                                     color_xticks=None,
                                     force=False,
                                     ylims=None,
                                     col_hue='Conditions',
                                     label_test='Test',
                                     label_ctrl='Control',
                                     ylabel=None,
                                     figsize=[4, 3],
                                     plot_fh=None):
    """
    Plotting distributions of comparison of fold change data

    NOTE(review): duplicate definition — another copy of this function
    appears earlier in the file; the last definition wins.

    :param prj_dh: path to project directory
    :param data_fits: list of data_fit table names; element at
        data_fiti_ctrl is the control, the rest are tests
    :param col: column with the fold change data
    :param pval: if True, annotate violins with Wilcoxon p-values
        (NOTE: shadowed by the p-value local below)
    :returns: (ax, data_comparison, data_all)
    """
    data_fit_fhs = ["%s/data_fit/%s/%s" % (prj_dh, aasORcds, s)
                    for s in data_fits]
    # cache file paths: prefixed by plot_fh if given, else cwd-relative
    if plot_fh != None:
        data_comparison_ctrl_fh = plot_fh + "data_comparison_ctrl.csv"
        data_comparison_test_fh = plot_fh + "data_comparison_test.csv"
    else:
        data_comparison_ctrl_fh = "data_comparison_ctrl.csv"
        data_comparison_test_fh = "data_comparison_test.csv"
    if (not exists(data_comparison_ctrl_fh)) or\
       (not exists(data_comparison_test_fh)) or force:
        # first entry is the control, the rest are tests
        data_fit_test_fhs = [fh for i, fh in enumerate(data_fit_fhs) if i != 0]
        data_fit_ctrl_fhs = [fh for i, fh in enumerate(data_fit_fhs) if i == 0]
        data_comparison_test = fhs2data_combo(data_fit_test_fhs, [col],
                                              index='mutids')
        data_comparison_ctrl = fhs2data_combo(data_fit_ctrl_fhs, [col],
                                              index='mutids')
        data_comparison_test.columns = data_fits[1:]
        # NOTE(review): ctrl has one column but gets data_fits[1:] labels —
        # only valid when len(data_fits) == 2; confirm.
        data_comparison_ctrl.columns = data_fits[1:]
        #data_comparison_test.to_csv(data_comparison_test_fh)
        #data_comparison_ctrl.to_csv(data_comparison_ctrl_fh)
    else:
        data_comparison_test = pd.read_csv(
            data_comparison_test_fh).set_index('mutids')
        data_comparison_ctrl = pd.read_csv(
            data_comparison_ctrl_fh).set_index('mutids')
    # wide table: control column first, then all test columns
    data_all = pd.concat(
        [data_comparison_ctrl.loc[:, data_comparison_ctrl.columns.tolist()[0]],
         data_comparison_test], axis=1)
    data_all.columns = data_fits
    # long format for seaborn: (condition, mutid, value)
    data_comparison_test = data_comparison_test.unstack().reset_index()
    data_comparison_ctrl = data_comparison_ctrl.unstack().reset_index()
    data_comparison_test.columns = ["condi", "mutids", col]
    data_comparison_ctrl.columns = ["condi", "mutids", col]
    data_comparison_test.loc[:, col_hue] = label_test
    data_comparison_ctrl.loc[:, col_hue] = label_ctrl
    data_comparison = data_comparison_test.append(data_comparison_ctrl)
    y_max = data_comparison.loc[:, col].max()
    y_min = data_comparison.loc[:, col].min()
    data_fit_ctrl = data_fits[data_fiti_ctrl]
    data_fit_tests = [s for s in data_fits
                      if data_fits.index(s) != data_fiti_ctrl]
    # fig=
    plt.figure(figsize=figsize, dpi=300)
    ax = plt.subplot(111)
    import seaborn as sns
    from scipy.stats import mannwhitneyu, wilcoxon
    # plt.style.use('seaborn-whitegrid')
    # plt.style.use('seaborn-white')
    sns.violinplot(x="condi", y=col,
                   hue=col_hue,
                   split=True,
                   data=data_comparison,
                   # color='m',
                   palette={label_test: color_test,
                            label_ctrl: color_ctrl},
                   inner="quartile",
                   width=violin_width,
                   cut=0,
                   # bw=0.1,
                   scale="width", ax=ax)
    plt.legend(title=col_hue, loc='upper center',
               bbox_to_anchor=(0.5, 1.1),  #, 1., .102),
               ncol=2,
               borderaxespad=0.1, frameon=True,
               )
    ax.set_xlabel("")
    if not ylabel is None:
        ax.set_ylabel(ylabel)
    # ax.grid(b=True)
    ax.yaxis.grid(True)
    if data_fits_labels != None:
        ax.set_xticklabels(data_fits_labels[1:])
    if ylabel != None:
        ax.set_ylabel(ylabel)
    plt.tight_layout()
    if ylims is None:
        ax.set_ylim([y_min, y_max])
    else:
        ax.set_ylim([ylims[0], ylims[1]])
    # return data_comparison,data_all
    if pval:
        col_ctrl = data_all.columns.tolist()[0]
        col_tests = data_all.columns.tolist()[1:]
        for col_testi in range(len(col_tests)):
            col_test = col_tests[col_testi]
            # NOTE: the boolean parameter `pval` is shadowed here by the
            # numeric p-value returned from get_wilcoxon
            pval, side = get_wilcoxon(data_all, col_ctrl, col_test,
                                      side='one', denan=False)
            side = ''  #stitch
            # print pval
            if stars:
                # pval becomes a string after this call; NOTE(review): the
                # `pval > 0.05` comparison below relies on Py2 str/float
                # ordering — confirm behavior on Py3.
                pval = pval2stars(pval, ns=ns, numeric=False)
            if ns and (pval > 0.05):
                side = ''
                pval = ''
                result = "%s\n%s" % (side, pval)
            else:
                result = "%s\n%s" % (side, pval)
            # print result
            # print ax.get_ylim()[0]+(ax.get_ylim()[1]-ax.get_ylim()[0])*0.05
            # annotate just above the bottom of the y-range
            ax.text(col_testi,
                    ax.get_ylim()[0] +
                    (ax.get_ylim()[1] - ax.get_ylim()[0]) * 0.05,
                    result, ha='center', color='b',
                    bbox=dict(facecolor='w',
                              edgecolor='none',
                              # boxstyle='round',
                              alpha=0.6,)
                    )
    # data_all.to_csv('test1.csv')
    if plot_fh != None:
        #plt.tight_layout()
        plt.savefig(plot_fh, format='pdf')
        plt.clf()
        plt.close()
    return ax, data_comparison, data_all