Example #1
def make_deseq2_count(unsel,
                      sel,
                      data_deseq2_annot,
                      data_lbl_col,
                      prj_dh,
                      type_form='aas'):
    """
    Makes DESeq2 count file

    :param unsel: unselected condition
    :param sel: selected condition
    :param prj_dh: path to the project directory
    :param data_lbl_col: column of the data_lbl pandas table to be used 
    :param data_deseq2_annot: pandas table with annotation information 
    """
    data_deseq2_count_dh = '%s/data_fit/%s_all' % (prj_dh, type_form)
    data_deseq2_count_fh = '%s/%s_WRT_%s.deseq2_count.csv' % (
        data_deseq2_count_dh, sel, unsel)
    if not exists(data_deseq2_count_fh):
        data_lbl_fhs = [
            "%s/data_lbl/%s/%s" % (prj_dh, type_form, s.split('.')[0])
            for s in data_deseq2_annot.index
        ]
        col_sep = "."
        data_lbl_all = fhs2data_combo(data_lbl_fhs,
                                      cols=[data_lbl_col],
                                      index='mutids',
                                      col_sep=col_sep)
        data_lbl_all = debad(data_lbl_all, axis=0, condi='any', bad='nan')
        data_lbl_all.to_csv(data_deseq2_count_fh)
    else:
        data_lbl_all = pd.read_csv(data_deseq2_count_fh).set_index('mutids')
    return data_lbl_all, data_deseq2_count_fh
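
A minimal standalone sketch of the combine-and-filter step performed above, using plain pandas in place of the fhs2data_combo and debad helpers; sample names and counts are hypothetical toy data.

import numpy as np
import pandas as pd

# Per-condition count columns indexed by mutation id (toy data).
unsel = pd.DataFrame({'unsel.NiA': [10.0, 5.0, np.nan]},
                     index=pd.Index(['A1V', 'A1G', 'A1T'], name='mutids'))
sel = pd.DataFrame({'sel.NiA': [2.0, 8.0, 1.0]},
                   index=pd.Index(['A1V', 'A1G', 'A1T'], name='mutids'))

# Combine the columns side by side (the fhs2data_combo step) ...
counts = pd.concat([unsel, sel], axis=1)
# ... then drop any mutant with a missing count
# (the debad(..., condi='any', bad='nan') step).
counts = counts.dropna(axis=0, how='any')
counts.to_csv('sel_WRT_unsel.deseq2_count.csv')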
Example #2
def transform_data_lbl(
    prj_dh,
    transform_type,
    type_form='aas',
    data_lbl_col='NiA_norm',
):
    """
    Transformation of counts of mutants in the data_lbl table

    :param prj_dh: path to the project directory
    :param transform_type: type of transformation (log, plog, glog etc.)
    :returns data_lbl: data_lbl with transformed counts
    """
    data_lbl_fhs = glob("%s/data_lbl/aas/*" % prj_dh)
    if len(data_lbl_fhs) > 0:
        col_sep = "."
        data_lbl_all = fhs2data_combo(data_lbl_fhs,
                                      cols=[data_lbl_col],
                                      index='mutids',
                                      col_sep=col_sep)
        data_lbl_all_dh = '%s/data_lbl/%s_all' % (prj_dh, type_form)
        if not exists(data_lbl_all_dh):
            makedirs(data_lbl_all_dh)
        data_lbl_all_fh = '%s/%s.csv' % (data_lbl_all_dh, data_lbl_col)
        data_lbl_all.to_csv(data_lbl_all_fh)

        if (transform_type == 'log2') or (transform_type == 'log'):
            data_lbl_all = data_lbl_all.apply(np.log2)
        elif transform_type == 'plog':
            data_lbl_all = data_lbl_all.apply(plog)
        else:
            logging.error("trnaform_type not valid: %s" % transform_type)
            sys.exist()
        data_lbl_col = 'NiA_tran'
        data_lbl_all_fh = '%s/%s.csv' % (data_lbl_all_dh, data_lbl_col)
        data_lbl_all.to_csv(data_lbl_all_fh)

        for col in data_lbl_all:
            data_lbl_fn, tmp = col.split('.')
            data_lbl_fh = '%s/data_lbl/%s/%s' % (prj_dh, type_form,
                                                 data_lbl_fn)
            data_lbl = pd.read_csv(data_lbl_fh).set_index('mutids')
            if not data_lbl_col in data_lbl:
                data_lbl_cols = data_lbl.columns.tolist()
                data_lbl = pd.concat([data_lbl, data_lbl_all.loc[:, col]],
                                     axis=1)
                data_lbl.columns = data_lbl_cols + [data_lbl_col]
                data_lbl.index.name = 'mutids'
                data_lbl.to_csv(data_lbl_fh)
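
The heart of the function above is the element-wise transform of the combined count table. A minimal sketch of the 'log2' branch on a toy table follows; plog is a dms2dfe helper and is not reproduced here.

import numpy as np
import pandas as pd

# Normalised counts, one column per data_lbl table (toy data).
counts = pd.DataFrame({'sampleA.NiA_norm': [4.0, 16.0, 1.0],
                       'sampleB.NiA_norm': [8.0, 2.0, 32.0]},
                      index=pd.Index(['A1V', 'A1G', 'A1T'], name='mutids'))
# Element-wise log2, i.e. the transform_type == 'log2' branch (NiA_norm -> NiA_tran).
transformed = counts.apply(np.log2)
print(transformed)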
Example #3
def transform_data_lbl(prj_dh,transform_type,
                      type_form='aas',data_lbl_col='NiA_norm',):
    """
    Transformation of counts of mutants in the data_lbl table

    :param prj_dh: path to the project directory
    :param transform_type: type of transformation (log, plog, glog etc.)
    :returns data_lbl: data_lbl with transformed counts
    """
    data_lbl_fhs=glob("%s/data_lbl/aas/*" % prj_dh)
    if len(data_lbl_fhs)>0:
        col_sep="."
        data_lbl_all=fhs2data_combo(data_lbl_fhs,cols=[data_lbl_col],index='mutids',col_sep=col_sep)
        data_lbl_all_dh='%s/data_lbl/%s_all' % (prj_dh,type_form)
        if not exists(data_lbl_all_dh):
            makedirs(data_lbl_all_dh)
        data_lbl_all_fh='%s/%s.csv' % (data_lbl_all_dh,data_lbl_col)
        data_lbl_all.to_csv(data_lbl_all_fh)

        if (transform_type=='log2') or (transform_type=='log'):
            data_lbl_all=data_lbl_all.apply(np.log2)
        elif transform_type=='plog':
            data_lbl_all=data_lbl_all.apply(plog)
        else:
            logging.error("trnaform_type not valid: %s" % transform_type)
            sys.exist()
        data_lbl_col='NiA_tran'
        data_lbl_all_fh='%s/%s.csv' % (data_lbl_all_dh,data_lbl_col)
        data_lbl_all.to_csv(data_lbl_all_fh)
        
        for col in data_lbl_all:
            data_lbl_fn,tmp=col.split('.')
            data_lbl_fh='%s/data_lbl/%s/%s' % (prj_dh,type_form,data_lbl_fn)
            data_lbl=pd.read_csv(data_lbl_fh).set_index('mutids')
            if not data_lbl_col in data_lbl:
                data_lbl_cols=data_lbl.columns.tolist()
                data_lbl=pd.concat([data_lbl,
                                    data_lbl_all.loc[:,col]],axis=1)
                data_lbl.columns=data_lbl_cols+[data_lbl_col]
                data_lbl.index.name='mutids'
                data_lbl.to_csv(data_lbl_fh)
Example #4
def make_deseq2_count(unsel,sel,data_deseq2_annot,data_lbl_col,prj_dh,type_form='aas'):
    """
    Makes DESeq2 count file

    :param unsel: unselected condition
    :param sel: selected condition
    :param prj_dh: path to the project directory
    :param data_lbl_col: column of the data_lbl pandas table to be used 
    :param data_deseq2_annot: pandas table with annotation information 
    """
    data_deseq2_count_dh='%s/data_fit/%s_all' % (prj_dh,type_form)
    data_deseq2_count_fh='%s/%s_WRT_%s.deseq2_count.csv' % (data_deseq2_count_dh,sel,unsel)
    if not exists(data_deseq2_count_fh):
        data_lbl_fhs=["%s/data_lbl/%s/%s" % (prj_dh,type_form,s.split('.')[0]) for s in data_deseq2_annot.index]
        col_sep="."
        data_lbl_all=fhs2data_combo(data_lbl_fhs,cols=[data_lbl_col],index='mutids',col_sep=col_sep)
        data_lbl_all=debad(data_lbl_all,axis=0,condi='any',bad='nan')
        data_lbl_all.to_csv(data_deseq2_count_fh)
    else:
        data_lbl_all=pd.read_csv(data_deseq2_count_fh).set_index('mutids')
    return data_lbl_all,data_deseq2_count_fh
Example #5
def data_lbl2data_fit_lite(fits_pairs,prj_dh,data_lbl_dh,data_fit_dh,force=False):
    """
    Short wrapper for conversion of mutation counts to fold changes

    :param fits_pairs: list with the pair of reference and selected conditions
    :param prj_dh: path to the project directory
    :param data_lbl_dh: path to the directory containing data_lbl csv tables
    :param data_fit_dh: path to the directory of data_fit csvs
    """
    data_fit_fh='%s/%s/aas/%s_WRT_%s' % (prj_dh,data_fit_dh,fits_pairs[1],fits_pairs[0])
    if not exists(data_fit_fh) or force:
        data_lbl_fhs=['%s/%s/aas/%s' % (prj_dh,data_lbl_dh,s) for s in fits_pairs]

        data_fit=fhs2data_combo(data_lbl_fhs,cols=['NiA_tran'],
        #                labels=['ref','sel'],
                       index='mutids',col_sep='.')
        data_fit.columns=['NiA_tran.ref','NiA_tran.sel']
        data_fit.loc[:,'FCA']=data_fit.loc[:,'NiA_tran.sel']-data_fit.loc[:,'NiA_tran.ref']
        data_fit.loc[:,'FCA_norm']=data_fit.loc[:,'NiA_tran.sel']-data_fit.loc[:,'NiA_tran.ref']
        if not exists(dirname(data_fit_fh)):
            makedirs(dirname(data_fit_fh))
        data_fit.to_csv(data_fit_fh)
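
Because the NiA_tran columns hold already transformed (log-scale) counts, the fold change FCA above reduces to a column difference. A toy illustration of that arithmetic, with hypothetical values:

import pandas as pd

data_fit = pd.DataFrame({'NiA_tran.ref': [3.0, 5.0, 2.0],
                         'NiA_tran.sel': [4.0, 3.5, 2.0]},
                        index=pd.Index(['A1V', 'A1G', 'A1T'], name='mutids'))
# Log-scale fold change: selected minus reference.
data_fit['FCA'] = data_fit['NiA_tran.sel'] - data_fit['NiA_tran.ref']
print(data_fit)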
Example #6
def corrplot(info):
    """
    Plots a correlation matrix heatmap between range of features and fold change values

    :param info: configuration object with the information of the experiment
    """

    from dms2dfe.lib.io_dfs import fhs2data_combo
    from glob import glob
    from dms2dfe.lib.plot_mut_data_heatmaps import clustermap
    from dms2dfe.lib.io_ml_data import make_dXy

    ml_input = info.ml_input
    prj_dh = info.prj_dh
    data_fit_fhs = glob('%s/data_fit/aas/*' % prj_dh)
    data_feats_all_fh = '%s/data_feats/aas/data_feats_all' % prj_dh
    data_feats_all = pd.read_csv(data_feats_all_fh).set_index('mutids')
    data_fit_all = fhs2data_combo(data_fit_fhs, ['%sA' % ml_input], 'mutids')
    data_fit_all.columns = [c.split(': ')[0] for c in data_fit_all]

    for c in data_fit_all:
        plot_fh = '%s/plots/aas/%s.corr.pdf' % (prj_dh, c)
        if not exists(plot_fh):
            if not exists(dirname(plot_fh)):
                makedirs(dirname(plot_fh))
            dXy = data_feats_all.join(data_fit_all[c])
            dXy, Xcols, ycol = make_dXy(dXy,
                                        ycol=c,
                                        if_rescalecols=False,
                                        unique_quantile=0.25)
            dXy, Xcols, ycol = feats_sel_corr(dXy, ycol, range_coef=[0.9, 0.8])
            g, ax = clustermap(
                dXy.corr(method='spearman'),
                highlight_col=c,
                vlim=[-0.5, 0.5],
                figsize=[10, 10],
                plot_fh=plot_fh,
            )
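
The clustermap used above is a dms2dfe wrapper; a rough standalone equivalent of the final plotting step, built directly on seaborn.clustermap with random toy features, could look like this:

import numpy as np
import pandas as pd
import seaborn as sns

rng = np.random.default_rng(0)
# Toy feature matrix plus one fold-change column.
dXy = pd.DataFrame(rng.normal(size=(50, 4)),
                   columns=['feat1', 'feat2', 'feat3', 'FCA'])
# Clustered heatmap of pairwise Spearman correlations, clipped to [-0.5, 0.5].
g = sns.clustermap(dXy.corr(method='spearman'),
                   vmin=-0.5, vmax=0.5, figsize=(6, 6))
g.savefig('FCA.corr.pdf')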
Example #7
def data_lbl2data_fit_lite(fits_pairs,
                           prj_dh,
                           data_lbl_dh,
                           data_fit_dh,
                           force=False):
    """
    Short wrapper for conversion of mutation counts to fold changes

    :param fits_pairs: list with the pair of reference and selected conditions
    :param prj_dh: path to the project directory
    :param data_lbl_dh: path to the directory containing data_lbl csv tables
    :param data_fit_dh: path to the directory of data_fit csvs
    """
    data_fit_fh = '%s/%s/aas/%s_WRT_%s' % (prj_dh, data_fit_dh, fits_pairs[1],
                                           fits_pairs[0])
    if not exists(data_fit_fh) or force:
        data_lbl_fhs = [
            '%s/%s/aas/%s' % (prj_dh, data_lbl_dh, s) for s in fits_pairs
        ]

        data_fit = fhs2data_combo(
            data_lbl_fhs,
            cols=['NiA_tran'],
            #                labels=['ref','sel'],
            index='mutids',
            col_sep='.')
        data_fit.columns = ['NiA_tran.ref', 'NiA_tran.sel']
        data_fit.loc[:, 'FCA'] = (data_fit.loc[:, 'NiA_tran.sel'] -
                                  data_fit.loc[:, 'NiA_tran.ref'])
        data_fit.loc[:, 'FCA_norm'] = (data_fit.loc[:, 'NiA_tran.sel'] -
                                       data_fit.loc[:, 'NiA_tran.ref'])
        if not exists(dirname(data_fit_fh)):
            makedirs(dirname(data_fit_fh))
        data_fit.to_csv(data_fit_fh)
Example #8
def plot_data_comparison_multiviolin(
        prj_dh,
        data_fits,
        col,
        data_fiti_ctrl=0,
        aasORcds="aas",
        ns=True,
        numeric=False,
        color_test=(0.6, 0.8, 1),  #'mediumpurple'#"lime"
        color_ctrl="lightgray",
        data_fits_labels=None,
        pval=True,
        stars=True,
        violin_width=0.9,
        color_xticks=None,
        force=False,
        ylims=None,
        col_hue='Conditions',
        label_test='Test',
        label_ctrl='Control',
        ylabel=None,
        figsize=[4, 3],
        plot_fh=None):
    """
    Plots distributions comparing fold change data

    :param prj_dh: path to project directory
    :param data_fits: list of data_fit table names; the entry at index data_fiti_ctrl (by default the first) is the control
    :param col: column with the fold change data
    """
    data_fit_fhs = [
        "%s/data_fit/%s/%s" % (prj_dh, aasORcds, s) for s in data_fits
    ]
    if plot_fh != None:
        data_comparison_ctrl_fh = plot_fh + "data_comparison_ctrl.csv"
        data_comparison_test_fh = plot_fh + "data_comparison_test.csv"
    else:
        data_comparison_ctrl_fh = "data_comparison_ctrl.csv"
        data_comparison_test_fh = "data_comparison_test.csv"
    if (not exists(data_comparison_ctrl_fh)) or\
        (not exists(data_comparison_test_fh)) or force:

        data_fit_test_fhs = [fh for i, fh in enumerate(data_fit_fhs) if i != 0]
        data_fit_ctrl_fhs = [fh for i, fh in enumerate(data_fit_fhs) if i == 0]
        data_comparison_test = fhs2data_combo(data_fit_test_fhs, [col],
                                              index='mutids')
        data_comparison_ctrl = fhs2data_combo(data_fit_ctrl_fhs, [col],
                                              index='mutids')

        data_comparison_test.columns = data_fits[1:]
        data_comparison_ctrl.columns = data_fits[1:]
        #data_comparison_test.to_csv(data_comparison_test_fh)
        #data_comparison_ctrl.to_csv(data_comparison_ctrl_fh)
    else:
        data_comparison_test = pd.read_csv(data_comparison_test_fh).set_index(
            'mutids')
        data_comparison_ctrl = pd.read_csv(data_comparison_ctrl_fh).set_index(
            'mutids')

    data_all = pd.concat([
        data_comparison_ctrl.loc[:, data_comparison_ctrl.columns.tolist()[0]],
        data_comparison_test
    ], axis=1)
    data_all.columns = data_fits

    data_comparison_test = data_comparison_test.unstack().reset_index()
    data_comparison_ctrl = data_comparison_ctrl.unstack().reset_index()

    data_comparison_test.columns = ["condi", "mutids", col]
    data_comparison_ctrl.columns = ["condi", "mutids", col]

    data_comparison_test.loc[:, col_hue] = label_test
    data_comparison_ctrl.loc[:, col_hue] = label_ctrl
    data_comparison = pd.concat([data_comparison_test, data_comparison_ctrl])

    y_max = data_comparison.loc[:, col].max()
    y_min = data_comparison.loc[:, col].min()

    data_fit_ctrl = data_fits[data_fiti_ctrl]
    data_fit_tests = [
        s for s in data_fits if data_fits.index(s) != data_fiti_ctrl
    ]

    # fig=
    plt.figure(figsize=figsize, dpi=300)
    ax = plt.subplot(111)
    import seaborn as sns
    from scipy.stats import mannwhitneyu, wilcoxon
    #     plt.style.use('seaborn-whitegrid')
    #     plt.style.use('seaborn-white')
    sns.violinplot(
        x="condi",
        y=col,
        hue=col_hue,
        split=True,
        data=data_comparison,
        #                    color='m',
        palette={
            label_test: color_test,
            label_ctrl: color_ctrl
        },
        inner="quartile",
        width=violin_width,
        cut=0,
        # bw=0.1,
        scale="width",
        ax=ax)
    plt.legend(
        title=col_hue,
        loc='upper center',
        bbox_to_anchor=(0.5, 1.1),  #, 1., .102),
        ncol=2,
        borderaxespad=0.1,
        frameon=True,
    )
    ax.set_xlabel("")
    if not ylabel is None:
        ax.set_ylabel(ylabel)
#     ax.grid(b=True)
    ax.yaxis.grid(True)
    if data_fits_labels != None:
        ax.set_xticklabels(data_fits_labels[1:])
    if ylabel != None:
        ax.set_ylabel(ylabel)
        plt.tight_layout()
    if ylims is None:
        ax.set_ylim([y_min, y_max])
    else:
        ax.set_ylim([ylims[0], ylims[1]])


#     return data_comparison,data_all
    if pval:
        col_ctrl = data_all.columns.tolist()[0]
        col_tests = data_all.columns.tolist()[1:]
        for col_testi in range(len(col_tests)):
            col_test = col_tests[col_testi]
            pval, side = get_wilcoxon(data_all,
                                      col_ctrl,
                                      col_test,
                                      side='one',
                                      denan=False)
            side = ''  #stitch
            # print pval
            if stars:
                pval = pval2stars(pval, ns=ns, numeric=False)
            if ns and (pval > 0.05):
                side = ''
                pval = ''
                result = "%s\n%s" % (side, pval)
            else:
                result = "%s\n%s" % (side, pval)
            # print result
            # print ax.get_ylim()[0]+(ax.get_ylim()[1]-ax.get_ylim()[0])*0.05
            ax.text(
                col_testi,
                ax.get_ylim()[0] +
                (ax.get_ylim()[1] - ax.get_ylim()[0]) * 0.05,
                result,
                ha='center',
                color='b',
                bbox=dict(
                    facecolor='w',
                    edgecolor='none',
                    # boxstyle='round',
                    alpha=0.6,
                ))
        # data_all.to_csv('test1.csv')
    if plot_fh != None:
        #plt.tight_layout()
        plt.savefig(plot_fh, format='pdf')
        plt.clf()
        plt.close()
    return ax, data_comparison, data_all
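
For context, a self-contained sketch of the split-violin comparison that the function builds, using toy long-format data with the condi / value / hue columns that sns.violinplot expects (all names and values hypothetical):

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
# Two test conditions, each with paired Test/Control fold-change values.
data_comparison = pd.DataFrame({
    'condi': np.repeat(['sel1', 'sel2'], 100),
    'FCA_norm': np.concatenate([rng.normal(0, 1, 100), rng.normal(1, 1, 100)]),
    'Conditions': np.tile(np.repeat(['Test', 'Control'], 50), 2),
})
fig, ax = plt.subplots(figsize=(4, 3))
sns.violinplot(x='condi', y='FCA_norm', hue='Conditions', data=data_comparison,
               split=True, inner='quartile', cut=0, ax=ax)
ax.set_xlabel('')
plt.tight_layout()
plt.savefig('comparison.pdf', format='pdf')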
Example #9
def transform_data_lbl_deseq(prj_dh,transform_type,rscript_fh,type_form='aas'):
    """
    Transformation of counts of mutants in the data_lbl table using DESeq2

    :param prj_dh: path to the project directory
    :param transform_type: type of transformation (rlog or vst)
    :returns data_lbl: data_lbl with transformed counts
    """
    data_lbl_fhs=glob("%s/data_lbl/aas/*" % prj_dh)
    data_lbl_col='NiA_norm'
    col_sep="."
    data_lbl_all=fhs2data_combo(data_lbl_fhs,cols=[data_lbl_col],index='mutids',col_sep=col_sep)
    data_lbl_all_dh='%s/data_lbl/%s_all' % (prj_dh,type_form)
    if not exists(data_lbl_all_dh):
        makedirs(data_lbl_all_dh)
    data_lbl_all_fh='%s/%s.csv' % (data_lbl_all_dh,data_lbl_col)
    # data_lbl_all=debad(data_lbl_all,axis=0,condi='any',bad='nan')
    # #psudocount to avoid all zero error
    # data_lbl_all=data_lbl_all.fillna(0)
    # data_lbl_all=data_lbl_all+0.5 
    data_lbl_all.to_csv(data_lbl_all_fh)

    data_lbl_tran_col='NiA_tran'
    data_lbl_all_tran_fh='%s/%s.csv' % (data_lbl_all_dh,data_lbl_tran_col)
    if not exists(data_lbl_all_tran_fh):
        data_lbl_all_tran=pd.DataFrame(index=data_lbl_all.index)
        data_lbl_all_tran.index.name='mutids'
        repli=pd.read_csv('%s/cfg/repli' % prj_dh).set_index('varname')
        for avg in repli.index:
            avg_col="%s%s%s" % (avg,col_sep,data_lbl_col)
            if transform_type=='rlog':
                data_lbl_tran_fh="%s/%s.deseq2_annot.csv.deseq2_rld.csv" % (data_lbl_all_dh,avg_col)
            elif transform_type=='vst':
                data_lbl_tran_fh="%s/%s.deseq2_annot.csv.deseq2_vsd.csv" % (data_lbl_all_dh,avg_col)
            if not exists(data_lbl_tran_fh):
                data_deseq2_annot=pd.DataFrame(columns=['condition','type','number of lanes','total number of reads','exon counts'])
                data_deseq2_annot.index.name='file'
                data_deseq2_annot_fh='%s/%s.deseq2_annot.csv' % (data_lbl_all_dh,avg_col)
                data_deseq2_count=pd.DataFrame()
                data_deseq2_count_fh='%s/%s.deseq2_count.csv' % (data_lbl_all_dh,avg_col)
                for rep in repli.loc[avg,:]:
                    if not pd.isnull(rep):
                        rep_col="%s%s%s" % (rep,col_sep,data_lbl_col)
                        data_deseq2_annot.loc[rep_col,'condition']=rep
                        if len(data_deseq2_count)==0:
                            data_deseq2_count=pd.DataFrame(data_lbl_all.loc[:,rep_col].copy())
                        else:
                            data_deseq2_count.loc[:,rep_col]=data_lbl_all.loc[:,rep_col]
                # data_deseq2_annot_index_all=["%s%s%s" % (basename(fh),col_sep,data_lbl_col) \
                #                              for fh in data_lbl_fhs if (basename(fh) not in repli.index)]
                # data_deseq2_annot_index_no_reps=[i for i in data_deseq2_annot_index_all if (i not in data_deseq2_annot.index)]
                # for idx in data_deseq2_annot_index_no_reps:
                #     data_deseq2_annot.loc[idx,'condition']=idx
                data_deseq2_annot.to_csv(data_deseq2_annot_fh)
                data_deseq2_count=debad(data_deseq2_count,axis=0,condi='any',bad='nan')
                data_deseq2_count.to_csv(data_deseq2_count_fh)            
                deseq_fh="%s/deseq2.R" % (abspath(dirname(__file__)))
                log_fh="%s.log" % data_deseq2_annot_fh
                with open(log_fh,'a') as log_f:
                    deseq2_com='%s %s %s %s 1' % (rscript_fh,deseq_fh,data_deseq2_count_fh,data_deseq2_annot_fh)
                    # print deseq2_com
                    subprocess.call(deseq2_com,shell=True,stdout=log_f, stderr=subprocess.STDOUT)              
            data_lbl_tran=pd.read_csv(data_lbl_tran_fh).set_index('Unnamed: 0')
            data_lbl_tran.index.name='mutids'
            if len(data_lbl_all_tran)==0:
                data_lbl_all_tran=data_lbl_tran.copy()
            else:
                data_lbl_all_tran=data_lbl_all_tran.join(data_lbl_tran)
            # print data_lbl_all_tran.columns.tolist()
            # print len(data_lbl_all_tran)
        if len(data_lbl_all_tran.columns.tolist())>0:
            data_lbl_all_tran.to_csv(data_lbl_all_tran_fh)
        else:
            logging.error('transform_type can not be %s: no replicates found' % transform_type)
            sys.exit()
    else:        
        data_lbl_all_tran=pd.read_csv(data_lbl_all_tran_fh).set_index('mutids')

    data_lbl_tran_col='NiA_tran'    
    for col in data_lbl_all_tran:
        data_lbl_fn,tmp=col.split('.')
        data_lbl_fh='%s/data_lbl/%s/%s' % (prj_dh,type_form,data_lbl_fn)
        data_lbl=pd.read_csv(data_lbl_fh).set_index('mutids')
        if not data_lbl_tran_col in data_lbl:
            data_lbl_cols=data_lbl.columns.tolist()
            data_lbl=data_lbl.join(data_lbl_all_tran.loc[:,col])
            data_lbl.columns=data_lbl_cols+[data_lbl_tran_col]
            data_lbl.index.name='mutids'
            data_lbl.to_csv(data_lbl_fh)
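
The DESeq2 transformation itself is delegated to an R script via subprocess; the calling pattern, with stdout and stderr appended to a per-run log, is sketched below (all paths are hypothetical).

import subprocess

rscript_fh = '/usr/bin/Rscript'            # assumed location of the Rscript binary
deseq_fh = 'deseq2.R'                      # the DESeq2 driver script shipped with dms2dfe
data_deseq2_count_fh = 'avg.NiA_norm.deseq2_count.csv'
data_deseq2_annot_fh = 'avg.NiA_norm.deseq2_annot.csv'

deseq2_com = '%s %s %s %s 1' % (rscript_fh, deseq_fh,
                                data_deseq2_count_fh, data_deseq2_annot_fh)
with open('%s.log' % data_deseq2_annot_fh, 'a') as log_f:
    # Append both streams of the R run to the log file.
    subprocess.call(deseq2_com, shell=True, stdout=log_f,
                    stderr=subprocess.STDOUT)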
Example #10
def transform_data_lbl_deseq(prj_dh,
                             transform_type,
                             rscript_fh,
                             type_form='aas'):
    """
    Transformation of counts of mutants in the data_lbl table using DESeq2

    :param prj_dh: path to the project directory
    :param transform_type: type of transformation (rlog or vst)
    :returns data_lbl: data_lbl with transformed counts
    """
    data_lbl_fhs = glob("%s/data_lbl/aas/*" % prj_dh)
    data_lbl_col = 'NiA_norm'
    col_sep = "."
    data_lbl_all = fhs2data_combo(data_lbl_fhs,
                                  cols=[data_lbl_col],
                                  index='mutids',
                                  col_sep=col_sep)
    data_lbl_all_dh = '%s/data_lbl/%s_all' % (prj_dh, type_form)
    if not exists(data_lbl_all_dh):
        makedirs(data_lbl_all_dh)
    data_lbl_all_fh = '%s/%s.csv' % (data_lbl_all_dh, data_lbl_col)
    # data_lbl_all=debad(data_lbl_all,axis=0,condi='any',bad='nan')
    # #psudocount to avoid all zero error
    # data_lbl_all=data_lbl_all.fillna(0)
    # data_lbl_all=data_lbl_all+0.5
    data_lbl_all.to_csv(data_lbl_all_fh)

    data_lbl_tran_col = 'NiA_tran'
    data_lbl_all_tran_fh = '%s/%s.csv' % (data_lbl_all_dh, data_lbl_tran_col)
    if not exists(data_lbl_all_tran_fh):
        data_lbl_all_tran = pd.DataFrame(index=data_lbl_all.index)
        data_lbl_all_tran.index.name = 'mutids'
        repli = pd.read_csv('%s/cfg/repli' % prj_dh).set_index('varname')
        for avg in repli.index:
            avg_col = "%s%s%s" % (avg, col_sep, data_lbl_col)
            if transform_type == 'rlog':
                data_lbl_tran_fh = "%s/%s.deseq2_annot.csv.deseq2_rld.csv" % (
                    data_lbl_all_dh, avg_col)
            elif transform_type == 'vst':
                data_lbl_tran_fh = "%s/%s.deseq2_annot.csv.deseq2_vsd.csv" % (
                    data_lbl_all_dh, avg_col)
            if not exists(data_lbl_tran_fh):
                data_deseq2_annot = pd.DataFrame(columns=[
                    'condition', 'type', 'number of lanes',
                    'total number of reads', 'exon counts'
                ])
                data_deseq2_annot.index.name = 'file'
                data_deseq2_annot_fh = '%s/%s.deseq2_annot.csv' % (
                    data_lbl_all_dh, avg_col)
                data_deseq2_count = pd.DataFrame()
                data_deseq2_count_fh = '%s/%s.deseq2_count.csv' % (
                    data_lbl_all_dh, avg_col)
                for rep in repli.loc[avg, :]:
                    if not pd.isnull(rep):
                        rep_col = "%s%s%s" % (rep, col_sep, data_lbl_col)
                        data_deseq2_annot.loc[rep_col, 'condition'] = rep
                        if len(data_deseq2_count) == 0:
                            data_deseq2_count = pd.DataFrame(
                                data_lbl_all.loc[:, rep_col].copy())
                        else:
                            data_deseq2_count.loc[:, rep_col] = (
                                data_lbl_all.loc[:, rep_col])
                # data_deseq2_annot_index_all=["%s%s%s" % (basename(fh),col_sep,data_lbl_col) \
                #                              for fh in data_lbl_fhs if (basename(fh) not in repli.index)]
                # data_deseq2_annot_index_no_reps=[i for i in data_deseq2_annot_index_all if (i not in data_deseq2_annot.index)]
                # for idx in data_deseq2_annot_index_no_reps:
                #     data_deseq2_annot.loc[idx,'condition']=idx
                data_deseq2_annot.to_csv(data_deseq2_annot_fh)
                data_deseq2_count = debad(data_deseq2_count,
                                          axis=0,
                                          condi='any',
                                          bad='nan')
                data_deseq2_count.to_csv(data_deseq2_count_fh)
                deseq_fh = "%s/deseq2.R" % (abspath(dirname(__file__)))
                log_fh = "%s.log" % data_deseq2_annot_fh
                with open(log_fh, 'a') as log_f:
                    deseq2_com = '%s %s %s %s 1' % (rscript_fh, deseq_fh,
                                                    data_deseq2_count_fh,
                                                    data_deseq2_annot_fh)
                    # print deseq2_com
                    subprocess.call(deseq2_com,
                                    shell=True,
                                    stdout=log_f,
                                    stderr=subprocess.STDOUT)
            data_lbl_tran = pd.read_csv(data_lbl_tran_fh).set_index(
                'Unnamed: 0')
            data_lbl_tran.index.name = 'mutids'
            if len(data_lbl_all_tran) == 0:
                data_lbl_all_tran = data_lbl_tran.copy()
            else:
                data_lbl_all_tran = data_lbl_all_tran.join(data_lbl_tran)
            # print data_lbl_all_tran.columns.tolist()
            # print len(data_lbl_all_tran)
        if len(data_lbl_all_tran.columns.tolist()) > 0:
            data_lbl_all_tran.to_csv(data_lbl_all_tran_fh)
        else:
            logging.error('transform_type can not be %s: no replicates found' %
                          transform_type)
            sys.exit()
    else:
        data_lbl_all_tran = pd.read_csv(data_lbl_all_tran_fh).set_index(
            'mutids')

    data_lbl_tran_col = 'NiA_tran'
    for col in data_lbl_all_tran:
        data_lbl_fn, tmp = col.split('.')
        data_lbl_fh = '%s/data_lbl/%s/%s' % (prj_dh, type_form, data_lbl_fn)
        data_lbl = pd.read_csv(data_lbl_fh).set_index('mutids')
        if not data_lbl_tran_col in data_lbl:
            data_lbl_cols = data_lbl.columns.tolist()
            data_lbl = data_lbl.join(data_lbl_all_tran.loc[:, col])
            data_lbl.columns = data_lbl_cols + [data_lbl_tran_col]
            data_lbl.index.name = 'mutids'
            data_lbl.to_csv(data_lbl_fh)
Example #11
def plot_data_comparison_multiviolin(prj_dh,data_fits,col,
                                     data_fiti_ctrl=0,
                                     aasORcds="aas",
                                     ns=True,numeric=False,
                                     color_test=(0.6, 0.8, 1),#'mediumpurple'#"lime"
                                     color_ctrl="lightgray",    
                                     data_fits_labels=None,
                                     pval=True,
                                     stars=True,
                                     violin_width=0.9,
                                     color_xticks=None,
                                     force=False,
                                     ylims=None,
                                     col_hue='Conditions',
                                    label_test='Test',
                                    label_ctrl='Control',
                                    ylabel=None,
                                     figsize=[4,3],
                                     plot_fh=None):
    """
    Plots distributions comparing fold change data

    :param prj_dh: path to project directory
    :param data_fits: list of data_fit table names; the entry at index data_fiti_ctrl (by default the first) is the control
    :param col: column with the fold change data
    """
    data_fit_fhs=["%s/data_fit/%s/%s" % (prj_dh,aasORcds,s) for s in data_fits]
    if plot_fh!=None:
        data_comparison_ctrl_fh=plot_fh+"data_comparison_ctrl.csv"
        data_comparison_test_fh=plot_fh+"data_comparison_test.csv"
    else:
        data_comparison_ctrl_fh="data_comparison_ctrl.csv"
        data_comparison_test_fh="data_comparison_test.csv"
    if (not exists(data_comparison_ctrl_fh)) or\
        (not exists(data_comparison_test_fh)) or force:

        data_fit_test_fhs=[fh for i,fh in enumerate(data_fit_fhs) if i!=0]
        data_fit_ctrl_fhs=[fh for i,fh in enumerate(data_fit_fhs) if i==0]
        data_comparison_test=fhs2data_combo(data_fit_test_fhs,[col],index='mutids')
        data_comparison_ctrl=fhs2data_combo(data_fit_ctrl_fhs,[col],index='mutids')

        data_comparison_test.columns=data_fits[1:]    
        data_comparison_ctrl.columns=data_fits[1:]    
        #data_comparison_test.to_csv(data_comparison_test_fh)
        #data_comparison_ctrl.to_csv(data_comparison_ctrl_fh)
    else:    
        data_comparison_test=pd.read_csv(data_comparison_test_fh).set_index('mutids')
        data_comparison_ctrl=pd.read_csv(data_comparison_ctrl_fh).set_index('mutids')

    data_all=pd.concat([data_comparison_ctrl.loc[:,data_comparison_ctrl.columns.tolist()[0]],
                        data_comparison_test],axis=1)
    data_all.columns=data_fits

    data_comparison_test=data_comparison_test.unstack().reset_index()
    data_comparison_ctrl=data_comparison_ctrl.unstack().reset_index()

    data_comparison_test.columns=["condi","mutids",col]
    data_comparison_ctrl.columns=["condi","mutids",col]

    data_comparison_test.loc[:,col_hue]=label_test
    data_comparison_ctrl.loc[:,col_hue]=label_ctrl
    data_comparison=pd.concat([data_comparison_test,data_comparison_ctrl])
        
    y_max=data_comparison.loc[:,col].max()
    y_min=data_comparison.loc[:,col].min()

    data_fit_ctrl=data_fits[data_fiti_ctrl]
    data_fit_tests=[s for s in data_fits if data_fits.index(s)!=data_fiti_ctrl]

    # fig=
    plt.figure(figsize=figsize,dpi=300)
    ax=plt.subplot(111)
    import seaborn as sns
    from scipy.stats import mannwhitneyu,wilcoxon
#     plt.style.use('seaborn-whitegrid')
#     plt.style.use('seaborn-white')
    sns.violinplot(x="condi", y=col, 
                   hue=col_hue,
                   split=True,
                   data=data_comparison,
#                    color='m',
                   palette={label_test: color_test,
                            label_ctrl: color_ctrl},
                   inner="quartile",
                   width=violin_width,
                   cut=0, 
                   # bw=0.1,
                   scale="width",ax=ax)
    plt.legend(title=col_hue,loc='upper center',
               bbox_to_anchor=(0.5, 1.1),#, 1., .102),
               ncol=2, borderaxespad=0.1,frameon=True,
              )
    ax.set_xlabel("")
    if not ylabel is None:
        ax.set_ylabel(ylabel)
#     ax.grid(b=True)
    ax.yaxis.grid(True)
    if data_fits_labels!=None:
        ax.set_xticklabels(data_fits_labels[1:])
    if ylabel!=None:
        ax.set_ylabel(ylabel)
        plt.tight_layout()
    if ylims is None:
        ax.set_ylim([y_min,y_max])
    else:
        ax.set_ylim([ylims[0],ylims[1]])        
#     return data_comparison,data_all
    if pval:    
        col_ctrl=data_all.columns.tolist()[0]
        col_tests=data_all.columns.tolist()[1:]
        for col_testi in range(len(col_tests)):
            col_test=col_tests[col_testi]
            pval,side=get_wilcoxon(data_all,col_ctrl,col_test,side='one',denan=False)
            side='' #stitch
            # print pval
            if stars:
                pval=pval2stars(pval,ns=ns,numeric=False)
            if ns and (pval>0.05):
                side=''
                pval=''
                result="%s\n%s" % (side,pval)
            else:
                result="%s\n%s" % (side,pval)
            # print result
            # print ax.get_ylim()[0]+(ax.get_ylim()[1]-ax.get_ylim()[0])*0.05
            ax.text(col_testi,ax.get_ylim()[0]+(ax.get_ylim()[1]-ax.get_ylim()[0])*0.05,
                    result,ha='center',color='b',
                    bbox=dict(facecolor='w', edgecolor='none', 
                    # boxstyle='round',
                    alpha=0.6,)
                    )
        # data_all.to_csv('test1.csv')
    if plot_fh!=None:
        #plt.tight_layout()
        plt.savefig(plot_fh,format='pdf')
        plt.clf();plt.close()
    return ax,data_comparison,data_all