예제 #1
0
def make_cls_input(data_combo,y_coln_cls,middle_percentile_skipped):
    """
    Make input for classifier

    The target column is converted to a "classes" column by `y2classes`, the
    remaining columns are passed through `X_cols2binary` and `rescalecols`, and
    the rows are split into a training table (rows kept by `denan`) and a test
    table (every other row).

    :param data_combo: pandas dataframe containing training data
    :param y_coln_cls: name of column containing target values
    :param middle_percentile_skipped: skip middle percentile at boundary while creating classes [0,..,1] 
    :returns: tuple (data_combo, data_ml, data_cls_train, data_cls_tests);
              data_combo and data_ml have been passed through `set_index(..., "mutids")`
    """
    data_ml=y2classes(data_combo,y_coln_cls,
                       middle_percentile_skipped=middle_percentile_skipped)
    # from here on the target is the "classes" column, not the raw values
    data_ml=data_ml.drop(y_coln_cls,axis=1)
    y_coln_cls="classes"
    # NOTE(review): the ids are captured before `set_index` is applied, so this
    # presumably relies on `y2classes` returning a mutids-indexed table - confirm
    data_ml_mutids=list(data_ml.index)
    data_combo=set_index(data_combo,"mutids")
    data_ml=set_index(data_ml,"mutids")
    # keep the class labels aside so that only predictors are binarised
    y=data_ml.loc[:,y_coln_cls]
    data_ml=X_cols2binary(data_ml.drop(y_coln_cls,axis=1))
    data_ml.loc[:,y_coln_cls]=y
    data_ml=rescalecols(data_ml)
    data_cls_train=denan(data_ml,axis='both',condi='all any')

    # a set gives O(1) membership tests while building the test split
    data_cls_train_mutids=set(data_cls_train.index.values)
    data_cls_tests_mutids=[mutid for mutid in data_ml_mutids if mutid not in data_cls_train_mutids]
    data_cls_tests=data_ml.loc[data_cls_tests_mutids,:]
    return data_combo,data_ml,data_cls_train,data_cls_tests
예제 #2
0
def concat_feats(data_feats_all,data_feats,col_index):
    """
    Join position wise features onto the table of mutation wise features.

    Both tables are passed through `set_index` with `col_index` before the join.

    :param data_feats_all: mutation wise features
    :param data_feats: position wise features
    :param col_index: column handed to `set_index` for both tables
    :returns: `data_feats_all` joined with `data_feats`
    """
    mut_wise=set_index(data_feats_all,col_index)
    pos_wise=set_index(data_feats,col_index)
    return mut_wise.join(pos_wise)
예제 #3
0
def plot_submap(info,data_fit_fhs=None,plot_type="submap",
               feats_tup=["Mutant amino acid's Solvent Accessible Surface Area",'Solvent Accessible Surface Area',],
                feats_labels= ['SASA','SASA'],
                ):
    """
    Call `make_plot_cluster_sub_matrix` for every data_fit file that has no plot yet.

    :param info: object exposing `prj_dh` (project directory)
    :param data_fit_fhs: paths of data_fit files; when None they are collected
                         with `get_fhs` from `<prj_dh>/data_fit/aas/`
    :param plot_type: tag used in the name of the output pdf
    :param feats_tup: not used by this function
    :param feats_labels: labels forwarded to `make_plot_cluster_sub_matrix`
    """
    type_form='aas'
    feats_fh='%s/data_feats/aas/data_feats_all' % info.prj_dh
    feats=pd.read_csv(feats_fh).set_index('mutids')
    feats=set_index(feats,col_index='mutids')
    if data_fit_fhs is None:
        data_fit_fhs=get_fhs('%s/data_fit/aas/' % info.prj_dh,
                            include='_WRT_',exclude='_inferred')
    for fit_fh in data_fit_fhs:
        fit_fn=basename(fit_fh)
        data_plot=pd.read_csv(fit_fh).join(feats)
        plot_fh="%s/plots/%s/%s.%s.pdf" % (info.prj_dh,type_form,fit_fn,plot_type) 
        if exists(plot_fh):
            # plot is already on disk
            continue
        # row/column annotation tracks are copies of FiA with gaps set to 0
        for track in ['FiAcol','FiArow']:
            data_plot.loc[:,track]=data_plot.loc[:,'FiA']
            data_plot.loc[:,track]=data_plot.loc[:,track].fillna(0)
        make_plot_cluster_sub_matrix(data_plot,
                                     'FiA',[[0,1]],
                                     feats=['FiAcol','FiArow'],
                                     feats_labels=feats_labels,
                                     row_cluster=False,
                                     col_cluster=False,
                                     plot_fh=plot_fh,
                                     test=True,
                                     )
예제 #4
0
def rescale_fitnessbysynonymous(data_fit,col_fit="FCA_norm",col_fit_rescaled="FiA",syn2nan=True):
    """
    Rescale fold changes by the fold change of synonymous mutations at that position

    For every value of the `refrefi` column, the `col_fit` value of the row
    with `mut == ref` is subtracted from the `col_fit` values of all rows at
    that position and stored in `col_fit_rescaled`.

    :param data_fit: pandas table with fold change values
    :param col_fit: column holding the values to be rescaled
    :param col_fit_rescaled: column that receives the rescaled values
    :param syn2nan: if True, the rescaled value of synonymous rows is set to NaN
    :returns: `data_fit` with `col_fit_rescaled` filled; the column is all NaN
              when no synonymous row has a non-null `col_fit` value
    """
    is_syn=data_fit.loc[:,'mut']==data_fit.loc[:,'ref']
    if sum(~pd.isnull(data_fit.loc[is_syn,col_fit]))==0:
        logging.info('no synonymous mutations available')
        data_fit.loc[:,col_fit_rescaled]=np.nan
        return data_fit

    data_fit=set_index(data_fit,'mutids')
    # When the output column already exists the values are computed in a
    # scratch column and copied over afterwards.
    use_tmp=col_fit_rescaled in data_fit.columns
    col_work="tmp" if use_tmp else col_fit_rescaled
    if not "refrefi" in data_fit:
        data_fit.loc[:,'refrefi']\
        =mutids_converter(data_fit.reset_index().loc[:,'mutids'],
                          'refrefi','aas')        
    for refrefi in data_fit.loc[:,"refrefi"].unique():
        at_posi=data_fit.loc[:,"refrefi"]==refrefi
        data_fit_posi=data_fit.loc[at_posi,:]
        # float() only accepts a single value, i.e. exactly one synonymous row
        # per position is expected here
        FiS=float(data_fit_posi.loc[data_fit_posi.loc[:,"mut"]==data_fit_posi.loc[:,"ref"],col_fit])
        # one masked assignment per position instead of one .loc call per row
        data_fit.loc[at_posi,col_work]=(data_fit_posi.loc[:,col_fit]-FiS).values
    if use_tmp:
        data_fit.loc[:,col_fit_rescaled]=data_fit.loc[:,"tmp"]
        data_fit=data_fit.drop("tmp",axis=1)
    if syn2nan:
        # always blank the real output column (never the dropped scratch one)
        data_fit.loc[(data_fit.loc[:,'ref']==data_fit.loc[:,'mut']),col_fit_rescaled]=np.nan
    return data_fit
예제 #5
0
def make_GLM_norm(data_lbl_ref_fn,data_lbl_sel_fn,data_fit,info):
    """
    Wrapper for DESeq2 mediated GLM normalization

    Builds the DESeq2 annotation and count inputs with `make_deseq2_annot` and
    `make_deseq2_count`, runs `deseq2.R` (expected next to this module) through
    `info.rscript_fh` unless the result file already exists, and joins the
    DESeq2 results onto `data_fit`.

    :param data_lbl_ref_fn: reference condition identifier handed to `make_deseq2_annot`/`make_deseq2_count`
    :param data_lbl_sel_fn: selected condition identifier handed to `make_deseq2_annot`/`make_deseq2_count`
    :param data_fit: pandas table with fold change values; must contain the
                     columns 'pval', 'stat' and 'padj', which are replaced
    :param info: object exposing `prj_dh` and `rscript_fh`
    :returns: `data_fit` joined with the DESeq2 result columns
    :raises SystemExit: if the count table has exactly two columns (no replicates)
    """
    data_lbl_col='NiA_norm'
    data_deseq2_annot,data_deseq2_annot_fh=make_deseq2_annot(data_lbl_ref_fn,data_lbl_sel_fn,
                                                             data_lbl_col,info.prj_dh)
    data_deseq2_count,data_deseq2_count_fh=make_deseq2_count(data_lbl_ref_fn,data_lbl_sel_fn,
                                                             data_deseq2_annot,data_lbl_col,info.prj_dh)
    data_deseq2_count=set_index(data_deseq2_count,'mutids')
    if len(data_deseq2_count.columns)==2:
        logging.error('transform_type can not be GLM: no replicates found')
        sys.exit()
    log_fh="%s.log" % data_deseq2_annot_fh
    data_deseq2_res_fh="%s.deseq2_res.csv" % data_deseq2_annot_fh
    if not exists(data_deseq2_res_fh):
        deseq_fh="%s/deseq2.R" % (abspath(dirname(__file__)))
        with open(log_fh,'a') as log_f:
            # NOTE(review): the command is assembled as a shell string from
            # configured paths; paths containing spaces would break it
            com='%s %s %s %s 2' % (info.rscript_fh,deseq_fh,data_deseq2_count_fh,data_deseq2_annot_fh)
            subprocess.call(com,shell=True,stdout=log_f, stderr=subprocess.STDOUT)  
    try:
        data_deseq2_res=pd.read_csv(data_deseq2_res_fh).set_index('Unnamed: 0')
    except Exception:
        logging.error('check deseq2 log for more info: %s' % basename(log_fh))
        logging.error("check if deseq2 is installed.")
        # without the result table nothing below can work; surface the real
        # error instead of failing later on an unbound variable
        raise
    data_deseq2_res.index.name='mutids'
#     baseMean  log2FoldChange  lfcSE   stat    pvalue  padj
    test='Waldtest'
    multitest='fdr_bh'              
    col_test_pval="pval %s" % test
    col_test_stat="stat %s" % test
    col_multitest_pval="padj %s %s" % (test,multitest)

    # give the DESeq2 columns test-specific names ...
    data_deseq2_res=data_deseq2_res.rename(columns={'pvalue':col_test_pval,
                                                    'stat':col_test_stat,
                                                    'padj':col_multitest_pval})

    # ... and expose them again under the generic names (wald as default stat)
    data_deseq2_res.loc[:,'pval']=data_deseq2_res.loc[:,col_test_pval]
    data_deseq2_res.loc[:,'stat']=data_deseq2_res.loc[:,col_test_stat]
    data_deseq2_res.loc[:,'padj']=data_deseq2_res.loc[:,col_multitest_pval]    

    data_deseq2_res.loc[:,'FCA_norm']=data_deseq2_res.loc[:,'log2FoldChange']
    data_fit=data_fit.drop(['pval','stat','padj'],axis=1)
    data_fit=data_fit.join(data_deseq2_res)
    return data_fit
예제 #6
0
def class_comparison(dA,dB):
    """
    Label mutations by the difference in fitness (`FiA`) between two samples.

    The cutoff is obtained from `data_fit2cutoffs` on the joined replicate
    tables of both samples; differences above the cutoff are labelled
    "positive" and differences below the negated cutoff "negative".
    No other label is assigned here.

    :param dA: data_fit table of the test sample (used with suffix `_test`)
    :param dB: data_fit table of the control sample (used with suffix `_ctrl`)
    :returns: the `class_comparison` column of the joined replicate table
    """
    dA=set_index(dA,'mutids')
    dB=set_index(dB,'mutids')
    reps_test=get_repli_FiA(dA)
    reps_ctrl=get_repli_FiA(dB)
    dc=reps_test.join(reps_ctrl,lsuffix='_test',rsuffix='_ctrl')
    up=data_fit2cutoffs(dc,sA='_reps_test',sB='_reps_ctrl',N=False)
    dw=-1*up

    # per-mutation difference, flattened so that mutids is a regular column
    diff=dA.loc[:,'FiA']-dB.loc[:,'FiA']
    diff.index.name='mutids'
    diff=diff.reset_index()
    is_up=diff.loc[:,'FiA']>up
    is_dw=diff.loc[:,'FiA']<dw

    for label,hits in (("positive",is_up),("negative",is_dw)):
        dc.loc[diff.loc[hits,'mutids'].tolist(),'class_comparison']=label
    return dc.loc[:,'class_comparison']
예제 #7
0
def data_comparison2scatter_mutilayered(data,data_label,color_dots=None,
                                         mutids_heads=None,mutids_tails=None,
                                        col_filter=None,
                                        note_text=None,
                                        col_pvals=None,
                                        repel=0.045,
                                        figsize=None,
                                        plot_fh=None):
    """
    Wrapper to plot multi layered scatter plot

    The columns `Fi_test` and `Fi_ctrl` are copied to columns named after the
    two labels derived from `data_label` and handed to
    `plot_scatter_mutilayered` (control on the first axis argument, test on
    the second).

    :param data: pandas dataframe with columns `mut`, `ref`, `Fi_test`, `Fi_ctrl`
    :param data_label: label of the data; split on ' versus ' by `splitlabel`
    :param color_dots: forwarded to `plot_scatter_mutilayered`
    :param mutids_heads: forwarded to `plot_scatter_mutilayered`; defaults to an empty list
    :param mutids_tails: forwarded to `plot_scatter_mutilayered`; defaults to an empty list
    :param col_filter: name of a column used to index `data` (see note in the body)
    :param note_text: if given, appended in parentheses to both labels
    :param col_pvals: if given, this column is log10 transformed and forwarded as `col_z_mutations`
    :param repel: forwarded to `plot_scatter_mutilayered`
    :param figsize: figure size; defaults to [15,5]
    :param plot_fh: forwarded to `plot_scatter_mutilayered`
    """
    from dms2dfe.lib.io_strs import splitlabel

    # fresh containers per call instead of shared mutable defaults
    if mutids_heads is None:
        mutids_heads=[]
    if mutids_tails is None:
        mutids_tails=[]
    if figsize is None:
        figsize=[15,5]

    data=set_index(data,'mutids')
    labels=splitlabel(data_label,splitby=' versus ',ctrl='$37^{0}$C')
    if not note_text is None:
        labels=["%s (%s)" % (l,note_text) for l in labels] 
    data.loc[:,labels[0]]=data.loc[:,'Fi_test']
    data.loc[:,labels[1]]=data.loc[:,'Fi_ctrl']
    if not col_pvals is None:
        data.loc[:,col_pvals]=np.log10(data.loc[:,col_pvals])
        if not data.index.name=='mutids':
            data.index.name='mutids'
    if not col_filter is None:
        # NOTE(review): the selection below is not assigned to anything, so
        # `col_filter` currently does not filter the plotted data - confirm
        # the intended behaviour before changing it
        data.loc[data.loc[:,col_filter],labels]
    cols=['mut','ref']+labels
    if not col_pvals is None:
        cols=cols+[col_pvals]
    data=denanrows(data.loc[:,cols])

    # NOTE(review): `col_pvals` is log10 transformed above while the threshold
    # passed here is 0.05 - verify against `plot_scatter_mutilayered`
    plot_scatter_mutilayered(data,labels[1],labels[0],
                             plot_fh=plot_fh,
                            color_dots=color_dots,
                             mutids_heads=mutids_heads,
                             mutids_tails=mutids_tails,
                             color_heads='b',color_tails='b',
                             col_z_mutations=col_pvals,
                             zcol_threshold=0.05,
                             repel=repel,
                             figsize=figsize,
                            )
예제 #8
0
def make_cls_input(data_combo, y_coln_cls, middle_percentile_skipped):
    """
    Make input for classifier

    `y2classes` turns the target column into a "classes" column; the other
    columns go through `X_cols2binary` and `rescalecols`. Rows kept by `denan`
    form the training table and all remaining rows form the test table.

    :param data_combo: pandas dataframe containing training data
    :param y_coln_cls: name of column containing target values
    :param middle_percentile_skipped: skip middle percentile at boundary while creating classes [0,..,1] 
    :returns: tuple (data_combo, data_ml, data_cls_train, data_cls_tests);
              data_combo and data_ml have been passed through `set_index(..., "mutids")`
    """
    data_ml = y2classes(data_combo,
                        y_coln_cls,
                        middle_percentile_skipped=middle_percentile_skipped)
    # the raw target values are superseded by the "classes" column
    data_ml = data_ml.drop(y_coln_cls, axis=1)
    y_coln_cls = "classes"
    # NOTE(review): ids are collected before `set_index`; presumably
    # `y2classes` already returns a mutids-indexed table - confirm
    data_ml_mutids = list(data_ml.index)
    data_combo = set_index(data_combo, "mutids")
    data_ml = set_index(data_ml, "mutids")
    # set the labels aside so that only the predictors are binarised
    y = data_ml.loc[:, y_coln_cls]
    data_ml = X_cols2binary(data_ml.drop(y_coln_cls, axis=1))
    data_ml.loc[:, y_coln_cls] = y
    data_ml = rescalecols(data_ml)
    data_cls_train = denan(data_ml, axis='both', condi='all any')

    # set lookup keeps the split linear in the number of rows
    data_cls_train_mutids = set(data_cls_train.index.values)
    data_cls_tests_mutids = [
        mutid for mutid in data_ml_mutids
        if mutid not in data_cls_train_mutids
    ]
    data_cls_tests = data_ml.loc[data_cls_tests_mutids, :]
    return data_combo, data_ml, data_cls_train, data_cls_tests
예제 #9
0
def get_data_lbl_reps(data_lbl_fn,
                      data_lbl_type,
                      repli,
                      info,
                      data_fit=None,
                      data_lbl_col='NiA_tran',
                      type_form='aas',
                      col_sep='.'):
    """
    Collect the `data_lbl_col` column of every replicate of a data_lbl file.

    If `data_lbl_fn` is a row label of `repli`, the non-null entries of that
    row are the replicate file names; otherwise `data_lbl_fn` itself is the
    only file. Each file found under `<prj_dh>/data_lbl/<type_form>/` adds one
    column named `<file><sep><data_lbl_col><sep><data_lbl_type>`.

    :param data_lbl_fn: filename of data_lbl
    :param data_lbl_type: codon level or amino acid level mutations
    :param repli: table whose index holds names with replicates and whose rows list them
    :param info: object exposing `prj_dh`
    :param data_fit: table to add the columns to; created from the
                     'ref', 'refi', 'mut' columns of the first file if None
    :param data_lbl_col: column read from each data_lbl file
    :param type_form: sub directory of `data_lbl`
    :param col_sep: separator used in the new column names
    :returns: `data_fit`, or None when a single file is missing or the
              replicate row is empty
    """
    id_cols = ['ref', 'refi', 'mut']

    if data_lbl_fn not in repli.index:
        # no replicates listed: the file stands for itself
        lbl_fh = "%s/data_lbl/%s/%s" % (info.prj_dh, type_form, data_lbl_fn)
        if not exists(lbl_fh):
            logging.warning('does not exists: %s' % data_lbl_fn)
            return None
        lbl = pd.read_csv(lbl_fh).set_index('mutids')
        out_col = "%s%s%s%s%s" % (data_lbl_fn, col_sep, data_lbl_col, col_sep,
                                  data_lbl_type)
        data_lbl_col = "%s" % (data_lbl_col)
        if data_fit is None:
            data_fit = lbl.loc[:, id_cols].copy()
        data_fit.loc[:, out_col] = lbl.loc[:, data_lbl_col]
        return data_fit

    reps = repli.loc[data_lbl_fn, :].dropna()
    for rep in reps:
        lbl_fh = "%s/data_lbl/%s/%s" % (info.prj_dh, type_form, rep)
        if not exists(lbl_fh):
            logging.warning('%s does not exist' % basename(lbl_fh))
            continue
        lbl = set_index(pd.read_csv(lbl_fh), 'mutids')
        out_col = "%s%s%s%s%s" % (rep, col_sep, data_lbl_col, col_sep,
                                  data_lbl_type)
        data_lbl_col = "%s" % (data_lbl_col)
        if data_fit is None:
            data_fit = lbl.loc[:, id_cols].copy()
        data_fit.loc[:, out_col] = lbl.loc[:, data_lbl_col]
    if len(reps) == 0:
        logging.warning("no replicates found in cfg: %s" % data_lbl_fn)
        return None
    return data_fit
예제 #10
0
def make_dXy(dXy,ycol,unique_quantile=0.25,index="mutids",if_rescalecols=True):
    """
    Create a pandas table with target and predictor data

    Columns reported by `get_cols_del` are dropped, predictors whose number of
    unique values does not exceed the `unique_quantile` quantile of all
    predictors are discarded, all-NaN columns and rows with any NaN are
    removed, and the predictors are optionally passed through `rescalecols`.

    :param dXy: pandas dataframe with target(y) and predictor(X) data
    :param ycol: column name of target values    
    :param unique_quantile: quantile of the per-column unique value counts
                            that a predictor has to exceed to be kept
    :param index: column handed to `set_index`
    :param if_rescalecols: if True the predictor columns are rescaled with `rescalecols`
    :returns: tuple (dXy, Xcols, ycol) where Xcols is the list of predictor
              columns present in the returned table
    """
    dXy=set_index(dXy,index)
    dXy=dXy.drop(get_cols_del(dXy),axis=1)
    Xcols=[c for c in dXy.columns.tolist() if c!=ycol]
    Xunique=pd.DataFrame({'unique':[len(np.unique(dXy[c])) for c in Xcols]},index=Xcols)
    Xcols=Xunique.index[Xunique['unique']>Xunique['unique'].quantile(unique_quantile)]
    dXy=dXy.loc[:,Xcols.tolist()+[ycol]]
    dXy=dXy.dropna(axis=1, how='all').dropna(axis=0, how='any')
    # recompute after dropna so that the returned names always match the
    # table, whether or not rescaling is requested
    Xcols=[c for c in dXy.columns.tolist() if c!=ycol]
    if if_rescalecols:
        dXy.loc[:,Xcols]=rescalecols(dXy.loc[:,Xcols])
    return dXy,Xcols,ycol
예제 #11
0
def get_data_lbl_reps(data_lbl_fn,data_lbl_type,repli,info,data_fit=None,
                      data_lbl_col='NiA_tran',type_form='aas',col_sep='.'):
    """
    Gather the `data_lbl_col` values of all replicates of a data_lbl file.

    Replicate names are the non-null entries of row `data_lbl_fn` of `repli`;
    when `repli` has no such row, `data_lbl_fn` is read on its own. Files are
    looked up under `<prj_dh>/data_lbl/<type_form>/` and each one contributes
    a column `<file><sep><data_lbl_col><sep><data_lbl_type>`.

    :param data_lbl_fn: filename of data_lbl
    :param data_lbl_type: codon level or amino acid level mutations
    :param repli: table mapping names (index) to their replicate file names (row values)
    :param info: object exposing `prj_dh`
    :param data_fit: table extended with the new columns; if None it is
                     started from the 'ref', 'refi', 'mut' columns of the first file read
    :param data_lbl_col: column taken from each data_lbl file
    :param type_form: sub directory of `data_lbl`
    :param col_sep: separator used when naming the new columns
    :returns: `data_fit`; None if the single file is missing or no replicates are listed
    """
    base_cols=['ref','refi','mut']
    has_reps=data_lbl_fn in repli.index

    if not has_reps:
        # fall back to the file itself
        fh="%s/data_lbl/%s/%s" % (info.prj_dh,type_form,data_lbl_fn)
        if not exists(fh):
            logging.warning('does not exists: %s' % data_lbl_fn)
            return None
        tbl=pd.read_csv(fh).set_index('mutids')
        new_col="%s%s%s%s%s" % (data_lbl_fn,col_sep,data_lbl_col,col_sep,data_lbl_type)
        data_lbl_col="%s" % (data_lbl_col)
        if data_fit is None:
            data_fit=tbl.loc[:,base_cols].copy()
        data_fit.loc[:,new_col]=tbl.loc[:,data_lbl_col]
        return data_fit

    reps=repli.loc[data_lbl_fn,:].dropna()
    for rep in reps:
        fh="%s/data_lbl/%s/%s" % (info.prj_dh,type_form,rep)
        if not exists(fh):
            logging.warning('%s does not exist' % basename(fh))
            continue
        tbl=set_index(pd.read_csv(fh),'mutids')
        new_col="%s%s%s%s%s" % (rep,col_sep,data_lbl_col,col_sep,data_lbl_type)
        data_lbl_col="%s" % (data_lbl_col)
        if data_fit is None:
            data_fit=tbl.loc[:,base_cols].copy()
        data_fit.loc[:,new_col]=tbl.loc[:,data_lbl_col]
    if len(reps)==0:
        logging.warning("no replicates found in cfg: %s" % data_lbl_fn)
        return None
    return data_fit
예제 #12
0
def rescale_fitnessbysynonymous(data_fit,
                                col_fit="FCA_norm",
                                col_fit_rescaled="FiA",
                                syn2nan=True):
    """
    Rescale fold changes by the fold change of synonymous mutations at that position

    Rows are grouped by the `refrefi` column; within each group the `col_fit`
    value of the row with `mut == ref` is subtracted from every `col_fit`
    value and the result is written to `col_fit_rescaled`.

    :param data_fit: pandas table with fold change values
    :param col_fit: column holding the values to be rescaled
    :param col_fit_rescaled: column that receives the rescaled values
    :param syn2nan: if True, the rescaled value of synonymous rows is set to NaN
    :returns: `data_fit` with `col_fit_rescaled` filled; the column is all NaN
              when no synonymous row has a non-null `col_fit` value
    """
    is_syn = data_fit.loc[:, 'mut'] == data_fit.loc[:, 'ref']
    if sum(~pd.isnull(data_fit.loc[is_syn, col_fit])) == 0:
        logging.info('no synonymous mutations available')
        data_fit.loc[:, col_fit_rescaled] = np.nan
        return data_fit

    data_fit = set_index(data_fit, 'mutids')
    # An already existing output column is not written to directly: the
    # values are computed in a scratch column and copied over at the end.
    use_tmp = col_fit_rescaled in data_fit.columns
    col_work = "tmp" if use_tmp else col_fit_rescaled
    if not "refrefi" in data_fit:
        data_fit.loc[:,'refrefi']\
        =mutids_converter(data_fit.reset_index().loc[:,'mutids'],
                          'refrefi','aas')
    for refrefi in data_fit.loc[:, "refrefi"].unique():
        at_posi = data_fit.loc[:, "refrefi"] == refrefi
        data_fit_posi = data_fit.loc[at_posi, :]
        # float() only accepts a single value, i.e. exactly one synonymous
        # row per position is expected here
        FiS = float(data_fit_posi.loc[data_fit_posi.loc[:, "mut"] ==
                                      data_fit_posi.loc[:, "ref"], col_fit])
        # one masked assignment per position instead of one .loc call per row
        data_fit.loc[at_posi, col_work] = (data_fit_posi.loc[:, col_fit] -
                                           FiS).values
    if use_tmp:
        data_fit.loc[:, col_fit_rescaled] = data_fit.loc[:, "tmp"]
        data_fit = data_fit.drop("tmp", axis=1)
    if syn2nan:
        # blank the real output column; the scratch column is gone by now
        data_fit.loc[(data_fit.loc[:, 'ref'] == data_fit.loc[:, 'mut']),
                     col_fit_rescaled] = np.nan
    return data_fit
예제 #13
0
def plot_submap(
    info,
    data_fit_fhs=None,
    plot_type="submap",
    feats_tup=[
        "Mutant amino acid's Solvent Accessible Surface Area",
        'Solvent Accessible Surface Area',
    ],
    feats_labels=['SASA', 'SASA'],
):
    """
    Run `make_plot_cluster_sub_matrix` on each data_fit file lacking a plot.

    :param info: object exposing `prj_dh` (project directory)
    :param data_fit_fhs: paths of data_fit files; if None, `get_fhs` collects
                         them from `<prj_dh>/data_fit/aas/`
    :param plot_type: tag that becomes part of the output pdf name
    :param feats_tup: not used by this function
    :param feats_labels: labels passed on to `make_plot_cluster_sub_matrix`
    """
    type_form = 'aas'
    annot_tracks = ['FiAcol', 'FiArow']

    feats_fh = '%s/data_feats/aas/data_feats_all' % info.prj_dh
    feats = pd.read_csv(feats_fh).set_index('mutids')
    feats = set_index(feats, col_index='mutids')

    if data_fit_fhs is None:
        data_fit_fhs = get_fhs('%s/data_fit/aas/' % info.prj_dh,
                               include='_WRT_',
                               exclude='_inferred')

    for fit_fh in data_fit_fhs:
        fit_fn = basename(fit_fh)
        data_plot = pd.read_csv(fit_fh).join(feats)
        plot_fh = "%s/plots/%s/%s.%s.pdf" % (info.prj_dh, type_form, fit_fn,
                                             plot_type)
        if exists(plot_fh):
            # nothing to do, the pdf is already there
            continue
        # annotation tracks: FiA with missing values replaced by 0
        for track in annot_tracks:
            data_plot.loc[:, track] = data_plot.loc[:, 'FiA']
            data_plot.loc[:, track] = data_plot.loc[:, track].fillna(0)
        make_plot_cluster_sub_matrix(
            data_plot,
            'FiA',
            [[0, 1]],
            feats=['FiAcol', 'FiArow'],
            feats_labels=feats_labels,
            row_cluster=False,
            col_cluster=False,
            plot_fh=plot_fh,
            test=True,
        )
예제 #14
0
def make_GLM_norm(data_lbl_ref_fn, data_lbl_sel_fn, data_fit, info):
    """
    Wrapper for DESeq2 mediated GLM normalization

    The DESeq2 annotation and count inputs come from `make_deseq2_annot` and
    `make_deseq2_count`. Unless the result file already exists, `deseq2.R`
    (expected next to this module) is run through `info.rscript_fh`; the
    results are then joined onto `data_fit`.

    :param data_lbl_ref_fn: reference condition identifier handed to `make_deseq2_annot`/`make_deseq2_count`
    :param data_lbl_sel_fn: selected condition identifier handed to `make_deseq2_annot`/`make_deseq2_count`
    :param data_fit: pandas table with fold change values; must contain the
                     columns 'pval', 'stat' and 'padj', which are replaced
    :param info: object exposing `prj_dh` and `rscript_fh`
    :returns: `data_fit` joined with the DESeq2 result columns
    :raises SystemExit: if the count table has exactly two columns (no replicates)
    """
    data_lbl_col = 'NiA_norm'
    data_deseq2_annot, data_deseq2_annot_fh = make_deseq2_annot(
        data_lbl_ref_fn, data_lbl_sel_fn, data_lbl_col, info.prj_dh)
    data_deseq2_count, data_deseq2_count_fh = make_deseq2_count(
        data_lbl_ref_fn, data_lbl_sel_fn, data_deseq2_annot, data_lbl_col,
        info.prj_dh)
    data_deseq2_count = set_index(data_deseq2_count, 'mutids')
    if len(data_deseq2_count.columns) == 2:
        logging.error('transform_type can not be GLM: no replicates found')
        sys.exit()
    log_fh = "%s.log" % data_deseq2_annot_fh
    data_deseq2_res_fh = "%s.deseq2_res.csv" % data_deseq2_annot_fh
    if not exists(data_deseq2_res_fh):
        deseq_fh = "%s/deseq2.R" % (abspath(dirname(__file__)))
        with open(log_fh, 'a') as log_f:
            # NOTE(review): the command is a shell string built from
            # configured paths; paths containing spaces would break it
            com = '%s %s %s %s 2' % (info.rscript_fh, deseq_fh,
                                     data_deseq2_count_fh,
                                     data_deseq2_annot_fh)
            subprocess.call(com,
                            shell=True,
                            stdout=log_f,
                            stderr=subprocess.STDOUT)
    try:
        data_deseq2_res = pd.read_csv(data_deseq2_res_fh).set_index(
            'Unnamed: 0')
    except Exception:
        logging.error('check deseq2 log for more info: %s' % basename(log_fh))
        logging.error("check if deseq2 is installed.")
        # nothing below works without the result table: re-raise the real
        # error rather than failing later on an unbound variable
        raise
    data_deseq2_res.index.name = 'mutids'
    #     baseMean  log2FoldChange  lfcSE   stat    pvalue  padj
    test = 'Waldtest'
    multitest = 'fdr_bh'
    col_test_pval = "pval %s" % test
    col_test_stat = "stat %s" % test
    col_multitest_pval = "padj %s %s" % (test, multitest)

    # give the DESeq2 columns test-specific names ...
    data_deseq2_res = data_deseq2_res.rename(
        columns={
            'pvalue': col_test_pval,
            'stat': col_test_stat,
            'padj': col_multitest_pval,
        })

    # ... and expose them again under the generic names (wald as default stat)
    data_deseq2_res.loc[:, 'pval'] = data_deseq2_res.loc[:, col_test_pval]
    data_deseq2_res.loc[:, 'stat'] = data_deseq2_res.loc[:, col_test_stat]
    data_deseq2_res.loc[:, 'padj'] = data_deseq2_res.loc[:, col_multitest_pval]

    data_deseq2_res.loc[:, 'FCA_norm'] = data_deseq2_res.loc[:,
                                                             'log2FoldChange']
    data_fit = data_fit.drop(['pval', 'stat', 'padj'], axis=1)
    data_fit = data_fit.join(data_deseq2_res)
    return data_fit
예제 #15
0
def main(prj_dh, test=False, ml=False):
    """
    **--step 3**. Identifies molecular features that may determine fitness scores.
    
    This plots the results in following visualisations.
    
    .. code-block:: text
    
        ROC plots
        Relative importances of features
    
    :param prj_dh: path to project directory.
    :param test: if True, only the first data_fit file is processed (single mutants).
    :param ml: if True, the machine learning step is run after the correlation plot.
    :raises SystemExit: if `prj_dh` does not exist.
    """
    logging.info("start")

    if not exists(prj_dh):
        logging.error("Could not find '%s'" % prj_dh)
        sys.exit()
    configure.main(prj_dh)
    from dms2dfe.tmp import info
    from dms2dfe.lib.io_ml import corrplot
    corrplot(info)

    if ml:
        from dms2dfe.lib.io_dfs import set_index
        from dms2dfe.lib.io_ml import data_fit2ml  #,get_cols_del,make_data_combo,data_combo2ml

        # Defined before its first use: as a nested function it is a local
        # name of `main` and must be bound before it can be called.
        def pooled_io_ml(data_fit_key):
            """
            Runs `dms2dfe.lib.io_ml.data_fit2ml` (classification) for one data_fit file.
            
            :param data_fit_key: in the form <data_fit>/<aas/cds>/<name of file>.
            """
            dX_fh = "%s/data_feats/aas/data_feats_all" % (info.prj_dh)
            dy_fh = '%s/%s' % (info.prj_dh, data_fit_key)
            logging.info('processing: %s' % basename(dy_fh))
            data_fit2ml(dX_fh, dy_fh, info, regORcls='cls')

        # only needed by the disabled multiprocessing path below
        cores = int(info.cores)
        if hasattr(info, 'mut_type'):
            mut_type = info.mut_type
        else:
            mut_type = 'single'

        # map the configured input type to the column used as target
        ml_input_cols = {'FC': 'FCA_norm', 'Fi': 'FiA'}
        ml_input = 'FCA_norm'
        if hasattr(info, 'ml_input'):
            if info.ml_input in ml_input_cols:
                ml_input = ml_input_cols[info.ml_input]
            else:
                logging.warning("unknown ml_input '%s'; using '%s'" %
                                (info.ml_input, ml_input))
        type_form = "aas"
        if not exists("%s/plots/%s" % (prj_dh, type_form)):
            makedirs("%s/plots/%s" % (prj_dh, type_form))
        if not exists("%s/data_ml/%s" % (prj_dh, type_form)):
            makedirs("%s/data_ml/%s" % (prj_dh, type_form))
        data_feats = pd.read_csv("%s/data_feats/aas/data_feats_all" % (prj_dh))

        if mut_type == 'single':
            data_fit_keys = ["data_fit/%s/%s" % (type_form,basename(fh)) \
                             for fh in glob("%s/data_fit/aas/*" % prj_dh) \
                             if (not "inferred" in basename(fh)) and ("_WRT_" in basename(fh))]
            data_fit_keys = np.unique(data_fit_keys)

            if len(data_fit_keys) != 0:
                if test:
                    pooled_io_ml(data_fit_keys[0])
                else:
                    for data_fit_key in data_fit_keys:
                        pooled_io_ml(data_fit_key)
                    # pool_io_ml=Pool(processes=int(cores))
                    # pool_io_ml.map(pooled_io_ml,data_fit_keys)
                    # pool_io_ml.close(); pool_io_ml.join()
            else:
                logging.info("already processed")
        elif mut_type == 'double':
            # NOTE(review): `make_data_combo` and `data_combo2ml` are not
            # imported in this function; they must be available at module
            # level for this branch to run - confirm
            data_feats = set_index(data_feats, 'mutids')
            data_fit_dh = 'data_fit_dm'
            data_fit_keys = ["%s/%s/%s" % (data_fit_dh,type_form,basename(fh)) \
                             for fh in glob("%s/%s/aas/*" % (prj_dh,data_fit_dh)) \
                             if (not "inferred" in basename(fh)) and ("_WRT_" in basename(fh))]
            data_fit_keys = np.unique(data_fit_keys)
            ycol = ml_input
            Xcols = data_feats.columns
            if len(data_fit_keys) != 0:
                for data_fit_key in data_fit_keys:
                    data_fit_dm_fh = '%s/%s' % (prj_dh, data_fit_key)
                    data_combo_fh = '%s/data_ml/aas/%s.combo' % (
                        prj_dh, basename(data_fit_dm_fh))
                    force = False
                    if not exists(data_combo_fh) or force:
                        data_fit_dm = pd.read_csv(data_fit_dm_fh).set_index(
                            'mutids')
                        data_combo = make_data_combo(data_fit_dm, data_feats,
                                                     ycol, Xcols)
                        if not exists(dirname(data_combo_fh)):
                            makedirs(dirname(data_combo_fh))
                        data_combo.to_csv(data_combo_fh)
                    else:
                        # reuse the cached combined table
                        data_combo = pd.read_csv(data_combo_fh).set_index(
                            'mutids')
                    logging.info('ml: start')
                    data_combo2ml(
                        data_combo,
                        basename(data_fit_dm_fh),
                        dirname(data_combo_fh),
                        dirname(data_combo_fh),
                        ycoln=ycol,
                        col_idx='mutids',
                        ml_type='cls',
                        middle_percentile_skipped=0.1,
                        force=False,
                    )

    logging.shutdown()