예제 #1
0
def main(prj_dh,test=False):
    """
    **--step 4**. Compares test (eg. treated) and control (eg. untreated) experiments.

    The output data is saved in `data_comparison` format as described in :ref:`io`.

    :param prj_dh: path to project directory.
    :param test: if True, only the first usable comparison is processed and it
                 is run in the current process instead of a worker pool.
    """
    # SET global variables
    global prj_dh_global

    logging.info("start")
    if not exists(prj_dh):
        logging.error("Could not find '%s'" % prj_dh)
        sys.exit()
    configure.main(prj_dh)
    # NOTE(review): imported only after configure.main() has run; presumably
    # configure.main() (re)generates this module -- confirm.
    from dms2dfe.tmp import info
    cores = info.cores

    prj_dh_global = prj_dh
    if exists('%s/cfg/comparison' % prj_dh):
        comparison_pairs_list = getusable_comparison_list(prj_dh)
        if len(comparison_pairs_list) == 0:
            # Guard added: indexing [0] in test mode used to raise IndexError
            # when there was nothing to compare.
            logging.info("no usable comparisons found")
        elif test:
            pooled_data_fit2data_comparison(comparison_pairs_list[0])
        else:
            pool = Pool(processes=int(cores))
            pool.map(pooled_data_fit2data_comparison, comparison_pairs_list)
            pool.close()
            pool.join()
    else:
        logging.warning("do not exist: cfg/comparison")
    # Called for its side effects only; the return value is not used here.
    get_data_metrics(prj_dh)
    logging.shutdown()
예제 #2
0
def main(prj_dh):
    """
    **--step 5**. Generates visualizations.

    The docstring of the original module lists the following outputs:

    #. Scatter grid plots of raw counts in replicates, if present.
    #. Mutation matrix of frequencies of mutants (log scaled).
    #. Scatter plots of raw counts among selected and unselected samples.
    #. Mutation matrix of fitness values.
    #. DFE plot, i.e. distribution of fitness values for samples.
    #. Projections on PDB: average fitness per residue projected onto the PDB file.

    :param prj_dh: path to project directory.
    """
    logging.info("start")
    if not exists(prj_dh):
        logging.error("Could not find '%s'" % prj_dh)
        sys.exit()
    configure.main(prj_dh)
    from dms2dfe.tmp import info

    # Make sure the per-form plot directories are present before plotting.
    for form in ('aas', 'cds'):
        out_dh = '%s/plots/%s' % (prj_dh, form)
        if not exists(out_dh):
            makedirs(out_dh)

    # Every plotter takes the project configuration and is run in this order.
    plotters = (plot_coverage, plot_mutmap, plot_submap,
                plot_multisca, plot_pdb, plot_violin)
    for plotter in plotters:
        plotter(info)
예제 #3
0
def main(prj_dh, test=False):
    """
    **--step 4**. Compares test (eg. treated) and control (eg. untreated) experiments.

    The output data is saved in `data_comparison` format as described in :ref:`io`.

    :param prj_dh: path to project directory.
    :param test: if True, only the first usable comparison is processed and it
                 is run in the current process instead of a worker pool.
    """
    # SET global variables
    global prj_dh_global

    logging.info("start")
    if not exists(prj_dh):
        logging.error("Could not find '%s'" % prj_dh)
        sys.exit()
    configure.main(prj_dh)
    # NOTE(review): imported only after configure.main() has run; presumably
    # configure.main() (re)generates this module -- confirm.
    from dms2dfe.tmp import info
    cores = info.cores

    prj_dh_global = prj_dh
    if exists('%s/cfg/comparison' % prj_dh):
        comparison_pairs_list = getusable_comparison_list(prj_dh)
        if len(comparison_pairs_list) == 0:
            # Guard added: indexing [0] in test mode used to raise IndexError
            # when there was nothing to compare.
            logging.info("no usable comparisons found")
        elif test:
            pooled_data_fit2data_comparison(comparison_pairs_list[0])
        else:
            pool = Pool(processes=int(cores))
            pool.map(pooled_data_fit2data_comparison, comparison_pairs_list)
            pool.close()
            pool.join()
    else:
        logging.warning("do not exist: cfg/comparison")
    # Called for its side effects only; the return value is not used here.
    get_data_metrics(prj_dh)
    logging.shutdown()
예제 #4
0
def main(prj_dh, test=False):
    """
    **--step 1**. Processes alignment (.sam file) and produces codon level mutation matrix of counts of mutations.

    :param prj_dh: path to project directory.
    :param test: if True, only the first usable alignment is processed and it
                 is run in the current process instead of a worker pool.
    """
    logging.info("start")

    # SET global variables
    global fsta_id, fsta_seqlen, fsta_seq, cds_ref, Q_cutoff, prj_dh_global

    prj_dh_global = prj_dh
    if not exists(prj_dh):
        logging.error("Could not find '%s'\n" % prj_dh)
        sys.exit()
    configure.main(prj_dh)
    from dms2dfe.tmp import info
    fsta_fh = info.fsta_fh
    Q_cutoff = int(info.Q_cutoff)
    cores = int(info.cores)
    samtools_fh = info.samtools_fh

    # If the FASTA holds several records, the globals keep the last one.
    with open(fsta_fh, 'r') as fsta_data:
        for fsta_record in SeqIO.parse(fsta_data, "fasta"):
            fsta_id = fsta_record.id
            fsta_seq = str(fsta_record.seq)
            fsta_seqlen = len(fsta_seq)
            logging.info("ref name : '%s', length : '%d' " %
                         (fsta_record.id, fsta_seqlen))
    # Split the reference into consecutive 3-letter codons. Floor division
    # keeps range() working when `/` is true division; a trailing partial
    # codon is dropped, as before.
    cds_ref = [fsta_seq[cdi * 3:cdi * 3 + 3]
               for cdi in range(len(fsta_seq) // 3)]

    sbam_fhs = getusablesbams_list(prj_dh)
    # check if bams are indexed
    for sbam_fh in sbam_fhs:
        sbam_index_fh = "%s.bai" % sbam_fh
        log_fh = "%s.log" % sbam_index_fh
        # The log is opened in append mode for every BAM (so it is created
        # even when no indexing is needed, as before); the context manager
        # closes the handle in both cases instead of leaking it.
        with open(log_fh, 'a') as log_f:
            if not exists(sbam_index_fh):
                com = "%s index %s" % (samtools_fh, sbam_fh)
                subprocess.call(com,
                                shell=True,
                                stdout=log_f,
                                stderr=subprocess.STDOUT)

    if len(sbam_fhs) != 0:
        if test:
            pooled(sbam_fhs[0])
        else:
            pool = Pool(processes=cores)  # T : get it from xls
            pool.map(pooled, sbam_fhs)
            pool.close()
            pool.join()
    else:
        logging.info("already processed")
    logging.shutdown()
예제 #5
0
def main(prj_dh,test=False):
    """
    **--step 1**. Processes alignment (.sam file) and produces codon level mutation matrix of counts of mutations.

    :param prj_dh: path to project directory.
    :param test: if True, only the first usable alignment is processed and it
                 is run in the current process instead of a worker pool.
    """
    logging.info("start")

    # SET global variables
    global fsta_id,fsta_seqlen,fsta_seq,cds_ref,Q_cutoff,prj_dh_global

    prj_dh_global=prj_dh
    if not exists(prj_dh):
        logging.error("Could not find '%s'\n" % prj_dh)
        sys.exit()
    configure.main(prj_dh)
    from dms2dfe.tmp import info
    fsta_fh=info.fsta_fh
    Q_cutoff=int(info.Q_cutoff)
    cores=int(info.cores)
    samtools_fh=info.samtools_fh

    # If the FASTA holds several records, the globals keep the last one.
    with open(fsta_fh,'r') as fsta_data:
        for fsta_record in SeqIO.parse(fsta_data, "fasta"):
            fsta_id=fsta_record.id
            fsta_seq=str(fsta_record.seq)
            fsta_seqlen=len(fsta_seq)
            logging.info("ref name : '%s', length : '%d' " % (fsta_record.id, fsta_seqlen))
    # Split the reference into consecutive 3-letter codons. Floor division
    # keeps range() working when `/` is true division; a trailing partial
    # codon is dropped, as before.
    cds_ref=[fsta_seq[cdi*3:cdi*3+3] for cdi in range(len(fsta_seq)//3)]

    sbam_fhs=getusablesbams_list(prj_dh)
    # check if bams are indexed
    for sbam_fh in sbam_fhs:
        sbam_index_fh="%s.bai" % sbam_fh
        log_fh="%s.log" % sbam_index_fh
        # The log is opened in append mode for every BAM (so it is created
        # even when no indexing is needed, as before); the context manager
        # closes the handle in both cases instead of leaking it.
        with open(log_fh,'a') as log_f:
            if not exists(sbam_index_fh):
                com="%s index %s" % (samtools_fh,sbam_fh)
                subprocess.call(com,shell=True,stdout=log_f,stderr=subprocess.STDOUT)

    if len(sbam_fhs)!=0:
        if test:
            pooled(sbam_fhs[0])
        else:
            pool=Pool(processes=cores) # T : get it from xls
            pool.map(pooled, sbam_fhs)
            pool.close()
            pool.join()
    else:
        logging.info("already processed")
    logging.shutdown()
예제 #6
0
def main(prj_dh, test=False):
    """
    **--step 0.2**. Preprocesses and aligns sequencing files.

    The steps and required dependencies are following.

    .. code-block:: text

        Quality filtering        : using Trimmomatic.
        Alignment                : using bowtie2
        .sam to .bam conversion  : using samtools

    :param prj_dh: path to project directory
    :param test: if True, the fastq files are processed one after another in
                 the current process instead of in a worker pool.
    """
    logging.info("start")
    global trimmomatic_fh, fsta_fh, alignment_type, bt2_ref_fh, bowtie2_fh, samtools_fh, bowtie2_com

    if not exists(prj_dh):
        logging.error("Could not find '%s'\n" % prj_dh)
        sys.exit()
    configure.main(prj_dh)
    from dms2dfe.tmp import info
    fsta_fh = info.fsta_fh
    cores = info.cores
    trimmomatic_fh = info.trimmomatic_fh
    bowtie2_fh = info.bowtie2_fh
    bowtie2_com = info.bowtie2_com
    samtools_fh = info.samtools_fh
    alignment_type = info.alignment_type

    # make bowtie index
    bt2_ref_fh = splitext(fsta_fh)[0]
    if not exists("%s.1.bt2" % bt2_ref_fh):
        # Build the index at exactly the base name that is checked above;
        # stripping the extension a second time would put it elsewhere for
        # reference names containing more than one dot.
        bowtie_ref_com = "%s-build --quiet %s %s" % (bowtie2_fh, fsta_fh,
                                                     bt2_ref_fh)
        # Redirect through file handles rather than the bash-only `&>`:
        # under a POSIX /bin/sh such as dash, `cmd &> file` is parsed as
        # `cmd &` followed by `> file`, which backgrounds the build.
        with open("%s.logbt2bld" % bt2_ref_fh, 'w') as log_f:
            subprocess.call(bowtie_ref_com,
                            shell=True,
                            stdout=log_f,
                            stderr=subprocess.STDOUT)
        logging.info("bt2_ref_fh do not exist, made one.")
    fastqs_list = getusablefastqs_list(prj_dh)
    if len(fastqs_list) != 0:
        if test:
            for fastq in fastqs_list:
                pooled(fastq)
        else:
            pool = Pool(processes=int(cores))
            pool.map(pooled, fastqs_list)
            pool.close()
            pool.join()
    else:
        logging.info("already processed")
    logging.shutdown()
예제 #7
0
def main(prj_dh,test=False):
    """
    **--step 0.2**. Preprocesses and aligns sequencing files.

    The steps and required dependencies are following.

    .. code-block:: text

        Quality filtering        : using Trimmomatic.
        Alignment                : using bowtie2
        .sam to .bam conversion  : using samtools

    :param prj_dh: path to project directory
    :param test: if True, the fastq files are processed one after another in
                 the current process instead of in a worker pool.
    """
    logging.info("start")
    global trimmomatic_fh,fsta_fh,alignment_type,bt2_ref_fh,bowtie2_fh,samtools_fh,bowtie2_com

    if not exists(prj_dh):
        logging.error("Could not find '%s'\n" % prj_dh)
        sys.exit()
    configure.main(prj_dh)
    from dms2dfe.tmp import info
    fsta_fh=info.fsta_fh
    cores=info.cores
    trimmomatic_fh=info.trimmomatic_fh
    bowtie2_fh=info.bowtie2_fh
    bowtie2_com=info.bowtie2_com
    samtools_fh=info.samtools_fh
    alignment_type=info.alignment_type

    # make bowtie index
    bt2_ref_fh = splitext(fsta_fh)[0]
    if not exists("%s.1.bt2" % bt2_ref_fh):
        # Build the index at exactly the base name that is checked above;
        # stripping the extension a second time would put it elsewhere for
        # reference names containing more than one dot.
        bowtie_ref_com="%s-build --quiet %s %s" % (bowtie2_fh,fsta_fh,bt2_ref_fh)
        # Redirect through file handles rather than the bash-only `&>`:
        # under a POSIX /bin/sh such as dash, `cmd &> file` is parsed as
        # `cmd &` followed by `> file`, which backgrounds the build.
        with open("%s.logbt2bld" % bt2_ref_fh,'w') as log_f:
            subprocess.call(bowtie_ref_com,shell=True,stdout=log_f,stderr=subprocess.STDOUT)
        logging.info("bt2_ref_fh do not exist, made one.")
    fastqs_list=getusablefastqs_list(prj_dh)
    if len(fastqs_list)!=0:
        if test:
            for fastq in fastqs_list:
                pooled(fastq)
        else:
            pool=Pool(processes=int(cores))
            pool.map(pooled,fastqs_list)
            pool.close()
            pool.join()
    else:
        logging.info("already processed")
    logging.shutdown()
예제 #8
0
def main(prj_dh):
    """
    **--step 0.3**. Extracts molecular features of the gene.

    Output files are written under `prj_dh/data_feats`.

    According to the original module documentation, the features and their
    required dependencies are:

    .. code-block:: text

        Secondary structure                      : using DSSP.
        Solvent Accessible Surface Area          : using DSSP.
        Distance of a residue from reference atom: using Bio.PDB

    :param prj_dh: path to project directory.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format='[%(asctime)s] %(levelname)s from %(funcName)s:\t%(message)s')
    logging.info("start")

    if not exists(prj_dh):
        logging.error("Could not find '%s'\n" % prj_dh)
        sys.exit()
    configure.main(prj_dh)
    from dms2dfe.tmp import info

    # Output paths of the four feature tables.
    pos_fh = "%s/data_feats/aas/data_feats_pos" % prj_dh
    sub_fh = "%s/data_feats/aas/data_feats_sub" % prj_dh
    mut_fh = "%s/data_feats/aas/data_feats_mut" % prj_dh
    all_fh = "%s/data_feats/aas/data_feats_all" % prj_dh

    # Per position, per substitution, per mutation, then the combined table.
    # Only the per-position result is needed again below.
    feats_pos = get_data_feats_pos(prj_dh, info, data_out_fh=pos_fh)
    get_data_feats_sub(data_out_fh=sub_fh)
    get_data_feats_mut(prj_dh, mut_fh, info)
    get_data_feats_all(mut_fh, pos_fh, sub_fh, all_fh, info)

    # back compatibility: also write the per-position table as `feats_all`
    legacy_fh = "%s/data_feats/aas/feats_all" % prj_dh
    if feats_pos is not None:
        feats_pos.to_csv(legacy_fh)

    logging.shutdown()
예제 #9
0
def main(prj_dh):
    """
    **--step 0.1**. Demultiplexes .fastq files based on barcodes located at `prj_dh/cfg/barcodes`.

    The R2 file of every pair is derived from the R1 path by replacing
    'R1' with 'R2'.

    :param prj_dh: path to project directory
    """
    logging.info("start")
    if not exists(prj_dh):
        logging.error("Could not find '%s'\n" % prj_dh)
        sys.exit()
    configure.main(prj_dh)
    from dms2dfe.tmp import info
    trimmomatic_fh=info.trimmomatic_fh

    barcodes_fh='%s/cfg/barcodes' % prj_dh
    barcodes=pd.read_csv(barcodes_fh) if exists(barcodes_fh) else None
    if barcodes is None or len(barcodes)==0:
        logging.info("skipping: because barcodes not present in cfg")
        logging.shutdown()
        return

    fastq_R1_fhs=[str(s) for s in barcodes.loc[:,'fastq_R1_fh'].unique()]
    # R1/R2 pairs that have no `<R1>.qcd.fastq` yet are passed to fastq2qcd
    # (presumably Trimmomatic quality filtering -- confirm against fastq2qcd).
    fastq_fhs_list=[[s, s.replace('R1','R2')] for s in fastq_R1_fhs
                    if not exists("%s.qcd.fastq" % s)]
    for fastq_fhs_tp in fastq_fhs_list:
        fastq2qcd(fastq_fhs_tp,trimmomatic_fh)

    barcodes=barcodes.set_index('fastq_R1_fh')
    for fastq_R1_fh in fastq_R1_fhs:
        if not exists(fastq_R1_fh):
            logging.info("fastq_R1_fh do not exist: %s" % fastq_R1_fh)
            continue
        fastq_R2_fh=fastq_R1_fh.replace('R1','R2')
        # NOTE(review): the original tested this same path twice; the second
        # test may have been meant for an R2-derived file -- confirm.
        if exists("%s.qcd.fastq_unresolved_joined.qcd.fastq" % fastq_R1_fh):
            logging.info("already done : %s" % fastq_R1_fh)
            continue
        # The original re-checked R1 here, so a missing R2 was never detected
        # although the message below reports exactly that.
        if not exists(fastq_R2_fh):
            logging.info("fastq_R2_fh do not exist: %s" % fastq_R2_fh)
            continue
        logging.info("processing: %s" % basename(fastq_R1_fh))
        # A list selector always returns a DataFrame, even when a single
        # barcode row matches; a scalar selector would return a Series and
        # break the column lookups below.
        fastq_R1_fh_barcodes=barcodes.loc[[fastq_R1_fh],:]
        barcode_R1s=[str(s) for s in fastq_R1_fh_barcodes.loc[:,'barcode_R1']]
        barcode_R2s=[str(s) for s in fastq_R1_fh_barcodes.loc[:,'barcode_R2']]
        fastq_fns=[str(s) for s in fastq_R1_fh_barcodes.loc[:,'fastq_fn']]
        fastq2dplx(fastq_R1_fh+".qcd.fastq",fastq_R2_fh+".qcd.fastq",
                   barcode_R1s,barcode_R2s,fastq_fns)
    logging.shutdown()
예제 #10
0
def pipeline(prj_dh,step=None,test=False):
    """
    Runs one step, or all steps, of the analysis on a project directory.

    Steps and the modules they call:

    .. code-block:: text

        0   : configure (dependencies, then project)
        0.1 : ana0_fastq2dplx
        0.2 : ana0_fastq2sbam
        0.3 : ana0_getfeats (only when requested explicitly)
        1   : ana1_sam2mutmat
        2   : ana2_mutmat2fit
        3   : ana0_getfeats, then ana4_modeller
        4   : ana3_fit2comparison
        5   : ana0_getfeats, then ana4_plotter

    If `prj_dh` does not exist, only `configure.main(prj_dh)` is called.

    :param prj_dh: path to project directory.
    :param step: number of the step to run; None runs every step except 0.3.
    :param test: forwarded to the steps that accept a test flag.
    """
    from dms2dfe import configure, ana0_fastq2dplx,ana0_fastq2sbam,ana0_getfeats,ana1_sam2mutmat,ana2_mutmat2fit,ana3_fit2comparison,ana4_modeller,ana4_plotter
    if exists(prj_dh):
        run_all=step is None
        if run_all or step==0:
            configure.main(prj_dh,"deps")
            configure.main(prj_dh)
        if run_all or step==0.1:
            ana0_fastq2dplx.main(prj_dh)
        if run_all or step==0.2:
            ana0_fastq2sbam.main(prj_dh,test)
        # Not part of a full run: steps 3 and 5 call ana0_getfeats themselves.
        if step==0.3:
            ana0_getfeats.main(prj_dh)
        if run_all or step==1:
            # `test` is now forwarded here as well, like in the other steps.
            ana1_sam2mutmat.main(prj_dh,test)
        if run_all or step==2:
            ana2_mutmat2fit.main(prj_dh,test)
        if run_all or step==3:
            ana0_getfeats.main(prj_dh)
            ana4_modeller.main(prj_dh,test)
        if run_all or step==4:
            ana3_fit2comparison.main(prj_dh,test)
        if run_all or step==5:
            ana0_getfeats.main(prj_dh)
            ana4_plotter.main(prj_dh)
        if run_all:
            logging.info("Location of output data: %s/plots/aas/data_comparison" % (prj_dh))
            logging.info("Location of output visualizations: %s/plots/aas/" % (prj_dh))
            logging.info("For information about file formats of outputs, refer to http://kc-lab.github.io/dms2dfe/io .")
    else:
        # NOTE(review): presumably configure.main initialises a new project
        # directory in this case -- confirm against configure.
        configure.main(prj_dh)
    logging.shutdown()
예제 #11
0
def main(prj_dh):
    """
    **--step 0.1**. Demultiplexes .fastq files based on barcodes located at `prj_dh/cfg/barcodes`.

    The R2 file of every pair is derived from the R1 path by replacing
    'R1' with 'R2'.

    :param prj_dh: path to project directory
    """
    logging.info("start")
    if not exists(prj_dh):
        logging.error("Could not find '%s'\n" % prj_dh)
        sys.exit()
    configure.main(prj_dh)
    from dms2dfe.tmp import info
    trimmomatic_fh = info.trimmomatic_fh

    barcodes_fh = '%s/cfg/barcodes' % prj_dh
    barcodes = pd.read_csv(barcodes_fh) if exists(barcodes_fh) else None
    if barcodes is None or len(barcodes) == 0:
        logging.info("skipping: because barcodes not present in cfg")
        logging.shutdown()
        return

    fastq_R1_fhs = [str(s) for s in barcodes.loc[:, 'fastq_R1_fh'].unique()]
    # R1/R2 pairs that have no `<R1>.qcd.fastq` yet are passed to fastq2qcd
    # (presumably Trimmomatic quality filtering -- confirm against fastq2qcd).
    fastq_fhs_list = [[s, s.replace('R1', 'R2')] for s in fastq_R1_fhs
                      if not exists("%s.qcd.fastq" % s)]
    for fastq_fhs_tp in fastq_fhs_list:
        fastq2qcd(fastq_fhs_tp, trimmomatic_fh)

    barcodes = barcodes.set_index('fastq_R1_fh')
    for fastq_R1_fh in fastq_R1_fhs:
        if not exists(fastq_R1_fh):
            logging.info("fastq_R1_fh do not exist: %s" % fastq_R1_fh)
            continue
        fastq_R2_fh = fastq_R1_fh.replace('R1', 'R2')
        # NOTE(review): the original tested this same path twice; the second
        # test may have been meant for an R2-derived file -- confirm.
        if exists("%s.qcd.fastq_unresolved_joined.qcd.fastq" % fastq_R1_fh):
            logging.info("already done : %s" % fastq_R1_fh)
            continue
        # The original re-checked R1 here, so a missing R2 was never detected
        # although the message below reports exactly that.
        if not exists(fastq_R2_fh):
            logging.info("fastq_R2_fh do not exist: %s" % fastq_R2_fh)
            continue
        logging.info("processing: %s" % basename(fastq_R1_fh))
        # A list selector always returns a DataFrame, even when a single
        # barcode row matches; a scalar selector would return a Series and
        # break the column lookups below.
        fastq_R1_fh_barcodes = barcodes.loc[[fastq_R1_fh], :]
        barcode_R1s = [
            str(s) for s in fastq_R1_fh_barcodes.loc[:, 'barcode_R1']
        ]
        barcode_R2s = [
            str(s) for s in fastq_R1_fh_barcodes.loc[:, 'barcode_R2']
        ]
        fastq_fns = [str(s) for s in fastq_R1_fh_barcodes.loc[:, 'fastq_fn']]
        fastq2dplx(fastq_R1_fh + ".qcd.fastq", fastq_R2_fh + ".qcd.fastq",
                   barcode_R1s, barcode_R2s, fastq_fns)
    logging.shutdown()
예제 #12
0
def main(prj_dh, test=False, ml=False):
    """
    **--step 3**. Identifies molecular features that may determine fitness scores.

    According to the original module documentation, the results are plotted
    in the following visualisations.

    .. code-block:: text

        ROC plots
        Relative importances of features

    The correlation plot (`corrplot`) is always produced; the modelling part
    only runs when `ml` is True.

    :param prj_dh: path to project directory.
    :param test: if True, only the first fitness file is modelled
                 (single-mutant mode only).
    :param ml: if True, run the machine-learning step.
    """
    logging.info("start")

    if not exists(prj_dh):
        logging.error("Could not find '%s'" % prj_dh)
        sys.exit()
    configure.main(prj_dh)
    from dms2dfe.tmp import info
    from dms2dfe.lib.io_ml import corrplot
    corrplot(info)

    if ml:
        from dms2dfe.lib.io_dfs import set_index
        from dms2dfe.lib.io_ml import data_fit2ml

        # Defined BEFORE its first use: the original defined this nested
        # function at the end of main(), so every call above it raised
        # UnboundLocalError.
        def pooled_io_ml(data_fit_key):
            """
            Runs `dms2dfe.lib.io_ml.data_fit2ml` for one fitness file.

            :param data_fit_key: in the form <data_fit>/<aas/cds>/<name of file>.
            """
            dX_fh = "%s/data_feats/aas/data_feats_all" % (info.prj_dh)
            dy_fh = '%s/%s' % (info.prj_dh, data_fit_key)
            logging.info('processing: %s' % basename(dy_fh))
            data_fit2ml(dX_fh, dy_fh, info, regORcls='cls')

        mut_type = getattr(info, 'mut_type', 'single')

        # Map the configured input type onto a column name. Unrecognised
        # values used to leave `ml_input` unbound; they now fall back to the
        # default that is also used when the option is absent.
        ml_input_cols = {'FC': 'FCA_norm', 'Fi': 'FiA'}
        ml_input_cfg = getattr(info, 'ml_input', 'FC')
        if ml_input_cfg in ml_input_cols:
            ml_input = ml_input_cols[ml_input_cfg]
        else:
            logging.warning("unknown ml_input '%s'; using 'FCA_norm'" %
                            ml_input_cfg)
            ml_input = 'FCA_norm'

        type_form = "aas"
        for out_dh in ("%s/plots/%s" % (prj_dh, type_form),
                       "%s/data_ml/%s" % (prj_dh, type_form)):
            if not exists(out_dh):
                makedirs(out_dh)
        data_feats = pd.read_csv("%s/data_feats/aas/data_feats_all" % (prj_dh))

        if mut_type == 'single':
            data_fit_keys = ["data_fit/%s/%s" % (type_form, basename(fh))
                             for fh in glob("%s/data_fit/aas/*" % prj_dh)
                             if ("inferred" not in basename(fh)) and ("_WRT_" in basename(fh))]
            data_fit_keys = np.unique(data_fit_keys)

            if len(data_fit_keys) != 0:
                # Files are processed one after another in this process.
                if test:
                    pooled_io_ml(data_fit_keys[0])
                else:
                    for data_fit_key in data_fit_keys:
                        pooled_io_ml(data_fit_key)
            else:
                logging.info("already processed")
        elif mut_type == 'double':
            # NOTE(review): make_data_combo and data_combo2ml are not imported
            # inside this function (their import is commented out in the
            # original); they must be available at module level -- confirm.
            data_feats = set_index(data_feats, 'mutids')
            data_fit_dh = 'data_fit_dm'
            data_fit_keys = ["%s/%s/%s" % (data_fit_dh, type_form, basename(fh))
                             for fh in glob("%s/%s/aas/*" % (prj_dh, data_fit_dh))
                             if ("inferred" not in basename(fh)) and ("_WRT_" in basename(fh))]
            data_fit_keys = np.unique(data_fit_keys)
            ycol = ml_input
            Xcols = data_feats.columns
            for data_fit_key in data_fit_keys:
                data_fit_dm_fh = '%s/%s' % (prj_dh, data_fit_key)
                data_combo_fh = '%s/data_ml/aas/%s.combo' % (
                    prj_dh, basename(data_fit_dm_fh))
                # Reuse a previously written combo table when there is one.
                if not exists(data_combo_fh):
                    data_fit_dm = pd.read_csv(data_fit_dm_fh).set_index(
                        'mutids')
                    data_combo = make_data_combo(data_fit_dm, data_feats,
                                                 ycol, Xcols)
                    if not exists(dirname(data_combo_fh)):
                        makedirs(dirname(data_combo_fh))
                    data_combo.to_csv(data_combo_fh)
                else:
                    data_combo = pd.read_csv(data_combo_fh).set_index(
                        'mutids')
                logging.info('ml: start')
                data_combo2ml(
                    data_combo,
                    basename(data_fit_dm_fh),
                    dirname(data_combo_fh),
                    dirname(data_combo_fh),
                    ycoln=ycol,
                    col_idx='mutids',
                    ml_type='cls',
                    middle_percentile_skipped=0.1,
                    force=False,
                )

    logging.shutdown()
예제 #13
0
def main(prj_dh,test=False):
    """
    **--step 2**. Converts mutation matrices (.mat files produced in upstream ana1_sam2mutmat module) and calculates the fitness scores.

    The output data is saved in `data_fit` format as described in :ref:`io`.

    :param prj_dh: path to project directory.
    :param test: if True, work is done in the current process instead of in
                 worker pools (see the individual branches for what is run).
    """
    # Module-level settings written by this function.
    global prj_dh_global,host,norm_type,fsta_len,cctmr_global,output_dh,lbls,Ni_cutoff,fsta_fh_global,clips

    logging.info("start")
    if not exists(prj_dh):
        logging.error("Could not find '%s'" % prj_dh)
        sys.exit()
    configure.main(prj_dh)

    from dms2dfe.tmp import info
    ref_fh=info.fsta_fh
    cctmr_cfg=info.cctmr
    host=info.host
    n_cores=info.cores
    transform_type=info.transform_type
    norm_type=info.norm_type
    fsta_len=info.fsta_len
    Ni_cutoff=int(info.Ni_cutoff)
    rscript_fh=info.rscript_fh
    # 'single' is the default when the option is absent from the configuration.
    mut_type=getattr(info,'mut_type','single')

    lbls=pd.read_csv(prj_dh+'/cfg/lbls').set_index('varname')

    # SET global variables
    prj_dh_global=prj_dh
    fsta_fh_global=ref_fh
    # The string 'nan' marks an unset option; otherwise four space-separated
    # integers are grouped into two pairs.
    if cctmr_cfg == 'nan':
        cctmr_global=None
    else:
        pts=[int(i) for i in cctmr_cfg.split(" ")]
        cctmr_global=[(pts[0],pts[1]),(pts[2],pts[3])]

    clips=[int(s) for s in info.clips.split(' ')] if info.clips != 'nan' else None

    if mut_type=='single':
        usable_lbls=getusable_lbls_list(prj_dh)
        if len(usable_lbls)==0:
            logging.info("already processed: mut_mat_cds2data_lbl")
        elif test:
            # test mode: first label only
            pooled_mut_mat_cds2data_lbl(usable_lbls[0])
        else:
            lbl_pool=Pool(processes=int(n_cores))
            lbl_pool.map(pooled_mut_mat_cds2data_lbl,usable_lbls)
            lbl_pool.close()
            lbl_pool.join()
        #TRANSFORM
        logging.info("transforming frequencies: %s" % transform_type)
        if transform_type in ('rlog','vst'):
            transform_data_lbl_deseq(prj_dh,transform_type,rscript_fh)
        else:
            transform_data_lbl(prj_dh,transform_type)
        #FITNESS
        usable_fits=getusable_fits_list(prj_dh,data_fit_dh='data_fit')
        if len(usable_fits)==0:
            logging.info("already processed: data_lbl2data_fit")
        elif test:
            # test mode: every pair, one after another
            for fit_pair in usable_fits:
                pooled_data_lbl2data_fit(fit_pair)
        else:
            fit_pool=Pool(processes=int(n_cores))
            fit_pool.map(pooled_data_lbl2data_fit,usable_fits)
            fit_pool.close()
            fit_pool.join()
    elif mut_type=='double':
        usable_fits_dm=getusable_fits_list(prj_dh,data_fit_dh='data_fit_dm')
        if len(usable_fits_dm)==0:
            logging.info("already processed: data_lbl2data_fit")
        elif test:
            # NOTE(review): test mode calls data_lbl2data_fit_dm while the
            # regular path calls data_lbl2data_fit_lite -- confirm intended.
            data_lbl2data_fit_dm(usable_fits_dm[0],prj_dh,data_lbl_dh='data_lbl_dm',
                    data_fit_dh='data_fit_dm')
        else:
            for fit_pair in usable_fits_dm:
                data_lbl2data_fit_lite(fit_pair,prj_dh,data_lbl_dh='data_lbl_dm',
                    data_fit_dh='data_fit_dm')
    logging.shutdown()
예제 #14
0
def test_configure():
    """Smoke test: run `configure.main` on the project directory 'prj'."""
    from dms2dfe import configure as cfg
    cfg.main('prj')