Exemplo n.º 1
0
def make_linreg(expt_dir,
                name,
                gene_seq_fname,
                gene_len_fname,
                tr_codons_fname,
                te_codons_fname,
                outputs_fname,
                rel_cod_idxs=False,
                rel_nt_idxs=False,
                rel_struc_idxs=False,
                struc_fname=False):
    """
    Sets up, fits and saves a linear regression model for an experiment.

    Calls linreg.linreg_init, builds the training/test matrices with
    proc.get_data_matrices, fits with linreg.train_and_predict, and pickles
    the results into expt_dir/linreg/name:
        wts.pkl, y_tr.pkl, y_tr_hat.pkl, y_te.pkl, y_te_hat.pkl

    Args:
        expt_dir - experiment directory, parent of the linreg/ directory
        name - name of this model, used as the output subdirectory name
        gene_seq_fname - passed to linreg_init and proc.get_cds_dict
        gene_len_fname - passed to linreg_init and proc.get_len_dict
        tr_codons_fname - training set codons file name
        te_codons_fname - test set codons file name
        outputs_fname - outputs file name
        rel_cod_idxs - forwarded to linreg_init and get_data_matrices,
            default False
        rel_nt_idxs - forwarded to linreg_init and get_data_matrices,
            default False
        rel_struc_idxs - forwarded to get_data_matrices only, default False
        struc_fname - if truthy, loaded with proc.get_struc_dict; otherwise
            struc_dict=False is handed to get_data_matrices

    Returns:
        tuple (wts, y_tr_hat, y_te_hat, y_tr, y_te)
    """
    # NOTE(review): rel_struc_idxs is not passed to linreg_init -- confirm
    # that this is intentional.
    linreg.linreg_init(
        expt_dir, name, gene_seq_fname, gene_len_fname,
        tr_codons_fname, te_codons_fname, outputs_fname,
        rel_cod_idxs=rel_cod_idxs, rel_nt_idxs=rel_nt_idxs)

    # Transcriptome dicts needed to build the feature matrices
    len_dict = proc.get_len_dict(gene_len_fname)
    cds_dict = proc.get_cds_dict(gene_seq_fname, len_dict)
    struc_dict = proc.get_struc_dict(struc_fname) if struc_fname else False

    X_tr, X_te, y_tr, y_te = proc.get_data_matrices(
        cds_dict, tr_codons_fname, te_codons_fname, outputs_fname,
        rel_cod_idxs=rel_cod_idxs, rel_nt_idxs=rel_nt_idxs,
        rel_struc_idxs=rel_struc_idxs, struc_dict=struc_dict)

    wts, y_tr_hat, y_te_hat = linreg.train_and_predict(X_tr, X_te, y_tr, y_te)

    # Save weights, observed values and predictions, in this order
    out_dir = "{0}/linreg/{1}".format(expt_dir, name)
    to_pickle = (
        (wts, "{0}/wts.pkl"),
        (y_tr, "{0}/y_tr.pkl"),
        (y_tr_hat, "{0}/y_tr_hat.pkl"),
        (y_te, "{0}/y_te.pkl"),
        (y_te_hat, "{0}/y_te_hat.pkl"),
    )
    for obj, path_template in to_pickle:
        proc.pickle_obj(obj, path_template.format(out_dir))

    return wts, y_tr_hat, y_te_hat, y_tr, y_te
Exemplo n.º 2
0
        # Init vars
        rel_cod_idxs = range(-5, 5)
        rel_nt_idxs = range(-15, 15)
        rel_struc_idxs = range(-17, -14)

        nn_dir = nn_parent_dir + "/" + model_name +\
            "_rep{0}".format(model_rep)

        cod_trunc_5p = 20
        cod_trunc_3p = 20
        min_tot_cts = 200
        min_cod_w_cts = 100

        # Load transcriptome dicts
        len_dict = proc.get_len_dict(gene_len_fname)
        cds_dict = proc.get_cds_dict(gene_seq_fname, len_dict)
        struc_dict = proc.get_struc_dict(struc_fname)

        # Compute cts_by_codon and outputs
        cts_by_codon = proc.load_cts_by_codon(cts_by_codon_fname)
        outputs = proc.load_outputs(outputs_fname)
        paralog_groups = proc.load_paralog_groups(paralog_groups_fname)

        # Get gene list to compute performance metrics
        gene_set = cts_by_codon.keys()
        # Filter genes that are shorter than truncation regions
        gene_set = filter(lambda gene: len(cts_by_codon[gene]) > (cod_trunc_5p +
            cod_trunc_3p), gene_set)
        # Filter genes that don't have enough counts to meet cutoffs
        gene_set = filter(lambda gene: proc.has_enough_cts( \
            cts_by_codon[gene][cod_trunc_5p:-cod_trunc_3p], min_tot_cts, \
Exemplo n.º 3
0
def process_sam_file(expt_dir,
                     sam_fname,
                     gene_seq_fname,
                     gene_len_fname,
                     shift_dict,
                     cod_trunc_5p,
                     cod_trunc_3p,
                     min_fp_size,
                     max_fp_size,
                     num_tr_genes,
                     num_te_genes,
                     min_cts_per_gene,
                     min_cod_w_data,
                     raw_psct=0,
                     paralog_groups_fname=False,
                     overwrite=False,
                     folds=False):
    """
    Processes an RP sam file for an experiment.
    Makes in expt_dir/process:
        cts_by_codon file   (sum sam map wts per codon)
        outputs file        (scaled cts_by_codon, each gene mean centered at 1)
        tr_set_bounds file  (training set file name handed to
                             proc.make_codon_set_files)
        te_set_bounds file  (test set file name handed to
                             proc.make_codon_set_files)

    Args: 
        expt_dir (str) - name of experiment directory
        sam_fname (str) - name of input sam file
        gene_seq_fname (str) - name of transcriptome fasta file
        gene_len_fname (str) - name of gene lengths file
        shift_dict (dict): 
            {fp_size (int): 
                {frame (int): shift (int, or False) for frame in range(2)}
                for fp_size in range(min_fp_size, max_fp_size + 1) }
            NOTE(review): range(2) only covers frames 0 and 1 -- confirm
            whether frame 2 is expected as well
        cod_trunc_5p (int): number of codons to exclude at start of each CDS
        cod_trunc_3p (int): number of codons to exclude at end of each CDS
        min_fp_size (int): minimum size footprint to accept in sam file
        max_fp_size (int): maximum size footprint to accept in sam file
        num_tr_genes (int): number of genes to sort into training set
        num_te_genes (int): number of genes to sort into test set
        min_cts_per_gene (int): 
            cutoff for total cts on gene to include gene in tr/te sets
        min_cod_w_data (int):
            cutoff for codons with data to include gene in tr/te sets
        raw_psct (float): psct to add to raw cts_by_codon values
            when truthy, ".raw_psct.<raw_psct>" is inserted before the
            ".txt" extension of the outputs file name
        paralog_groups_fname (str):
            file containing one group of mutually paralogous genes on each line
            results in filtering out all but one paralog (NOTE which one?)
        overwrite (bool): flag to overwrite processed files, default False
            when True, existing cts_by_codon and outputs files are rewritten;
            the flag is also forwarded to proc.make_codon_set_files
        folds (bool): 
            currently unused
            might implement this later to divide genes into folds
            rather than explicit training/test sets

    Returns: 
        None, makes files listed above
    """
    # Load CDS dict, len dict
    len_dict = proc.get_len_dict(gene_len_fname)
    cds_dict = proc.get_cds_dict(gene_seq_fname, len_dict)

    # Process and write cts by codon file
    cts_by_codon = proc.get_cts_by_codon(sam_fname, cds_dict, len_dict,
                                         shift_dict, min_fp_size, max_fp_size)
    cts_by_codon_fname = expt_dir + \
        "/process/cts_by_codon.size.{0}.{1}.txt".format(
            min_fp_size, max_fp_size)
    # Honor the overwrite flag: an existing file is only kept when the caller
    # did not ask for processed files to be regenerated.
    if overwrite or not os.path.isfile(cts_by_codon_fname):
        print("making file " + cts_by_codon_fname)
        proc.write_cts_by_codon(cts_by_codon_fname, cts_by_codon)
    else:
        print("file " + cts_by_codon_fname + " already exists")

    # Process and write outputs file
    outputs = proc.get_outputs(cts_by_codon,
                               cod_trunc_5p,
                               cod_trunc_3p,
                               raw_psct=raw_psct)
    outputs_fname = expt_dir + \
        "/process/outputs.size.{0}.{1}.txt".format(min_fp_size, max_fp_size)
    if raw_psct:
        # Strip the trailing ".txt" and tag the file with the pseudocount
        outputs_fname = outputs_fname[:-4] + \
            ".raw_psct.{0}.txt".format(raw_psct)
    if overwrite or not os.path.isfile(outputs_fname):
        print("making file " + outputs_fname)
        proc.write_outputs(outputs_fname, outputs)
    else:
        print("file " + outputs_fname + " already exists")

    # Make training and test set codon files
    # Both file names share the same parameter suffix; build it once so the
    # two names cannot drift apart.
    set_suffix = \
        "{0}.{1}.trunc.{2}.{3}.min_cts.{4}.min_cod.{5}.top.{6}.txt".format(
            min_fp_size, max_fp_size, cod_trunc_5p, cod_trunc_3p,
            min_cts_per_gene, min_cod_w_data, num_tr_genes + num_te_genes)
    tr_set_fname = expt_dir + "/process/tr_set_bounds.size." + set_suffix
    te_set_fname = expt_dir + "/process/te_set_bounds.size." + set_suffix
    print("making file " + tr_set_fname)
    print("making file " + te_set_fname)
    proc.make_codon_set_files(cds_dict,
                              cts_by_codon_fname,
                              outputs_fname,
                              tr_set_fname,
                              te_set_fname,
                              num_tr_genes,
                              num_te_genes,
                              cod_trunc_5p,
                              cod_trunc_3p,
                              min_cts_per_gene,
                              min_cod_w_data,
                              paralog_groups_fname=paralog_groups_fname,
                              overwrite=overwrite)