def num_pass_excel_gene_level(fail_pass_excel_dict, cur, ref_genome, out_path,
                              biotype, gencode, transcript_gene_map):
    """
    Draws a stacked barplot with, for every genome, the percentage of genes falling in each of four mutually
    exclusive bins: Excellent, Pass, Fail and NoAln.

    A gene is placed in the best bin reached by any of its transcripts: Excellent first, then Pass (genes with a
    pass transcript but no excellent one), then Fail (genes with a failing transcript and nothing better). Genes
    present in transcript_gene_map but seen in none of the three ID collections are counted as NoAln.

    :param fail_pass_excel_dict: mapping of genome -> (fail_ids, pass_specific_ids, excel_ids), each an iterable of
                                 alignment IDs. Must provide iteritems().
    :param cur: not used by this function.
    :param ref_genome: not used by this function.
    :param out_path: passed through to plot_lib.stacked_barplot.
    :param biotype: biotype name used in the plot title (underscores are shown as spaces).
    :param gencode: gene set name used in the output file name and the plot title.
    :param transcript_gene_map: mapping of transcript ID -> gene ID; its distinct values define the total gene count.
    :raises KeyError: if an alignment ID, once stripped of its alignment number, is not in transcript_gene_map.
    :raises AssertionError: if any of the four bin counts is negative.
    """
    file_name = "{}_num_pass_excel_gene_level".format(gencode)
    # The total gene count does not depend on the genome, so compute it once. Doing it here (rather than inside
    # the loop) also keeps it defined for the title below when fail_pass_excel_dict is empty.
    num_genes = len(set(transcript_gene_map.values()))

    def genes_for(aln_ids):
        # Collapse alignment IDs to the set of genes they belong to.
        return {
            transcript_gene_map[psl_lib.strip_alignment_numbers(aln_id)]
            for aln_id in aln_ids
        }

    results = []
    for genome, (fail_ids, pass_specific_ids,
                 excel_ids) in fail_pass_excel_dict.iteritems():
        excel_genes = genes_for(excel_ids)
        pass_specific_genes = genes_for(pass_specific_ids)
        fail_genes = genes_for(fail_ids)
        # Each gene is counted once, in the best bin it reaches.
        num_excel_genes = len(excel_genes)
        num_pass_genes = len(pass_specific_genes - excel_genes)
        num_fail_genes = len(fail_genes - (pass_specific_genes | excel_genes))
        num_no_aln = num_genes - (num_excel_genes + num_pass_genes +
                                  num_fail_genes)
        raw = np.array(
            [num_excel_genes, num_pass_genes, num_fail_genes, num_no_aln])
        # A negative count would mean the ID sets contain genes absent from transcript_gene_map's values.
        assert all(x >= 0 for x in raw)
        # Dividing by 1% of the total turns the counts into percentages.
        norm = raw / (0.01 * num_genes)
        results.append([genome, norm])
    title_string = "Proportion of {:,} {} genes in {}\nwith at least one transcript categorized as Excellent/Pass/Fail"
    title_string = title_string.format(num_genes, biotype.replace("_", " "),
                                       gencode)
    legend_labels = ["Excellent", "Pass", "Fail", "NoAln"]
    plot_lib.stacked_barplot(results, legend_labels, out_path, file_name,
                             title_string)
Exemplo n.º 2
0
def highest_cov_aln(cur, genome, filter_chroms=None):
    """
    Selects one alignment per source transcript from the stats reported by get_stats in transMap mode: the one
    with the highest coverage, with ties broken by the highest identity.

    Returns a dict mapping transcript ID -> [alignment ID, coverage, identity]. Transcripts that have no entry in
    the stats do not appear in the result.
    """
    stats = get_stats(cur, genome, mode="transMap", filter_chroms=filter_chroms)

    # Group every alignment under the transcript it was derived from.
    by_transcript = defaultdict(list)
    for aln_id, (cov, ident) in stats.iteritems():
        source_tx = psl_lib.strip_alignment_numbers(aln_id)
        by_transcript[source_tx].append([aln_id, cov, ident])

    def rank(entry):
        # Negated so that an ascending sort puts the highest coverage, then the highest identity, first.
        return [-entry[1], -entry[2]]

    return {
        source_tx: sorted(entries, key=rank)[0]
        for source_tx, entries in by_transcript.iteritems()
    }
Exemplo n.º 3
0
def build_pass_track(target, args):
    """
    Writes a colour-coded BED file of the transcripts for the current mode and converts it to bigBed.

    Every record of the mode's genePred is written. Records whose ID is in the selected ID set are coloured by
    biotype (protein_coding vs. everything else); all other records get the "not_pass" colour. In "reference" mode
    the selected set is inverted: it holds the transcripts that are NOT returned by the evaluation query.

    target is not used by this function. args must provide mode, outDir, refGenome, sizes and, depending on the
    mode, genome plus one of augustusGp / annotationGp / targetGp.
    Raises RuntimeError if args.mode is not one of "augustus", "reference" or "transMap".
    """
    colors = {
        "coding": "59,101,69",
        "noncoding": "98,124,191",
        # NOTE(review): unlike the two entries above this is not an "R,G,B" triple - confirm the intended colour.
        "not_pass": "******"
    }
    con, cur = sql_lib.attach_databases(args.outDir, args.mode)
    biotype_map = sql_lib.get_transcript_biotype_map(cur, args.refGenome)

    # Each branch selects the highlighted IDs and records which genome and genePred attribute the mode uses.
    if args.mode == "augustus":
        selected_ids = sql_lib.get_query_ids(
            cur, etc.config.augustusEval(args.genome, args.refGenome))
        mode_name, track_genome, gp_attr = "augustus", args.genome, "augustusGp"
    elif args.mode == "reference":
        # For the reference we highlight what did NOT pass, so take the complement of the query result.
        passing = sql_lib.get_query_ids(cur, etc.config.refEval(args.refGenome))
        selected_ids = biotype_map.viewkeys() - passing
        mode_name, track_genome, gp_attr = "reference", args.refGenome, "annotationGp"
    elif args.mode == "transMap":
        selected_ids = get_all_tm_pass(cur, args.refGenome, args.genome)
        mode_name, track_genome, gp_attr = "transMap", args.genome, "targetGp"
    else:
        raise RuntimeError(
            "Somehow your argparse object does not contain a valid mode.")

    out_pass_bed_path, out_pass_big_bed_path = get_bed_paths(
        args.outDir, mode_name, track_genome)
    gp_dict = seq_lib.get_transcript_dict(getattr(args, gp_attr))

    with open(out_pass_bed_path, "w") as outf:
        for aln_id, rec in gp_dict.iteritems():
            tx_id = psl_lib.strip_alignment_numbers(aln_id)
            # The biotype is only looked up for selected IDs.
            if aln_id not in selected_ids:
                color_key = "not_pass"
            elif biotype_map[tx_id] == "protein_coding":
                color_key = "coding"
            else:
                color_key = "noncoding"
            bed = rec.get_bed(rgb=colors[color_key])
            outf.write("\t".join(map(str, bed)) + "\n")
    make_big_bed(out_pass_bed_path, args.sizes, out_pass_big_bed_path)
def build_data_dict(id_names, id_list, transcript_gene_map,
                    gene_transcript_map):
    """
    Sorts alignment IDs into named bins, organised as gene_id -> transcript_id -> OrderedDict(bin name -> [aln_ids]).

    id_names gives the bin names (and their order); id_list gives, in the same order, the alignment IDs belonging
    to each bin. Every transcript listed in gene_transcript_map gets an entry with all bins present, empty or not.
    Alignment IDs whose transcript is missing from transcript_gene_map, or whose gene/transcript pair is not in
    gene_transcript_map, are ignored. Returns a defaultdict(dict).
    """
    data_dict = defaultdict(dict)

    # Start every known transcript off with one empty list per bin.
    for gene in gene_transcript_map:
        for transcript in gene_transcript_map[gene]:
            empty_bins = OrderedDict([(name, []) for name in id_names])
            data_dict[gene][transcript] = empty_bins

    # Drop each alignment ID into the bin of its source transcript.
    for bin_members, bin_name in zip(id_list, id_names):
        for aln_id in bin_members:
            transcript = psl_lib.strip_alignment_numbers(aln_id)
            if transcript not in transcript_gene_map:
                # Augustus was fed chrY transcripts
                continue
            gene = transcript_gene_map[transcript]
            if gene not in data_dict or transcript not in data_dict[gene]:
                continue
            data_dict[gene][transcript][bin_name].append(aln_id)
    return data_dict
def fix_gene_pred(gp, transcript_gene_map):
    """
    Cleans up a collection of tab-separated genePred lines and sorts them by genomic coordinates.

    For every record:
      * column 10 is set to the original (unique Aug/TM) name, so it serves as the unique identifier;
      * column 0 is replaced by the name with its alignment number stripped (the transcript ID);
      * column 11 (name2) is set to the gene ID looked up in transcript_gene_map.
    Records are ordered by column 1 and then by column 3 compared as an integer.

    :param gp: iterable of tab-separated genePred lines with at least 12 columns.
    :param transcript_gene_map: mapping of transcript ID -> gene ID.
    :return: (number of distinct genes, number of distinct transcripts, list of fixed tab-joined lines).
    :raises KeyError: if a stripped transcript ID is not in transcript_gene_map.
    :raises ValueError: if column 3 of a record is not an integer.
    """
    # Column 3 must be compared numerically: as a string "1000" would sort before "200" and the output would not
    # be in coordinate order.
    records = sorted((line.split("\t") for line in gp),
                     key=lambda rec: (rec[1], int(rec[3])))
    genes = set()
    txs = set()
    fixed = []
    for rec in records:
        aln_id = rec[0]
        rec[10] = aln_id  # use unique Aug/TM ID as unique identifier
        tx_id = psl_lib.strip_alignment_numbers(aln_id)
        gene_id = transcript_gene_map[tx_id]
        rec[0] = tx_id
        rec[11] = gene_id
        fixed.append("\t".join(rec))
        genes.add(gene_id)
        txs.add(tx_id)
    return len(genes), len(txs), fixed
Exemplo n.º 6
0
# Scratch analysis: dump the classifier values of the best alignment of every transcript belonging to a failing
# gene, and collect the genes of transcripts without a transMap alignment.
# Relies on names defined outside this snippet: cur, con, genome, gene_fail_evaluation, transcript_evaluation,
# gene_transcript_map and transcript_gene_map.
highest_covs = sql_lib.highest_cov_aln(cur, genome)
# Keep only the first element of each value - presumably the alignment ID of the best alignment for the
# transcript; confirm against sql_lib.highest_cov_aln.
highest_cov_map = {x: y[0] for x, y in highest_covs.iteritems()}

gene_fail_ids = set(gene_fail_evaluation["Fail"])
# For every failing gene, gather the selected alignment ID of each of its transcripts that has one.
transcript_fail_ids = []
for x in gene_fail_ids:
    t_ids = gene_transcript_map[x]
    for t_id in t_ids:
        if t_id in highest_cov_map:
            a_id = highest_cov_map[t_id]
            transcript_fail_ids.append(a_id)

# NOTE(review): the table name 'gorilla' is hard-coded here although `genome` is used above - confirm they agree.
df = pd.read_sql("Select AlignmentId,{} FROM main.'gorilla'".format(",".join(etc.config.all_classifiers)), con, index_col="AlignmentId")
# NOTE(review): DataFrame.ix is deprecated/removed in newer pandas releases; this only works on older versions.
df2 = df.ix[transcript_fail_ids]
# Map each selected alignment ID to its gene, keyed by alignment ID so it lines up with the frame's index.
gene_ids = {n: transcript_gene_map[psl_lib.strip_alignment_numbers(n)] for n in df2.index}
# Work on a copy so that adding the GeneId column does not touch df2.
df3 = df2.copy()
df3["GeneId"] = pd.Series(gene_ids)
df3.to_csv("failed_gene_classifiers.tsv", sep="\t")

# Genes of the IDs listed under "NoTransMap"; no_tm_genes is not used within the visible part of the script.
no_tm_ids = set(transcript_evaluation["NoTransMap"])
no_tm_genes = {n: transcript_gene_map[psl_lib.strip_alignment_numbers(n)] for n in no_tm_ids}



def augustusEval(genome):
    query = ("SELECT augustus.'gorilla'.AugustusAlignmentId FROM augustus_attributes.'{0}' JOIN main.'{0}' ON "
             "main.'{0}'.AlignmentId = augustus_attributes.'{0}'.AlignmentId JOIN augustus.'{0}' USING "
             "(AugustusAlignmentId) WHERE (AugustusNotSameStart = 0 OR "
             "(main.'{0}'.HasOriginalStart = 1 OR main.'{0}'.StartOutOfFrame = 1)) AND "
             "(AugustusNotSameStop = 0 OR HasOriginalStop = 1) AND "