def main():
    """
    Runs the transMap analysis helpers once per transcript-level biotype of the reference genome.

    For every biotype returned by sql_lib.get_all_biotypes (gene_level=False) the biotype's transcript ids and its
    transcript -> gene map are loaded. Biotypes with 50 or fewer ids are skipped. For the remaining ones the
    directory <outDir>/transmap_analysis/<biotype> is created and cov_ident_wrapper, cat_plot_wrapper,
    paralogy_plot, num_pass_excel and num_pass_excel_gene_level are called in that order.
    """
    args = parse_args()
    genomes = args.genomes
    ref_genome = args.refGenome
    gencode = args.gencode
    filter_chroms = args.filterChroms
    con, cur = sql_lib.attach_databases(args.comparativeAnnotationDir,
                                        mode="transMap")
    highest_cov_dict = sql_lib.get_highest_cov_alns(cur, genomes, filter_chroms)
    all_biotypes = sql_lib.get_all_biotypes(cur, ref_genome, gene_level=False)
    for biotype in all_biotypes:
        biotype_ids = sql_lib.get_biotype_ids(
            cur, ref_genome, biotype, filter_chroms=filter_chroms)
        transcript_gene_map = sql_lib.get_transcript_gene_map(
            cur, ref_genome, biotype, filter_chroms=filter_chroms)
        # hardcoded cutoff to avoid issues where this biotype/gencode mix is nearly empty
        if len(biotype_ids) <= 50:
            continue
        fail_pass_excel_dict = get_fail_pass_excel_dict(
            cur, ref_genome, genomes, highest_cov_dict, biotype, filter_chroms)
        out_path = os.path.join(args.outDir, "transmap_analysis", biotype)
        mkdir_p(out_path)
        # the helpers below all target the per-biotype output directory
        cov_ident_wrapper(
            highest_cov_dict, genomes, out_path, biotype, gencode, biotype_ids)
        cat_plot_wrapper(
            cur, highest_cov_dict, genomes, out_path, biotype, gencode,
            biotype_ids)
        paralogy_plot(cur, genomes, out_path, biotype, biotype_ids, gencode)
        num_pass_excel(
            fail_pass_excel_dict, cur, ref_genome, out_path, biotype, gencode,
            biotype_ids)
        num_pass_excel_gene_level(
            fail_pass_excel_dict, cur, ref_genome, out_path, biotype, gencode,
            transcript_gene_map)
def align_wrapper(target, recs, file_tree, ref_tx_fasta, target_genome_fasta,
                  comp_ann_path, ref_genome, mode):
    """
    Aligns each tab-separated genePred record in recs and writes the results to a temp file from file_tree.

    When mode is "cgp", a gene -> transcript map (biotype="protein_coding") is loaded from the reference databases
    under comp_ann_path. A record is then passed to align_cgp only if at least one of the comma-separated names
    in its name2 field is present in that map; records with no match are dropped.
    For any other mode every record is passed to align_consensus.
    Each collected result is written as one line with its items joined by commas.
    """
    tmp_dir = target.getGlobalTempDir()
    is_cgp = mode == "cgp"
    if is_cgp:
        con, cur = attach_databases(comp_ann_path, mode="reference")
        gene_transcript_map = get_gene_transcript_map(
            cur, ref_genome, biotype="protein_coding")
    results = []
    for rec in recs:
        gp = GenePredTranscript(rec.rstrip().split("\t"))
        gene_names = gp.name2.split(",")
        if not is_cgp:
            results.append(
                align_consensus(tmp_dir, gp, target_genome_fasta, ref_tx_fasta))
            continue
        # keep only the genes of this record that have known transcripts
        tx_dict = {}
        for gene_name in gene_names:
            if gene_name in gene_transcript_map:
                tx_dict[gene_name] = gene_transcript_map[gene_name]
        if tx_dict:
            results.extend(
                align_cgp(tmp_dir, gp, target_genome_fasta, tx_dict,
                          ref_tx_fasta))
    with open(file_tree.getTempFile(), "w") as outf:
        for result in results:
            outf.write(",".join(result) + "\n")
Пример #3
0
def main():
    """
    Loads the per-biotype evaluations and hands them to the plotting helpers.

    For every gene-level biotype of the reference genome the evaluations are loaded from args.workDir. Biotypes
    with no transcript evaluations are skipped. Biotypes listed in args.biotypes get the per-biotype helpers
    (transcript_gene_plot, size_plot, gene_fail_plot, dup_rate_plot). Every non-empty biotype additionally has its
    collapsed transcript and gene counts added to a per-genome counter under its biotype bin ("Other" when the
    biotype is not in the bin dictionary). The two counters are finally passed to biotype_stacked_plot.
    """
    args = parse_args()
    mkdir_p(args.outDir)
    biotype_tx_counter = DefaultOrderedDict(lambda: defaultdict(int))
    biotype_gene_counter = DefaultOrderedDict(lambda: defaultdict(int))
    # NOTE(review): eval() is applied to a string built from args.gencode. If that argument can ever come from
    # an untrusted source this executes arbitrary code; a getattr() lookup on etc.config would avoid that.
    gencode_biotype_bin_dict = eval("etc.config.{}".format(args.gencode))
    con, cur = sql_lib.attach_databases(args.compAnnPath, mode="reference")
    biotypes = sql_lib.get_all_biotypes(cur, args.refGenome, gene_level=True)
    for biotype in biotypes:
        (tx_evals, gene_evals, gene_fail_evals, tx_dup_rate, tx_counts,
         gene_counts) = load_evaluations(args.workDir, args.genomes, biotype)
        if len(tx_evals) == 0:  # a biotype may have nothing
            continue
        if biotype in args.biotypes:
            per_level = (
                ("transcripts", tx_evals, tx_counts),
                ("genes", gene_evals, gene_counts),
            )
            for mode, evals, counts in per_level:
                transcript_gene_plot(evals, args.outDir, args.gencode, mode,
                                     biotype)
                size_plot(counts, args.outDir, args.gencode, mode, biotype)
            gene_fail_plot(gene_fail_evals, args.outDir, args.gencode, biotype)
            dup_rate_plot(tx_dup_rate, args.outDir, args.gencode, biotype)
        biotype_bin = gencode_biotype_bin_dict.get(biotype, "Other")
        tx_evals_collapsed = collapse_evals(tx_evals)
        gene_evals_collapsed = collapse_evals(gene_evals)
        # accumulate this biotype into its bin, separately for every genome
        for genome, tx_count in tx_evals_collapsed.iteritems():
            biotype_tx_counter[genome][biotype_bin] += tx_count
            gene_count = gene_evals_collapsed[genome]
            biotype_gene_counter[genome][biotype_bin] += gene_count
    stacked = (
        ("transcript", biotype_tx_counter),
        ("gene", biotype_gene_counter),
    )
    for mode, counter in stacked:
        biotype_stacked_plot(counter, args.outDir, args.gencode, mode)
Пример #4
0
def main():
    """
    Combines CGP transcripts with the consensus gene set for args.genome and writes the result to disk.

    Steps, in order:
      1. Build gene <-> transcript maps (protein_coding only) from the reference databases.
      2. Open the CGP sqlite database and load the CGP and consensus alignment statistics for args.genome.
      3. Load the consensus and CGP genePreds and the intron bits.
      4. Call find_new_transcripts, find_missing_transcripts, update_transcripts and evaluate_cgp_consensus,
         which share the final_consensus and metrics dictionaries.
      5. Pickle metrics to <metricsOutDir>/<genome>.metrics.pickle and write final_consensus to args.outGp as
         genePred lines ordered by (chromosome, start).
    """
    args = parse_args()
    # the regular comparativeAnnotator reference databases are only needed for the gene/transcript maps
    con, cur = sql_lib.attach_databases(args.compAnnPath, mode="reference")
    gene_transcript_map = sql_lib.get_gene_transcript_map(
        cur, args.refGenome, biotype="protein_coding")
    transcript_gene_map = sql_lib.get_transcript_gene_map(
        cur, args.refGenome, biotype="protein_coding")
    # from here on con/cur refer to the CGP database instead
    con, cur = sql_lib.open_database(os.path.join(args.compAnnPath, args.cgpDb))
    # both transcript sets, as dictionaries
    consensus_dict = seq_lib.get_transcript_dict(args.consensusGp)
    cgp_dict = seq_lib.get_transcript_dict(args.cgpGp)
    # BLAT results stored in the sqlite database
    cgp_stats_dict = sql_lib.get_multi_index_query_dict(
        cur,
        "SELECT CgpId,EnsId,AlignmentCoverage,AlignmentIdentity FROM '{}_cgp'".format(args.genome),
        num_indices=2)
    consensus_stats_dict = sql_lib.get_query_dict(
        cur,
        "SELECT EnsId,AlignmentCoverage,AlignmentIdentity FROM '{}_consensus'".format(args.genome))
    intron_dict = load_intron_bits(args.intronBitsPath)
    # output containers, filled in by the helper calls below
    final_consensus = {}
    metrics = {}
    # CGP transcripts which have no associated genes
    find_new_transcripts(cgp_dict, final_consensus, metrics)
    # CGP transcripts whose associated genes are not in the consensus
    consensus_genes = set(tx.name2 for tx in consensus_dict.itervalues())
    find_missing_transcripts(cgp_dict, consensus_genes, intron_dict,
                             final_consensus, metrics, consensus_dict,
                             gene_transcript_map)
    # anything already placed in final_consensus must not be evaluated for updating
    cgp_dict = {
        cgp_id: cgp_tx
        for cgp_id, cgp_tx in cgp_dict.iteritems()
        if cgp_id not in final_consensus
    }
    update_transcripts(cgp_dict, consensus_dict, args.genome,
                       gene_transcript_map, transcript_gene_map, intron_dict,
                       final_consensus, metrics, cgp_stats_dict,
                       consensus_stats_dict)
    evaluate_cgp_consensus(final_consensus, metrics)

    def by_position(item):
        # item is a (transcript id, transcript) pair
        tx = item[1]
        return [tx.chromosome, tx.start]

    # write results out to disk
    metrics_path = os.path.join(args.metricsOutDir,
                                args.genome + ".metrics.pickle")
    with open(metrics_path, "w") as outf:
        pickle.dump(metrics, outf)
    with open(args.outGp, "w") as outf:
        for tx_id, tx in sorted(final_consensus.iteritems(), key=by_position):
            outf.write("\t".join(map(str, tx.get_gene_pred())) + "\n")
Пример #5
0
def build_classifier_tracks(target, query_fn, genome, args):
    """
    Runs the query produced by query_fn(genome) and turns the non-None result fields into a BED and bigBed file.

    The output paths come from get_bed_paths, keyed on args.outDir, the name of query_fn and genome. Every non-None
    field of every result row is written to the BED file as-is, after which make_big_bed is called with
    args.sizes.
    """
    query = query_fn(genome)
    query_name = query_fn.__name__
    con, cur = sql_lib.attach_databases(args.outDir, mode=args.mode)
    rows = sql_lib.execute_query(cur, query)
    bed_path, big_bed_path = get_bed_paths(args.outDir, query_name, genome)
    with open(bed_path, "w") as outf:
        # flatten rows -> fields, dropping empty fields
        for field in (f for row in rows for f in row if f is not None):
            outf.write(field)
    make_big_bed(bed_path, args.sizes, big_bed_path)
Пример #6
0
def main_ref_fn(target, comp_ann_path, gencode, ref_genome, base_out_path, filter_chroms):
    """
    Produces the reference classifier barplot and schedules the clustering child target for one biotype.

    Output goes to <base_out_path>/clustering/<ref_genome>. Nothing beyond creating that directory happens when the
    biotype has 50 or fewer transcript ids.

    NOTE(review): `biotype` is neither a parameter nor a local of this function, so it has to be resolvable from
    an enclosing/module scope when this runs -- TODO confirm where it is defined.
    """
    out_path = os.path.join(base_out_path, "clustering", ref_genome)
    mkdir_p(out_path)
    con, cur = sql_lib.attach_databases(comp_ann_path, mode="reference")
    biotype_ids = sql_lib.get_biotype_ids(cur, ref_genome, biotype, filter_chroms=filter_chroms)
    if len(biotype_ids) <= 50:
        return
    sql_data = sql_lib.load_data(con, ref_genome, etc.config.ref_classifiers, primary_key="TranscriptId")
    # barplot of failed classifiers
    out_barplot_file = os.path.join(out_path, "reference_barplot_{}".format(gencode))
    barplot_title = "Classifiers failed by {} transcripts in the reference set {}\n".format(
        biotype.replace("_", " "), gencode)
    munged, stats = munge_data(sql_data, biotype_ids)
    plot_lib.barplot(stats, out_path, out_barplot_file, barplot_title)
    # the munged table is handed to the child target through a CSV in the global temp dir
    data_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
    munged.to_csv(data_path)
    out_cluster_file = os.path.join(out_path, "reference_clustering_{}".format(gencode))
    target.addChildTargetFn(
        r_wrapper,
        args=[data_path, "Hierarchical_clustering_of_transcript_classifiers", out_cluster_file])
Пример #7
0
def build_pass_track(target, args):
    """
    Writes a colour-coded BED (and bigBed) track of all transcripts for the current mode.

    The id set and the genePred to draw from depend on args.mode ("augustus", "reference" or "transMap"); any other
    mode raises RuntimeError. Every transcript in the genePred is written: ids inside the set are coloured by
    whether their biotype is protein_coding, ids outside it get the "not_pass" colour.
    Note that in "reference" mode the set holds the ids that did NOT come back from the refEval query.
    """
    colors = {
        "coding": "59,101,69",
        "noncoding": "98,124,191",
        "not_pass": "******"
    }
    con, cur = sql_lib.attach_databases(args.outDir, args.mode)
    biotype_map = sql_lib.get_transcript_biotype_map(cur, args.refGenome)
    mode = args.mode
    if mode == "augustus":
        pass_ids = sql_lib.get_query_ids(
            cur, etc.config.augustusEval(args.genome, args.refGenome))
        bed_path, big_bed_path = get_bed_paths(args.outDir, "augustus",
                                               args.genome)
        gp_dict = seq_lib.get_transcript_dict(args.augustusGp)
    elif mode == "reference":
        # for reference, we are more interested in what is NOT Good, so this set is actually "not pass"
        good_ids = sql_lib.get_query_ids(cur, etc.config.refEval(args.refGenome))
        pass_ids = biotype_map.viewkeys() - good_ids
        bed_path, big_bed_path = get_bed_paths(args.outDir, "reference",
                                               args.refGenome)
        gp_dict = seq_lib.get_transcript_dict(args.annotationGp)
    elif mode == "transMap":
        pass_ids = get_all_tm_pass(cur, args.refGenome, args.genome)
        bed_path, big_bed_path = get_bed_paths(args.outDir, "transMap",
                                               args.genome)
        gp_dict = seq_lib.get_transcript_dict(args.targetGp)
    else:
        raise RuntimeError(
            "Somehow your argparse object does not contain a valid mode.")
    with open(bed_path, "w") as outf:
        for aln_id, rec in gp_dict.iteritems():
            tx_id = psl_lib.strip_alignment_numbers(aln_id)
            # pick the colour first, then emit a single BED line
            if aln_id not in pass_ids:
                rgb = colors["not_pass"]
            elif biotype_map[tx_id] == "protein_coding":
                rgb = colors["coding"]
            else:
                rgb = colors["noncoding"]
            bed = rec.get_bed(rgb=rgb)
            outf.write("\t".join(map(str, bed)) + "\n")
    make_big_bed(bed_path, args.sizes, big_bed_path)
def main():
    """
    Builds the consensus for args.genome one biotype at a time and pickles the evaluation of each biotype.

    For every gene-level biotype: consensus_by_biotype is run, the result is deduplicated, and -- unless the
    deduplicated consensus is empty -- the genePreds are written via write_gps and an evaluation dictionary is
    built (evaluate_coding_consensus for protein_coding, evaluate_noncoding_consensus otherwise). The duplication
    count and the gene/transcript counts are added to that dictionary, which is pickled to
    <workDir>/<genome>_<biotype>.
    """
    args = parse_args()
    con, cur = sql_lib.attach_databases(args.compAnnPath, mode=args.mode)
    biotypes = sql_lib.get_all_biotypes(cur, args.refGenome, gene_level=True)
    transcript_gene_map = sql_lib.get_transcript_gene_map(
        cur, args.refGenome, biotype=None, filter_chroms=args.filterChroms)
    # load all Augustus and transMap transcripts into one big dict
    gps = load_gps(args.gps)
    consensus_base_path = os.path.join(args.outDir, args.genome)
    stats = get_stats(cur, args.genome, args.mode)
    ref_gene_intervals = build_ref_intervals(cur, args.genome)
    tgt_intervals = build_tgt_intervals(gps)
    for biotype in biotypes:
        gene_transcript_map = sql_lib.get_gene_transcript_map(
            cur, args.refGenome, biotype=biotype,
            filter_chroms=args.filterChroms)
        binned_transcripts, consensus = consensus_by_biotype(
            cur, args.refGenome, args.genome, biotype, gps,
            transcript_gene_map, gene_transcript_map, stats, args.mode,
            ref_gene_intervals, tgt_intervals)
        deduplicated_consensus, dup_count = deduplicate_consensus(
            consensus, gps, stats)
        if len(deduplicated_consensus) == 0:
            # some biotypes we may have nothing
            continue
        num_genes, num_txs = write_gps(deduplicated_consensus, gps,
                                       consensus_base_path, biotype,
                                       transcript_gene_map, args.mode)
        is_coding = biotype == "protein_coding"
        if is_coding:
            evals = evaluate_coding_consensus(
                binned_transcripts, stats, ref_gene_intervals, gps, args.mode)
        else:
            evals = evaluate_noncoding_consensus(binned_transcripts, stats, gps)
        pickle_path = os.path.join(args.workDir,
                                   "_".join([args.genome, biotype]))
        mkdir_p(os.path.dirname(pickle_path))
        # bookkeeping values stored alongside the evaluation itself
        evals["duplication_rate"] = dup_count
        evals["gene_counts"] = num_genes
        evals["tx_counts"] = num_txs
        with open(pickle_path, "w") as outf:
            pickle.dump(evals, outf)
Пример #9
0
def main_fn(target, comp_ann_path, gencode, genome, ref_genome, base_out_path, filter_chroms):
    """
    Produces transMap classifier barplots and schedules clustering child targets for one genome.

    Output goes to <base_out_path>/classifier_breakdown/<genome>. When the biotype has more than 50 transcript ids,
    one barplot and one clustering job are produced for the "Fail" ids and one for the "Pass/NotExcellent" ids.
    The excellent ids returned by get_fail_passing_excel_ids are not used here.

    NOTE(review): `biotype` is neither a parameter nor a local of this function, so it has to be resolvable from
    an enclosing/module scope when this runs -- TODO confirm where it is defined.
    """
    clust_title = "Hierarchical_clustering_of_transMap_classifiers"
    base_barplot_title = ("Classifiers failed by {} transcripts in the category {} in transMap analysis\n"
                          "Genome: {}.  Gencode set: {}.  {:,} ({:0.2f}%) of transcripts")
    out_path = os.path.join(base_out_path, "classifier_breakdown", genome)
    mkdir_p(out_path)
    con, cur = sql_lib.attach_databases(comp_ann_path, mode="transMap")
    fail_ids, passing_specific_ids, excellent_ids = sql_lib.get_fail_passing_excel_ids(cur, ref_genome, genome, biotype)
    biotype_ids = sql_lib.get_biotype_ids(cur, ref_genome, biotype, filter_chroms=filter_chroms)
    if len(biotype_ids) <= 50:
        return
    sql_data = sql_lib.load_data(con, genome, etc.config.clustering_classifiers)
    num_original_introns = sql_lib.load_data(con, genome, ["NumberIntrons"], table="attributes")
    categories = (("Fail", fail_ids), ("Pass/NotExcellent", passing_specific_ids))
    for mode, ids in categories:
        # "/" cannot appear in a file name, so the category is sanitized for the output paths
        suffix = "{}_{}_{}".format(genome, biotype, mode.replace("/", "_"))
        out_barplot_file = os.path.join(out_path, "barplot_" + suffix)
        percentage_of_set = 100.0 * len(ids) / len(biotype_ids)
        barplot_title = base_barplot_title.format(
            biotype.replace("_", " "), mode, genome, gencode, len(ids), percentage_of_set)
        munged, stats = munge_intron_data(sql_data, num_original_introns, ids)
        plot_lib.barplot(stats, out_path, out_barplot_file, barplot_title)
        # the munged table is handed to the child target through a CSV in the global temp dir
        data_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
        munged.to_csv(data_path)
        out_cluster_file = os.path.join(out_path, "clustering_" + suffix)
        target.addChildTargetFn(r_wrapper, args=[data_path, clust_title, out_cluster_file])
Пример #10
0
def main_augustus_fn(target, comp_ann_path, gencode, genome, base_out_path, filter_chroms):
    """
    Produces Augustus classifier barplots and schedules clustering child targets for one genome.

    Output goes to <base_out_path>/augustus_classifier_breakdown/<genome>. The Augustus alignment ids are first
    restricted to those whose id, after remove_augustus_alignment_number, is among the first elements of the
    highest_cov_aln values. They are then split by whether the id contains "I1" or "I2", and one barplot plus one
    clustering job is produced for each of the two.
    The filter_chroms parameter is accepted but not used in this function.
    """
    clust_title = "Hierarchical_clustering_of_augustus_classifiers"
    base_barplot_title = ("Augustus classifiers failed by {:,} transcripts derived from transMap\n"
                          "on the reference set {} with Augustus {}")
    out_path = os.path.join(base_out_path, "augustus_classifier_breakdown", genome)
    mkdir_p(out_path)
    con, cur = sql_lib.attach_databases(comp_ann_path, mode="augustus")
    highest_cov_dict = sql_lib.highest_cov_aln(cur, genome)
    # first element of every value; relies on zip() returning an indexable list (Python 2)
    highest_cov_ids = set(zip(*highest_cov_dict.itervalues())[0])
    sql_data = sql_lib.load_data(con, genome, etc.config.aug_classifiers, primary_key="AugustusAlignmentId",
                                 table="augustus")
    base_filter_set = set(
        aug_id for aug_id in sql_data.index
        if psl_lib.remove_augustus_alignment_number(aug_id) in highest_cov_ids)
    mode_descriptions = (("1", "trusting RNAseq less"), ("2", "trusting RNAseq more"))
    for mode, aug_mode in mode_descriptions:
        i = "I{}".format(mode)
        filter_set = set(aug_id for aug_id in base_filter_set if i in aug_id)
        suffix = "{}_{}_{}".format(genome, gencode, i)
        out_barplot_file = os.path.join(out_path, "augustus_barplot_" + suffix)
        barplot_title = base_barplot_title.format(len(filter_set), gencode, aug_mode)
        munged, stats = munge_data(sql_data, filter_set)
        plot_lib.barplot(stats, out_path, out_barplot_file, barplot_title)
        # the munged table is handed to the child target through a CSV in the global temp dir
        data_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
        munged.to_csv(data_path)
        out_cluster_file = os.path.join(out_path, "augustus_clustering_" + suffix)
        target.addChildTargetFn(r_wrapper, args=[data_path, clust_title, out_cluster_file])
Пример #11
0
# Scratch/interactive setup: hard-coded input paths for the gorilla (susie_3_2) data set.
ref_gp = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/transMap/2015-10-06/data/wgEncodeGencodeBasicV23.gp"
aug_gp = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/augustus/tmr/gorilla.output.gp"
aln_psl = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/transMap/2015-10-06/transMap/gorilla/transMapGencodeBasicV23.psl"
ref_psl =  "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/transMap/2015-10-06/data/wgEncodeGencodeBasicV23.psl"
ref_fasta = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/assemblies/susie_3_2/human.fa"
target_fasta = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/assemblies/susie_3_2/gorilla.fa"

# Load the genePred, PSL and FASTA inputs into dictionaries.
# NOTE(review): `gp` is not assigned anywhere in this snippet (only ref_gp and aug_gp are) -- confirm it is
# defined earlier, otherwise the next line raises NameError.
tx_dict = seq_lib.get_transcript_dict(gp)
ref_dict = seq_lib.get_transcript_dict(ref_gp)
aug_dict = seq_lib.get_transcript_dict(aug_gp)
aln_dict = psl_lib.get_alignment_dict(aln_psl)
ref_aln_dict = psl_lib.get_alignment_dict(ref_psl)
seq_dict = seq_lib.get_sequence_dict(target_fasta)
ref_seq_dict = seq_lib.get_sequence_dict(ref_fasta)

# Attach the comparativeAnnotation databases in augustus mode.
con, cur = sql_lib.attach_databases("/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/comparativeAnnotation/2015-10-12/GencodeBasicV23", mode="augustus")

# Convenience names; the statements below pass the same values as literals rather than using these.
genome = 'gorilla'
ref_genome = 'human'
biotype = 'protein_coding'
filter_chroms = ["Y", "chrY"]

stats = merge_stats(cur, 'gorilla')
highest_cov_dict = sql_lib.highest_cov_aln(cur, "gorilla")
# First element of every value in highest_cov_dict; indexing the zip() result relies on it being a list
# (Python 2) and raises IndexError when the dict is empty.
highest_cov_ids = set(zip(*highest_cov_dict.itervalues())[0])
biotype_ids = sql_lib.get_biotype_aln_ids(cur, 'gorilla', 'protein_coding')
# Keep only the highest-coverage ids that are also in biotype_ids.
highest_cov_ids &= biotype_ids
# Entries of stats whose key, with the Augustus alignment number removed, is one of the retained ids.
best_stats = {x: y for x, y in stats.iteritems() if psl_lib.remove_augustus_alignment_number(x) in highest_cov_ids}
# Subset whose key is itself one of the retained ids.
best_tm = {x: y for x, y in best_stats.iteritems() if x in highest_cov_ids}
# Subset whose key is NOT itself a retained id; the first condition repeats the filter already applied to
# best_stats, so only the `not in` test narrows the result here.
best_aug = {x: y for x, y in best_stats.iteritems() if psl_lib.remove_augustus_alignment_number(x) in highest_cov_ids and x not in highest_cov_ids}
# Four empty result buckets; nothing in the visible code fills them.
r = {"higher_cov": [], "higher_ident": [], "higher_both": [], "worse": []}