def main():
    """
    Driver for the transMap analysis plots.

    Walks every transcript-level biotype of the reference genome and, for each biotype with more than
    50 transcript ids, writes the coverage/identity, category, paralogy and pass/excellent plots into
    <outDir>/transmap_analysis/<biotype>.
    """
    args = parse_args()
    con, cur = sql_lib.attach_databases(args.comparativeAnnotationDir, mode="transMap")
    highest_cov_dict = sql_lib.get_highest_cov_alns(cur, args.genomes, args.filterChroms)
    all_biotypes = sql_lib.get_all_biotypes(cur, args.refGenome, gene_level=False)
    for biotype in all_biotypes:
        tx_ids = sql_lib.get_biotype_ids(cur, args.refGenome, biotype, filter_chroms=args.filterChroms)
        tx_to_gene = sql_lib.get_transcript_gene_map(cur, args.refGenome, biotype,
                                                     filter_chroms=args.filterChroms)
        # hardcoded cutoff to avoid issues where this biotype/gencode mix is nearly empty
        if len(tx_ids) <= 50:
            continue
        fail_pass_excel = get_fail_pass_excel_dict(cur, args.refGenome, args.genomes, highest_cov_dict,
                                                   biotype, args.filterChroms)
        plot_dir = os.path.join(args.outDir, "transmap_analysis", biotype)
        mkdir_p(plot_dir)
        cov_ident_wrapper(highest_cov_dict, args.genomes, plot_dir, biotype, args.gencode, tx_ids)
        cat_plot_wrapper(cur, highest_cov_dict, args.genomes, plot_dir, biotype, args.gencode, tx_ids)
        paralogy_plot(cur, args.genomes, plot_dir, biotype, tx_ids, args.gencode)
        num_pass_excel(fail_pass_excel, cur, args.refGenome, plot_dir, biotype, args.gencode, tx_ids)
        num_pass_excel_gene_level(fail_pass_excel, cur, args.refGenome, plot_dir, biotype, args.gencode,
                                  tx_to_gene)
def align_wrapper(target, recs, file_tree, ref_tx_fasta, target_genome_fasta, comp_ann_path, ref_genome, mode):
    """
    Aligns a group of genePred records and writes the results to a temp file taken from file_tree.

    When mode is "cgp", a gene -> transcript map for protein_coding genes is loaded from the reference
    database, and a record is only aligned (with align_cgp) if at least one of its comma-separated name2
    entries is present in that map. For any other mode every record is aligned with align_consensus.
    Each result is written as one comma-joined line.
    """
    tmp_dir = target.getGlobalTempDir()
    is_cgp = mode == "cgp"
    if is_cgp:
        con, cur = attach_databases(comp_ann_path, mode="reference")
        gene_transcript_map = get_gene_transcript_map(cur, ref_genome, biotype="protein_coding")
    alignments = []
    for line in recs:
        gp = GenePredTranscript(line.rstrip().split("\t"))
        gene_names = gp.name2.split(",")
        if not is_cgp:
            alignments.append(align_consensus(tmp_dir, gp, target_genome_fasta, ref_tx_fasta))
            continue
        # restrict to the genes that actually have reference transcripts to align against
        tx_dict = {}
        for name in gene_names:
            if name in gene_transcript_map:
                tx_dict[name] = gene_transcript_map[name]
        if tx_dict:
            alignments.extend(align_cgp(tmp_dir, gp, target_genome_fasta, tx_dict, ref_tx_fasta))
    with open(file_tree.getTempFile(), "w") as outf:
        for fields in alignments:
            outf.write(",".join(fields) + "\n")
def main():
    """
    Driver for the consensus plots.

    For every gene-level biotype, loads the pickled evaluations from args.workDir. Biotypes with no
    transcript evaluations are skipped. Biotypes listed in args.biotypes get their own transcript/gene,
    size, gene-failure and duplication-rate plots. Every non-empty biotype is also folded into
    per-genome counters keyed by its bin (falling back to "Other"), which drive the final stacked plots.
    """
    args = parse_args()
    mkdir_p(args.outDir)
    biotype_tx_counter = DefaultOrderedDict(lambda: defaultdict(int))
    biotype_gene_counter = DefaultOrderedDict(lambda: defaultdict(int))
    # Look the biotype -> bin mapping up by attribute name. This used to be an eval() of
    # "etc.config.<gencode>", which would execute arbitrary code placed in the --gencode argument.
    gencode_biotype_bin_dict = getattr(etc.config, args.gencode)
    con, cur = sql_lib.attach_databases(args.compAnnPath, mode="reference")
    biotypes = sql_lib.get_all_biotypes(cur, args.refGenome, gene_level=True)
    for biotype in biotypes:
        tx_evals, gene_evals, gene_fail_evals, tx_dup_rate, tx_counts, gene_counts = load_evaluations(
            args.workDir, args.genomes, biotype)
        if len(tx_evals) == 0:  # a biotype may have nothing
            continue
        if biotype in args.biotypes:
            per_level = ((tx_evals, tx_counts, "transcripts"), (gene_evals, gene_counts, "genes"))
            for evals, counts, mode in per_level:
                transcript_gene_plot(evals, args.outDir, args.gencode, mode, biotype)
                size_plot(counts, args.outDir, args.gencode, mode, biotype)
            gene_fail_plot(gene_fail_evals, args.outDir, args.gencode, biotype)
            dup_rate_plot(tx_dup_rate, args.outDir, args.gencode, biotype)
        biotype_bin = gencode_biotype_bin_dict.get(biotype, "Other")
        tx_evals_collapsed = collapse_evals(tx_evals)
        gene_evals_collapsed = collapse_evals(gene_evals)
        for genome, tx_count in tx_evals_collapsed.iteritems():
            biotype_tx_counter[genome][biotype_bin] += tx_count
            biotype_gene_counter[genome][biotype_bin] += gene_evals_collapsed[genome]
    for mode, counter in (("transcript", biotype_tx_counter), ("gene", biotype_gene_counter)):
        biotype_stacked_plot(counter, args.outDir, args.gencode, mode)
def main():
    """
    Driver that merges CGP transcripts into the consensus gene set for one genome.

    Loads the protein_coding gene/transcript maps from the reference databases, the consensus and CGP
    genePreds, the alignment statistics from the CGP sqlite database and the intron bits. It then runs
    find_new_transcripts, find_missing_transcripts and update_transcripts to fill final_consensus and
    metrics. The metrics are pickled to <metricsOutDir>/<genome>.metrics.pickle and the final
    transcripts are written to args.outGp ordered by (chromosome, start).
    """
    args = parse_args()
    genome = args.genome

    # attach regular comparativeAnnotator reference databases in order to build gene-transcript map
    con, cur = sql_lib.attach_databases(args.compAnnPath, mode="reference")
    gene_transcript_map = sql_lib.get_gene_transcript_map(cur, args.refGenome, biotype="protein_coding")
    transcript_gene_map = sql_lib.get_transcript_gene_map(cur, args.refGenome, biotype="protein_coding")

    # open CGP database -- we don't need comparativeAnnotator databases anymore
    con, cur = sql_lib.open_database(os.path.join(args.compAnnPath, args.cgpDb))

    # load both consensus and CGP into dictionaries
    consensus_dict = seq_lib.get_transcript_dict(args.consensusGp)
    cgp_dict = seq_lib.get_transcript_dict(args.cgpGp)

    # load the BLAT results from the sqlite database
    cgp_stats_query = "SELECT CgpId,EnsId,AlignmentCoverage,AlignmentIdentity FROM '{}_cgp'".format(genome)
    cgp_stats_dict = sql_lib.get_multi_index_query_dict(cur, cgp_stats_query, num_indices=2)
    consensus_stats_query = "SELECT EnsId,AlignmentCoverage,AlignmentIdentity FROM '{}_consensus'".format(genome)
    consensus_stats_dict = sql_lib.get_query_dict(cur, consensus_stats_query)

    # load the intron bits
    intron_dict = load_intron_bits(args.intronBitsPath)

    # final dictionaries
    final_consensus = {}
    metrics = {}

    # save all CGP transcripts which have no associated genes
    find_new_transcripts(cgp_dict, final_consensus, metrics)

    # save all CGP transcripts whose associated genes are not in the consensus
    consensus_genes = set(tx.name2 for tx in consensus_dict.itervalues())
    find_missing_transcripts(cgp_dict, consensus_genes, intron_dict, final_consensus, metrics, consensus_dict,
                             gene_transcript_map)

    # remove all such transcripts from the cgp dict before we evaluate for updating
    remaining_cgp = {}
    for cgp_id, cgp_tx in cgp_dict.iteritems():
        if cgp_id not in final_consensus:
            remaining_cgp[cgp_id] = cgp_tx
    cgp_dict = remaining_cgp
    update_transcripts(cgp_dict, consensus_dict, genome, gene_transcript_map, transcript_gene_map, intron_dict,
                       final_consensus, metrics, cgp_stats_dict, consensus_stats_dict)
    evaluate_cgp_consensus(final_consensus, metrics)

    # write results out to disk
    metrics_path = os.path.join(args.metricsOutDir, genome + ".metrics.pickle")
    with open(metrics_path, "w") as outf:
        pickle.dump(metrics, outf)
    ordered_txs = sorted(final_consensus.itervalues(), key=lambda tx: [tx.chromosome, tx.start])
    with open(args.outGp, "w") as outf:
        for tx in ordered_txs:
            outf.write("\t".join(map(str, tx.get_gene_pred())) + "\n")
def build_classifier_tracks(target, query_fn, genome, args):
    """
    Runs the query produced by query_fn(genome) and dumps every non-None field of every returned row
    into a BED file, then converts that file to bigBed.

    The output paths come from get_bed_paths and are keyed on the name of query_fn and the genome.
    """
    query = query_fn(genome)
    con, cur = sql_lib.attach_databases(args.outDir, mode=args.mode)
    rows = sql_lib.execute_query(cur, query)
    bed_path, big_bed_path = get_bed_paths(args.outDir, query_fn.__name__, genome)
    with open(bed_path, "w") as outf:
        outf.writelines(field for row in rows for field in row if field is not None)
    make_big_bed(bed_path, args.sizes, big_bed_path)
def main_ref_fn(target, comp_ann_path, gencode, ref_genome, base_out_path, filter_chroms):
    """
    Builds the classifier barplot and schedules the clustering plot for the reference genome.

    Output goes to <base_out_path>/clustering/<ref_genome>. Nothing is plotted unless more than 50
    transcript ids are found for the biotype. The clustering itself is delegated to r_wrapper as a
    child target fed with a CSV written to the global temp dir.
    """
    out_path = os.path.join(base_out_path, "clustering", ref_genome)
    mkdir_p(out_path)
    con, cur = sql_lib.attach_databases(comp_ann_path, mode="reference")
    # NOTE(review): `biotype` is neither a parameter nor a local; presumably a module-level name -- confirm
    biotype_ids = sql_lib.get_biotype_ids(cur, ref_genome, biotype, filter_chroms=filter_chroms)
    if len(biotype_ids) <= 50:
        return
    sql_data = sql_lib.load_data(con, ref_genome, etc.config.ref_classifiers, primary_key="TranscriptId")
    barplot_file = os.path.join(out_path, "reference_barplot_{}".format(gencode))
    title_template = "Classifiers failed by {} transcripts in the reference set {}\n"
    barplot_title = title_template.format(biotype.replace("_", " "), gencode)
    munged, stats = munge_data(sql_data, biotype_ids)
    plot_lib.barplot(stats, out_path, barplot_file, barplot_title)
    csv_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
    munged.to_csv(csv_path)
    cluster_file = os.path.join(out_path, "reference_clustering_{}".format(gencode))
    cluster_title = "Hierarchical_clustering_of_transcript_classifiers"
    target.addChildTargetFn(r_wrapper, args=[csv_path, cluster_title, cluster_file])
def build_pass_track(target, args):
    """
    Writes a colored BED/bigBed track of the transcripts for the current mode (args.mode).

    Every transcript in the mode's genePred is written. Ids in the mode's id set are colored by whether
    their biotype is protein_coding or not; all others get the "not_pass" color. In reference mode the
    id set holds the transcripts that are NOT returned by the refEval query.

    Raises RuntimeError if args.mode is not one of augustus, reference or transMap.
    """
    colors = {
        "coding": "59,101,69",
        "noncoding": "98,124,191",
        "not_pass": "******"
    }
    con, cur = sql_lib.attach_databases(args.outDir, args.mode)
    biotype_map = sql_lib.get_transcript_biotype_map(cur, args.refGenome)
    mode = args.mode
    if mode == "augustus":
        pass_ids = sql_lib.get_query_ids(cur, etc.config.augustusEval(args.genome, args.refGenome))
        bed_path, big_bed_path = get_bed_paths(args.outDir, "augustus", args.genome)
        gp_dict = seq_lib.get_transcript_dict(args.augustusGp)
    elif mode == "reference":
        # for reference, we are more interested in what is NOT Good
        passing = sql_lib.get_query_ids(cur, etc.config.refEval(args.refGenome))
        pass_ids = biotype_map.viewkeys() - passing  # actually not pass
        bed_path, big_bed_path = get_bed_paths(args.outDir, "reference", args.refGenome)
        gp_dict = seq_lib.get_transcript_dict(args.annotationGp)
    elif mode == "transMap":
        pass_ids = get_all_tm_pass(cur, args.refGenome, args.genome)
        bed_path, big_bed_path = get_bed_paths(args.outDir, "transMap", args.genome)
        gp_dict = seq_lib.get_transcript_dict(args.targetGp)
    else:
        raise RuntimeError("Somehow your argparse object does not contain a valid mode.")
    with open(bed_path, "w") as outf:
        for aln_id, rec in gp_dict.iteritems():
            tx_id = psl_lib.strip_alignment_numbers(aln_id)
            # the biotype is only looked up for ids in the id set
            if aln_id not in pass_ids:
                rgb = colors["not_pass"]
            elif biotype_map[tx_id] == "protein_coding":
                rgb = colors["coding"]
            else:
                rgb = colors["noncoding"]
            bed = rec.get_bed(rgb=rgb)
            outf.write("\t".join(map(str, bed)) + "\n")
    make_big_bed(bed_path, args.sizes, big_bed_path)
def main():
    """
    Driver that builds the consensus gene set for one genome, one biotype at a time.

    For each gene-level biotype the consensus is built and deduplicated. If anything is left, the
    genePreds are written and the evaluation dictionary (with the duplication rate and the gene and
    transcript counts added) is pickled to <workDir>/<genome>_<biotype>.
    """
    args = parse_args()
    con, cur = sql_lib.attach_databases(args.compAnnPath, mode=args.mode)
    biotypes = sql_lib.get_all_biotypes(cur, args.refGenome, gene_level=True)
    transcript_gene_map = sql_lib.get_transcript_gene_map(cur, args.refGenome, biotype=None,
                                                          filter_chroms=args.filterChroms)
    # load all Augustus and transMap transcripts into one big dict
    gps = load_gps(args.gps)
    consensus_base_path = os.path.join(args.outDir, args.genome)
    stats = get_stats(cur, args.genome, args.mode)
    ref_gene_intervals = build_ref_intervals(cur, args.genome)
    tgt_intervals = build_tgt_intervals(gps)
    for biotype in biotypes:
        gene_transcript_map = sql_lib.get_gene_transcript_map(cur, args.refGenome, biotype=biotype,
                                                              filter_chroms=args.filterChroms)
        binned_transcripts, consensus = consensus_by_biotype(cur, args.refGenome, args.genome, biotype, gps,
                                                             transcript_gene_map, gene_transcript_map, stats,
                                                             args.mode, ref_gene_intervals, tgt_intervals)
        deduplicated_consensus, dup_count = deduplicate_consensus(consensus, gps, stats)
        # some biotypes we may have nothing
        if len(deduplicated_consensus) == 0:
            continue
        num_genes, num_txs = write_gps(deduplicated_consensus, gps, consensus_base_path, biotype,
                                       transcript_gene_map, args.mode)
        if biotype != "protein_coding":
            evals = evaluate_noncoding_consensus(binned_transcripts, stats, gps)
        else:
            evals = evaluate_coding_consensus(binned_transcripts, stats, ref_gene_intervals, gps, args.mode)
        pickle_path = os.path.join(args.workDir, "_".join([args.genome, biotype]))
        mkdir_p(os.path.dirname(pickle_path))
        evals["duplication_rate"] = dup_count
        evals["gene_counts"] = num_genes
        evals["tx_counts"] = num_txs
        with open(pickle_path, "w") as outf:
            pickle.dump(evals, outf)
def main_fn(target, comp_ann_path, gencode, genome, ref_genome, base_out_path, filter_chroms):
    """
    Builds the transMap classifier barplots and schedules the clustering plots for one genome.

    Output goes to <base_out_path>/classifier_breakdown/<genome>. Nothing is plotted unless more than
    50 transcript ids are found for the biotype. One barplot and one clustering child target (r_wrapper)
    are produced for each of the "Fail" and "Pass/NotExcellent" id sets.
    """
    clust_title = "Hierarchical_clustering_of_transMap_classifiers"
    base_barplot_title = ("Classifiers failed by {} transcripts in the category {} in transMap analysis\n"
                          "Genome: {}. Gencode set: {}. {:,} ({:0.2f}%) of transcripts")
    out_path = os.path.join(base_out_path, "classifier_breakdown", genome)
    mkdir_p(out_path)
    con, cur = sql_lib.attach_databases(comp_ann_path, mode="transMap")
    # NOTE(review): `biotype` is neither a parameter nor a local; presumably a module-level name -- confirm
    id_sets = sql_lib.get_fail_passing_excel_ids(cur, ref_genome, genome, biotype)
    fail_ids, passing_specific_ids, _excellent_ids = id_sets
    biotype_ids = sql_lib.get_biotype_ids(cur, ref_genome, biotype, filter_chroms=filter_chroms)
    if len(biotype_ids) <= 50:
        return
    sql_data = sql_lib.load_data(con, genome, etc.config.clustering_classifiers)
    num_original_introns = sql_lib.load_data(con, genome, ["NumberIntrons"], table="attributes")
    for mode, ids in (("Fail", fail_ids), ("Pass/NotExcellent", passing_specific_ids)):
        # "/" cannot appear in a file name
        file_suffix = "{}_{}_{}".format(genome, biotype, mode.replace("/", "_"))
        barplot_file = os.path.join(out_path, "barplot_" + file_suffix)
        num_ids = len(ids)
        percentage_of_set = 100.0 * num_ids / len(biotype_ids)
        barplot_title = base_barplot_title.format(biotype.replace("_", " "), mode, genome, gencode, num_ids,
                                                  percentage_of_set)
        munged, stats = munge_intron_data(sql_data, num_original_introns, ids)
        plot_lib.barplot(stats, out_path, barplot_file, barplot_title)
        csv_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
        munged.to_csv(csv_path)
        cluster_file = os.path.join(out_path, "clustering_" + file_suffix)
        target.addChildTargetFn(r_wrapper, args=[csv_path, clust_title, cluster_file])
def main_augustus_fn(target, comp_ann_path, gencode, genome, base_out_path, filter_chroms):
    """
    Builds the Augustus classifier barplots and schedules the clustering plots for one genome.

    Output goes to <base_out_path>/augustus_classifier_breakdown/<genome>. Only Augustus alignments
    whose id, with the Augustus alignment number removed, is one of the highest-coverage ids are
    considered. One barplot and one clustering child target (r_wrapper) are produced per Augustus mode
    ("I1" and "I2"), selected by that tag appearing in the alignment id.

    :param target: jobTree target; provides the global temp dir and schedules r_wrapper.
    :param comp_ann_path: path handed to sql_lib.attach_databases.
    :param gencode: gencode set name, used in titles and file names.
    :param genome: genome whose Augustus table is loaded.
    :param base_out_path: root output directory.
    :param filter_chroms: not used by this function.
    """
    clust_title = "Hierarchical_clustering_of_augustus_classifiers"
    base_barplot_title = ("Augustus classifiers failed by {:,} transcripts derived from transMap\n"
                          "on the reference set {} with Augustus {}")
    out_path = os.path.join(base_out_path, "augustus_classifier_breakdown", genome)
    mkdir_p(out_path)
    con, cur = sql_lib.attach_databases(comp_ann_path, mode="augustus")
    highest_cov_dict = sql_lib.highest_cov_aln(cur, genome)
    # Collect the first element of every value directly. The previous zip(*values)[0] form raised
    # IndexError when the dict was empty and built a full transpose just to read one column.
    highest_cov_ids = {vals[0] for vals in highest_cov_dict.itervalues()}
    sql_data = sql_lib.load_data(con, genome, etc.config.aug_classifiers, primary_key="AugustusAlignmentId",
                                 table="augustus")
    base_filter_set = {x for x in sql_data.index
                       if psl_lib.remove_augustus_alignment_number(x) in highest_cov_ids}
    for mode in ["1", "2"]:
        i = "I{}".format(mode)
        aug_mode = "trusting RNAseq more" if mode == "2" else "trusting RNAseq less"
        filter_set = {x for x in base_filter_set if i in x}
        out_barplot_file = os.path.join(out_path, "augustus_barplot_{}_{}_{}".format(genome, gencode, i))
        barplot_title = base_barplot_title.format(len(filter_set), gencode, aug_mode)
        munged, stats = munge_data(sql_data, filter_set)
        plot_lib.barplot(stats, out_path, out_barplot_file, barplot_title)
        data_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
        munged.to_csv(data_path)
        out_cluster_file = os.path.join(out_path, "augustus_clustering_{}_{}_{}".format(genome, gencode, i))
        target.addChildTargetFn(r_wrapper, args=[data_path, clust_title, out_cluster_file])
# Scratch setup: hard-coded input paths for the gorilla (susie_3_2) comparison against human.
ref_gp = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/transMap/2015-10-06/data/wgEncodeGencodeBasicV23.gp"
aug_gp = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/augustus/tmr/gorilla.output.gp"
aln_psl = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/transMap/2015-10-06/transMap/gorilla/transMapGencodeBasicV23.psl"
ref_psl = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/transMap/2015-10-06/data/wgEncodeGencodeBasicV23.psl"
ref_fasta = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/assemblies/susie_3_2/human.fa"
target_fasta = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/assemblies/susie_3_2/gorilla.fa"

# Load the transcripts, alignments and sequences.
# NOTE(review): `gp` is not assigned in the visible part of this script -- presumably defined earlier; confirm
tx_dict = seq_lib.get_transcript_dict(gp)
ref_dict = seq_lib.get_transcript_dict(ref_gp)
aug_dict = seq_lib.get_transcript_dict(aug_gp)
aln_dict = psl_lib.get_alignment_dict(aln_psl)
ref_aln_dict = psl_lib.get_alignment_dict(ref_psl)
seq_dict = seq_lib.get_sequence_dict(target_fasta)
ref_seq_dict = seq_lib.get_sequence_dict(ref_fasta)

con, cur = sql_lib.attach_databases(
    "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/comparativeAnnotation/2015-10-12/GencodeBasicV23",
    mode="augustus")
genome = "gorilla"
ref_genome = "human"
biotype = "protein_coding"
filter_chroms = ["Y", "chrY"]

# Restrict the merged stats to the highest-coverage alignments of the chosen biotype.
stats = merge_stats(cur, genome)
highest_cov_dict = sql_lib.highest_cov_aln(cur, genome)
highest_cov_ids = set(zip(*highest_cov_dict.itervalues())[0])
biotype_ids = sql_lib.get_biotype_aln_ids(cur, genome, biotype)
highest_cov_ids = highest_cov_ids & biotype_ids
best_stats = {aln_id: vals for aln_id, vals in stats.iteritems()
              if psl_lib.remove_augustus_alignment_number(aln_id) in highest_cov_ids}
# ids that are themselves highest-coverage ids
best_tm = {aln_id: vals for aln_id, vals in best_stats.iteritems() if aln_id in highest_cov_ids}
# ids that only match a highest-coverage id once the Augustus alignment number is removed
best_aug = {aln_id: vals for aln_id, vals in best_stats.iteritems()
            if psl_lib.remove_augustus_alignment_number(aln_id) in highest_cov_ids
            and aln_id not in highest_cov_ids}
r = {
    "higher_cov": [],
    "higher_ident": [],
    "higher_both": [],
    "worse": [],
}