def main():
    """
    Combine the consensus gene set with comparative Augustus (CGP) predictions
    and write the result to disk.

    Reads everything it needs from the parsed command-line arguments:
    compAnnPath, cgpDb, refGenome, genome, consensusGp, cgpGp, intronBitsPath,
    metricsOutDir and outGp.

    Outputs:
      * <metricsOutDir>/<genome>.metrics.pickle -- the pickled metrics dict
      * outGp -- one tab-separated genePred record per final transcript,
        sorted by (chromosome, start)
    """
    args = parse_args()

    # attach regular comparativeAnnotator reference databases in order to
    # build gene-transcript map
    con, cur = sql_lib.attach_databases(args.compAnnPath, mode="reference")
    gene_transcript_map = sql_lib.get_gene_transcript_map(
        cur, args.refGenome, biotype="protein_coding")
    transcript_gene_map = sql_lib.get_transcript_gene_map(
        cur, args.refGenome, biotype="protein_coding")

    # open CGP database -- we don't need comparativeAnnotator databases anymore
    # NOTE(review): the previous connection is rebound here without an explicit
    # close; confirm whether sql_lib expects callers to close it.
    cgp_db = os.path.join(args.compAnnPath, args.cgpDb)
    con, cur = sql_lib.open_database(cgp_db)

    # load both consensus and CGP into dictionaries
    consensus_dict = seq_lib.get_transcript_dict(args.consensusGp)
    cgp_dict = seq_lib.get_transcript_dict(args.cgpGp)

    # load the BLAT results from the sqlite database
    # (table names cannot be bound as SQL parameters, hence the string format)
    cgp_stats_query = "SELECT CgpId,EnsId,AlignmentCoverage,AlignmentIdentity FROM '{}_cgp'".format(
        args.genome)
    cgp_stats_dict = sql_lib.get_multi_index_query_dict(
        cur, cgp_stats_query, num_indices=2)
    consensus_stats_query = (
        "SELECT EnsId,AlignmentCoverage,AlignmentIdentity FROM "
        "'{}_consensus'".format(args.genome))
    consensus_stats_dict = sql_lib.get_query_dict(cur, consensus_stats_query)

    # load the intron bits
    intron_dict = load_intron_bits(args.intronBitsPath)

    # final dictionaries; both are filled in by the helper calls below
    final_consensus = {}
    metrics = {}

    # save all CGP transcripts which have no associated genes
    find_new_transcripts(cgp_dict, final_consensus, metrics)

    # save all CGP transcripts whose associated genes are not in the consensus
    consensus_genes = {x.name2 for x in consensus_dict.itervalues()}
    find_missing_transcripts(cgp_dict, consensus_genes, intron_dict,
                             final_consensus, metrics, consensus_dict,
                             gene_transcript_map)

    # remove all such transcripts from the cgp dict before we evaluate for
    # updating
    cgp_dict = {
        x: y for x, y in cgp_dict.iteritems() if x not in final_consensus
    }
    update_transcripts(cgp_dict, consensus_dict, args.genome,
                       gene_transcript_map, transcript_gene_map, intron_dict,
                       final_consensus, metrics, cgp_stats_dict,
                       consensus_stats_dict)
    evaluate_cgp_consensus(final_consensus, metrics)

    # write results out to disk
    # Pickle data is a byte stream, so the file is opened in binary mode; text
    # mode can corrupt it on platforms that translate newlines.
    metrics_path = os.path.join(args.metricsOutDir,
                                args.genome + ".metrics.pickle")
    with open(metrics_path, "wb") as outf:
        pickle.dump(metrics, outf)

    with open(args.outGp, "w") as outf:
        # sorted() is stable, so transcripts that tie on (chromosome, start)
        # keep the dictionary's iteration order, as before
        for tx in sorted(final_consensus.itervalues(),
                         key=lambda t: (t.chromosome, t.start)):
            outf.write("\t".join(map(str, tx.get_gene_pred())) + "\n")
def build_pass_track(target, args):
    """
    Write a colour-coded BED track for the current mode, then convert it to
    bigBed.

    Every transcript in the mode's genePred file is written out. Transcripts
    whose alignment id is in pass_ids are coloured by biotype (protein_coding
    vs. everything else); all other transcripts get the "not_pass" colour.

    target is not used in the body.
    args supplies mode, outDir, genome, refGenome, sizes and the genePred path
    for the active mode (augustusGp, annotationGp or targetGp).

    Raises RuntimeError when args.mode is not augustus, reference or transMap.
    """
    colors = {
        "coding": "59,101,69",
        "noncoding": "98,124,191",
        "not_pass": "******"
    }
    con, cur = sql_lib.attach_databases(args.outDir, args.mode)
    biotype_map = sql_lib.get_transcript_biotype_map(cur, args.refGenome)

    if args.mode == "augustus":
        query = etc.config.augustusEval(args.genome, args.refGenome)
        pass_ids = sql_lib.get_query_ids(cur, query)
        out_pass_bed_path, out_pass_big_bed_path = get_bed_paths(
            args.outDir, "augustus", args.genome)
        gp_dict = seq_lib.get_transcript_dict(args.augustusGp)
    elif args.mode == "reference":
        # for reference, we are more interested in what is NOT Good, so the
        # set kept here is every known transcript minus the query result
        query = etc.config.refEval(args.refGenome)
        known_ids = biotype_map.viewkeys()
        pass_ids = known_ids - sql_lib.get_query_ids(cur, query)
        out_pass_bed_path, out_pass_big_bed_path = get_bed_paths(
            args.outDir, "reference", args.refGenome)
        gp_dict = seq_lib.get_transcript_dict(args.annotationGp)
    elif args.mode == "transMap":
        pass_ids = get_all_tm_pass(cur, args.refGenome, args.genome)
        out_pass_bed_path, out_pass_big_bed_path = get_bed_paths(
            args.outDir, "transMap", args.genome)
        gp_dict = seq_lib.get_transcript_dict(args.targetGp)
    else:
        raise RuntimeError(
            "Somehow your argparse object does not contain a valid mode.")

    with open(out_pass_bed_path, "w") as outf:
        for aln_id, rec in gp_dict.iteritems():
            tx_id = psl_lib.strip_alignment_numbers(aln_id)
            # pick the colour first, then emit the record exactly once
            if aln_id not in pass_ids:
                rgb = colors["not_pass"]
            elif biotype_map[tx_id] == "protein_coding":
                rgb = colors["coding"]
            else:
                rgb = colors["noncoding"]
            bed = rec.get_bed(rgb=rgb)
            outf.write("\t".join(map(str, bed)) + "\n")

    # the BED file is closed at this point, so it is complete on disk
    make_big_bed(out_pass_bed_path, args.sizes, out_pass_big_bed_path)
def main():
    """
    Write one intron vector per alignment to args.outPath.

    Loads the target alignments (args.psl), the reference alignments
    (args.refPsl) and the transcripts (args.gp). For each alignment id, in
    sorted order, it writes a line of the form
    "<aln_id><TAB><comma-joined vector>".
    """
    args = parse_args()
    aln_dict = psl_lib.get_alignment_dict(args.psl)
    ref_aln_dict = psl_lib.get_alignment_dict(args.refPsl)
    tx_dict = seq_lib.get_transcript_dict(args.gp)

    with open(args.outPath, "w") as outf:
        # dictionary keys are unique, so sorting the keys gives the same order
        # as sorting the items by key
        for aln_id in sorted(aln_dict):
            aln = aln_dict[aln_id]
            # the reference alignment is keyed by the id without its
            # alignment number
            ref_id = psl_lib.remove_alignment_number(aln_id)
            ref_aln = ref_aln_dict[ref_id]
            transcript = tx_dict[aln_id]
            vec = build_intron_vector(aln, ref_aln, transcript,
                                      args.fuzz_distance)
            outf.write("{}\t{}\n".format(aln_id, ",".join(vec)))
def ref_attr_table(ref_genome, db_path, attr_file, ref_gp):
    """
    Store the reference attributes TSV as a sqlite table, with an added
    refChrom column.

    ref_genome: name of the table to create (an existing table is replaced)
    db_path:    database path handed to sql_lib.ExclusiveSqlConnection
    attr_file:  tab-separated attributes file with a header row; its fourth
                column (index 3) becomes the row index
    ref_gp:     genePred file from which each transcript's chromosome is taken

    The attribute rows are joined to the chromosome column on the index with
    pd.merge's default join type, and the index is written out under the
    label "TranscriptId".
    """
    attrs = pd.read_table(attr_file, sep="\t", index_col=3, header=0)

    # transcript id -> chromosome
    chrom_by_tx = {}
    for tx_id, tx in seq_lib.get_transcript_dict(ref_gp).iteritems():
        chrom_by_tx[tx_id] = tx.chromosome
    chrom_df = pd.DataFrame.from_dict({"refChrom": chrom_by_tx})

    merged = pd.merge(attrs, chrom_df, left_index=True, right_index=True)
    with sql_lib.ExclusiveSqlConnection(db_path) as con:
        merged.to_sql(ref_genome, con, if_exists="replace",
                      index_label="TranscriptId")
import os
import lib.sql_lib as sql_lib
import lib.seq_lib as seq_lib
import lib.psl_lib as psl_lib
import lib.comp_ann_lib as comp_ann_lib
import itertools

# Scratch setup: loads the gorilla/human data set into module-level names.

# --- run constants ---------------------------------------------------------
genome = "gorilla"
ref_genome = "human"
biotype = "protein_coding"
filter_chroms = ["Y", "chrY"]

# --- input files -----------------------------------------------------------
gp = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/transMap/2015-10-06/transMap/gorilla/transMapGencodeBasicV23.gp"
ref_gp = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/transMap/2015-10-06/data/wgEncodeGencodeBasicV23.gp"
aug_gp = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/augustus/tmr/gorilla.output.gp"
aln_psl = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/transMap/2015-10-06/transMap/gorilla/transMapGencodeBasicV23.psl"
ref_psl = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/transMap/2015-10-06/data/wgEncodeGencodeBasicV23.psl"
ref_fasta = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/assemblies/susie_3_2/human.fa"
target_fasta = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/assemblies/susie_3_2/gorilla.fa"

# --- transcripts (genePred) ------------------------------------------------
tx_dict = seq_lib.get_transcript_dict(gp)
ref_dict = seq_lib.get_transcript_dict(ref_gp)
aug_dict = seq_lib.get_transcript_dict(aug_gp)

# --- alignments (PSL) ------------------------------------------------------
aln_dict = psl_lib.get_alignment_dict(aln_psl)
ref_aln_dict = psl_lib.get_alignment_dict(ref_psl)

# --- sequences (FASTA) -----------------------------------------------------
seq_dict = seq_lib.get_sequence_dict(target_fasta)
ref_seq_dict = seq_lib.get_sequence_dict(ref_fasta)

# --- comparativeAnnotator databases, attached in augustus mode --------------
con, cur = sql_lib.attach_databases(
    "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/comparativeAnnotation/2015-10-12/GencodeBasicV23",
    mode="augustus")

# NOTE(review): merge_stats is not imported in the visible code -- presumably
# it is defined elsewhere in the session or file; confirm before running.
stats = merge_stats(cur, genome)
def get_annotation_dict(self):
    """
    Load the transcripts in self.annotation_gp and store them on
    self.annotation_dict. Returns nothing.
    """
    annotation_gp = self.annotation_gp
    self.annotation_dict = seq_lib.get_transcript_dict(annotation_gp)
def get_augustus_transcript_dict(self):
    """
    Load the transcripts in self.augustus_gp and store them on
    self.augustus_transcript_dict. Returns nothing.
    """
    augustus_gp = self.augustus_gp
    self.augustus_transcript_dict = seq_lib.get_transcript_dict(augustus_gp)
def get_transcript_dict(self):
    """
    Load the transcripts in self.tgt_gp and store them on
    self.transcript_dict. Returns nothing.
    """
    tgt_gp = self.tgt_gp
    self.transcript_dict = seq_lib.get_transcript_dict(tgt_gp)