Пример #1
0
def main():
    """
    Combine CGP transcripts with the consensus gene set and write the result.

    Steps, in order:
      1. Build gene<->transcript maps for protein_coding transcripts from the
         reference databases under args.compAnnPath.
      2. Open the CGP database (args.cgpDb inside args.compAnnPath) and load
         the per-genome '<genome>_cgp' and '<genome>_consensus' stats tables.
      3. Load the consensus and CGP genePred files and the intron bits.
      4. Fill final_consensus / metrics via find_new_transcripts,
         find_missing_transcripts, update_transcripts and
         evaluate_cgp_consensus.
      5. Pickle metrics to args.metricsOutDir and write the final transcripts,
         sorted by (chromosome, start), to args.outGp.
    """
    args = parse_args()
    # attach regular comparativeAnnotator reference databases in order to build gene-transcript map
    con, cur = sql_lib.attach_databases(args.compAnnPath, mode="reference")
    gene_transcript_map = sql_lib.get_gene_transcript_map(
        cur, args.refGenome, biotype="protein_coding")
    transcript_gene_map = sql_lib.get_transcript_gene_map(
        cur, args.refGenome, biotype="protein_coding")
    # open CGP database -- we don't need comparativeAnnotator databases anymore
    # NOTE(review): the first connection is rebound here without being closed;
    # confirm whether sql_lib expects callers to close it explicitly.
    cgp_db = os.path.join(args.compAnnPath, args.cgpDb)
    con, cur = sql_lib.open_database(cgp_db)
    # load both consensus and CGP into dictionaries
    consensus_dict = seq_lib.get_transcript_dict(args.consensusGp)
    cgp_dict = seq_lib.get_transcript_dict(args.cgpGp)
    # load the BLAT results from the sqlite database
    # (table names cannot be bound as SQL parameters, hence the string format)
    cgp_stats_query = "SELECT CgpId,EnsId,AlignmentCoverage,AlignmentIdentity FROM '{}_cgp'".format(
        args.genome)
    cgp_stats_dict = sql_lib.get_multi_index_query_dict(cur,
                                                        cgp_stats_query,
                                                        num_indices=2)
    consensus_stats_query = (
        "SELECT EnsId,AlignmentCoverage,AlignmentIdentity FROM "
        "'{}_consensus'".format(args.genome))
    consensus_stats_dict = sql_lib.get_query_dict(cur, consensus_stats_query)
    # load the intron bits
    intron_dict = load_intron_bits(args.intronBitsPath)
    # final dictionaries; both are filled in place by the helpers below
    final_consensus = {}
    metrics = {}
    # save all CGP transcripts which have no associated genes
    find_new_transcripts(cgp_dict, final_consensus, metrics)
    # save all CGP transcripts whose associated genes are not in the consensus
    consensus_genes = {x.name2 for x in consensus_dict.itervalues()}
    find_missing_transcripts(cgp_dict, consensus_genes, intron_dict,
                             final_consensus, metrics, consensus_dict,
                             gene_transcript_map)
    # remove all such transcripts from the cgp dict before we evaluate for updating
    cgp_dict = {
        x: y
        for x, y in cgp_dict.iteritems() if x not in final_consensus
    }
    update_transcripts(cgp_dict, consensus_dict, args.genome,
                       gene_transcript_map, transcript_gene_map, intron_dict,
                       final_consensus, metrics, cgp_stats_dict,
                       consensus_stats_dict)
    evaluate_cgp_consensus(final_consensus, metrics)
    # write results out to disk
    metrics_path = os.path.join(args.metricsOutDir,
                                args.genome + ".metrics.pickle")
    # pickle streams are bytes: always write them through a binary handle so
    # the output is not subject to text-mode newline/encoding translation
    with open(metrics_path, "wb") as outf:
        pickle.dump(metrics, outf)
    with open(args.outGp, "w") as outf:
        # only the transcript objects are needed; the sort is stable, so ties
        # keep the dictionary's iteration order exactly as before
        for tx in sorted(final_consensus.itervalues(),
                         key=lambda t: (t.chromosome, t.start)):
            outf.write("\t".join(map(str, tx.get_gene_pred())) + "\n")
Пример #2
0
def build_pass_track(target, args):
    """
    Write a colour-coded BED track (and its bigBed) for the current mode.

    Every record of the mode's genePred file is written to the BED file.
    Records whose alignment id is in the mode's id set are coloured by
    biotype (protein_coding vs. everything else); all other records get the
    "not_pass" colour. In "reference" mode the id set is the complement of
    the query result, i.e. the transcripts that did NOT pass.

    target is not used by this function body.
    Raises RuntimeError when args.mode is not augustus/reference/transMap.
    """
    # NOTE(review): "******" is not an R,G,B triple like the other entries --
    # it looks like a masked value; confirm the intended colour.
    colors = {
        "coding": "59,101,69",
        "noncoding": "98,124,191",
        "not_pass": "******"
    }
    con, cur = sql_lib.attach_databases(args.outDir, args.mode)
    biotype_map = sql_lib.get_transcript_biotype_map(cur, args.refGenome)
    mode = args.mode
    if mode == "augustus":
        selected_ids = sql_lib.get_query_ids(
            cur, etc.config.augustusEval(args.genome, args.refGenome))
        bed_path, big_bed_path = get_bed_paths(args.outDir, "augustus",
                                               args.genome)
        gp_dict = seq_lib.get_transcript_dict(args.augustusGp)
    elif mode == "reference":
        # for reference, we are more interested in what is NOT Good, so the
        # selected set is every known transcript minus the query hits
        ref_query = etc.config.refEval(args.refGenome)
        selected_ids = biotype_map.viewkeys() - sql_lib.get_query_ids(
            cur, ref_query)
        bed_path, big_bed_path = get_bed_paths(args.outDir, "reference",
                                               args.refGenome)
        gp_dict = seq_lib.get_transcript_dict(args.annotationGp)
    elif mode == "transMap":
        selected_ids = get_all_tm_pass(cur, args.refGenome, args.genome)
        bed_path, big_bed_path = get_bed_paths(args.outDir, "transMap",
                                               args.genome)
        gp_dict = seq_lib.get_transcript_dict(args.targetGp)
    else:
        raise RuntimeError(
            "Somehow your argparse object does not contain a valid mode.")
    with open(bed_path, "w") as bed_handle:
        for aln_id, rec in gp_dict.iteritems():
            tx_id = psl_lib.strip_alignment_numbers(aln_id)
            # choose the colour first, then emit the record exactly once;
            # the biotype lookup only happens for selected ids
            if aln_id not in selected_ids:
                rgb = colors["not_pass"]
            elif biotype_map[tx_id] == "protein_coding":
                rgb = colors["coding"]
            else:
                rgb = colors["noncoding"]
            bed = rec.get_bed(rgb=rgb)
            bed_handle.write("\t".join(map(str, bed)) + "\n")
    make_big_bed(bed_path, args.sizes, big_bed_path)
Пример #3
0
def main():
    """
    Write one intron vector per alignment to args.outPath.

    Loads the target alignments (args.psl), the reference alignments
    (args.refPsl) and the transcripts (args.gp), then for every alignment id
    in sorted order writes a line "<aln_id>\t<comma-joined vector>".
    """
    args = parse_args()
    aln_dict = psl_lib.get_alignment_dict(args.psl)
    ref_aln_dict = psl_lib.get_alignment_dict(args.refPsl)
    tx_dict = seq_lib.get_transcript_dict(args.gp)
    with open(args.outPath, "w") as out_handle:
        for aln_id in sorted(aln_dict.iterkeys()):
            # the reference alignment is keyed by the id without its
            # alignment number
            ref_id = psl_lib.remove_alignment_number(aln_id)
            ref_aln = ref_aln_dict[ref_id]
            transcript = tx_dict[aln_id]
            intron_vec = build_intron_vector(aln_dict[aln_id], ref_aln,
                                             transcript, args.fuzz_distance)
            line = "{}\t{}\n".format(aln_id, ",".join(intron_vec))
            out_handle.write(line)
Пример #4
0
def ref_attr_table(ref_genome, db_path, attr_file, ref_gp):
    """
    Store the reference attributes table, plus a refChrom column, in sqlite.

    The tab-separated attr_file is read with its fourth column as the index.
    Each transcript in ref_gp contributes its chromosome as "refChrom"; the
    two frames are joined on their indexes and the result replaces the table
    named ref_genome in the database at db_path, with the index written as
    the "TranscriptId" column.
    """
    attrs = pd.read_table(attr_file, sep="\t", index_col=3, header=0)
    transcripts = seq_lib.get_transcript_dict(ref_gp)
    # transcript id -> chromosome, wrapped so it becomes a one-column frame
    chrom_by_tx = {
        tx_id: tx.chromosome
        for tx_id, tx in transcripts.iteritems()
    }
    chrom_frame = pd.DataFrame.from_dict({"refChrom": chrom_by_tx})
    merged = pd.merge(attrs, chrom_frame, left_index=True, right_index=True)
    with sql_lib.ExclusiveSqlConnection(db_path) as con:
        merged.to_sql(ref_genome,
                      con,
                      if_exists="replace",
                      index_label="TranscriptId")
Пример #5
0
import os
import lib.sql_lib as sql_lib
import lib.seq_lib as seq_lib
import lib.psl_lib as psl_lib
import lib.comp_ann_lib as comp_ann_lib
import itertools

# Scratch setup: hard-coded input paths for one specific pipeline run.
# genePred files: target (gp), reference (ref_gp) and Augustus output (aug_gp)
gp = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/transMap/2015-10-06/transMap/gorilla/transMapGencodeBasicV23.gp"
ref_gp = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/transMap/2015-10-06/data/wgEncodeGencodeBasicV23.gp"
aug_gp = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/augustus/tmr/gorilla.output.gp"
# PSL alignment files for the target and the reference
aln_psl = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/transMap/2015-10-06/transMap/gorilla/transMapGencodeBasicV23.psl"
ref_psl =  "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/transMap/2015-10-06/data/wgEncodeGencodeBasicV23.psl"
# genome FASTA files for the reference and the target
ref_fasta = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/assemblies/susie_3_2/human.fa"
target_fasta = "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/assemblies/susie_3_2/gorilla.fa"

# Load every input into memory at import time (module-level side effects).
tx_dict = seq_lib.get_transcript_dict(gp)
ref_dict = seq_lib.get_transcript_dict(ref_gp)
aug_dict = seq_lib.get_transcript_dict(aug_gp)
aln_dict = psl_lib.get_alignment_dict(aln_psl)
ref_aln_dict = psl_lib.get_alignment_dict(ref_psl)
seq_dict = seq_lib.get_sequence_dict(target_fasta)
ref_seq_dict = seq_lib.get_sequence_dict(ref_fasta)

# Attach the comparativeAnnotation databases in "augustus" mode.
con, cur = sql_lib.attach_databases("/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/comparativeAnnotation/2015-10-12/GencodeBasicV23", mode="augustus")

# Parameters for the analysis; only `cur` is passed on in the visible code.
genome = 'gorilla'
ref_genome = 'human'
biotype = 'protein_coding'
filter_chroms = ["Y", "chrY"]

# NOTE(review): merge_stats is neither defined nor imported in the visible
# code, and the genome is repeated as a literal instead of using `genome`.
stats = merge_stats(cur, 'gorilla')
Пример #6
0
 def get_annotation_dict(self):
     """
     Load the transcripts in self.annotation_gp into self.annotation_dict.

     Despite the "get" name, nothing is returned; the result is only stored
     on the instance.
     """
     loaded = seq_lib.get_transcript_dict(self.annotation_gp)
     self.annotation_dict = loaded
Пример #7
0
 def get_augustus_transcript_dict(self):
     """
     Load the transcripts in self.augustus_gp into
     self.augustus_transcript_dict.

     Despite the "get" name, nothing is returned; the result is only stored
     on the instance.
     """
     loaded = seq_lib.get_transcript_dict(self.augustus_gp)
     self.augustus_transcript_dict = loaded
Пример #8
0
 def get_transcript_dict(self):
     """
     Load the transcripts in self.tgt_gp into self.transcript_dict.

     Despite the "get" name, nothing is returned; the result is only stored
     on the instance.
     """
     loaded = seq_lib.get_transcript_dict(self.tgt_gp)
     self.transcript_dict = loaded