def run(self, shortIntronSize=30):
     """
     Flag Augustus transcripts having exon intervals for which seq_lib.interval_not_intersect_intervals is true
     against the gap-merged exon intervals of the paired transcript.

     The paired transcript is looked up in self.transcriptDict under the Augustus ID after
     psl_lib.remove_augustus_alignment_number has been applied to it. Augustus transcripts with no paired entry,
     or whose strand or chromosome differs from the paired transcript, receive no classification at all.

     Flagged transcripts get value 1 and one BED record per offending exon interval (named
     "<self.column>/<aug_aId>"); all other evaluated transcripts get value 0.

     :param shortIntronSize: passed as gap to seq_lib.gap_merge_intervals when merging the paired transcript's
                             exon intervals.
     """
     self.getAugustusTranscriptDict()
     self.getTranscriptDict()
     classify_dict = {}
     details_dict = defaultdict(list)
     for aug_aId, aug_t in self.augustusTranscriptDict.iteritems():
         # Strip the Augustus alignment number once and reuse it for the membership test and the lookup.
         tm_aId = psl_lib.remove_augustus_alignment_number(aug_aId)
         if tm_aId not in self.transcriptDict:
             continue
         t = self.transcriptDict[tm_aId]
         if aug_t.strand != t.strand or aug_t.chromosome != t.chromosome:
             continue
         merged_exons = seq_lib.gap_merge_intervals(t.exonIntervals, gap=shortIntronSize)
         unmatched = [exon for exon in aug_t.exonIntervals
                      if seq_lib.interval_not_intersect_intervals(merged_exons, exon)]
         if unmatched:
             classify_dict[aug_aId] = 1
             bed_name = "/".join([self.column, aug_aId])
             details_dict[aug_aId].extend(exon.get_bed(self.rgb, bed_name) for exon in unmatched)
         else:
             classify_dict[aug_aId] = 0
     self.dumpValueDicts(classify_dict, details_dict)
 def run(self):
     """
     Flag Augustus transcripts whose thickStart or thickStop differs from that of the paired transcript.

     The paired transcript is looked up in self.transcriptDict under the Augustus ID after
     psl_lib.remove_augustus_alignment_number has been applied. No classification is written when there is no
     paired entry, when strand or chromosome differ, or when the paired transcript has thickStart == thickStop.

     Flagged transcripts get value 1. Their details are BED records from seq_lib.cds_coordinate_to_bed: a list
     of two records (CDS coordinates 0-3 and (s-3)-s) when the Augustus CDS length s exceeds 9, otherwise the
     single record for 0-s (not wrapped in a list). Unflagged transcripts get value 0.
     """
     self.getAugustusTranscriptDict()
     self.getTranscriptDict()
     classify_dict = {}
     details_dict = {}
     for aug_aId, aug_t in self.augustusTranscriptDict.iteritems():
         tm_aId = psl_lib.remove_augustus_alignment_number(aug_aId)
         if tm_aId not in self.transcriptDict:
             continue
         t = self.transcriptDict[tm_aId]
         if aug_t.strand != t.strand or aug_t.chromosome != t.chromosome or t.thickStart == t.thickStop:
             continue
         cds_moved = t.thickStart != aug_t.thickStart or t.thickStop != aug_t.thickStop
         if not cds_moved:
             classify_dict[aug_aId] = 0
             continue
         classify_dict[aug_aId] = 1
         cds_len = aug_t.getCdsLength()
         if cds_len > 9:
             first_codon = seq_lib.cds_coordinate_to_bed(aug_t, 0, 3, self.rgb, self.column)
             last_codon = seq_lib.cds_coordinate_to_bed(aug_t, cds_len - 3, cds_len, self.rgb, self.column)
             details_dict[aug_aId] = [first_codon, last_codon]
         else:
             # Short CDS: report it as one record covering the whole CDS.
             details_dict[aug_aId] = seq_lib.cds_coordinate_to_bed(aug_t, 0, cds_len, self.rgb, self.column)
     self.dumpValueDicts(classify_dict, details_dict)
 def run(self):
     """
     Flag Augustus transcripts that lie on a different strand or chromosome than their paired transcript.

     The paired transcript is looked up in self.transcriptDict under the Augustus ID after
     psl_lib.remove_augustus_alignment_number has been applied; Augustus transcripts without a paired entry are
     not classified. Flagged transcripts get value 1 and the BED record returned by seq_lib.transcript_to_bed;
     the rest get value 0.
     """
     self.getAugustusTranscriptDict()
     self.getTranscriptDict()
     classify_dict = {}
     details_dict = defaultdict(list)
     for aug_aId, aug_t in self.augustusTranscriptDict.iteritems():
         tm_aId = psl_lib.remove_augustus_alignment_number(aug_aId)
         if tm_aId not in self.transcriptDict:
             continue
         t = self.transcriptDict[tm_aId]
         relocated = aug_t.strand != t.strand or aug_t.chromosome != t.chromosome
         if not relocated:
             classify_dict[aug_aId] = 0
             continue
         classify_dict[aug_aId] = 1
         details_dict[aug_aId] = seq_lib.transcript_to_bed(aug_t, self.rgb, self.column)
     self.dumpValueDicts(classify_dict, details_dict)
예제 #4
0
 def augustus_transcript_transmap_iterator(self):
     """
     Yield (aug_id, aug_t, t) for every pair produced by self.augustus_transcript_iterator(), where t is the
     entry of self.transcript_dict stored under aug_id with its Augustus alignment number removed.

     self.transcript_dict is loaded through self.get_transcript_dict() first if it is still None. No membership
     check is performed before the lookup, so a missing key propagates whatever the mapping raises.
     """
     if self.transcript_dict is None:
         self.get_transcript_dict()
     for aug_id, aug_t in self.augustus_transcript_iterator():
         tm_id = psl_lib.remove_augustus_alignment_number(aug_id)
         yield aug_id, aug_t, self.transcript_dict[tm_id]
 def run(self):
     """
     Flag Augustus transcripts whose ID, once the alignment number is removed, occurs more than once.

     Occurrences are counted over all keys of self.augustusTranscriptDict after every "-<digits>-" run in the
     key has been collapsed to a single "-". Flagged transcripts get value 1 and a BED record named
     "<self.column>_<count - 1>_Copies"; the rest get value 0.

     NOTE(review): the counter keys are produced by the regex, while the lookup key is produced by
     psl_lib.remove_augustus_alignment_number. The count is only non-zero if both yield the same string --
     confirm against psl_lib.
     """
     self.getAugustusTranscriptDict()
     aln_number = re.compile("-[0-9]+-")
     counts = Counter()
     for aug_aId in self.augustusTranscriptDict:
         counts["-".join(aln_number.split(aug_aId))] += 1
     classify_dict = {}
     details_dict = {}
     for aug_aId, aug_t in self.augustusTranscriptDict.iteritems():
         # Counter returns 0 for unknown keys, so a failed lookup simply classifies as 0.
         copies = counts[psl_lib.remove_augustus_alignment_number(aug_aId)]
         if copies > 1:
             bed_name = self.column + "_{}_Copies".format(copies - 1)
             details_dict[aug_aId] = seq_lib.transcript_to_bed(aug_t, self.rgb, bed_name)
             classify_dict[aug_aId] = 1
         else:
             classify_dict[aug_aId] = 0
     self.dumpValueDicts(classify_dict, details_dict)
 def run(self):
     """
     Classify each Augustus transcript by how many entries of self.augustusTranscriptDict share its ID once
     the alignment number has been stripped.

     The tally is built from every key with each "-<digits>-" run replaced by "-". A transcript whose tally
     exceeds 1 is given value 1 plus a BED record named "<self.column>_<tally - 1>_Copies"; otherwise value 0.

     NOTE(review): tally keys come from the regex split/join, the lookup key from
     psl_lib.remove_augustus_alignment_number -- verify that both normalise an ID to the same string.
     """
     splitter = re.compile("-[0-9]+-")
     self.getAugustusTranscriptDict()
     normalised_ids = ["-".join(splitter.split(aug_aId)) for aug_aId in self.augustusTranscriptDict]
     counts = Counter(normalised_ids)
     details_dict = {}
     classify_dict = {}
     for aug_aId, aug_t in self.augustusTranscriptDict.iteritems():
         n_hits = counts[psl_lib.remove_augustus_alignment_number(aug_aId)]
         if n_hits <= 1:
             classify_dict[aug_aId] = 0
             continue
         details_dict[aug_aId] = seq_lib.transcript_to_bed(
             aug_t, self.rgb, self.column + "_{}_Copies".format(n_hits - 1))
         classify_dict[aug_aId] = 1
     self.dumpValueDicts(classify_dict, details_dict)
 def run(self):
     """
     Mark Augustus transcripts whose strand or chromosome disagrees with the paired transcript.

     Pairing uses self.transcriptDict keyed by the Augustus ID passed through
     psl_lib.remove_augustus_alignment_number; unpaired Augustus transcripts are left unclassified. A
     disagreement yields value 1 together with the seq_lib.transcript_to_bed record of the Augustus
     transcript, agreement yields value 0.
     """
     self.getAugustusTranscriptDict()
     self.getTranscriptDict()
     classify_dict = {}
     details_dict = defaultdict(list)
     for aug_aId, aug_t in self.augustusTranscriptDict.iteritems():
         paired_id = psl_lib.remove_augustus_alignment_number(aug_aId)
         if paired_id not in self.transcriptDict:
             continue
         t = self.transcriptDict[paired_id]
         mismatch = aug_t.strand != t.strand or aug_t.chromosome != t.chromosome
         classify_dict[aug_aId] = 1 if mismatch else 0
         if mismatch:
             details_dict[aug_aId] = seq_lib.transcript_to_bed(aug_t, self.rgb, self.column)
     self.dumpValueDicts(classify_dict, details_dict)
 def run(self, shortIntronSize=30):
     """
     Classify Augustus transcripts by whether any of their exon intervals satisfies
     seq_lib.interval_not_intersect_intervals against the paired transcript's gap-merged exon intervals.

     Pairing uses self.transcriptDict keyed by the Augustus ID passed through
     psl_lib.remove_augustus_alignment_number. Unpaired transcripts and pairs on different strands or
     chromosomes are left unclassified. Value 1 comes with one BED record per offending exon interval, named
     "<self.column>/<aug_aId>"; value 0 has no details.

     :param shortIntronSize: gap handed to seq_lib.gap_merge_intervals for the paired transcript's exons.
     """
     self.getAugustusTranscriptDict()
     self.getTranscriptDict()
     classify_dict = {}
     details_dict = defaultdict(list)
     for aug_aId, aug_t in self.augustusTranscriptDict.iteritems():
         paired_id = psl_lib.remove_augustus_alignment_number(aug_aId)
         if paired_id not in self.transcriptDict:
             continue
         t = self.transcriptDict[paired_id]
         if aug_t.strand != t.strand or aug_t.chromosome != t.chromosome:
             continue
         merged_exons = seq_lib.gap_merge_intervals(t.exonIntervals, gap=shortIntronSize)
         novel_beds = [exon.get_bed(self.rgb, "/".join([self.column, aug_aId]))
                       for exon in aug_t.exonIntervals
                       if seq_lib.interval_not_intersect_intervals(merged_exons, exon)]
         classify_dict[aug_aId] = 1 if novel_beds else 0
         if novel_beds:
             # Only touch the defaultdict when there is something to store, so no empty entries appear.
             details_dict[aug_aId].extend(novel_beds)
     self.dumpValueDicts(classify_dict, details_dict)
 def initializeDb(self, dbPath, classifiers, dataType=None):
     """
     For every (genome, file) pair in self.genomes / self.augustusGps, read the Augustus alignment IDs from
     the file and call initializeSqlTable, initializeSqlRows and buildNameRow with them.

     :param dbPath: passed unchanged to the three initialisation helpers.
     :param classifiers: iterable of objects providing __name__ (used as column name) and, when dataType is
                         None, a dataType() callable (used as column type).
     :param dataType: if not None, used as the type of every column instead of each classifier's dataType().
     """
     if dataType is None:
         columnDefinitions = [[x.__name__, x.dataType()] for x in classifiers]
     else:
         columnDefinitions = [[x.__name__, dataType] for x in classifiers]
     for genome, gp in izip(self.genomes, self.augustusGps):
         # The Augustus alignment ID (primary key) is the 12th whitespace-separated field of each line.
         # Read inside a with-block so the handle is closed before moving on to the next genome.
         with open(gp) as gp_handle:
             aug_aIds = set(x.split()[11] for x in gp_handle)
         # Same iteration order as aug_aIds, since the set is not modified in between.
         aIds = [psl_lib.remove_augustus_alignment_number(x) for x in aug_aIds]
         self.initializeSqlTable(dbPath, genome, columnDefinitions, self.primaryKeyColumn)
         self.initializeSqlRows(dbPath, genome, aug_aIds, self.primaryKeyColumn)
         self.buildNameRow(dbPath, genome, aug_aIds, aIds, self.primaryKeyColumn)
def is_tie(best_alns):
    """
    Return True as soon as two IDs in best_alns become equal once
    psl_lib.remove_augustus_alignment_number has been applied to them, otherwise False.

    Per the original description this is meant to detect that, among several best transcripts, at least one
    comes from transMap and one from Augustus.
    """
    unique_ids = set()
    stripped_ids = (psl_lib.remove_augustus_alignment_number(aln_id) for aln_id in best_alns)
    for ens_id in stripped_ids:
        if ens_id in unique_ids:
            return True
        unique_ids.add(ens_id)
    return False
 def run(self):
     """
     Classify Augustus transcripts by whether their thickStart and thickStop both equal those of the paired
     transcript.

     Pairing uses self.transcriptDict keyed by the Augustus ID passed through
     psl_lib.remove_augustus_alignment_number. Nothing is recorded for unpaired transcripts, for pairs on
     different strands or chromosomes, or when the paired transcript has thickStart == thickStop.

     A difference gives value 1. With an Augustus CDS length s > 9 the details are a two-element list of
     seq_lib.cds_coordinate_to_bed records (coordinates 0-3 and (s-3)-s); otherwise the details are the single
     record for 0-s, stored without a surrounding list. No difference gives value 0.
     """
     self.getAugustusTranscriptDict()
     self.getTranscriptDict()
     classify_dict = {}
     details_dict = {}
     for aug_aId, aug_t in self.augustusTranscriptDict.iteritems():
         paired_id = psl_lib.remove_augustus_alignment_number(aug_aId)
         if paired_id not in self.transcriptDict:
             continue
         t = self.transcriptDict[paired_id]
         if aug_t.strand != t.strand or aug_t.chromosome != t.chromosome or t.thickStart == t.thickStop:
             continue
         if t.thickStart != aug_t.thickStart or t.thickStop != aug_t.thickStop:
             classify_dict[aug_aId] = 1
             cds_len = aug_t.getCdsLength()
             if cds_len > 9:
                 # Report the first and the last three CDS bases separately.
                 cds_spans = [(0, 3), (cds_len - 3, cds_len)]
                 details_dict[aug_aId] = [
                     seq_lib.cds_coordinate_to_bed(aug_t, begin, end, self.rgb, self.column)
                     for begin, end in cds_spans
                 ]
             else:
                 details_dict[aug_aId] = seq_lib.cds_coordinate_to_bed(aug_t, 0, cds_len, self.rgb, self.column)
         else:
             classify_dict[aug_aId] = 0
     self.dumpValueDicts(classify_dict, details_dict)
예제 #12
0
 def initializeDb(self, dbPath, classifiers, dataType=None):
     """
     Read the Augustus alignment IDs of every genome from its file in self.augustusGps and hand them to
     initializeSqlTable, initializeSqlRows and buildNameRow.

     :param dbPath: passed unchanged to the three initialisation helpers.
     :param classifiers: iterable of objects providing __name__ (used as column name) and, when dataType is
                         None, a dataType() callable (used as column type).
     :param dataType: if not None, used as the type of every column instead of each classifier's dataType().
     """
     if dataType is None:
         columnDefinitions = [[x.__name__, x.dataType()]
                              for x in classifiers]
     else:
         columnDefinitions = [[x.__name__, dataType] for x in classifiers]
     for genome, gp in izip(self.genomes, self.augustusGps):
         # The Augustus alignment ID (primary key) is the 12th whitespace-separated field of each line.
         # The with-block closes the file promptly instead of leaving the handle open.
         with open(gp) as gp_handle:
             aug_aIds = set(x.split()[11] for x in gp_handle)
         # Iterates aug_aIds in the same order the helpers will see, as the set is not modified in between.
         aIds = [
             psl_lib.remove_augustus_alignment_number(x) for x in aug_aIds
         ]
         self.initializeSqlTable(dbPath, genome, columnDefinitions,
                                 self.primaryKeyColumn)
         self.initializeSqlRows(dbPath, genome, aug_aIds,
                                self.primaryKeyColumn)
         self.buildNameRow(dbPath, genome, aug_aIds, aIds,
                           self.primaryKeyColumn)
예제 #13
0
def database(genome, db, db_path, tmp_dir, mode):
    """
    Load every pickled column found in <tmp_dir>/<db> and write them with sql_lib.write_dict.

    Each file in the directory is unpickled into one entry of the data dict, keyed by file name. The index
    label is "TranscriptId" for mode "reference", "AlignmentId" for mode "transMap" and
    "AugustusAlignmentId" for any other mode. In that last case an extra "AlignmentId" column is added that
    maps each key of one (arbitrary) loaded column to psl_lib.remove_augustus_alignment_number(key).

    :param genome: passed to sql_lib.write_dict.
    :param db: name of the sub-directory of tmp_dir holding the pickled columns.
    :param db_path: output path; its parent directory is created with mkdir_p.
    :param tmp_dir: directory containing the db sub-directory.
    :param mode: "reference", "transMap", or anything else for the Augustus behaviour.
    """
    mkdir_p(os.path.dirname(db_path))
    data_path = os.path.join(tmp_dir, db)
    data_dict = {}
    for col in os.listdir(data_path):
        with open(os.path.join(data_path, col)) as col_handle:
            data_dict[col] = pickle.load(col_handle)
    is_augustus = mode != "reference" and mode != "transMap"
    if is_augustus:
        index_label = "AugustusAlignmentId"
        # Hack to add transMap alignment ID column to Augustus databases.
        # Fails if no column was loaded, because there is no first value to take the keys from.
        first_column = data_dict.itervalues().next()
        data_dict["AlignmentId"] = dict(
            (aug_id, psl_lib.remove_augustus_alignment_number(aug_id))
            for aug_id in first_column.viewkeys()
        )
    else:
        index_label = "TranscriptId" if mode == "reference" else "AlignmentId"
    sql_lib.write_dict(data_dict, db_path, genome, index_label)
예제 #14
0
def align(target, g, target_fasta, chunk, ref_fasta, out_path):
    """
    Run blat for every Augustus ID in chunk against its reference sequence and write one row per ID.

    For each ID the Augustus sequence (from target_fasta) and the reference sequence (from ref_fasta, keyed by
    the ID with both alignment numbers removed) are written to tmp_aug / tmp_gencode in the target's local
    temp dir and aligned with blat. Rows are "<aug_aId>\t<identity>\t<coverage>", or "<aug_aId>\t0\t0" when no
    PSL lines remain, and go to a randomly named .txt file in out_path.

    :param target: object providing getLocalTempDir().
    :param g: not used by this function.
    :param target_fasta: path opened with Fasta, indexed by Augustus ID.
    :param chunk: iterable of Augustus alignment IDs.
    :param ref_fasta: path opened with Fasta, indexed by reference ID.
    :param out_path: directory receiving the result file.
    """
    target_seqs = Fasta(target_fasta)
    ref_seqs = Fasta(ref_fasta)
    rows = []
    for aug_aId in chunk:
        gencode_id = remove_alignment_number(remove_augustus_alignment_number(aug_aId))
        gencode_seq = str(ref_seqs[gencode_id])
        aug_seq = str(target_seqs[aug_aId])
        tmp_aug = os.path.join(target.getLocalTempDir(), "tmp_aug")
        tmp_gencode = os.path.join(target.getLocalTempDir(), "tmp_gencode")
        fastaWrite(tmp_aug, aug_aId, aug_seq)
        fastaWrite(tmp_gencode, gencode_id, gencode_seq)
        blat_cmd = "blat {} {} -out=psl -noHead /dev/stdout".format(tmp_gencode, tmp_aug)
        # The last three newline-split entries are discarded -- presumably blat's status output plus the
        # trailing empty string; confirm against the blat version in use.
        psl_lines = popenCatch(blat_cmd).split("\n")[:-3]
        if psl_lines:
            p_list = [PslRow(line) for line in psl_lines]
            rows.append(map(str, [aug_aId, identity(p_list), coverage(p_list)]))
        else:
            rows.append([aug_aId, "0", "0"])
    out_file = os.path.join(out_path, getRandomAlphaNumericString(10) + ".txt")
    with open(out_file, "w") as outf:
        for row in rows:
            outf.write("\t".join(row) + "\n")
예제 #15
0
def main_augustus_fn(target, comp_ann_path, gencode, genome, base_out_path, filter_chroms):
    """
    Produce, for the two Augustus modes "I1" and "I2", a classifier barplot and a clustering child job.

    Classifier data is loaded from the "augustus" table and restricted to Augustus IDs whose stripped
    alignment ID is among the first elements of the values returned by sql_lib.highest_cov_aln. For each mode
    the IDs containing "I<mode>" are munged, plotted with plot_lib.barplot, dumped to a CSV in the global
    temp dir, and handed to r_wrapper as a child target.

    :param target: object providing getGlobalTempDir() and addChildTargetFn().
    :param comp_ann_path: passed to sql_lib.attach_databases.
    :param gencode: reference set name, used in plot titles and output file names.
    :param genome: genome name, used for queries and output paths.
    :param base_out_path: root below which "augustus_classifier_breakdown/<genome>" is created.
    :param filter_chroms: not used by this function.
    """
    clust_title = "Hierarchical_clustering_of_augustus_classifiers"
    base_barplot_title = ("Augustus classifiers failed by {:,} transcripts derived from transMap\n"
                          "on the reference set {} with Augustus {}")
    out_path = os.path.join(base_out_path, "augustus_classifier_breakdown", genome)
    mkdir_p(out_path)
    con, cur = sql_lib.attach_databases(comp_ann_path, mode="augustus")
    highest_cov_dict = sql_lib.highest_cov_aln(cur, genome)
    # First element of every value in highest_cov_dict.
    highest_cov_ids = set(zip(*highest_cov_dict.itervalues())[0])
    sql_data = sql_lib.load_data(con, genome, etc.config.aug_classifiers, primary_key="AugustusAlignmentId",
                                 table="augustus")
    base_filter_set = set(aug_id for aug_id in sql_data.index
                          if psl_lib.remove_augustus_alignment_number(aug_id) in highest_cov_ids)
    mode_descriptions = (("1", "trusting RNAseq less"), ("2", "trusting RNAseq more"))
    for mode, aug_mode in mode_descriptions:
        mode_tag = "I{}".format(mode)
        filter_set = set(aug_id for aug_id in base_filter_set if mode_tag in aug_id)
        file_suffix = "{}_{}_{}".format(genome, gencode, mode_tag)
        out_barplot_file = os.path.join(out_path, "augustus_barplot_" + file_suffix)
        out_cluster_file = os.path.join(out_path, "augustus_clustering_" + file_suffix)
        barplot_title = base_barplot_title.format(len(filter_set), gencode, aug_mode)
        munged, stats = munge_data(sql_data, filter_set)
        plot_lib.barplot(stats, out_path, out_barplot_file, barplot_title)
        data_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
        munged.to_csv(data_path)
        target.addChildTargetFn(r_wrapper, args=[data_path, clust_title, out_cluster_file])
def align(target, target_fasta, chunk, ref_fasta, file_tree):
    """
    Align every target sequence in chunk against the reference sequences with blat, chain the hits with
    simpleChain and write the best hit per target to a temp file of file_tree.

    All target sequences go into one FASTA (tmp_aug) and all reference sequences into another (tmp_gencode);
    blat is run once with tmp_aug as its first and tmp_gencode as its second argument. Each output row is
    "<tgt_id>,<query_id>,<best_cov>,<best_ident>", where query_id is tgt_id with its Augustus alignment number
    removed. Targets without any PSL record get "0" for both values.

    :param target: object providing getGlobalTempDir().
    :param target_fasta: path opened with Fasta, indexed by target (Augustus) ID.
    :param chunk: re-iterable collection of target IDs (iterated twice and turned into a set).
    :param ref_fasta: path opened with Fasta, indexed by reference ID.
    :param file_tree: object providing getTempFile() for the result file.
    :raises AssertionError: if none of the PSL target names is in chunk.
    """
    g_f = Fasta(target_fasta)
    r_f = Fasta(ref_fasta)
    results = []
    # Query ID per target ID. The reporting loop below must not rely on the loop variable left over from the
    # FASTA-writing loop, which would label every row with the query ID of the last chunk element.
    query_ids = {}
    tmp_aug = os.path.join(target.getGlobalTempDir(), "tmp_aug")
    tmp_gencode = os.path.join(target.getGlobalTempDir(), "tmp_gencode")
    tmp_psl = os.path.join(target.getGlobalTempDir(), "tmp_psl")
    with open(tmp_aug, "w") as tmp_aug_h, open(tmp_gencode,
                                               "w") as tmp_gencode_h:
        for tgt_id in chunk:
            query_id = remove_augustus_alignment_number(tgt_id)
            query_ids[tgt_id] = query_id
            gencode_id = remove_alignment_number(query_id)
            gencode_seq = str(r_f[gencode_id])
            aug_seq = str(g_f[tgt_id])
            fastaWrite(tmp_aug_h, tgt_id, aug_seq)
            fastaWrite(tmp_gencode_h, gencode_id, gencode_seq)
    system("blat {} {} -out=psl -noHead {}".format(tmp_aug, tmp_gencode,
                                                   tmp_psl))
    r = popenCatch("simpleChain -outPsl {} /dev/stdout".format(tmp_psl))
    # Drop the empty string that follows the final newline.
    r = r.split("\n")[:-1]
    r_d = defaultdict(list)
    for p in tokenize_stream(r):
        psl = PslRow(p)
        r_d[psl.t_name].append(psl)
    assert len(r_d.viewkeys() & set(chunk)) > 0, (r_d.viewkeys(), set(chunk))
    for tgt_id in chunk:
        query_id = query_ids[tgt_id]
        if tgt_id not in r_d:
            results.append([tgt_id, query_id, "0", "0"])
        else:
            p_list = [[min(x.coverage, x.target_coverage), x.identity]
                      for x in r_d[tgt_id]]
            # Stable sort + last element: among equal coverages the latest record wins.
            best_cov, best_ident = sorted(p_list, key=lambda x: x[0])[-1]
            results.append(map(str, [tgt_id, query_id, best_cov, best_ident]))
    with open(file_tree.getTempFile(), "w") as outf:
        for x in results:
            outf.write("".join([",".join(x), "\n"]))
예제 #17
0
def strip_alignment_numbers(aln_id):
    """
    Convenience function: apply remove_augustus_alignment_number and then remove_alignment_number to aln_id,
    stripping both the Augustus and the transMap alignment number.
    """
    without_augustus_number = remove_augustus_alignment_number(aln_id)
    return remove_alignment_number(without_augustus_number)
예제 #18
0
# Exploratory comparison of Augustus vs transMap alignment statistics for the best protein-coding alignments.
seq_dict = seq_lib.get_sequence_dict(target_fasta)
ref_seq_dict = seq_lib.get_sequence_dict(ref_fasta)

con, cur = sql_lib.attach_databases(
    "/hive/groups/recon/projs/gorilla_eichler/pipeline_data/comparative/susie_3_2/comparativeAnnotation/2015-10-12/GencodeBasicV23",
    mode="augustus",
)

genome = 'gorilla'
ref_genome = 'human'
biotype = 'protein_coding'
filter_chroms = ["Y", "chrY"]

stats = merge_stats(cur, 'gorilla')
highest_cov_dict = sql_lib.highest_cov_aln(cur, "gorilla")
# First element of every value in highest_cov_dict, restricted to the protein_coding alignment IDs.
highest_cov_ids = set(zip(*highest_cov_dict.itervalues())[0])
biotype_ids = sql_lib.get_biotype_aln_ids(cur, 'gorilla', 'protein_coding')
highest_cov_ids &= biotype_ids

# Keep only statistics whose ID, with the Augustus alignment number removed, is a highest-coverage ID.
best_stats = dict(
    (aln_id, values) for aln_id, values in stats.iteritems()
    if psl_lib.remove_augustus_alignment_number(aln_id) in highest_cov_ids
)
# IDs that are themselves highest-coverage IDs are the transMap entries ...
best_tm = dict((aln_id, values) for aln_id, values in best_stats.iteritems() if aln_id in highest_cov_ids)
# ... and every other entry of best_stats (already filtered on the stripped ID above) is an Augustus entry.
best_aug = dict((aln_id, values) for aln_id, values in best_stats.iteritems() if aln_id not in highest_cov_ids)

# Bin each Augustus entry by how its (coverage, identity) compares with the matching transMap entry.
r = {"higher_cov": [], "higher_ident": [], "higher_both": [], "worse": []}
for aug_id, (aug_cov, aug_ident) in best_aug.iteritems():
    tm_cov, tm_ident = best_tm[psl_lib.remove_augustus_alignment_number(aug_id)]
    cov_gain = aug_cov > tm_cov
    ident_gain = aug_ident > tm_ident
    if cov_gain and ident_gain:
        category = "higher_both"
    elif cov_gain:
        category = "higher_cov"
    elif ident_gain:
        category = "higher_ident"
    else:
        category = "worse"
    r[category].append(aug_id)