def run(self):
    """
    Record, for every transcript, how many other alignments share its ID once the
    alignment number has been removed.

    The count (number of alignments sharing the stripped ID, minus one) is stored in
    self.classify_dict. When it is positive, a record built by
    seq_lib.transcript_to_bed is appended to self.details_dict for that alignment.
    Results are then written out via self.dump_results_to_disk().
    """
    alignments_per_base_id = Counter()
    for aln_id, _ in self.alignment_iterator():
        alignments_per_base_id[psl_lib.remove_alignment_number(aln_id)] += 1

    for aln_id, t in self.transcript_iterator():
        base_id = psl_lib.remove_alignment_number(aln_id)
        extra_copies = alignments_per_base_id[base_id] - 1
        if extra_copies > 0:
            # the record name carries the number of additional copies
            record_name = self.column + "_{}_Copies".format(extra_copies)
            self.details_dict[aln_id].append(seq_lib.transcript_to_bed(t, self.rgb, record_name))
        self.classify_dict[aln_id] = extra_copies

    self.dump_results_to_disk()
def run(self):
    """
    Flag every transcript whose ID, with the alignment number removed, is shared by
    more than one entry of self.transcriptDict.

    Flagged transcripts are classified 1 and get a record from
    seq_lib.transcript_to_bed whose name includes the number of additional copies;
    all others are classified 0. Both dicts are handed to self.dumpValueDicts().
    """
    self.getTranscriptDict()
    copies_per_base_id = Counter(psl_lib.remove_alignment_number(aln_id) for aln_id in self.transcriptDict)
    details = {}
    classifications = {}
    for aln_id, tx in self.transcriptDict.iteritems():
        n_copies = copies_per_base_id[psl_lib.remove_alignment_number(aln_id)]
        if n_copies <= 1:
            classifications[aln_id] = 0
            continue
        record_name = self.column + "_{}_Copies".format(n_copies - 1)
        details[aln_id] = seq_lib.transcript_to_bed(tx, self.rgb, record_name)
        classifications[aln_id] = 1
    self.dumpValueDicts(classifications, details)
def run(self):
    """
    Flag transcripts that contain a stop codon anywhere except the final codon pair.

    Transcripts whose own CDS, or whose annotation's CDS, is 75 bases or shorter are
    skipped and receive no classification. For every other transcript each codon pair
    from codonPairIterator (except the last) is inspected; a target codon translating
    to "*" classifies the transcript as 1. The record uses self.colors["input"] when
    the query codon is identical and self.rgb otherwise.
    """
    self.getTranscriptDict()
    self.getAnnotationDict()
    self.getSeqDict()
    self.getRefDict()
    self.getAlignmentDict()
    details = defaultdict(list)
    classifications = {}
    for aln_id, tx in self.transcriptDict.iteritems():
        annot = self.annotationDict[psl_lib.remove_alignment_number(aln_id)]
        aln = self.alignmentDict[aln_id]
        if annot.getCdsLength() <= 75 or tx.getCdsLength() <= 75:
            continue
        # TODO: this will miss an inframe stop if it is the last 3 bases that are not the annotated stop.
        # use the logic from EndStop to flag this
        codon_pairs = list(codonPairIterator(annot, tx, aln, self.seqDict, self.refDict))
        for pos, target_codon, query_codon in codon_pairs[:-1]:
            if seq_lib.codon_to_amino_acid(target_codon) != "*":
                continue
            # identical codons mean the stop is already present on the query side
            color = self.colors["input"] if target_codon == query_codon else self.rgb
            details[aln_id].append(seq_lib.cds_coordinate_to_bed(tx, pos, pos + 3, color, self.column))
            classifications[aln_id] = 1
        classifications.setdefault(aln_id, 0)
    self.dumpValueDicts(classifications, details)
def run(self):
    """
    Flag transcripts whose CDS does not end in one of TAA / TGA / TAG, or for which
    one of the probed CDS positions fails to map (maps to None).

    Transcripts whose own CDS, or whose annotation's CDS, is 75 bases or shorter are
    skipped and receive no classification. Flagged transcripts get a record covering
    the last three CDS bases: self.colors["input"] when the annotation's CDS also lacks
    a stop codon at its end, self.rgb otherwise.
    """
    stopCodons = ('TAA', 'TGA', 'TAG')
    self.getAlignmentDict()
    self.getTranscriptDict()
    self.getAnnotationDict()
    self.getSeqDict()
    self.getRefDict()
    details = {}
    classifications = {}
    for aln_id, tx in self.transcriptDict.iteritems():
        annot = self.annotationDict[psl_lib.remove_alignment_number(aln_id)]
        aln = self.alignmentDict[aln_id]
        if annot.getCdsLength() <= 75 or tx.getCdsLength() <= 75:
            continue
        cds_len = tx.getCdsLength()
        # NOTE(review): the window is derived from the target CDS length but is fed to the
        # annotation's cds_coordinate_to_transcript, and spans cds_len - 4 .. cds_len - 2
        # rather than the final three bases -- confirm this is intended.
        cds_positions = []
        for i in xrange(cds_len - 4, cds_len - 1):
            tx_pos = annot.cds_coordinate_to_transcript(i)
            target_pos = aln.query_coordinate_to_target(tx_pos)
            cds_positions.append(tx.chromosome_coordinate_to_cds(target_pos))
        if None not in cds_positions and tx.get_cds(self.seqDict)[-3:] in stopCodons:
            classifications[aln_id] = 0
            continue
        # does this problem exist in the reference?
        if annot.get_cds(self.refDict)[-3:] not in stopCodons:
            color = self.colors["input"]
        else:
            color = self.rgb
        details[aln_id] = seq_lib.cds_coordinate_to_bed(tx, cds_len - 3, cds_len, color, self.column)
        classifications[aln_id] = 1
    self.dumpValueDicts(classifications, details)
def run(self, shortIntronSize=30):
    """
    Flag transcripts with an intron whose first two / last two bases are not a
    donor -> acceptor pair listed in self.non_canonical.

    Introns of length shortIntronSize or less are ignored, as are introns lying within
    [thickStart, thickStop) of the transcript. For each flagged intron,
    compareIntronToReference decides the record colour: self.colors["input"] when it
    returns True, self.rgb otherwise. Transcripts with no flagged intron are classified 0.

    :param shortIntronSize: introns of this length or shorter are not evaluated.
    """
    self.getTranscriptDict()
    self.getSeqDict()
    self.getAlignmentDict()
    self.getAnnotationDict()
    self.getRefDict()
    details = defaultdict(list)
    classifications = {}
    for aln_id, tx in self.transcriptDict.iteritems():
        annot = self.annotationDict[psl_lib.remove_alignment_number(aln_id)]
        aln = self.alignmentDict[aln_id]
        for intron in tx.intronIntervals:
            if len(intron) <= shortIntronSize:
                continue
            if intron.start >= tx.thickStart and intron.stop < tx.thickStop:
                continue
            intron_seq = intron.get_sequence(self.seqDict, strand=True)
            donor = intron_seq[:2]
            acceptor = intron_seq[-2:]
            if donor in self.non_canonical and self.non_canonical[donor] == acceptor:
                continue
            classifications[aln_id] = 1
            # is this a intron that exists in the reference that also has this problem?
            if compareIntronToReference(intron, annot, aln, self.non_canonical, self.refDict) is True:
                color = self.colors["input"]
            else:
                color = self.rgb
            details[aln_id].append(seq_lib.splice_intron_interval_to_bed(tx, intron, color, self.column))
        classifications.setdefault(aln_id, 0)
    self.dumpValueDicts(classifications, details)
def run(self):
    """
    Flag transcripts whose CDS does not begin with ATG, or for which one of the first
    three annotation CDS positions fails to map (maps to None).

    Transcripts whose own CDS, or whose annotation's CDS, is 75 bases or shorter are
    skipped and receive no classification. An unmappable position is always reported
    with self.rgb. A non-ATG start is reported with self.colors["input"] when the
    annotation's CDS also does not start with ATG, and with self.rgb otherwise.
    """
    self.getTranscriptDict()
    self.getAlignmentDict()
    self.getAnnotationDict()
    self.getSeqDict()
    self.getRefDict()
    details = {}
    classifications = {}
    for aln_id, tx in self.transcriptDict.iteritems():
        annot = self.annotationDict[psl_lib.remove_alignment_number(aln_id)]
        aln = self.alignmentDict[aln_id]
        # do not include noncoding transcripts or lift-overs with a CDS of 25 codons or fewer
        if annot.getCdsLength() <= 75 or tx.getCdsLength() <= 75:
            continue
        cds_positions = []
        for i in xrange(3):
            tx_pos = annot.cds_coordinate_to_transcript(i)
            target_pos = aln.query_coordinate_to_target(tx_pos)
            cds_positions.append(tx.chromosome_coordinate_to_cds(target_pos))
        if None in cds_positions:
            color = self.rgb
        elif tx.get_cds(self.seqDict)[:3] != "ATG":
            if annot.get_cds(self.refDict)[:3] != "ATG":
                color = self.colors["input"]
            else:
                color = self.rgb
        else:
            classifications[aln_id] = 0
            continue
        details[aln_id] = seq_lib.cds_coordinate_to_bed(tx, 0, 3, color, self.column)
        classifications[aln_id] = 1
    self.dumpValueDicts(classifications, details)
def run(self):
    """
    Flag transcripts whose first coding exon (in transcription order) has a non-zero
    frame.

    Transcripts whose own CDS, or whose annotation's CDS, is 75 bases or shorter are
    skipped and receive no classification. The annotation is checked first: if it is
    out of frame the record uses self.colors["input"] and the transcript itself is not
    examined. Otherwise the transcript is checked and reported with self.rgb.
    """
    def starts_out_of_frame(rec):
        # exon frames of -1 are dropped because those are UTR exons
        coding_frames = [x for x in rec.exonFrames if x != -1]
        return rec.strand is True and coding_frames[0] != 0 or rec.strand is False and coding_frames[-1] != 0

    self.getTranscriptDict()
    self.getAnnotationDict()
    details = {}
    classifications = {}
    for aln_id, tx in self.transcriptDict.iteritems():
        annot = self.annotationDict[psl_lib.remove_alignment_number(aln_id)]
        # do not include noncoding transcripts or lift-overs with a CDS of 25 codons or fewer
        if annot.getCdsLength() <= 75 or tx.getCdsLength() <= 75:
            continue
        if starts_out_of_frame(annot):
            # the problem already exists in the reference annotation
            color = self.colors["input"]
        elif starts_out_of_frame(tx):
            color = self.rgb
        else:
            classifications[aln_id] = 0
            continue
        classifications[aln_id] = 1
        details[aln_id] = seq_lib.cds_coordinate_to_bed(tx, 0, 3, color, self.column)
    self.dumpValueDicts(classifications, details)
def alignment_refalignment_transcript_annotation_iterator(self):
    """
    Convenience generator extending alignment_refalignment_transcript_iterator: each
    yielded tuple additionally carries the annotation_dict entry looked up by the
    alignment ID with its alignment number removed.

    Loads the annotation dict on first use if it has not been loaded yet.
    """
    if self.annotation_dict is None:
        self.get_annotation_dict()
    for aln_id, aln, ref_aln, tx in self.alignment_refalignment_transcript_iterator():
        base_id = psl_lib.remove_alignment_number(aln_id)
        annotation = self.annotation_dict[base_id]
        yield aln_id, aln, ref_aln, tx, annotation
def run(self, shortIntronSize=30):
    """
    Flag transcripts that are missing at least one intron of their annotation.

    Each transcript intron is mapped through the alignment into the annotation's
    chromosome coordinates and compared with the annotation's own (start, stop) intron
    pairs. If any annotation intron is absent, the transcript is classified 1 and every
    transcript intron that does not correspond to an annotation intron, and is at least
    shortIntronSize long, is recorded. Otherwise the transcript is classified 0.
    Alignments without an entry in self.transcriptDict are ignored.

    :param shortIntronSize: minimum length for a non-matching intron to be recorded.
    """
    self.getAnnotationDict()
    self.getTranscriptDict()
    self.getAlignmentDict()
    details = defaultdict(list)
    classifications = {}
    for aln_id, aln in self.alignmentDict.iteritems():
        if aln_id not in self.transcriptDict:
            continue
        tx = self.transcriptDict[aln_id]
        annot = self.annotationDict[psl_lib.remove_alignment_number(aln_id)]
        original_introns = set()
        for annot_intron in annot.intronIntervals:
            original_introns.add((annot_intron.start, annot_intron.stop))
        target_introns = set()
        target_intron_mapping = {}
        for intron in tx.intronIntervals:
            query_start = aln.target_coordinate_to_query(intron.start - 1)
            query_stop = aln.target_coordinate_to_query(intron.stop)
            a_start = annot.transcript_coordinate_to_chromosome(query_start) + 1
            a_stop = annot.transcript_coordinate_to_chromosome(query_stop)
            mapped = (a_start, a_stop)
            target_introns.add(mapped)
            target_intron_mapping[mapped] = intron
        if not (original_introns - target_introns):
            classifications[aln_id] = 0
            continue
        classifications[aln_id] = 1
        for mapped in target_introns - original_introns:
            intron = target_intron_mapping[mapped]
            if len(intron) >= shortIntronSize:
                details[aln_id].append(seq_lib.splice_intron_interval_to_bed(tx, intron, self.rgb, self.column))
    self.dumpValueDicts(classifications, details)
def run(self):
    """
    Record, for every transcript yielded by self.transcript_iterator(), the `start`
    attribute of its annotation (looked up by the alignment ID with the alignment
    number removed), then write the mapping with dump_attribute_results_to_disk().
    """
    self.get_annotation_dict()
    results_dict = {}
    for aln_id, _ in self.transcript_iterator():
        base_id = psl_lib.remove_alignment_number(aln_id)
        results_dict[aln_id] = self.annotation_dict[base_id].start
    self.dump_attribute_results_to_disk(results_dict)
def alignment_transcript_annotation_iterator(self):
    """
    Convenience generator yielding (aln_id, alignment, transcript, annotation), where
    the annotation is the annotation_dict entry for the alignment ID with its alignment
    number removed.

    Loads the annotation dict on first use if it has not been loaded yet.
    """
    if self.annotation_dict is None:
        self.get_annotation_dict()
    for aln_id, aln, tx in self.alignment_transcript_iterator():
        base_id = psl_lib.remove_alignment_number(aln_id)
        annotation = self.annotation_dict[base_id]
        yield aln_id, aln, tx, annotation
def main():
    """
    Write one tab-separated line per alignment to args.outPath: the alignment ID
    followed by the comma-joined vector returned by build_intron_vector.

    Alignments are processed in sorted ID order. The reference alignment is looked up
    by the alignment ID with its alignment number removed.
    """
    args = parse_args()
    aln_dict = psl_lib.get_alignment_dict(args.psl)
    ref_aln_dict = psl_lib.get_alignment_dict(args.refPsl)
    # NOTE(review): the call below is truncated in this copy of the source -- its
    # argument(s) and closing parenthesis are missing, so the function does not parse
    # as written. Restore the original argument before use.
    tx_dict = seq_lib.get_transcript_dict(
    with open(args.outPath, "w") as outf:
        for aln_id, aln in sorted(aln_dict.iteritems(), key=lambda x: x[0]):
            ref_aln = ref_aln_dict[psl_lib.remove_alignment_number(aln_id)]
            t = tx_dict[aln_id]
            vec = build_intron_vector(aln, ref_aln, t, args.fuzz_distance)
            # ",".join requires every element of vec to be a string
            outf.write("{}\t{}\n".format(aln_id, ",".join(vec)))
def run(self, cdsCutoff=75):
    """
    Flag transcripts whose CDS, or whose annotation's CDS, is cdsCutoff bases or
    shorter.

    Transcripts whose annotation CDS is shorter than 3 bases are skipped and receive no
    classification. A short annotation CDS is reported with self.colors["input"]; a
    short transcript CDS (with an annotation CDS above the cutoff) is reported with
    self.rgb. Everything else is classified 0.

    :param cdsCutoff: CDS lengths less than or equal to this value are flagged.
    """
    self.getTranscriptDict()
    self.getAnnotationDict()
    details = {}
    classifications = {}
    for aln_id, tx in self.transcriptDict.iteritems():
        annot = self.annotationDict[psl_lib.remove_alignment_number(aln_id)]
        # do not include noncoding transcripts
        if annot.getCdsLength() < 3:
            continue
        if annot.getCdsLength() <= cdsCutoff:
            color = self.colors["input"]
        elif tx.getCdsLength() <= cdsCutoff:
            color = self.rgb
        else:
            classifications[aln_id] = 0
            continue
        details[aln_id] = seq_lib.transcript_to_bed(tx, color, self.column)
        classifications[aln_id] = 1
    self.dumpValueDicts(classifications, details)
def run(self):
    """
    Flag transcripts for which frameShiftIterator yields at least one entry and record
    the intervals derived from those entries.

    Alignments without a transcript are ignored. Transcripts whose own CDS, or whose
    annotation's CDS, is 75 bases or shorter are skipped and receive no classification.
    Transcripts with no frame shifts are classified 0. Otherwise the indel starts/stops
    are paired into windows using the running sum of spans modulo 3, one record is
    appended per window, and the transcript is classified 1.
    """
    self.getAlignmentDict()
    self.getTranscriptDict()
    self.getAnnotationDict()
    detailsDict = defaultdict(list)
    classifyDict = {}
    for aId, aln in self.alignmentDict.iteritems():
        if aId not in self.transcriptDict:
            continue
        t = self.transcriptDict[aId]
        a = self.annotationDict[psl_lib.remove_alignment_number(aId)]
        # do not include noncoding transcripts or lift-overs with a CDS of 25 codons or fewer
        if a.getCdsLength() <= 75 or t.getCdsLength() <= 75:
            continue
        frame_shifts = list(frameShiftIterator(a, t, aln))
        if len(frame_shifts) == 0:
            classifyDict[aId] = 0
            continue
        # transpose the (start, stop, span) triples into three parallel tuples
        indel_starts, indel_stops, spans = zip(*frame_shifts)
        # calculate cumulative frame by adding each span and taking mod 3 - zeroes imply regaining frame
        # note that this code prepends a 0 to the list, offsetting all values by 1. This is useful.
        # (list.append returns None, so `l.append(...) or l` evaluates to the grown list)
        cumulative_frame = map(lambda x: x % 3, reduce(lambda l, v: (l.append(l[-1] + v) or l), spans, [0]))
        # every start is when a zero existed in the previous spot in cumulative_frame
        windowed_starts = [x for x, y in izip(indel_starts, cumulative_frame) if y == 0 or x == indel_starts[0]]
        # every stop is when a zero exists at this cumulative_frame
        windowed_stops = [x for x, y in izip(indel_stops, cumulative_frame[1:]) if y == 0]
        # sanity check: there is either one stop per start, or exactly one start left open
        assert any([len(windowed_starts) == len(windowed_stops), len(windowed_starts) - 1 == len(windowed_stops)]),\
            (self.genome, self.column, aId)
        # now we need to fix frame and stops - if this shift extends to the end of the transcript, add that stop
        # additionally, if this is a negative strand transcript, flip starts/stops so that start is always < stop
        if len(windowed_stops) < len(windowed_starts) and t.strand is False:
            windowed_stops.append(t.thickStart)
            windowed_stops, windowed_starts = windowed_starts, windowed_stops
        elif len(windowed_stops) < len(windowed_starts):
            windowed_stops.append(t.thickStop)
        elif t.strand is False:
            windowed_stops, windowed_starts = windowed_starts, windowed_stops
        for start, stop in izip(windowed_starts, windowed_stops):
            detailsDict[aId].append(seq_lib.chromosome_coordinate_to_bed(t, start, stop, self.rgb, self.column))
        classifyDict[aId] = 1
    self.dumpValueDicts(classifyDict, detailsDict)
def run(self):
    """
    Flag transcripts whose CDS length is not a multiple of 3.

    Transcripts whose own CDS, or whose annotation's CDS, is 75 bases or shorter are
    skipped and receive no classification. A flagged transcript gets a record spanning
    thickStart..thickStop: self.colors["input"] when the annotation's CDS length is also
    not a multiple of 3, self.rgb otherwise.
    """
    self.getAlignmentDict()
    self.getTranscriptDict()
    self.getAnnotationDict()
    details = {}
    classifications = {}
    for aln_id, tx in self.transcriptDict.iteritems():
        annot = self.annotationDict[psl_lib.remove_alignment_number(aln_id)]
        if annot.getCdsLength() <= 75 or tx.getCdsLength() <= 75:
            continue
        if tx.getCdsLength() % 3 == 0:
            classifications[aln_id] = 0
            continue
        if annot.getCdsLength() % 3 != 0:
            color = self.colors["input"]
        else:
            color = self.rgb
        details[aln_id] = seq_lib.chromosome_coordinate_to_bed(tx, tx.thickStart, tx.thickStop, color,
                                                               self.column)
        classifications[aln_id] = 1
    self.dumpValueDicts(classifications, details)
def run(self, mult3=False):
    """
    Flag transcripts with at least one deletion (as yielded by deletionIterator) that
    starts at or after thickStart and stops before thickStop.

    Alignments without a transcript are ignored. Transcripts whose own CDS, or whose
    annotation's CDS, is 75 bases or shorter are skipped and receive no classification.

    :param mult3: passed through unchanged to deletionIterator.
    """
    self.getAlignmentDict()
    self.getTranscriptDict()
    self.getAnnotationDict()
    details = {}
    classifications = {}
    for aln_id, aln in self.alignmentDict.iteritems():
        if aln_id not in self.transcriptDict:
            continue
        tx = self.transcriptDict[aln_id]
        annot = self.annotationDict[psl_lib.remove_alignment_number(aln_id)]
        # do not include noncoding transcripts or lift-overs with a CDS of 25 codons or fewer
        if annot.getCdsLength() <= 75 or tx.getCdsLength() <= 75:
            continue
        coding_deletions = []
        for start, stop, size in deletionIterator(annot, tx, aln, mult3):
            if start >= tx.thickStart and stop < tx.thickStop:
                coding_deletions.append(seq_lib.chromosome_region_to_bed(tx, start, stop, self.rgb, self.column))
        if coding_deletions:
            details[aln_id] = coding_deletions
            classifications[aln_id] = 1
        else:
            classifications[aln_id] = 0
    self.dumpValueDicts(classifications, details)
def categorized_plot(cur, highest_cov_dict, genomes, out_path, file_name, biotype, biotype_ids, gencode, query_fn):
    """
    Bar-plot, per genome, the percentage of biotype_ids that a query categorizes.

    For each genome the query built by query_fn(genome, biotype, details=False) is run
    through sql_lib.get_query_ids. A returned ID is counted when it is among the first
    fields of that genome's highest_cov_dict entries and its alignment-number-stripped
    form is in biotype_ids.

    :param cur: database cursor handed to sql_lib.get_query_ids.
    :param highest_cov_dict: genome -> dict whose values are sequences; only the first
        element of each value is used.
    :param genomes: iterable of genome names, plotted in this order.
    :param out_path: passed to plot_lib.barplot.
    :param file_name: passed to plot_lib.barplot.
    :param biotype: biotype name, used in the query and in the plot title.
    :param biotype_ids: collection of transcript IDs of this biotype; must be non-empty,
        otherwise the normalisation divides by zero.
    :param gencode: annotation set name used in the plot title.
    :param query_fn: query factory; its __name__ is used in the plot title.
    """
    results = []
    for g in genomes:
        # Collect the first field of every entry directly. The previous
        # zip(*values)[0] idiom raised IndexError when a genome had no entries at all.
        best_ids = {entry[0] for entry in highest_cov_dict[g].itervalues()}
        query = query_fn(g, biotype, details=False)
        categorized_ids = sql_lib.get_query_ids(cur, query)
        num_categorized = len({
            x for x in categorized_ids
            if x in best_ids and psl_lib.remove_alignment_number(x) in biotype_ids
        })
        # the 0.01 factor expresses the result as a percentage
        norm = num_categorized / (0.01 * len(biotype_ids))
        results.append([g, norm, num_categorized])
    title_string = "Proportion of {:,} {} transcripts in {}\ncategorized as {}"
    title_string = title_string.format(len(biotype_ids), biotype, gencode, query_fn.__name__)
    plot_lib.barplot(results, out_path, file_name, title_string, adjust_y=False)
def align(target, g, target_fasta, chunk, ref_fasta, out_path):
    """
    Run blat for every ID in chunk between its sequence in target_fasta and the
    corresponding sequence in ref_fasta, and write one tab-separated row per ID
    (ID, identity, coverage) to a randomly named .txt file inside out_path.

    The reference ID is obtained by stripping first the Augustus alignment number and
    then the alignment number from the target ID. IDs for which no PSL lines remain
    are reported with "0" for both values.

    :param target: object providing getLocalTempDir() for scratch files.
    :param g: not used by this function.
    :param target_fasta: path opened with Fasta; indexed by the IDs in chunk.
    :param chunk: iterable of target sequence IDs.
    :param ref_fasta: path opened with Fasta; indexed by the stripped IDs.
    :param out_path: directory receiving the results file.
    """
    target_seqs = Fasta(target_fasta)
    ref_seqs = Fasta(ref_fasta)
    rows = []
    for aug_aId in chunk:
        gencode_id = remove_alignment_number(remove_augustus_alignment_number(aug_aId))
        gencode_seq = str(ref_seqs[gencode_id])
        aug_seq = str(target_seqs[aug_aId])
        tmp_aug = os.path.join(target.getLocalTempDir(), "tmp_aug")
        tmp_gencode = os.path.join(target.getLocalTempDir(), "tmp_gencode")
        fastaWrite(tmp_aug, aug_aId, aug_seq)
        fastaWrite(tmp_gencode, gencode_id, gencode_seq)
        blat_cmd = "blat {} {} -out=psl -noHead /dev/stdout".format(tmp_gencode, tmp_aug)
        # the last three newline-separated pieces of the output are discarded
        psl_lines = popenCatch(blat_cmd).split("\n")[:-3]
        if len(psl_lines) == 0:
            rows.append([aug_aId, "0", "0"])
            continue
        psl_recs = [PslRow(line) for line in psl_lines]
        rows.append([str(v) for v in [aug_aId, identity(psl_recs), coverage(psl_recs)]])
    out_file = os.path.join(out_path, getRandomAlphaNumericString(10) + ".txt")
    with open(out_file, "w") as outf:
        for row in rows:
            outf.write("\t".join(row) + "\n")
def run(self):
    """
    Flag transcripts containing a codon pair whose codons differ but translate to the
    same amino acid.

    Alignments without a transcript are ignored. Transcripts whose own CDS, or whose
    annotation's CDS, is 75 bases or shorter are skipped and receive no classification.
    Each such codon pair from codonPairIterator adds one record; transcripts without any
    are classified 0.
    """
    self.getTranscriptDict()
    self.getAnnotationDict()
    self.getSeqDict()
    self.getRefDict()
    self.getAlignmentDict()
    details = defaultdict(list)
    classifications = {}
    for aln_id, aln in self.alignmentDict.iteritems():
        if aln_id not in self.transcriptDict:
            continue
        tx = self.transcriptDict[aln_id]
        annot = self.annotationDict[psl_lib.remove_alignment_number(aln_id)]
        if annot.getCdsLength() <= 75 or tx.getCdsLength() <= 75:
            continue
        for pos, target_codon, query_codon in codonPairIterator(annot, tx, aln, self.seqDict, self.refDict):
            if target_codon == query_codon:
                continue
            if seq_lib.codon_to_amino_acid(target_codon) != seq_lib.codon_to_amino_acid(query_codon):
                continue
            details[aln_id].append(seq_lib.cds_coordinate_to_bed(tx, pos, pos + 3, self.rgb, self.column))
            classifications[aln_id] = 1
        classifications.setdefault(aln_id, 0)
    self.dumpValueDicts(classifications, details)
def align(target, target_fasta, chunk, ref_fasta, file_tree):
    """
    Align every sequence in chunk against its reference sequence with a single blat
    run, chain the hits with simpleChain, and write one comma-separated row per target
    ID (target ID, query ID, best coverage, identity of that hit) to a temp file
    obtained from file_tree.

    The query ID is the target ID with its Augustus alignment number removed; the
    reference sequence ID additionally has the alignment number removed. Target IDs
    without any chained hit are reported with "0" for both values.

    :param target: object providing getGlobalTempDir() for scratch files.
    :param target_fasta: path opened with Fasta; indexed by the IDs in chunk.
    :param chunk: iterable of target sequence IDs; iterated more than once.
    :param ref_fasta: path opened with Fasta; indexed by the stripped IDs.
    :param file_tree: object providing getTempFile() for the results file.
    :raises AssertionError: if none of the chained hits has a t_name in chunk.
    """
    g_f = Fasta(target_fasta)
    r_f = Fasta(ref_fasta)
    results = []
    tmp_aug = os.path.join(target.getGlobalTempDir(), "tmp_aug")
    tmp_gencode = os.path.join(target.getGlobalTempDir(), "tmp_gencode")
    tmp_psl = os.path.join(target.getGlobalTempDir(), "tmp_psl")
    with open(tmp_aug, "w") as tmp_aug_h, open(tmp_gencode, "w") as tmp_gencode_h:
        for tgt_id in chunk:
            query_id = remove_augustus_alignment_number(tgt_id)
            gencode_id = remove_alignment_number(query_id)
            gencode_seq = str(r_f[gencode_id])
            aug_seq = str(g_f[tgt_id])
            fastaWrite(tmp_aug_h, tgt_id, aug_seq)
            fastaWrite(tmp_gencode_h, gencode_id, gencode_seq)
    # both FASTA handles are closed (and therefore flushed) before blat reads the files
    system("blat {} {} -out=psl -noHead {}".format(tmp_aug, tmp_gencode, tmp_psl))
    r = popenCatch("simpleChain -outPsl {} /dev/stdout".format(tmp_psl))
    r = r.split("\n")[:-1]
    r_d = defaultdict(list)
    for p in tokenize_stream(r):
        psl = PslRow(p)
        r_d[psl.t_name].append(psl)
    assert len(r_d.viewkeys() & set(chunk)) > 0, (r_d.viewkeys(), set(chunk))
    for tgt_id in chunk:
        # Recompute the query ID for this target. Previously the value left over from the
        # last iteration of the FASTA-writing loop was reused, so every output row carried
        # the query ID of the final entry in chunk.
        query_id = remove_augustus_alignment_number(tgt_id)
        if tgt_id not in r_d:
            results.append([tgt_id, query_id, "0", "0"])
        else:
            p_list = [[min(x.coverage, x.target_coverage), x.identity] for x in r_d[tgt_id]]
            # stable sort + [-1]: among equally covered hits the last one wins
            best_cov, best_ident = sorted(p_list, key=lambda x: x[0])[-1]
            results.append(map(str, [tgt_id, query_id, best_cov, best_ident]))
    with open(file_tree.getTempFile(), "w") as outf:
        for x in results:
            outf.write("".join([",".join(x), "\n"]))


# The scratch notes below trailed this function in the original file; they look like an
# R session and are kept verbatim as comments so the Python stays importable.
# rules <- apriori(data.transactions, parameter=list(support=0.01, confidence=0.7))
# i have no f*****g clue how to interpret this, but at least the frequency plot is nice
# now I want to re-run this on just Basic set protein_coding. I can't just dump to csv this time.
# Scratch session: export classifier columns for one genome table, restricted to IDs
# that are in both the basic set and the protein_coding set, then cluster the columns.
# The first part is Python; the part after the CSV is written is R.
# NOTE(review): `cur` and `tm_fields` are not defined in this span -- presumably left
# over from earlier in the interactive session or supplied by the star imports; confirm.
from scripts.coverage_identity_ok_plots import *
from scripts.consensus import *
attrs = "/cluster/home/ifiddes/mus_strain_data/pipeline_data/comparative/1504/transMap/2015-05-28/data/wgEncodeGencodeAttrsVM4.tsv"
coding_ids = get_all_ids(attrs, biotype="protein_coding")
# NOTE(review): the path below ends at a directory -- the file name appears to be
# missing. The file handle is also never closed.
basic_ids = {x.split()[0] for x in open("/cluster/home/ifiddes/mus_strain_data/pipeline_data/comparative/1504/transMap/2015-05-28/data/")}
basic_coding = {x for x in basic_ids if x in coding_ids}
tm_cmd = """SELECT AlignmentId,{} FROM main.'C57B6NJ'""".format(",".join(tm_fields))
r = cur.execute(tm_cmd).fetchall()
# keep only rows whose alignment-number-stripped ID is in the basic coding set
r_coding = [x for x in r if remove_alignment_number(x[0]) in basic_coding]
with open("transmap_coding_only.csv", "w") as outf:
    outf.write("AlignmentId," + ",".join(tm_fields) + "\n")
    for x in r_coding:
        outf.write(",".join(map(str, x)) + "\n")
# ---- R from here on: reads the CSV written above and plots a hierarchical clustering ----
data <- read.csv("/hive/users/ifiddes/comparativeAnnotator/transmap_coding_only.csv", row.names=1, header=T, na.strings="None")
# NOTE(review): the index expression inside data[] and the first argument of sapply()
# appear to have been lost from this copy -- restore them before running.
data[] <- 0
mat <- sapply(, as.logical)
mat.t <- t(mat)
library(stats)
d <- dist(mat.t, method="binary")
hc <- hclust(d, method="ward")
pdf("transMap_clustered_classifiers_coding_basic.pdf")
plot(hc)
def strip_alignment_numbers(aln_id):
    """
    Convenience function that removes both the Augustus alignment number and the
    transMap alignment number from an alignment ID, in that order.
    """
    without_augustus_number = remove_augustus_alignment_number(aln_id)
    return remove_alignment_number(without_augustus_number)
def run(self):
    """
    Map every alignment ID yielded by self.transcript_iterator() to the same ID with
    its alignment number removed, and write the mapping with
    dump_attribute_results_to_disk().
    """
    results_dict = {}
    for aln_id, _ in self.transcript_iterator():
        results_dict[aln_id] = psl_lib.remove_alignment_number(aln_id)
    self.dump_attribute_results_to_disk(results_dict)
def run(self):
    """
    Map every alignment ID in self.alignmentDict to the `chromosome` attribute of its
    annotation (looked up by the ID with the alignment number removed) and pass the
    mapping to dumpValueDict().
    """
    self.getAnnotationDict()
    self.getAlignmentDict()
    values = {}
    for aln_id in self.alignmentDict:
        annotation = self.annotationDict[psl_lib.remove_alignment_number(aln_id)]
        values[aln_id] = annotation.chromosome
    self.dumpValueDict(values)
def run(self):
    """
    Map every alignment ID in self.alignmentDict to the `transcriptType` attribute of
    its attributeDict entry (looked up by the ID with the alignment number removed) and
    pass the mapping to dumpValueDict().
    """
    self.getAttributeDict()
    self.getAlignmentDict()
    values = {}
    for aln_id in self.alignmentDict:
        attributes = self.attributeDict[psl_lib.remove_alignment_number(aln_id)]
        values[aln_id] = attributes.transcriptType
    self.dumpValueDict(values)
def run(self):
    """
    Map every alignment ID in self.alignmentDict to the same ID with its alignment
    number removed and pass the mapping to dumpValueDict().
    """
    self.getAlignmentDict()
    values = {}
    for aln_id in self.alignmentDict:
        values[aln_id] = psl_lib.remove_alignment_number(aln_id)
    self.dumpValueDict(values)
def run(self):
    """
    Map every alignment ID in self.alignmentDict to seq_lib.convert_strand() applied to
    the `strand` attribute of its annotation (looked up by the ID with the alignment
    number removed) and pass the mapping to dumpValueDict().
    """
    self.getAnnotationDict()
    self.getAlignmentDict()
    values = {}
    for aln_id in self.alignmentDict:
        annotation = self.annotationDict[psl_lib.remove_alignment_number(aln_id)]
        values[aln_id] = seq_lib.convert_strand(annotation.strand)
    self.dumpValueDict(values)