def processChunk(prediction_id, gene_id, contig, strand, regions):
    """Record the third codon positions of one gene prediction.

    Walks the coding ``regions`` of a prediction in transcript order and
    appends one ``(prediction_id, gene_id, contig, strand, pos, pos + 1)``
    tuple per third codon position to the module-level ``locations``
    list.  The codon phase is carried over from one region to the next.

    Arguments
    ---------
    prediction_id, gene_id, contig, strand :
        Identifiers copied verbatim into every emitted tuple.  Nothing
        is done if ``gene_id`` is None.
    regions : iterable of (start, end)
        Half-open coding intervals.  On the negative strand they are
        mirrored and sorted here; on the positive strand they are used
        in the order given.

    Notes
    -----
    Relies on the names ``locations`` and ``options`` being defined at
    module level (they are not visible in this block).  A warning is
    written to ``options.stdlog`` if the total coding length is not a
    multiple of three.
    """
    if gene_id is None:
        return

    regions = list(regions)
    if not regions:
        # nothing to annotate; also avoids max() failing on an empty list
        return

    is_negative = Genomics.IsNegativeStrand(strand)
    if is_negative:
        # convert to negative strand coordinates counting from 0
        coordinate_offset = max(end for _, end in regions)
        # sorted() instead of map().sort(): map returns an iterator
        # in Python 3 and has no sort method.
        regions = sorted(
            (coordinate_offset - end, coordinate_offset - start)
            for start, end in regions)
    else:
        coordinate_offset = 0

    # number of bases of an incomplete codon left over from the
    # previous region
    offset = 0
    for start, end in regions:
        start -= offset
        for x in range(start + 2, end, 3):
            if is_negative:
                # the factor -1 results from the open/closed
                # bracket notation
                c = coordinate_offset - x - 1
            else:
                # forward strand: the position is already a forward
                # coordinate (previously hard-coded to 0)
                c = x
            locations.append(
                (prediction_id, gene_id, contig, strand, c, c + 1))
        offset = (end - start) % 3

    if offset != 0 and options.loglevel >= 1:
        options.stdlog.write(
            "# WARNING: prediction=%s, gene=%s on %s:%s : frame did not add up\n" %
            (prediction_id, gene_id, contig, strand))
def annotateRegulons(iterator, fasta, tss, options):
    """Annotate regulons within iterator.

    For every transcript a window of ``options.upstream`` /
    ``options.downstream`` bases is placed around the transcription
    start site (if ``tss`` is true) or around the transcription
    termination site (otherwise).  Windows are clipped to the contig
    and written to ``options.stdout`` as entries with source
    ``regulon``.

    Entries specified with ``--restrict-source`` are annotated.
    """
    ngenes = ntranscripts = nregulons = 0
    upstream, downstream = options.upstream, options.downstream

    for gene in GTF.gene_iterator(iterator):
        ngenes += 1
        template = gene[0][0]
        is_negative_strand = Genomics.IsNegativeStrand(template.strand)
        lcontig = fasta.getLength(template.contig)

        regulons, transcript_ids = [], []
        for transcript in gene:
            ntranscripts += 1
            mi = min(x.start for x in transcript)
            ma = max(x.end for x in transcript)

            # the window is centred on the 5' end for a tss and on the
            # 3' end for a tts; which genomic coordinate that is
            # depends on the strand.
            if tss:
                anchor = ma if is_negative_strand else mi
            else:
                anchor = mi if is_negative_strand else ma

            # upstream/downstream are relative to the transcript, so
            # they swap sides on the negative strand.
            if is_negative_strand:
                lo, hi = anchor - downstream, anchor + upstream
            else:
                lo, hi = anchor - upstream, anchor + downstream

            # clip to contig boundaries
            regulons.append(
                tuple(min(lcontig, max(0, p)) for p in (lo, hi)))
            transcript_ids.append(transcript[0].transcript_id)

        if options.merge_promotors:
            # merge the regulons (and rename - as sort order might
            # have changed)
            regulons = Intervals.combine(regulons)
            transcript_ids = [
                "%i" % (n + 1) for n in range(len(regulons))]

        gtf = GTF.Entry()
        gtf.fromGTF(template, template.gene_id, template.gene_id)
        gtf.source = "regulon"

        for (start, end), transcript_id in zip(regulons, transcript_ids):
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_id
            options.stdout.write("%s\n" % str(gtf))
            nregulons += 1

    E.info("ngenes=%i, ntranscripts=%i, nregulons=%i" %
           (ngenes, ntranscripts, nregulons))
def annotateTTS(iterator, fasta, options):
    """Annotate termination sites within iterator.

    For every transcript an interval of ``options.promotor`` bases
    past the 3' end is computed, clipped to the contig and written to
    ``options.stdout`` with source ``tts``.

    Entries specified with ``--restrict-source`` are annotated.
    """
    ngenes = ntranscripts = npromotors = 0
    width = options.promotor

    for gene in GTF.gene_iterator(iterator):
        ngenes += 1
        template = gene[0][0]
        is_negative_strand = Genomics.IsNegativeStrand(template.strand)
        lcontig = fasta.getLength(template.contig)

        tts, transcript_ids = [], []
        for transcript in gene:
            ntranscripts += 1
            mi = min(x.start for x in transcript)
            ma = max(x.end for x in transcript)
            transcript_ids.append(transcript[0].transcript_id)

            # if the tts is directly at the start/end of the contig,
            # the interval will lie within an exon. Otherwise, it is
            # outside an exon.
            if is_negative_strand:
                site = (max(0, mi - width), max(width, mi))
            else:
                site = (min(ma, lcontig - width),
                        min(lcontig, ma + width))
            tts.append(site)

        if options.merge_promotors:
            # merge the intervals (and rename - as sort order might
            # have changed)
            tts = Intervals.combine(tts)
            transcript_ids = ["%i" % (n + 1) for n in range(len(tts))]

        gtf = GTF.Entry()
        gtf.fromGTF(template, template.gene_id, template.gene_id)
        gtf.source = "tts"

        for (start, end), transcript_id in zip(tts, transcript_ids):
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_id
            options.stdout.write("%s\n" % str(gtf))
            npromotors += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ngenes=%i, ntranscripts=%i, ntss=%i\n" %
            (ngenes, ntranscripts, npromotors))
def annotateExons(iterator, fasta, options):
    """Annotate exons within iterator.

    Each distinct exon interval of a gene is written once to
    ``options.stdout``, annotated with the number of transcripts in
    the gene (``ntranscripts``), the number of transcripts using the
    exon (``nused``) and, per use, its rank and the exon count of
    that transcript (``pos``, formatted as ``rank:total``).

    ``fasta`` is accepted for signature compatibility and not used.
    """
    ninput = noutput = noverlapping = 0

    for this in GTF.gene_iterator(iterator):
        ninput += 1
        template = this[0][0]
        is_negative_strand = Genomics.IsNegativeStrand(template.strand)

        # (start, end) -> list of (rank in transcript, exons in transcript)
        usage = collections.defaultdict(list)
        for exons in this:
            # order exons 5' to 3' within the transcript
            exons.sort(key=lambda x: x.start)
            if is_negative_strand:
                exons.reverse()
            nexons = len(exons)
            for rank, exon in enumerate(exons, 1):
                usage[(exon.start, exon.end)].append((rank, nexons))

        gtf = GTF.Entry()
        gtf.fromGTF(template, template.gene_id, template.gene_id)
        gtf.addAttribute("ntranscripts", len(this))

        gtfs = []
        for coords, pos in usage.items():
            g = GTF.Entry().copy(gtf)
            g.start, g.end = coords
            g.addAttribute("nused", len(pos))
            g.addAttribute("pos", ",".join(["%i:%i" % x for x in pos]))
            gtfs.append(g)

        gtfs.sort(key=lambda x: x.start)
        for g in gtfs:
            options.stdout.write("%s\n" % str(g))

        # check for exon overlap: merging shrinks the list if any two
        # exons overlap
        spans = [(g.start, g.end) for g in gtfs]
        if len(Intervals.combine(spans)) != len(spans):
            noverlapping += 1
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverlapping=%i\n" %
            (ninput, noutput, noverlapping))
def transform_third_codon(start, end, intervals_with_gff):
    """Transform: keep only third codon positions within a window.

    Only nucleotide positions inside the window (``start``, ``end``)
    that fall on a third codon position are returned, as merged
    single-base intervals.

    Raises ValueError if an entry has no frame (``.``).
    """
    window_size = end - start
    positions = []

    for istart, iend, gff in intervals_with_gff:
        if gff.frame == ".":
            raise ValueError("need a frame for third codon positions.")

        # frame = nucleotides from start to next codon
        frame = int(gff.frame)

        # work in 0-based coordinates relative to the window; on the
        # negative strand the window is mirrored so that 0 is its 3'
        # genomic end.
        on_reverse = Genomics.IsNegativeStrand(gff.strand)
        if on_reverse:
            istart, iend = end - iend, end - istart
        else:
            istart, iend = istart - start, iend - start

        # interval starts before the window: move the start to the
        # window edge and adjust the frame accordingly
        if istart < 0:
            frame = (frame + istart) % 3
            istart = 0

        # advance to the first third codon position:
        # in frame -> +2, otherwise -(3 - frame) + 2 == frame - 1
        istart += 2 if frame == 0 else frame - 1
        iend = min(iend, window_size)

        for x in range(istart, iend, 3):
            # map back to genomic coordinates
            c = end - x - 1 if on_reverse else start + x
            positions.append((c, c + 1))

    return Intervals.combineIntervals(positions)
def buildSequenceVariants(self, seq, strand, pos, snp):
    '''build new sequence by modifying a sequence fragment in seq at
    pos with snp.

    It is assumed that seq is already oriented according to strand.
    The strand is used to revert the snp if necessary.

    Two kinds of variants are handled, selected by
    ``snp.reference_base``:

    * anything other than ``*``: a single base substitution. The
      bases encoded by ``snp.genotype`` are resolved with
      ``Genomics.resolveAmbiguousNA`` and each one is substituted at
      ``pos``.
    * ``*``: an indel. ``snp.genotype`` is split on ``/``; each part
      starts with ``+`` (insertion), ``-`` (deletion) or ``*``
      (no change).

    Note that only sequences different from seq will be returned.

    Raises ValueError if the base or the deleted bases found in seq
    do not match what the snp expects, or if an indel variant starts
    with an unknown sign.

    returns is_homozygous, seqs
    '''
    is_negative_strand = Genomics.IsNegativeStrand(strand)
    reference_base = snp.reference_base

    # seq is already in strand orientation, so the reference base has
    # to be complemented before it can be compared against seq
    if reference_base != "*" and is_negative_strand:
        reference_base = Genomics.complement(reference_base)

    new_sequences = []
    is_homozygous = True
    if reference_base != "*":
        # sanity check: seq must carry the reference base at pos
        # NOTE(review): the message iterates over snp, so snp is
        # presumably a tuple-like record - confirm
        if seq[pos].upper() != reference_base.upper():
            raise ValueError("base mismatch at snp %i, expected %s, got %s in %s at position %i; snp=%s" % (snp.pos, reference_base, seq[pos], seq, pos, ";".join(map(str, snp))))

        # single base changes
        variant_bases = Genomics.resolveAmbiguousNA(snp.genotype)
        # a genotype resolving to exactly one base is homozygous
        if len(variant_bases) == 1:
            is_homozygous = True
        else:
            is_homozygous = False

        for variant_base in variant_bases:
            if is_negative_strand:
                variant_base = Genomics.complement(variant_base)
            s = list(seq)
            s[pos] = variant_base
            s = "".join(s)
            # skip the allele that is identical to the input
            if s != seq:
                new_sequences.append(s)
    else:
        variants = snp.genotype.split("/")
        # indels are heterozygous unless a "*" allele is seen below
        is_homozygous = False
        for variant in variants:
            s = list(seq)
            # samtools denotes insert/deletion after position
            # while python is before/at position, hence the pos+1
            if variant[0] == "+":
                toinsert = variant[1:].upper()
                if is_negative_strand:
                    # "after" on the forward strand is "before" in
                    # the strand-oriented sequence
                    toinsert = Genomics.complement(toinsert)
                    s.insert(pos, toinsert)
                else:
                    s.insert(pos + 1, toinsert)
            elif variant[0] == "-":
                # pos+1+len(x)-1 = pos+len(x)
                todelete = variant[1:].upper()
                l = len(todelete)
                # the deletion is clipped to the sequence fragment;
                # the expected bases are truncated to the same extent
                if is_negative_strand:
                    # delete left of pos
                    xstart = max(0, pos - l)
                    xend = pos
                    todelete = todelete[:min(l, pos)]
                else:
                    # delete right of pos
                    xstart = pos + 1
                    xend = min(self.mSize, pos + 1 + l)
                    todelete = todelete[:self.mSize - (pos + 1)]

                deleted = "".join(s[xstart:xend])
                if is_negative_strand:
                    deleted = Genomics.complement(deleted)

                # sanity check: the bases removed must be the ones
                # the snp says are deleted
                if deleted != todelete:
                    raise ValueError("base mismatch at indel %i, expected %s, got %s in %s at position %i(%i:%i); is_negative_strand=%s, snp=%s" % (snp.pos, todelete, deleted, seq, pos, xstart, xend, is_negative_strand, ";".join(map(str, snp))))
                del s[xstart:xend]

            elif variant[0] == "*":
                # allele without change
                is_homozygous = True
            else:
                raise ValueError("unknown variant sign '%s'" % variant[0])

            s = "".join(s)
            # skip the allele that is identical to the input
            if s != seq:
                new_sequences.append(s)

    return is_homozygous, new_sequences
def annotateGenes(iterator, fasta, options):
    """Annotate gene structures.

    This method outputs intervals for first/middle/last exon/intron,
    UTRs and flanking regions.

    This method annotates per transcript. In order to achieve a unique
    tiling, use only a single transcript per gene and remove any
    overlap between genes.

    Arguments
    ---------
    iterator :
        GTF entries, grouped into genes by ``GTF.gene_iterator``.
    fasta :
        Object providing ``getLength(contig)``.  Genes on contigs
        for which it raises KeyError are skipped.
    options :
        Uses ``increment`` (width of a flank segment), ``flank``
        (total flank size), ``detail`` (may contain ``"introns"``
        and/or ``"exons"`` to split these into first/middle/last)
        and ``stdout``.

    Transcripts without any ``exon`` feature are counted as skipped
    and ignored.  Per gene, intervals with the same feature name are
    merged and written to ``options.stdout`` sorted by start.
    """
    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, nskipped = 0, 0, 0

    increment = options.increment
    introns_detail = "introns" in options.detail
    exons_detail = "exons" in options.detail

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        try:
            lcontig = fasta.getLength(gene[0][0].contig)
        except KeyError:
            nskipped += 1
            continue

        results = []

        for transcript in gene:

            def _add(interval, anno):
                """Append an entry for interval with feature anno."""
                gtf = GTF.Entry()
                gtf.contig = transcript[0].contig
                gtf.gene_id = transcript[0].gene_id
                gtf.transcript_id = transcript[0].transcript_id
                gtf.strand = transcript[0].strand
                gtf.feature = anno
                gtf.start, gtf.end = interval
                results.append(gtf)

            ntranscripts += 1

            exons = [(x.start, x.end)
                     for x in transcript if x.feature == "exon"]
            if not exons:
                # nothing to annotate; without this guard the code
                # below fails with an IndexError on exons[0]
                nskipped += 1
                continue

            exons.sort()
            # introns are the gaps between consecutive exons
            introns = []
            end = exons[0][1]
            for exon in exons[1:]:
                introns.append((end, exon[0]))
                end = exon[1]

            # add flank: segments of width increment, walking away
            # from the transcript on both sides
            start, end = exons[0][0], exons[-1][1]
            upstream, downstream = [], []
            for x in range(0, options.flank, increment):
                upstream.append((start - increment, start))
                start -= increment
                downstream.append((end, end + increment))
                end += increment

            # remove out-of-bounds coordinates
            upstream = [x for x in upstream if x[0] >= 0]
            downstream = [x for x in downstream if x[1] <= lcontig]

            # switch to transcript orientation
            if is_negative_strand:
                exons.reverse()
                introns.reverse()
                upstream, downstream = downstream, upstream

            # add exons
            if exons_detail:
                _add(exons[0], "first_exon")
                if len(exons) > 1:
                    _add(exons[-1], "last_exon")
                for e in exons[1:-1]:
                    _add(e, "middle_exon")
            else:
                for e in exons:
                    _add(e, "exon")

            # add introns
            if introns_detail:
                if len(introns) > 0:
                    _add(introns[0], "first_intron")
                if len(introns) > 1:
                    _add(introns[-1], "last_intron")
                for i in introns[1:-1]:
                    _add(i, "middle_intron")
            else:
                for i in introns:
                    _add(i, "intron")

            # flank segments are labelled with their outer distance
            for x, u in enumerate(upstream):
                _add(u, "upstream_%i" % (increment * (x + 1)))

            for x, u in enumerate(downstream):
                _add(u, "downstream_%i" % (increment * (x + 1)))

        # merge intervals of the same feature across the transcripts
        # of this gene; groupby requires the input sorted by its key
        results.sort(key=lambda x: x.feature)

        cache = []
        for key, vals in itertools.groupby(results,
                                           key=lambda x: x.feature):
            v = list(vals)
            intervals = [(x.start, x.end) for x in v]
            intervals = Intervals.combine(intervals)

            for start, end in intervals:
                r = GTF.Entry()
                r.copy(v[0])
                r.start, r.end = start, end
                cache.append(r)

        cache.sort(key=lambda x: x.start)
        for r in cache:
            options.stdout.write("%s\n" % str(r))

    E.info("ngenes=%i, ntranscripts=%i, nskipped=%i\n" %
           (ngenes, ntranscripts, nskipped))
def annotateGREATDomains(iterator, fasta, options):
    """build great domains

    A basal region is defined around the TSS of every gene and then
    extended by up to ``options.radius`` bases in both directions,
    without extending past the basal regions of the neighbouring
    groups or the contig boundaries.

    The extended regions are written to ``options.stdout``; counts
    are reported through ``E.info``.
    """
    gene_iterator = GTF.gene_iterator(iterator)

    counter = E.Counter()

    # NOTE(review): upstream and downstream are assigned but the code
    # below reads options.upstream / options.downstream directly
    upstream, downstream = options.upstream, options.downstream
    radius = options.radius
    outfile = options.stdout

    regions = []
    ####################################################################
    # define basal regions for each gene
    # take all basal regions per transcript and merge them
    # Thus, the basal region of a gene might be larger than the sum
    # of options.upstream + options.downstream
    for gene in gene_iterator:
        counter.genes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)

        lcontig = fasta.getLength(gene[0][0].contig)
        regulons = []
        # NOTE(review): transcript_ids is filled but never read
        transcript_ids = []

        # collect every basal region per transcript
        for transcript in gene:
            counter.transcripts += 1
            mi, ma = min([x.start for x in transcript]), max(
                [x.end for x in transcript])
            # add range to both sides of tss
            if is_negative_strand:
                interval = ma - options.downstream, ma + options.upstream
            else:
                interval = mi - options.upstream, mi + options.downstream

            # clip to contig boundaries
            interval = (min(lcontig, max(0, interval[0])),
                        min(lcontig, max(0, interval[1])))

            regulons.append(interval)
            transcript_ids.append(transcript[0].transcript_id)

        # take first/last entry
        start, end = min(x[0] for x in regulons), max(x[1] for x in regulons)

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "greatdomain"
        gtf.start, gtf.end = start, end
        regions.append(gtf)

    regions.sort(key=lambda x: (x.contig, x.start))

    # NOTE(review): the basal regions are dumped to a hard-coded file
    # "test.gff" in the current working directory - this looks like a
    # debugging leftover; confirm whether anything depends on it
    outf = IOTools.openFile("test.gff", "w")
    for x in regions:
        outf.write(str(x) + "\n")
    outf.close()

    ####################################################################
    # extend basal regions
    regions.sort(key=lambda x: (x.contig, x.start))

    # iterate within groups of overlapping basal regions
    groups = list(GTF.iterator_overlaps(iter(regions)))
    counter.groups = len(groups)

    # last_end: lower limit for extension, reset to 0 on a new contig
    last_end = 0
    # reset: set when the current group is the last one on its contig
    reset = False

    for region_id, group in enumerate(groups):

        # collect basal intervals in group
        intervals = [(x.start, x.end) for x in group]

        def overlapsBasalRegion(pos):
            # True if pos lies strictly inside one of the basal
            # intervals of the group; interval boundaries themselves
            # do not count. The early exit on start > pos relies on
            # the intervals being in ascending order of start.
            for start, end in intervals:
                if start == pos or end == pos:
                    continue
                if start <= pos < end:
                    return True
                if start > pos:
                    return False
            return False

        # deal with boundary cases - end of contig
        if region_id < len(groups) - 1:
            nxt = groups[region_id + 1]
            if nxt[0].contig == group[0].contig:
                next_start = min([x.start for x in nxt])
            else:
                next_start = fasta.getLength(group[0].contig)
                reset = True
        else:
            next_start = fasta.getLength(group[0].contig)
            reset = True

        # last_end = basal extension of previous group
        # next_start = basal_extension of next group

        # extend region to previous/next group always extend
        # downstream, but upstream only extend if basal region of an
        # interval is not already overlapping another basal region
        # within the group
        save_end = 0
        for gtf in group:
            # remember the largest basal (unextended) end in the group
            save_end = max(save_end, gtf.end)
            if gtf.strand == "+":
                if not overlapsBasalRegion(gtf.start):
                    gtf.start = max(gtf.start - radius, last_end)
                # always extend downstream
                gtf.end = min(gtf.end + radius, next_start)
            else:
                # always extend downstream
                gtf.start = max(gtf.start - radius, last_end)
                if not overlapsBasalRegion(gtf.end):
                    gtf.end = min(gtf.end + radius, next_start)
            outfile.write(str(gtf) + "\n")
            counter.regulons += 1

        if len(group) > 1:
            counter.overlaps += len(group)
        else:
            counter.nonoverlaps += 1

        if reset:
            last_end = 0
            reset = False
        else:
            last_end = save_end

    E.info("%s" % str(counter))
margin_sbjct_from, margin_sbjct_to, query_token, sbjct_token, sbjct_strand ) = segment try: lgenome = fasta.getLength( sbjct_token ) except KeyError: nunknown += 1 if sbjct_token not in unknown: unknown[sbjct_token] = 0 unknown[sbjct_token] += 1 continue min_sbjct_to = min( min_sbjct_to, lgenome ) margin_sbjct_to = min( margin_sbjct_to, lgenome ) if options.forward_coordinates: if Genomics.IsNegativeStrand( sbjct_strand ): margin_sbjct_from, margin_sbjct_to = lgenome-margin_sbjct_to, lgenome-margin_sbjct_from min_sbjct_from, min_sbjct_to = lgenome-min_sbjct_to, lgenome-min_sbjct_from if options.no_sequence: fragment = "" else: # get genomic sequence fragment = fasta.getSequence( sbjct_token, sbjct_strand, margin_sbjct_from, margin_sbjct_to, as_array = False) if peptide_sequences.has_key( query_token ): peptide_sequence = peptide_sequences[query_token] else: peptide_sequence = None
def processEntries(name, entries, options, fasta, contigs):
    """Write the exons of one transcript as a tab-separated table.

    ``entries`` is sorted in place into transcript order (descending
    start on the negative strand).  If any entry lies outside its
    contig (length taken from ``contigs``), a warning is issued and
    nothing is written.

    With ``options.convert_to_cds`` a single row spanning the summed
    exon length is written; otherwise one row per exon with its rank
    and cumulative cds coordinates.  With
    ``options.reset_coordinates`` the exon coordinates are shifted
    (in place) so that the smallest start becomes 0.

    ``fasta`` is accepted for signature compatibility and not used.

    Returns True if output was written, False if the transcript was
    rejected.
    """
    first = entries[0]
    is_negative = Genomics.IsNegativeStrand(first.strand)
    contig = first.contig
    lcontig = contigs[contig]

    # sort in-order in transcript
    entries.sort(key=lambda x: x.start)
    if is_negative:
        entries.reverse()

    for gff in entries:
        if gff.end > lcontig or gff.start >= lcontig:
            E.warn("coordinates for %s on %s out of bounds (%i:%i > %i)" %
                   (str(gff.mAttributes), contig, gff.start, gff.end,
                    lcontig))
            return False

    write = options.stdout.write

    if options.convert_to_cds:
        total = sum(gff.end - gff.start for gff in entries)
        row = (name, entries[0].contig, "+", 1, 0, 0, total, 0, total)
        write("\t".join(map(str, row)) + "\n")
        return True

    if not options.reset_coordinates:
        offset = 0
    else:
        # the entry with the smallest start is last on the negative
        # strand and first otherwise
        anchor = entries[-1] if is_negative else entries[0]
        offset = -anchor.start

    cds_start = 0
    for n, gff in enumerate(entries, 1):
        cds_end = cds_start + (gff.end - gff.start)
        if offset:
            gff.start += offset
            gff.end += offset
        row = (name, gff.contig, gff.strand, gff.frame, n,
               cds_start, cds_end, gff.start, gff.end)
        write("\t".join(map(str, row)) + "\n")
        cds_start = cds_end

    return True