def _get_counts_and_sequence(gtf_iterator, bam, fasta, seperate_UTRs=False):
    '''Yield (profile, sequence) pairs for each transcript and its introns.

    Helper for pentamer_enrichment. For every transcript produced by
    `gtf_iterator`, the exonic profile returned by `count_transcript` is
    yielded together with the concatenated sequence of its exons. If the
    introns of the transcript carry any counts, one further pair is
    yielded per intron, with the profile index shifted so that it is
    relative to the intron start.

    `seperate_UTRs` is accepted but is not used by this function.
    '''
    for transcript in gtf_iterator:
        head = transcript[0]
        E.debug("Counting transcript %s" % head.transcript_id)
        contig = head.contig
        strand = head.strand

        # exons: a single profile plus the spliced sequence
        exon_ranges = GTF.asRanges(transcript, "exon")
        exon_seqs = [fasta.getSequence(contig, strand, rng[0], rng[1])
                     for rng in exon_ranges]
        sequence = "".join(exon_seqs)
        exon_counts = count_transcript(transcript, bam)
        yield (exon_counts, sequence)

        # introns: counted together, then emitted one intron at a time
        intron_intervals = GTF.toIntronIntervals(transcript)
        intron_counts = count_intervals(bam, intron_intervals,
                                        contig, strand)
        if intron_counts.sum() == 0:
            continue

        for intron in intron_intervals:
            left, right = intron[0], intron[1]
            intron_seq = fasta.getSequence(contig, strand, left, right)
            intron_profile = intron_counts.loc[float(left):float(right)]
            # re-base the index so position 0 is the intron start
            intron_profile.index = intron_profile.index - left
            yield (intron_profile, intron_seq)
def _get_counts_and_sequence(gtf_iterator, bam, fasta, seperate_UTRs=False):
    '''Generate profiles and matching sequences for pentamer_enrichment.

    Each transcript from `gtf_iterator` produces:

    * one tuple of the exonic count profile (`count_transcript`) and the
      joined exon sequence, and
    * when the intronic counts do not sum to zero, one tuple per intron
      holding the counts sliced out for that intron (index re-based to
      the intron start) and the intron sequence.

    The `seperate_UTRs` argument is not used in the body.
    '''
    for transcript in gtf_iterator:
        first = transcript[0]
        E.debug("Counting transcript %s" % first.transcript_id)
        contig, strand = first.contig, first.strand

        # exons
        pieces = []
        for exon in GTF.asRanges(transcript, "exon"):
            pieces.append(fasta.getSequence(contig, strand, exon[0], exon[1]))
        sequence = "".join(pieces)
        exon_counts = count_transcript(transcript, bam)
        yield (exon_counts, sequence)

        # introns
        intron_intervals = GTF.toIntronIntervals(transcript)
        intron_counts = count_intervals(bam, intron_intervals,
                                        contig, strand)
        if intron_counts.sum() == 0:
            # nothing mapped to any intron of this transcript
            continue

        for intron in intron_intervals:
            begin = intron[0]
            finish = intron[1]
            seq = fasta.getSequence(contig, strand, begin, finish)
            profile = intron_counts.loc[float(begin):float(finish)]
            profile.index = profile.index - begin
            yield (profile, seq)
def processing_index(interval_iterator, bam, window_size=50):
    r'''Calculate the processing index for the specified sample.

    The provided `interval_iterator` supplies the cleavage sites. It can
    be GTF or BED, as long as each entry has start, end, contig and
    strand attributes. The cleavage site is taken to be `end` for "+"
    strand entries and `start` for "-" strand entries. `bam` is passed
    through unchanged to `count_intervals`, and `window_size` is the
    number of bases considered on each side of the cleavage site.

    The processing index for G genes is defined as:

    .. math::

       pi = log_2( \frac{\sum_{i=1}^{G} N_i^{PM}}{\sum_{i=1}^{G} N_i^M})

    after Baejen et al Mol Cell 5(55):745-757. Here the numerator is the
    sum of downstream counts and the denominator is the sum of
    (upstream - downstream) counts, floored at 1. However, Baejen et al
    normalise this number to the total number of genes, which seems
    wrong to me.

    Returns the index as a float (``-inf`` if no downstream counts were
    seen). Raises ValueError for an entry whose strand is neither "+"
    nor "-".
    '''
    n_pm = 0
    n_m = 0

    for site in interval_iterator:
        if site.strand == "+":
            pos = site.end
        elif site.strand == "-":
            pos = site.start
        else:
            raise ValueError(
                "processing index not valid for unstranded cleavage points in "
                "entry\n" + str(site)+"\n")

        upstream_interval = (pos - window_size, pos)
        downstream_interval = (pos, pos + window_size)

        counts = count_intervals(bam, [upstream_interval, downstream_interval],
                                 site.contig, site.strand)

        # We are in genome coordinates here, not transcript coordinates:
        # the counts are assumed to be indexed by genome position (other
        # code in this file slices count_intervals output by coordinate).
        # Selecting by position with .iloc therefore picked the wrong
        # rows, so split on the index labels instead.
        below = counts[counts.index < pos].sum()
        at_or_above = counts[counts.index >= pos].sum()

        if site.strand == "+":
            n_up, n_down = below, at_or_above
        else:
            # on the minus strand, upstream is the higher coordinate side
            n_up, n_down = at_or_above, below

        n_pm += n_down
        n_m += n_up - n_down

    # the denominator is floored at 1 to avoid dividing by zero or by a
    # negative number
    pi = np.log2(float(n_pm) / float(max(1, n_m)))

    return pi
def _get_counts_and_sequence(gtf_iterator, bam, fasta, seperate_UTRs=False):
    '''Yield (profile, sequence) pairs for each transcript and its introns.

    Helper for pentamer_enrichment. Entries without a `transcript_id`
    attribute are dropped, and transcripts left empty by that filter are
    skipped. Sequences are fetched from the "+" strand and reverse
    complemented for "-" strand transcripts. A transcript whose exon
    sequence lookup raises KeyError is skipped entirely.

    Only profiles with a positive sum are yielded: first the exonic
    profile with the joined exon sequence, then one pair per intron with
    the profile index shifted to be relative to the intron start.

    `seperate_UTRs` is accepted but is not used by this function.
    '''
    for entries in gtf_iterator:
        transcript = [e for e in entries if hasattr(e, "transcript_id")]
        if not transcript:
            continue

        head = transcript[0]
        contig, strand = head.contig, head.strand

        # exons
        exon_ranges = GTF.asRanges(transcript, "exon")
        try:
            sequence = "".join(
                fasta.getSequence(contig, "+", rng[0], rng[1])
                for rng in exon_ranges)
        except KeyError:
            # sequence lookup failed for this transcript; skip it
            continue

        if strand == "-":
            sequence = revcomp(sequence)

        exon_counts = count_transcript(transcript, bam)
        if exon_counts.sum() > 0:
            yield (exon_counts, sequence)

        # introns
        intron_intervals = GTF.toIntronIntervals(transcript)
        intron_counts = count_intervals(bam, intron_intervals,
                                        contig, strand)
        if intron_counts.sum() == 0:
            continue

        for intron in intron_intervals:
            left, right = intron[0], intron[1]

            intron_seq = fasta.getSequence(contig, "+", left, right)
            if strand == "-":
                intron_seq = revcomp(intron_seq)

            # NOTE(review): label slicing with .loc includes `right`, and
            # the profile stays in genome orientation even when the
            # sequence was reverse complemented - confirm this is what
            # the caller expects.
            intron_profile = intron_counts.loc[float(left):float(right)]
            intron_profile.index = intron_profile.index - left

            if intron_profile.sum() > 0:
                yield (intron_profile, intron_seq)
def _get_counts_and_sequence(gtf_iterator, bam, fasta, seperate_UTRs=False):
    '''Generate profiles and matching sequences for pentamer_enrichment.

    For each transcript from `gtf_iterator` (after discarding entries
    that lack a `transcript_id` attribute):

    * the exon sequence is read from the "+" strand and reverse
      complemented if the transcript is on "-"; a KeyError during that
      lookup skips the transcript;
    * the exonic profile from `count_transcript` is yielded with the
      sequence if its sum is positive;
    * if intronic counts do not sum to zero, each intron with a positive
      profile sum is yielded with its own sequence, the profile index
      being re-based to the intron start.

    The `seperate_UTRs` argument is not used in the body.
    '''
    for candidate in gtf_iterator:
        transcript = [entry for entry in candidate
                      if hasattr(entry, "transcript_id")]
        if not transcript:
            continue

        contig = transcript[0].contig
        strand = transcript[0].strand
        minus_strand = strand == "-"

        # exons
        exons = GTF.asRanges(transcript, "exon")
        try:
            sequence = "".join(
                fasta.getSequence(contig, "+", exon[0], exon[1])
                for exon in exons)
        except KeyError:
            continue
        if minus_strand:
            sequence = revcomp(sequence)

        exon_counts = count_transcript(transcript, bam)
        if exon_counts.sum() > 0:
            yield (exon_counts, sequence)

        # introns
        intron_intervals = GTF.toIntronIntervals(transcript)
        intron_counts = count_intervals(bam, intron_intervals,
                                        contig, strand)
        if intron_counts.sum() == 0:
            # no counts in any intron of this transcript
            continue

        for intron in intron_intervals:
            begin = intron[0]
            finish = intron[1]

            seq = fasta.getSequence(contig, "+", begin, finish)
            if minus_strand:
                seq = revcomp(seq)

            profile = intron_counts.loc[float(begin):float(finish)]
            profile.index = profile.index - begin

            if profile.sum() > 0:
                yield (profile, seq)
def _get_profiles_and_conveter(gtf_iterator, bam):
    '''Yield count profiles with their coordinate converters.

    For each transcript from `gtf_iterator` (entries lacking a
    `transcript_id` attribute are discarded, and empty transcripts are
    skipped) this generator yields 5-tuples of
    ``(profile, converter, LiteExon, contig, strand)``:

    * once for the exons, if the `count_transcript` profile has a
      positive sum, with a LiteExon spanning 0 to the converter length;
    * once per intron with a positive profile sum, using a converter
      built with ``introns=True`` and a LiteExon spanning the intron in
      converted coordinates. Introns are only examined when the intronic
      counts do not sum to zero.
    '''
    for candidate in gtf_iterator:
        transcript = [entry for entry in candidate
                      if hasattr(entry, "transcript_id")]
        if not transcript:
            continue

        head = transcript[0]
        gene_id = head.gene_id
        contig = head.contig
        strand = head.strand
        E.debug("Crunching gene: %s:" % gene_id)

        # exons
        exon_profile = count_transcript(transcript, bam)
        if exon_profile.sum() > 0:
            exon_converter = TranscriptCoordInterconverter(transcript)
            whole_transcript = LiteExon(0, exon_converter.length)
            yield (exon_profile, exon_converter, whole_transcript,
                   contig, strand)

        # introns
        intron_intervals = GTF.toIntronIntervals(transcript)
        intron_counts = count_intervals(bam, intron_intervals,
                                        contig, strand)
        if intron_counts.sum() == 0:
            continue

        intron_converter = TranscriptCoordInterconverter(transcript,
                                                         introns=True)
        # move the counts from genome to converted coordinates
        intron_counts.index = intron_converter.genome2transcript(
            intron_counts.index.values)

        for intron in intron_intervals:
            # convert the inclusive end points, order them, then restore
            # the +1 on the upper bound
            inclusive = (intron[0], intron[1] - 1)
            converted = sorted(intron_converter.genome2transcript(inclusive))
            lower = converted[0]
            upper = converted[1] + 1

            intron_profile = intron_counts.loc[float(lower):float(upper)]
            if intron_profile.sum() > 0:
                yield (intron_profile, intron_converter,
                       LiteExon(lower, upper), contig, strand)
def processing_index(interval_iterator, bam, window_size=50):
    r'''Calculate the ratio of processed transcripts to non-processed

    Parameters
    ----------
    interval_iterator : CGAT.Bed or CGAT.GTF-like iterator
        The iterator must yield objects that have start, end, contig and
        strand attributes. The processing site is `end` for "+" strand
        entries and `start` for "-" strand entries.
    bam : *_getter-like function
        A getter function returned by the `make_getter` function, this
        will be used to retrieve cross-link counts.
    window_size : int, optional
        How far up and downstream of the processing site to consider.

    Returns
    -------
    float
        Processing index pooled over all processing sites given. This is
        ``-inf`` when no downstream counts were found.

    Raises
    ------
    ValueError
        If an entry has a strand other than "+" or "-".

    Notes
    -----
    The processing index for G genes is defined as:

    .. math::

       pi = log_2( \frac{\sum_{i=1}^{G} N_i^{PM}}{\sum_{i=1}^{G} N_i^M})

    after Baejen et al Mol Cell 5(55):745-757. The numerator is the sum
    of downstream counts and the denominator the sum of
    (upstream - downstream) counts, floored at 1. However, Baejen et al
    normalise this number to the total number of genes, which seems
    wrong to me.
    '''
    n_pm = 0
    n_m = 0

    for site in interval_iterator:
        if site.strand == "+":
            pos = site.end
        elif site.strand == "-":
            pos = site.start
        else:
            raise ValueError(
                "processing index not valid for unstranded cleavage points in "
                "entry\n" + str(site) + "\n")

        upstream_interval = (pos - window_size, pos)
        downstream_interval = (pos, pos + window_size)

        counts = count_intervals(bam, [upstream_interval, downstream_interval],
                                 site.contig, site.strand)

        # We are in genome coordinates here, not transcript coordinates:
        # the counts are assumed to be indexed by genome position (other
        # code in this file slices count_intervals output by coordinate).
        # Positional .iloc slicing with a genome coordinate selected the
        # wrong rows, so split on the index labels instead.
        is_below = counts.index < pos
        below = counts[is_below].sum()
        at_or_above = counts[~is_below].sum()

        if site.strand == "+":
            n_up, n_down = below, at_or_above
        else:
            # on the minus strand, upstream is the higher coordinate side
            n_up, n_down = at_or_above, below

        n_pm += n_down
        n_m += n_up - n_down

    # the denominator is floored at 1 to avoid dividing by zero or by a
    # negative number
    pi = np.log2(float(n_pm) / float(max(1, n_m)))

    return pi
def processing_index(interval_iterator, bam, window_size=50):
    r'''Calculate the ratio of processed transcripts to non-processed

    Parameters
    ----------
    interval_iterator : CGAT.Bed or CGAT.GTF-like iterator
        The iterator must yield objects that have start, end, contig and
        strand attributes. The processing site is `end` for "+" strand
        entries and `start` for "-" strand entries.
    bam : *_getter-like function
        A getter function returned by the `make_getter` function, this
        will be used to retrieve cross-link counts.
    window_size : int, optional
        How far up and downstream of the processing site to consider.

    Returns
    -------
    float
        Processing index pooled over all processing sites given. This is
        ``-inf`` when no downstream counts were found.

    Raises
    ------
    ValueError
        If an entry has a strand other than "+" or "-".

    Notes
    -----
    The processing index for G genes is defined as:

    .. math::

       pi = log_2( \frac{\sum_{i=1}^{G} N_i^{PM}}{\sum_{i=1}^{G} N_i^M})

    after Baejen et al Mol Cell 5(55):745-757. The numerator is the sum
    of downstream counts and the denominator the sum of
    (upstream - downstream) counts, floored at 1. However, Baejen et al
    normalise this number to the total number of genes, which seems
    wrong to me.
    '''
    n_pm = 0
    n_m = 0

    for site in interval_iterator:
        strand = site.strand
        if strand == "+":
            pos = site.end
        elif strand == "-":
            pos = site.start
        else:
            raise ValueError(
                "processing index not valid for unstranded cleavage points in "
                "entry\n" + str(site)+"\n")

        upstream_interval = (pos - window_size, pos)
        downstream_interval = (pos, pos + window_size)

        counts = count_intervals(bam, [upstream_interval, downstream_interval],
                                 site.contig, strand)

        # We are in genome coordinates here, not transcript coordinates:
        # the counts are assumed to be indexed by genome position (other
        # code in this file slices count_intervals output by coordinate).
        # Positional .iloc slicing with a genome coordinate selected the
        # wrong rows, so split on the index labels instead.
        lower_side = counts[counts.index < pos].sum()
        upper_side = counts[counts.index >= pos].sum()

        if strand == "+":
            n_up, n_down = lower_side, upper_side
        else:
            # on the minus strand, upstream is the higher coordinate side
            n_up, n_down = upper_side, lower_side

        n_pm += n_down
        n_m += n_up - n_down

    # the denominator is floored at 1 to avoid dividing by zero or by a
    # negative number
    pi = np.log2(float(n_pm) / float(max(1, n_m)))

    return pi