def produce_transcript_base_compositions(gff_fn=None,
                                         genome_dir=None,
                                         composition_fn=None,
                                         left_buffer=500,
                                         right_buffer=500,
                                         windows=(5, 10, 20)):
    ''' Count As in sliding windows along every transcript and write the
    results to composition_fn.

    For each transcript returned by gff.get_CDSs and for each window size w
    in windows, a PositionCounts is built whose value at ('start', p) is the
    number of As in the transcript sequence at positions [p, p + w) relative
    to the 'start' landmark.

    Arguments (all optional; the defaults reproduce the values that used to
    be hard-coded in this function):
        gff_fn: annotation file passed to gff.get_CDSs.
        genome_dir: genome directory passed to gff.get_CDSs.
        composition_fn: output file name passed to
            Serialize.read_positions.write_file.
        left_buffer, right_buffer: buffer sizes used when fetching each
            sequence and when building every PositionCounts.
        windows: iterable of window sizes.

    Returns None; the result is a dictionary
    {transcript name: {window size: PositionCounts}} written to
    composition_fn.
    '''
    if gff_fn is None:
        gff_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gff'
    if genome_dir is None:
        genome_dir = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/'
    if composition_fn is None:
        composition_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5'

    CDSs = gff.get_CDSs(gff_fn, genome_dir)

    genes = {}

    for transcript in utilities.progress_bar(len(CDSs), CDSs):
        genes[transcript.name] = {}

        transcript.build_coordinate_maps()

        landmarks = {
            'start': 0,
            'start_codon': transcript.transcript_start_codon,
            'stop_codon': transcript.transcript_stop_codon,
            'end': transcript.transcript_length,
        }

        sequence = transcript.get_transcript_sequence(left_buffer, right_buffer)

        # Boolean track that is True wherever the sequence has an A.
        A_locations = positions.PositionCounts(
            landmarks,
            left_buffer,
            right_buffer,
            data=(sequence.data == 'A'),
        )

        for window in windows:
            recent_As = positions.PositionCounts(
                landmarks,
                left_buffer,
                right_buffer,
            )

            # NOTE(review): the upper bound uses CDS_length although positions
            # are relative to 'start' and the 'end' landmark is
            # transcript_length - confirm that this is intended.
            last_left_edge = transcript.CDS_length + right_buffer - window
            for left_edge in range(-left_buffer, last_left_edge):
                num_As = sum(A_locations['start', left_edge:left_edge + window])
                recent_As['start', left_edge] = num_As

            genes[transcript.name][window] = recent_As

        # Drop the coordinate maps once this transcript has been processed.
        transcript.delete_coordinate_maps()

    Serialize.read_positions.write_file(genes, composition_fn)
def distribute_analytically(self):
    ''' Build simulated codon counts for this piece's genes by drawing, for
    every codon position in the CDS, a binomial sample whose number of trials
    is the gene's total real count and whose success probability is that
    codon's share of the summed codon rates.

    The result is written with self.write_file under the name
    'simulated_codon_counts'.
    '''
    template_counts = self.template_experiment.read_file('buffered_codon_counts')

    # Only the genes assigned to this piece are processed.
    gene_names = Sequencing.Parallel.piece_of_list(
        sorted(template_counts),
        self.num_pieces,
        self.which_piece,
    )

    # From the start codon through one position past the stop codon landmark.
    cds_slice = slice('start_codon', ('stop_codon', 1))

    results = {}

    for gene_name in gene_names:
        gene_counts = template_counts[gene_name]
        identities = gene_counts['identities']

        codon_sequence = identities[cds_slice]
        num_trials = sum(gene_counts['relaxed'][cds_slice])

        # NOTE(review): codon_rates is not defined in this function;
        # presumably it is a module-level name - confirm.
        rates = np.array([codon_rates[codon] for codon in codon_sequence])
        fractions = rates / sum(rates)

        simulated = positions.PositionCounts(
            identities.landmarks,
            identities.left_buffer,
            identities.right_buffer,
        )

        # One independent draw per codon, in position order.
        position = 0
        for fraction in fractions:
            simulated['start_codon', position] = np.random.binomial(num_trials, fraction)
            position += 1

        results[gene_name] = {
            'identities': identities,
            'relaxed': simulated,
        }

    self.write_file('simulated_codon_counts', results)
def simulate(self):
    ''' Generate simulated codon counts for the genes assigned to this piece
    and write them with self.write_file as 'simulated_codon_counts'.

    For each gene, Message objects are created and evolved one after another,
    and their measurements accumulated, until the number of accumulated
    measurements reaches the (rounded-up) total of the real 'relaxed' counts
    over the gene's CDS.

    self.perturbation_model selects how each message is treated after it
    reaches steady state:
        None: message.introduce_CHX() is called.
        anything else: message.evolve_perturbed_CHX_model is called with the
            model; for 'change_all', perturbed codon means are additionally
            loaded from self.new_rates_experiment.
    '''
    buffered_codon_counts = self.template_experiment.read_file('buffered_codon_counts')

    codon_means = self.load_codon_means(self.template_experiment)
    if self.perturbation_model == 'change_all':
        perturbed_codon_means = self.load_codon_means(self.new_rates_experiment)
    else:
        perturbed_codon_means = None

    TEs = self.load_TEs()
    initiation_means = {gene_name: self.initiation_mean_numerator / TEs[gene_name]
                        for gene_name in buffered_codon_counts}

    all_gene_names = sorted(buffered_codon_counts)
    piece_gene_names = Sequencing.Parallel.piece_of_list(all_gene_names,
                                                         self.num_pieces,
                                                         self.which_piece,
                                                        )

    simulated_codon_counts = {}

    # From the start codon through one position past the stop codon landmark.
    cds_slice = slice('start_codon', ('stop_codon', 1))

    for i, gene_name in enumerate(piece_gene_names):
        logging.info('Starting {0} ({1:,} / {2:,})'.format(gene_name, i, len(piece_gene_names) - 1))

        identities = buffered_codon_counts[gene_name]['identities']
        codon_sequence = identities[cds_slice]
        real_counts = buffered_codon_counts[gene_name]['relaxed'][cds_slice]
        total_real_counts = sum(real_counts)

        # Round up so that a fractional total still requires a whole count.
        target = int(np.ceil(total_real_counts))

        all_measurements = Counter()
        num_messages = 0
        # Running total of all_measurements, refreshed once per message
        # instead of being re-summed for the loop test and for each log call.
        num_counts = 0

        while num_counts < target:
            message = Message(codon_sequence,
                              initiation_means[gene_name],
                              codon_means,
                              self.CHX_mean,
                              perturbed_codon_means=perturbed_codon_means)
            message.evolve_to_steady_state()

            if self.perturbation_model is None:
                message.introduce_CHX()
            else:
                message.evolve_perturbed_CHX_model(self.perturbation_model)

            all_measurements.update(message.collect_measurements())
            num_counts = sum(all_measurements.values())
            num_messages += 1

            if num_messages % 10000 == 0:
                logging.info('{0:,} counts generated for {1} from {2:,} messages (target = {3})'.format(num_counts, gene_name, num_messages, target))

        simulated_counts = positions.PositionCounts(identities.landmarks,
                                                    identities.left_buffer,
                                                    identities.right_buffer,
                                                   )

        # Measurement keys are used as positions relative to the start codon.
        for key, value in all_measurements.items():
            simulated_counts['start_codon', key] = value

        simulated_codon_counts[gene_name] = {'identities': identities,
                                             'relaxed': simulated_counts,
                                            }

        logging.info('{0:,} counts generated for {1} from {2:,} messages'.format(num_counts, gene_name, num_messages))

    self.write_file('simulated_codon_counts', simulated_codon_counts)
def get_transcript_sequence(self, left_buffer=0, right_buffer=0):
    ''' Get the sequence of the mature transcript.

    Returns a PositionCounts holding one character per transcript position
    from -left_buffer up to (but not including)
    transcript_length + right_buffer, with landmarks 'start', 'start_codon',
    'stop_codon' and 'end'.
    '''
    # Remake coordinate maps to guarantee buffer sizes
    self.build_coordinate_maps(left_buffer, right_buffer)

    first_position = -left_buffer
    past_last_position = self.transcript_length + right_buffer

    # Translate every transcript position to its genomic coordinate first,
    # then fetch the bases one at a time.
    genomic_positions = []
    for transcript_position in range(first_position, past_last_position):
        genomic_positions.append(self.transcript_to_genomic[transcript_position])

    bases = []
    for genomic_position in genomic_positions:
        base = self.region_fetcher(self.seqname, genomic_position, genomic_position + 1)
        bases.append(base)

    sequence = ''.join(bases).upper()

    # Bases were collected in transcript order, so minus-strand sequence is
    # only complemented here (no reversal is applied).
    if self.strand == '-':
        sequence = utilities.complement(sequence)

    characters = np.asarray(sequence, dtype='c')

    landmarks = {
        'start': 0,
        'start_codon': self.transcript_start_codon,
        'stop_codon': self.transcript_stop_codon,
        'end': self.transcript_length,
    }

    return positions.PositionCounts(
        landmarks,
        left_buffer,
        right_buffer,
        data=characters,
    )
def get_extent_sequence(self, left_buffer=0, right_buffer=0):
    ''' Get the sequence of the extent, i.e. the genomic region from the
    smallest to the largest key of self.genomic_to_extent (inclusive).
    Useful for looking at genes with annotated frameshifts.

    Returns a PositionCounts with landmarks 'start' and 'end'; left_buffer
    and right_buffer are passed through to it unchanged.
    '''
    region_start = min(self.genomic_to_extent)
    region_end = max(self.genomic_to_extent) + 1  # end is exclusive

    fetched = self.region_fetcher(self.seqname, region_start, region_end)

    # The region is fetched as one contiguous piece, so minus-strand
    # sequence is reverse-complemented.
    if self.strand == '-':
        oriented = utilities.reverse_complement(fetched)
    else:
        oriented = fetched

    characters = np.asarray(oriented, dtype='c')

    extent_landmarks = {
        'start': 0,
        'end': self.extent_length,
    }

    extent_sequence = positions.PositionCounts(
        extent_landmarks,
        left_buffer,
        right_buffer,
        data=characters,
    )
    return extent_sequence
def record_uniqueness(self):
    ''' Flag every transcript position as uniquely or non-uniquely mappable
    for fragments of length self.fragment_length, based on the reads in the
    'accepted_hits' BAM file, and write the result with self.write_file as
    'uniqueness'.

    Each read's name is decoded with
    artifical_annotation.from_prefix_identifier to recover the transcript
    and position it came from. Values stored per position (relative to
    'start_codon') are:
        0: no read recorded for this position,
        1: a read from this position had MAPQ >= 50,
        2: non-unique (a read from here, or mapped to here, had MAPQ < 50).

    Raises ValueError if a read with MAPQ exactly 50 is mapped somewhere
    other than its true position.
    '''
    CDSs, _ = self.get_CDSs()

    uniqueness = {}
    transcripts = {}

    # For any genomic position that participates in a transcript, this will
    # contain a mapping to a set of all transcripts it participates in.
    genomic_to_all_transcripts = defaultdict(set)

    for transcript in CDSs:
        landmarks = {
            'start': 0,
            'start_codon': transcript.transcript_start_codon,
            'stop_codon': transcript.transcript_stop_codon,
            'end': transcript.transcript_length,
        }
        uniqueness[transcript.name] = {
            self.fragment_length: positions.PositionCounts(landmarks,
                                                           self.common_buffer,
                                                           self.common_buffer)
        }

        transcript.build_coordinate_maps(left_buffer=self.common_buffer,
                                         right_buffer=self.common_buffer)
        transcripts[transcript.name] = transcript

        for genomic_position, transcript_position in transcript.genomic_to_transcript.iteritems():
            full_position = (transcript.seqname, transcript.strand, genomic_position)
            genomic_to_all_transcripts[full_position].add(
                (transcript.name, transcript_position))

    bam_file = pysam.Samfile(self.file_names['accepted_hits'])
    # Close the BAM file even if the consistency check below raises.
    try:
        for read in bam_file:
            # If this read was incorrectly trimmed, don't record it.
            if read.qlen != self.fragment_length:
                continue

            annotation = artifical_annotation.from_prefix_identifier(read.qname)
            true_transcript = transcripts[annotation['transcript_name']]
            true_position = annotation['position']

            # The 5' end is the leftmost aligned base for forward reads and
            # the rightmost aligned base for reverse reads.
            strand = '-' if read.is_reverse else '+'
            if strand == '+':
                five_prime = read.pos
            else:
                five_prime = read.aend - 1

            full_mapped_position = (bam_file.getrname(read.tid), strand, five_prime)

            true_flags = uniqueness[true_transcript.name][self.fragment_length]

            if read.mapq < 50:
                # Flag the true source of the read as nonunique.
                true_flags['start_codon', true_position] = 2

                # Hopefully redundantly, flag the position actually mapped to
                # as nonunique. Use .get so that mapped positions outside any
                # transcript don't add empty entries to the defaultdict.
                mapped_to = genomic_to_all_transcripts.get(full_mapped_position, ())
                for transcript_name, transcript_position in mapped_to:
                    uniqueness[transcript_name][self.fragment_length][
                        'start_codon', transcript_position] = 2
            else:
                # Check that any read with a MAPQ of 50 is to the expected
                # position.
                full_true_position = (
                    true_transcript.seqname,
                    true_transcript.strand,
                    true_transcript.transcript_to_genomic[true_position],
                )
                if read.mapq == 50 and (full_mapped_position != full_true_position):
                    raise ValueError(full_mapped_position, full_true_position)

                # As long as this hasn't been mapped to by some other
                # fragment, mark it as unique.
                if true_flags['start_codon', true_position] == 0:
                    true_flags['start_codon', true_position] = 1
    finally:
        bam_file.close()

    self.write_file('uniqueness', uniqueness)
def plot_mRNA_metagene_unaveraged(from_end, min_length, max_length):
    ''' Plot summed (not per-gene-averaged) read counts around a landmark for
    each entry of mRNA_experiments, together with the counts expected if each
    gene's reads were spread uniformly over the same window.

    Arguments:
        from_end: if true, counts are read from each experiment's
            'three_prime_read_positions' file (using the entry's key) and the
            x axis runs from -plot_to to edge_buffer; otherwise counts are
            read from 'read_positions' with key 'all' and the x axis runs
            from -edge_buffer to plot_to.
        min_length, max_length: only genes whose counts.CDS_length lies in
            [min_length, max_length] are included.

    Prints per-experiment totals and the ten genes with the most counts in
    the edge_buffer positions before the landmark. Returns None.
    '''
    bmap = brewer2mpl.get_map('Set1', 'qualitative', 9)
    # Skip the sixth color of the palette.
    colors = cycle(bmap.mpl_colors[:5] + bmap.mpl_colors[6:])

    experiments = select_work.build_all_experiments(verbose=False)

    # Entries are (name, landmark, key, experiment). Uncomment entries to
    # include them in the plot.
    mRNA_experiments = [
        #('WT_mRNA_1', 'polyA', 0, experiments['belgium_2014_12_10']['WT_1_mRNA']),
        #('WT_mRNA_1', 'polyA', 'nonzero', experiments['belgium_2014_12_10']['WT_1_mRNA']),
        #('WT_mRNA_1', 'stop_codon', 0, experiments['belgium_2014_12_10']['WT_1_mRNA']),
        #('WT_mRNA_1', 'stop_codon', 'nonzero', experiments['belgium_2014_12_10']['WT_1_mRNA']),
        #('WT_mRNA_1', 'start', 'all', experiments['belgium_2014_12_10']['WT_1_mRNA']),
        #('WT_mRNA_1', 'start_codon', 'all', experiments['belgium_2014_12_10']['WT_1_mRNA']),
        #('WT_cDNA_mRNA', 'cap', 'all', experiments['belgium_2013_08_06']['WT_cDNA_mRNA']),
        #('WT_cDNA_mRNA', 'start_codon', 'all', experiments['belgium_2013_08_06']['WT_cDNA_mRNA']),
        #('R98S_1_mRNA', 'cap', 'all', experiments['belgium_2014_12_10']['R98S_1_mRNA']),
        #('R98S_1_mRNA', 'start_codon', 'all', experiments['belgium_2014_12_10']['R98S_1_mRNA']),
        ##('WT_mRNA_1 3\'', experiments['belgium_2014_12_10']['WT_1_mRNA']),
        ##('WT_mRNA_2 3\'', experiments['belgium_2014_12_10']['WT_2_mRNA']),
        ##('WT_cDNA_mRNA 3\'', experiments['belgium_2013_08_06']['WT_cDNA_mRNA']),
        #('RiboZero', 'polyA', 0, experiments['weinberg']['RiboZero']),
        #('RiboZero', 'polyA', 'nonzero', experiments['weinberg']['RiboZero']),
        ('RiboZero', 'start', 'all', experiments['weinberg']['RiboZero']),
        ('RiboZero', 'start_codon', 'all', experiments['weinberg']['RiboZero']),
        ##('RiboZero', 'stop_codon', 0, experiments['weinberg']['RiboZero']),
        ##('RiboZero', 'stop_codon', 'nonzero', experiments['weinberg']['RiboZero']),
        #('Dynabeads', 'polyA', 0, experiments['weinberg']['Dynabeads']),
        #('Dynabeads', 'polyA', 'nonzero', experiments['weinberg']['Dynabeads']),
        #('Dynabeads', 'cap', 'all', experiments['weinberg']['Dynabeads']),
        #('Dynabeads', 'start_codon', 'all', experiments['weinberg']['Dynabeads']),
        ##('Dynabeads', 'stop_codon', 0, experiments['weinberg']['Dynabeads']),
        ##('Dynabeads', 'stop_codon', 'nonzero', experiments['weinberg']['Dynabeads']),
    ]

    plot_to = 500

    fig_cumulative, ax_cumulative = plt.subplots()

    edge_buffer = 200

    if from_end:
        xs = np.arange(-plot_to, edge_buffer)
    else:
        xs = np.arange(-edge_buffer, plot_to)

    # Counts per gene in the edge_buffer positions before the landmark.
    # Shared across experiments, so later experiments overwrite earlier ones.
    unexpected_counts = {}

    for (name, landmark, key, experiment), color in zip(mRNA_experiments, colors):
        # Formatted print calls give the same output as the Python 2 print
        # statement and are also valid on Python 3.
        print('{0} {1} {2}'.format(name, landmark, key))

        if from_end:
            counts_generator = counts_from_read_positions_fn(
                experiment.file_names['three_prime_read_positions'],
                key=key)
        else:
            counts_generator = counts_from_read_positions_fn(
                experiment.file_names['read_positions'],
                key='all')

        # Accumulators shared by all genes; the far-apart landmarks leave
        # room for long genes on either side.
        landmarks = {
            'start': 0,
            'start_codon': 0,
            'stop_codon': 90000,
            'end': 90000
        }
        expected_counts = positions.PositionCounts(landmarks, 400, 400, dtype=float)
        actual_counts = positions.PositionCounts(landmarks, 400, 400, dtype=float)

        for gene_name, counts in counts_generator:
            if not min_length <= counts.CDS_length <= max_length:
                continue

            num_positions = counts.CDS_length + edge_buffer

            if from_end:
                edge_slice = (landmark, slice(-counts.CDS_length, edge_buffer))
            else:
                edge_slice = (landmark, slice(-edge_buffer, counts.CDS_length))

            unexpected_slice = (landmark, slice(-edge_buffer, 0))

            # Total reads for this gene in the window, spread evenly over it
            # to form the expected profile.
            r_g = counts[edge_slice].sum()
            uniform_counts = np.ones(num_positions) * r_g / num_positions

            actual_counts[edge_slice] += counts[edge_slice]
            expected_counts[edge_slice] += uniform_counts

            unexpected_counts[gene_name] = counts[unexpected_slice].sum()

        print(actual_counts.sum())
        print(expected_counts.sum())

        most_unexpected = sorted(unexpected_counts, key=unexpected_counts.get, reverse=True)
        for n in most_unexpected[:10]:
            print('{0} {1}'.format(n, unexpected_counts[n]))

        if from_end:
            plot_slice = (landmark, slice(-plot_to, edge_buffer))
        else:
            plot_slice = ('start_codon', slice(-edge_buffer, plot_to))

        ax_cumulative.plot(xs, expected_counts[plot_slice], '--', color=color)
        ax_cumulative.plot(xs,
                           actual_counts[plot_slice],
                           'o-',
                           color=color,
                           markersize=2,
                           markeredgewidth=0,
                           label='{0}, {1}, {2}, actual'.format(name, landmark, key))

    ax_cumulative.legend(loc='upper left', framealpha=0.5)

    if from_end:
        # Uses the landmark of the last experiment plotted.
        xlabel = 'Position relative to {0}'.format(landmark)
    else:
        xlabel = 'Position relative to start of CDS'

    ax_cumulative.set_xlabel(xlabel)
    ax_cumulative.set_xlim(min(xs), max(xs))
    ax_cumulative.set_ylabel('Mapped read counts, normalized across data sets')

    fig_cumulative.set_size_inches(18, 12)
if __name__ == '__main__': gff_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gff' genome_dir = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/' composition_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5' CDSs = gff.get_CDSs(gff_fn, genome_dir) import select_work exps = select_work.build_all_experiments(verbose=False) reads_fn = exps['belgium_2014_12_10']['WT_1_mRNA'].file_names[ 'three_prime_read_positions'] reads_fh = h5py.File(reads_fn, 'r') meta_counts = positions.PositionCounts({'A': 0}, left_buffer=100000, right_buffer=100000) f = h5py.File(composition_fn, 'r') for t in utilities.progress_bar(len(CDSs), CDSs): if t.name not in reads_fh: continue gene = Serialize.read_positions.build_gene(f[t.name]) t.build_coordinate_maps() if t.transcript_length < 301: continue end = t.transcript_length - 200 sl = ('start', np.arange(100, end)) A_rich_position = gene[10].argmax_over_slice(*sl) if gene[10]['start', A_rich_position] > 9: