def produce_bowtie2_alignments(reads,
                               index_prefix,
                               genome_dir,
                               score_min,
                              ):
    bowtie2_options = {'local': True,
                       #'report_all': True,
                       'report_up_to': 10,
                       'seed_mismatches': 1,
                       'seed_interval_function': 'C,1,0',
                       'seed_length': 10,
                      }

    sam_file, mappings = mapping_tools.map_bowtie2(index_prefix,
                                                   reads=reads,
                                                   custom_binary=True,
                                                   score_min=score_min,
                                                   yield_mappings=True,
                                                   **bowtie2_options)

    region_fetcher = genomes.build_region_fetcher(genome_dir,
                                                  load_references=True,
                                                 )

    mapping_groups = utilities.group_by(mappings, lambda m: m.qname)

    for qname, group in mapping_groups:
        group = sorted(group, key=lambda m: (m.tid, m.pos))
        alignments = [mapping_to_alignment(mapping, sam_file, region_fetcher)
                      for mapping in group
                      if not mapping.is_unmapped]
        yield qname, alignments

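# A minimal sketch of driving produce_bowtie2_alignments, for illustration
# only. The index and genome paths are hypothetical placeholders, 'C,20,0' is
# just one plausible bowtie2 score-min function string, and `reads` is assumed
# to be whatever iterable map_bowtie2 accepts elsewhere in this codebase.
def _example_produce_bowtie2_alignments(reads):
    alignment_groups = produce_bowtie2_alignments(reads,
                                                  '/path/to/index_prefix',
                                                  '/path/to/genome_dir',
                                                  score_min='C,20,0',
                                                 )
    for qname, alignments in alignment_groups:
        print qname, len(alignments)
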
def produce_bowtie2_alignments_old(reads,
                                   sam_fn,
                                   index_prefix,
                                   genome_dir,
                                   score_min,
                                  ):
    bowtie2_options = {'local': True,
                       #'report_all': True,
                       'report_up_to': 10,
                       'seed_mismatches': 1,
                       'seed_interval_function': 'C,1,0',
                       'seed_length': 10,
                       #'threads': 12,
                      }

    mapping_tools.map_bowtie2(index_prefix,
                              None,
                              None,
                              sam_fn,
                              unpaired_Reads=reads,
                              custom_binary=True,
                              score_min=score_min,
                              **bowtie2_options)

    sam_file = pysam.Samfile(sam_fn)

    region_fetcher = genomes.build_region_fetcher(genome_dir,
                                                  load_references=True,
                                                 )

    mapping_groups = utilities.group_by(sam_file, lambda m: m.qname)

    for qname, group in mapping_groups:
        alignments = [mapping_to_alignment(mapping, sam_file, region_fetcher)
                      for mapping in group
                      if not mapping.is_unmapped]
        yield qname, alignments

def filter_mappings(self):
    num_unmapped = 0
    num_entirely_genomic = 0
    num_nonunique = 0
    num_unique = 0
    nongenomic_lengths = Counter()

    sam_file = pysam.Samfile(self.file_names['accepted_hits'])

    region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                  load_references=True,
                                                  sam_file=sam_file,
                                                 )

    extended_sorter = sam.AlignmentSorter(sam_file.references,
                                          sam_file.lengths,
                                          self.file_names['extended'],
                                         )
    filtered_sorter = sam.AlignmentSorter(sam_file.references,
                                          sam_file.lengths,
                                          self.file_names['extended_filtered'],
                                         )

    extended_mappings = (trim.extend_polyA_end(mapping, region_fetcher)
                         for mapping in sam_file)
    mapping_groups = utilities.group_by(extended_mappings, lambda m: m.qname)

    with extended_sorter, filtered_sorter:
        for qname, group in mapping_groups:
            for m in group:
                extended_sorter.write(m)

            min_nongenomic_length = min(trim.get_nongenomic_length(m) for m in group)
            nongenomic_lengths[min_nongenomic_length] += 1
            if min_nongenomic_length == 0:
                num_entirely_genomic += 1
                continue

            nonunique = len(group) > 1 or any(m.mapq < 40 for m in group)
            if nonunique:
                num_nonunique += 1
                continue

            num_unique += 1
            for m in group:
                filtered_sorter.write(m)

    self.summary.extend(
        [('Mapped with no non-genomic A\'s', num_entirely_genomic),
         ('Nonunique', num_nonunique),
         ('Unique', num_unique),
        ],
    )

    nongenomic_lengths = utilities.counts_to_array(nongenomic_lengths)
    self.write_file('nongenomic_lengths', nongenomic_lengths)

def get_transcripts(all_features, genome_dir):
    region_fetcher = genomes.build_region_fetcher(genome_dir, load_references=True)

    feature_lists = defaultdict(list)
    for feature in all_features:
        transcript_name = feature.attribute['transcript_id']
        feature_lists[transcript_name].append(feature)

    transcripts = [Transcript(name, features, None, region_fetcher)
                   for name, features in feature_lists.iteritems()]

    return transcripts

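# A minimal usage sketch for get_transcripts; it assumes `all_features` has
# already been parsed from a GTF file elsewhere in this codebase, and the
# genome directory path is a hypothetical placeholder.
def _example_get_transcripts(all_features):
    transcripts = get_transcripts(all_features, '/path/to/genome_dir')
    for transcript in transcripts:
        print transcript.name
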
def get_gff_transcripts(all_features, genome_dir):
    region_fetcher = genomes.build_region_fetcher(genome_dir, load_references=True)

    genes = []
    for feature in all_features:
        top_level = feature.parent is None
        dubious = feature.attribute.get('orf_classification') == 'Dubious'
        has_exon = any('exon' in c.feature for c in feature.descendants)
        if top_level and has_exon and not dubious:
            gene = GFFTranscript(feature, region_fetcher)
            genes.append(gene)

    return genes

def call_3p_peaks():
    gtf_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf'
    genome_dir = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/'
    composition_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5'
    output_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_3p_lengths.txt'

    region_fetcher = genomes.build_region_fetcher(genome_dir)

    CDSs = gtf.get_CDSs(gtf_fn)
    CDS_dict = {t.name: t for t in CDSs}

    experiments = build_all_experiments(verbose=False)
    three_prime_experiments = \
        [(n, e) for n, e in sorted(experiments['three_p_seq']['three_p_seq'].items())] + \
        [(n, e) for n, e in sorted(experiments['three_t_fill_seq']['wilkening_nar'].items())
         if '3tfill_ypd_rep1' in n] + \
        [(n, e) for n, e in sorted(experiments['TIF_seq']['pelechano_nature'].items())
         if n == 'ypd_bio1_lib1' or n == 'ypd_bio1_lib4']

    argmaxes = {}
    fractions = {}
    joints = {}
    for name, experiment in three_prime_experiments:
        print name
        argmaxes[name] = {}
        fractions[name] = []
        joints[name] = []

        fn = experiment.file_names['three_prime_read_positions']
        f = h5py.File(fn, 'r')
        for transcript in utilities.progress_bar(len(CDSs), CDSs):
            if transcript.name not in f:
                continue
            gene = Serialize.read_positions.build_gene(f[transcript.name])
            xs = np.arange(0, 400)
            argmax = gene['all'].argmax_over_slice('stop_codon', xs)
            argmaxes[name][transcript.name] = argmax
            most = gene['all']['stop_codon', argmax]
            total = gene['all']['stop_codon', xs].sum()
            if total > 9:
                fraction = np.true_divide(most, total)
                fractions[name].append(fraction)
                joints[name].append((argmax, fraction))

    with open(output_fn, 'w') as output_fh:
        name_order = sorted(argmaxes['Cerevisiae_3Pseq'], key=argmaxes['Cerevisiae_3Pseq'].get)
        for name in name_order:
            output_fh.write('{0}\t'.format(str(CDS_dict[name])))
            for exp_name, _ in three_prime_experiments:
                output_fh.write('{0}\t'.format(argmaxes[exp_name][name]))
            output_fh.write('\n')

def get_reads(self):
    CDSs, _ = self.get_CDSs()
    region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                  load_references=True,
                                                 )
    for transcript in CDSs:
        reads = make_artificial_reads(transcript,
                                      self.fragment_length,
                                      self.max_read_length,
                                      self.adapter_sequence,
                                      region_fetcher,
                                      self.common_buffer,
                                     )
        for read in reads:
            yield read

def process_full_length_mappings(self):
    clean_bam = pysam.Samfile(self.file_names['clean_bam'])

    type_shape = (self.max_read_length + 1,
                  self.max_read_length,
                  fastq.MAX_EXPECTED_QUAL + 1,
                  6,
                  6,
                 )
    type_counts = np.zeros(type_shape, int)
    # To avoid counting mismatches in non-unique mappings multiple times,
    # a dummy secondary_type_counts array is passed to
    # trim_mismatches_from_start for secondary mappings.
    secondary_type_counts = np.zeros(type_shape, int)

    clean_trimmed_length_counts = Counter()

    region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                  load_references=True,
                                                  sam_file=clean_bam,
                                                 )

    for mapping in clean_bam:
        if mapping.is_secondary:
            counts_array = secondary_type_counts
        else:
            counts_array = type_counts

        trimmed_from_start = trim.trim_mismatches_from_start(mapping,
                                                             region_fetcher,
                                                             counts_array,
                                                            )
        trimmed_from_end = trim.trim_nongenomic_polyA_from_end(trimmed_from_start,
                                                               region_fetcher,
                                                              )
        if not trimmed_from_end.is_unmapped and not trimmed_from_end.is_secondary:
            clean_trimmed_length_counts[trimmed_from_end.qlen] += 1

        yield trimmed_from_end

    self.write_file('mismatches', type_counts)

    clean_trimmed_lengths = self.zero_padded_array(clean_trimmed_length_counts)
    self.write_file('lengths', {'clean_trimmed': clean_trimmed_lengths})

def extend_polyA_ends(bam_fn, extended_bam_fn, genome_dir, trimmed_twice=False):
    bam_file = pysam.Samfile(bam_fn)
    region_fetcher = genomes.build_region_fetcher(genome_dir,
                                                  load_references=True,
                                                  sam_file=bam_file,
                                                 )
    # Adding bases to the end of minus strand mappings produces a file
    # that is not necessarily sorted, so re-sort.
    alignment_sorter = sam.AlignmentSorter(bam_file.references,
                                           bam_file.lengths,
                                           extended_bam_fn,
                                          )
    with alignment_sorter:
        for mapping in bam_file:
            extended_mapping = extend_polyA_end(mapping, region_fetcher, trimmed_twice)
            alignment_sorter.write(extended_mapping)

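# A minimal sketch of invoking extend_polyA_ends; the BAM file names below
# are hypothetical placeholders, not files produced elsewhere in this module.
def _example_extend_polyA_ends():
    extend_polyA_ends('accepted_hits.bam',
                      'accepted_hits_extended.bam',
                      '/path/to/genome_dir',
                      trimmed_twice=False,
                     )
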
def compute_yield(self):
    lengths = self.read_file('lengths')
    reads = {name: lengths[name].sum() for name in lengths}
    reads['total'] = reads['trimmed'] + reads['too_short']

    reads['dominant'], overlapping_reads, boundaries = \
        contaminants.identify_dominant_stretches(self.read_file('rRNA_coverage'),
                                                 reads['total'],
                                                 self.max_read_length,
                                                 self.merged_file_names['rRNA_bam'],
                                                )
    contaminants.plot_dominant_stretch_lengths(boundaries,
                                               self.figure_file_names['dominant_stretch_lengths'],
                                              )
    reads['other'] = reads['rRNA'] - reads['dominant']

    region_fetcher = genomes.build_region_fetcher(self.file_names['rRNA_fasta_dir'])
    with open(self.merged_file_names['dominant_stretches'], 'w') as dominant_stretches_file:
        for rname in sorted(boundaries):
            for start, stop in sorted(boundaries[rname]):
                sequence = region_fetcher(rname, start, stop)
                fraction = overlapping_reads[rname, start, stop] / float(reads['total'])
                dominant_stretches_file.write('{0}: {1:,}-{2:,}\t{3:6.1%}\t{4}\n'.format(rname, start, stop, fraction, sequence))

    with open(self.file_names['yield'], 'w') as yield_file:
        yield_file.write('Total reads: {0:,}\n'.format(reads['total']))
        for category, count in [('Long enough reads', reads['trimmed']),
                                ('phiX reads', reads['phiX']),
                                ('rRNA reads', reads['rRNA']),
                                ('(rRNA reads from non-dominant stretches)', reads['other']),
                                ('tRNA reads', reads['tRNA']),
                                ('Other ncRNA reads', reads['other_ncRNA']),
                                ('Clean reads', reads['clean']),
                                ('Reads mapped after polyA trimming', reads['remapped']),
                                ('Reads that start with long polyA', reads['long_polyA']),
                                ('Synthetic reads', reads['synthetic']),
                                ('Unaccounted-for reads', reads['unmapped']),
                               ]:
            fraction = float(count) / reads['total']
            line = '{0}: {1:,} ({2:.2%})\n'.format(category,
                                                   count,
                                                   fraction,
                                                  )
            yield_file.write(line)

def call_5p_peaks():
    gtf_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf'
    genome_dir = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/'
    region_fetcher = genomes.build_region_fetcher(genome_dir)

    CDSs = gtf.get_CDSs(gtf_fn)

    experiments = build_all_experiments(verbose=False)
    five_prime_experiments = \
        [(n, e) for n, e in sorted(experiments['TL_seq']['arribere_gr'].items())
         if 'TLSeq1' in n] + \
        [(n, e) for n, e in sorted(experiments['TL_seq']['park_nar'].items())
         if n == 'SMORE-seq_WT_TAP+_rep1'] + \
        [(n, e) for n, e in sorted(experiments['TIF_seq']['pelechano_nature'].items())
         if n == 'ypd_bio1_lib1' or n == 'ypd_bio1_lib4']

    argmaxes = {}
    fractions = {}
    joints = {}
    for name, experiment in five_prime_experiments:
        print name
        argmaxes[name] = Counter()
        fractions[name] = []
        joints[name] = []

        fn = experiment.file_names['five_prime_read_positions']
        f = h5py.File(fn, 'r')
        for transcript in utilities.progress_bar(len(CDSs), CDSs):
            if transcript.name not in f:
                continue
            gene = Serialize.read_positions.build_gene(f[transcript.name])
            xs = np.arange(-300, 0)
            argmax = gene['all'].argmax_over_slice('start_codon', xs)
            argmaxes[name][argmax] += 1
            most = gene['all']['start_codon', argmax]
            total = gene['all']['start_codon', xs].sum()
            if total == 0:
                print transcript
            if total > 9:
                fraction = np.true_divide(most, total)
                fractions[name].append(fraction)
                joints[name].append((argmax, fraction))

def process_remapped(self):
    clean_bam = pysam.Samfile(self.file_names['remapped_accepted_hits'])

    type_shape = (self.max_read_length + 1,
                  self.max_read_length,
                  fastq.MAX_EXPECTED_QUAL + 1,
                  6,
                  6,
                 )
    type_counts = np.zeros(type_shape, int)

    remapped_length_counts = Counter()

    region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                  load_references=True,
                                                  sam_file=clean_bam,
                                                 )

    for mapping in clean_bam:
        trimmed_from_start = trim.trim_mismatches_from_start(mapping,
                                                             region_fetcher,
                                                             type_counts,
                                                            )
        # Add back any genomic A's that were trimmed as part of mappings and
        # any remaining A's from the first non-genomic onward as soft clipped
        # bases for visualization in IGV.
        extended = trim.extend_polyA_end(trimmed_from_start,
                                         region_fetcher,
                                         trimmed_twice=True,
                                        )

        if not extended.is_unmapped and not extended.is_secondary:
            remapped_length_counts[extended.qlen] += 1

        yield extended

    remapped_lengths = self.zero_padded_array(remapped_length_counts)
    self.write_file('lengths', {'remapped': remapped_lengths})

def combine_mappings(self):
    num_unmapped = 0
    num_five_unmapped = 0
    num_three_unmapped = 0
    num_nonunique = 0
    num_discordant = 0
    num_concordant = 0

    five_prime_mappings = pysam.Samfile(self.file_names['five_prime_accepted_hits'])
    five_prime_unmapped = pysam.Samfile(self.file_names['five_prime_unmapped'])
    all_five_prime = sam.merge_by_name(five_prime_mappings, five_prime_unmapped)
    five_prime_grouped = utilities.group_by(all_five_prime, lambda m: m.qname)

    three_prime_mappings = pysam.Samfile(self.file_names['three_prime_accepted_hits'])
    three_prime_unmapped = pysam.Samfile(self.file_names['three_prime_unmapped'])
    all_three_prime = sam.merge_by_name(three_prime_mappings, three_prime_unmapped)
    three_prime_grouped = utilities.group_by(all_three_prime, lambda m: m.qname)

    group_pairs = izip(five_prime_grouped, three_prime_grouped)

    alignment_sorter = sam.AlignmentSorter(five_prime_mappings.references,
                                           five_prime_mappings.lengths,
                                           self.file_names['combined_extended'],
                                          )
    region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                  load_references=True,
                                                  sam_file=five_prime_mappings,
                                                 )

    with alignment_sorter:
        for (five_qname, five_group), (three_qname, three_group) in group_pairs:
            five_annotation = trim.PayloadAnnotation.from_identifier(five_qname)
            three_annotation = trim.PayloadAnnotation.from_identifier(three_qname)
            if five_annotation['original_name'] != three_annotation['original_name']:
                # Ensure that the iteration through pairs is in sync.
                print five_qname, three_qname
                raise ValueError

            five_unmapped = any(m.is_unmapped for m in five_group)
            three_unmapped = any(m.is_unmapped for m in three_group)
            if five_unmapped:
                num_five_unmapped += 1
            if three_unmapped:
                num_three_unmapped += 1
            if five_unmapped or three_unmapped:
                num_unmapped += 1
                continue

            five_nonunique = len(five_group) > 1 or any(m.mapq < 40 for m in five_group)
            three_nonunique = len(three_group) > 1 or any(m.mapq < 40 for m in three_group)
            if five_nonunique or three_nonunique:
                num_nonunique += 1
                continue

            five_m = five_group.pop()
            three_m = three_group.pop()

            five_strand = '-' if five_m.is_reverse else '+'
            three_strand = '-' if three_m.is_reverse else '+'

            tlen = max(five_m.aend, three_m.aend) - min(five_m.pos, three_m.pos)
            discordant = (five_m.tid != three_m.tid) or \
                         (five_strand != three_strand) or \
                         (tlen > 10000)
            if discordant:
                num_discordant += 1
                continue

            if five_strand == '+':
                first_read = five_m
                second_read = three_m
            elif five_strand == '-':
                first_read = three_m
                second_read = five_m

            gap = second_read.pos - first_read.aend
            if gap < 0:
                num_discordant += 1
                continue

            combined_read = pysam.AlignedRead()
            # qname needs to come from three_m to include trimmed As.
            combined_read.qname = three_m.qname
            combined_read.tid = five_m.tid
            combined_read.seq = first_read.seq + second_read.seq
            combined_read.qual = first_read.qual + second_read.qual
            combined_read.cigar = first_read.cigar + [(3, gap)] + second_read.cigar
            combined_read.pos = first_read.pos
            combined_read.is_reverse = first_read.is_reverse
            combined_read.mapq = min(first_read.mapq, second_read.mapq)
            combined_read.rnext = -1
            combined_read.pnext = -1

            num_concordant += 1

            extended_mapping = trim.extend_polyA_end(combined_read,
                                                     region_fetcher,
                                                    )
            alignment_sorter.write(extended_mapping)

    self.summary.extend(
        [('Unmapped', num_unmapped),
         ('Five prime unmapped', num_five_unmapped),
         ('Three prime unmapped', num_three_unmapped),
         ('Nonunique', num_nonunique),
         ('Discordant', num_discordant),
         ('Concordant', num_concordant),
        ],
    )
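
# An illustration of the combined-read CIGAR construction in combine_mappings:
# CIGAR op 3 ('N') encodes the genomic gap between the two reads, so a 30 bp
# five prime read followed by a 100 bp gap and a 30 bp three prime read yields
# 30M100N30M. The lengths here are made up for the example.
def _example_gap_cigar():
    first_cigar = [(0, 30)]   # 30M from the upstream read
    second_cigar = [(0, 30)]  # 30M from the downstream read
    gap = 100                 # second_read.pos - first_read.aend
    combined = first_cigar + [(3, gap)] + second_cigar
    # combined == [(0, 30), (3, 100), (0, 30)]
    return combined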