def produce_bowtie2_alignments(reads,
                               index_prefix,
                               genome_dir,
                               score_min,
                              ):

    bowtie2_options = {'local': True,
                       #'report_all': True,
                       'report_up_to': 10,
                       'seed_mismatches': 1,
                       'seed_interval_function': 'C,1,0',
                       'seed_length': 10,
                      }

    sam_file, mappings = mapping_tools.map_bowtie2(index_prefix,
                                                   reads=reads,
                                                   custom_binary=True,
                                                   score_min=score_min,
                                                   yield_mappings=True,
                                                   **bowtie2_options)

    region_fetcher = genomes.build_region_fetcher(genome_dir, load_references=True)

    mapping_groups = utilities.group_by(mappings, lambda m: m.qname)
    
    for qname, group in mapping_groups:
        group = sorted(group, key=lambda m: (m.tid, m.pos))
        alignments = [mapping_to_alignment(mapping, sam_file, region_fetcher)
                      for mapping in group if not mapping.is_unmapped]
        yield qname, alignments
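
Both versions delegate grouping to utilities.group_by; see the sketch after the second version. First, genomes.build_region_fetcher returns a callable whose signature is visible later in this listing (region_fetcher(rname, start, stop) in the compute_yield example): (reference name, start, stop) -> sequence. A hedged stand-in built on pysam's FastaFile, useful for experimenting outside the full pipeline (the real builder takes a genome directory and an optional sam_file; this stub takes a single indexed FASTA instead):

import pysam

def build_region_fetcher_stub(fasta_fn):
    # Stand-in for genomes.build_region_fetcher with the same fetcher
    # signature: (reference name, start, stop) -> sequence string.
    fasta = pysam.FastaFile(fasta_fn)
    def fetch(rname, start, stop):
        return fasta.fetch(rname, start, stop)
    return fetch
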
def produce_bowtie2_alignments_old(reads,
                                   sam_fn,
                                   index_prefix,
                                   genome_dir,
                                   score_min,
                                  ):

    bowtie2_options = {'local': True,
                       #'report_all': True,
                       'report_up_to': 10,
                       'seed_mismatches': 1,
                       'seed_interval_function': 'C,1,0',
                       'seed_length': 10,
                       #'threads': 12,
                      }

    mapping_tools.map_bowtie2(index_prefix,
                              None,
                              None,
                              sam_fn,
                              unpaired_Reads=reads,
                              custom_binary=True,
                              score_min=score_min,
                              **bowtie2_options)
    
    sam_file = pysam.Samfile(sam_fn)
    region_fetcher = genomes.build_region_fetcher(genome_dir, load_references=True)

    mapping_groups = utilities.group_by(sam_file, lambda m: m.qname)
    
    for qname, group in mapping_groups:
        alignments = [mapping_to_alignment(mapping, sam_file, region_fetcher)
                      for mapping in group if not mapping.is_unmapped]
        yield qname, alignments
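
Both versions bucket consecutive mappings sharing a qname with utilities.group_by. Judging from how groups are used throughout this listing (len(group), group.pop(), iterating one group twice), it plausibly yields (key, list) pairs; a minimal sketch of that assumed behavior, built on itertools.groupby:

from itertools import groupby

def group_by_sketch(iterable, key):
    # Assumed behavior of utilities.group_by: yield (key, items) for each
    # consecutive run of elements sharing the same key, materialized as a
    # list so a group can be iterated more than once.
    for k, run in groupby(iterable, key):
        yield k, list(run)
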
Example #3
    def filter_mappings(self):
        num_unmapped = 0
        num_entirely_genomic = 0
        num_nonunique = 0
        num_unique = 0

        nongenomic_lengths = Counter()

        sam_file = pysam.Samfile(self.file_names['accepted_hits'])
    
        region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                      load_references=True,
                                                      sam_file=sam_file,
                                                     )

        extended_sorter = sam.AlignmentSorter(sam_file.references,
                                              sam_file.lengths,
                                              self.file_names['extended'],
                                             )
        filtered_sorter = sam.AlignmentSorter(sam_file.references,
                                              sam_file.lengths,
                                              self.file_names['extended_filtered'],
                                             )

        extended_mappings = (trim.extend_polyA_end(mapping, region_fetcher) for mapping in sam_file)
        mapping_groups = utilities.group_by(extended_mappings, lambda m: m.qname)

        with extended_sorter, filtered_sorter:
            for qname, group in mapping_groups:
                for m in group:
                    extended_sorter.write(m)

                min_nongenomic_length = min(trim.get_nongenomic_length(m) for m in group)
                nongenomic_lengths[min_nongenomic_length] += 1
                if min_nongenomic_length == 0:
                    num_entirely_genomic += 1
                    continue
                
                nonunique = len(group) > 1 or any(m.mapq < 40 for m in group)
                if nonunique:
                    num_nonunique += 1
                    continue
                
                num_unique += 1
                
                for m in group:
                    filtered_sorter.write(m)

        self.summary.extend(
            [('Mapped with no non-genomic A\'s', num_entirely_genomic),
             ('Nonunique', num_nonunique),
             ('Unique', num_unique),
            ],
        )

        nongenomic_lengths = utilities.counts_to_array(nongenomic_lengths)
        self.write_file('nongenomic_lengths', nongenomic_lengths)
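
utilities.counts_to_array turns the Counter of per-read minimum non-genomic lengths into something writeable; a plausible minimal version, assuming it densifies integer-keyed counts into a numpy array:

import numpy as np
from collections import Counter

def counts_to_array_sketch(counts):
    # Assumed behavior: entry i of the result is counts[i], sized to the
    # largest observed key.
    size = max(counts) + 1 if counts else 0
    array = np.zeros(size, int)
    for value, count in counts.items():
        array[value] = count
    return array

# counts_to_array_sketch(Counter({0: 7, 3: 2})) -> array([7, 0, 0, 2])
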
Example #4
def get_transcripts(all_features, genome_dir):
    region_fetcher = genomes.build_region_fetcher(genome_dir, load_references=True)

    feature_lists = defaultdict(list)
    for feature in all_features:
        transcript_name = feature.attribute['transcript_id']
        feature_lists[transcript_name].append(feature)

    transcripts = [Transcript(name, features, None, region_fetcher)
                   for name, features in feature_lists.iteritems()]

    return transcripts
Example #5
def get_gff_transcripts(all_features, genome_dir):
    region_fetcher = genomes.build_region_fetcher(genome_dir, load_references=True)
    genes = []
    for feature in all_features:
        top_level = feature.parent is None
        dubious = feature.attribute.get('orf_classification') == 'Dubious'
        has_exon = any('exon' in c.feature for c in feature.descendants) 
        
        if top_level and has_exon and not dubious:
            gene = GFFTranscript(feature, region_fetcher)
            genes.append(gene)

    return genes
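
A toy run of the three-way filter, with a namedtuple standing in for the GFF feature objects the code assumes (a parent link, an attribute dict, and a flat descendants list; the field values are invented for illustration):

from collections import namedtuple

Feature = namedtuple('Feature', ['parent', 'attribute', 'descendants', 'feature'])

exon = Feature(parent='gene placeholder', attribute={}, descendants=[], feature='exon')
gene = Feature(parent=None,
               attribute={'orf_classification': 'Verified'},
               descendants=[exon],
               feature='gene',
              )

top_level = gene.parent is None
dubious = gene.attribute.get('orf_classification') == 'Dubious'
has_exon = any('exon' in c.feature for c in gene.descendants)
assert top_level and has_exon and not dubious
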
Example #6
def get_gff_transcripts(all_features, genome_dir):
    region_fetcher = genomes.build_region_fetcher(genome_dir,
                                                  load_references=True)
    genes = []
    for feature in all_features:
        top_level = feature.parent is None
        dubious = feature.attribute.get('orf_classification') == 'Dubious'
        has_exon = any('exon' in c.feature for c in feature.descendants)

        if top_level and has_exon and not dubious:
            gene = GFFTranscript(feature, region_fetcher)
            genes.append(gene)

    return genes
Example #7
def call_3p_peaks():
    gtf_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf'
    genome_dir = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/'
    composition_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5'

    output_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_3p_lengths.txt'

    region_fetcher = genomes.build_region_fetcher(genome_dir)
    CDSs = gtf.get_CDSs(gtf_fn)
    CDS_dict = {t.name: t for t in CDSs}

    experiments = build_all_experiments(verbose=False)

    three_prime_experiments = [(n, e) for n, e in sorted(experiments['three_p_seq']['three_p_seq'].items())] + \
                              [(n, e) for n, e in sorted(experiments['three_t_fill_seq']['wilkening_nar'].items()) if '3tfill_ypd_rep1' in n] + \
                              [(n, e) for n, e in sorted(experiments['TIF_seq']['pelechano_nature'].items()) if n == 'ypd_bio1_lib1' or n == 'ypd_bio1_lib4']

    argmaxes = {}
    fractions = {}
    joints = {}
    for name, experiment in three_prime_experiments:
        print name
        argmaxes[name] = {}
        fractions[name] = []
        joints[name] = []
        fn = experiment.file_names['three_prime_read_positions']
        f = h5py.File(fn, 'r')
        for transcript in utilities.progress_bar(len(CDSs), CDSs):
            if transcript.name not in f:
                continue
            gene = Serialize.read_positions.build_gene(f[transcript.name])
            xs = np.arange(0, 400)

            argmax = gene['all'].argmax_over_slice('stop_codon', xs)
            argmaxes[name][transcript.name] = argmax
            most = gene['all']['stop_codon', argmax]
            total = gene['all']['stop_codon', xs].sum()
            if total > 9:
                fraction = np.true_divide(most, total)
                fractions[name].append(fraction)
                joints[name].append((argmax, fraction))

    with open(output_fn, 'w') as output_fh:
        name_order = sorted(argmaxes['Cerevisiae_3Pseq'],
                            key=argmaxes['Cerevisiae_3Pseq'].get)
        for name in name_order:
            output_fh.write('{0}\t'.format(str(CDS_dict[name])))
            for exp_name, _ in three_prime_experiments:
                output_fh.write('{0}\t'.format(argmaxes[exp_name][name]))
            output_fh.write('\n')
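
The peak call itself is just an argmax over a fixed window downstream of the stop codon, recorded together with the peak's share of the window's reads whenever the window is deep enough (total > 9). The same arithmetic on a toy count vector:

import numpy as np

window_counts = np.array([0, 2, 1, 25, 3, 0, 1])   # toy 3' end counts per position
argmax = window_counts.argmax()                     # candidate peak position (3)
total = window_counts.sum()                         # 32 reads in the window
if total > 9:
    fraction = np.true_divide(window_counts[argmax], total)   # 25/32
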
Example #8
def get_transcripts(all_features, genome_dir):
    region_fetcher = genomes.build_region_fetcher(genome_dir,
                                                  load_references=True)

    feature_lists = defaultdict(list)
    for feature in all_features:
        transcript_name = feature.attribute['transcript_id']
        feature_lists[transcript_name].append(feature)

    transcripts = [
        Transcript(name, features, None, region_fetcher)
        for name, features in feature_lists.iteritems()
    ]

    return transcripts
Example #9
 def get_reads(self):
     CDSs, _ = self.get_CDSs()
     region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                   load_references=True,
                                                  )
     for transcript in CDSs:
         reads = make_artificial_reads(transcript,
                                       self.fragment_length,
                                       self.max_read_length,
                                       self.adapter_sequence,
                                       region_fetcher,
                                       self.common_buffer,
                                      )
         for read in reads:
             yield read
Example #10
 def get_reads(self):
     CDSs, _ = self.get_CDSs()
     region_fetcher = genomes.build_region_fetcher(
         self.file_names['genome'],
         load_references=True,
     )
     for transcript in CDSs:
         reads = make_artificial_reads(
             transcript,
             self.fragment_length,
             self.max_read_length,
             self.adapter_sequence,
             region_fetcher,
             self.common_buffer,
         )
         for read in reads:
             yield read
Example #11
    def process_full_length_mappings(self):
        clean_bam = pysam.Samfile(self.file_names['clean_bam'])
        
        type_shape = (self.max_read_length + 1,
                      self.max_read_length,
                      fastq.MAX_EXPECTED_QUAL + 1,
                      6,
                      6,
                     )
        type_counts = np.zeros(type_shape, int)

        # To avoid counting mismatches in non-unique mappings multiple times,
        # a dummy secondary_type_counts array is passed to
        # trim_mismatches_from_start for secondary mappings.
        secondary_type_counts = np.zeros(type_shape, int)
        
        clean_trimmed_length_counts = Counter()
    
        region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                      load_references=True,
                                                      sam_file=clean_bam,
                                                     )

        for mapping in clean_bam:
            if mapping.is_secondary:
                counts_array = secondary_type_counts
            else:
                counts_array = type_counts

            trimmed_from_start = trim.trim_mismatches_from_start(mapping,
                                                                 region_fetcher,
                                                                 counts_array,
                                                                )
            trimmed_from_end = trim.trim_nongenomic_polyA_from_end(trimmed_from_start,
                                                                   region_fetcher,
                                                                  )
            if not trimmed_from_end.is_unmapped and not trimmed_from_end.is_secondary:
                clean_trimmed_length_counts[trimmed_from_end.qlen] += 1

            yield trimmed_from_end

        self.write_file('mismatches', type_counts)
        
        clean_trimmed_lengths = self.zero_padded_array(clean_trimmed_length_counts)
        self.write_file('lengths', {'clean_trimmed': clean_trimmed_lengths})
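
The five-dimensional type_counts array is shaped (max_read_length + 1, max_read_length, MAX_EXPECTED_QUAL + 1, 6, 6). A plausible reading, not confirmed by this excerpt, is (read length, position in read, quality, reference base, read base) with six base codes; a self-contained sketch of recording one mismatch under that assumption:

import numpy as np

MAX_READ_LENGTH = 50          # placeholder for self.max_read_length
MAX_EXPECTED_QUAL = 41        # placeholder for fastq.MAX_EXPECTED_QUAL
base_to_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4, '-': 5}  # assumed encoding

type_counts = np.zeros((MAX_READ_LENGTH + 1,
                        MAX_READ_LENGTH,
                        MAX_EXPECTED_QUAL + 1,
                        6,
                        6,
                       ), int)

# One observed mismatch: a G read over a reference A at position 3 of a
# 28-mer with quality 30 (the axis order is the assumption stated above).
type_counts[28, 3, 30, base_to_index['A'], base_to_index['G']] += 1
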
Example #12
File: trim.py  Project: AlexeyG/ribosomes
def extend_polyA_ends(bam_fn, extended_bam_fn, genome_dir, trimmed_twice=False):
    bam_file = pysam.Samfile(bam_fn)
    region_fetcher = genomes.build_region_fetcher(genome_dir,
                                                  load_references=True,
                                                  sam_file=bam_file,
                                                 )

    # Adding bases to the end of minus strand mappings produces a file
    # that is not necessarily sorted, so re-sort. 
    alignment_sorter = sam.AlignmentSorter(bam_file.references,
                                           bam_file.lengths,
                                           extended_bam_fn,
                                          )

    with alignment_sorter:
        for mapping in bam_file:
            extended_mapping = extend_polyA_end(mapping, region_fetcher, trimmed_twice)
            alignment_sorter.write(extended_mapping)
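
A minimal usage sketch with placeholder paths (the genome directory is whatever build_region_fetcher expects):

# Placeholder paths for illustration only.
extend_polyA_ends('accepted_hits.bam',
                  'extended.bam',
                  '/path/to/genome/',
                  trimmed_twice=False,
                 )
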
Example #13
def call_5p_peaks():
    gtf_fn = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf"
    genome_dir = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/"
    region_fetcher = genomes.build_region_fetcher(genome_dir)
    CDSs = gtf.get_CDSs(gtf_fn)

    experiments = build_all_experiments(verbose=False)

    five_prime_experiments = (
        [(n, e) for n, e in sorted(experiments["TL_seq"]["arribere_gr"].items()) if "TLSeq1" in n]
        + [(n, e) for n, e in sorted(experiments["TL_seq"]["park_nar"].items()) if n == "SMORE-seq_WT_TAP+_rep1"]
        + [
            (n, e)
            for n, e in sorted(experiments["TIF_seq"]["pelechano_nature"].items())
            if n == "ypd_bio1_lib1" or n == "ypd_bio1_lib4"
        ]
    )

    argmaxes = {}
    fractions = {}
    joints = {}
    for name, experiment in five_prime_experiments:
        print name
        argmaxes[name] = Counter()
        fractions[name] = []
        joints[name] = []
        fn = experiment.file_names["five_prime_read_positions"]
        f = h5py.File(fn, "r")
        for transcript in utilities.progress_bar(len(CDSs), CDSs):
            if transcript.name not in f:
                continue
            gene = Serialize.read_positions.build_gene(f[transcript.name])
            xs = np.arange(-300, 0)

            argmax = gene["all"].argmax_over_slice("start_codon", xs)
            argmaxes[name][argmax] += 1
            most = gene["all"]["start_codon", argmax]
            total = gene["all"]["start_codon", xs].sum()
            if total == 0:
                print transcript
            if total > 9:
                fraction = np.true_divide(most, total)
                fractions[name].append(fraction)
                joints[name].append((argmax, fraction))
Example #14
    def compute_yield(self):
        lengths = self.read_file('lengths')
        reads = {name: lengths[name].sum() for name in lengths}
        reads['total'] = reads['trimmed'] + reads['too_short']

        reads['dominant'], overlapping_reads, boundaries = contaminants.identify_dominant_stretches(self.read_file('rRNA_coverage'),
                                                                                                    reads['total'],
                                                                                                    self.max_read_length,
                                                                                                    self.merged_file_names['rRNA_bam'],
                                                                                                   )
        contaminants.plot_dominant_stretch_lengths(boundaries, self.figure_file_names['dominant_stretch_lengths'])
        reads['other'] = reads['rRNA'] - reads['dominant']

        region_fetcher = genomes.build_region_fetcher(self.file_names['rRNA_fasta_dir'])

        with open(self.merged_file_names['dominant_stretches'], 'w') as dominant_stretches_file:
            for rname in sorted(boundaries):
                for start, stop in sorted(boundaries[rname]):
                    sequence = region_fetcher(rname, start, stop)
                    fraction = overlapping_reads[rname, start, stop] / float(reads['total'])

                    dominant_stretches_file.write('{0}: {1:,}-{2:,}\t{3:6.1%}\t{4}\n'.format(rname, start, stop, fraction, sequence))

        with open(self.file_names['yield'], 'w') as yield_file:
            yield_file.write('Total reads: {0:,}\n'.format(reads['total']))
            for category, count in [('Long enough reads', reads['trimmed']),
                                    ('phiX reads', reads['phiX']),
                                    ('rRNA reads', reads['rRNA']),
                                    ('(rRNA reads from non-dominant stretches)', reads['other']),
                                    ('tRNA reads', reads['tRNA']),
                                    ('Other ncRNA reads', reads['other_ncRNA']),
                                    ('Clean reads', reads['clean']),
                                    ('Reads mapped after polyA trimming', reads['remapped']),
                                    ('Reads that start with long polyA', reads['long_polyA']),
                                    ('Synthetic reads', reads['synthetic']),
                                    ('Unaccounted-for reads', reads['unmapped']),
                                   ]:
                fraction = float(count) / reads['total']
                line = '{0}: {1:,} ({2:.2%})\n'.format(category,
                                                       count,
                                                       fraction,
                                                      )
                yield_file.write(line)
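
The yield lines rely on two format specs: ',' groups thousands and '.2%' renders a fraction as a percentage with two decimals. For example:

line = '{0}: {1:,} ({2:.2%})\n'.format('Clean reads', 1234567, 0.4321)
# line == 'Clean reads: 1,234,567 (43.21%)\n'
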
Example #15
def call_5p_peaks():
    gtf_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf'
    genome_dir = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/'
    region_fetcher = genomes.build_region_fetcher(genome_dir)
    CDSs = gtf.get_CDSs(gtf_fn)

    experiments = build_all_experiments(verbose=False)

    five_prime_experiments = [(n, e) for n, e in sorted(experiments['TL_seq']['arribere_gr'].items()) if 'TLSeq1' in n] + \
                             [(n, e) for n, e in sorted(experiments['TL_seq']['park_nar'].items()) if n == 'SMORE-seq_WT_TAP+_rep1'] + \
                             [(n, e) for n, e in sorted(experiments['TIF_seq']['pelechano_nature'].items()) if n == 'ypd_bio1_lib1' or n == 'ypd_bio1_lib4']

    argmaxes = {}
    fractions = {}
    joints = {}
    for name, experiment in five_prime_experiments:
        print name
        argmaxes[name] = Counter()
        fractions[name] = []
        joints[name] = []
        fn = experiment.file_names['five_prime_read_positions']
        f = h5py.File(fn, 'r')
        for transcript in utilities.progress_bar(len(CDSs), CDSs):
            if transcript.name not in f:
                continue
            gene = Serialize.read_positions.build_gene(f[transcript.name])
            xs = np.arange(-300, 0)

            argmax = gene['all'].argmax_over_slice('start_codon', xs)
            argmaxes[name][argmax] += 1
            most = gene['all']['start_codon', argmax]
            total = gene['all']['start_codon', xs].sum()
            if total == 0:
                print transcript
            if total > 9:
                fraction = np.true_divide(most, total)
                fractions[name].append(fraction)
                joints[name].append((argmax, fraction))
Example #16
    def process_remapped(self):
        clean_bam = pysam.Samfile(self.file_names['remapped_accepted_hits'])
        
        type_shape = (self.max_read_length + 1,
                      self.max_read_length,
                      fastq.MAX_EXPECTED_QUAL + 1,
                      6,
                      6,
                     )
        type_counts = np.zeros(type_shape, int)
        remapped_length_counts = Counter()
    
        region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                      load_references=True,
                                                      sam_file=clean_bam,
                                                     )

        for mapping in clean_bam:
            trimmed_from_start = trim.trim_mismatches_from_start(mapping,
                                                                 region_fetcher,
                                                                 type_counts,
                                                                )
            # Add back any genomic A's that were trimmed as part of mappings and
            # any remaining A's from the first non-genomic onward as soft clipped
            # bases for visualization in IGV.
            extended = trim.extend_polyA_end(trimmed_from_start,
                                             region_fetcher,
                                             trimmed_twice=True,
                                            )
            if not extended.is_unmapped and not extended.is_secondary:
                remapped_length_counts[extended.qlen] += 1

            yield extended

        remapped_lengths = self.zero_padded_array(remapped_length_counts)
        self.write_file('lengths', {'remapped': remapped_lengths})
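
The soft clipping mentioned in the comment uses pysam's numeric CIGAR encoding, where op code 4 is 'S': restored A's can ride along as a trailing soft-clipped block without changing the aligned span. A toy illustration:

aligned_cigar = [(0, 28)]                          # 28M aligned portion
num_restored_As = 5
extended_cigar = aligned_cigar + [(4, num_restored_As)]
# extended_cigar == [(0, 28), (4, 5)], i.e. 28M5S
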
Example #17
def extend_polyA_ends(bam_fn,
                      extended_bam_fn,
                      genome_dir,
                      trimmed_twice=False):
    bam_file = pysam.Samfile(bam_fn)
    region_fetcher = genomes.build_region_fetcher(
        genome_dir,
        load_references=True,
        sam_file=bam_file,
    )

    # Adding bases to the end of minus strand mappings produces a file
    # that is not necessarily sorted, so re-sort.
    alignment_sorter = sam.AlignmentSorter(
        bam_file.references,
        bam_file.lengths,
        extended_bam_fn,
    )

    with alignment_sorter:
        for mapping in bam_file:
            extended_mapping = extend_polyA_end(mapping, region_fetcher,
                                                trimmed_twice)
            alignment_sorter.write(extended_mapping)
Example #18
    def filter_mappings(self):
        num_unmapped = 0
        num_entirely_genomic = 0
        num_nonunique = 0
        num_unique = 0

        nongenomic_lengths = Counter()

        sam_file = pysam.Samfile(self.file_names['accepted_hits'])

        region_fetcher = genomes.build_region_fetcher(
            self.file_names['genome'],
            load_references=True,
            sam_file=sam_file,
        )

        extended_sorter = sam.AlignmentSorter(
            sam_file.references,
            sam_file.lengths,
            self.file_names['extended'],
        )
        filtered_sorter = sam.AlignmentSorter(
            sam_file.references,
            sam_file.lengths,
            self.file_names['extended_filtered'],
        )

        extended_mappings = (trim.extend_polyA_end(mapping, region_fetcher)
                             for mapping in sam_file)
        mapping_groups = utilities.group_by(extended_mappings,
                                            lambda m: m.qname)

        with extended_sorter, filtered_sorter:
            for qname, group in mapping_groups:
                for m in group:
                    extended_sorter.write(m)

                min_nongenomic_length = min(
                    trim.get_nongenomic_length(m) for m in group)
                nongenomic_lengths[min_nongenomic_length] += 1
                if min_nongenomic_length == 0:
                    num_entirely_genomic += 1
                    continue

                nonunique = len(group) > 1 or any(m.mapq < 40 for m in group)
                if nonunique:
                    num_nonunique += 1
                    continue

                num_unique += 1

                for m in group:
                    filtered_sorter.write(m)

        self.summary.extend([
            ('Mapped with no non-genomic A\'s', num_entirely_genomic),
            ('Nonunique', num_nonunique),
            ('Unique', num_unique),
        ])

        nongenomic_lengths = utilities.counts_to_array(nongenomic_lengths)
        self.write_file('nongenomic_lengths', nongenomic_lengths)
Example #19
def call_3p_peaks():
    gtf_fn = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf"
    genome_dir = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/"
    composition_fn = (
        "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5"
    )

    output_fn = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_3p_lengths.txt"

    region_fetcher = genomes.build_region_fetcher(genome_dir)
    CDSs = gtf.get_CDSs(gtf_fn)
    CDS_dict = {t.name: t for t in CDSs}

    experiments = build_all_experiments(verbose=False)

    three_prime_experiments = (
        [(n, e) for n, e in sorted(experiments["three_p_seq"]["three_p_seq"].items())]
        + [
            (n, e)
            for n, e in sorted(experiments["three_t_fill_seq"]["wilkening_nar"].items())
            if "3tfill_ypd_rep1" in n
        ]
        + [
            (n, e)
            for n, e in sorted(experiments["TIF_seq"]["pelechano_nature"].items())
            if n == "ypd_bio1_lib1" or n == "ypd_bio1_lib4"
        ]
    )

    argmaxes = {}
    fractions = {}
    joints = {}
    for name, experiment in three_prime_experiments:
        print name
        argmaxes[name] = {}
        fractions[name] = []
        joints[name] = []
        fn = experiment.file_names["three_prime_read_positions"]
        f = h5py.File(fn, "r")
        for transcript in utilities.progress_bar(len(CDSs), CDSs):
            if transcript.name not in f:
                continue
            gene = Serialize.read_positions.build_gene(f[transcript.name])
            xs = np.arange(0, 400)

            argmax = gene["all"].argmax_over_slice("stop_codon", xs)
            argmaxes[name][transcript.name] = argmax
            most = gene["all"]["stop_codon", argmax]
            total = gene["all"]["stop_codon", xs].sum()
            if total > 9:
                fraction = np.true_divide(most, total)
                fractions[name].append(fraction)
                joints[name].append((argmax, fraction))

    with open(output_fn, "w") as output_fh:
        name_order = sorted(argmaxes["Cerevisiae_3Pseq"], key=argmaxes["Cerevisiae_3Pseq"].get)
        for name in name_order:
            output_fh.write("{0}\t".format(str(CDS_dict[name])))
            for exp_name, _ in three_prime_experiments:
                output_fh.write("{0}\t".format(argmaxes[exp_name][name]))
            output_fh.write("\n")
Example #20
    def combine_mappings(self):
        num_unmapped = 0
        num_five_unmapped = 0
        num_three_unmapped = 0
        num_nonunique = 0
        num_discordant = 0
        num_concordant = 0

        five_prime_mappings = pysam.Samfile(self.file_names['five_prime_accepted_hits'])
        five_prime_unmapped = pysam.Samfile(self.file_names['five_prime_unmapped'])
        all_five_prime = sam.merge_by_name(five_prime_mappings, five_prime_unmapped)
        five_prime_grouped = utilities.group_by(all_five_prime, lambda m: m.qname)

        three_prime_mappings = pysam.Samfile(self.file_names['three_prime_accepted_hits'])
        three_prime_unmapped = pysam.Samfile(self.file_names['three_prime_unmapped'])
        all_three_prime = sam.merge_by_name(three_prime_mappings, three_prime_unmapped)
        three_prime_grouped = utilities.group_by(all_three_prime, lambda m: m.qname)

        group_pairs = izip(five_prime_grouped, three_prime_grouped)

        alignment_sorter = sam.AlignmentSorter(five_prime_mappings.references,
                                               five_prime_mappings.lengths,
                                               self.file_names['combined_extended'],
                                              )
        region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                      load_references=True,
                                                      sam_file=five_prime_mappings,
                                                     )

        with alignment_sorter:
            for (five_qname, five_group), (three_qname, three_group) in group_pairs:
                five_annotation = trim.PayloadAnnotation.from_identifier(five_qname)
                three_annotation = trim.PayloadAnnotation.from_identifier(three_qname)
                if five_annotation['original_name'] != three_annotation['original_name']:
                    # Ensure that the iteration through pairs is in sync.
                    print five_qname, three_qname
                    raise ValueError

                five_unmapped = any(m.is_unmapped for m in five_group)
                three_unmapped = any(m.is_unmapped for m in three_group)
                if five_unmapped:
                    num_five_unmapped += 1
                if three_unmapped:
                    num_three_unmapped += 1
                if five_unmapped or three_unmapped:
                    num_unmapped += 1
                    continue

                five_nonunique = len(five_group) > 1 or any(m.mapq < 40 for m in five_group)
                three_nonunique = len(three_group) > 1 or any(m.mapq < 40 for m in three_group)
                if five_nonunique or three_nonunique:
                    num_nonunique += 1
                    continue
                
                five_m = five_group.pop()
                three_m = three_group.pop()

                five_strand = '-' if five_m.is_reverse else '+'
                three_strand = '-' if three_m.is_reverse else '+'

                tlen = max(five_m.aend, three_m.aend) - min(five_m.pos, three_m.pos)
                discordant = (five_m.tid != three_m.tid) or (five_strand != three_strand) or (tlen > 10000)
                if discordant:
                    num_discordant += 1
                    continue
                
                if five_strand == '+':
                    first_read = five_m
                    second_read = three_m
                elif five_strand == '-':
                    first_read = three_m
                    second_read = five_m
                
                gap = second_read.pos - first_read.aend
                if gap < 0:
                    num_discordant += 1
                    continue
                
                combined_read = pysam.AlignedRead()
                # qname needs to come from three_m to include trimmed As
                combined_read.qname = three_m.qname
                combined_read.tid = five_m.tid
                combined_read.seq = first_read.seq + second_read.seq
                combined_read.qual = first_read.qual + second_read.qual
                combined_read.cigar = first_read.cigar + [(3, gap)] + second_read.cigar
                combined_read.pos = first_read.pos
                combined_read.is_reverse = first_read.is_reverse
                combined_read.mapq = min(first_read.mapq, second_read.mapq)
                combined_read.rnext = -1
                combined_read.pnext = -1
                
                num_concordant += 1

                extended_mapping = trim.extend_polyA_end(combined_read,
                                                         region_fetcher,
                                                        )

                alignment_sorter.write(extended_mapping)

        self.summary.extend(
            [('Unmapped', num_unmapped),
             ('Five prime unmapped', num_five_unmapped),
             ('Three prime unmapped', num_three_unmapped),
             ('Nonunique', num_nonunique),
             ('Discordant', num_discordant),
             ('Concordant', num_concordant),
            ],
        )