Exemplo n.º 1
0
    def filter_mappings(self):
        """Extend poly-A ends of the accepted hits and keep unique mappings.

        Every alignment in ``self.file_names['accepted_hits']`` is passed
        through ``trim.extend_polyA_end`` and the results are grouped by query
        name (``utilities.group_by`` presumably groups consecutive records, so
        the input is assumed to be ordered by read name -- TODO confirm).

        For each group:

        * every alignment is written to ``self.file_names['extended']``;
        * the minimum ``trim.get_nongenomic_length`` over the group is
          tallied;
        * the group is written to ``self.file_names['extended_filtered']``
          only if that minimum is non-zero, the group holds a single
          alignment, and that alignment has ``mapq >= 40``.

        Side effects: extends ``self.summary`` with the entirely-genomic,
        nonunique and unique read counts, and writes the tally of minimum
        non-genomic lengths via ``self.write_file('nongenomic_lengths', ...)``.
        """
        num_entirely_genomic = 0
        num_nonunique = 0
        num_unique = 0

        nongenomic_lengths = Counter()

        sam_file = pysam.Samfile(self.file_names['accepted_hits'])
        # Make sure the input handle is released even if extension, sorting
        # or writing fails part-way through.
        try:
            region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                          load_references=True,
                                                          sam_file=sam_file,
                                                         )

            extended_sorter = sam.AlignmentSorter(sam_file.references,
                                                  sam_file.lengths,
                                                  self.file_names['extended'],
                                                 )
            filtered_sorter = sam.AlignmentSorter(sam_file.references,
                                                  sam_file.lengths,
                                                  self.file_names['extended_filtered'],
                                                 )

            # Lazy: alignments are extended one at a time as the groups are
            # consumed inside the loop below.
            extended_mappings = (trim.extend_polyA_end(mapping, region_fetcher) for mapping in sam_file)
            mapping_groups = utilities.group_by(extended_mappings, lambda m: m.qname)

            with extended_sorter, filtered_sorter:
                for qname, group in mapping_groups:
                    # The unfiltered output receives every extended alignment.
                    for m in group:
                        extended_sorter.write(m)

                    min_nongenomic_length = min(trim.get_nongenomic_length(m) for m in group)
                    nongenomic_lengths[min_nongenomic_length] += 1
                    if min_nongenomic_length == 0:
                        num_entirely_genomic += 1
                        continue

                    # More than one alignment for the read, or a low mapping
                    # quality, both disqualify the read from the filtered set.
                    nonunique = len(group) > 1 or any(m.mapq < 40 for m in group)
                    if nonunique:
                        num_nonunique += 1
                        continue

                    num_unique += 1

                    for m in group:
                        filtered_sorter.write(m)
        finally:
            sam_file.close()

        self.summary.extend(
            [('Mapped with no non-genomic A\'s', num_entirely_genomic),
             ('Nonunique', num_nonunique),
             ('Unique', num_unique),
            ],
        )

        nongenomic_lengths = utilities.counts_to_array(nongenomic_lengths)
        self.write_file('nongenomic_lengths', nongenomic_lengths)
Exemplo n.º 2
0
    def filter_mappings(self):
        """Extend poly-A ends of the accepted hits and keep unique mappings.

        Alignments are read from ``self.file_names['accepted_hits']``, run
        through ``trim.extend_polyA_end`` and grouped by query name
        (``utilities.group_by`` presumably groups consecutive records, so the
        input is assumed to be ordered by read name -- TODO confirm).

        All extended alignments go to ``self.file_names['extended']``. A group
        also goes to ``self.file_names['extended_filtered']`` when its minimum
        ``trim.get_nongenomic_length`` is non-zero, it contains exactly one
        alignment, and that alignment has ``mapq >= 40``.

        Side effects: appends the entirely-genomic / nonunique / unique read
        counts to ``self.summary`` and writes the tally of per-read minimum
        non-genomic lengths via ``self.write_file('nongenomic_lengths', ...)``.
        """
        num_entirely_genomic = 0
        num_nonunique = 0
        num_unique = 0

        nongenomic_lengths = Counter()

        sam_file = pysam.Samfile(self.file_names['accepted_hits'])

        # Close the input handle on every exit path, including errors raised
        # while extending, sorting or writing alignments.
        try:
            region_fetcher = genomes.build_region_fetcher(
                self.file_names['genome'],
                load_references=True,
                sam_file=sam_file,
            )

            extended_sorter = sam.AlignmentSorter(
                sam_file.references,
                sam_file.lengths,
                self.file_names['extended'],
            )
            filtered_sorter = sam.AlignmentSorter(
                sam_file.references,
                sam_file.lengths,
                self.file_names['extended_filtered'],
            )

            # Generator: each alignment is extended only when its group is
            # pulled from mapping_groups in the loop below.
            extended_mappings = (trim.extend_polyA_end(mapping, region_fetcher)
                                 for mapping in sam_file)
            mapping_groups = utilities.group_by(extended_mappings,
                                                lambda m: m.qname)

            with extended_sorter, filtered_sorter:
                for qname, group in mapping_groups:
                    # Unfiltered output: every extended alignment is kept.
                    for m in group:
                        extended_sorter.write(m)

                    min_nongenomic_length = min(
                        trim.get_nongenomic_length(m) for m in group)
                    nongenomic_lengths[min_nongenomic_length] += 1
                    if min_nongenomic_length == 0:
                        num_entirely_genomic += 1
                        continue

                    # Multiple alignments or a low mapping quality mean the
                    # read's placement is not trusted as unique.
                    nonunique = (len(group) > 1 or
                                 any(m.mapq < 40 for m in group))
                    if nonunique:
                        num_nonunique += 1
                        continue

                    num_unique += 1

                    for m in group:
                        filtered_sorter.write(m)
        finally:
            sam_file.close()

        self.summary.extend([
            ('Mapped with no non-genomic A\'s', num_entirely_genomic),
            ('Nonunique', num_nonunique),
            ('Unique', num_unique),
        ], )

        nongenomic_lengths = utilities.counts_to_array(nongenomic_lengths)
        self.write_file('nongenomic_lengths', nongenomic_lengths)
Exemplo n.º 3
0
def get_Transcript_position_counts(clean_bam_fn,
                                   transcripts,
                                   relevant_lengths,
                                   left_buffer=left_buffer,
                                   right_buffer=right_buffer,
                                  ):
    """Tally read 5' and 3' end positions along each transcript.

    Args:
        clean_bam_fn: Path of the BAM file to read; it is queried with
            ``fetch``, so it presumably needs an index -- TODO confirm.
        transcripts: Iterable of transcript objects. Each one has its
            coordinate maps built before use and deleted afterwards.
        relevant_lengths: Read lengths (``read.qlen`` values) that get their
            own 5' ``PositionCounts`` in addition to the aggregate ones.
        left_buffer: Extra bases included upstream of each transcript.
        right_buffer: Extra bases included downstream of each transcript.

    Returns:
        A dict mapping ``transcript.name`` to a dict with the keys
        ``'CDS_length'``, ``'five_prime_positions'``,
        ``'three_prime_positions'``, ``'nonunique'``,
        ``'alternatively_spliced'`` and ``'sequence'``.

    Reads with any aligned position spliced out of the transcript are only
    counted in ``'alternatively_spliced'``. Reads with ``mapq != 50`` are
    counted in ``'nonunique'`` (before the strand check) and contribute only
    to the ``'all_nonunique'`` counts, and only if they are not secondary.
    Reads on the opposite strand from the transcript are otherwise ignored.
    """
    gene_infos = {}

    max_nongenomic_length = 5

    # Keys of the per-transcript count dicts are the same for every
    # transcript, so build them once. list(...) keeps this working when
    # range() is not a list (Python 3) or relevant_lengths is not a list.
    five_prime_keys = list(relevant_lengths) + ['all', 'all_nonunique']
    three_prime_keys = list(range(max_nongenomic_length + 1)) + ['all', 'all_nonunique']

    bam_file = pysam.Samfile(clean_bam_fn)
    # Release the BAM handle even if processing a transcript fails.
    try:
        for transcript in transcripts:
            transcript.build_coordinate_maps(left_buffer, right_buffer)

            nonunique = 0
            alternatively_spliced = 0

            landmarks = {'start': 0,
                         'start_codon': transcript.transcript_start_codon,
                         'stop_codon': transcript.transcript_stop_codon,
                         'end': transcript.transcript_length,
                        }
            five_prime_positions = {l: PositionCounts(landmarks, left_buffer, right_buffer)
                                    for l in five_prime_keys}

            three_prime_positions = {l: PositionCounts(landmarks, left_buffer, right_buffer)
                                     for l in three_prime_keys}

            transcript_sequence = transcript.get_transcript_sequence(left_buffer, right_buffer)

            # fetch raises a ValueError if given a negative start, but it doesn't
            # care if the end is valid.
            fetch_start = max(0, transcript.start - left_buffer)
            fetch_end = transcript.end + right_buffer
            overlapping_reads = bam_file.fetch(transcript.seqname, fetch_start, fetch_end)
            for read in overlapping_reads:
                if any(transcript.is_spliced_out(position) for position in read.positions):
                    alternatively_spliced += 1
                    continue

                is_unique = read.mapq == 50
                if not is_unique:
                    nonunique += 1

                read_strand = '-' if read.is_reverse else '+'
                if read_strand != transcript.strand:
                    continue

                # Leftmost and rightmost aligned positions of the read
                # (aend is exclusive, hence the - 1).
                read_left = read.pos
                read_right = read.aend - 1

                if read_strand == '+':
                    five_prime_position = read_left
                    three_prime_position = read_right
                else:
                    five_prime_position = read_right
                    three_prime_position = read_left

                if five_prime_position in transcript.genomic_to_transcript:
                    transcript_coord = transcript.genomic_to_transcript[five_prime_position]

                    if is_unique:
                        five_prime_positions['all']['start', transcript_coord] += 1

                        if read.qlen in relevant_lengths:
                            five_prime_positions[read.qlen]['start', transcript_coord] += 1

                    elif not read.is_secondary:
                        five_prime_positions['all_nonunique']['start', transcript_coord] += 1

                if three_prime_position in transcript.genomic_to_transcript:
                    transcript_coord = transcript.genomic_to_transcript[three_prime_position]

                    if is_unique:
                        three_prime_positions['all']['start', transcript_coord] += 1

                        # 3' ends are additionally binned by non-genomic
                        # length, up to max_nongenomic_length.
                        nongenomic_length = trim.get_nongenomic_length(read)
                        if nongenomic_length <= max_nongenomic_length:
                            three_prime_positions[nongenomic_length]['start', transcript_coord] += 1
                    elif not read.is_secondary:
                        three_prime_positions['all_nonunique']['start', transcript_coord] += 1

            gene_infos[transcript.name] = {'CDS_length': transcript.CDS_length,
                                           'five_prime_positions': five_prime_positions,
                                           'three_prime_positions': three_prime_positions,
                                           'nonunique': nonunique,
                                           'alternatively_spliced': alternatively_spliced,
                                           'sequence': transcript_sequence,
                                          }
            transcript.delete_coordinate_maps()
    finally:
        bam_file.close()

    return gene_infos
Exemplo n.º 4
0
def get_Transcript_position_counts(
    clean_bam_fn,
    transcripts,
    relevant_lengths,
    left_buffer=left_buffer,
    right_buffer=right_buffer,
):
    """Tally read 5' and 3' end positions along each transcript.

    Args:
        clean_bam_fn: Path of the BAM file to read; it is queried with
            ``fetch``, so it presumably needs an index -- TODO confirm.
        transcripts: Iterable of transcript objects. Each one has its
            coordinate maps built before use and deleted afterwards.
        relevant_lengths: Read lengths (``read.qlen`` values) that get their
            own 5' ``PositionCounts`` in addition to the aggregate ones.
        left_buffer: Extra bases included upstream of each transcript.
        right_buffer: Extra bases included downstream of each transcript.

    Returns:
        A dict mapping ``transcript.name`` to a dict with the keys
        ``'CDS_length'``, ``'five_prime_positions'``,
        ``'three_prime_positions'``, ``'nonunique'``,
        ``'alternatively_spliced'`` and ``'sequence'``.

    Reads with any aligned position spliced out of the transcript are only
    counted in ``'alternatively_spliced'``. Reads with ``mapq != 50`` are
    counted in ``'nonunique'`` (before the strand check) and contribute only
    to the ``'all_nonunique'`` counts, and only if they are not secondary.
    Reads on the opposite strand from the transcript are otherwise ignored.
    """
    gene_infos = {}

    max_nongenomic_length = 5

    # The count-dict keys do not depend on the transcript, so they are built
    # once. Wrapping in list(...) avoids a TypeError when range() is not a
    # list (Python 3) or when relevant_lengths is e.g. a tuple.
    aggregate_keys = ['all', 'all_nonunique']
    five_prime_keys = list(relevant_lengths) + aggregate_keys
    three_prime_keys = list(range(max_nongenomic_length + 1)) + aggregate_keys

    bam_file = pysam.Samfile(clean_bam_fn)

    # Release the BAM handle even if processing a transcript fails.
    try:
        for transcript in transcripts:
            transcript.build_coordinate_maps(left_buffer, right_buffer)

            nonunique = 0
            alternatively_spliced = 0

            landmarks = {
                'start': 0,
                'start_codon': transcript.transcript_start_codon,
                'stop_codon': transcript.transcript_stop_codon,
                'end': transcript.transcript_length,
            }
            five_prime_positions = {
                l: PositionCounts(landmarks, left_buffer, right_buffer)
                for l in five_prime_keys
            }

            three_prime_positions = {
                l: PositionCounts(landmarks, left_buffer, right_buffer)
                for l in three_prime_keys
            }

            transcript_sequence = transcript.get_transcript_sequence(
                left_buffer, right_buffer)

            # fetch raises a ValueError if given a negative start, but it
            # doesn't care if the end is valid.
            fetch_start = max(0, transcript.start - left_buffer)
            fetch_end = transcript.end + right_buffer
            overlapping_reads = bam_file.fetch(transcript.seqname,
                                               fetch_start, fetch_end)
            for read in overlapping_reads:
                if any(
                        transcript.is_spliced_out(position)
                        for position in read.positions):
                    alternatively_spliced += 1
                    continue

                is_unique = read.mapq == 50
                if not is_unique:
                    nonunique += 1

                read_strand = '-' if read.is_reverse else '+'
                if read_strand != transcript.strand:
                    continue

                # Leftmost and rightmost aligned positions of the read
                # (aend is exclusive, hence the - 1).
                read_left = read.pos
                read_right = read.aend - 1

                if read_strand == '+':
                    five_prime_position = read_left
                    three_prime_position = read_right
                else:
                    five_prime_position = read_right
                    three_prime_position = read_left

                if five_prime_position in transcript.genomic_to_transcript:
                    transcript_coord = transcript.genomic_to_transcript[
                        five_prime_position]

                    if is_unique:
                        five_prime_positions['all'][
                            'start', transcript_coord] += 1

                        if read.qlen in relevant_lengths:
                            five_prime_positions[read.qlen][
                                'start', transcript_coord] += 1

                    elif not read.is_secondary:
                        five_prime_positions['all_nonunique'][
                            'start', transcript_coord] += 1

                if three_prime_position in transcript.genomic_to_transcript:
                    transcript_coord = transcript.genomic_to_transcript[
                        three_prime_position]

                    if is_unique:
                        three_prime_positions['all'][
                            'start', transcript_coord] += 1

                        # 3' ends are additionally binned by non-genomic
                        # length, up to max_nongenomic_length.
                        nongenomic_length = trim.get_nongenomic_length(read)
                        if nongenomic_length <= max_nongenomic_length:
                            three_prime_positions[nongenomic_length][
                                'start', transcript_coord] += 1
                    elif not read.is_secondary:
                        three_prime_positions['all_nonunique'][
                            'start', transcript_coord] += 1

            gene_infos[transcript.name] = {
                'CDS_length': transcript.CDS_length,
                'five_prime_positions': five_prime_positions,
                'three_prime_positions': three_prime_positions,
                'nonunique': nonunique,
                'alternatively_spliced': alternatively_spliced,
                'sequence': transcript_sequence,
            }
            transcript.delete_coordinate_maps()
    finally:
        bam_file.close()

    return gene_infos