Example #1
def make_artificial_reads(
    transcript,
    fragment_length,
    read_length,
    adapter_sequence,
    region_fetcher,
    common_buffer,
):
    transcript_sequence = transcript.retrieve_sequence(
        region_fetcher,
        left_buffer=common_buffer,
        right_buffer=common_buffer + fragment_length,
    )
    # Needs to include one non-Solexa value for automatic encoding recognition.
    high_quals = fastq.encode_sanger([25] + [30] * (read_length - 1))
    for i, transcript_position in enumerate(
            range(-common_buffer, transcript.CDS_length + common_buffer)):
        annotation = artifical_annotation(
            transcript_name=transcript.name,
            position=transcript_position,
        )
        fragment_sequence = transcript_sequence[i:i + fragment_length]
        if '-' in fragment_sequence:
            # skip fragments that run off the edge of a reference sequence
            continue

        full_sequence = fragment_sequence + adapter_sequence
        read = fastq.Read(annotation.identifier, full_sequence[:read_length],
                          high_quals)
        yield read
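The tiling logic above is independent of the transcript machinery: slide a fragment-length window along the sequence, append the adapter, and truncate to the read length. A minimal sketch of the same idea on a plain string (all names and values here are hypothetical, not from the library):

# Hypothetical sketch of the tiling performed by make_artificial_reads.
def tile_fragments(sequence, fragment_length, read_length, adapter):
    for start in range(len(sequence) - fragment_length + 1):
        fragment = sequence[start:start + fragment_length]
        # If the fragment is shorter than the read, the read runs into the adapter.
        yield start, (fragment + adapter)[:read_length]

for position, read_seq in tile_fragments('ACGTACGTACGT', 6, 8, 'AGATCGGA'):
    print(position, read_seq)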
Example #2
 def get_reads():
     for i, (seq, count) in enumerate(self.read_file('common_unmapped')['non_long_polyA'].most_common()):
         read = fastq.Read('{0}_{1}'.format(i, count),
                           seq,
                           fastq.encode_sanger([40]*len(seq)),
                          )
         yield read
Example #3
def collapse_fastq(reads, outfile):

    counter = 0

    collapsedReads = {}  # dict of unique reads: collapsedReads[seq] = [read, count]
    for r in reads:  # iterate through reads and collapse duplicates

        if counter % 1000000 == 0:
            print(str(counter) + " reads processed ...")

        n = r.name.split('_')
        nseq = n[1] + "_" + n[2] + "_" + r.seq

        if nseq in collapsedReads:  # collapsable sequence
            [oldRead, count] = collapsedReads[nseq]
            # maximize quality: keep the per-base maximum of the two quality strings
            nqual = r.qual
            fqual = ''.join(max(oR_q, nR_q) for oR_q, nR_q in zip(oldRead.qual, nqual))
            count = count + 1
            nRead = fastq.Read(oldRead.name, r.seq, fqual)
            collapsedReads[nseq] = [nRead, count]
        else:
            nRead = fastq.Read(r.name, r.seq, r.qual)
            collapsedReads[nseq] = [nRead, 1]
        counter = counter + 1

    with open(outfile, 'w') as fh:
        for seq in collapsedReads:
            [r, count] = collapsedReads[seq]
            fh.write(str(fastq.Read(r.name + "_" + str(count), r.seq, r.qual)))
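The collapse step keeps, for each distinct sequence, the per-base maximum of all observed quality strings. This works because Sanger-encoded quality characters are ordered: a greater character means a higher score. A self-contained sketch of that merge (hypothetical inputs):

# Per-base maximum of two Sanger-encoded quality strings.
def merge_quals(qual_a, qual_b):
    return ''.join(max(a, b) for a, b in zip(qual_a, qual_b))

print(merge_quals('IIII#III', '##IIIIII'))  # 'IIIIIIII'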
Example #4
    def trim_reads(self, read_pairs):
        total_reads = 0
        long_enough_reads = 0
        trimmed_lengths = Counter()
        barcodes = Counter()
        
        for R1, R2 in read_pairs:
            total_reads += 1
            barcodes[R2.seq[:len(self.barcode)]] += 1

            # R2 isn't expected to contain adapter sequence because it would
            # have to get through the A tail first.
            position = adapters.find_adapter(self.adapter_in_R1, 3, R1.seq)
            trimmed_lengths[position] += 1
            if position < 12:
                continue
            long_enough_reads += 1

            R1_slice = slice(None, position)
            # position points to where the barcode starts in R1. The length
            # of the trimmed R2 read should be equal to position.
            R2_slice = slice(len(self.barcode), len(self.barcode) + position)

            processed_R1 = fastq.Read(R1.name, R1.seq[R1_slice], R1.qual[R1_slice])
            processed_R2 = fastq.Read(R2.name, R2.seq[R2_slice], R2.qual[R2_slice])
            
            yield processed_R1, processed_R2

        trimmed_lengths = utilities.counts_to_array(trimmed_lengths)
        self.write_file('trimmed_lengths', trimmed_lengths)
        self.write_file('barcodes', barcodes)
        self.summary.extend(
            [('Total read pairs', total_reads),
             ('Long enough', long_enough_reads),
            ]
        )
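Using slice objects, as above, keeps the sequence and quality trimming in sync and guarantees both mates end up with the same payload length. A small illustration with hypothetical values:

# Hypothetical values illustrating the paired slices used above.
position = 20        # where the adapter was found in R1
barcode_length = 6   # length of the barcode at the start of R2

R1_slice = slice(None, position)
R2_slice = slice(barcode_length, barcode_length + position)

seq = 'ACGT' * 10
assert len(seq[R1_slice]) == position
assert len(seq[R2_slice]) == position  # barcode removed, same payload length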
Example #5
def trim(reads, find_start=None, find_end=None, second_time=False):
    ''' Wrapper that handles the logistics of trimming reads, given functions
        find_start and find_end that take a sequence and return the positions
        at which trimming should occur.
    '''
    if find_start is None:
        find_start = lambda seq: 0
    if find_end is None:
        find_end = len

    for read in reads:
        start = find_start(read.seq)
        end = find_end(read.seq)

        left_seq = read.seq[:start]
        left_qual = fastq.sanitize_qual(read.qual[:start])
        right_seq = read.seq[end:]
        right_qual = fastq.sanitize_qual(read.qual[end:])
        if second_time:
            payload_annotation = PayloadAnnotation.from_identifier(read.name)
            annotation = TrimmedTwiceAnnotation(
                retrimmed_left_seq=left_seq,
                retrimmed_left_qual=left_qual,
                retrimmed_right_seq=right_seq,
                retrimmed_right_qual=right_qual,
                **payload_annotation)
        else:
            annotation = PayloadAnnotation(
                original_name=read.name,
                left_seq=left_seq,
                left_qual=left_qual,
                right_seq=right_seq,
                right_qual=right_qual,
            )
        trimmed_read = fastq.Read(
            annotation.identifier,
            read.seq[start:end],
            read.qual[start:end],
        )
        yield trimmed_read
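A hedged usage sketch: trim can remove a 3' adapter by passing only find_end. The adapter-locating logic below is deliberately simplified and hypothetical, not the library's adapter finder:

# Hypothetical find_end: trim from the first occurrence of an adapter prefix.
adapter_prefix = 'AGATCGGA'

def find_end(seq):
    index = seq.find(adapter_prefix)
    return index if index != -1 else len(seq)

print(find_end('ACGTAGATCGGATT'))  # 4: trim from the adapter start
print(find_end('ACGTACGT'))        # 8: no adapter found, keep the whole read

# trimmed_reads = trim(reads, find_end=find_end)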
Example #6
def untrim_reads(trimmed_reads, second_time=False):
    if second_time:
        Annotation = TrimmedTwiceAnnotation
        left_seq_key = 'retrimmed_left_seq'
        left_qual_key = 'retrimmed_left_qual'
        right_seq_key = 'retrimmed_right_seq'
        right_qual_key = 'retrimmed_right_qual'
    else:
        Annotation = PayloadAnnotation
        left_seq_key = 'left_seq'
        left_qual_key = 'left_qual'
        right_seq_key = 'right_seq'
        right_qual_key = 'right_qual'

    for trimmed_read in trimmed_reads:
        annotation = Annotation.from_identifier(trimmed_read.name)
        name = trimmed_read.name
        seq = annotation[left_seq_key] + trimmed_read.seq + annotation[
            right_seq_key]
        qual = annotation[left_qual_key] + trimmed_read.qual + annotation[
            right_qual_key]
        read = fastq.Read(name, seq, qual)
        yield read
Example #7
def align_reads(
    target_fasta_fn,
    reads,
    bam_fn,
    min_path_length=15,
    error_fn='/dev/null',
    alignment_type='overlap',
):
    ''' Aligns reads to targets in target_fasta_fn by Smith-Waterman, storing
    alignments in bam_fn and yielding unaligned reads.
    '''
    targets = {r.name: r.seq for r in fasta.reads(target_fasta_fn)}

    target_names = sorted(targets)
    target_lengths = [len(targets[n]) for n in target_names]
    alignment_sorter = sam.AlignmentSorter(
        target_names,
        target_lengths,
        bam_fn,
    )
    statistics = Counter()

    with alignment_sorter:
        for original_read in reads:
            statistics['input'] += 1

            alignments = []

            rc_read = fastq.Read(
                original_read.name,
                utilities.reverse_complement(original_read.seq),
                original_read.qual[::-1],
            )

            for read, is_reverse in ([original_read, False], [rc_read, True]):
                qual = fastq.decode_sanger(read.qual)
                for target_name, target_seq in targets.items():
                    alignment = generate_alignments(read.seq, target_seq,
                                                    alignment_type)[0]
                    path = alignment['path']
                    if len(path) >= min_path_length and alignment['score'] / (
                            2. * len(path)) > 0.8:
                        aligned_segment = pysam.AlignedSegment()
                        aligned_segment.seq = read.seq
                        aligned_segment.query_qualities = qual
                        aligned_segment.is_reverse = is_reverse

                        char_pairs = make_char_pairs(path, read.seq,
                                                     target_seq)

                        cigar = sam.aligned_pairs_to_cigar(char_pairs)
                        clip_from_start = first_query_index(path)
                        if clip_from_start > 0:
                            cigar = [(sam.BAM_CSOFT_CLIP, clip_from_start)
                                     ] + cigar
                        clip_from_end = len(
                            read.seq) - 1 - last_query_index(path)
                        if clip_from_end > 0:
                            cigar = cigar + [
                                (sam.BAM_CSOFT_CLIP, clip_from_end)
                            ]
                        aligned_segment.cigar = cigar

                        read_aligned, ref_aligned = zip(*char_pairs)
                        md = sam.alignment_to_MD_string(
                            ref_aligned, read_aligned)
                        aligned_segment.set_tag('MD', md)

                        aligned_segment.set_tag('AS', alignment['score'])
                        aligned_segment.tid = alignment_sorter.get_tid(
                            target_name)
                        aligned_segment.query_name = read.name
                        aligned_segment.next_reference_id = -1
                        aligned_segment.reference_start = first_target_index(
                            path)

                        alignments.append(aligned_segment)

            if alignments:
                statistics['aligned'] += 1

                sorted_alignments = sorted(alignments,
                                           key=lambda m: m.get_tag('AS'),
                                           reverse=True)
                grouped = utilities.group_by(sorted_alignments,
                                             key=lambda m: m.get_tag('AS'))
                _, highest_group = next(grouped)
                primary_already_assigned = False
                for alignment in highest_group:
                    if len(highest_group) == 1:
                        alignment.mapping_quality = 2
                    else:
                        alignment.mapping_quality = 1

                    if not primary_already_assigned:
                        primary_already_assigned = True
                    else:
                        alignment.is_secondary = True

                    alignment_sorter.write(alignment)
            else:
                statistics['unaligned'] += 1

                # yield the unaligned read in its original orientation
                yield original_read

        with open(error_fn, 'w') as error_fh:
            for key in ['input', 'aligned', 'unaligned']:
                error_fh.write('{0}: {1:,}\n'.format(key, statistics[key]))
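The bookkeeping at the end of align_reads keeps every alignment tied for the best score, gives a tie a lower mapping quality than a unique best hit, and flags all but one as secondary. A stripped-down sketch of that selection over hypothetical scores:

from itertools import groupby

scores = [38, 42, 42, 17]  # hypothetical alignment scores

ordered = sorted(scores, reverse=True)
best_score, best_group = next((k, list(g)) for k, g in groupby(ordered))
for i, score in enumerate(best_group):
    # Only the first of the tied best alignments stays primary.
    print(score, 'secondary' if i > 0 else 'primary')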
Example #8
                overlap_R2_seq = R2_rc_seq[overlap_R2_slice]
                overlap_R2_qual = R2_rc_qual[overlap_R2_slice]

                just_R2_slice = slice(len(overlap_R1_seq), None)
                just_R2_seq = R2_rc_seq[just_R2_slice]
                just_R2_qual = R2_rc_qual[just_R2_slice]

                overlap_seq = []
                overlap_qual = []
                for R1_s, R1_q, R2_s, R2_q in zip(
                        overlap_R1_seq,
                        overlap_R1_qual,
                        overlap_R2_seq,
                        overlap_R2_qual,
                ):
                    if R1_q > R2_q:
                        s, q = R1_s, R1_q
                    else:
                        s, q = R2_s, R2_q

                    overlap_seq.append(s)
                    overlap_qual.append(q)

                overlap_seq = ''.join(overlap_seq)
                overlap_qual = ''.join(overlap_qual)

                seq = just_R1_seq + overlap_seq + just_R2_seq
                qual = just_R1_qual + overlap_qual + just_R2_qual

                output_fh.write(str(fastq.Read(R1.name, seq, qual)))
Example #9
 def get_R2_rc_reads():
     read_pairs = islice(get_read_pairs(), 100)
     return (fastq.Read(R2.name, utilities.reverse_complement(R2.seq),
                        R2.qual[::-1]) for R1, R2 in read_pairs)
Example #10
    def trim_reads(self, read_pairs):
        total_reads = 0
        long_enough_reads = 0
        trimmed_lengths = Counter()
        barcodes = Counter()

        truncated_in_R1 = self.adapter_in_R1[1:]
        truncated_in_R2 = self.adapter_in_R2[1:]

        for R1, R2 in read_pairs:
            total_reads += 1
            barcodes[R2.seq[:len(self.barcode)]] += 1

            # Check for the odd case in which the expected overhang base is
            # missing in primer dimers.
            R1_dimer_distance = adapters.adapter_hamming_distance(
                R1.seq,
                truncated_in_R1,
                len(R1.seq),
                len(truncated_in_R1),
                len(self.barcode),
            )
            R2_dimer_distance = adapters.adapter_hamming_distance(
                R2.seq,
                truncated_in_R2,
                len(R2.seq),
                len(truncated_in_R2),
                len(self.barcode),
            )
            if R1_dimer_distance <= 3 and R2_dimer_distance <= 3:
                position = len(self.barcode)
            else:
                position = adapters.consistent_paired_position(
                    R1.seq,
                    R2.seq,
                    self.adapter_in_R1,
                    self.adapter_in_R2,
                    19,
                    3,
                )
            if position is not None:
                trimmed_lengths[position] += 1
                if position - len(self.barcode) < 12:
                    continue
            else:
                position = len(R1.seq)

            long_enough_reads += 1

            payload_slice = slice(len(self.barcode), position)

            processed_R1 = fastq.Read(R1.name, R1.seq[payload_slice],
                                      R1.qual[payload_slice])
            processed_R2 = fastq.Read(R2.name, R2.seq[payload_slice],
                                      R2.qual[payload_slice])

            yield processed_R1, processed_R2

        trimmed_lengths = utilities.counts_to_array(trimmed_lengths)
        self.write_file('trimmed_lengths', trimmed_lengths)
        self.write_file('barcodes', barcodes)
        self.summary.extend([
            ('Total read pairs', total_reads),
            ('Long enough', long_enough_reads),
        ])
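adapters.adapter_hamming_distance is library code; the dimer check above amounts to comparing the read, starting just after the barcode, against the truncated adapter and counting mismatches. A simplified sketch (a hypothetical helper, not the library's implementation):

def hamming_prefix_distance(read_seq, adapter, start):
    # Mismatches between read_seq[start:] and the adapter prefix it covers.
    overlap = read_seq[start:start + len(adapter)]
    return sum(r != a for r, a in zip(overlap, adapter))

# A pure primer dimer: the adapter begins right after a 6 nt barcode.
print(hamming_prefix_distance('AATTCC' + 'GATCGGAAGAGC', 'GATCGGAAGAGC', 6))  # 0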
Example #11
def find_boundary_sequences(R1, R2, counters):
    # Find which read in the read pair is from the reverse strand by looking for
    # common_right_reverse.
    # First try to find a unique position entirely contained within R1 or R2
    # that is close to common_right_reverse.
    # Failing this, find the longest of (the longest suffix of R1 or R2 that
    # matches a prefix of common_right_reverse) or (the longest prefix of R1 or
    # R2 that matches a suffix of common_right_reverse).

    R1_contained, R1_prefix, R1_suffix = all_adapter_possibilites(
        R1.seq, common_right_reverse)
    R2_contained, R2_prefix, R2_suffix = all_adapter_possibilites(
        R2.seq, common_right_reverse)

    if len(R1_contained) + len(R2_contained) > 1:
        # Only one occurrence of common_right_reverse should exist between R1
        # and R2.
        return None, None
    elif len(R1_contained) + len(R2_contained) == 0:
        possibilities = [
            (len(common_right_reverse) - R1_prefix, 'R1_prefix'),
            (len(common_right_reverse) - R2_prefix, 'R2_prefix'),
            (len(common_right_reverse) - R1_suffix, 'R1_suffix'),
            (len(common_right_reverse) - R2_suffix, 'R2_suffix'),
        ]
        length, kind = max(possibilities)
        if length > 5:
            if 'R1' in kind:
                reverse_read = R1
                forward_read = R2
                polyA_read = 'R2_forward'
                polyT_read = 'R1_reverse'
            elif 'R2' in kind:
                reverse_read = R2
                forward_read = R1
                polyA_read = 'R1_forward'
                polyT_read = 'R2_reverse'
            if 'prefix' in kind:
                common_right_reverse_start = len(reverse_read.seq) - length
            elif 'suffix' in kind:
                common_right_reverse_start = -length
        else:
            return None, None

    elif len(R1_contained) == 1:
        reverse_read = R1
        forward_read = R2
        polyA_read = 'R2_forward'
        polyT_read = 'R1_reverse'
        common_right_reverse_start = R1_contained.pop()
    elif len(R2_contained) == 1:
        reverse_read = R2
        forward_read = R1
        polyA_read = 'R1_forward'
        polyT_read = 'R2_reverse'
        common_right_reverse_start = R2_contained.pop()

    # '*' means that there was no opportunity to see this id.
    # 'X' means that there was an opportunity and it was neither A nor B.
    right_id = '*'
    left_id = '*'

    five_payload_slice = slice(None, max(0, common_right_reverse_start))
    five_payload_seq = utilities.reverse_complement(
        reverse_read.seq[five_payload_slice])
    five_payload_qual = reverse_read.qual[five_payload_slice][::-1]

    current_p = common_right_reverse_start + len(common_right_reverse)
    if current_p < len(reverse_read.seq) - after_right_length:
        right_id_seq = reverse_read.seq[current_p:current_p +
                                        after_right_length]
        for key, prefix in after_right_prefix.items():
            if right_id_seq == prefix:
                right_id = key
        if right_id == '*':
            right_id = 'X'

        counters['right_ids'][right_id_seq] += 1

        if right_id != 'X':
            current_p += len(after_right[right_id])
            if current_p < len(reverse_read.seq) - 4:
                left_id_seq = reverse_read.seq[current_p:current_p + 4]
                for key, sequence in after_left.items():
                    if left_id_seq == sequence:
                        left_id = key
                if left_id == '*':
                    left_id = 'X'

                counters['left_ids'][left_id_seq] += 1

    polyA_start, polyA_length = find_polyA_cython.find_polyA(
        forward_read.seq, 15)
    polyA_slice = slice(polyA_start, polyA_start + polyA_length)
    polyA_seq = forward_read.seq[polyA_slice]
    polyA_qual = fastq.sanitize_qual(forward_read.qual[polyA_slice])
    three_payload_slice = slice(None, polyA_start)
    three_payload_seq = forward_read.seq[three_payload_slice]
    three_payload_qual = forward_read.qual[three_payload_slice]

    common_name, _ = R1.name.rsplit(':', 1)
    control_ids_string = '{0}-{1}'.format(left_id, right_id)
    five_annotation = trim.PayloadAnnotation(
        original_name=common_name,
        left_seq=control_ids_string,
        left_qual='',
        right_seq='',
        right_qual='',
    )
    three_annotation = trim.PayloadAnnotation(
        original_name=common_name,
        left_seq=control_ids_string,
        left_qual='',
        right_seq=polyA_seq,
        right_qual=polyA_qual,
    )
    five_payload_read = fastq.Read(five_annotation.identifier,
                                   five_payload_seq, five_payload_qual)
    three_payload_read = fastq.Read(three_annotation.identifier,
                                    three_payload_seq, three_payload_qual)

    counters['positions'][polyT_read][max(0, common_right_reverse_start)] += 1
    counters['positions'][polyA_read][polyA_start] += 1
    counters['joint_lengths'][max(0, common_right_reverse_start),
                              polyA_start] += 1
    counters['polyA_lengths'][polyA_length] += 1
    counters['control_ids'][control_ids_string] += 1

    if polyA_length < 13:
        return None, None

    return five_payload_read, three_payload_read
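The fallback described in the comment block at the top of find_boundary_sequences looks for the longest suffix of a read that matches a prefix of common_right_reverse (and the mirror-image case). A minimal sketch of the suffix-to-prefix direction (a hypothetical helper):

def longest_suffix_matching_prefix(read_seq, adapter):
    # Length of the longest suffix of read_seq equal to a prefix of adapter.
    for length in range(min(len(read_seq), len(adapter)), 0, -1):
        if read_seq.endswith(adapter[:length]):
            return length
    return 0

print(longest_suffix_matching_prefix('ACGTACGA', 'GATC'))  # 2 ('GA')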
Example #12
 def trim_read(read):
     trimmed = fastq.Read(read.name,
                          read.seq[num_to_trim:],
                          read.qual[num_to_trim:],
                         )
     return trimmed
Example #13
def extract_reads_from_combined(combined_mapping):
    R1_seq, R1_qual, R2_seq, R2_qual = extract_seqs_from_combined(
        combined_mapping, remove_soft_clipped=False)
    R1 = fastq.Read(combined_mapping.qname, R1_seq, R1_qual)
    R2 = fastq.Read(combined_mapping.qname, R2_seq, R2_qual)
    return R1, R2