def process_barcode_single_end_data(read1_data,
                                    output_bc_fastq,
                                    output_fastq1,
                                    bc1_len=6,
                                    rev_comp_bc1=False):
    """ Splits the barcode off a single-end read and writes both parts.

    read1_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq1: open output fastq reads filepath
    bc1_len: length of barcode to remove from beginning of data
    rev_comp_bc1: reverse complement barcode before writing.
    """
    header = read1_data[0]
    seq = read1_data[1]
    qual = read1_data[2]

    # The first bc1_len bases (and matching qualities) are the barcode.
    bc_seq = seq[:bc1_len]
    bc_qual = qual[:bc1_len]
    if rev_comp_bc1:
        bc_seq = str(DNA(bc_seq).rc())
        # Qualities are reversed (not complemented) to stay aligned.
        bc_qual = bc_qual[::-1]

    output_bc_fastq.write(format_fastq_record(header, bc_seq, bc_qual))
    # Remainder of the read with the barcode stripped off.
    output_fastq1.write(
        format_fastq_record(header, seq[bc1_len:], qual[bc1_len:]))
def extract_reads_from_interleaved(
        input_fp, forward_id, reverse_id, output_dir):
    """Splits an interleaved fastq file into forward and reverse read files.

    input_fp: file path to input
    forward_id: substring identifying forward-read labels
    reverse_id: substring identifying reverse-read labels
    output_dir: file path to the output folder

    Raises ValueError if a label matches neither identifier (or both).
    """
    forward_fp = join(output_dir, "forward_reads.fastq")
    reverse_fp = join(output_dir, "reverse_reads.fastq")
    ffp = open(forward_fp, 'w')
    rfp = open(reverse_fp, 'w')
    # enforce_qual_range=False for consistency with the other parse_fastq
    # calls in this module (do not reject out-of-range quality scores).
    for label, seq, qual in parse_fastq(qiime_open(input_fp), strict=False,
                                        enforce_qual_range=False):
        fastq_string = format_fastq_record(label, seq, qual)
        if forward_id in label:
            ffp.write(fastq_string)
        elif reverse_id in label and forward_id not in label:
            rfp.write(fastq_string)
        else:
            # Close outputs before aborting so partial results are flushed.
            ffp.close()
            rfp.close()
            raise ValueError(
                "One of the input sequences doesn't have either identifier "
                "or it has both.\nLabel: %s\nForward: %s\n Reverse: %s" %
                (label, forward_id, reverse_id))
    ffp.close()
    rfp.close()
def filter_fastq(input_seqs_f, output_seqs_f, seqs_to_keep, negate=False,
                 seqid_f=None):
    """ Write filtered input_seqs to output_seqs_f which contains only seqs_to_keep

    input_seqs can be the output of parse_fasta or parse_fastq
    """
    if seqid_f is None:
        # Membership is tested on the first whitespace-delimited token.
        seqs_to_keep_lookup = {}.fromkeys(
            [seq_id.split()[0] for seq_id in seqs_to_keep])
        if negate:
            def keep_seq(seq_id):
                return seq_id.split()[0] not in seqs_to_keep_lookup
        else:
            def keep_seq(seq_id):
                return seq_id.split()[0] in seqs_to_keep_lookup
    else:
        # Caller supplied a predicate; invert it when negating.
        keep_seq = seqid_f if not negate else (lambda x: not seqid_f(x))

    for seq_id, seq, qual in parse_fastq(input_seqs_f,
                                         enforce_qual_range=False):
        if keep_seq(seq_id):
            output_seqs_f.write(format_fastq_record(seq_id, seq, qual))
    output_seqs_f.close()
def extract_reads_from_interleaved(input_fp, forward_id, reverse_id,
                                   output_dir):
    """Parses a single fastq file and creates forward and reverse read files.

    input_fp: file path to input
    forward_id / reverse_id: substrings identifying forward / reverse labels
    output_dir: file path to the output folder

    Raises ValueError for a label matching neither identifier (or both).
    """
    fwd_out = open(join(output_dir, "forward_reads.fastq"), 'w')
    rev_out = open(join(output_dir, "reverse_reads.fastq"), 'w')

    records = parse_fastq(qiime_open(input_fp), strict=False,
                          enforce_qual_range=False)
    for label, seq, qual in records:
        record = format_fastq_record(label, seq, qual)
        is_forward = forward_id in label
        if is_forward:
            fwd_out.write(record)
        elif reverse_id in label and not is_forward:
            rev_out.write(record)
        else:
            # Flush outputs before reporting the ambiguous/unknown label.
            fwd_out.close()
            rev_out.close()
            raise ValueError(
                "One of the input sequences doesn't have either identifier "
                "or it has both.\nLabel: %s\nForward: %s\n Reverse: %s"
                % (label, forward_id, reverse_id))
    fwd_out.close()
    rev_out.close()
def filter_fastq(input_seqs_f, output_seqs_f, seqs_to_keep, negate=False,
                 seqid_f=None):
    """ Write filtered input_seqs to output_seqs_f which contains only seqs_to_keep

    input_seqs can be the output of parse_fasta or parse_fastq
    """
    if seqid_f is not None:
        # Use the caller-supplied predicate, optionally inverted.
        if negate:
            keep_seq = lambda x: not seqid_f(x)
        else:
            keep_seq = seqid_f
    else:
        # Build a lookup keyed on the first token of each id to keep.
        seqs_to_keep_lookup = {}.fromkeys(
            [seq_id.split()[0] for seq_id in seqs_to_keep])

        def keep_seq(seq_id):
            found = seq_id.split()[0] in seqs_to_keep_lookup
            return not found if negate else found

    for seq_id, seq, qual in parse_fastq(input_seqs_f,
                                         enforce_qual_range=False):
        if keep_seq(seq_id):
            output_seqs_f.write(format_fastq_record(seq_id, seq, qual))
    output_seqs_f.close()
def write_synced_barcodes_fastq(joined_fp, index_fp):
    """Writes new index file based on surviving assembled paired-ends.

    -joined_fp : file path to paired-end assembled fastq file
    -index_fp : file path to index / barcode reads fastq file

    This function iterates through the joined reads file and index file.
    Only those index-reads within the file at index_fp, that have headers
    matching those within the joined-pairs at joined_fp, are written
    to file.

    WARNING: Assumes reads are in the same order in both files,
    except for cases in which the corresponding read in the joined_fp
    file is missing (i.e. pairs failed to assemble).

    Returns the path of the filtered barcodes file that was written.
    """
    # open files (handles normal / gzipped data)
    jh = qiime_open(joined_fp)
    ih = qiime_open(index_fp)

    # base new index file name on joined paired-end file name:
    j_path, ext = os.path.splitext(joined_fp)
    filtered_bc_outfile_path = j_path + '_barcodes.fastq'
    fbc_fh = open(filtered_bc_outfile_path, 'w')

    # Set up iterators
    index_fastq_iter = parse_fastq(ih, strict=False)
    joined_fastq_iter = parse_fastq(jh, strict=False)

    # Write barcodes / index reads that we observed within
    # the joined paired-ends.
    for joined_label, joined_seq, joined_qual in joined_fastq_iter:
        # Use the next() builtin rather than the Python-2-only
        # iterator.next() method (compatible with Python 2.6+ and 3.x).
        index_label, index_seq, index_qual = next(index_fastq_iter)
        # Skip index reads whose joined pair failed to assemble.
        while joined_label != index_label:
            try:
                index_label, index_seq, index_qual = next(index_fastq_iter)
            except StopIteration:
                raise StopIteration(
                    "\n\nReached end of index-reads file" +
                    " before iterating through joined paired-end-reads file!" +
                    " Except for missing paired-end reads that did not survive" +
                    " assembly, your index and paired-end reads files must be in" +
                    " the same order! Also, check that the index-reads and" +
                    " paired-end reads have identical headers. The last joined" +
                    " paired-end ID processed was:\n\'%s\'\n" % (joined_label))
        else:
            fastq_string = format_fastq_record(index_label,
                                               index_seq,
                                               index_qual)
            fbc_fh.write(fastq_string)

    ih.close()
    jh.close()
    fbc_fh.close()

    return filtered_bc_outfile_path
def write_synced_barcodes_fastq(joined_fp, index_fp):
    """Writes new index file based on surviving assembled paired-ends.

    -joined_fp : file path to paired-end assembled fastq file
    -index_fp : file path to index / barcode reads fastq file

    This function iterates through the joined reads file and index file.
    Only those index-reads within the file at index_fp, that have headers
    matching those within the joined-pairs at joined_fp, are written
    to file.

    WARNING: Assumes reads are in the same order in both files,
    except for cases in which the corresponding read in the joined_fp
    file is missing (i.e. pairs failed to assemble).

    Returns the path of the filtered barcodes file that was written.
    """
    # open files (handles normal / gzipped data)
    jh = qiime_open(joined_fp)
    ih = qiime_open(index_fp)

    # base new index file name on joined paired-end file name:
    j_path, ext = os.path.splitext(joined_fp)
    filtered_bc_outfile_path = j_path + '_barcodes.fastq'
    fbc_fh = open(filtered_bc_outfile_path, 'w')

    # Set up iterators
    index_fastq_iter = parse_fastq(ih, strict=False, enforce_qual_range=False)
    joined_fastq_iter = parse_fastq(jh, strict=False,
                                    enforce_qual_range=False)

    # Write barcodes / index reads that we observed within
    # the joined paired-ends.
    for joined_label, joined_seq, joined_qual in joined_fastq_iter:
        # Use the next() builtin rather than the Python-2-only
        # iterator.next() method (compatible with Python 2.6+ and 3.x).
        index_label, index_seq, index_qual = next(index_fastq_iter)
        # Skip index reads whose joined pair failed to assemble.
        while joined_label != index_label:
            try:
                index_label, index_seq, index_qual = next(index_fastq_iter)
            except StopIteration:
                raise StopIteration(
                    "\n\nReached end of index-reads file" +
                    " before iterating through joined paired-end-reads file!" +
                    " Except for missing paired-end reads that did not survive" +
                    " assembly, your index and paired-end reads files must be in" +
                    " the same order! Also, check that the index-reads and" +
                    " paired-end reads have identical headers. The last joined" +
                    " paired-end ID processed was:\n\'%s\'\n" % (joined_label))
        else:
            fastq_string = format_fastq_record(index_label,
                                               index_seq,
                                               index_qual)
            fbc_fh.write(fastq_string)

    ih.close()
    jh.close()
    fbc_fh.close()

    return filtered_bc_outfile_path
def remove_primers(input_fastq, output_fastq, for_primers, rev_primers,
                   ed_tol):
    """Writes reads trimmed to the span between a forward/reverse primer pair.

    input_fastq: path to the input fastq file
    output_fastq: path to the trimmed output fastq file
    for_primers: forward primer sequences
    rev_primers: reverse primer sequences, paired positionally with
        for_primers
    ed_tol: edit-distance tolerance passed to the fuzzy primer searches

    Reads for which no primer pair is located (either search returns -1)
    are dropped.
    """
    with open(input_fastq) as read, open(output_fastq, "w") as out_seqs:
        for label, seq, qual in parse_fastq(read):
            for primer_f, primer_r in zip(for_primers, rev_primers):
                start_slice = editSearchForward(primer_f, seq, ed_tol)
                end_slice = editSearchReverse(primer_r, seq, ed_tol)
                # NOTE(review): there is no break after a successful match,
                # so a read matching several primer pairs is written once
                # per matching pair — confirm this is intended.
                if (start_slice != -1) and (end_slice != -1):
                    curr_seq = seq[start_slice:end_slice]
                    curr_qual = qual[start_slice:end_slice]
                    out_seqs.write(
                        format_fastq_record(label, curr_seq, curr_qual))
def remove_primers(input_fastq, output_fastq, primers):
    """Writes reads with a leading primer match trimmed off.

    input_fastq: path to the input fastq file
    output_fastq: path to the trimmed output fastq file
    primers: compiled regular expression matching the primer(s)

    Reads in which the pattern is not found (or matches only an empty
    prefix) are dropped, matching the original start_slice > 0 check.
    """
    with open(input_fastq) as read, open(output_fastq, "w") as out_seqs:
        for label, seq, qual in parse_fastq(read):
            # Search once and reuse the match object (the original code
            # ran the same regex search twice per read).
            match = primers.search(seq)
            start_slice = int(match.end()) if match else 0
            if start_slice > 0:
                out_seqs.write(format_fastq_record(
                    label, seq[start_slice:], qual[start_slice:]))
def fastq_writer(h, s, q):
    # Format one fastq record (header, sequence, quality) and append it to
    # output_fastq_f — a file handle captured from the enclosing scope.
    output_fastq_f.write(format_fastq_record(h, s, q))
def process_barcode_in_label(read1_data,
                             read2_data,
                             output_bc_fastq,
                             bc1_len=6,
                             bc2_len=6,
                             rev_comp_bc1=False,
                             rev_comp_bc2=False,
                             char_delineator=":"):
    """ Pulls barcodes out of fastq labels, writes output barcodes file.

    read1_data: list of header, read, quality scores
    read2_data: list of header, read, quality scores, False if no read 2.
    output_bc_fastq: open output fastq filepath
    bc1_len: length of barcode to take from the read 1 label
    bc2_len: length of barcode to take from the read 2 label
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    char_delineator: character that immediately precedes the barcode in
        the fastq label.
    """
    header = read1_data[0]

    # Barcode 1 is whatever follows the final delineator in the label.
    try:
        bc1_read = header.split(char_delineator)[-1][0:bc1_len]
    # An IndexError means the character delineator wasn't usable.
    except IndexError:
        raise IndexError("Found sequence lacking character delineator. "
                         "Sequence header %s, character delineator %s" %
                         (header, char_delineator))

    # Fake quality scores; 6 matches the existing all-"F" fake quals.
    bc1_qual = np.ones(len(bc1_read), dtype=np.int8) * 6
    if rev_comp_bc1:
        bc1_read = str(DNA(bc1_read).rc())

    if read2_data:
        bc2_read = read2_data[0].strip().split(
            char_delineator)[-1][0:bc2_len]
        bc2_qual = np.ones(len(bc2_read), dtype=np.int8) * 6
        if rev_comp_bc2:
            bc2_read = str(DNA(bc2_read).rc())
    else:
        # No second read: contribute nothing to the combined barcode.
        bc2_read = ""
        bc2_qual = np.array([], dtype=np.int8)

    if not (bc1_read or bc2_read):
        raise ValueError("Came up with empty barcode sequence, please check "
                         "character delineator with -s, and fastq label "
                         "%s" % header)

    output_bc_fastq.write(
        format_fastq_record(header,
                            bc1_read + bc2_read,
                            np.hstack([bc1_qual, bc2_qual])))
def test_format_fastq_record_phred_offset_33(self):
    # A phred offset of 33 should produce this exact record for self.args.
    expected = b"@abc\ndef\n+\nGHI\n"
    observed = format_fastq_record(*self.args, phred_offset=33)
    self.assertEqual(observed, expected)
def process_barcode_paired_end_data(read1_data,
                                    read2_data,
                                    output_bc_fastq,
                                    output_fastq1,
                                    output_fastq2,
                                    bc1_len=6,
                                    bc2_len=6,
                                    rev_comp_bc1=False,
                                    rev_comp_bc2=False,
                                    attempt_read_orientation=False,
                                    forward_primers=None,
                                    reverse_primers=None,
                                    output_bc_not_oriented=None,
                                    fastq1_out_not_oriented=None,
                                    fastq2_out_not_oriented=None):
    """ Processes, writes paired-end barcode data, parsed sequences

    read1_data: list of header, read, quality scores
    read2_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq1: open output fastq reads 1 filepath
    output_fastq2: open output fastq reads 2 filepath
    bc1_len: length of barcode to remove from beginning of read1 data
    bc2_len: length of barcode to remove from beginning of read2 data
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    attempt_read_orientation: If True, will attempt to orient the reads
     according to the forward primers in the mapping file. If primer is
     detected in current orientation, leave the read as is, but if reverse
     complement is detected (or ReversePrimer is detected in the current
     orientation) the read will either be written to the forward (read 1) or
     reverse (read 2) reads for the case of paired files, or the read will be
     reverse complemented in the case of stitched reads.
    forward_primers: list of regular expression generators, forward primers
    reverse_primers: list of regular expression generators, reverse primers
    output_bc_not_oriented: Barcode output from reads that are not oriented
    fastq1_out_not_oriented: Open filepath to write reads 1 where primers
     can't be found when attempt_read_orientation is True.
    fastq2_out_not_oriented: Open filepath to write reads 2 where primers
     can't be found when attempt_read_orientation is True.
    """
    # Tuple positions of the parsed fastq record.
    header_index = 0
    sequence_index = 1
    quality_index = 2

    found_primer_match = False
    # Break from orientation search as soon as a match is found
    if attempt_read_orientation:
        # First check forward primers
        for curr_primer in forward_primers:
            # Forward primer found in read 1: reads already oriented.
            if curr_primer.search(read1_data[sequence_index]):
                read1 = read1_data
                read2 = read2_data
                found_primer_match = True
                break
            # Forward primer found in read 2: swap reads to orient them.
            if curr_primer.search(read2_data[sequence_index]):
                read1 = read2_data
                read2 = read1_data
                found_primer_match = True
                break
        # Check reverse primers if forward primers not found
        if not found_primer_match:
            for curr_primer in reverse_primers:
                # Reverse primer in read 1 implies reads are flipped: swap.
                if curr_primer.search(read1_data[sequence_index]):
                    read1 = read2_data
                    read2 = read1_data
                    found_primer_match = True
                    break
                # Reverse primer in read 2: current orientation is correct.
                if curr_primer.search(read2_data[sequence_index]):
                    read1 = read1_data
                    read2 = read2_data
                    found_primer_match = True
                    break
    else:
        # Orientation disabled: pass reads through unchanged.
        read1 = read1_data
        read2 = read2_data

    # No primer found anywhere: route the pair to the "not oriented"
    # outputs (in original order) instead of the main outputs.
    if not found_primer_match and attempt_read_orientation:
        read1 = read1_data
        read2 = read2_data
        output_bc = output_bc_not_oriented
        output_read1 = fastq1_out_not_oriented
        output_read2 = fastq2_out_not_oriented
    else:
        output_bc = output_bc_fastq
        output_read1 = output_fastq1
        output_read2 = output_fastq2

    # Split each read into barcode prefix and remaining sequence.
    bc_read1 = read1[sequence_index][0:bc1_len]
    bc_read2 = read2[sequence_index][0:bc2_len]
    bc_qual1 = read1[quality_index][0:bc1_len]
    bc_qual2 = read2[quality_index][0:bc2_len]
    if rev_comp_bc1:
        bc_read1 = str(DNA(bc_read1).rc())
        # Qualities are reversed (not complemented) to stay aligned.
        bc_qual1 = bc_qual1[::-1]
    if rev_comp_bc2:
        bc_read2 = str(DNA(bc_read2).rc())
        bc_qual2 = bc_qual2[::-1]

    # Combined barcode record carries read 1's header.
    bc_lines = format_fastq_record(read1[header_index],
                                   bc_read1 + bc_read2,
                                   np.hstack([bc_qual1, bc_qual2]))
    output_bc.write(bc_lines)
    seq1_lines = format_fastq_record(read1[header_index],
                                     read1[sequence_index][bc1_len:],
                                     read1[quality_index][bc1_len:])
    output_read1.write(seq1_lines)
    seq2_lines = format_fastq_record(read2[header_index],
                                     read2[sequence_index][bc2_len:],
                                     read2[quality_index][bc2_len:])
    output_read2.write(seq2_lines)
    return
def process_barcode_paired_stitched(read_data,
                                    output_bc_fastq,
                                    output_fastq,
                                    bc1_len=6,
                                    bc2_len=6,
                                    rev_comp_bc1=False,
                                    rev_comp_bc2=False,
                                    attempt_read_orientation=False,
                                    forward_primers=None,
                                    reverse_primers=None,
                                    output_bc_not_oriented=None,
                                    fastq_out_not_oriented=None,
                                    switch_bc_order=False):
    """ Processes stitched barcoded reads, writes barcode, parsed stitched read

    read_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq: open output fastq reads filepath
    bc1_len: length of barcode to remove from beginning of read1 stitched data
    bc2_len: length of barcode to remove from end of read2 stitched data
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    attempt_read_orientation: If True, will attempt to orient the reads
     according to the forward primers in the mapping file. If primer is
     detected in current orientation, leave the read as is, but if reverse
     complement is detected (or ReversePrimer is detected in the current
     orientation) the read will either be written to the forward (read 1) or
     reverse (read 2) reads for the case of paired files, or the read will be
     reverse complemented in the case of stitched reads.
    forward_primers: list of regular expression generators, forward primers
    reverse_primers: list of regular expression generators, reverse primers
    output_bc_not_oriented: Barcode output from reads that are not oriented
    fastq_out_not_oriented: Open filepath to write reads where primers
     can't be found when attempt_read_orientation is True.
    switch_bc_order: Normally, barcode 1 will be written first, followed by
     barcode 2 in a combined output fastq file. If True, the order will be
     reversed. Only applies to stitched reads processing, as other barcode
     orders are dictated by the the parameter chosen for the fastq files.
    """
    header_index = 0
    sequence_index = 1
    quality_index = 2

    read_seq = read_data[sequence_index]
    read_qual = read_data[quality_index]

    found_primer_match = False
    # Break from orientation search as soon as a match is found
    if attempt_read_orientation:
        for curr_primer in forward_primers:
            if curr_primer.search(read_data[sequence_index]):
                found_primer_match = True
                break
        if not found_primer_match:
            for curr_primer in reverse_primers:
                if curr_primer.search(read_data[sequence_index]):
                    # Reverse primer found in the current orientation:
                    # flip the read (and reverse the quality scores).
                    read_seq = str(DNA(read_seq).rc())
                    read_qual = read_qual[::-1]
                    found_primer_match = True
                    break

    if not found_primer_match and attempt_read_orientation:
        output_bc = output_bc_not_oriented
        output_read = fastq_out_not_oriented
    else:
        output_bc = output_bc_fastq
        output_read = output_fastq

    # Index where barcode 2 begins.  Using an explicit end index instead
    # of a negative slice fixes the bc2_len == 0 case: previously
    # read_seq[-0:] returned the WHOLE read as barcode 2 and
    # read_seq[bc1_len:-0] produced an EMPTY output read.  For bc2_len > 0
    # the slices are identical to the originals.
    bc2_start = len(read_seq) - bc2_len

    bc_read1 = read_seq[0:bc1_len]
    bc_read2 = read_seq[bc2_start:]
    bc_qual1 = read_qual[0:bc1_len]
    bc_qual2 = read_qual[bc2_start:]
    if rev_comp_bc1:
        bc_read1 = str(DNA(bc_read1).rc())
        bc_qual1 = bc_qual1[::-1]
    if rev_comp_bc2:
        bc_read2 = str(DNA(bc_read2).rc())
        bc_qual2 = bc_qual2[::-1]

    if switch_bc_order:
        bc_read1, bc_read2 = bc_read2, bc_read1
        bc_qual1, bc_qual2 = bc_qual2, bc_qual1

    bc_lines = format_fastq_record(read_data[header_index],
                                   bc_read1 + bc_read2,
                                   np.hstack([bc_qual1, bc_qual2]))
    output_bc.write(bc_lines)
    seq_lines = format_fastq_record(read_data[header_index],
                                    read_seq[bc1_len:bc2_start],
                                    read_qual[bc1_len:bc2_start])
    output_read.write(seq_lines)
    return
def test_format_fastq_record_invalid_phred_offset(self):
    # An unsupported phred offset (42) must be rejected with ValueError.
    self.assertRaises(ValueError, format_fastq_record, *self.args,
                      phred_offset=42)
def test_format_fastq_record_phred_offset_64(self):
    # A phred offset of 64 should produce this exact record for self.args.
    expected = b"@abc\ndef\n+\nfgh\n"
    observed = format_fastq_record(*self.args, phred_offset=64)
    self.assertEqual(observed, expected)
def process_barcode_paired_end_data(read1_data,
                                    read2_data,
                                    output_bc_fastq,
                                    output_fastq1,
                                    output_fastq2,
                                    bc1_len=6,
                                    bc2_len=6,
                                    rev_comp_bc1=False,
                                    rev_comp_bc2=False,
                                    attempt_read_orientation=False,
                                    forward_primers=None,
                                    reverse_primers=None,
                                    output_bc_not_oriented=None,
                                    fastq1_out_not_oriented=None,
                                    fastq2_out_not_oriented=None):
    """ Processes, writes paired-end barcode data, parsed sequences

    read1_data: list of header, read, quality scores
    read2_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq1: open output fastq reads 1 filepath
    output_fastq2: open output fastq reads 2 filepath
    bc1_len: length of barcode to remove from beginning of read1 data
    bc2_len: length of barcode to remove from beginning of read2 data
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    attempt_read_orientation: If True, will attempt to orient the reads
     according to the forward primers in the mapping file. If primer is
     detected in current orientation, leave the read as is, but if reverse
     complement is detected (or ReversePrimer is detected in the current
     orientation) the read will either be written to the forward (read 1) or
     reverse (read 2) reads for the case of paired files, or the read will be
     reverse complemented in the case of stitched reads.
    forward_primers: list of regular expression generators, forward primers
    reverse_primers: list of regular expression generators, reverse primers
    output_bc_not_oriented: Barcode output from reads that are not oriented
    fastq1_out_not_oriented: Open filepath to write reads 1 where primers
     can't be found when attempt_read_orientation is True.
    fastq2_out_not_oriented: Open filepath to write reads 2 where primers
     can't be found when attempt_read_orientation is True.
    """
    # NOTE(review): this is a locally modified ("self_mod by liaoth")
    # variant of the canonical process_barcode_paired_end_data.  It also
    # records the primer match positions (bc1_end / bc2_end) and trims
    # the reads at the primer position when both were found.
    header_index = 0
    sequence_index = 1
    quality_index = 2
    # Primer match start positions; None until a primer is located.
    bc1_end = None
    bc2_end = None

    found_primer_match = False
    # Break from orientation search as soon as a match is found
    if attempt_read_orientation:
        # First check forward primers
        for curr_primer in forward_primers:
            if curr_primer.search(read1_data[sequence_index]):
                read1 = read1_data
                read2 = read2_data
                found_primer_match = True
                # Barcode 1 is assumed to end where the primer starts.
                bc1_end = curr_primer.search(
                    read1_data[sequence_index]).start()  # self_add by liaoth
                break
            if curr_primer.search(read2_data[sequence_index]):
                read1 = read2_data
                read2 = read1_data
                found_primer_match = True
                bc1_end = curr_primer.search(
                    read2_data[sequence_index]).start()  # self_add by liaoth
                break
        # Check reverse primers if forward primers not found
        # NOTE(review): unlike the canonical version (which uses
        # `if not found_primer_match:`), this variant only checks reverse
        # primers AFTER a forward primer matched — presumably requiring
        # both primers before treating the pair as oriented; confirm this
        # inversion is intentional.
        if found_primer_match:
            for curr_primer in reverse_primers:  # self_mod by liaoth
                if curr_primer.search(read1_data[sequence_index]):
                    read1 = read2_data
                    read2 = read1_data
                    found_primer_match = True
                    bc2_end = curr_primer.search(
                        read1_data[sequence_index]).start(
                    )  # self_add by liaoth
                    break
                if curr_primer.search(read2_data[sequence_index]):
                    read1 = read1_data
                    read2 = read2_data
                    found_primer_match = True
                    bc2_end = curr_primer.search(
                        read2_data[sequence_index]).start(
                    )  # self_add by liaoth
                    break
                #if reverse_primers.index(curr_primer) == 1:
                #    import pdb;pdb.set_trace()
                # NOTE(review): clears the flag on every non-matching
                # reverse primer, so the pair only counts as oriented if a
                # reverse primer breaks out of this loop — verify against
                # the author's intent.
                found_primer_match = False
    else:
        read1 = read1_data
        read2 = read2_data

    # No full primer match: route pair (in original order) to the
    # "not oriented" outputs.
    if not found_primer_match and attempt_read_orientation:
        read1 = read1_data
        read2 = read2_data
        output_bc = output_bc_not_oriented
        output_read1 = fastq1_out_not_oriented
        output_read2 = fastq2_out_not_oriented
    else:
        output_bc = output_bc_fastq
        output_read1 = output_fastq1
        output_read2 = output_fastq2

    # NOTE(review): truthiness test — a primer match starting at index 0
    # makes bc1_end/bc2_end equal 0 and falls into the else branch;
    # confirm `is not None` was not intended.
    if bc1_end and bc2_end:  # self_add by liaoth
        #print 'test successed'
        # Barcode taken as the bc_len bases immediately before the primer.
        bc_read1 = read1[sequence_index][
            bc1_end - bc1_len: bc1_end]  # self_add by liaoth
        bc_read2 = read2[sequence_index][
            bc2_end - bc2_len: bc2_end]  # self_add by liaoth
        bc_qual1 = read1[quality_index][
            bc1_end - bc1_len:bc1_end]  # self_add by liaoth
        bc_qual2 = read2[quality_index][
            bc2_end - bc2_len:bc2_end]  # self_add by liaoth
    else:  # self_add by liaoth
        # Fallback: barcode is simply the first bc_len bases of each read.
        bc_read1 = read1[sequence_index][0:bc1_len]
        bc_read2 = read2[sequence_index][0:bc2_len]
        bc_qual1 = read1[quality_index][0:bc1_len]
        bc_qual2 = read2[quality_index][0:bc2_len]
    if rev_comp_bc1:
        bc_read1 = str(DNA(bc_read1).rc())
        # Qualities are reversed (not complemented) to stay aligned.
        bc_qual1 = bc_qual1[::-1]
    if rev_comp_bc2:
        bc_read2 = str(DNA(bc_read2).rc())
        bc_qual2 = bc_qual2[::-1]

    bc_lines = format_fastq_record(read1[header_index],
                                   bc_read1 + bc_read2,
                                   np.hstack([bc_qual1, bc_qual2]))
    output_bc.write(bc_lines)
    if found_primer_match and attempt_read_orientation:  # self_add by liaoth
        # Trim reads at the primer match positions.
        seq1_lines = format_fastq_record(read1[header_index],
                                         read1[sequence_index][bc1_end:],
                                         read1[quality_index][bc1_end:])
        output_read1.write(seq1_lines)
        seq2_lines = format_fastq_record(read2[header_index],
                                         read2[sequence_index][bc2_end:],
                                         read2[quality_index][bc2_end:])
        output_read2.write(seq2_lines)
    else:  # self_add by liaoth
        # Trim reads by the fixed barcode lengths.
        seq1_lines = format_fastq_record(read1[header_index],
                                         read1[sequence_index][bc1_len:],
                                         read1[quality_index][bc1_len:])
        output_read1.write(seq1_lines)
        seq2_lines = format_fastq_record(read2[header_index],
                                         read2[sequence_index][bc2_len:],
                                         read2[quality_index][bc2_len:])
        output_read2.write(seq2_lines)
    return