def process_barcode_single_end_data(read1_data,
                                    output_bc_fastq,
                                    output_fastq1,
                                    bc1_len=6,
                                    rev_comp_bc1=False):
    """ Splits a single-end read into a barcode record and a trimmed read

    The first bc1_len positions of the sequence and of the quality scores
    are written as one fastq record to output_bc_fastq; the remainder is
    written as one fastq record to output_fastq1. Both records reuse the
    header of the input read.

    read1_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath (barcodes)
    output_fastq1: open output fastq reads filepath (trimmed reads)
    bc1_len: length of barcode to remove from beginning of data
    rev_comp_bc1: if True, reverse complement the barcode and reverse its
     quality scores before writing.
    """
    label = read1_data[0]
    bases = read1_data[1]
    scores = read1_data[2]

    barcode = bases[:bc1_len]
    barcode_scores = scores[:bc1_len]
    if rev_comp_bc1:
        # Qualities are reversed so they stay aligned with the
        # reverse-complemented barcode bases.
        barcode = str(DNA(barcode).rc())
        barcode_scores = barcode_scores[::-1]

    output_bc_fastq.write(
        format_fastq_record(label, barcode, barcode_scores))
    output_fastq1.write(
        format_fastq_record(label, bases[bc1_len:], scores[bc1_len:]))
def process_barcode_single_end_data(read1_data,
                                    output_bc_fastq,
                                    output_fastq1,
                                    bc1_len=6,
                                    rev_comp_bc1=False):
    """ Writes the leading barcode and the trimmed read of a single-end read

    read1_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath, receives the barcode record
    output_fastq1: open output fastq reads filepath, receives the read with
     the first bc1_len positions removed
    bc1_len: length of barcode to remove from beginning of data
    rev_comp_bc1: reverse complement barcode (and reverse its quality
     scores) before writing.
    """
    header, sequence, quality = (read1_data[0], read1_data[1],
                                 read1_data[2])

    bc_seq, bc_quality = sequence[:bc1_len], quality[:bc1_len]
    if rev_comp_bc1:
        bc_seq, bc_quality = DNA.rc(bc_seq), bc_quality[::-1]

    # Barcode record first, then the read with the barcode stripped off.
    barcode_record = format_fastq_record(header, bc_seq, bc_quality)
    output_bc_fastq.write(barcode_record)

    trimmed_record = format_fastq_record(header,
                                         sequence[bc1_len:],
                                         quality[bc1_len:])
    output_fastq1.write(trimmed_record)
def test_format_fastq_record(self):
    """ format_fastq_record builds the expected four-line fastq record """
    expected_record = '@test_label\nAATTCCGG\n+\n12345678\n'

    observed_record = format_fastq_record("test_label",
                                          "AATTCCGG",
                                          "12345678")

    self.assertEqual(observed_record, expected_record)
def process_barcode_in_label(read1_data,
                             read2_data,
                             output_bc_fastq,
                             bc1_len=6,
                             bc2_len=6,
                             rev_comp_bc1=False,
                             rev_comp_bc2=False,
                             char_delineator=":"):
    """ Reads data from one or two fastq labels, writes output barcodes file.

    The barcode of a read is the text that follows the last occurrence of
    char_delineator in its label, truncated to the requested length. Quality
    scores are not available in a label, so "F" is written for every
    barcode position.

    read1_data: list of header, read, quality scores
    read2_data: list of header, read, quality scores, False if no read 2.
    output_bc_fastq: open output fastq filepath
    bc1_len: number of characters to keep from the read 1 label barcode
    bc2_len: number of characters to keep from the read 2 label barcode
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    char_delineator: Specify character that immediately precedes the barcode
     for input_type of barcode_in_label.

    Raises IndexError if char_delineator does not occur in a label, and
    ValueError if both barcodes come out empty.
    """
    header_index = 0
    read1_label = read1_data[header_index]

    # str.split() always returns at least one element, so indexing its
    # result can never signal a missing delineator. rpartition() reports
    # that case through an empty separator.
    _, separator, bc1_read = read1_label.rpartition(char_delineator)
    if not separator:
        raise IndexError("Found sequence lacking character delineator. "
                         "Sequence header %s, character delineator %s" %
                         (read1_label, char_delineator))
    bc1_read = bc1_read[0:bc1_len]

    # Create fake quality scores
    bc1_qual = "F" * len(bc1_read)
    if rev_comp_bc1:
        bc1_read = str(DNA(bc1_read).rc())

    if read2_data:
        read2_label = read2_data[header_index].strip()
        _, separator, bc2_read = read2_label.rpartition(char_delineator)
        if not separator:
            raise IndexError("Found sequence lacking character delineator. "
                             "Sequence header %s, character delineator %s" %
                             (read2_label, char_delineator))
        bc2_read = bc2_read[0:bc2_len]
        bc2_qual = "F" * len(bc2_read)
        if rev_comp_bc2:
            bc2_read = str(DNA(bc2_read).rc())
    else:
        bc2_read = ""
        bc2_qual = ""

    if not bc1_read and not bc2_read:
        raise ValueError("Came up with empty barcode sequence, please check "
                         "character delineator with -s, and fastq label "
                         "%s" % read1_label)

    bc_lines = format_fastq_record(read1_label,
                                   bc1_read + bc2_read,
                                   bc1_qual + bc2_qual)
    output_bc_fastq.write(bc_lines)
def process_barcode_paired_stitched(read_data,
                                    output_bc_fastq,
                                    output_fastq,
                                    bc1_len=6,
                                    bc2_len=6,
                                    rev_comp_bc1=False,
                                    rev_comp_bc2=False,
                                    attempt_read_orientation=False,
                                    forward_primers=None,
                                    reverse_primers=None,
                                    output_bc_not_oriented=None,
                                    fastq_out_not_oriented=None,
                                    switch_bc_order=False):
    """ Processes stitched barcoded reads, writes barcode, parsed stitched read

    Barcode 1 is taken from the first bc1_len positions of the stitched read
    and barcode 2 from its last bc2_len positions. The two barcodes are
    written as one combined fastq record, and the read with both barcodes
    removed is written as a second record.

    read_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq: open output fastq reads filepath
    bc1_len: length of barcode to remove from beginning of stitched read
    bc2_len: length of barcode to remove from end of stitched read. A value
     of 0 removes nothing from the end.
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    attempt_read_orientation: If True, the read is searched with the forward
     primers and, if none match, with the reverse primers. A forward primer
     match leaves the read as is; a reverse primer match reverse complements
     the read (and reverses its quality scores). If no primer matches, the
     records are written to the "not oriented" outputs instead.
    forward_primers: iterable of objects offering search(sequence), e.g.
     compiled regular expressions, for the forward primers
    reverse_primers: same as forward_primers, for the reverse primers
    output_bc_not_oriented: Barcode output from reads that are not oriented
    fastq_out_not_oriented: Open filepath to write reads where primers
     can't be found when attempt_read_orientation is True.
    switch_bc_order: Normally, barcode 1 will be written first, followed by
     barcode 2 in a combined output fastq file. If True, the order will be
     reversed.
    """
    header_index = 0
    sequence_index = 1
    quality_index = 2

    read_seq = read_data[sequence_index]
    read_qual = read_data[quality_index]

    found_primer_match = False
    if attempt_read_orientation:
        # any() stops at the first matching primer, like the break it
        # replaces.
        found_primer_match = any(curr_primer.search(read_seq)
                                 for curr_primer in forward_primers)
        if not found_primer_match and any(curr_primer.search(read_seq)
                                          for curr_primer in reverse_primers):
            read_seq = str(DNA(read_seq).rc())
            read_qual = read_qual[::-1]
            found_primer_match = True

    if attempt_read_orientation and not found_primer_match:
        output_bc = output_bc_not_oriented
        output_read = fastq_out_not_oriented
    else:
        output_bc = output_bc_fastq
        output_read = output_fastq

    # Position where barcode 2 starts. A negative slice index cannot be used
    # here: with bc2_len == 0, seq[-0:] is the whole read and seq[x:-0] is
    # empty, which would swap the barcode and the read. Clamping at 0 keeps
    # the previous result when bc2_len exceeds the read length.
    bc2_seq_start = max(len(read_seq) - bc2_len, 0)
    bc2_qual_start = max(len(read_qual) - bc2_len, 0)

    bc_read1 = read_seq[0:bc1_len]
    bc_read2 = read_seq[bc2_seq_start:]
    bc_qual1 = read_qual[0:bc1_len]
    bc_qual2 = read_qual[bc2_qual_start:]

    if rev_comp_bc1:
        bc_read1 = str(DNA(bc_read1).rc())
        bc_qual1 = bc_qual1[::-1]
    if rev_comp_bc2:
        bc_read2 = str(DNA(bc_read2).rc())
        bc_qual2 = bc_qual2[::-1]

    if switch_bc_order:
        bc_read1, bc_read2 = bc_read2, bc_read1
        bc_qual1, bc_qual2 = bc_qual2, bc_qual1

    bc_lines = format_fastq_record(read_data[header_index],
                                   bc_read1 + bc_read2,
                                   bc_qual1 + bc_qual2)
    output_bc.write(bc_lines)

    seq_lines = format_fastq_record(read_data[header_index],
                                    read_seq[bc1_len:bc2_seq_start],
                                    read_qual[bc1_len:bc2_qual_start])
    output_read.write(seq_lines)
def process_barcode_paired_end_data(read1_data,
                                    read2_data,
                                    output_bc_fastq,
                                    output_fastq1,
                                    output_fastq2,
                                    bc1_len=6,
                                    bc2_len=6,
                                    rev_comp_bc1=False,
                                    rev_comp_bc2=False,
                                    attempt_read_orientation=False,
                                    forward_primers=None,
                                    reverse_primers=None,
                                    output_bc_not_oriented=None,
                                    fastq1_out_not_oriented=None,
                                    fastq2_out_not_oriented=None):
    """ Processes, writes paired-end barcode data, parsed sequences

    A barcode is cut from the start of each read. Both barcodes are written
    as one combined fastq record (under the header of the read treated as
    read 1), followed by one record per read with its barcode removed.

    read1_data: list of header, read, quality scores
    read2_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq1: open output fastq reads 1 filepath
    output_fastq2: open output fastq reads 2 filepath
    bc1_len: length of barcode to remove from beginning of read1 data
    bc2_len: length of barcode to remove from beginning of read2 data
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    attempt_read_orientation: If True, primers are searched in both reads to
     decide which read is treated as read 1. A forward primer in read 1, or
     a reverse primer in read 2, keeps the given order; a forward primer in
     read 2, or a reverse primer in read 1, swaps the two reads. Forward
     primers are tried first. If nothing matches, the given order is kept
     and everything is written to the "not oriented" outputs.
    forward_primers: list of regular expression generators, forward primers
    reverse_primers: list of regular expression generators, reverse primers
    output_bc_not_oriented: Barcode output from reads that are not oriented
    fastq1_out_not_oriented: Open filepath to write reads 1 where primers
     can't be found when attempt_read_orientation is True.
    fastq2_out_not_oriented: Open filepath to write reads 2 where primers
     can't be found when attempt_read_orientation is True.
    """
    label_ix, seq_ix, qual_ix = 0, 1, 2

    as_given = (read1_data, read2_data)
    swapped = (read2_data, read1_data)

    pair = as_given
    oriented = False

    if attempt_read_orientation:
        # Stop searching as soon as any primer matches either read.
        for primer in forward_primers:
            if primer.search(read1_data[seq_ix]):
                pair, oriented = as_given, True
                break
            if primer.search(read2_data[seq_ix]):
                pair, oriented = swapped, True
                break

        if not oriented:
            for primer in reverse_primers:
                if primer.search(read1_data[seq_ix]):
                    pair, oriented = swapped, True
                    break
                if primer.search(read2_data[seq_ix]):
                    pair, oriented = as_given, True
                    break

    first, second = pair

    if oriented or not attempt_read_orientation:
        bc_out = output_bc_fastq
        first_out = output_fastq1
        second_out = output_fastq2
    else:
        bc_out = output_bc_not_oriented
        first_out = fastq1_out_not_oriented
        second_out = fastq2_out_not_oriented

    barcode1 = first[seq_ix][:bc1_len]
    barcode2 = second[seq_ix][:bc2_len]
    barcode1_qual = first[qual_ix][:bc1_len]
    barcode2_qual = second[qual_ix][:bc2_len]

    if rev_comp_bc1:
        barcode1 = str(DNA(barcode1).rc())
        barcode1_qual = barcode1_qual[::-1]
    if rev_comp_bc2:
        barcode2 = str(DNA(barcode2).rc())
        barcode2_qual = barcode2_qual[::-1]

    bc_out.write(format_fastq_record(first[label_ix],
                                     barcode1 + barcode2,
                                     barcode1_qual + barcode2_qual))
    first_out.write(format_fastq_record(first[label_ix],
                                        first[seq_ix][bc1_len:],
                                        first[qual_ix][bc1_len:]))
    second_out.write(format_fastq_record(second[label_ix],
                                         second[seq_ix][bc2_len:],
                                         second[qual_ix][bc2_len:]))
def process_barcode_in_label(read1_data,
                             read2_data,
                             output_bc_fastq,
                             bc1_len=6,
                             bc2_len=6,
                             rev_comp_bc1=False,
                             rev_comp_bc2=False,
                             char_delineator=":"):
    """ Reads data from one or two fastq labels, writes output barcodes file.

    For each read the barcode is whatever follows the last char_delineator
    in the label, cut to the requested length. Labels carry no quality
    information, so every barcode position gets the quality character "F".

    read1_data: list of header, read, quality scores
    read2_data: list of header, read, quality scores, False if no read 2.
    output_bc_fastq: open output fastq filepath
    bc1_len: number of characters to keep from the read 1 label barcode
    bc2_len: number of characters to keep from the read 2 label barcode
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    char_delineator: Specify character that immediately precedes the barcode
     for input_type of barcode_in_label.

    Raises IndexError when a label does not contain char_delineator, and
    ValueError when both barcodes are empty.
    """
    header_index = 0
    label1 = read1_data[header_index]

    # Check for char_delineator in the label. This has to be an explicit
    # membership test: split() never returns an empty list, so taking [-1]
    # of its result cannot raise IndexError for a missing delineator.
    if char_delineator not in label1:
        raise IndexError("Found sequence lacking character delineator. "
                         "Sequence header %s, character delineator %s" %
                         (label1, char_delineator))
    bc1_read = label1.split(char_delineator)[-1][0:bc1_len]

    # Create fake quality scores
    bc1_qual = "F" * len(bc1_read)
    if rev_comp_bc1:
        bc1_read = DNA.rc(bc1_read)

    if read2_data:
        label2 = read2_data[header_index].strip()
        if char_delineator not in label2:
            raise IndexError("Found sequence lacking character delineator. "
                             "Sequence header %s, character delineator %s" %
                             (label2, char_delineator))
        bc2_read = label2.split(char_delineator)[-1][0:bc2_len]
        bc2_qual = "F" * len(bc2_read)
        if rev_comp_bc2:
            bc2_read = DNA.rc(bc2_read)
    else:
        bc2_read = ""
        bc2_qual = ""

    if not bc1_read and not bc2_read:
        raise ValueError("Came up with empty barcode sequence, please check "
                         "character delineator with -s, and fastq label "
                         "%s" % label1)

    bc_lines = format_fastq_record(label1,
                                   bc1_read + bc2_read,
                                   bc1_qual + bc2_qual)
    output_bc_fastq.write(bc_lines)
def process_barcode_paired_stitched(read_data,
                                    output_bc_fastq,
                                    output_fastq,
                                    bc1_len=6,
                                    bc2_len=6,
                                    rev_comp_bc1=False,
                                    rev_comp_bc2=False,
                                    attempt_read_orientation=False,
                                    forward_primers=None,
                                    reverse_primers=None,
                                    output_bc_not_oriented=None,
                                    fastq_out_not_oriented=None,
                                    switch_bc_order=False):
    """ Processes stitched barcoded reads, writes barcode, parsed stitched read

    The first bc1_len positions of the stitched read are barcode 1 and the
    last bc2_len positions are barcode 2. One fastq record holding both
    barcodes and one record holding the read without them are written.

    read_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq: open output fastq reads filepath
    bc1_len: length of barcode to remove from beginning of stitched read
    bc2_len: length of barcode to remove from end of stitched read. With 0,
     nothing is removed from the end.
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    attempt_read_orientation: If True, forward primers are searched in the
     read first, then reverse primers. On a forward primer match the read
     is left as is; on a reverse primer match the read is reverse
     complemented and its quality scores reversed. Without any match the
     records go to the "not oriented" outputs.
    forward_primers: iterable of objects offering search(sequence), e.g.
     compiled regular expressions, for the forward primers
    reverse_primers: same as forward_primers, for the reverse primers
    output_bc_not_oriented: Barcode output from reads that are not oriented
    fastq_out_not_oriented: Open filepath to write reads where primers
     can't be found when attempt_read_orientation is True.
    switch_bc_order: Normally, barcode 1 will be written first, followed by
     barcode 2 in a combined output fastq file. If True, the order will be
     reversed.
    """
    header_index = 0
    sequence_index = 1
    quality_index = 2

    read_seq = read_data[sequence_index]
    read_qual = read_data[quality_index]

    found_primer_match = False
    # Break from orientation search as soon as a match is found
    if attempt_read_orientation:
        for curr_primer in forward_primers:
            if curr_primer.search(read_seq):
                found_primer_match = True
                break
        if not found_primer_match:
            for curr_primer in reverse_primers:
                if curr_primer.search(read_seq):
                    read_seq = DNA.rc(read_seq)
                    read_qual = read_qual[::-1]
                    found_primer_match = True
                    break

    if not found_primer_match and attempt_read_orientation:
        output_bc = output_bc_not_oriented
        output_read = fastq_out_not_oriented
    else:
        output_bc = output_bc_fastq
        output_read = output_fastq

    # Start of barcode 2, as a non-negative index. Slicing with -bc2_len
    # breaks for bc2_len == 0, because -0 is 0: seq[-0:] yields the whole
    # read and seq[x:-0] yields nothing. The clamp at 0 reproduces what the
    # negative slice did when bc2_len is longer than the read.
    seq_split = max(len(read_seq) - bc2_len, 0)
    qual_split = max(len(read_qual) - bc2_len, 0)

    bc_read1 = read_seq[0:bc1_len]
    bc_read2 = read_seq[seq_split:]
    bc_qual1 = read_qual[0:bc1_len]
    bc_qual2 = read_qual[qual_split:]

    if rev_comp_bc1:
        bc_read1 = DNA.rc(bc_read1)
        bc_qual1 = bc_qual1[::-1]
    if rev_comp_bc2:
        bc_read2 = DNA.rc(bc_read2)
        bc_qual2 = bc_qual2[::-1]

    if switch_bc_order:
        bc_read1, bc_read2 = bc_read2, bc_read1
        bc_qual1, bc_qual2 = bc_qual2, bc_qual1

    bc_lines = format_fastq_record(read_data[header_index],
                                   bc_read1 + bc_read2,
                                   bc_qual1 + bc_qual2)
    output_bc.write(bc_lines)

    seq_lines = format_fastq_record(read_data[header_index],
                                    read_seq[bc1_len:seq_split],
                                    read_qual[bc1_len:qual_split])
    output_read.write(seq_lines)
def process_barcode_paired_end_data(read1_data,
                                    read2_data,
                                    output_bc_fastq,
                                    output_fastq1,
                                    output_fastq2,
                                    bc1_len=6,
                                    bc2_len=6,
                                    rev_comp_bc1=False,
                                    rev_comp_bc2=False,
                                    attempt_read_orientation=False,
                                    forward_primers=None,
                                    reverse_primers=None,
                                    output_bc_not_oriented=None,
                                    fastq1_out_not_oriented=None,
                                    fastq2_out_not_oriented=None):
    """ Processes, writes paired-end barcode data, parsed sequences

    Cuts a barcode off the start of each read, writes both barcodes as one
    combined fastq record and then writes each read without its barcode.

    read1_data: list of header, read, quality scores
    read2_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq1: open output fastq reads 1 filepath
    output_fastq2: open output fastq reads 2 filepath
    bc1_len: length of barcode to remove from beginning of read1 data
    bc2_len: length of barcode to remove from beginning of read2 data
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    attempt_read_orientation: If True, the reads are searched for primers,
     forward primers first. A forward primer found in read 1 or a reverse
     primer found in read 2 keeps the reads in the given order; a forward
     primer found in read 2 or a reverse primer found in read 1 swaps them.
     When no primer is found the given order is kept and all records are
     written to the "not oriented" outputs.
    forward_primers: list of regular expression generators, forward primers
    reverse_primers: list of regular expression generators, reverse primers
    output_bc_not_oriented: Barcode output from reads that are not oriented
    fastq1_out_not_oriented: Open filepath to write reads 1 where primers
     can't be found when attempt_read_orientation is True.
    fastq2_out_not_oriented: Open filepath to write reads 2 where primers
     can't be found when attempt_read_orientation is True.
    """
    label_ix = 0
    seq_ix = 1
    qual_ix = 2

    first, second = read1_data, read2_data
    oriented = False

    if attempt_read_orientation:
        keep_order = (read1_data, read2_data)
        swap_order = (read2_data, read1_data)
        # Each entry: primers to try, read order if the primer hits read 1,
        # read order if it hits read 2.
        search_plan = ((forward_primers, keep_order, swap_order),
                       (reverse_primers, swap_order, keep_order))

        for primers, if_in_read1, if_in_read2 in search_plan:
            for primer in primers:
                if primer.search(read1_data[seq_ix]):
                    first, second = if_in_read1
                elif primer.search(read2_data[seq_ix]):
                    first, second = if_in_read2
                else:
                    continue
                oriented = True
                break
            # Reverse primers are only consulted when no forward primer hit.
            if oriented:
                break

    use_unoriented_outputs = attempt_read_orientation and not oriented
    if use_unoriented_outputs:
        targets = (output_bc_not_oriented,
                   fastq1_out_not_oriented,
                   fastq2_out_not_oriented)
    else:
        targets = (output_bc_fastq, output_fastq1, output_fastq2)
    barcode_target, first_target, second_target = targets

    first_bc = first[seq_ix][:bc1_len]
    second_bc = second[seq_ix][:bc2_len]
    first_bc_qual = first[qual_ix][:bc1_len]
    second_bc_qual = second[qual_ix][:bc2_len]

    if rev_comp_bc1:
        first_bc = DNA.rc(first_bc)
        first_bc_qual = first_bc_qual[::-1]
    if rev_comp_bc2:
        second_bc = DNA.rc(second_bc)
        second_bc_qual = second_bc_qual[::-1]

    barcode_record = format_fastq_record(first[label_ix],
                                         first_bc + second_bc,
                                         first_bc_qual + second_bc_qual)
    barcode_target.write(barcode_record)

    first_record = format_fastq_record(first[label_ix],
                                       first[seq_ix][bc1_len:],
                                       first[qual_ix][bc1_len:])
    first_target.write(first_record)

    second_record = format_fastq_record(second[label_ix],
                                        second[seq_ix][bc2_len:],
                                        second[qual_ix][bc2_len:])
    second_target.write(second_record)