def test_process_id_map(self):
    """process_id_map should return correct results on small test map"""
    # Header row, one run-description comment, then three sample rows.
    # Two fields of sample "y" are wrapped in double quotes on purpose.
    mapping_text = """#SampleID\tBarcodeSequence\tLinkerPrimerSequence\tX\tDescription
#fake data
x\tAA\tACGT\t3\tsample_x
y\t"AC"\tACGT\t4\t"sample_y"
z\tGG\tACGT\t5\tsample_z"""
    mapping_file = StringIO(mapping_text)
    mapping_file.name = 'test.xls'

    (headers, id_map, description_map, run_description, errors,
     warnings) = process_id_map(mapping_file)

    expected_headers = ['BarcodeSequence', 'LinkerPrimerSequence', 'X']
    expected_id_map = {
        'x': {'BarcodeSequence': 'AA', 'LinkerPrimerSequence': 'ACGT',
              'X': '3'},
        'y': {'BarcodeSequence': 'AC', 'LinkerPrimerSequence': 'ACGT',
              'X': '4'},
        'z': {'BarcodeSequence': 'GG', 'LinkerPrimerSequence': 'ACGT',
              'X': '5'},
    }
    expected_descriptions = {
        'x': 'sample_x',
        'y': 'sample_y',
        'z': 'sample_z',
    }

    self.assertEqual(headers, expected_headers)
    self.assertEqual(id_map, expected_id_map)
    self.assertEqual(description_map, expected_descriptions)
    self.assertEqual(run_description, ['fake data'])
    # A well-formed map must not produce any diagnostics.
    self.assertEqual(errors, [])
    self.assertEqual(warnings, [])
def test_process_id_map(self):
    """process_id_map should return correct results on small test map"""
    # One record per line: header, run description comment, three samples.
    rows = [
        '#SampleID\tBarcodeSequence\tLinkerPrimerSequence\tX\tDescription',
        '#fake data',
        'x\tAA\tACGT\t3\tsample_x',
        'y\t"AC"\tACGT\t4\t"sample_y"',
        'z\tGG\tACGT\t5\tsample_z',
    ]
    handle = StringIO('\n'.join(rows))
    handle.name = 'test.xls'

    parsed = process_id_map(handle)
    headers, id_map, description_map, run_description, errors, warnings = \
        parsed

    self.assertEqual(
        headers, ['BarcodeSequence', 'LinkerPrimerSequence', 'X'])
    self.assertEqual(id_map, {
        'y': {'X': '4', 'LinkerPrimerSequence': 'ACGT',
              'BarcodeSequence': 'AC'},
        'x': {'X': '3', 'LinkerPrimerSequence': 'ACGT',
              'BarcodeSequence': 'AA'},
        'z': {'X': '5', 'LinkerPrimerSequence': 'ACGT',
              'BarcodeSequence': 'GG'},
    })
    self.assertEqual(
        description_map,
        {'x': 'sample_x', 'y': 'sample_y', 'z': 'sample_z'})
    self.assertEqual(run_description, ['fake data'])
    # Neither errors nor warnings are expected for this input.
    self.assertEqual(errors, [])
    self.assertEqual(warnings, [])
def add_qiime_labels(mapping_f,
                     fasta_dir,
                     filename_column,
                     output_dir=".",
                     count_start=0):
    """ Combine a directory of fasta files into one file with QIIME labels

    mapping_f: open file object of the metadata mapping file
    fasta_dir: Directory of fasta files to combine into a single file
    filename_column: Column of metadata mapping file containing fasta
     filenames
    output_dir: Directory to write output combined file to
    count_start: Number to start enumeration of fasta labels with
    """
    # Barcode and primer validation are switched off when parsing the map.
    # Only the headers and the mapping rows are used afterwards.
    parse_options = dict(has_barcodes=False,
                         disable_primer_check=True,
                         added_demultiplex_field=None,
                         variable_len_barcodes=False)
    headers, mapping_data, _run_description, _errors, _warnings = \
        process_id_map(mapping_f, **parse_options)

    fasta_name_to_sample_id = check_mapping_data(mapping_data, headers,
                                                 filename_column)

    wanted_names = fasta_name_to_sample_id.keys()
    fasta_files = get_fasta_fps(fasta_dir, wanted_names)

    write_combined_fasta(fasta_name_to_sample_id, fasta_files, output_dir,
                         counter=count_start)
def get_mapping_details(mapping_fp):
    """ Returns SampleIDs, Barcodes, Primer seqs from mapping file

    mapping_fp: filepath to mapping file

    Returns a 3-tuple of sets: (sample IDs, barcode sequences, linker
    primer sequences as returned by expand_degeneracies).

    Raises ValueError if process_id_map reports any errors.
    """
    # The context manager closes the file even if parsing raises.
    # NOTE(review): the "U" mode flag is kept for universal newlines on
    # Python 2; it is rejected by Python 3.11+ -- confirm target version.
    with open(mapping_fp, "U") as mapping_f:
        # Only using the id_map and the errors from parsing the mapping
        # file.
        hds, mapping_data, run_description, errors, warnings = \
            process_id_map(mapping_f)

    # Errors means problems with SampleIDs or headers
    if errors:
        # Call-style raise is valid on both Python 2 and Python 3.
        raise ValueError('Error in mapping file, please validate ' +
                         'mapping file with check_id_map.py')

    # create dict of dicts with SampleID:{each header:mapping data}
    # Explicit indexing keeps the IndexError for rows shorter than the
    # header list.
    id_map = {}
    for curr_data in mapping_data:
        id_map[curr_data[0]] = {
            header: curr_data[index] for index, header in enumerate(hds)}

    barcode_seqs = set()
    raw_linkerprimer_seqs = set()
    for curr_id in id_map:
        barcode_seqs.add(id_map[curr_id]['BarcodeSequence'])
        raw_linkerprimer_seqs.add(id_map[curr_id]['LinkerPrimerSequence'])

    # Duplicates are already removed by collecting into sets above.
    linker_primer_seqs = expand_degeneracies(raw_linkerprimer_seqs)

    return set(id_map), barcode_seqs, set(linker_primer_seqs)
def get_rev_primer_seqs(mapping_fp):
    """ Build a dictionary of SampleID:[reverse-complemented rev primers]

    mapping_fp: mapping filepath

    Raises ValueError on duplicate SampleIDs or invalid DNA sequences
    reported by process_id_map, and KeyError when the mapping file has no
    'ReversePrimer' column.
    """
    hds, mapping_data, run_description, errors, warnings = process_id_map(
        mapping_fp, has_barcodes=False, disable_primer_check=True)

    # Duplicate SampleIDs are fatal at this point; other errors are
    # examined further below.
    if errors and any(err.startswith("Duplicate SampleID")
                      for err in errors):
        raise ValueError('Errors were found with mapping file, ' +
                         'please run validate_mapping_file.py to ' +
                         'identify problems.')

    # SampleID -> {header: value} for every row of the mapping data
    id_map = {}
    for row in mapping_data:
        id_map[row[0]] = {name: row[col] for col, name in enumerate(hds)}

    reverse_primers = {}
    for sample_id, fields in id_map.items():
        try:
            # The column may hold several comma-separated primers.
            rc_primers = []
            for primer in fields['ReversePrimer'].split(','):
                rc_primers.append(str(DNA(primer).rc()))
            reverse_primers[sample_id] = rc_primers
        except KeyError:
            raise KeyError("Reverse primer not found in mapping file, " +
                           "please include a 'ReversePrimer' column.")

    # Check for valid reverse primers
    # Invalid sequences are looked up in the errors list from parsing.
    if any(err.startswith("Invalid DNA sequence detected")
           for err in errors):
        raise ValueError(
            "Problems found with reverse primers, please " +
            "check mapping file with validate_mapping_file.py")

    return reverse_primers
def get_rev_primer_seqs(mapping_fp):
    """ Parses mapping file to get dictionary of SampleID:Rev primer

    mapping_fp: mapping filepath

    Each value is a list with one DNA.rc(...) result per comma-separated
    entry of the sample's 'ReversePrimer' field.

    Raises ValueError on duplicate SampleIDs or invalid DNA sequences
    reported by process_id_map, and KeyError when the 'ReversePrimer'
    column is missing.
    """
    hds, mapping_data, run_description, errors, warnings = \
        process_id_map(mapping_fp, has_barcodes=False,
                       disable_primer_check=True)

    if errors:
        for curr_err in errors:
            if curr_err.startswith("Duplicate SampleID"):
                # Call-style raise: valid on Python 2 and Python 3.
                raise ValueError('Errors were found with mapping file, ' +
                                 'please run check_id_map.py to identify '
                                 'problems.')

    # create dict of dicts with SampleID:{each header:mapping data}
    id_map = {}
    for curr_data in mapping_data:
        id_map[curr_data[0]] = {}
    for header in range(len(hds)):
        for curr_data in mapping_data:
            id_map[curr_data[0]][hds[header]] = curr_data[header]

    reverse_primers = {}
    for curr_id in id_map.keys():
        try:
            reverse_primers[curr_id] = \
                [DNA.rc(curr_rev_primer) for curr_rev_primer in
                 id_map[curr_id]['ReversePrimer'].split(',')]
        except KeyError:
            raise KeyError("Reverse primer not found in mapping file, " +
                           "please include a 'ReversePrimer' column.")

    # Check for valid reverse primers
    # These are looked up in the errors list returned by process_id_map.
    for curr_err in errors:
        if curr_err.startswith("Invalid DNA sequence detected"):
            raise ValueError("Problems found with reverse primers, please " +
                             "check mapping file with check_id_map.py")

    return reverse_primers
def check_map(mapping_file,
              barcode_type="golay_12",
              added_demultiplex_field=None):
    """ Gets header, mapping data, halts execution if there are errors

    mapping_file: list of lines of metadata mapping file
    barcode_type: Specified barcode, can be golay_12, hamming_8,
     variable_length, or an integer specifying length.
    added_demultiplex_field: Uses data supplied in metadata mapping field
     and demultiplexes according to data in fasta labels.

    Returns (header, mapping_data) as produced by process_id_map.

    Raises ValueError if barcodes of differing length are detected or if
    process_id_map reports any errors.
    """
    # A barcode_type of 0 means no barcodes at all; 'variable_length' is
    # the only type that turns on variable length handling.
    has_barcodes = barcode_type != 0
    var_len_barcodes = barcode_type == 'variable_length'

    header, mapping_data, run_description, errors, warnings = \
        process_id_map(mapping_file, has_barcodes=has_barcodes,
                       disable_primer_check=True,
                       added_demultiplex_field=added_demultiplex_field,
                       variable_len_barcodes=var_len_barcodes)

    # Need to specifically detect varied length barcodes, otherwise won't
    # know how much of sequence to slice off for barcode reads
    for warning in warnings:
        if "differs than length" in warning:
            # Call-style raise: valid on Python 2 and Python 3.
            raise ValueError("Detected variable length barcodes, if these " +
                             "are being used, use -b variable_length")

    # Halt on errors, as these are serious problems with mapping file.
    # These include non-DNA characters in the barcodes, duplicate
    # barcodes or duplicate barcodes/added demultiplex fields, duplicate
    # SampleIDs, or header problems.
    if errors:
        raise ValueError("Errors found in mapping file, please check " +
                         "mapping file with check_id_map.py")

    return header, mapping_data
def check_map(infile, has_barcodes=True, disable_primer_check=False):
    """Parse the mapping file and collect its barcodes and primers.

    Returns a 7-tuple: headers, id_map, a dict from upper-cased barcode to
    sample ID, warnings, errors, a dict from barcode to
    {expanded primer: length}, and a dict of every expanded primer to its
    length. The two primer dicts stay empty when disable_primer_check is
    True.
    """
    parsed = process_id_map(infile, is_barcoded=has_barcodes,
                            disable_primer_check=disable_primer_check)
    hds, id_map, dsp, run_description, errors, warnings = parsed

    barcode_to_sample_id = {}
    primer_seqs_lens = {}
    all_primers = {}

    for sample_id, sample in id_map.items():
        barcode = sample['BarcodeSequence']
        barcode_to_sample_id[barcode.upper()] = sample_id

        if disable_primer_check:
            continue

        # Expand the degenerate linker/primer into its concrete sequences
        # and record the length of each of them.
        raw_primer = sample['LinkerPrimerSequence'].upper()
        curr_bc_primers = {
            primer: len(primer)
            for primer in expand_degeneracies(raw_primer)}
        all_primers.update(curr_bc_primers)
        # Keyed by the barcode exactly as written in the mapping file.
        primer_seqs_lens[barcode] = curr_bc_primers

    return (hds, id_map, barcode_to_sample_id, warnings, errors,
            primer_seqs_lens, all_primers)
forward_primers = [] reverse_primers = [] for curr_primer in raw_forward_primers: forward_primers.append( compile(''.join([iupac[symbol] for symbol in curr_primer]))) for curr_primer in raw_reverse_primers: reverse_primers.append( compile(''.join([iupac[symbol] for symbol in curr_primer]))) return forward_primers, reverse_primers map_fp = open(argv[1], "U") header, mapping_data, run_description, errors, warnings = process_id_map( map_fp) forward_primers, reverse_primers = get_primers(header, mapping_data) seqs = open(argv[2], "U") out_seqs = open(argv[3], "w") log_out = open(argv[4], "w") f_count = 0 r_count = 0 no_seq_left = 0 for label, seq in MinimalFastaParser(seqs): start_slice = 0 end_slice = -1 for curr_primer in forward_primers:
def get_mapping_details(mapping_fp,
                        suppress_barcode_checks=False,
                        suppress_primer_checks=False):
    """ Returns SampleIDs, Barcodes, Primer seqs from mapping file

    mapping_fp: filepath to mapping file
    suppress_barcode_checks=If True, will skip getting barcodes from mapping
     file and searching for these in sequences.
    suppress_primer_checks=If True, will skip getting primers from mapping
     file and searching for these in sequences

    Returns a 3-tuple of sets: (sample IDs, barcode sequences, linker
    primer sequences as returned by expand_degeneracies).

    Raises ValueError for header errors, duplicate SampleIDs, and for
    barcode or primer errors unless the matching suppress flag is set.
    """
    # The context manager closes the file even if parsing raises.
    # NOTE(review): the "U" mode flag is rejected by Python 3.11+ --
    # confirm the interpreter version this module targets.
    with open(mapping_fp, "U") as mapping_f:
        # Only using the id_map and the errors from parsing the mapping
        # file.
        hds, mapping_data, run_description, errors, warnings = \
            process_id_map(mapping_f)

    # Should raise errors for barcodes or primers unless suppressed, and
    # should raise errors for headers or duplicate SampleIDs in any case.
    # Barcode and primer errors are recognised by the ",1" / ",2" suffix of
    # the error string.
    loc_bcs = ",1"
    loc_primers = ",2"
    fatal_msg = ('Error in mapping file, please validate '
                 'mapping file with validate_mapping_file.py')
    if errors:
        for curr_error in errors:
            # Halt when header has error
            if curr_error.startswith("Found header field"):
                raise ValueError(fatal_msg)
            elif curr_error.endswith(loc_bcs):
                # Halt for barcode errors unless suppressed
                if not suppress_barcode_checks:
                    raise ValueError(fatal_msg)
            elif curr_error.endswith(loc_primers):
                # Halt for primer errors unless suppressed
                if not suppress_primer_checks:
                    raise ValueError(fatal_msg)
            # Raise error on duplicate sample IDs
            elif curr_error.startswith("Duplicate SampleID"):
                raise ValueError(fatal_msg)

    # create dict of dicts with SampleID:{each header:mapping data}
    # Explicit indexing keeps the IndexError for rows shorter than the
    # header list.
    id_map = {}
    for curr_data in mapping_data:
        id_map[curr_data[0]] = {
            header: curr_data[index] for index, header in enumerate(hds)}

    barcode_seqs = set()
    raw_linkerprimer_seqs = set()
    for curr_id in id_map:
        if not suppress_barcode_checks:
            barcode_seqs.add(id_map[curr_id]['BarcodeSequence'])
        if not suppress_primer_checks:
            raw_linkerprimer_seqs.add(
                id_map[curr_id]['LinkerPrimerSequence'])

    # Duplicates are already removed by collecting into sets above.
    linker_primer_seqs = expand_degeneracies(raw_linkerprimer_seqs)

    return set(id_map), barcode_seqs, set(linker_primer_seqs)
def get_mapping_details(mapping_fp,
                        suppress_barcode_checks=False,
                        suppress_primer_checks=False):
    """ Returns SampleIDs, Barcodes, Primer seqs from mapping file

    mapping_fp: filepath to mapping file
    suppress_barcode_checks=If True, will skip getting barcodes from mapping
     file and searching for these in sequences.
    suppress_primer_checks=If True, will skip getting primers from mapping
     file and searching for these in sequences

    Returns a 3-tuple of sets: (sample IDs, barcode sequences, linker
    primer sequences as returned by expand_degeneracies).

    Raises ValueError for header errors, duplicate SampleIDs, and for
    barcode or primer errors unless the matching suppress flag is set.
    """
    # Using a with-block guarantees the handle is released when
    # process_id_map raises.
    # NOTE(review): the "U" mode flag is rejected by Python 3.11+ --
    # confirm the interpreter version this module targets.
    with open(mapping_fp, "U") as mapping_f:
        # Only using the id_map and the errors from parsing the mapping
        # file.
        hds, mapping_data, run_description, errors, warnings = \
            process_id_map(mapping_f)

    # Should raise errors for barcodes or primers unless suppressed, and
    # should raise errors for headers or duplicate SampleIDs in any case.
    # Barcode and primer errors are recognised by the ",1" / ",2" suffix of
    # the error string.
    loc_bcs = ",1"
    loc_primers = ",2"
    fatal_msg = ('Error in mapping file, please validate '
                 'mapping file with validate_mapping_file.py')
    if errors:
        for curr_error in errors:
            # Halt when header has error
            if curr_error.startswith("Found header field"):
                raise ValueError(fatal_msg)
            elif curr_error.endswith(loc_bcs):
                # Halt for barcode errors unless suppressed
                if not suppress_barcode_checks:
                    raise ValueError(fatal_msg)
            elif curr_error.endswith(loc_primers):
                # Halt for primer errors unless suppressed
                if not suppress_primer_checks:
                    raise ValueError(fatal_msg)
            # Raise error on duplicate sample IDs
            elif curr_error.startswith("Duplicate SampleID"):
                raise ValueError(fatal_msg)

    # create dict of dicts with SampleID:{each header:mapping data}
    # Indexing (rather than zip) still raises IndexError on short rows.
    id_map = {}
    for curr_data in mapping_data:
        id_map[curr_data[0]] = {
            header: curr_data[index] for index, header in enumerate(hds)}

    barcode_seqs = set()
    raw_linkerprimer_seqs = set()
    for curr_id in id_map:
        if not suppress_barcode_checks:
            barcode_seqs.add(id_map[curr_id]['BarcodeSequence'])
        if not suppress_primer_checks:
            raw_linkerprimer_seqs.add(
                id_map[curr_id]['LinkerPrimerSequence'])

    # Collecting into sets has already removed duplicates.
    linker_primer_seqs = expand_degeneracies(raw_linkerprimer_seqs)

    return set(id_map), barcode_seqs, set(linker_primer_seqs)
def extract_barcodes(fastq1,
                     fastq2=None,
                     output_dir=".",
                     input_type="barcode_single_end",
                     bc1_len=6,
                     bc2_len=6,
                     rev_comp_bc1=False,
                     rev_comp_bc2=False,
                     char_delineator=":",
                     switch_bc_order=False,
                     map_fp=None,
                     attempt_read_orientation=False,
                     disable_header_match=False):
    """ Main program function for extracting barcodes from reads

    fastq1: Open fastq file 1.
    fastq2: None or open fastq file 2.
    output_dir: Directory to write output parses sequences to.
    input_type: Specifies the type of parsing to be done.
    bc1_len: Length of barcode 1 to be parsed from fastq1
    bc2_len: Length of barcode 2 to be parsed from fastq2, or from end of a
     stitched read.
    rev_comp_bc1: If True, reverse complement bc1 before writing.
    rev_comp_bc2: If True, reverse complement bc2 before writing.
    char_delineator: Specify character that immediately precedes the barcode
        for input_type of barcode_in_label.
    switch_bc_order: Normally, barcode 1 will be written first, followed by
        barcode 2 in a combined output fastq file. If True, the order will be
        reversed. Only applies to stitched reads processing, as other barcode
        orders are dictated by the parameter chosen for the fastq files.
    map_fp: open file object of mapping file, requires a LinkerPrimerSequence
        and ReversePrimer field to be present. Used for orienting reads.
    attempt_read_orientation: If True, will attempt to orient the reads
        according to the forward primers in the mapping file. If primer is
        detected in current orientation, leave the read as is, but if reverse
        complement is detected (or ReversePrimer is detected in the current
        orientation) the read will either be written to the forward (read 1) or
        reverse (read 2) reads for the case of paired files, or the read will be
        reverse complemented in the case of stitched reads.
    disable_header_match: if True, suppresses checks between fastq headers.

    Output files are written with an ".incomplete" suffix and renamed to
    their final names once all reads have been processed.

    Raises FastqParseError when header checks are enabled and the headers
    of a read pair do not match.
    """

    # Turn off extra file creation for single read.
    if input_type == "barcode_single_end" and attempt_read_orientation:
        attempt_read_orientation = False
    if attempt_read_orientation:
        header, mapping_data, run_description, errors, warnings =\
            process_id_map(map_fp)
        forward_primers, reverse_primers = get_primers(header, mapping_data)
        output_bc_not_oriented = open(
            join(output_dir, "barcodes_not_oriented.fastq.incomplete"), "w")
        fastq1_out_not_oriented = open(
            join(output_dir, "reads1_not_oriented.fastq.incomplete"), "w")
        fastq2_out_not_oriented = open(
            join(output_dir, "reads2_not_oriented.fastq.incomplete"), "w")
    else:
        forward_primers = None
        reverse_primers = None
        output_bc_not_oriented = None
        fastq1_out_not_oriented = None
        fastq2_out_not_oriented = None

    output_bc_fastq = open(join(output_dir, "barcodes.fastq.incomplete"), "w")
    if input_type in ["barcode_single_end", "barcode_paired_stitched"]:
        output_fastq1 = open(join(output_dir, "reads.fastq.incomplete"), "w")
        output_fastq2 = None
        final_fastq1_name = join(output_dir, "reads.fastq")
    elif input_type in ["barcode_paired_end"]:
        output_fastq1 = open(join(output_dir, "reads1.fastq.incomplete"), "w")
        output_fastq2 = open(join(output_dir, "reads2.fastq.incomplete"), "w")
        final_fastq1_name = join(output_dir, "reads1.fastq")
    else:
        # No read output for other input types; final_fastq1_name is only
        # needed when output_fastq1 is set.
        output_fastq1 = None
        output_fastq2 = None

    if not fastq2:
        # Endless placeholder record so the pairwise loop below can run
        # when only one fastq file was supplied.
        fastq2 = cycle(["@", "AAAAAAAAAAAA", "+", "AAAAAAAAAAAA"])
        not_paired = True
    else:
        not_paired = False

    # get_casava_version supplies the function used to compare headers.
    check_header_match_f = get_casava_version(fastq1)

    header_index = 0

    for read1_data, read2_data in izip(
            parse_fastq(fastq1, strict=False, enforce_qual_range=False),
            parse_fastq(fastq2, strict=False, enforce_qual_range=False)):
        if not disable_header_match:
            if not check_header_match_f(read1_data[header_index],
                                        read2_data[header_index]):
                raise FastqParseError(
                    "Headers of read1 and read2 do not match. Can't continue. "
                    "Confirm that the fastq sequences that you are "
                    "passing match one another. --disable_header_match can be "
                    "used to suppress header checks.")

        if input_type == "barcode_single_end":
            process_barcode_single_end_data(read1_data, output_bc_fastq,
                                            output_fastq1, bc1_len,
                                            rev_comp_bc1)

        elif input_type == "barcode_paired_end":
            process_barcode_paired_end_data(
                read1_data, read2_data, output_bc_fastq, output_fastq1,
                output_fastq2, bc1_len, bc2_len, rev_comp_bc1, rev_comp_bc2,
                attempt_read_orientation, forward_primers, reverse_primers,
                output_bc_not_oriented, fastq1_out_not_oriented,
                fastq2_out_not_oriented)

        elif input_type == "barcode_paired_stitched":
            process_barcode_paired_stitched(
                read1_data, output_bc_fastq, output_fastq1, bc1_len, bc2_len,
                rev_comp_bc1, rev_comp_bc2, attempt_read_orientation,
                forward_primers, reverse_primers, output_bc_not_oriented,
                fastq1_out_not_oriented, switch_bc_order)

        elif input_type == "barcode_in_label":
            if not_paired:
                curr_read2_data = False
            else:
                curr_read2_data = read2_data
            process_barcode_in_label(read1_data, curr_read2_data,
                                     output_bc_fastq, bc1_len, bc2_len,
                                     rev_comp_bc1, rev_comp_bc2,
                                     char_delineator)

    output_bc_fastq.close()
    rename(output_bc_fastq.name, join(output_dir, "barcodes.fastq"))
    if output_fastq1:
        output_fastq1.close()
        rename(output_fastq1.name, final_fastq1_name)
    if output_fastq2:
        output_fastq2.close()
        rename(output_fastq2.name, join(output_dir, "reads2.fastq"))

    # Close the "not oriented" outputs before renaming them so buffered
    # data is flushed and no open handle is left on the renamed file.
    not_oriented_outputs = (
        (output_bc_not_oriented, "barcodes_not_oriented.fastq"),
        (fastq1_out_not_oriented, "reads1_not_oriented.fastq"),
        (fastq2_out_not_oriented, "reads2_not_oriented.fastq"),
    )
    for out_handle, final_name in not_oriented_outputs:
        if out_handle:
            out_handle.close()
            rename(out_handle.name, join(output_dir, final_name))
def run_demultiplex_and_trim(self, opts, **kwargs):
    """
    The main part of the script that pulls all the various manipulations
    together.

    opts: parsed command line options; opts.m is passed to process_id_map
        as the metadata file. The process exits if opts is falsy.
    **kwargs: currently only inspected for the 'gzipFilename' key, which
        changes the informational log message and nothing else.

    For each (R1, R2) read pair the primers are searched for, the pair is
    screened, and the reads are either clipped and written or recorded as
    'discarded' through self.record_buffer_and_writer. Summary counts are
    logged at the end and the IDs in self.alternate_orientation are written
    to "alternate_orientation_records.txt" in the working directory.
    """
    import logging
    self.logger = logging.getLogger('demultip')
    sample_primer_dict = {}

    if not opts:
        sys.exit("command line options not getting to main method")

    metafile = opts.m

    # extract .gz to temp file location
    if 'gzipFilename' in kwargs:
        self.logger.info("Incoming kwargs detected...gzip file?")
    else:
        self.logger.info("No kwargs, normal Fastq file")

    self.logger.info("processing {0} total sequences".format(
        str((self.r1_tot + self.r2_tot) / 4)))
    self.logger.info(
        "using the first {0} bases of primer in search".format(
            self.search_length))

    # extract the relevant data from the metadata file, can maybe change
    # this to non-qiime1
    self.logger.info("Getting header and mapping data...")
    header, mapping_data, run_description, errors, warnings = \
        process_id_map(metafile)

    self.logger.debug("metadata headers {0}".format(header))
    self.logger.debug("csv mapping data from {0}...\n{1}".format(
        metafile, "\n".join([str(x) for x in mapping_data])))

    # get the primer regex search patterns
    self.logger.info("Generating regex search patterns...")
    (forward_primers, forward_primers_rc, reverse_primers,
     reverse_primers_rc) = self.create_primer_regex_patterns(
         header, mapping_data)

    self.primer_pattern_dict_list = {
        'fp': forward_primers,
        'fprc': forward_primers_rc,
        'rp': reverse_primers,
        'rprc': reverse_primers_rc
    }

    self.logger.debug("forward_primer patterns\n{0}\n".format("\n".join([
        str(x.pattern) for x in self.primer_pattern_dict_list.get('fp')
    ])))
    self.logger.debug("reverse_primers patterns\n{0}\n".format("\n".join([
        str(x.pattern) for x in self.primer_pattern_dict_list.get('rp')
    ])))

    # replace all extra characters in the sample ID with underscore
    intab = '.-+|=:;,&$'
    outtab = '__________'
    trantab = maketrans(intab, outtab)

    for samples in mapping_data:
        try:
            sample_primer_dict[samples[header.index('SampleID')].translate(
                trantab)] = (samples[header.index('LinkerPrimerSequence')],
                             samples[header.index('ReversePrimer')])
        except Exception as e:
            # Best effort: a missing field is logged and the row skipped.
            self.logger.error(
                "Can not find {0} in header fields, please make sure metadata file has the required fields"
                .format(e))

    # Join the formatted items before calling format(); passing the bare
    # generator would only log the generator's repr.
    self.logger.debug("sample_primer_dict...{0}".format(
        "\n".join(str(x) for x in sample_primer_dict.items())))

    self.logger.info("Starting demultiplex process...")

    bar = progressbar.ProgressBar(max_value=(self.r1_tot + self.r2_tot) / 4,
                                  redirect_stdout=True)

    for r1, r2 in itertools.izip(self.R1.itervalues(),
                                 self.R2.itervalues()):

        pair_seq_dict = {'r1': r1, 'r2': r2}

        self.logger.debug("new read pair\n")
        self.logger.debug("processing new read pair {0}".format(
            pair_seq_dict.keys()))
        self.logger.debug("processing seq ID - R1 {0}... R2 {1}".format(
            r1.id, r2.id))
        self.logger.debug("R1 sequence - {0}...".format(r1.seq[0:50]))
        self.logger.debug("R2 sequence - {0}...".format(r2.seq[0:50]))

        self.sample_id = ""

        # because we process two sequences at a time (R1 and R2)
        self.processed_seqs += 2

        self.f_primer_found = []
        self.r_primer_found = []

        self.logger.debug("Looking in pair read for patterns...")

        search_result = self.regex_search_through_sequence(
            pair_seq_dict, self.primer_pattern_dict_list)

        try:
            if isinstance(search_result, list) and len(search_result) > 1:
                self.logger.debug("search result - {0}".format(
                    search_result[0]))
                self.logger.debug("search result - {0}".format(
                    search_result[1]))
        except IndexError as e:
            self.logger.debug("search result - {0}".format(search_result))
            self.logger.debug("error in list index {0}".format(e))

        read_pair_proceed = self.screen_read_pair_suitability(
            search_result)
        self.logger.debug(
            "proceed with read pair ? {0}".format(read_pair_proceed))

        if read_pair_proceed != 'failed':
            try:
                sample_id = self.get_sample_id_from_primer_sequence(
                    sample_primer_dict, search_result[0].get('pattern'),
                    search_result[1].get('pattern'))
                self.logger.debug(
                    "- R1 ID -> {0} & R2 ID -> {1} from sample {2}".format(
                        r1.id, r2.id, sample_id))
            except IndexError as e:
                # sample is missing one or both the patterns keys
                self.logger.debug(
                    "Sample seq is missing a pattern, {0}- discarding read"
                    .format(e))
                output = self.record_buffer_and_writer(
                    {'discarded': pair_seq_dict})
                self.unmapped_count += 2
                continue

            try:
                new_seq = self.clip_primers_from_seq(
                    search_result, self.primer_pattern_dict_list,
                    pair_seq_dict, sample_primer_dict, sample_id)
                self.logger.debug(
                    "clipped read returned...{0} seqs".format(
                        len(new_seq)))
                output = self.record_buffer_and_writer(new_seq)
                self.both_primers_count += 2
            except Exception as e:
                # Any failure while clipping sends the pair to 'discarded'.
                output = self.record_buffer_and_writer(
                    {'discarded': pair_seq_dict})
                self.logger.debug(
                    "attempt to clip sequence failed - errmsg - {0} - discarding read {1}"
                    .format(e, output))
                self.unmapped_count += 2
                continue

            bar.update(self.processed_seqs)

            # The writer returns "cleared" to signal the buffer should be
            # reset here.
            if output == "cleared":
                self.record_buffer = {}
                self.logger.debug("buffer check {0}".format(
                    self.record_buffer))

        elif read_pair_proceed == 'failed':
            self.unmapped_count += 2
            output = self.record_buffer_and_writer(
                {'discarded': pair_seq_dict})
            bar.update(self.processed_seqs)

    self.logger.info("__________________________")
    self.logger.info("Samples successfully mapped (F+R found): {0}".format(
        self.both_primers_count))
    self.logger.info("Read pairs in alternate orientation - {0}".format(
        str(len(self.alternate_orientation))))
    self.logger.info("Sequences not mapped: {0}".format(
        self.unmapped_count))
    self.logger.info("Total sequences checked: {0}".format(
        self.processed_seqs))

    self.logger.info("writing alternate record IDs...")
    with open("alternate_orientation_records.txt", 'w') as f:
        for sequence_id in self.alternate_orientation:
            # NOTE(review): IDs are written without a separator between
            # them -- confirm whether a newline is intended.
            output_id = ''.join(sequence_id)
            f.write(output_id)

    self.logger.info("Run finished")
def extract_barcodes(fastq1,
                     fastq2=None,
                     output_dir=".",
                     input_type="barcode_single_end",
                     bc1_len=6,
                     bc2_len=6,
                     rev_comp_bc1=False,
                     rev_comp_bc2=False,
                     char_delineator=":",
                     switch_bc_order=False,
                     map_fp=None,
                     attempt_read_orientation=False,
                     disable_header_match=False):
    """ Main program function for extracting barcodes from reads

    fastq1: Open fastq file 1.
    fastq2: None or open fastq file 2.
    output_dir: Directory to write output parses sequences to.
    input_type: Specifies the type of parsing to be done.
    bc1_len: Length of barcode 1 to be parsed from fastq1
    bc2_len: Length of barcode 2 to be parsed from fastq2, or from end of a
     stitched read.
    rev_comp_bc1: If True, reverse complement bc1 before writing.
    rev_comp_bc2: If True, reverse complement bc2 before writing.
    char_delineator: Specify character that immediately precedes the barcode
        for input_type of barcode_in_label.
    switch_bc_order: Normally, barcode 1 will be written first, followed by
        barcode 2 in a combined output fastq file. If True, the order will be
        reversed. Only applies to stitched reads processing, as other barcode
        orders are dictated by the parameter chosen for the fastq files.
    map_fp: open file object of mapping file, requires a LinkerPrimerSequence
        and ReversePrimer field to be present. Used for orienting reads.
    attempt_read_orientation: If True, will attempt to orient the reads
        according to the forward primers in the mapping file. If primer is
        detected in current orientation, leave the read as is, but if reverse
        complement is detected (or ReversePrimer is detected in the current
        orientation) the read will either be written to the forward (read 1) or
        reverse (read 2) reads for the case of paired files, or the read will be
        reverse complemented in the case of stitched reads.
    disable_header_match: if True, suppresses checks between fastq headers.

    Every output is first written to a "*.incomplete" file in output_dir
    and renamed to its final name after the last read has been handled.

    Raises FastqParseError when header checks are enabled and the headers
    of a read pair do not match.
    """

    # Turn off extra file creation for single read.
    if input_type == "barcode_single_end" and attempt_read_orientation:
        attempt_read_orientation = False
    if attempt_read_orientation:
        header, mapping_data, run_description, errors, warnings =\
            process_id_map(map_fp)
        forward_primers, reverse_primers = get_primers(header, mapping_data)
        output_bc_not_oriented = open(
            join(output_dir, "barcodes_not_oriented.fastq.incomplete"), "w")
        fastq1_out_not_oriented = open(
            join(output_dir, "reads1_not_oriented.fastq.incomplete"), "w")
        fastq2_out_not_oriented = open(
            join(output_dir, "reads2_not_oriented.fastq.incomplete"), "w")
    else:
        forward_primers = None
        reverse_primers = None
        output_bc_not_oriented = None
        fastq1_out_not_oriented = None
        fastq2_out_not_oriented = None

    output_bc_fastq = open(join(output_dir, "barcodes.fastq.incomplete"), "w")
    if input_type in ["barcode_single_end", "barcode_paired_stitched"]:
        output_fastq1 = open(join(output_dir, "reads.fastq.incomplete"), "w")
        output_fastq2 = None
        final_fastq1_name = join(output_dir, "reads.fastq")
    elif input_type in ["barcode_paired_end"]:
        output_fastq1 = open(join(output_dir, "reads1.fastq.incomplete"), "w")
        output_fastq2 = open(join(output_dir, "reads2.fastq.incomplete"), "w")
        final_fastq1_name = join(output_dir, "reads1.fastq")
    else:
        # Remaining input types write barcodes only, so there is no read
        # output and final_fastq1_name is never consulted.
        output_fastq1 = None
        output_fastq2 = None

    if not fastq2:
        # Stand-in record stream that lets izip pair every read of fastq1
        # with a dummy second read.
        fastq2 = cycle(["@", "AAAAAAAAAAAA", "+", "AAAAAAAAAAAA"])
        not_paired = True
    else:
        not_paired = False

    # The header comparison function is chosen by get_casava_version.
    check_header_match_f = get_casava_version(fastq1)

    header_index = 0

    for read1_data, read2_data in izip(
            parse_fastq(fastq1, strict=False, enforce_qual_range=False),
            parse_fastq(fastq2, strict=False, enforce_qual_range=False)):
        if not disable_header_match:
            if not check_header_match_f(read1_data[header_index],
                                        read2_data[header_index]):
                raise FastqParseError(
                    "Headers of read1 and read2 do not match. Can't continue. "
                    "Confirm that the fastq sequences that you are "
                    "passing match one another. --disable_header_match can be "
                    "used to suppress header checks.")

        if input_type == "barcode_single_end":
            process_barcode_single_end_data(read1_data, output_bc_fastq,
                                            output_fastq1, bc1_len,
                                            rev_comp_bc1)

        elif input_type == "barcode_paired_end":
            process_barcode_paired_end_data(
                read1_data, read2_data, output_bc_fastq, output_fastq1,
                output_fastq2, bc1_len, bc2_len, rev_comp_bc1, rev_comp_bc2,
                attempt_read_orientation, forward_primers, reverse_primers,
                output_bc_not_oriented, fastq1_out_not_oriented,
                fastq2_out_not_oriented)

        elif input_type == "barcode_paired_stitched":
            process_barcode_paired_stitched(
                read1_data, output_bc_fastq, output_fastq1, bc1_len, bc2_len,
                rev_comp_bc1, rev_comp_bc2, attempt_read_orientation,
                forward_primers, reverse_primers, output_bc_not_oriented,
                fastq1_out_not_oriented, switch_bc_order)

        elif input_type == "barcode_in_label":
            if not_paired:
                curr_read2_data = False
            else:
                curr_read2_data = read2_data
            process_barcode_in_label(read1_data, curr_read2_data,
                                     output_bc_fastq, bc1_len, bc2_len,
                                     rev_comp_bc1, rev_comp_bc2,
                                     char_delineator)

    output_bc_fastq.close()
    rename(output_bc_fastq.name, join(output_dir, "barcodes.fastq"))
    if output_fastq1:
        output_fastq1.close()
        rename(output_fastq1.name, final_fastq1_name)
    if output_fastq2:
        output_fastq2.close()
        rename(output_fastq2.name, join(output_dir, "reads2.fastq"))

    # The "not oriented" files must be closed before they are renamed;
    # otherwise buffered records may never reach disk and the handles
    # stay open.
    if output_bc_not_oriented:
        output_bc_not_oriented.close()
        rename(output_bc_not_oriented.name,
               join(output_dir, "barcodes_not_oriented.fastq"))
    if fastq1_out_not_oriented:
        fastq1_out_not_oriented.close()
        rename(fastq1_out_not_oriented.name,
               join(output_dir, "reads1_not_oriented.fastq"))
    if fastq2_out_not_oriented:
        fastq2_out_not_oriented.close()
        rename(fastq2_out_not_oriented.name,
               join(output_dir, "reads2_not_oriented.fastq"))
def test_process_id_map_added_demultiplex(self):
    """process_id_map handles added demultiplex fields"""

    def as_mapping_file(text):
        # Wrap the text in a file-like object carrying a .name attribute.
        handle = StringIO(text)
        handle.name = 'test.xls'
        return handle

    expected_headers = ['BarcodeSequence', 'LinkerPrimerSequence', 'X',
                        'Janus']
    expected_descriptions = {
        'x': 'sample_x',
        'y': 'sample_y',
        'z': 'sample_z',
    }

    # Case 1: the requested demultiplex field is not a column of the map.
    no_janus_map = """#SampleID\tBarcodeSequence\tLinkerPrimerSequence\tX\tDescription
#fake data
x\tAA\tACGT\t3\tsample_x
y\t"AC"\tACGT\t4\t"sample_y"
z\tGG\tACGT\t5\tsample_z"""
    # Should raise error since demultiplex field not in mapping data.
    self.assertRaises(ValueError, process_id_map,
                      as_mapping_file(no_janus_map),
                      added_demultiplex_field='Not_A_Field')

    # Case 2: process_id_map should return correct results on small test
    # map with the combinations of barcodes and added demultiplex fields
    # unique
    unique_map = """#SampleID\tBarcodeSequence\tLinkerPrimerSequence\tX\tJanus\tDescription
#fake data
x\tAA\tACGT\t3\tDown\tsample_x
y\t"AC"\tACGT\t4\tDown\t"sample_y"
z\tAA\tACGT\t5\tNotUp\tsample_z"""
    (headers, id_map, description_map, run_description, errors,
     warnings) = process_id_map(as_mapping_file(unique_map),
                                added_demultiplex_field='Janus')

    self.assertEqual(headers, expected_headers)
    self.assertEqual(id_map, {
        'x': {'X': '3', 'Janus': 'Down', 'LinkerPrimerSequence': 'ACGT',
              'BarcodeSequence': 'AA'},
        'y': {'X': '4', 'Janus': 'Down', 'LinkerPrimerSequence': 'ACGT',
              'BarcodeSequence': 'AC'},
        'z': {'X': '5', 'Janus': 'NotUp', 'LinkerPrimerSequence': 'ACGT',
              'BarcodeSequence': 'AA'},
    })
    self.assertEqual(description_map, expected_descriptions)
    self.assertEqual(run_description, ['fake data'])
    self.assertEqual(errors, [])
    self.assertEqual(warnings, [])

    # Case 3: samples x and z share both barcode and Janus value. The
    # duplicate is expected in the errors list, warnings stay empty.
    duplicate_map = """#SampleID\tBarcodeSequence\tLinkerPrimerSequence\tX\tJanus\tDescription
#fake data
x\tAA\tACGT\t3\tDown\tsample_x
y\t"AC"\tACGT\t4\tDown\t"sample_y"
z\tAA\tACGT\t5\tDown\tsample_z"""
    (headers, id_map, description_map, run_description, errors,
     warnings) = process_id_map(as_mapping_file(duplicate_map),
                                added_demultiplex_field='Janus')

    self.assertEqual(headers, expected_headers)
    self.assertEqual(id_map, {
        'x': {'X': '3', 'Janus': 'Down', 'LinkerPrimerSequence': 'ACGT',
              'BarcodeSequence': 'AA'},
        'y': {'X': '4', 'Janus': 'Down', 'LinkerPrimerSequence': 'ACGT',
              'BarcodeSequence': 'AC'},
        'z': {'X': '5', 'Janus': 'Down', 'LinkerPrimerSequence': 'ACGT',
              'BarcodeSequence': 'AA'},
    })
    self.assertEqual(description_map, expected_descriptions)
    self.assertEqual(run_description, ['fake data'])
    expected_errors = [
        "DupChecker 'BarcodeSequence' found the following possible "
        "duplicates. If these metadata should have the same name, please "
        "correct.:\nGroup\tOriginal names\nAA\tAA,Down, AA,Down\n"
        "Row, column for all possible duplicate descriptions:\n"
        "Location (row, column):\t0,1\n"
        "Location (row, column):\t0,4\n"
        "Location (row, column):\t2,1\n"
        "Location (row, column):\t2,4\n"
    ]
    self.assertEqual(errors, expected_errors)
    self.assertEqual(warnings, [])