def test_parse_qual_scores(self):
    """parse_qual_scores combines several qual handles into one {id: scores} dict"""
    # Each record's scores span two lines in the first record of each
    # handle, so the expected lists join the values from both lines.
    first_handle = StringIO('>x\n5 10 5\n12\n>y\n30 40')
    second_handle = StringIO('>a\n5 10 5\n12\n>b\n30 40')

    expected = dict(
        x=[5, 10, 5, 12],
        y=[30, 40],
        a=[5, 10, 5, 12],
        b=[30, 40],
    )
    observed = parse_qual_scores([first_handle, second_handle])

    self.assertEqual(observed, expected)
def main():
    """Script entry point: combine a FASTA file and its QUAL file(s) into FASTQ.

    Reads the options parsed from ``script_info``:
      * ``input_fasta_fp`` -- path of the FASTA file to read.
      * ``qual_fps``       -- paths of the QUAL files handed to
                              ``parse_qual_scores``.
      * ``result_fp``      -- output path; when empty it defaults to
                              ``input_fasta_fp + '.fastq'``.
      * ``split``          -- when true ``make_fastq_multi`` is called,
                              otherwise ``make_fastq_single``.

    Every file handle opened here is closed before returning, including
    when one of the helpers raises.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # 'U' = universal-newline read mode, kept from the original code.
    in_fasta = open(opts.input_fasta_fp, 'U')
    try:
        qual_handles = []
        try:
            # Open one at a time so that a failure on the N-th path still
            # lets the finally-clause close the N-1 handles already opened.
            for qual_fp in opts.qual_fps:
                qual_handles.append(open(qual_fp, 'U'))
            quals = parse_qual_scores(qual_handles)
        finally:
            for qual_handle in qual_handles:
                qual_handle.close()

        if not opts.result_fp:
            opts.result_fp = opts.input_fasta_fp + '.fastq'

        if opts.split:
            make_fastq_multi(in_fasta, quals, opts.result_fp)
        else:
            make_fastq_single(in_fasta, quals, opts.result_fp)
    finally:
        in_fasta.close()
def test_parse_qual_scores(self):
    """parse_qual_scores should map every sequence id to its list of scores"""
    qual_inputs = [
        StringIO('>x\n5 10 5\n12\n>y\n30 40'),
        StringIO('>a\n5 10 5\n12\n>b\n30 40'),
    ]

    # 'x' and 'a' carry scores on two lines; 'y' and 'b' on a single line.
    long_scores = [5, 10, 5, 12]
    short_scores = [30, 40]
    wanted = {
        'x': list(long_scores),
        'y': list(short_scores),
        'a': list(long_scores),
        'b': list(short_scores),
    }

    self.assertEqual(parse_qual_scores(qual_inputs), wanted)
def preprocess(fasta_files, qual_files, mapping_file,
               barcode_type="golay_12",
               min_seq_len=200, max_seq_len=1000, min_qual_score=25,
               starting_ix=1, keep_primer=True, max_ambig=0, max_primer_mm=1,
               trim_seq_len=True, dir_prefix='.', max_bc_errors=2,
               max_homopolymer=4, remove_unassigned=False, keep_barcode=False,
               attempt_bc_correction=True, qual_score_window=0,
               disable_primers=False, reverse_primers='disable',
               record_qual_scores=False):
    """Preprocess barcoded libraries, e.g. from 454.

    Parameters:

    fasta_files: list of raw 454 fasta files, fasta format.

    qual_files: list of raw 454 qual file(s). May be empty, in which case
    no quality-based filters are added for the mean quality score.

    mapping_file: mapping file with BarcodeSequence column containing valid
    barcodes used in the 454 run

    barcode_type: type of barcode, e.g. golay_12. Should appear in list of
    known barcode types (BARCODE_TYPES); otherwise it must be convertible
    to an integer, which is then used as the barcode length.

    min_seq_len: minimum sequence length to allow.

    max_seq_len: maximum sequence length to allow. Must be >= 10 and
    greater than min_seq_len.

    min_qual_score: minimum average qual score considered acceptable.
    Must be >= 5.

    starting_ix: integer to start sample sequence numbering at (>= 1).

    keep_primer: when True, will keep primer sequence, otherwise will strip
    it

    keep_barcode: when True, will keep barcode sequence, otherwise will
    strip it

    max_ambig: maximum number of ambiguous bases to allow in the read.

    max_primer_mm: maximum number of primer mismatches to allow.

    trim_seq_len: if True (default), the length filter subtracts the
    barcode length plus the longest primer length (0 when primers are
    disabled) from each sequence length before comparing to the bounds.

    dir_prefix: prefix of directories to write files into.

    max_bc_errors: maximum number of barcode errors to allow in output seqs

    max_homopolymer: maximum number of a nucleotide that can be
    repeated in a given sequence.

    remove_unassigned: If True (False default), will not write seqs to the
    output .fna file that have a valid barcode (by Golay or Hamming standard)
    but are not included in the input mapping file.

    attempt_bc_correction: (default True) will attempt to find nearest valid
    barcode. Can be disabled to improve performance.

    qual_score_window: (default 0) when non-zero, adds a filter that
    rejects a sequence if check_window_qual_scores(qual, qual_score_window,
    min_qual_score) is false.

    disable_primers: (default False) Disables testing for primers in the
    input mapping file and primer testing in the input sequence files.

    reverse_primers: (default 'disable') Enables removal of reverse primers
    and any subsequent sequence data from output reads. Reverse primers
    have to be in 5'->3' format and in correct IUPAC codes in a column
    "ReversePrimer" in the input mapping file. Run check_id_map to test
    primers in this column for valid formatting. The primers read from
    this column will be reverse complemented and associated with the given
    barcode in the mapping file. If set to 'truncate_only', sequences where
    primers are found will be truncated, sequences where the primer is not
    found will be written unchanged. If set to 'truncate_remove', sequences
    where primers are found will be truncated, sequences where the primer
    is not found will not be written and counted in the log file as failing
    for this reason. The mismatches allowed for a reverse primer match are
    the same as specified for the forward primer mismatches
    (max_primer_mm).

    record_qual_scores: (default False) Will record quality scores for all
    sequences that are written to the output seqs.fna file in a separate
    file (seqs_filtered.qual) containing the same sequence IDs and
    quality scores for all bases found in the seqs.fna file.

    Result:
    in dir_prefix, this function opens/writes the following files:
    seqs.fna: output sequences (written by check_seqs)
    seqs_filtered.qual: only when record_qual_scores is True
    split_library_log.txt: the log lines returned by check_seqs
    histograms.txt: formatted histograms of the pre- and post-filtering
    lengths returned by check_seqs

    Raises:
    ValueError for out-of-range arguments, an invalid mapping file, a
    missing "ReversePrimer" column when reverse primers are enabled, an
    unsupported or mismatching barcode type/length, or a fasta/qual id
    count mismatch.
    """
    # ---- Argument validation (runs before anything touches the disk) ----
    if max_seq_len < 10:
        raise ValueError("Max sequence must be >= 10")
    if min_seq_len >= max_seq_len:
        raise ValueError("Min len cannot be >= max len")
    if min_qual_score < 0:
        raise ValueError("Min qual score must be > 0")
    if starting_ix < 1:
        raise ValueError("Starting index must be > 0.")
    if max_ambig < 0:
        raise ValueError("Max ambig chars must be >= 0.")
    if max_primer_mm < 0:
        raise ValueError("Max primer mismatches must be >= 0.")
    if min_qual_score < 5:
        raise ValueError("Min qual score must be >= 5.")
    if reverse_primers not in ['disable', 'truncate_only', 'truncate_remove']:
        raise ValueError("reverse_primers parameter must be 'disable', "
                         "truncate_only, or truncate_remove.")

    create_dir(dir_prefix, fail_on_exist=False)

    # ---- Check mapping file and get barcode mapping ----
    map_file = open(mapping_file, 'U')
    try:
        # try/finally so the handle is released even if check_map raises.
        (headers, id_map, valid_map, map_warnings, errors,
         primer_seqs_lens, all_primers) = check_map(
            map_file, disable_primer_check=disable_primers)
    finally:
        map_file.close()

    if reverse_primers != 'disable':
        if 'ReversePrimer' not in headers:
            raise ValueError('To enable reverse primer check, there must '
                             'be a "ReversePrimer" column in the mapping file with a reverse '
                             'primer in each cell.')
        rev_primers = get_reverse_primers(id_map)
    else:
        rev_primers = False

    if errors:
        raise ValueError("Invalid mapping file. "
                         "Validate with check_id_map first: %s"
                         % "\n".join(errors))

    # Find actual length of barcodes in the mapping file, also check for
    # variable lengths
    barcode_length_check = list(set(len(bc) for bc in valid_map))

    # Check barcode type
    if barcode_type not in BARCODE_TYPES:
        try:
            # A bare number such as "4" means "barcodes of that length".
            barcode_len, barcode_fun = int(barcode_type), correct_barcode
        except ValueError:
            raise ValueError("Unsupported barcode type: %s" % barcode_type)
    else:
        barcode_len, barcode_fun = BARCODE_TYPES[barcode_type]

    # As people often do not specify a barcode that matches the lengths
    # of the barcodes used, a check on the actual barcode lengths needs to
    # be done, and an exception raised if they are variable length and not
    # specified as so.
    if barcode_type != "variable_length":
        # Raise error if variable length barcodes are present but not
        # specified
        if len(barcode_length_check) != 1:
            raise ValueError('Mapping file has variable length '
                             'barcodes. If this is intended, specifiy variable lengths '
                             'with the -b variable_length option.')
        # Raise error if the specified barcode length doesn't match what
        # is present in the mapping file.
        if barcode_len != barcode_length_check[0]:
            raise ValueError(
                ('Barcode length detected in the mapping file, '
                 ' %d does not match specified barcode length, %d. '
                 'To specify a barcode '
                 'length use -b golay_12 or -b hamming_8 for 12 and 8 base pair '
                 'golay or hamming codes respectively, or -b # where # is the '
                 'length of the barcode used. E.g. -b 4 for 4 base pair barcodes.')
                % (barcode_length_check[0], barcode_len))

    # ---- Open the inputs ----
    # Real lists (not map()) so they can be iterated several times and
    # truth-tested identically on Python 2 and 3.
    fasta_files = [get_infile(fasta_fp) for fasta_fp in fasta_files]
    qual_files = [get_infile(qual_fp) for qual_fp in qual_files]

    # Check fasta files valid format, no duplicate ids
    # and ids match between fasta and qual files
    all_fasta_ids = fasta_ids(fasta_files)
    all_qual_ids = fasta_ids(qual_files)
    # NOTE(review): only the *number* of ids is compared here, not the ids
    # themselves; kept as in the original.
    if qual_files and (len(all_fasta_ids) != len(all_qual_ids)):
        f_ids = all_fasta_ids.difference(all_qual_ids)
        q_ids = all_qual_ids.difference(all_fasta_ids)
        raise ValueError("Found %d ids in fasta file not in qual file, %d ids in qual file not in fasta" % (len(f_ids), len(q_ids)))

    # fasta_ids() consumed the handles; rewind before the real pass.
    for fasta_in in fasta_files:
        fasta_in.seek(0)
    if qual_files:
        for qual_in in qual_files:
            qual_in.seek(0)
        # Load quality scores
        qual_mappings = parse_qual_scores(qual_files)
        for qual_in in qual_files:
            qual_in.close()
    else:
        qual_mappings = {}

    # ---- Make filters ----
    filters = []
    # seq len filter depends on whether we're including the barcode
    if trim_seq_len:
        # This processing occurs before primer testing, will use largest
        # primer length to calculate lengths. the dict all_primers has
        # keys of each primer with the length of said primer as the value
        if not disable_primers:
            primer_seq_len = max(all_primers.values())
        else:
            # Set to zero if primers not used
            primer_seq_len = 0
        trim = barcode_len + primer_seq_len
        filters.append(SeqQualBad(
            'Length outside bounds of %s and %s' % (min_seq_len, max_seq_len),
            lambda id_, seq, qual:
                not (min_seq_len <= len(seq) - trim <= max_seq_len)))
    else:
        filters.append(SeqQualBad(
            'Length outside bounds of %s and %s' % (min_seq_len, max_seq_len),
            lambda id_, seq, qual:
                not (min_seq_len <= len(seq) <= max_seq_len)))

    filters.append(SeqQualBad(
        'Num ambiguous bases exceeds limit of %s' % max_ambig,
        lambda id_, seq, qual: count_ambig(seq) > max_ambig))

    if qual_mappings:
        filters.append(QualMissing)
        filters.append(SeqQualBad(
            'Mean qual score below minimum of %s' % min_qual_score,
            lambda id_, seq, qual: mean(qual) < min_qual_score))

    # NOTE(review): the nesting of this block could not be recovered from
    # the flattened source; it is placed at function level (independent of
    # qual_mappings) -- confirm against the original layout.
    if qual_score_window:
        filters.append(SeqQualBad(
            'Mean window qual score below '
            'minimum of %s' % min_qual_score,
            lambda id_, seq, qual:
                not check_window_qual_scores(qual, qual_score_window,
                                             min_qual_score)))

    # Changed this to check entire sequence after barcode-could cause issue
    # if barcode-linker-primer have long homopolymers though.
    filters.append(SeqQualBad(
        'Max homopolymer run exceeds limit of %s' % max_homopolymer,
        lambda id_, seq, qual: seq_exceeds_homopolymers(
            seq[barcode_len:], max_homopolymer)))

    # ---- Check seqs and write out ----
    fasta_out = open(dir_prefix + '/' + 'seqs.fna', 'w+')
    qual_out = False
    try:
        if record_qual_scores:
            qual_out = open(dir_prefix + '/' + 'seqs_filtered.qual', 'w+')

        log_stats, pre_lens, post_lens = check_seqs(
            fasta_out, fasta_files, starting_ix, valid_map, qual_mappings,
            filters, barcode_len, keep_primer, keep_barcode, barcode_type,
            max_bc_errors, remove_unassigned, attempt_bc_correction,
            primer_seqs_lens, all_primers, max_primer_mm, disable_primers,
            reverse_primers, rev_primers, qual_out)
    finally:
        # close() on an already-closed file object is a no-op, so this is
        # safe whether or not check_seqs closed the handles itself.
        fasta_out.close()
        if qual_out:
            qual_out.close()
        for fasta_in in fasta_files:
            fasta_in.close()

    # Write log file
    log_file = open(dir_prefix + '/' + "split_library_log.txt", 'w+')
    try:
        log_file.write('\n'.join(log_stats))
    finally:
        log_file.close()

    # Write sequence distros here
    histogram_file = open(dir_prefix + '/' + 'histograms.txt', 'w+')
    try:
        histogram_file.write(
            format_histograms(*make_histograms(pre_lens, post_lens)))
    finally:
        histogram_file.close()