def complement(seq):
    """Return the complement of a nucleotide sequence.

    Every character of ``seq`` is looked up in the module-level
    ``__complementaryBasePairs`` table and the results are joined into a
    single string. A character missing from the table is reported through
    ``print_error``.
    """
    try:
        c = "".join(map(__complementaryBasePairs.__getitem__, seq))
    except KeyError:
        print_error("error: non-cannonical representation of a nucleotide "
                    "sequence provided. String is {0}".format(seq))
    return c
def translate_quality(quals, encoding=33):
    """Convert ASCII quality characters into integer quality scores.

    Each character's code point is shifted down by ``encoding``. Every
    resulting score that falls outside 0-42 is reported through
    ``print_error``. The full list of scores is returned.
    """
    qscores = list(map(lambda symbol: ord(symbol) - encoding, quals))
    allowed = range(0, 43)
    out_of_range = (score for score in qscores if score not in allowed)
    for _ in out_of_range:
        print_error("error: wrong quality score encoding provided")
    return qscores
def parse_commas(args, argname):
    """Parse a command-line value of the form ``INT`` or ``INT,INT``.

    Args:
        args: raw option string as typed by the user (e.g. ``"80,60"``).
        argname: name of the option, used only in error messages.

    Returns:
        A 2-tuple of ints. When a single value is supplied it is used for
        both positions.

    Problems with the input are reported through ``seq_io.print_error``.
    """
    fields = [field.lstrip() for field in args.split(",")]

    # The original test, ``1 > len(args) > 2``, could never be true, so extra
    # values were silently ignored instead of being reported.
    if not 1 <= len(fields) <= 2:
        seq_io.print_error("error: only one or two integer values should be "
                           "provided to {0}".format(argname))

    try:
        values = [int(field) for field in fields[:2]]
    except ValueError:
        seq_io.print_error("error: input to {0} must be one or more integer "
                           "values in the form INT or INT,INT".format(argname))

    # With a single value, first and last element are the same item
    return (values[0], values[-1])
def parse_colons(argument):
    """Parse a sliding-window argument of the form ``window:score``.

    The score must consist of digits only and is converted to an int. The
    window is converted to an int when it consists of digits only, otherwise
    to a float. Malformed input is reported through ``seq_io.print_error``.

    Returns:
        The tuple ``(window, score)``.
    """
    pieces = argument.split(':')
    try:
        window, score = pieces
    except ValueError:
        seq_io.print_error("error: the input provided to sliding-window is "
                           "formatted incorrectly. See --help for usage")
    else:
        # Quality threshold: digits only
        if not score.isdigit():
            seq_io.print_error("error: the quality score threshold provided "
                               "to sliding-window must be an integer value")
        else:
            score = int(score)

        # Window size: whole number of bases, or anything float() accepts
        if not window.isdigit():
            try:
                window = float(window)
            except ValueError:
                seq_io.print_error("error: the window-size provided to "
                                   "sliding-window must be either an integer "
                                   "value or a fraction")
        else:
            window = int(window)

    return (window, score)
def main():
    """Run the qtrim command-line program.

    Parses the command line, builds the ordered list of trimming steps,
    starts the trimming and writing processes, feeds every input record to
    the trimming queue, and finally prints summary statistics and the total
    run-time to stderr.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'fhandle', metavar='in1.fastq', type=str, action=seq_io.Open,
        mode='rb', default=sys.stdin,
        help="input reads in fastq format. Can be a file containing either "
             "single-end or forward/interleaved reads if reads are paired-end "
             "[required]")
    input_arg = parser.add_mutually_exclusive_group(required=False)
    input_arg.add_argument(
        '--interleaved', action='store_true',
        help="input is interleaved paired-end reads")
    input_arg.add_argument(
        '-r', '--reverse', dest='rhandle', metavar='in2.fastq',
        action=seq_io.Open, mode='rb',
        help="input reverse reads in fastq format")
    parser.add_argument(
        '-o', '--out', metavar='FILE', dest='out_f', type=str,
        action=seq_io.Open, mode='wt', default=sys.stdout,
        help="output trimmed reads [default: stdout]")
    parser.add_argument(
        '-v', '--out-reverse', metavar='FILE', dest='out_r', type=str,
        action=seq_io.Open, mode='wt',
        help="output trimmed reverse reads")
    parser.add_argument(
        '-s', '--singles', metavar='FILE', dest='out_s', type=str,
        action=seq_io.Open, mode='wt',
        help="output trimmed orphaned reads")
    parser.add_argument(
        '-q', '--qual-offset', metavar='TYPE', dest='offset', type=int,
        choices=[33, 64], default=33,
        help="ASCII base quality score encoding [default: 33]. Options are "
             "33 (phred33) or 64 (phred64)")
    parser.add_argument(
        '-m', '--min-len', metavar='LEN [,LEN]', dest='minlen', type=str,
        help="filter reads shorter than the minimum length threshold [default:"
             " 0]. Different values can be provided for the forward and "
             "reverse reads, respectively, by separating them with a comma "
             "(e.g. 80,60), or a single value can be provided for both")
    trim_args = parser.add_argument_group('trimming options')
    trim_args.add_argument(
        '-O', '--trim-order', metavar='ORDER', dest='trim_order', type=str,
        default='ltw',
        help="order that the trimming methods should be applied [default: ltw]"
             ". Available methods are l (leading), t (trailing), and w "
             "(sliding-window)")
    trim_args.add_argument(
        '-W', '--sliding-window', metavar='FRAME', dest='sw',
        type=parse_colons,
        help="trim read ends using a sliding window approach. Input should be "
             "of the form 'window_size:qual_threshold', where 'qual_threshold' "
             "is an integer between 0 and 42 and 'window_size' can either be "
             "length in bases or fraction of total read length")
    trim_args.add_argument(
        '-H', '--headcrop', metavar='INT [,INT]', type=str,
        help="remove exactly the number of bases specified from the start of "
             "the reads [default: 0]. Different values can be provided for "
             "the forward and reverse reads, respectively, by separating them "
             "with a comma (e.g. 2,0), or a single value can be provided for "
             "both. Cropping will always be applied first")
    trim_args.add_argument(
        '-C', '--crop', metavar='INT [,INT]', type=str,
        help="crop reads to the specified position [default: off]. The "
             "value(s) should be less than the maximum read length, otherwise "
             "no cropping will be applied. Different values can be provided "
             "for the forward and reverse reads, respectively, by separating "
             "them with a comma (e.g. 120,115), or a single value can be "
             "provided for both. Cropping will always be applied first")
    trim_args.add_argument(
        '-L', '--leading', metavar='SCORE', dest='lead_score', type=int,
        help="trim by removing low quality bases from the start of the read")
    trim_args.add_argument(
        '-T', '--trailing', metavar='SCORE', dest='trail_score', type=int,
        help="trim by removing low quality bases from the end of the read")
    trim_args.add_argument(
        '--trunc-n', dest='trunc_n', action='store_true',
        help="truncate sequence at position of first ambiguous base [default: "
             "off]. Truncation will always be applied last")
    parser.add_argument(
        '--version', action='version',
        version='%(prog)s ' + __version__)
    parser.add_argument(
        '-t', '--threads', action=CheckThreads, type=int, default=1,
        help='number of threads to use for trimming [default: 1]')
    args = parser.parse_args()
    all_args = sys.argv[1:]

    seq_io.program_info('qtrim', all_args, __version__)

    # Track program run-time
    start_time = time()

    # Assign variables based on arguments supplied by the user
    crop = parse_commas(args.crop, "crop") if args.crop else (None, None)
    hcrop = parse_commas(args.headcrop, "headcrop") if args.headcrop \
        else (0, 0)
    minlen = parse_commas(args.minlen, "minlen") if args.minlen else (0, 0)
    out_f = args.out_f
    paired = bool(args.interleaved or args.rhandle)
    # `self` is a name defined elsewhere in the file; presumably a
    # pass-through used when truncation is disabled -- TODO confirm
    trunc_n = trim.truncate_by_n if args.trunc_n else self

    # Prepare the iterator based on dataset type
    iterator = seq_io.read_iterator(args.fhandle, args.rhandle,
                                    args.interleaved, "fastq")

    # Populate list of trimming tasks to perform on reads
    trim_tasks = {
        'l': (trim.trim_leading, args.lead_score),
        't': (trim.trim_trailing, args.trail_score),
        'w': (trim.adaptive_trim, args.sw)
    }

    trim_steps = []
    for task in args.trim_order:
        try:
            step = trim_tasks[task]
        except KeyError:
            # Previously an unknown letter surfaced as a bare KeyError
            seq_io.print_error("error: unknown trimming method '{0}' provided "
                               "to trim-order. Available methods are l, t, "
                               "and w".format(task))
            continue
        # Only schedule methods for which the user supplied a value
        if step[-1]:
            trim_steps.append(step)

    if len(trim_steps) < 1 and not (args.crop or args.headcrop):
        seq_io.print_error("error: no trimming steps were specified")

    # Counters for trimming statistics
    discarded = Counter(0)
    passed = Counter(0)

    # Assign variables based on dataset type (paired or single-end)
    if paired:
        print("Processing input as paired-end reads", file=sys.stderr)
        out_s = args.out_s if args.out_s else None
        # Reverse reads share the forward output unless given their own file
        out_r = out_f if not args.out_r else args.out_r
        output = "\nRecords processed:\t{!s}\nPassed filtering:\t{!s} " \
                 "({:.2%})\n Reads pairs kept:\t{!s} ({:.2%})\n Forward " \
                 "only kept:\t{!s} ({:.2%})\n Reverse only kept:\t{!s} " \
                 "({:.2%})\nRecords discarded:\t{!s} ({:.2%})\n"
        singles1 = Counter(0)
        singles2 = Counter(0)
    else:
        if args.out_s:
            print(
                "warning: argument --singles used with single-end reads"
                "... ignoring\n", file=sys.stderr)
        if args.out_r:
            print(
                "warning: argument --out-reverse used when input is "
                "single-end... ignoring\n", file=sys.stderr)
        print("Processing input as single-end reads", file=sys.stderr)
        out_s = None
        out_r = None
        output = "\nRecords processed:\t{!s}\nPassed filtering:\t{!s} ({:.2%})" \
                 "\nRecords discarded:\t{!s} ({:.2%})\n"
        singles1 = singles2 = None

    # threads - 1 trimming processes (minimum of one); writing is handled by
    # a separate process started below
    max_read_threads = args.threads - 1 if args.threads > 1 else 1
    read_queue = Queue(max_read_threads)  # Max queue size prevents race conditions
    write_queue = Queue(max_read_threads)

    # Initialize processes to handle trimming and writing
    read_processes = []
    for _ in range(max_read_threads):
        worker = Process(target=trim_reads,
                         args=(read_queue, write_queue, trim_steps, trunc_n,
                               crop, hcrop, args.offset,))
        read_processes.append(worker)
        worker.start()

    write_process = Process(target=write_reads,
                            args=(write_queue, out_f, out_r, out_s, minlen,
                                  passed, discarded, singles1, singles2,))
    write_process.start()

    # Iterate over reads, populating read queue for trimming
    processed_total = 0
    for processed_total, records in enumerate(iterator, start=1):
        read_queue.put(records)

    # Send one kill message per trimming process
    for _ in read_processes:
        read_queue.put('DONE')

    # Wait for trimming processes to finish before continuing
    for process in read_processes:
        process.join()

    # Let the writer drain its queue, then send it the kill message
    while not write_queue.empty():
        sleep(1)
    write_queue.put('DONE')

    # Wait for the write process to finish before continuing
    write_process.join()

    # Verify input file non-empty
    if processed_total == 0:
        seq_io.print_error("error: no sequences were found to process")

    # Calculate and print output statistics
    p = passed.value()
    d = discarded.value()
    if paired:
        # Each iterated record holds two reads
        processed_total = processed_total * 2
        s1, s2 = singles1.value(), singles2.value()
        passed_total = (p * 2 + s1 + s2)
        discarded_total = d * 2 + s1 + s2
        pairs = p
        frac_pairs = (pairs * 2) / processed_total
        frac_s1 = s1 / processed_total
        frac_s2 = s2 / processed_total
    else:
        passed_total = p
        discarded_total = d
        processed_total = discarded_total + passed_total
        s1 = s2 = frac_s1 = frac_s2 = pairs = frac_pairs = None

    frac_discarded = discarded_total / processed_total
    frac_passed = passed_total / processed_total

    # Paired-only fields are None for single-end data and are left out so the
    # list matches the placeholders of the selected template
    paired_stats = [i for i in (pairs, frac_pairs, s1, frac_s1, s2, frac_s2)
                    if i is not None]
    stats = [processed_total, passed_total, frac_passed] + paired_stats + \
            [discarded_total, frac_discarded]
    print(output.format(*tuple(stats)), file=sys.stderr)

    # Calculate and print program run-time
    end_time = time()
    total_time = (end_time - start_time) / 60.0
    print("It took {:.2e} minutes to process {!s} records\n"
          .format(total_time, processed_total), file=sys.stderr)
def main():
    """Run the demultiplex_by_header command-line program.

    Parses the command line, optionally loads a tab-separated file of
    template barcodes, then partitions the input records into one output
    file per barcode found in the sequence headers. Barcode abundance
    statistics, an optional histogram, partition statistics and the total
    run-time are reported at the end.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'fhandle', metavar='in1.fast<q|a>', action=seq_io.Open, mode='rb',
        help="input reads in fastq or fasta format. Can be a file containing "
             "either single-end or forward/interleaved reads if reads are "
             "paired-end [required]")
    input_arg = parser.add_mutually_exclusive_group(required=False)
    input_arg.add_argument(
        '--interleaved', action='store_true',
        help="input is interleaved paired-end reads")
    input_arg.add_argument(
        '-r', '--reverse', dest='rhandle', metavar='in2.fast<q|a>',
        action=seq_io.Open, mode='rb',
        help="input reverse reads")
    parser.add_argument(
        '-f', '--format', metavar='FORMAT', dest='format', default='fastq',
        choices=['fasta', 'fastq'],
        help="sequence file format [default: fastq]. Available options are "
             "'fasta' or 'fastq'")
    parser.add_argument(
        '-b', '--barcodes', metavar='FILE', action=seq_io.Open, mode='r',
        help="file containing sample names mapped to template barcode "
             "sequences, in tab-separated format. The first column should "
             "contain sample names and the second column should contain the "
             "appropriate barcodes. An optional third column can be used to "
             "assign barcodes to 'multiplex groups'. The name of a multiplex "
             "group should have first the run id, then the flowcell lane, "
             "separated by a colon (e.g. 432:4). If this argument is unused "
             "and --force is provided instead, the output files will be named "
             "for the barcode and run information found in the headers")
    parser.add_argument(
        '-s', '--suffix', metavar='STR', type=str,
        help="string to append to the end of the file name. The default is to "
             "append the file format (fastq or fasta) and the strand for PE "
             "data (forward, reverse, interleaved)")
    parser.add_argument(
        '-c', '--hist', metavar='FILE', action=seq_io.Open, mode='w',
        help="output histogram of barcode counts. This can be used for "
             "graphing the error distribution of a barcode sequence, for "
             "instance")
    parser.add_argument(
        '--no-out', dest='no_out', action='store_true',
        help="do not write sequences to a file. Only output a histogram of "
             "barcode counts")
    parser.add_argument(
        '--force', action='store_true',
        help="create new file for every barcode found in input")
    compress_arg = parser.add_mutually_exclusive_group(required=False)
    compress_arg.add_argument(
        '--gzip', action='store_true',
        help="output files should be compressed using the gzip algorithm. The "
             "suffix '.gz'. will be appended to the file names")
    compress_arg.add_argument(
        '--bzip2', action='store_true',
        help="output files should be compressed using the bzip2 algorithm. The "
             "suffix '.bz2' will be appended to the file names")
    parser.add_argument(
        '-d', '--distance', type=int, default=0,
        help="maximum hamming distance allowed between sequence barcodes to "
             "be sorted into the same partition. Requires a barcodes file "
             "providing template barcode sequences")
    parser.add_argument(
        '--version', action='version',
        version='%(prog)s ' + __version__)
    args = parser.parse_args()
    all_args = sys.argv[1:]

    seq_io.program_info('demultiplex_by_header', all_args, __version__)

    if args.distance and not args.barcodes:
        parser.error("error: argument -d/--distance cannot be used without "
                     "-b/--barcodes")

    if args.no_out and not args.hist:
        parser.error("error: argument --no-out cannot be used without "
                     "-c/--hist")

    # Track program run-time
    start_time = time()

    # Select the summary template. `stats_mode` records which counters the
    # chosen template has placeholders for, so the values passed to format()
    # at the end line up with it.
    if args.barcodes and args.distance > 0:
        outstats = "Records processed:\t{0}\nBarcode partitions created:\t{1}"\
                   "\nSequence barcodes with -\n exact match to a template:"\
                   "\t{2}\n one or more mismatchs:\t{3}\nSequences with "\
                   "unknown barcode:\t{4}\n"
        stats_mode = 'matches'
    elif args.barcodes and args.distance == 0:
        outstats = "Records processed:\t\t{0}\nBarcode partitions created:\t{1}"\
                   "\nSequences with unknown barcode:\t{2}\n"
        stats_mode = 'unknowns'
    else:
        outstats = "Records processed:\t\t{0}\nBarcode partitions created:\t{1}"\
                   "\n"
        stats_mode = 'basic'

    suffix = args.suffix if args.suffix else args.format
    out_hist = args.hist.write if args.hist else do_nothing
    exact_total = mismatch_total = unknowns = 0

    if args.gzip:
        compression = '.gz'
        algo = GzipFile
    elif args.bzip2:
        compression = '.bz2'
        algo = BZ2File
    else:
        compression = ''
        algo = io.open

    # Prepare the iterator based on dataset type
    iterator = seq_io.read_iterator(args.fhandle, args.rhandle,
                                    args.interleaved, args.format)

    # Store user-supplied barcodes as {multiplex group: {barcode: entry}}.
    # Barcodes without a group are stored under the empty-string group.
    template_barcodes = {}
    is_grouped = False
    if args.barcodes:
        barcode_dists = []  # (group, distance) for each pair within a group
        barcodes_total = 0
        for barcodes_total, line in enumerate(args.barcodes, start=1):
            tag = BarcodeEntry()
            group = ''

            # Verify that barcodes file is correctly formatted
            fields = line.strip().split('\t')
            if len(fields) == 2:
                # Has filename and barcode only
                name, barcode = fields
            elif len(fields) == 3:
                # Has filename, barcode, and multiplex group
                name, barcode, group = fields
                is_grouped = True
            else:
                seq_io.print_error("error: barcode mapping file does not "
                                   "appear to be formatted correctly. See "
                                   "the help message for formatting "
                                   "requirements")
                continue

            tag.id = name
            tag.barcode = barcode
            tag.group = group

            # Add to dictionary of template barcodes
            try:
                dict_group = template_barcodes[group]
            except KeyError:
                template_barcodes[group] = {barcode: tag}
            else:
                # Verify unique sample names and calculate barcode distances
                for known in dict_group.values():
                    if name == known.id:
                        seq_io.print_error("error: sample name '{}' is being "
                                           "used for more than one template "
                                           "barcode".format(name))
                    barcode_dists.append(
                        (group, hamming_distance(known.barcode, barcode)))

                if barcode not in dict_group:
                    dict_group[barcode] = tag
                else:
                    # NOTE(review): originally seq_io.error; switched to the
                    # print_error name used everywhere else in this function
                    seq_io.print_error("error: template barcode '{}' has been seen "
                                       "more than once in a single multiplex group"
                                       .format(barcode))

        # Verify non-empty barcodes file
        if barcodes_total == 0:
            seq_io.print_error("error: no barcodes were found to process")

        # Template barcode statistics
        out_bstats = "Template barcodes found:\t{!s}\n # multiplex groups:\t{!s}\n\n"
        num_tem_groups = len(template_barcodes)
        bstats = [barcodes_total, num_tem_groups]
        if is_grouped:
            out_bstats += " Multiplex Group\t# Barcodes\tMin distance\tMax distance\n"
            for mul_group in template_barcodes:
                out_bstats += " {!s}\t{!s}\t{!s}\t{!s}\n"
                # Filter on the group being reported (the original compared
                # against the stale `group` left over from the parsing loop)
                group_dists = [return_last(i) for i in barcode_dists
                               if i[0] == mul_group]
                # A group with a single barcode has no pairwise distances
                bstats += [mul_group, len(template_barcodes[mul_group]),
                           min(group_dists) if group_dists else 'NA',
                           max(group_dists) if group_dists else 'NA']
        else:
            out_bstats += "Minimum Hamming distance between all barcodes:\t"\
                          "{!s}\nMaximum Hamming distance between all barcodes:"\
                          "\t{!s}\n"
            all_dists = [return_last(i) for i in barcode_dists]
            bstats += [min(all_dists) if all_dists else 'NA',
                       max(all_dists) if all_dists else 'NA']

        print(out_bstats.format(*tuple(bstats)), file=sys.stderr)

    # Demultiplex reads
    outfiles = {}            # file prefix -> (forward writer, reverse writer)
    open_handles = []        # every output handle opened, closed at the end
    sequence_barcodes = []   # barcode entries in order of first appearance
    seen_barcodes = {}       # "barcode:run info" -> entry, for O(1) lookup
    processed_total = 0
    try:
        for processed_total, record in enumerate(iterator, start=1):
            # Prepare output dependant on whether paired or unpaired
            try:
                seq_tag = record.forward.description.split(':')[-1]
                header = record.forward.id
                outf = record.forward.write()
                outr = record.reverse.write()
            except AttributeError:
                seq_tag = record.description.split(':')[-1]
                header = record.id
                outf = record.write()
                outr = None

            # Verify headers are formatted as Casava 1.8. Slicing never
            # raises, so the number of fields is checked explicitly.
            run_info = tuple(header.split(':')[1:4])  # run id, flowcell, lane
            if len(run_info) != 3 or (not seq_tag.isalpha()) \
                    or (len(seq_tag) != 6):
                seq_io.print_error("error: the format of the sequence headers is "
                                   "incompatible with this method. Demultiplexing "
                                   "these reads will require a different method "
                                   "to be used instead")

            file_prefix = "{0}_{1}_{2}_{3}".format(seq_tag, *run_info)

            # Increment barcode occurences
            index = "{0}:{1}:{2}:{3}".format(seq_tag, *run_info)
            seq_entry = seen_barcodes.get(index)
            if seq_entry is None:
                seq_entry = BarcodeEntry()
                seq_entry.barcode = index
                seq_entry.count = 1
                seen_barcodes[index] = seq_entry
                sequence_barcodes.append(seq_entry)
            else:
                seq_entry.increment()

            # Map sequence barcodes to barcodes in the provided barcodes file
            if args.barcodes:
                # Only consider other barcodes within the same multiplex group
                if is_grouped:
                    try:
                        relevant_barcodes = template_barcodes[":".join(run_info)]
                    except KeyError:
                        # NOTE(review): originally seq_io.warning; switched to
                        # print_warning as used elsewhere in this function
                        seq_io.print_warning("warning: run information in sequence "
                                             "header '{}' doesn't match any multiplex "
                                             "group in the provided barcodes file\n"
                                             .format(header))
                        continue
                else:
                    # Consider all barcodes within file
                    relevant_barcodes = template_barcodes['']

                # Find the template barcode with the smallest hamming distance
                # to the record sequence barcode
                distances = sort_by_last([(i, hamming_distance(seq_tag, i))
                                          for i in relevant_barcodes])
                min_tag, min_dist = distances[0]  # minimum is the first element
                if min_dist == 0:
                    exact_total += 1
                else:
                    mismatch_total += 1

                # Determine if more than one closest match
                if [i[1] for i in distances].count(min_dist) > 1:
                    seq_io.print_warning("warning: barcode {0} in sequence "
                                         "{1} is equally similar to more than "
                                         "one template barcode. Unable to "
                                         "determine which partition to assign "
                                         "it to".format(seq_tag, header))
                    continue
                # Assign to template partition if within threshold distance
                if min_dist <= args.distance and args.distance:
                    seq_tag = min_tag

                # Verify sequence tag is in the list of provided barcodes
                try:
                    file_prefix = relevant_barcodes[seq_tag].id
                except KeyError:
                    unknowns += 1
                    if not args.force:
                        seq_io.print_warning("warning: barcode {0} in sequence {1}"
                                             " does not correspond to any of the "
                                             "template barcodes provided. The "
                                             "template ({2}) with the fewest "
                                             "mismatches is {3} nucleotides "
                                             "different. Use --force to write "
                                             "these records anyway".format(
                                                 seq_tag, header, min_tag,
                                                 min_dist))
                        continue

            # Write record to appropriate output file
            if not args.no_out:
                if file_prefix not in outfiles:
                    # Barcode not encountered previously, open new file(s)
                    if args.rhandle:
                        handle1 = io.TextIOWrapper(algo(
                            "{0}.forward.{1}{2}".format(
                                file_prefix, suffix, compression), mode='wb'))
                        open_handles.append(handle1)
                        handle2 = io.TextIOWrapper(algo(
                            "{0}.reverse.{1}{2}".format(
                                file_prefix, suffix, compression), mode='wb'))
                        open_handles.append(handle2)
                        write1, write2 = handle1.write, handle2.write
                    elif args.interleaved:
                        handle1 = io.TextIOWrapper(algo(
                            "{0}.interleaved.{1}{2}".format(
                                file_prefix, suffix, compression), mode='wb'))
                        open_handles.append(handle1)
                        write1 = write2 = handle1.write
                    else:
                        handle1 = io.TextIOWrapper(algo(
                            "{0}.{1}{2}".format(
                                file_prefix, suffix, compression), mode='wb'))
                        open_handles.append(handle1)
                        write1 = handle1.write
                        write2 = do_nothing
                    outfiles[file_prefix] = (write1, write2)

                write_forward, write_reverse = outfiles[file_prefix]
                write_forward(outf)
                write_reverse(outr)
    finally:
        # Flush and close every partition file, including on early exit
        for handle in open_handles:
            handle.close()

    # Verify input file non-empty before any statistics are computed on the
    # (otherwise empty) list of barcode abundances
    if processed_total == 0:
        seq_io.print_error("error: no sequences were found to process")

    # Write output histogram and sequence barcode statistics
    houtstats = "Sequence barcodes found:\t{!s}\n Mean abundance:\t"\
                "{:.2f}\n Median abundance:\t{:.2f}\n Abundance SD:\t"\
                "\t{:.2f}\n\n Barcode\tRun Information\tAbundance\n"
    num_seq_tags = len(sequence_barcodes)
    houtstats += " {!s}\t{!s}\t{!s}\n" * num_seq_tags

    barcode_abundances = []
    hstats_extra = []
    for seq_bar in sequence_barcodes:
        abund = seq_bar.count
        barcode_abundances.append(abund)
        hstats_extra += [seq_bar.sequence(), seq_bar.run_info(), abund]

    b_mean, b_median, b_sd = (mean(barcode_abundances),
                              median(barcode_abundances),
                              stdev(barcode_abundances))
    hstats = [num_seq_tags, b_mean, b_median, b_sd] + hstats_extra
    print(houtstats.format(*tuple(hstats)), file=sys.stderr)

    if args.hist:
        out_hist("#Total: {}\n#Mean: {:.2f}\n#Median: {:.2f}\n#STDev: "
                 "{:.2f}\n".format(num_seq_tags, b_mean, b_median, b_sd))
        # One row per distinct abundance: abundance, number of barcodes
        for abundance in sorted(set(barcode_abundances)):
            counts = barcode_abundances.count(abundance)
            out_hist("{0}\t{1}\n".format(abundance, counts))

    # Calculate and print output statistics. No partition files are created
    # under --no-out, in which case the partition count is zero.
    partitions_total = len(outfiles)
    stats = [processed_total, partitions_total]
    if stats_mode == 'matches':
        stats += [exact_total, mismatch_total, unknowns]
    elif stats_mode == 'unknowns':
        stats += [unknowns]
    print(outstats.format(*tuple(stats)), file=sys.stderr)

    # Calculate and print program run-time
    end_time = time()
    total_time = (end_time - start_time) / 60.0
    print("It took {:.2e} minutes to process {!s} records\n"
          .format(total_time, processed_total), file=sys.stderr)
def main():
    """Run the filter_replicates command-line program.

    Parses the command line, reads every input record, keeps one
    representative of each group of replicate reads (exact, 5'-prefix and
    optionally reverse-complement replicates), writes the unique records,
    logs the replicates found, and prints summary statistics and the total
    run-time to stderr.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'fhandle', metavar='in1.fast<q|a>', type=str, action=Open, mode='rb',
        default=sys.stdin,
        help="input reads in fastq or fasta format. Can be a file containing "
             "either single-end or forward/interleaved reads if reads are "
             "paired-end [required]")
    input_arg = parser.add_mutually_exclusive_group(required=False)
    input_arg.add_argument(
        '--interleaved', action='store_true',
        help="input is interleaved paired-end reads")
    input_arg.add_argument(
        '-r', '--reverse', dest='rhandle', metavar='in2.fast<q|a>',
        action=Open, mode='rb',
        help="input reverse reads")
    parser.add_argument(
        '-o', '--out', metavar='FILE', dest='out_f', type=str, action=Open,
        mode='wt', default=sys.stdout,
        help="output trimmed reads [default: stdout]")
    parser.add_argument(
        '-v', '--out-reverse', metavar='FILE', dest='out_r', type=str,
        action=Open, mode='wt',
        help="output reverse reads")
    parser.add_argument(
        '-f', '--format', metavar='FORMAT', dest='format', default='fastq',
        choices=['fasta', 'fastq'],
        help="sequence file format [default: fastq]. Available options are "
             "'fasta' or 'fastq'")
    parser.add_argument(
        '-l', '--log', type=str, action=Open, mode='wt',
        help="output log of replicate types")
    dup_args = parser.add_argument_group('replicate types')
    dup_args.add_argument(
        '--prefix', action='store_true',
        help="replicate can be a 5' prefix of another read")
    dup_args.add_argument(
        '--rev-comp', dest='rev_comp', action='store_true',
        help="replicate can be the reverse-complement of another read")
    parser.add_argument(
        '--reduce-memory', dest='mem_use', action='store_true',
        help="reduce the mount of memory that the program uses. This could "
             "result in a drastic increase in run-time")
    parser.add_argument(
        '--version', action='version',
        version='%(prog)s ' + __version__)
    args = parser.parse_args()
    all_args = sys.argv[1:]

    program_info('filter_replicates', all_args, __version__)

    # Track program run-time
    start_time = time()

    # Assign variables based on arguments supplied by the user
    out_f = args.out_f.write
    logger = args.log.write if args.log else do_nothing
    logger("#Replicate\tTemplate\tType\n")

    if args.mem_use:
        # zlib works on bytes, so quality strings are encoded before
        # compression and decoded again after decompression.
        # NOTE(review): assumes quality strings are text, like the sequences
        # that are .encode()d for hashing below -- confirm against the parser
        def compress(text):
            return zlib.compress(text.encode())

        def decompress(blob):
            return zlib.decompress(blob).decode()
    else:
        # `self` is defined elsewhere in the file; presumably a pass-through
        compress = decompress = self

    out_format = ">{0} {1}\n{2}\n" if args.format == "fasta" else \
        "@{0} {1}\n{2}\n+\n{3}\n"
    paired = bool(args.interleaved or args.rhandle)

    # Prepare the iterator based on dataset type
    iterator = read_iterator(args.fhandle, args.rhandle, args.interleaved,
                             args.format)

    # Assign variables based on dataset type (paired or single-end)
    if paired:
        print("Processing input as paired-end reads\n", file=sys.stderr)
        # Reverse reads share the forward output unless given their own file
        out_r = out_f if not args.out_r else args.out_r.write
    else:
        print("Processing input as single-end reads\n", file=sys.stderr)
        out_r = do_nothing

    # Iterate over the reads, storing only the unique records. Each stored
    # record is a tuple:
    #   (ids, descriptions, forward length, forward+reverse sequence, quality)
    # where quality is None for input without quality strings.
    uniques = {}
    records_total = 0
    for records_total, entry in enumerate(iterator, start=1):
        try:
            header = (entry.forward.id, entry.reverse.id)
            descs = (entry.forward.description, entry.reverse.description)
            fseq, rseq = (entry.forward.sequence, entry.reverse.sequence)
        except AttributeError:
            # Single-end record: use empty placeholders for the reverse read
            header = (entry.id, '')
            descs = (entry.description, '')
            fseq, rseq = (entry.sequence, '')
            try:
                qual = compress(entry.quality)
            except AttributeError:
                # Must be fasta format
                qual = None
        else:
            try:
                qual = compress(entry.forward.quality + entry.reverse.quality)
            except AttributeError:
                qual = None

        flen, rlen = len(fseq), len(rseq)
        # Descriptions are stored with the record so that each unique read is
        # written with its own description (the original reused whatever
        # description the last input record happened to have)
        record = (header, descs, flen, fseq + rseq, qual)

        # Use hash of full or prefixed sequence as a key for quick comparisons
        fsub, rsub = ((20, 20) if args.prefix else (flen, rlen))
        key = hashlib.md5((fseq[:fsub] + rseq[:rsub]).encode()).digest()

        # Search if replicate
        search_list = uniques.get(key)
        if search_list is not None:
            duplicate_key = key
            fquery, rquery = fseq, rseq
        elif args.rev_comp:
            # No match to the database found. Check the reverse complement
            if paired:
                fseq_rc, rseq_rc = reverse_complement_paired(fseq, rseq)
            else:
                fseq_rc, rseq_rc = reverse_complement(fseq), ''
            rckey = hashlib.md5((fseq_rc[:fsub] + rseq_rc[:rsub])
                                .encode()).digest()
            search_list = uniques.get(rckey)
            if search_list is None:
                # Not a replicate. Add to the database of uniques
                uniques[key] = [record]
                continue
            duplicate_key = rckey
            fquery, rquery = fseq_rc, rseq_rc
        else:
            # Not a replicate. Add to the database of uniques
            uniques[key] = [record]
            continue
        query_id = header[0]

        # Search through list of records with common key to see if the
        # sequence matches one that has been observed before
        duplicate = None
        for index, search_record in enumerate(search_list):
            # Get search sequences by splitting combined sequence on forward
            # length
            search_id = search_record[0][0]
            fsearch, rsearch = split_by_length(search_record[3],
                                               search_record[2])

            # Check replicate status of forward sequence
            fstatus = duplicate_status(fquery, fsearch)
            if not fstatus:
                continue
            # Check reverse only if forward a duplicate
            rstatus = duplicate_status(rquery, rsearch)
            if not rstatus:
                continue

            statuses = (fstatus, rstatus)
            if statuses == (1, 1):
                # Query is an exact match to a DB record
                duplicate_type = "exact"
                duplicate = query_id
                template = search_id
                break
            elif statuses in ((1, 3), (3, 1), (3, 3)):
                # Query is a prefix of a DB record
                duplicate_type = "prefix"
                duplicate = query_id
                template = search_id
                break
            elif statuses in ((1, 2), (2, 1), (2, 2)):
                # A DB record is a prefix of the query
                duplicate_type = "prefix"
                duplicate = search_id
                template = query_id
                # Replace old DB record with new
                search_list[index] = record
                break

        if duplicate:
            # Add rc to duplicate type if search_list from rev_comp
            if key != duplicate_key:
                duplicate_type = "rev-comp {}".format(duplicate_type)
            logger("{}\t{}\t{}\n".format(duplicate, template, duplicate_type))
        else:
            # record is definitely not a duplicate, so add to the list of
            # unique sequences with a common key
            search_list.append(record)

    # Make sure input file non-empty
    if records_total == 0:
        print_error("error: no sequences were found to process.")

    # Write unique records
    uniques_total = 0  # remaining records after dereplication
    for records in uniques.values():
        for header, descs, flen, seq, qual in records:
            uniques_total += 1
            fheader, rheader = header
            fdesc, rdesc = descs
            fseq, rseq = split_by_length(seq, flen)
            if qual is None:
                fqual = rqual = None
            else:
                fqual, rqual = split_by_length(decompress(qual), flen)

            # Quality is dropped from the arguments when absent, matching the
            # three-field fasta template
            out_f(out_format.format(
                *[i for i in (fheader, fdesc, fseq, fqual) if i is not None]))
            out_r(out_format.format(
                *[i for i in (rheader, rdesc, rseq, rqual) if i is not None]))

    # Calculate and print output statistics
    replicates_total = records_total - uniques_total
    print(
        "Records processed:\t{!s}\nUnique reads found:\t{!s} ({:.2%})\nReplicate reads found:\t{!s} "
        "({:.2%})\n".format(records_total, uniques_total,
                            uniques_total / records_total, replicates_total,
                            replicates_total / records_total),
        file=sys.stderr)

    # Calculate and print program run-time
    end_time = time()
    total_time = (end_time - start_time) / 60.0
    print("It took {:.2e} minutes to process {!s} records\n"
          .format(total_time, records_total), file=sys.stderr)