def fill(self):
    '''Tally UMI frequencies from the BAM and derive sampling weights.

    Iterates over ``self.inbam``, ignoring unmapped reads and read2
    alignments, and increments ``self.umis`` under the first element
    returned by ``self.barcode_getter(read)``. Reads for which a
    ``KeyError`` is raised are skipped.

    Sets ``self.frequency2umis`` (an empty ``defaultdict(list)``),
    ``self.umis_counter`` and ``self.prob`` (each count divided by the
    total count), logs the totals and finally calls
    ``self.refill_random()``.
    '''
    self.frequency2umis = collections.defaultdict(list)

    for read in self.inbam:
        if read.is_unmapped or read.is_read2:
            continue

        try:
            self.umis[self.barcode_getter(read)[0]] += 1
        except KeyError:
            # presumably raised when no barcode can be retrieved for the
            # read -- TODO confirm against barcode_getter
            continue

    self.umis_counter = collections.Counter(self.umis)
    counts = list(self.umis_counter.values())
    total_umis = sum(counts)

    U.info("total_umis %i" % total_umis)
    U.info("#umis %i" % len(self.umis_counter))

    # normalise the counts so that they sum to one
    self.prob = [float(count) / total_umis for count in counts]
    self.refill_random()
def breadth_first_search_recursive(node, adj_list):
    '''Search from *node* with ``recursive_search``, with a fallback.

    The starting set ``{node}`` is stored on the ``component`` attribute
    of ``recursive_search`` before it is called. If a ``RecursionError``
    is raised, the error is logged and the result of
    ``breadth_first_search(node, adj_list)`` is returned instead.
    '''
    try:
        recursive_search.component = {node}
        result = recursive_search(node, adj_list)
    except RecursionError as error:
        U.info('Recursion Error: %s' % error)
        result = breadth_first_search(node, adj_list)

    return result
def breadth_first_search_recursive(node, adj_list):
    '''Run ``recursive_search`` from *node*; fall back when it recurses
    too deeply.

    ``recursive_search.component`` is reset to a set holding only *node*
    before the recursive call. A ``RecursionError`` is logged via
    ``U.info`` and answered with ``breadth_first_search(node, adj_list)``.
    '''
    seed = set()
    seed.add(node)

    try:
        recursive_search.component = seed
        return recursive_search(node, adj_list)
    except RecursionError as error:
        message = 'Recursion Error: %s' % error
        U.info(message)

    return breadth_first_search(node, adj_list)
def getErrorCorrectMapping(cell_barcodes, whitelist, threshold=1):
    '''Map whitelisted ("true") cell barcodes to their erroneous variants.

    Each barcode in *cell_barcodes* that is not itself whitelisted is
    looked up in a BK-tree built over the whitelist. It is assigned to a
    whitelist barcode only when exactly one whitelist barcode lies within
    *threshold* mismatches; barcodes with no match or with several
    matches are dropped.

    Returns a ``defaultdict(set)`` keyed by whitelist barcode.
    '''

    # Unexpected results with cythonise hamming distance so redefine in
    # python here
    def hamming_distance(first, second):
        '''Count mismatching positions; infinite for unequal lengths.'''
        if len(first) != len(second):
            # only barcodes of the same length are comparable
            return np.inf
        return sum(a != b for a, b in zip(first, second))

    whitelist = {str(x) for x in whitelist}

    U.info('building bktree')
    tree2 = pybktree.BKTree(hamming_distance, whitelist)
    U.info('done building bktree')

    true_to_false = collections.defaultdict(set)

    for cell_barcode in cell_barcodes:
        if cell_barcode in whitelist:
            # already a true barcode, nothing to correct
            continue

        # whitelist members within the threshold (excluding exact hits)
        candidates = [
            white_cell
            for d, white_cell in tree2.find(cell_barcode, threshold)
            if d > 0]

        # zero candidates: cannot be associated with any whitelisted
        # barcode; several candidates: not uniquely assignable
        if len(candidates) == 1:
            true_to_false[candidates[0]].add(cell_barcode)

    return true_to_false
def main(argv=None):
    """Entry point: write a per-gene count table to ``options.stdout``.

    Command line options are parsed from ``sys.argv`` unless *argv* is
    given. For every gene yielded by ``umi_methods.get_gene_count_tab``
    the UMIs are grouped with a ``network.UMIClusterer`` and the number
    of groups is written as that gene's count.
    """
    if argv is None:
        argv = sys.argv

    # command line parser plus the common options (-h/--help, ...)
    parser = U.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])
    options, args = U.Start(
        parser, argv=argv, add_group_dedup_options=False)

    reads_in = 0
    counted = 0

    # how the UMI is pulled out of each read
    umi_getter = partial(
        umi_methods.get_umi_read_string, sep=options.umi_sep)

    options.stdout.write("%s\t%s\n" % ("gene", "count"))

    # clustering functor configured by options.method
    processor = network.UMIClusterer(options.method)

    gene_tables = umi_methods.get_gene_count_tab(
        options.stdin, umi_getter=umi_getter)

    for gene, counts in gene_tables:
        umis = counts.keys()
        reads_in += sum(counts.values())

        # one group per inferred molecule
        gene_count = len(
            processor(umis, counts, threshold=options.threshold))

        options.stdout.write("%s\t%i\n" % (gene, gene_count))
        counted += gene_count

    U.info("Number of reads counted: %i" % counted)

    U.Stop()
def __init__(self, options):
    '''Set up the UMI clusterer and, optionally, the UMI whitelist.

    ``self.UMIClusterer`` is always built from ``options.method``. When
    ``options.filter_umi`` is falsy ``self.umi_whitelist`` is ``None``.
    Otherwise it is the first element returned by
    ``whitelist_methods.getUserDefinedBarcodes`` and an empty
    ``self.umi_whitelist_counts`` counter is created as well.
    '''
    self.UMIClusterer = UMIClusterer(cluster_method=options.method)

    if not options.filter_umi:
        self.umi_whitelist = None
        return

    user_barcodes = whitelist_methods.getUserDefinedBarcodes(
        options.umi_whitelist,
        options.umi_whitelist_paired,
        deriveErrorCorrection=False)

    self.umi_whitelist = user_barcodes[0]
    self.umi_whitelist_counts = collections.Counter()
    U.info("Length of UMI whitelist: %i" % len(self.umi_whitelist))
def errorDetectAboveThreshold(cell_barcode_counts,
                              cell_whitelist,
                              true_to_false_map,
                              errors=1,
                              resolution_method="discard"):
    '''Detect whitelisted cell barcodes (CBs) that look like errors of
    another whitelisted CB and discard or correct them.

    The whitelist is sorted by ascending count and every CB is compared,
    via ``checkError``, against the CBs that follow it in that order
    (i.e. those with an equal or higher count).

    Parameters
    ----------
    cell_barcode_counts : mapping of CB -> count, used for the ordering.
    cell_whitelist : iterable of accepted CBs.
    true_to_false_map : mapping of true CB -> set of erroneous CBs, or
        None. It is deep-copied, never modified in place.
    errors : passed to ``checkError`` and used as the maximum number of
        substitutions when deciding whether a near miss is correctable.
    resolution_method : "discard" or "correct".

    Returns
    -------
    (cell_whitelist, new_true_to_false_map) where the whitelist is a set
    without the discarded CBs. Every CB with at least one near miss is
    removed from the whitelist; with ``resolution_method="correct"`` a CB
    with exactly one near miss reachable by substitutions only is also
    recorded in the map under that near miss.

    Raises
    ------
    AssertionError if *resolution_method* is not "discard" or "correct".
    '''

    assert resolution_method in ["discard", "correct"], (
        "resolution method must be discard or correct")

    error_counter = collections.Counter()

    new_true_to_false_map = copy.deepcopy(true_to_false_map)

    discard_cbs = set()

    cell_whitelist = sorted(
        cell_whitelist, key=lambda x: cell_barcode_counts[x])

    for ix, cb in enumerate(cell_whitelist):

        near_misses = checkError(cb, cell_whitelist[ix + 1:], errors=errors)

        if len(near_misses) == 0:
            continue

        discard_cbs.add(cb)  # Will always discard CB from cell_whitelist

        if resolution_method != "correct":
            continue

        if len(near_misses) > 1:
            # ambiguous: cannot tell which CB this one derives from.
            # This counter was previously never incremented, so the
            # summary below always reported zero.
            error_counter["error_discarded_mt_1"] += 1

        # Only correct substitutions as INDELs will also mess
        # up UMI so simple correction of CB is insufficient
        elif regex.match("(%s){s<=%i}" % (cb, errors), near_misses[0]):
            if new_true_to_false_map is None:
                # no mapping was supplied; start one on first correction
                new_true_to_false_map = collections.defaultdict(set)
            # add corrected barcode to T:F map
            new_true_to_false_map[near_misses[0]].add(cb)
            error_counter["substitution_corrected"] += 1

        else:
            error_counter["indel_discarded"] += 1

    if resolution_method == "correct":
        U.info("CBs above the knee corrected due to possible substitutions: %i" %
               error_counter["substitution_corrected"])
        U.info("CBs above the knee discarded due to possible INDELs: %i" %
               error_counter["indel_discarded"])
        U.info("CBs above the knee discarded due to possible errors from "
               "multiple other CBs: %i" %
               error_counter["error_discarded_mt_1"])
    else:
        U.info("CBs above the knee discarded due to possible errors: %i" %
               len(discard_cbs))

    cell_whitelist = set(cell_whitelist).difference(discard_cbs)

    return (cell_whitelist, new_true_to_false_map)
def close(self):
    '''Write mates for the remaining chromosome, then look for the mates
    of any alignments that are still unmatched, and close the output.

    For every ``(name, chrom, pos)`` entry in ``self.read1s`` the first
    read fetched at that position whose name and position match is
    written to ``self.outfile``.
    '''
    self.write_mates()

    U.info("Searching for mates for %i unmatched alignments" %
           len(self.read1s))

    found = 0
    for name, chrom, pos in self.read1s:
        # NOTE(review): reads are fetched from the output handle here --
        # confirm this is the intended source
        mate = next(
            (read
             for read in self.outfile.fetch(start=pos, end=pos+1, tid=chrom)
             if (read.query_name, read.pos) == (name, pos)),
            None)

        if mate is not None:
            self.outfile.write(mate)
            found += 1

    U.info("%i mates never found" % (len(self.read1s) - found))
    self.outfile.close()
def close(self):
    '''Flush outstanding mates, search for mates of the alignments still
    listed in ``self.read1s`` and close ``self.outfile``.

    Each entry of ``self.read1s`` is a ``(name, chrom, pos)`` triple; the
    first fetched read with the same name and position is written out.
    The number of entries for which no such read exists is logged.
    '''
    self.write_mates()

    U.info("Searching for mates for %i unmatched alignments" %
           len(self.read1s))

    found = 0

    for name, chrom, pos in self.read1s:
        wanted = (name, pos)
        candidates = self.outfile.fetch(start=pos, end=pos + 1, tid=chrom)

        for read in candidates:
            if (read.query_name, read.pos) != wanted:
                continue
            self.outfile.write(read)
            found += 1
            break

    never_found = len(self.read1s) - found
    U.info("%i mates never found" % never_found)

    self.outfile.close()
def main(argv=None):
    """Script main: count UMI groups per gene.

    Options are parsed from ``sys.argv`` unless *argv* is supplied. The
    output is a two column, tab separated table (``gene``, ``count``)
    written to ``options.stdout``.
    """
    argv = sys.argv if argv is None else argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False)

    n_input = n_output = 0

    # UMIClusterer functor with methods specific to options.method
    processor = network.UMIClusterer(options.method)

    # the method with which to extract umis from reads
    umi_getter = partial(umi_methods.get_umi_read_string,
                         sep=options.umi_sep)

    write = options.stdout.write
    write("%s\t%s\n" % ("gene", "count"))

    for gene, counts in umi_methods.get_gene_count_tab(
            options.stdin, umi_getter=umi_getter):

        n_input += sum(counts.values())

        # group the umis; the number of groups is the gene's count
        groups = processor(counts.keys(), counts,
                           threshold=options.threshold)
        n_groups = len(groups)

        write("%s\t%i\n" % (gene, n_groups))
        n_output += n_groups

    U.info("Number of reads counted: %i" % n_output)

    U.Stop()
def getCellWhitelist(cell_barcode_counts,
                     knee_method="distance",
                     expect_cells=False,
                     cell_number=False,
                     error_correct_threshold=0,
                     plotfile_prefix=None):
    '''Determine the cell barcode whitelist and, optionally, the mapping
    used to error-correct other barcodes onto it.

    *knee_method* selects ``getKneeEstimateDistance`` ("distance") or
    ``getKneeEstimateDensity`` ("density"); any other value raises
    ``ValueError``. The error-correction mapping is only computed when a
    non-empty whitelist was found and *error_correct_threshold* is
    greater than zero, otherwise ``None`` is returned in its place.

    Returns the tuple ``(cell_whitelist, true_to_false_map)``.
    '''
    if knee_method not in ("distance", "density"):
        raise ValueError("knee_method must be 'distance' or 'density'")

    if knee_method == "distance":
        cell_whitelist = getKneeEstimateDistance(
            cell_barcode_counts, cell_number, plotfile_prefix)
    else:
        cell_whitelist = getKneeEstimateDensity(
            cell_barcode_counts, expect_cells, cell_number, plotfile_prefix)

    U.info("Finished - whitelist determination")

    if not cell_whitelist or not error_correct_threshold > 0:
        return cell_whitelist, None

    U.info("Starting - finding putative error cell barcodes")
    true_to_false_map = getErrorCorrectMapping(
        cell_barcode_counts.keys(), cell_whitelist,
        error_correct_threshold)
    U.info("Finished - finding putative error cell barcodes")

    return cell_whitelist, true_to_false_map
def close(self):
    '''Write mates for the remaining chromosome, then rescan the input
    for the mates of alignments still listed in ``self.read1s``.

    The whole of ``self.infile`` is iterated with an independent
    iterator. Reads that are unmapped, have an unmapped mate or are read1
    are ignored. Any other read whose (name, reference, start) is in
    ``self.read1s`` is written out and removed from it. What is left in
    ``self.read1s`` afterwards is reported as never found, and the output
    file is closed.
    '''
    self.write_mates()

    U.info("Searching for mates for %i unmatched alignments" %
           len(self.read1s))

    found = 0
    all_reads = self.infile.fetch(until_eof=True, multiple_iterators=True)

    for read in all_reads:
        if read.is_unmapped or read.mate_is_unmapped or read.is_read1:
            continue

        key = (read.query_name, read.reference_name, read.reference_start)
        if key not in self.read1s:
            continue

        self.outfile.write(read)
        self.read1s.remove(key)
        found += 1

    U.info("%i mates never found" % len(self.read1s))
    self.outfile.close()
def main(argv=None):
    """Script main: pair up alignments and copy tags between mates.

    Parses command line options in sys.argv, unless *argv* is given.
    Alignments are read template by template (``chunk_bam``). For every
    alignment a mate is chosen with ``pick_mate``; the tags listed in
    ``--tags`` are copied from read1 to read2 with ``copy_tags`` and each
    distinct (read1, read2) pair is written once, read1 first. Alignments
    without a mate, or that are neither read1 nor read2, are skipped with
    a warning and counted.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])

    group = U.OptionGroup(parser, "RSEM preparation specific options")

    group.add_option(
        "--tags", dest="tags", type="string", default="UG,BX",
        help="Comma-seperated list of tags to transfer from read1 to read2")
    group.add_option("--sam", dest="sam", action="store_true",
                     default=False,
                     help="input and output SAM rather than BAM")

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False,
                              add_umi_grouping_options=False,
                              add_sam_options=False)

    skipped_stats = Counter()

    # pysam opens the files itself, so hand over names ("-" = std stream)
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        in_name = "-"

    mode = "" if options.sam else "b"

    inbam = pysam.AlignmentFile(in_name, "r" + mode)

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    outbam = pysam.AlignmentFile(out_name, "w" + mode, template=inbam)

    options.tags = options.tags.split(",")

    def describe(alignment):
        '''Tab separated summary of an alignment for warning messages.'''
        return "\t".join(map(str, [alignment.query_name,
                                   alignment.flag,
                                   alignment.reference_name,
                                   int(alignment.pos)]))

    for template in chunk_bam(inbam):

        assert len(set(r.query_name for r in template)) == 1

        # alignments of this template, split by read1/read2 and indexed
        # by (reference, position, is-primary)
        current_template = {True: defaultdict(list),
                            False: defaultdict(list)}

        for read in template:
            key = (read.reference_name, read.pos, not read.is_secondary)
            current_template[read.is_read1][key].append(read)

        output = set()

        for read in template:

            # First look for a read that has the same primary/secondary
            # status as read (i.e. secondary mate for secondary read, and
            # primary mate for primary read)
            # NOTE(review): the template keys above store
            # ``not read.is_secondary`` in the third field while this
            # lookup passes ``read.is_secondary``, which appears to search
            # the opposite status first. Left unchanged -- confirm against
            # pick_mate before altering.
            mate_key = (read.next_reference_name,
                        read.next_reference_start,
                        read.is_secondary)
            mate = pick_mate(read, current_template, mate_key)

            # If none was found then look for the opposite (primary mate
            # of secondary read or secondary mate of primary read)
            if mate is None:
                mate_key = (read.next_reference_name,
                            read.next_reference_start,
                            not read.is_secondary)
                mate = pick_mate(read, current_template, mate_key)

            # If we still don't have a mate, then there can't be one
            if mate is None:
                skipped_stats["no_mate"] += 1
                U.warn("Alignment {} has no mate -- skipped".format(
                    describe(read)))
                continue

            # because we might want to make changes to the read, but not
            # have those changes reflected if we need the read again, we
            # copy the read.
            read = pysam.AlignedSegment().from_dict(read.to_dict(),
                                                    read.header)
            mate = pysam.AlignedSegment().from_dict(mate.to_dict(),
                                                    read.header)

            # Make it so that if our read is secondary, the mate is also
            # secondary. We don't make the mate primary if the read is
            # primary because we would otherwise end up with multiple
            # primary alignments.
            if read.is_secondary:
                mate.is_secondary = True

            # Tags always flow from read1 to read2, and read1 is always
            # written first.
            if read.is_read1:
                mate = copy_tags(options.tags, read, mate)
                first, second = read, mate
            elif read.is_read2:
                read = copy_tags(options.tags, mate, read)
                first, second = mate, read
            else:
                skipped_stats["skipped_not_read12"] += 1
                U.warn("Alignment {} is neither read1 nor read2 -- skipped".
                       format(describe(read)))
                continue

            # In a situation where there is already one mate for each
            # read, we will come across each pair twice - once when we
            # scan read1 and once when we scan read2. Thus we need to make
            # sure we don't output something already output.
            output_key = str(first) + str(second)

            if output_key not in output:
                output.add(output_key)
                outbam.write(first)
                outbam.write(second)
                skipped_stats["pairs_output"] += 1

    if not out_name == "-":
        outbam.close()

    U.info("Total pairs output: {}, Pairs skipped - no mates: {},"
           " Pairs skipped - not read1 or 2: {}".format(
               skipped_stats["pairs_output"],
               skipped_stats["no_mate"],
               skipped_stats["skipped_not_read12"]))
    U.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Writes a tab separated table of UMI-group counts to
    ``options.stdout``: ``gene``/``count``, or ``cell``/``gene``/``count``
    when ``--per-cell`` is given. For every gene yielded by
    ``sam_methods.get_gene_count_tab`` and every cell within it, the UMI
    counts are grouped with a ``network.UMIClusterer`` and the number of
    groups is reported.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])

    group = U.OptionGroup(parser, "count_tab-specific options")

    group.add_option("--barcode-separator", dest="bc_sep",
                     type="string",
                     help="separator between read id and UMI "
                     " and (optionally) the cell barcode", default="_")

    group.add_option("--per-cell", dest="per_cell",
                     action="store_true",
                     help="Readname includes cell barcode as well as UMI in "
                     "format: read[sep]UMI[sep]CB")

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False,
                              add_sam_options=False)

    # NOTE(review): nInput is tallied but never reported
    nInput, nOutput = 0, 0

    # set the method with which to extract umis from reads and write the
    # matching header
    if options.per_cell:
        bc_getter = partial(
            sam_methods.get_cell_umi_read_string, sep=options.bc_sep)
        options.stdout.write("%s\t%s\t%s\n" % ("cell", "gene", "count"))
    else:
        bc_getter = partial(
            sam_methods.get_umi_read_string, sep=options.bc_sep)
        options.stdout.write("%s\t%s\n" % ("gene", "count"))

    # set up UMIClusterer functor with methods specific to
    # specified options.method
    processor = network.UMIClusterer(options.method)

    for gene, counts in sam_methods.get_gene_count_tab(
            options.stdin,
            bc_getter=bc_getter):

        for cell in counts.keys():
            umi_counts = counts[cell]

            nInput += sum(umi_counts.values())

            # group the umis
            groups = processor(umi_counts, threshold=options.threshold)

            gene_count = len(groups)
            if options.per_cell:
                options.stdout.write(
                    "%s\t%s\t%i\n" % (cell, gene, gene_count))
            else:
                options.stdout.write("%s\t%i\n" % (gene, gene_count))
            nOutput += gene_count

    U.info("Number of reads counted: %i" % nOutput)

    U.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from ``options.stdin`` (and ``--read2-in`` when
    paired), extracts the cell barcode and UMI of every read, counts
    reads or distinct UMIs per cell barcode (``--method``), derives the
    cell barcode whitelist with ``umi_methods.getCellWhitelist`` and
    writes one line per whitelisted barcode to ``options.stdout``:
    barcode, error-corrected barcodes, barcode count, counts of the
    error-corrected barcodes.

    Raises ValueError when ``--set-cell-number`` exceeds the number of
    observed cell barcodes.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--extract-method",
                      dest="extract_method", type="choice",
                      choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, Choose "
                            "from 'string' or 'regex'"))
    parser.add_option("--plot-prefix",
                      dest="plot_prefix", type="string",
                      help=("Prefix for plots to visualise the automated "
                            "detection of the number of 'true' cell barcodes"))
    parser.add_option("--subset-reads",
                      dest="subset_reads", type="int",
                      help=("Use the first N reads to automatically identify "
                            "the true cell barcodes. If N is greater than the "
                            "number of reads, all reads will be used"))
    parser.add_option("--error-correct-threshold",
                      dest="error_correct_threshold",
                      type="int",
                      help=("Hamming distance for correction of "
                            "barcodes to whitelist barcodes"))
    parser.add_option("--method",
                      dest="method",
                      choices=["reads", "umis"],
                      help=("Use reads or unique umi counts per cell"))
    parser.add_option("--expect-cells",
                      dest="expect_cells",
                      type="int",
                      help=("Prior expectation on the upper limit on the "
                            "number of cells sequenced"))
    parser.add_option("--set-cell-number",
                      dest="cell_number",
                      type="int",
                      help=("Specify the number of cell barcodes to accept"))
    parser.set_defaults(method="reads",
                        extract_method="string",
                        filter_cell_barcodes=False,
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        plot_prefix=None,
                        subset_reads=100000000,
                        expect_cells=False,
                        cell_number=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False,
                              add_sam_options=False)

    if options.expect_cells and options.cell_number:
        U.error("Cannot supply both --expect-cells and "
                "--set-cell-number options")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern2 "
                    "if paired-end ")

    if options.pattern2:
        if not options.read2_in:
            U.error("must specify a paired fastq ``--read2-in``")

        # NOTE(review): kept as it appears in the original; in this
        # position the condition can never be true -- confirm the intent
        if not options.pattern2:
            options.pattern2 = options.pattern

    extract_cell = False
    extract_umi = False

    if options.extract_method == "regex":
        # If the pattern is a regex we can compile the regex(es) prior to
        # ExtractFilterAndUpdate instantiation
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)

        if options.pattern2:
            # the second pattern is held in options.pattern2; there is no
            # ``barcode_regex2`` option defined by this parser
            try:
                options.pattern2 = regex.compile(options.pattern2)
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

        # check whether the regex contains a umi group(s) and cell groups(s)
        for compiled in (options.pattern, options.pattern2):
            if not compiled:
                continue
            for group in compiled.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

    elif options.extract_method == "string":
        # check whether the pattern string contains umi/cell bases
        for pattern in (options.pattern, options.pattern2):
            if not pattern:
                continue
            if "C" in pattern:
                extract_cell = True
            if "N" in pattern:
                extract_umi = True

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" % (
                        options.pattern, options.pattern2))

    if not extract_cell:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any cell bases "
                    "(marked with 'Cs') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any cell groups "
                    "(starting with 'cell_') %s, %s" % (
                        options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        method=options.extract_method,
        pattern=options.pattern,
        pattern2=options.pattern2,
        prime3=options.prime3,
        extract_cell=extract_cell)

    cell_barcode_counts = collections.Counter()

    n_reads = 0
    n_cell_barcodes = 0

    # if using the umis method, need to keep a set of umis observed
    if options.method == "umis":
        cell_barcode_umis = collections.defaultdict(set)

    # variables for progress monitor
    displayMax = 100000
    U.info("Starting barcode extraction")

    if not options.read2_in:
        for read1 in read1s:

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1
            barcode_values = ReadExtractor.getBarcodes(read1)
            if barcode_values is None:
                continue

            cell, umi, _, _, _, _, _ = barcode_values
            if options.method == "umis":
                cell_barcode_umis[cell].add(umi)
            else:
                cell_barcode_counts[cell] += 1
            n_cell_barcodes += 1

            # NOTE(review): single-end stops on the number of reads with
            # a barcode, paired-end (below) on the number of reads parsed
            if options.subset_reads:
                if n_cell_barcodes > options.subset_reads:
                    break
    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))
        for read1, read2 in izip(read1s, read2s):

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1

            barcode_values = ReadExtractor.getBarcodes(read1, read2)
            if barcode_values is None:
                continue

            cell, umi, _, _, _, _, _ = barcode_values
            if options.method == "umis":
                cell_barcode_umis[cell].add(umi)
            else:
                cell_barcode_counts[cell] += 1
            n_cell_barcodes += 1

            if options.subset_reads:
                if n_reads > options.subset_reads:
                    break

    U.info("Starting - whitelist determination")

    if options.method == "umis":
        for cell in cell_barcode_umis:
            cell_barcode_counts[cell] = len(cell_barcode_umis[cell])

    if options.cell_number and options.cell_number > len(cell_barcode_counts):
        raise ValueError(
            "--set-cell-number option specifies more cell barcodes than the "
            "number of observed cell barcodes. This may be because "
            "--subset-reads was set to a value too low to capture reads from "
            "all cells. %s cell barcodes observed from %s parsed reads. "
            "Expected>= %s cell barcodes" % (
                len(cell_barcode_counts),
                n_reads,
                options.cell_number))

    cell_whitelist, true_to_false_map = umi_methods.getCellWhitelist(
        cell_barcode_counts,
        options.expect_cells,
        options.cell_number,
        options.error_correct_threshold,
        options.plot_prefix)

    U.info("Writing out whitelist")
    for barcode in sorted(list(cell_whitelist)):

        if true_to_false_map:
            false_barcodes = sorted(true_to_false_map[barcode])
            corrected_barcodes = ",".join(false_barcodes)
            corrected_barcode_counts = ",".join(
                map(str, [cell_barcode_counts[x] for x in false_barcodes]))
        else:
            corrected_barcodes, corrected_barcode_counts = "", ""

        options.stdout.write("%s\t%s\t%s\t%s\n" % (
            barcode, corrected_barcodes, cell_barcode_counts[barcode],
            corrected_barcode_counts))

    U.info("Parsed %i reads" % n_reads)
    U.info("%i reads matched the barcode pattern" % n_cell_barcodes)
    U.info("Found %i unique cell barcodes" % len(cell_barcode_counts))

    U.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from ``options.stdin`` (and ``--read2-in`` when
    paired), extracts the cell barcode and UMI of every read, counts
    reads or distinct UMIs per cell barcode (``--method``) and derives
    the cell barcode whitelist with
    ``whitelist_methods.getCellWhitelist``. With ``--ed-above-threshold``
    whitelisted barcodes that look like errors of other whitelisted
    barcodes are discarded or corrected. One line per whitelisted barcode
    is written to ``options.stdout``: barcode, error-corrected barcodes,
    barcode count, counts of the error-corrected barcodes. Reads without
    a barcode match are written to the ``filtered_out`` files when those
    options are set.

    Raises ValueError when ``--set-cell-number`` exceeds the number of
    observed cell barcodes.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])

    group = U.OptionGroup(parser, "whitelist-specific options")

    group.add_option("--plot-prefix",
                     dest="plot_prefix", type="string",
                     help=("Prefix for plots to visualise the automated "
                           "detection of the number of 'true' cell barcodes"))
    group.add_option("--subset-reads",
                     dest="subset_reads", type="int",
                     help=("Use the first N reads to automatically identify "
                           "the true cell barcodes. If N is greater than the "
                           "number of reads, all reads will be used. "
                           "Default is 100,000,000"))
    group.add_option("--error-correct-threshold",
                     dest="error_correct_threshold",
                     type="int",
                     help=("Hamming distance for correction of barcodes to "
                           "whitelist barcodes. This value will also be used "
                           "for error detection above the knee if required "
                           "(--ed-above-threshold)"))
    group.add_option("--method",
                     dest="method",
                     choices=["reads", "umis"],
                     help=("Use reads or unique umi counts per cell"))
    group.add_option("--knee-method",
                     dest="knee_method",
                     choices=["distance", "density"],
                     help=("Use distance or density methods for detection of knee"))
    group.add_option("--expect-cells",
                     dest="expect_cells",
                     type="int",
                     help=("Prior expectation on the upper limit on the "
                           "number of cells sequenced"))
    group.add_option("--allow-threshold-error",
                     dest="allow_threshold_error", action="store_true",
                     help=("Don't select a threshold. Will still "
                           "output the plots if requested (--plot-prefix)"))
    group.add_option("--set-cell-number",
                     dest="cell_number",
                     type="int",
                     help=("Specify the number of cell barcodes to accept"))

    parser.add_option("--ed-above-threshold",
                      dest="ed_above_threshold", type="choice",
                      choices=["discard", "correct"],
                      help=("Detect CBs above the threshold which may be "
                            "sequence errors from another CB and either "
                            "'discard' or 'correct'. Default=discard"))
    parser.add_option_group(group)

    parser.set_defaults(method="reads",
                        knee_method="distance",
                        extract_method="string",
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        plot_prefix=None,
                        subset_reads=100000000,
                        expect_cells=False,
                        allow_threshold_error=False,
                        cell_number=False,
                        ed_above_threshold=None,
                        ignore_suffix=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_extract_options=True,
                              add_group_dedup_options=False,
                              add_umi_grouping_options=False,
                              add_sam_options=False)

    if options.filtered_out and not options.extract_method == "regex":
        U.error("Reads will not be filtered unless extract method is "
                "set to regex (--extract-method=regex)")

    if options.expect_cells:
        if options.knee_method == "distance":
            U.error("Cannot use --expect-cells with 'distance' knee "
                    "method. Switch to --knee-method=density if you want to "
                    "provide an expectation for the number of "
                    "cells. Alternatively, if you know the number of cell "
                    "barcodes, use --set-cell-number")
        if options.cell_number:
            U.error("Cannot supply both --expect-cells and "
                    "--set-cell-number options")

    extract_cell, extract_umi = U.validateExtractOptions(options)

    if not extract_cell:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any cell bases "
                    "(marked with 'Cs') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any cell groups "
                    "(starting with 'cell_') %s, %s" % (
                        options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = extract_methods.ExtractFilterAndUpdate(
        method=options.extract_method,
        pattern=options.pattern,
        pattern2=options.pattern2,
        prime3=options.prime3,
        extract_cell=extract_cell)

    cell_barcode_counts = collections.Counter()

    n_reads = 0
    n_cell_barcodes = 0

    # if using the umis method, need to keep a set of umis observed
    if options.method == "umis":
        cell_barcode_umis = collections.defaultdict(set)

    # variables for progress monitor
    displayMax = 100000
    U.info("Starting barcode extraction")

    # handles stay None unless the corresponding file is really opened,
    # so the clean-up below never touches an undefined name
    filtered_out = None
    filtered_out2 = None

    if options.filtered_out:
        filtered_out = U.openFile(options.filtered_out, "w")

    if not options.read2_in:
        for read1 in read1s:

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1
            barcode_values = ReadExtractor.getBarcodes(read1)
            if barcode_values is None:
                if filtered_out is not None:
                    filtered_out.write(str(read1) + "\n")
                continue

            cell, umi, _, _, _, _, _ = barcode_values
            if options.method == "umis":
                cell_barcode_umis[cell].add(umi)
            else:
                cell_barcode_counts[cell] += 1
            n_cell_barcodes += 1

            # NOTE(review): single-end stops on the number of reads with
            # a barcode, paired-end (below) on the number of reads parsed
            if options.subset_reads:
                if n_cell_barcodes > options.subset_reads:
                    break
    else:
        if options.filtered_out2:
            filtered_out2 = U.openFile(options.filtered_out2, "w")

        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))
        for read1, read2 in izip(read1s, read2s):

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1

            barcode_values = ReadExtractor.getBarcodes(read1, read2)
            if barcode_values is None:
                if filtered_out is not None:
                    filtered_out.write(str(read1) + "\n")
                if filtered_out2 is not None:
                    filtered_out2.write(str(read2) + "\n")
                continue

            cell, umi, _, _, _, _, _ = barcode_values
            if options.method == "umis":
                cell_barcode_umis[cell].add(umi)
            else:
                cell_barcode_counts[cell] += 1
            n_cell_barcodes += 1

            if options.subset_reads:
                if n_reads > options.subset_reads:
                    break

    # nothing is written to the filtered-out files after this point;
    # close them now so they are complete even if an error is raised
    # further down
    if filtered_out is not None:
        filtered_out.close()
    if filtered_out2 is not None:
        filtered_out2.close()

    U.info("Starting - whitelist determination")

    if options.method == "umis":
        for cell in cell_barcode_umis:
            cell_barcode_counts[cell] = len(cell_barcode_umis[cell])

    if options.cell_number and options.cell_number > len(cell_barcode_counts):
        raise ValueError(
            "--set-cell-number option specifies more cell barcodes than the "
            "number of observed cell barcodes. This may be because "
            "--subset-reads was set to a value too low to capture reads from "
            "all cells. %s cell barcodes observed from %s parsed reads. "
            "Expected>= %s cell barcodes" % (
                len(cell_barcode_counts),
                n_reads,
                options.cell_number))

    cell_whitelist, true_to_false_map = whitelist_methods.getCellWhitelist(
        cell_barcode_counts,
        options.knee_method,
        options.expect_cells,
        options.cell_number,
        options.error_correct_threshold,
        options.plot_prefix)

    if cell_whitelist:
        U.info("Top %s cell barcodes passed the selected threshold" %
               len(cell_whitelist))

    # without a whitelist there is nothing to check, and the message
    # about the missing threshold below should be reached instead
    if cell_whitelist and options.ed_above_threshold:
        cell_whitelist, true_to_false_map = whitelist_methods.errorDetectAboveThreshold(
            cell_barcode_counts,
            cell_whitelist,
            true_to_false_map,
            errors=options.error_correct_threshold,
            resolution_method=options.ed_above_threshold)

    total_correct_barcodes = 0
    total_corrected_barcodes = 0

    if cell_whitelist:
        U.info("Writing out whitelist")

        for barcode in sorted(list(cell_whitelist)):

            total_correct_barcodes += cell_barcode_counts[barcode]

            if true_to_false_map:
                false_barcodes = sorted(true_to_false_map[barcode])
                corrected_barcodes = ",".join(false_barcodes)

                correct_barcode_counts = [
                    cell_barcode_counts[x] for x in false_barcodes]
                total_corrected_barcodes += sum(correct_barcode_counts)

                corrected_barcode_counts = ",".join(
                    map(str, correct_barcode_counts))
            else:
                corrected_barcodes, corrected_barcode_counts = "", ""

            options.stdout.write("%s\t%s\t%s\t%s\n" % (
                barcode, corrected_barcodes, cell_barcode_counts[barcode],
                corrected_barcode_counts))
    else:
        msg = ("No local minima was accepted. Recommend checking the plot "
               "output and counts per local minima (requires `--plot-prefix` "
               "option) and then re-running with manually selected threshold "
               "(`--set-cell-number` option)")

        if options.allow_threshold_error:
            U.info(msg)
        else:
            U.error(msg)

    U.info("Parsed %i reads" % n_reads)
    U.info("%i reads matched the barcode pattern" % n_cell_barcodes)
    U.info("Found %i unique cell barcodes" % len(cell_barcode_counts))

    if cell_whitelist:
        U.info("Found %i total reads matching the selected cell barcodes" %
               total_correct_barcodes)
        U.info("Found %i total reads which can be error corrected to the "
               "selected cell barcodes" % total_corrected_barcodes)

    U.Stop()
def main(argv=None):
    """Group reads by UMI and write tagged reads and/or a read-group table.

    Parses command line options in sys.argv, unless *argv* is given.

    Reads are bundled with ``umi_methods.get_bundles``, the UMIs of each
    bundle are clustered with ``network.UMIClusterer`` and every read is
    assigned the running id of its UMI group. If ``--output-bam`` is set the
    reads are written to a SAM/BAM file tagged with ``UG`` (group id) and the
    ``--umi-group-tag`` tag (first UMI of the group). If ``--group-out`` is
    set, one tab-separated row per read is written to that file.

    Raises:
        ValueError: if the input is standard in, if ``--per-gene`` is given
            without ``--gene-transcript-map``, or if the UMI extraction
            method is unknown.
        AssertionError: if output is redirected to a file without
            ``--output-bam``.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option(
        "-o", "--out-sam", dest="out_sam", action="store_true",
        help="Output alignments in sam format [default=%default]",
        default=False)
    parser.add_option("--umi-separator", dest="umi_sep", type="string",
                      help="separator between read id and UMI", default="_")
    parser.add_option("--umi-tag", dest="umi_tag", type="string",
                      help="tag containing umi", default='RX')
    parser.add_option("--umi-group-tag", dest="umi_group_tag", type="string",
                      help="tag for the outputted umi group", default='BX')
    parser.add_option("--extract-umi-method", dest="get_umi_method",
                      type="choice", choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? [default=%default]")
    parser.add_option("--subset", dest="subset", type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]", default=False)
    parser.add_option("--soft-clip-threshold", dest="soft", type="float",
                      help="number of bases clipped from 5' end before"
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int", default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome", default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False, help="paired BAM. [default=%default]")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional", "unique",
                               "cluster"),
                      default="directional",
                      help="method to use for umi deduping [default=%default]")
    parser.add_option("--per-contig", dest="per_contig", action="store_true",
                      default=False,
                      help=("dedup per contig (field 3 in BAM; RNAME),"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--per-gene", dest="per_gene", action="store_true",
                      default=False,
                      help=("Deduplicate per gene,"
                            "e.g for transcriptome where contig = transcript"
                            "must also provide a transript to gene map with"
                            "--gene-transcript-map [default=%default]"))
    parser.add_option("--gene-transcript-map", dest="gene_transcript_map",
                      type="string",
                      help="file mapping transcripts to genes (tab separated)",
                      default=None)
    parser.add_option("--gene-tag", dest="gene_tag", type="string",
                      help=("Deduplicate per gene where gene is"
                            "defined by this bam tag [default=%default]"),
                      default=None)
    parser.add_option(
        "--read-length", dest="read_length", action="store_true",
        default=False,
        help=("use read length in addition to position and UMI"
              "to identify possible duplicates [default=%default]"))
    parser.add_option("--mapping-quality", dest="mapping_quality", type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]", default=0)
    parser.add_option(
        "--output-unmapped", dest="output_unmapped", action="store_true",
        default=False,
        help=("Retain all unmapped reads in output[default=%default]"))
    parser.add_option(
        "--group-out", dest="tsv", type="string",
        help="Outfile name for file mapping read id to read group",
        default=None)
    parser.add_option(
        "--output-bam", dest="output_bam", action="store_true", default=False,
        help=("output a bam file with read groups tagged using the UG tag"
              "[default=%default]"))
    parser.add_option(
        "--skip-tags-regex", dest="skip_regex", type="string",
        help=("Used with --gene-tag. "
              "Ignore reads where the gene-tag matches this regex"),
        default="^[__|Unassigned]")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    # pysam opens the files by name, so the handles opened by U.Start are
    # closed once their names have been recorded
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    if options.per_gene:
        if not options.gene_transcript_map:
            raise ValueError(
                "--per-gene option requires --gene-transcript-map")

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join([
            "read_id", "contig", "position", "gene", "umi", "umi_count",
            "final_umi", "final_umi_count", "unique_id"]))

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    nInput, nOutput, unique_id = 0, 0, 0

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
        gene_tag = options.gene_tag
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag
        else:
            inreads = infile.fetch(until_eof=options.output_unmapped)
            gene_tag = options.gene_tag

    # read_events is only bound by the loop below; start from None so that an
    # input yielding no bundles does not raise NameError at the final report
    read_events = None

    for bundle, read_events, status in umi_methods.get_bundles(
            inreads,
            ignore_umi=False,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            gene_tag=gene_tag,
            skip_regex=options.skip_regex,
            read_length=options.read_length,
            umi_getter=umi_getter,
            all_reads=True,
            return_read2=True,
            return_unmapped=options.output_unmapped):

        # write out read2s and unmapped if option set
        if status == 'single_read':
            # bundle is just a single read here
            nInput += 1

            # outfile is None unless --output-bam was given (e.g. when only
            # --group-out is requested), so the write must be guarded
            if outfile:
                outfile.write(bundle)

            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        # set up UMIClusterer functor with methods specific to
        # specified options.method
        processor = network.UMIClusterer(options.method)

        # group the umis
        groups = processor(umis, counts, threshold=options.threshold)

        for umi_group in groups:
            # the first UMI of a group is reported as the group's final UMI
            top_umi = umi_group[0]
            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.tags += [('UG', unique_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        mapping_outfile.write("%s\n" % "\t".join(
                            map(str, (
                                read.query_name,
                                read.reference_name,
                                umi_methods.get_read_position(
                                    read, options.soft)[1],
                                gene,
                                umi.decode(),
                                counts[umi],
                                top_umi.decode(),
                                group_count,
                                unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    if read_events is not None:
        U.info(
            "Reads: %s" % ", ".join(["%s: %s" % (x[0], x[1]) for x in
                                     read_events.most_common()]))

    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()
def main(argv=None):
    """Extract UMI (and optionally cell) barcodes from fastq reads.

    Parses command line options in sys.argv, unless *argv* is given.

    Read 1 is taken from ``options.stdin``; read 2 (``--read2-in``) is
    optional. Each read (pair) is passed through
    ``umi_methods.ExtractFilterAndUpdate`` and the processed reads are written
    to ``options.stdout`` and, for paired input, optionally to
    ``--read2-out``. Option inconsistencies are reported through ``U.error``.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--read2-out", dest="read2_out", type="string",
                      help="file to output processed paired read to")
    parser.add_option("--read2-stdout", dest="read2_stdout",
                      action="store_true",
                      help="Paired reads, send read2 to stdout, discarding read1")
    parser.add_option("--quality-filter-threshold",
                      dest="quality_filter_threshold", type="int",
                      help=("Remove reads where any UMI base quality score "
                            "falls below this threshold"))
    parser.add_option("--quality-filter-mask",
                      dest="quality_filter_mask", type="int",
                      help=("If a UMI base has a quality below this threshold, "
                            "replace the base with 'N'"))
    parser.add_option("--quality-encoding",
                      dest="quality_encoding", type="choice",
                      choices=["phred33", "phred64", "solexa"],
                      help=("Quality score encoding. Choose from 'phred33'"
                            "[33-77] 'phred64' [64-106] or 'solexa' [59-106]"))
    parser.add_option("--extract-method",
                      dest="extract_method", type="choice",
                      choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, Choose "
                            "from 'string' or 'regex'"))
    parser.add_option("--filter-cell-barcode",
                      dest="filter_cell_barcode", action="store_true",
                      help="Filter the cell barcodes")
    parser.add_option("--error-correct-cell",
                      dest="error_correct_cell", action="store_true",
                      help=("Correct errors in the cell barcode"))
    parser.add_option("--whitelist",
                      dest="whitelist", type="string",
                      help=("A whitelist of accepted cell barcodes"))
    parser.add_option("--blacklist",
                      dest="blacklist", type="string",
                      help=("A blacklist of rejected cell barcodes"))
    parser.add_option("--reads-subset",
                      dest="reads_subset", type="int",
                      help=("Only extract from the first N reads. If N is "
                            "greater than the number of reads, all reads will "
                            "be used"))
    parser.add_option("--reconcile-pairs",
                      dest="reconcile", action="store_true",
                      help=("Allow the presences of reads in read2 input that are"
                            "not present in read1 input. This allows cell barcode"
                            "filtering of read1s without considering read2s"))

    # the default key must match the option's dest ("filter_cell_barcode"),
    # otherwise the default lands on an attribute no option ever sets
    parser.set_defaults(extract_method="string",
                        filter_cell_barcode=False,
                        whitelist=None,
                        blacklist=None,
                        error_correct_cell=False,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=False,
                        read2_stdout=False,
                        quality_filter_threshold=None,
                        quality_encoding=None,
                        reconcile=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False,
                              add_sam_options=False)

    if options.quality_filter_threshold or options.quality_filter_mask:
        if not options.quality_encoding:
            U.error("must provide a quality encoding (--quality-"
                    "encoding) to filter UMIs by quality (--quality"
                    "-filter-threshold) or mask low quality bases "
                    "with (--quality-filter-mask)")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern2 "
                    "if paired-end ")

    if options.pattern2:
        if not options.read2_in:
            U.error("must specify a paired fastq ``--read2-in``")

        # NOTE(review): this condition cannot hold inside
        # `if options.pattern2`. The nesting of this statement is ambiguous in
        # the source; confirm whether a fallback of pattern2 to pattern was
        # intended before moving it out of this branch.
        if not options.pattern2:
            options.pattern2 = options.pattern

    extract_cell = False
    extract_umi = False

    # If the pattern is a regex we can compile the regex(es) prior to
    # ExtractFilterAndUpdate instantiation
    if options.extract_method == "regex":
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)

        if options.pattern2:
            try:
                options.pattern2 = regex.compile(options.pattern2)
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

    # check whether the regex contains a umi group(s) and cell groups(s)
    if options.extract_method == "regex":
        if options.pattern:
            for group in options.pattern.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True
        if options.pattern2:
            for group in options.pattern2.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

    # check whether the pattern string contains umi/cell bases
    elif options.extract_method == "string":
        if options.pattern:
            if "C" in options.pattern:
                extract_cell = True
            if "N" in options.pattern:
                extract_umi = True
        if options.pattern2:
            if "C" in options.pattern2:
                extract_cell = True
            if "N" in options.pattern2:
                extract_umi = True

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" % (
                        options.pattern, options.pattern2))

    if options.filter_cell_barcode:

        if not options.whitelist:
            U.error("must provide a whitelist (--whitelist) if using "
                    "--filter-cell-barcode option")

        if not extract_cell:
            if options.extract_method == "string":
                U.error("barcode pattern(s) do not include any cell bases "
                        "(marked with 'Cs') %s, %s" % (
                            options.pattern, options.pattern2))
            elif options.extract_method == "regex":
                U.error("barcode regex(es) do not include any cell groups "
                        "(starting with 'cell_') %s, %s" % (
                            options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        options.extract_method,
        options.pattern,
        options.pattern2,
        options.prime3,
        extract_cell,
        options.quality_encoding,
        options.quality_filter_threshold,
        options.quality_filter_mask,
        options.filter_cell_barcode)

    if options.filter_cell_barcode:
        cell_whitelist, false_to_true_map = umi_methods.getUserDefinedBarcodes(
            options.whitelist, options.error_correct_cell)

        ReadExtractor.cell_whitelist = cell_whitelist
        ReadExtractor.false_to_true_map = false_to_true_map

    if options.blacklist:
        blacklist = set()
        with U.openFile(options.blacklist, "r") as inf:
            for line in inf:
                # only the first tab-separated field of each line is used
                blacklist.add(line.strip().split("\t")[0])
        ReadExtractor.cell_blacklist = blacklist

    # variables for progress monitor
    progCount = 0
    displayMax = 100000
    U.info("Starting barcode extraction")

    if options.read2_in is None:
        for read in read1s:

            # incrementing count for monitoring progress
            progCount += 1

            # Update display in every 100kth iteration
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))

            new_read = ReadExtractor(read)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            # a falsy result means the read was filtered out
            if not new_read:
                continue

            options.stdout.write(str(new_read) + "\n")

    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))

        if options.read2_out:
            read2_out = U.openFile(options.read2_out, "w")

        if options.reconcile:
            strict = False
        else:
            strict = True

        for read1, read2 in umi_methods.joinedFastqIterate(
                read1s, read2s, strict):

            # incrementing count for monitoring progress
            progCount += 1

            # Update display in every 100kth iteration
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))
                sys.stdout.flush()

            reads = ReadExtractor(read1, read2)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not reads:
                continue
            else:
                new_read1, new_read2 = reads

            if options.read2_stdout:
                options.stdout.write(str(new_read2) + "\n")
            else:
                options.stdout.write(str(new_read1) + "\n")

                if options.read2_out:
                    read2_out.write(str(new_read2) + "\n")

        # read2_out is only opened in this branch, so it is closed here
        if options.read2_out:
            read2_out.close()

    for k, v in ReadExtractor.getReadCounts().most_common():
        U.info("%s: %s" % (k, v))

    U.Stop()
def main(argv=None):
    """Entry point of the grouping command.

    Options are taken from sys.argv unless an explicit *argv* is supplied.

    Reads are bundled, the UMIs of every bundle are clustered, and each read
    receives the running id of its UMI group. With ``--output-bam`` the reads
    are written out tagged with ``UG`` and the ``--umi-group-tag`` tag (and
    sorted unless ``no_sort_output`` is set); with ``--group-out`` one
    tab-separated row per read is written.
    """
    if argv is None:
        argv = sys.argv

    # build the command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group = U.OptionGroup(parser, "group-specific options")

    group.add_option(
        "--group-out", dest="tsv", type="string", default=None,
        help="Outfile name for file mapping read id to read group")
    group.add_option(
        "--output-bam", dest="output_bam", action="store_true",
        default=False,
        help=("output a bam file with read groups tagged using the UG tag"
              "[default=%default]"))
    group.add_option(
        "--output-unmapped", dest="output_unmapped", action="store_true",
        default=False,
        help=("Retain all unmapped reads in output[default=%default]"))

    parser.add_option(
        "--umi-group-tag", dest="umi_group_tag", type="string", default='BX',
        help="tag for the outputted umi group")

    parser.add_option_group(group)

    # common options (-h/--help, ...) are added by U.Start before parsing
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options)

    # --- resolve input/output file names ---------------------------------
    if options.stdin == sys.stdin:
        raise ValueError("Input on standard in not currently supported")
    in_name = options.stdin.name
    options.stdin.close()

    writes_to_file = options.stdout != sys.stdout
    destination = options.stdout.name if writes_to_file else "-"

    if options.no_sort_output:
        out_name = destination
    else:
        # unsorted output goes to a temporary file that is sorted into the
        # real destination at the end
        out_name = U.getTempFilename()
        sorted_out_name = destination

    if writes_to_file:
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")

    if not options.no_sort_output:
        # need to determine the output format for sort
        sort_format = "sam" if options.out_sam else "bam"

    in_mode = "r" if options.in_sam else "rb"
    out_mode = "wh" if options.out_sam else "wb"

    infile = pysam.Samfile(in_name, in_mode)
    outfile = (pysam.Samfile(out_name, out_mode, template=infile)
               if options.output_bam else None)

    if options.tsv:
        header = ("read_id", "contig", "position", "gene", "umi",
                  "umi_count", "final_umi", "final_umi_count", "unique_id")
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("\t".join(header) + "\n")

    # running totals, plus the thresholds already reported in the log
    reads_in = 0
    reads_out = 0
    group_id = 0
    logged_in = 0
    logged_out = 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    # --- choose the read source ------------------------------------------
    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    elif options.per_gene and options.gene_transcript_map:
        metacontig2contig = umi_methods.getMetaContig2contig(
            infile, options.gene_transcript_map)
        metatag = "MC"
        inreads = umi_methods.metafetcher(infile, metacontig2contig, metatag)
        gene_tag = metatag
    else:
        inreads = infile.fetch(until_eof=options.output_unmapped)

    bundle_iterator = umi_methods.get_bundles(
        options,
        all_reads=True,
        return_read2=True,
        return_unmapped=options.output_unmapped,
        metacontig_contig=metacontig2contig)

    for bundle, key, status in bundle_iterator(inreads):

        # read2s and unmapped reads (if requested) are passed straight through
        if status == 'single_read':
            # here the "bundle" is a single read
            reads_in += 1
            if outfile:
                outfile.write(bundle)
            reads_out += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        reads_in += sum(counts.values())

        # report progress once for every threshold crossed
        while reads_out >= logged_out + 10000:
            logged_out += 10000
            U.info("Written out %i reads" % logged_out)

        while reads_in >= logged_in + 1000000:
            logged_in += 1000000
            U.info("Parsed %i input reads" % logged_in)

        # the clusterer is configured for the method chosen in options.method
        processor = network.UMIClusterer(options.method)

        # cluster the UMIs of this bundle into groups
        groups = processor(umis, counts, threshold=options.threshold)

        for umi_group in groups:
            top_umi = umi_group[0]
            group_count = sum(counts[member] for member in umi_group)

            for umi in umi_group:
                for read in bundle[umi]['read']:
                    if outfile:
                        # tag the read with its group id and group UMI
                        read.tags += [('UG', group_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        gene = (read.get_tag(gene_tag)
                                if options.per_gene else "NA")
                        row = (
                            read.query_name,
                            read.reference_name,
                            umi_methods.get_read_position(
                                read, options.soft_clip_threshold)[1],
                            gene,
                            umi.decode(),
                            counts[umi],
                            top_umi.decode(),
                            group_count,
                            group_id)
                        mapping_outfile.write(
                            "\t".join(str(field) for field in row) + "\n")

                    reads_out += 1

            group_id += 1

    if outfile:
        outfile.close()
        if not options.no_sort_output:
            # sort the temporary output into its final destination, then
            # remove the temporary file
            pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
            os.unlink(out_name)

    if options.tsv:
        mapping_outfile.close()

    # footer: per-event read counts followed by the totals
    event_summary = ", ".join(
        "%s: %s" % (event, count)
        for event, count in bundle_iterator.read_events.most_common())
    U.info("Reads: %s" % event_summary)

    U.info("Number of reads out: %i, Number of groups: %i" %
           (reads_out, group_id))

    U.Stop()
def main(argv=None):
    """Deduplicate reads by position and UMI, optionally writing statistics.

    Parses command line options in sys.argv, unless *argv* is given.

    Bundles of reads are obtained from ``get_bundles``. With ``--ignore-umi``
    one read per bundle entry is written out; otherwise each bundle is reduced
    with ``ClusterAndReducer`` and the selected reads are written. When
    ``--output-stats`` is given, per-UMI counts and average UMI edit distances
    (observed and for randomly drawn UMIs) are collected before and after
    deduplication and written to ``<stats>_per_umi_per_position.tsv``,
    ``<stats>_per_umi.tsv`` and ``<stats>_edit_distance.tsv``;
    ``--further-stats`` additionally writes ``<stats>_topologies.tsv`` and
    ``<stats>_nodes.tsv``.

    Raises:
        ValueError: if the input is standard in, if incompatible stats
            options are combined, or if the requested multimapping detection
            tag is not available for the input file.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format", default=False)
    parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true",
                      help="Output alignments in sam format", default=False)
    parser.add_option("--ignore-umi", dest="ignore_umi", action="store_true",
                      help="Ignore UMI and dedup only on position",
                      default=False)
    parser.add_option("--subset", dest="subset", type="string",
                      help="Use only a fraction of reads, specified by subset",
                      default=1.1)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one", default=False)
    parser.add_option("--soft-clip-threshold", dest="soft", type="float",
                      help="number of bases clipped from 5' end before"
                      "read os counted as spliced", default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int",
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering", default=1)
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome", default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="Use second-in-pair position when deduping")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional-adjacency",
                               "percentile", "unique", "cluster"),
                      default="directional-adjacency",
                      help="method to use for umi deduping")
    parser.add_option("--output-stats", dest="stats", type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option("--further-stats", dest="further_stats",
                      action="store_true", default=False,
                      help="Output further stats")
    parser.add_option("--per-contig", dest="per_contig", action="store_true",
                      default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option(
        "--whole-contig", dest="whole_contig", action="store_true",
        default=False,
        help="Read whole contig before outputting bundles: guarantees that no reads"
        "are missed, but increases memory usage")
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method", type="choice",
                      choices=("NH", "X0", "XT"),
                      default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi"))
    parser.add_option("--mapping-quality", dest="mapping_quality", type="int",
                      help="Minimum mapping quality for a read to be retained",
                      default=0)
    parser.add_option("--read-length", dest="read_length",
                      action="store_true", default=False,
                      help=("use read length in addition to position and UMI"
                            "to identify possible duplicates"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    # pysam opens the files by name, so the handles opened by U.Start are
    # closed once their names have been recorded
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "w"
    else:
        out_mode = "wb"

    if options.stats:
        if options.ignore_umi:
            raise ValueError("'--output-stats' and '--ignore-umi' options"
                             " cannot be used together")

    if options.further_stats:
        if not options.stats:
            raise ValueError("'--further-stats' options requires "
                             "'--output-stats' option")
        if options.method not in ["cluster", "adjacency"]:
            raise ValueError("'--further-stats' only enabled with 'cluster' "
                             "and 'adjacency' methods")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = TwoPassPairWriter(infile, outfile)

    nInput, nOutput = 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" %
                    (options.detection_method,
                     ",".join(
                         [x for x in bam_features if bam_features[x]])))

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        # source of randomly drawn UMIs for the "null" edit distances
        read_gn = random_read_generator(infile.filename, chrom=options.chrom)

    for bundle in get_bundles(infile,
                              ignore_umi=options.ignore_umi,
                              subset=float(options.subset),
                              quality_threshold=options.mapping_quality,
                              paired=options.paired,
                              chrom=options.chrom,
                              spliced=options.spliced,
                              soft_clip_threshold=options.soft,
                              per_contig=options.per_contig,
                              whole_contig=options.whole_contig,
                              read_length=options.read_length,
                              detection_method=options.detection_method):

        nInput += sum(bundle[umi]["count"] for umi in bundle)

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        if options.stats:
            # generate pre-dedup stats
            average_distance = get_average_umi_distance(bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = get_average_umi_distance(random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:
            # set up ClusterAndReducer functor with methods specific to
            # specified options.method
            processor = ClusterAndReducer(options.method)

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts, topologies, nodes = processor(
                bundle, options.threshold,
                options.stats, options.further_stats)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:
                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats; the UMI is taken as the last
                # "_"-separated field of the read name
                post_cluster_umis = [x.qname.split("_")[-1] for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = get_average_umi_distance(post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = get_average_umi_distance(random_umis)
                post_cluster_stats_null.append(average_distance_null)

                if options.further_stats:
                    for c_type, count in topologies.most_common():
                        topology_counts[c_type] += count
                    for c_type, count in nodes.most_common():
                        node_counts[c_type] += count

    outfile.close()

    if options.stats:

        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # generate histograms of counts per UMI at each position
        UMI_counts_df_pre = pd.DataFrame(
            stats_pre_df.pivot_table(columns=stats_pre_df["counts"],
                                     values="counts", aggfunc=len))
        UMI_counts_df_post = pd.DataFrame(
            stats_post_df.pivot_table(columns=stats_post_df["counts"],
                                      values="counts", aggfunc=len))

        UMI_counts_df_pre.columns = ["instances"]
        UMI_counts_df_post.columns = ["instances"]

        UMI_counts_df = pd.merge(UMI_counts_df_pre, UMI_counts_df_post,
                                 how='left', left_index=True,
                                 right_index=True, sort=True,
                                 suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        UMI_counts_df = UMI_counts_df.fillna(0).astype(int)

        UMI_counts_df.to_csv(options.stats + "_per_umi_per_position.tsv",
                             sep="\t")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df, agg_post_df, how='left',
                          left_index=True, right_index=True,
                          sort=True, suffixes=["_pre", "_post"])

        # TS - see comment above regarding missing values
        agg_df = agg_df.fillna(0).astype(int)
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(max(map(max, [pre_cluster_stats,
                                   post_cluster_stats,
                                   pre_cluster_stats_null,
                                   post_cluster_stats_null])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster,
                               minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        # TS - set lowest bin (-1) to "Single_UMI". The label is put into the
        # column before the frame is built: assigning it afterwards through
        # chained indexing (df[col][0] = ...) is not guaranteed to modify the
        # frame and is a no-op under pandas Copy-on-Write.
        edit_distance_labels = ["Single_UMI"] + list(cluster_bins)[1:]

        edit_distance_df = pd.DataFrame({
            "unique": tallyCounts(pre_cluster_binned, max_ed),
            "unique_null": tallyCounts(pre_cluster_null_binned, max_ed),
            options.method: tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method: tallyCounts(
                post_cluster_null_binned, max_ed),
            "edit_distance": edit_distance_labels})

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False, sep="\t")

        if options.further_stats:
            with U.openFile(options.stats + "_topologies.tsv", "w") as outf:
                outf.write(
                    "\n".join(["\t".join((x, str(y)))
                               for x, y in topology_counts.most_common()]) +
                    "\n")

            with U.openFile(options.stats + "_nodes.tsv", "w") as outf:
                outf.write(
                    "\n".join(["\t".join(map(str, (x, y)))
                               for x, y in node_counts.most_common()]) + "\n")

    # write footer and output benchmark information.
    U.info("Number of reads in: %i, Number of reads out: %i" %
           (nInput, nOutput))
    U.Stop()
def main(argv=None):
    """Entry point for the UMI grouping command.

    Parses command line options from ``sys.argv`` unless *argv* is given,
    reads bundles of reads from a SAM/BAM file, clusters the UMIs in each
    bundle with ``network.ReadClusterer`` (``deduplicate=False``) and then,
    depending on the options:

    * writes every read to an output SAM/BAM, tagged with the group id
      (``UG``) and the group's first UMI (``--umi-group-tag``), when
      ``--output-bam`` is set;
    * writes one tab-separated row per read to the ``--group-out`` file.

    Args:
        argv: optional list of command line arguments; defaults to
            ``sys.argv``.

    Raises:
        ValueError: if the input would come from standard in, or the UMI
            extraction method is not recognised.
        AssertionError: if an output file is given without ``--output-bam``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # NOTE(review): several help strings below are built from adjacent
    # literals with no separating space (e.g. "...before" "read is..."),
    # so the rendered help text runs the words together -- confirm and fix.
    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option(
        "-o", "--out-sam", dest="out_sam", action="store_true",
        help="Output alignments in sam format [default=%default]",
        default=False)
    parser.add_option("--umi-separator", dest="umi_sep",
                      type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag", dest="umi_tag",
                      type="string", help="tag containing umi",
                      default='RX')
    parser.add_option("--umi-group-tag", dest="umi_group_tag",
                      type="string",
                      help="tag for the outputted umi group",
                      default='BX')
    parser.add_option("--extract-umi-method", dest="get_umi_method",
                      type="choice", choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? [default=%default]")
    parser.add_option("--subset", dest="subset", type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]",
                      default=False)
    parser.add_option("--soft-clip-threshold", dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int", default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional", "unique",
                               "cluster"),
                      default="directional",
                      help="method to use for umi deduping [default=%default]")
    parser.add_option("--per-contig", dest="per_contig", action="store_true",
                      default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option(
        "--whole-contig", dest="whole_contig", action="store_true",
        default=False,
        help=
        "Read whole contig before outputting bundles: guarantees that no reads"
        "are missed, but increases memory usage")
    parser.add_option(
        "--read-length", dest="read_length", action="store_true",
        default=False,
        help=("use read length in addition to position and UMI"
              "to identify possible duplicates [default=%default]"))
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option(
        "--group-out", dest="tsv", type="string",
        help="Outfile name for file mapping read id to read group",
        default=None)
    parser.add_option(
        "--output-bam", dest="output_bam", action="store_true",
        default=False,
        help=("output a bam file with read groups tagged using the UG tag"
              "[default=%default]"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    # The input is re-opened by name with pysam below, so the handle set up
    # by U.Start is closed here and standard in is rejected.
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
        # NOTE(review): this validation is an assert, so it is skipped when
        # Python runs with -O -- consider raising ValueError instead.
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "w"
    else:
        out_mode = "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
        if options.paired:
            # wrap the writer; it is later called with the tag values
            outfile = umi_methods.TwoPassPairWriter(infile, outfile, tags=True)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        # header row; columns match the per-read rows written in the loop
        mapping_outfile.write(
            "read_id\tcontig\tposition\tumi\tumi_count\tfinal_umi\tfinal_umi_count\tunique_id\n"
        )

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    # unique_id is the running group identifier, incremented once per group
    nInput, nOutput, unique_id = 0, 0, 0

    read_events = collections.Counter()

    for bundle, read_events in umi_methods.get_bundles(
            infile,
            read_events,
            ignore_umi=False,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            chrom=options.chrom,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            whole_contig=options.whole_contig,
            read_length=options.read_length,
            umi_getter=umi_getter,
            all_reads=True):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        # set up ReadCluster functor with methods specific to
        # specified options.method
        processor = network.ReadClusterer(options.method)

        bundle, groups, counts = processor(bundle=bundle,
                                           threshold=options.threshold,
                                           stats=True,
                                           deduplicate=False)

        for umi_group in groups:
            # the first UMI of each group is reported as the group's UMI
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        if options.paired:
                            # if paired, we need to supply the tags to
                            # add to the paired read
                            outfile.write(read, unique_id, top_umi)
                        else:
                            # Add the 'UG' tag to the read
                            read.tags += [('UG', unique_id)]
                            read.tags += [(options.umi_group_tag, top_umi)]
                            outfile.write(read)

                    if options.tsv:
                        mapping_outfile.write("%s\n" % "\t".join(
                            map(str, (read.query_name, read.reference_name,
                                      umi_methods.get_read_position(
                                          read, options.soft)[1],
                                      umi.decode(), counts[umi],
                                      top_umi.decode(), group_count,
                                      unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info(
        "Reads: %s" % ", ".join(["%s: %s" % (x[0], x[1])
                                 for x in read_events.most_common()]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()
def main(argv=None):
    """Entry point for the barcode/UMI extraction command.

    Parses command line options from ``sys.argv`` unless *argv* is given,
    validates them, then streams reads from standard in (and optionally a
    second fastq given by ``read2_in``) through an
    ``extract_methods.ExtractFilterAndUpdate`` instance. Reads that the
    extractor returns are written to standard out / ``--read2-out``; reads
    it rejects are written to the ``filtered_out`` files when those options
    are set. Read counts are logged at the end.

    Args:
        argv: optional list of command line arguments; defaults to
            ``sys.argv``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])

    group = U.OptionGroup(parser, "extract-specific options")

    # (Experimental option) Retain the UMI in the sequence read
    group.add_option("--retain-umi", dest="retain_umi", action="store_true",
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--read2-out", dest="read2_out", type="string",
                     help="file to output processed paired read to")
    group.add_option("--read2-stdout", dest="read2_stdout",
                     action="store_true",
                     help="Paired reads, send read2 to stdout, discarding read1")
    group.add_option("--quality-filter-threshold",
                     dest="quality_filter_threshold", type="int",
                     help=("Remove reads where any UMI base quality score "
                           "falls below this threshold"))
    group.add_option("--quality-filter-mask",
                     dest="quality_filter_mask", type="int",
                     help=("If a UMI base has a quality below this threshold, "
                           "replace the base with 'N'"))
    group.add_option("--quality-encoding",
                     dest="quality_encoding", type="choice",
                     choices=["phred33", "phred64", "solexa"],
                     help=("Quality score encoding. Choose from 'phred33'"
                           "[33-77] 'phred64' [64-106] or 'solexa' [59-106]"))
    group.add_option("--filter-cell-barcode",
                     dest="filter_cell_barcode",
                     action="store_true",
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--error-correct-cell",
                     dest="error_correct_cell",
                     action="store_true",
                     help=("Correct errors in the cell barcode"))
    group.add_option("--whitelist",
                     dest="whitelist", type="string",
                     help=("A whitelist of accepted cell barcodes"))
    group.add_option("--blacklist",
                     dest="blacklist", type="string",
                     help=("A blacklist of rejected cell barcodes"))

    # The UMI filtering / correction options below are hidden from --help.
    group.add_option("--filter-umi",
                     dest="filter_umi",
                     action="store_true",
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--umi-whitelist", dest="umi_whitelist",
                     type="string", default=None,
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--umi-whitelist-paired", dest="umi_whitelist_paired",
                     type="string", default=None,
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--correct-umi-threshold", dest="correct_umi_threshold",
                     type="int", default=0,
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--umi-correct-log", dest="umi_correct_log",
                     type="string", default=None,
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--subset-reads", "--reads-subset",
                     dest="reads_subset", type="int",
                     help=("Only extract from the first N reads. If N is "
                           "greater than the number of reads, all reads will "
                           "be used"))
    group.add_option("--reconcile-pairs",
                     dest="reconcile", action="store_true",
                     help=("Allow the presences of reads in read2 input that "
                           "are not present in read1 input. This allows cell "
                           "barcode filtering of read1s without "
                           "considering read2s"))
    parser.add_option_group(group)

    group = U.OptionGroup(parser, "[EXPERIMENTAl] barcode extraction options")

    group.add_option("--either-read", dest="either_read", action="store_true",
                     help="UMI may be on either read (see "
                     "--either-read-resolve) for options to resolve cases where"
                     "UMI is on both reads")
    group.add_option("--either-read-resolve",
                     dest="either_read_resolve", type="choice",
                     choices=["discard", "quality"],
                     help=("How to resolve instances where both reads "
                           "contain a UMI but using --either-read."
                           "Choose from 'discard' or 'quality'"
                           "(use highest quality). default=dicard"))
    parser.add_option_group(group)

    # NOTE(review): "filter_cell_barcodes" (plural) does not match the
    # "filter_cell_barcode" dest used above, so that option's default stays
    # None rather than False -- left untouched here; confirm intent.
    parser.set_defaults(extract_method="string",
                        filter_cell_barcodes=False,
                        whitelist=None,
                        blacklist=None,
                        error_correct_cell=False,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=False,
                        read2_stdout=False,
                        quality_filter_threshold=None,
                        quality_encoding=None,
                        reconcile=False,
                        either_read=False,
                        either_read_resolve="discard",
                        ignore_suffix=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_extract_options=True,
                              add_group_dedup_options=False,
                              add_umi_grouping_options=False,
                              add_sam_options=False)

    if options.filter_cell_barcode:
        U.info('Use of --whitelist ensures cell barcodes are filtered. '
               '--filter-cell-barcode is no longer required and may be '
               'removed in future versions.')

    if options.whitelist is not None:
        options.filter_cell_barcode = True

    if options.retain_umi and not options.extract_method == "regex":
        U.error("option --retain-umi only works with --extract-method=regex")

    # Fixed: this test used the bare, undefined name ``whitelist`` and so
    # raised NameError whenever it was reached; the option value is meant.
    if (options.filtered_out and
            not options.extract_method == "regex" and
            options.whitelist is None):
        U.error("Reads will not be filtered unless extract method is"
                "set to regex (--extract-method=regex) or cell"
                "barcodes are filtered (--whitelist)")

    if options.quality_filter_threshold or options.quality_filter_mask:
        if not options.quality_encoding:
            U.error("must provide a quality encoding (--quality-"
                    "encoding) to filter UMIs by quality (--quality"
                    "-filter-threshold) or mask low quality bases "
                    "with (--quality-filter-mask)")

    extract_cell, extract_umi = U.validateExtractOptions(options)

    if options.either_read:
        if extract_cell:
            U.error("Option to extract from either read (--either-read) "
                    "is not currently compatible with cell barcode extraction")
        if not options.extract_method == "regex":
            U.error("Option to extract from either read (--either-read)"
                    "requires --extract-method=regex")
        if not options.pattern or not options.pattern2:
            U.error("Option to extract from either read (--either-read)"
                    "requires --bc-pattern=[PATTERN1] and"
                    "--bc-pattern2=[PATTERN2]")

    if options.filter_umi:
        if not options.umi_whitelist:
            U.error("must provide a UMI whitelist (--umi-whitelist) if using "
                    "--filter-umi option")
        if options.pattern2 and not options.umi_whitelist_paired:
            U.error("must provide a UMI whitelist for paired end "
                    "(--umi-whitelist-paired) if using --filter-umi option"
                    "with paired end data")

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            # Fixed: the ``%`` operator was missing, so the message string
            # was being *called* and raised TypeError instead of reporting.
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" % (
                        options.pattern, options.pattern2))

    if options.whitelist:
        if not extract_cell:
            if options.extract_method == "string":
                U.error("barcode pattern(s) do not include any cell bases "
                        "(marked with 'Cs') %s, %s" % (
                            options.pattern, options.pattern2))
            elif options.extract_method == "regex":
                # Fixed: missing ``%`` operator (see above).
                U.error("barcode regex(es) do not include any cell groups "
                        "(starting with 'cell_') %s, %s" % (
                            options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = extract_methods.ExtractFilterAndUpdate(
        options.extract_method,
        options.pattern,
        options.pattern2,
        options.prime3,
        extract_cell,
        options.quality_encoding,
        options.quality_filter_threshold,
        options.quality_filter_mask,
        options.filter_umi,
        options.filter_cell_barcode,
        options.retain_umi,
        options.either_read,
        options.either_read_resolve)

    if options.filter_umi:
        umi_whitelist, false_to_true_map = whitelist_methods.getUserDefinedBarcodes(
            options.umi_whitelist,
            options.umi_whitelist_paired,
            deriveErrorCorrection=True,
            threshold=options.correct_umi_threshold)

        U.info("Length of whitelist: %i" % len(umi_whitelist))
        U.info("Length of 'correctable' whitelist: %i" % len(false_to_true_map))

        ReadExtractor.umi_whitelist = umi_whitelist
        ReadExtractor.umi_false_to_true_map = false_to_true_map
        ReadExtractor.umi_whitelist_counts = collections.defaultdict(
            lambda: collections.Counter())

    if options.whitelist:
        cell_whitelist, false_to_true_map = whitelist_methods.getUserDefinedBarcodes(
            options.whitelist,
            getErrorCorrection=options.error_correct_cell)

        ReadExtractor.cell_whitelist = cell_whitelist
        ReadExtractor.false_to_true_map = false_to_true_map

    if options.blacklist:
        blacklist = set()
        with U.openFile(options.blacklist, "r") as inf:
            for line in inf:
                # only the first tab-separated column holds the barcode
                blacklist.add(line.strip().split("\t")[0])
        ReadExtractor.cell_blacklist = blacklist

    # variables for progress monitor
    progCount = 0
    displayMax = 100000

    U.info("Starting barcode extraction")

    # Optional output handles start as None so the clean-up below only
    # closes what was actually opened (previously ``filtered_out2`` could be
    # closed on the single-end path without ever having been assigned).
    filtered_out = None
    filtered_out2 = None
    read2_out = None

    if options.filtered_out:
        filtered_out = U.openFile(options.filtered_out, "w")

    if options.read2_in is None:
        for read in read1s:

            # incrementing count for monitoring progress
            progCount += 1

            # Update display in every 100kth iteration
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))

            new_read = ReadExtractor(read)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not new_read:
                if filtered_out is not None:
                    filtered_out.write(str(read) + "\n")
                continue

            options.stdout.write(str(new_read) + "\n")

    else:

        if options.filtered_out2:
            filtered_out2 = U.openFile(options.filtered_out2, "w")

        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))

        if options.read2_out:
            read2_out = U.openFile(options.read2_out, "w")

        # --reconcile-pairs relaxes the pairing check between the two inputs
        strict = not options.reconcile

        for read1, read2 in umi_methods.joinedFastqIterate(
                read1s, read2s, strict, options.ignore_suffix):

            # incrementing count for monitoring progress
            progCount += 1

            # Update display in every 100kth iteration
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))
                sys.stdout.flush()

            reads = ReadExtractor(read1, read2)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not reads:
                if filtered_out is not None:
                    filtered_out.write(str(read1) + "\n")
                if filtered_out2 is not None:
                    filtered_out2.write(str(read2) + "\n")
                continue

            new_read1, new_read2 = reads

            if options.read2_stdout:
                options.stdout.write(str(new_read2) + "\n")
            else:
                options.stdout.write(str(new_read1) + "\n")

                if read2_out is not None:
                    read2_out.write(str(new_read2) + "\n")

    for handle in (read2_out, filtered_out, filtered_out2):
        if handle is not None:
            handle.close()

    for k, v in ReadExtractor.getReadCounts().most_common():
        U.info("%s: %s" % (k, v))

    if options.umi_correct_log:
        # the ``with`` block closes the log; no explicit close() is needed
        with U.openFile(options.umi_correct_log, "w") as outf:
            outf.write("umi\tcount_no_errors\tcount_errors\n")
            for umi, counts in ReadExtractor.umi_whitelist_counts.items():
                outf.write("%s\t%i\t%i\n" % (
                    umi, counts["no_error"], counts["error"]))

    U.Stop()
def main(argv=None):
    """Entry point for the (legacy) barcode extraction command.

    Parses command line options from ``sys.argv`` unless *argv* is given,
    then passes each read from standard in (optionally paired with reads
    from ``--read2-in``) through an ``Extractor`` and writes the processed
    reads to standard out / ``--read2-out``. Unless ``--supress-stats`` is
    given, the extractor's ``bc_count`` table is written to the log.

    Args:
        argv: optional list of command line arguments; defaults to
            ``sys.argv``.

    Raises:
        ValueError: if a required option combination is missing (no
            pattern, split barcode without ``--read2-in``, ``--read2-in``
            without ``--read2-out``, or a quality threshold without an
            encoding).
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--split-barcode", dest="split", action="store_true",
                      help="barcode is split across read pair")
    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern. Ns are random bases X's fixed")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern. Ns are random bases X's fixed")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read")
    parser.add_option("--read2-out", dest="read2_out", type="string",
                      help="file to output processed paired read to")
    parser.add_option("--quality-filter-threshold",
                      dest="quality_filter_threshold", type="int",
                      help=("Remove reads where any UMI base quality score "
                            "falls below this threshold"))
    parser.add_option("--quality-encoding",
                      dest="quality_encoding", type="choice",
                      choices=["phred33", "phred64", "solexa"],
                      help=("Quality score encoding. Choose from phred33"
                            "[33-77] phred64 [64-106] or solexa [59-106]"))
    parser.add_option("--supress-stats", dest="stats", action="store_false",
                      help="Suppress the writing of stats to the log")

    parser.set_defaults(split=False,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=None,
                        prime3=False,
                        stats=True,
                        quality_filter_threshold=None,
                        quality_encoding=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    # check options
    if not options.pattern:
        raise ValueError("must specify a pattern using ``--bc-pattern``")

    if options.split:
        if not options.read2_in:
            raise ValueError("must specify a paired fastq ``--read2-in``")

        # a split barcode defaults to the same pattern on both reads
        if not options.pattern2:
            options.pattern2 = options.pattern

    if options.read2_in:
        if not options.read2_out:
            raise ValueError("must specify an output for the paired end "
                             "``--read2-out``")

    if options.quality_filter_threshold:
        if not options.quality_encoding:
            raise ValueError("must provide a quality encoding to filter UMIs "
                             "by quality ``--quality-encoding``")

    # Initialise the processor
    processor = Extractor(options.pattern,
                          options.pattern2,
                          options.quality_filter_threshold,
                          options.quality_encoding,
                          options.prime3)

    read1s = fastqIterate(options.stdin)

    if options.read2_in is None:
        for read in read1s:
            new_1 = processor(read)
            if new_1:
                options.stdout.write(str(new_1) + "\n")
    else:
        read2_in = U.openFile(options.read2_in)
        read2_out = U.openFile(options.read2_out, "w")

        # Both paired-end handles were previously left open; close them
        # even if processing a pair raises.
        try:
            read2s = fastqIterate(read2_in)

            for read1, read2 in izip(read1s, read2s):
                # NOTE(review): this logs every read pair at info level,
                # which looks like leftover debugging -- confirm before
                # removing or demoting.
                U.info("read1: %s, read2: %s" % (read1, read2))
                new_1, new_2 = processor(read1, read2)
                if new_1:
                    options.stdout.write(str(new_1) + "\n")
                    read2_out.write(str(new_2) + "\n")
        finally:
            read2_out.close()
            read2_in.close()

    # write footer and output benchmark information.
    if options.stats:
        options.stdlog.write("\t".join(["Barcode", "UMI", "Sample", "Count"]) + "\n")
        # keys are concatenated with a 1-tuple below, so they must be tuples
        for bc_key in processor.bc_count:
            options.stdlog.write(
                "\t".join(bc_key + (str(processor.bc_count[bc_key]),)) + "\n")

    U.Stop()
def main(argv=None):
    """Entry point for the per-gene UMI counting command.

    Parses command line options from ``sys.argv`` unless *argv* is given,
    iterates over the per-gene bundles yielded by
    ``umi_methods.get_gene_count``, groups each gene's UMIs with
    ``network.UMIClusterer`` and writes a ``gene<TAB>count`` table to
    standard out, where the count is the number of UMI groups.

    Args:
        argv: optional list of command line arguments; defaults to
            ``sys.argv``.

    Raises:
        ValueError: if the input would come from standard in, ``--per-gene``
            is given without a gene source, ``--skip-tags-regex`` is not a
            valid regex, or the UMI extraction method is not recognised.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option("--umi-separator", dest="umi_sep",
                      type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag", dest="umi_tag",
                      type="string", help="tag containing umi",
                      default='RX')
    parser.add_option("--extract-umi-method", dest="get_umi_method",
                      type="choice", choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? [default=%default]")
    parser.add_option("--subset", dest="subset", type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int", default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional",
                               "percentile", "unique", "cluster"),
                      default="directional",
                      help="method to use for umi deduping [default=%default]")
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option("--per-contig", dest="per_contig", action="store_true",
                      default=False,
                      help=("dedup per contig (field 3 in BAM; RNAME),"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--per-gene", dest="per_gene", action="store_true",
                      default=False,
                      help=("Deduplicate per gene,"
                            "e.g for transcriptome where contig = transcript"
                            "must also provide a transript to gene map with"
                            "--gene-transcript-map [default=%default]"))
    parser.add_option("--gene-transcript-map", dest="gene_transcript_map",
                      type="string",
                      help="file mapping transcripts to genes (tab separated)",
                      default=None)
    parser.add_option("--gene-tag", dest="gene_tag",
                      type="string",
                      help=("Deduplicate per gene where gene is"
                            "defined by this bam tag [default=%default]"),
                      default=None)
    parser.add_option(
        "--skip-tags-regex", dest="skip_regex",
        type="string",
        help=("Used with --gene-tag. "
              "Ignore reads where the gene-tag matches this regex"),
        default="^[__|Unassigned]")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.random_seed:
        np.random.seed(options.random_seed)

    # The input is re-opened by name with pysam, so standard in is rejected.
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.per_gene:
        if not options.gene_transcript_map and not options.gene_tag:
            raise ValueError(
                "--per-gene option requires --gene-transcript-map "
                "or --gene-tag")

    # fail early with a readable message if the skip regex does not compile
    try:
        re.compile(options.skip_regex)
    except re.error:
        raise ValueError("skip-regex '%s' is not a "
                         "valid regex" % options.skip_regex)

    infile = pysam.Samfile(in_name, in_mode)

    nInput, nOutput = 0, 0

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(
            umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(
            umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            # NOTE(review): ``gene_tag`` is assigned here and in the branch
            # below but never read; get_gene_count is passed
            # ``options.gene_tag`` instead. Presumably the local value was
            # meant to be forwarded -- confirm against get_gene_count.
            gene_tag = metatag
        else:
            inreads = infile.fetch()
            gene_tag = options.gene_tag

    options.stdout.write("%s\t%s\n" % ("gene", "count"))

    # ``read_events`` is rebound on every iteration; start it at None so
    # the footer does not raise NameError when no gene is yielded.
    read_events = None

    for gene, bundle, read_events in umi_methods.get_gene_count(
            inreads,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            per_contig=options.per_contig,
            gene_tag=options.gene_tag,
            skip_regex=options.skip_regex,
            umi_getter=umi_getter):

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        # set up UMIClusterer functor with methods specific to
        # specified options.method
        processor = network.UMIClusterer(options.method)

        # group the umis
        groups = processor(
            umis,
            counts,
            threshold=options.threshold)

        # one count per UMI group
        gene_count = len(groups)
        options.stdout.write("%s\t%i\n" % (gene, gene_count))
        nOutput += gene_count

    # output reads events and benchmark information.
    if read_events is not None:
        for event in read_events.most_common():
            U.info("%s: %s" % (event[0], event[1]))

    U.info("Number of reads counted: %i" % nOutput)

    U.Stop()
def main(argv=None):
    """Entry point for the UMI deduplication command.

    Parses command line options from ``sys.argv`` unless *argv* is given,
    reads bundles of reads from a SAM/BAM file via ``get_bundles``, reduces
    each bundle with ``ClusterAndReducer`` (or writes every read in the
    bundle when ``--ignore-umi`` is set) and writes the retained reads to
    the output SAM/BAM. With ``--output-stats`` it also writes
    ``<prefix>_per_umi_per_position.tsv``, ``<prefix>_per_umi.tsv`` and
    ``<prefix>_edit_distance.tsv``; with ``--further-stats`` additionally
    ``<prefix>_topologies.tsv`` and ``<prefix>_nodes.tsv``.

    Args:
        argv: optional list of command line arguments; defaults to
            ``sys.argv``.

    Raises:
        ValueError: if the input would come from standard in, incompatible
            stats options are combined, or the requested multimapping
            detection tag is not available for the input file.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format", default=False)
    parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true",
                      help="Output alignments in sam format", default=False)
    parser.add_option("--ignore-umi", dest="ignore_umi", action="store_true",
                      help="Ignore UMI and dedup only on position",
                      default=False)
    # the subset value is parsed as a string and converted with float() below
    parser.add_option("--subset", dest="subset", type="string",
                      help="Use only a fraction of reads, specified by subset",
                      default=1.1)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                           " one",
                      default=False)
    parser.add_option("--soft-clip-threshold", dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                           "read os counted as spliced",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int",
                      help="Edit distance theshold at which to join two UMIs"
                           "when clustering", default=1)
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="Use second-in-pair position when deduping")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional-adjacency",
                               "percentile", "unique", "cluster"),
                      default="directional-adjacency",
                      help="method to use for umi deduping")
    parser.add_option("--output-stats", dest="stats", type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option("--further-stats", dest="further_stats",
                      action="store_true", default=False,
                      help="Output further stats")
    parser.add_option("--per-contig", dest="per_contig", action="store_true",
                      default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--whole-contig", dest="whole_contig",
                      action="store_true", default=False,
                      help="Read whole contig before outputting bundles: guarantees that no reads"
                           "are missed, but increases memory usage")
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method", type="choice",
                      choices=("NH", "X0", "XT"),
                      default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi"))
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained",
                      default=0)
    parser.add_option("--read-length", dest="read_length", action="store_true",
                      default=False,
                      help=("use read length in addition to position and UMI"
                            "to identify possible duplicates"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    # Input and output are re-opened by name with pysam, so the handles
    # set up by U.Start are closed here.
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "w"
    else:
        out_mode = "wb"

    if options.stats:
        if options.ignore_umi:
            raise ValueError("'--output-stats' and '--ignore-umi' options"
                             " cannot be used together")

    if options.further_stats:
        if not options.stats:
            raise ValueError("'--further-stats' options requires "
                             "'--output-stats' option")
        if options.method not in ["cluster", "adjacency"]:
            raise ValueError("'--further-stats' only enabled with 'cluster' "
                             "and 'adjacency' methods")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode,
                            template=infile)

    if options.paired:
        outfile = TwoPassPairWriter(infile, outfile)

    nInput, nOutput = 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" % (
                        options.detection_method,
                        ",".join([x for x in bam_features
                                  if bam_features[x]])))

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        # source of random UMIs for the "null" distance distributions
        read_gn = random_read_generator(infile.filename, chrom=options.chrom)

    for bundle in get_bundles(infile,
                              ignore_umi=options.ignore_umi,
                              subset=float(options.subset),
                              quality_threshold=options.mapping_quality,
                              paired=options.paired,
                              chrom=options.chrom,
                              spliced=options.spliced,
                              soft_clip_threshold=options.soft,
                              per_contig=options.per_contig,
                              whole_contig=options.whole_contig,
                              read_length=options.read_length,
                              detection_method=options.detection_method):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        if options.stats:
            # generate pre-dedup stats
            average_distance = get_average_umi_distance(bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = get_average_umi_distance(random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:
            # set up ClusterAndReducer functor with methods specific to
            # specified options.method
            processor = ClusterAndReducer(options.method)

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts, topologies, nodes = processor(
                bundle, options.threshold,
                options.stats, options.further_stats)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:

                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats; the UMI is taken as the text
                # after the last "_" in the read name
                post_cluster_umis = [x.qname.split("_")[-1] for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = get_average_umi_distance(post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = get_average_umi_distance(random_umis)
                post_cluster_stats_null.append(average_distance_null)

                if options.further_stats:
                    for c_type, count in topologies.most_common():
                        topology_counts[c_type] += count
                    for c_type, count in nodes.most_common():
                        node_counts[c_type] += count

    outfile.close()

    if options.stats:

        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # generate histograms of counts per UMI at each position
        UMI_counts_df_pre = pd.DataFrame(stats_pre_df.pivot_table(
            columns=stats_pre_df["counts"], values="counts", aggfunc=len))
        UMI_counts_df_post = pd.DataFrame(stats_post_df.pivot_table(
            columns=stats_post_df["counts"], values="counts", aggfunc=len))

        UMI_counts_df_pre.columns = ["instances"]
        UMI_counts_df_post.columns = ["instances"]

        UMI_counts_df = pd.merge(UMI_counts_df_pre, UMI_counts_df_post,
                                 how='left', left_index=True, right_index=True,
                                 sort=True, suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        UMI_counts_df = UMI_counts_df.fillna(0).astype(int)

        UMI_counts_df.to_csv(
            options.stats + "_per_umi_per_position.tsv", sep="\t")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df, agg_post_df, how='left',
                          left_index=True, right_index=True,
                          sort=True, suffixes=["_pre", "_post"])

        # TS - see comment above regarding missing values
        agg_df = agg_df.fillna(0).astype(int)
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(max(map(max, [pre_cluster_stats,
                                   post_cluster_stats,
                                   pre_cluster_stats_null,
                                   post_cluster_stats_null])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster,
                               minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        # TS - set lowest bin (-1) to "Single_UMI".
        # The label column is built up front rather than patched afterwards
        # with ``df['edit_distance'][0] = ...``: that chained assignment
        # writes through a temporary Series and is not guaranteed to reach
        # the DataFrame (it is a no-op under pandas Copy-on-Write).
        edit_distance_labels = ["Single_UMI"] + list(cluster_bins)[1:]

        edit_distance_df = pd.DataFrame({
            "unique": tallyCounts(pre_cluster_binned, max_ed),
            "unique_null": tallyCounts(pre_cluster_null_binned, max_ed),
            options.method: tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method: tallyCounts(
                post_cluster_null_binned, max_ed),
            "edit_distance": edit_distance_labels})

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False, sep="\t")

        if options.further_stats:
            with U.openFile(options.stats + "_topologies.tsv", "w") as outf:
                outf.write(
                    "\n".join(["\t".join((x, str(y)))
                               for x, y in topology_counts.most_common()]) + "\n")

            with U.openFile(options.stats + "_nodes.tsv", "w") as outf:
                outf.write(
                    "\n".join(["\t".join(map(str, (x, y))) for
                               x, y in node_counts.most_common()]) + "\n")

    # write footer and output benchmark information.
    U.info("Number of reads in: %i, Number of reads out: %i" %
           (nInput, nOutput))
    U.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from stdin (and optionally ``--read2-in``), passes
    them through ``umi_methods.ExtractFilterAndUpdate`` and writes the
    processed reads to stdout (and optionally ``--read2-out``).  When
    ``--filter-cell-barcode`` is set, a cell barcode whitelist is either
    loaded from ``--whitelist-tsv`` or generated from the observed cell
    barcode counts, and attached to the extractor before the reads are
    processed.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--read2-out", dest="read2_out", type="string",
                      help="file to output processed paired read to")
    parser.add_option(
        "--read2-out-only", dest="read2_out_only", action="store_true",
        help="Paired reads, only output the second read in the pair")
    parser.add_option("--quality-filter-threshold",
                      dest="quality_filter_threshold", type="int",
                      help=("Remove reads where any UMI base quality score "
                            "falls below this threshold"))
    parser.add_option(
        "--quality-filter-mask", dest="quality_filter_mask", type="int",
        help=("If a UMI base has a quality below this threshold, "
              "replace the base with 'N'"))
    parser.add_option("--quality-encoding", dest="quality_encoding",
                      type="choice",
                      choices=["phred33", "phred64", "solexa"],
                      help=("Quality score encoding. Choose from 'phred33'"
                            "[33-77] 'phred64' [64-106] or 'solexa' [59-106]"))
    parser.add_option("--extract-method", dest="extract_method",
                      type="choice", choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, Choose "
                            "from 'string' or 'regex'"))
    parser.add_option("--filter-cell-barcode", dest="filter_cell_barcode",
                      action="store_true",
                      help="Filter the cell barcodes")
    parser.add_option("--error-correct-cell", dest="error_correct_cell",
                      action="store_true",
                      help=("Correct errors in the cell barcode"))
    parser.add_option("--error-correct-threshold",
                      dest="error_correct_threshold", type="int",
                      help=("Hamming distance allowed for correction"))
    parser.add_option("--plot-prefix", dest="plot_prefix", type="string",
                      help=("Prefix for plots to visualise the automated "
                            "detection of the number of 'true' cell barcodes"))
    parser.add_option("--output-whitelist", dest="output_whitelist",
                      type="string",
                      help=("Write out the automatically generated whitelist"))
    parser.add_option("--whitelist-tsv", dest="whitelist_tsv", type="string",
                      help=("A whitelist of accepted cell barcodes"))
    parser.add_option("--blacklist-tsv", dest="blacklist_tsv", type="string",
                      help=("A blacklist of accepted cell barcodes"))
    parser.add_option(
        "--cell-barcode-subset", dest="cell_barcode_subset", type="int",
        help=("Use only the first N reads to automatically "
              "identify the true cell barcodes. If N is greater "
              "than the number of reads, all reads will be used"))
    parser.add_option("--reads-subset", dest="reads_subset", type="int",
                      help=("Only extract from the first N reads. If N is "
                            "greater than the number of reads, all reads will "
                            "be used"))
    parser.add_option(
        "--reconcile-pairs", dest="reconcile", action="store_true",
        help=("Allow the presences of reads in read2 input that are"
              "not present in read1 input. This allows cell barcode"
              "filtering of read1s without considering read2s"))

    # ``filter_cell_barcode`` is the dest actually used by the option above;
    # the historical (misspelt) ``filter_cell_barcodes`` default is kept so
    # that the attribute continues to exist on ``options``.
    parser.set_defaults(extract_method="string",
                        filter_cell_barcode=False,
                        filter_cell_barcodes=False,
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_cell=False,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=False,
                        read2_out_only=False,
                        quality_filter_threshold=None,
                        quality_encoding=None,
                        plot_prefix=None,
                        output_whitelist=None,
                        cell_barcode_subset=50000000,
                        reconcile=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.quality_filter_threshold or options.quality_filter_mask:
        if not options.quality_encoding:
            U.error("must provide a quality encoding (--quality-"
                    "encoding) to filter UMIs by quality (--quality"
                    "-filter-threshold) or mask low quality bases "
                    "with (--quality-filter-mask)")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern2 "
                    "if paired-end ")

    if options.pattern2:
        if not options.read2_in:
            U.error("must specify a paired fastq ``--read2-in``")

        # NOTE(review): nested under ``if options.pattern2`` this fallback can
        # never trigger; the original indentation could not be recovered, so
        # it is kept as a no-op rather than silently changing which pattern
        # is applied to single-end input.
        if not options.pattern2:
            options.pattern2 = options.pattern

    extract_cell = False
    extract_umi = False

    # If the pattern is a regex we can compile the regex(es) prior to
    # ExtractFilterAndUpdate instantiation
    if options.extract_method == "regex":
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)

        if options.pattern2:
            try:
                options.pattern2 = regex.compile(options.pattern2)
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

    # check whether the regex contains a umi group(s) and cell groups(s)
    if options.extract_method == "regex":
        for compiled in (options.pattern, options.pattern2):
            if not compiled:
                continue
            for group in compiled.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

    # check whether the pattern string contains umi/cell bases
    elif options.extract_method == "string":
        for pattern in (options.pattern, options.pattern2):
            if not pattern:
                continue
            if "C" in pattern:
                extract_cell = True
            if "N" in pattern:
                extract_umi = True

    if options.whitelist_tsv:
        if options.blacklist_tsv:
            U.error("Do not supply a blacklist and a whitelist. Just "
                    "remove the blacklist barcodes from the whitelist!")

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" % (
                        options.pattern, options.pattern2))

    if options.stdin == sys.stdin:
        # a generated whitelist needs a second pass over read1, which a
        # stream on stdin cannot provide
        if not options.whitelist_tsv and options.filter_cell_barcode:
            U.error(
                "cannot support reading from stdin if correcting cell barcode")
        read1s = umi_methods.fastqIterate(U.openFile(options.stdin))
    else:
        read1s = umi_methods.fastqIterate(U.openFile(options.stdin.name))

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        options.extract_method,
        options.pattern,
        options.pattern2,
        options.prime3,
        extract_cell,
        options.quality_encoding,
        options.quality_filter_threshold,
        options.quality_filter_mask,
        options.filter_cell_barcode)

    if options.filter_cell_barcode:
        # always defined so that --output-whitelist can report counts (as 0)
        # even when the whitelist is user supplied and no counting pass runs
        cell_barcode_counts = collections.Counter()

        if (not options.whitelist_tsv) or options.error_correct_cell:
            n_reads = 0

            if not options.read2_in:
                for read1 in read1s:
                    n_reads += 1
                    cell_barcode = ReadExtractor.getCellBarcode(read1)
                    if cell_barcode:
                        cell_barcode_counts[cell_barcode] += 1
                    if (options.cell_barcode_subset and
                            n_reads > options.cell_barcode_subset):
                        break
            else:
                read2s = umi_methods.fastqIterate(
                    U.openFile(options.read2_in))
                for read1, read2 in izip(read1s, read2s):
                    n_reads += 1
                    cell_barcode = ReadExtractor.getCellBarcode(read1, read2)
                    if cell_barcode:
                        cell_barcode_counts[cell_barcode] += 1
                    if (options.cell_barcode_subset and
                            n_reads > options.cell_barcode_subset):
                        break

            if options.blacklist_tsv:
                cell_blacklist = umi_methods.getUserDefinedBarcodes(
                    options.blacklist_tsv)
                for cell in cell_blacklist:
                    del cell_barcode_counts[cell]

            if options.whitelist_tsv:
                cell_whitelist = umi_methods.getUserDefinedBarcodes(
                    options.whitelist_tsv)
                error_correct_mappings = umi_methods.getErrorCorrectMappings(
                    cell_barcode_counts.keys(), cell_whitelist,
                    options.error_correct_threshold)
            else:
                # getCellWhitelist has not been properly defined yet!
                cell_whitelist, error_correct_mappings = (
                    umi_methods.getCellWhitelist(
                        cell_barcode_counts,
                        options.error_correct_threshold,
                        options.plot_prefix))

            # re-make the reads1s iterator: the counting pass above has
            # consumed (part of) it
            read1s = umi_methods.fastqIterate(U.openFile(options.stdin.name))

        else:
            cell_whitelist = umi_methods.getUserDefinedBarcodes(
                options.whitelist_tsv)
            error_correct_mappings = None, None

        false_to_true_map, true_to_false_map = error_correct_mappings

        if options.output_whitelist:
            with U.openFile(options.output_whitelist, "w") as outf:
                columns = ["barcode", "count", "corrected_barcodes",
                           "corrected_barcode_counts"]
                outf.write("\t".join(columns) + "\n")

                for barcode in sorted(cell_whitelist):
                    if true_to_false_map:
                        corrected = sorted(true_to_false_map[barcode])
                        corrected_barcodes = ",".join(corrected)
                        corrected_barcode_counts = ",".join(
                            str(cell_barcode_counts[x]) for x in corrected)
                    else:
                        corrected_barcodes, corrected_barcode_counts = "", ""

                    outf.write("%s\t%s\t%s\t%s\n" % (
                        barcode, cell_barcode_counts[barcode],
                        corrected_barcodes, corrected_barcode_counts))

        ReadExtractor.cell_whitelist = cell_whitelist
        ReadExtractor.false_to_true_map = false_to_true_map

    if options.read2_in is None:
        for read in read1s:
            new_read = ReadExtractor(read)

            if (options.reads_subset and
                    ReadExtractor.read_counts['Input Reads'] >
                    options.reads_subset):
                break

            if not new_read:
                continue

            options.stdout.write(str(new_read) + "\n")

    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))

        read2_out = None
        if options.read2_out:
            read2_out = U.openFile(options.read2_out, "w")

        strict = not options.reconcile

        for read1, read2 in umi_methods.joinedFastqIterate(
                read1s, read2s, strict):
            reads = ReadExtractor(read1, read2)

            if (options.reads_subset and
                    ReadExtractor.read_counts['Input Reads'] >
                    options.reads_subset):
                break

            if not reads:
                continue

            new_read1, new_read2 = reads

            if not options.read2_out_only:
                options.stdout.write(str(new_read1) + "\n")

            if read2_out is not None:
                read2_out.write(str(new_read2) + "\n")

        # only close the handle if it was actually opened above
        if read2_out is not None:
            read2_out.close()

    for k, v in ReadExtractor.getReadCounts().most_common():
        U.info("%s: %s" % (k, v))

    U.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Groups the reads of the input SAM/BAM by UMI using
    ``network.UMIClusterer`` on the bundles yielded by
    ``umi_methods.get_bundles``.  Depending on the options, tagged reads are
    written to a BAM/SAM file (``--output-bam``) and/or a per-read table is
    written to ``--group-out``.

    Raises:
        ValueError: if the input is standard in, or if stdout is redirected
            to a file without ``--output-bam``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group = U.OptionGroup(parser, "group-specific options")

    group.add_option(
        "--group-out", dest="tsv", type="string",
        help="Outfile name for file mapping read id to read group",
        default=None)
    group.add_option(
        "--output-bam", dest="output_bam", action="store_true",
        default=False,
        help=("output a bam file with read groups tagged using the UG tag"
              "[default=%default]"))
    group.add_option(
        "--output-unmapped", dest="output_unmapped", action="store_true",
        default=False,
        help=("Retain all unmapped reads in output[default=%default]"))

    parser.add_option("--umi-group-tag", dest="umi_group_tag",
                      type="string",
                      help="tag for the outputted umi group",
                      default='BX')

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options)

    if options.stdin == sys.stdin:
        raise ValueError("Input on standard in not currently supported")
    in_name = options.stdin.name
    options.stdin.close()

    if options.stdout != sys.stdout:
        # Explicit check rather than ``assert``: assertions are stripped when
        # python runs with -O, which would let this misconfiguration through.
        if not options.output_bam:
            raise ValueError(
                "To output a bam you must include --output-bam option")

        if options.no_sort_output:
            out_name = options.stdout.name
        else:
            # unsorted output goes to a tempfile, sorted into the real target
            out_name = U.getTempFilename()
            sorted_out_name = options.stdout.name
        options.stdout.close()
    else:
        if options.no_sort_output:
            out_name = "-"
        else:
            out_name = U.getTempFilename()
            sorted_out_name = "-"

    if not options.no_sort_output:
        # need to determine the output format for sort
        sort_format = "sam" if options.out_sam else "bam"

    in_mode = "r" if options.in_sam else "rb"
    out_mode = "wh" if options.out_sam else "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    mapping_outfile = None
    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join([
            "read_id", "contig", "position", "gene", "umi", "umi_count",
            "final_umi", "final_umi_count", "unique_id"]))

    nInput, nOutput, unique_id, input_reads, output_reads = 0, 0, 0, 0, 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(
                infile, metacontig2contig, metatag)
            # genes are read from the metacontig tag from here on
            gene_tag = metatag
        else:
            inreads = infile.fetch(until_eof=options.output_unmapped)

    bundle_iterator = umi_methods.get_bundles(
        options,
        all_reads=True,
        return_read2=True,
        return_unmapped=options.output_unmapped,
        metacontig_contig=metacontig2contig)

    try:
        for bundle, key, status in bundle_iterator(inreads):

            # write out read2s and unmapped (if these options are set)
            if status == 'single_read':
                # bundle is just a single read here
                nInput += 1

                if outfile:
                    outfile.write(bundle)

                # NOTE(review): counted whether or not a bam is written,
                # mirroring the per-read counting for grouped reads below;
                # the original nesting of this increment was ambiguous.
                nOutput += 1
                continue

            umis = bundle.keys()
            counts = {umi: bundle[umi]["count"] for umi in umis}

            nInput += sum(counts.values())

            # progress reporting in fixed-size steps
            while nOutput >= output_reads + 10000:
                output_reads += 10000
                U.info("Written out %i reads" % output_reads)

            while nInput >= input_reads + 1000000:
                input_reads += 1000000
                U.info("Parsed %i input reads" % input_reads)

            # set up UMIClusterer functor with methods specific to
            # specified options.method
            processor = network.UMIClusterer(options.method)

            # group the umis
            groups = processor(umis, counts, threshold=options.threshold)

            for umi_group in groups:
                # the first umi of each group is used as the group's label
                top_umi = umi_group[0]
                group_count = sum(counts[umi] for umi in umi_group)

                for umi in umi_group:
                    for read in bundle[umi]['read']:
                        if outfile:
                            # Add the 'UG' tag to the read
                            read.tags += [('UG', unique_id)]
                            read.tags += [(options.umi_group_tag, top_umi)]
                            outfile.write(read)

                        if mapping_outfile is not None:
                            if options.per_gene:
                                gene = read.get_tag(gene_tag)
                            else:
                                gene = "NA"
                            mapping_outfile.write("%s\n" % "\t".join(map(str, (
                                read.query_name, read.reference_name,
                                umi_methods.get_read_position(
                                    read, options.soft_clip_threshold)[1],
                                gene,
                                umi.decode(),
                                counts[umi],
                                top_umi.decode(),
                                group_count,
                                unique_id))))

                        nOutput += 1

                unique_id += 1
    finally:
        # release the output handles even if grouping fails part-way
        if outfile:
            outfile.close()
        if mapping_outfile is not None:
            mapping_outfile.close()

    if not options.no_sort_output:
        if outfile:
            # sort the output
            pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
        # delete the tempfile (if it exists; nothing is written to it
        # when no bam output was requested)
        if os.path.exists(out_name):
            os.unlink(out_name)

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join([
        "%s: %s" % (x[0], x[1])
        for x in bundle_iterator.read_events.most_common()]))

    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))

    U.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Extracts cell barcodes (and UMIs) from the fastq read(s) on stdin (and
    ``--read2-in``), counts reads or unique UMIs per cell barcode according
    to ``--method``, asks ``umi_methods.getCellWhitelist`` for the accepted
    barcodes and writes one tab-separated line per whitelisted barcode to
    stdout.

    Raises:
        ValueError: if ``--set-cell-number`` exceeds the number of cell
            barcodes observed.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--extract-method", dest="extract_method",
                      type="choice", choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, Choose "
                            "from 'string' or 'regex'"))
    parser.add_option("--plot-prefix", dest="plot_prefix", type="string",
                      help=("Prefix for plots to visualise the automated "
                            "detection of the number of 'true' cell barcodes"))
    parser.add_option("--subset-reads", dest="subset_reads", type="int",
                      help=("Use the first N reads to automatically identify "
                            "the true cell barcodes. If N is greater than the "
                            "number of reads, all reads will be used"))
    parser.add_option("--error-correct-threshold",
                      dest="error_correct_threshold", type="int",
                      help=("Hamming distance for correction of "
                            "barcodes to whitelist barcodes"))
    parser.add_option("--method", dest="method",
                      choices=["reads", "umis"],
                      help=("Use reads or unique umi counts per cell"))
    parser.add_option("--expect-cells", dest="expect_cells", type="int",
                      help=("Prior expectation on the upper limit on the "
                            "number of cells sequenced"))
    parser.add_option("--set-cell-number", dest="cell_number", type="int",
                      help=("Specify the number of cell barcodes to accept"))

    parser.set_defaults(method="reads",
                        extract_method="string",
                        filter_cell_barcodes=False,
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        plot_prefix=None,
                        subset_reads=100000000,
                        expect_cells=False,
                        cell_number=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False,
                              add_sam_options=False)

    if options.expect_cells and options.cell_number:
        U.error("Cannot supply both --expect-cells and "
                "--cell-number options")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern2 "
                    "if paired-end ")

    if options.pattern2:
        if not options.read2_in:
            U.error("must specify a paired fastq ``--read2-in``")

        # NOTE(review): nested under ``if options.pattern2`` this fallback can
        # never trigger; the original indentation could not be recovered, so
        # it is kept as a no-op rather than changing single-end behaviour.
        if not options.pattern2:
            options.pattern2 = options.pattern

    extract_cell = False
    extract_umi = False

    # If the pattern is a regex we can compile the regex(es) prior to
    # ExtractFilterAndUpdate instantiation
    if options.extract_method == "regex":
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)

        if options.pattern2:
            try:
                options.pattern2 = regex.compile(options.pattern2)
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

    # check whether the regex contains a umi group(s) and cell groups(s)
    if options.extract_method == "regex":
        for compiled in (options.pattern, options.pattern2):
            if not compiled:
                continue
            for group in compiled.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

    # check whether the pattern string contains umi/cell bases
    elif options.extract_method == "string":
        for pattern in (options.pattern, options.pattern2):
            if not pattern:
                continue
            if "C" in pattern:
                extract_cell = True
            if "N" in pattern:
                extract_umi = True

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" % (
                        options.pattern, options.pattern2))

    if not extract_cell:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any cell bases "
                    "(marked with 'Cs') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any cell groups "
                    "(starting with 'cell_') %s, %s" % (
                        options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        method=options.extract_method,
        pattern=options.pattern,
        pattern2=options.pattern2,
        prime3=options.prime3,
        extract_cell=extract_cell)

    cell_barcode_counts = collections.Counter()

    n_reads = 0
    n_cell_barcodes = 0

    # if using the umis method, need to keep a set of umis observed
    if options.method == "umis":
        cell_barcode_umis = collections.defaultdict(set)

    # variables for progress monitor
    displayMax = 100000
    U.info("Starting barcode extraction")

    if not options.read2_in:
        for read1 in read1s:

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))
            n_reads += 1

            barcode_values = ReadExtractor.getBarcodes(read1)
            if barcode_values is None:
                continue

            cell, umi, _, _, _, _, _ = barcode_values
            if options.method == "umis":
                cell_barcode_umis[cell].add(umi)
            else:
                cell_barcode_counts[cell] += 1
            n_cell_barcodes += 1

            # NOTE(review): single-end input is capped on the number of reads
            # that matched the pattern, paired-end input (below) on the
            # number of reads parsed; kept as found, confirm which is meant.
            if options.subset_reads:
                if n_cell_barcodes > options.subset_reads:
                    break
    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))
        for read1, read2 in izip(read1s, read2s):

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))
            n_reads += 1

            barcode_values = ReadExtractor.getBarcodes(read1, read2)
            if barcode_values is None:
                continue

            cell, umi, _, _, _, _, _ = barcode_values
            if options.method == "umis":
                cell_barcode_umis[cell].add(umi)
            else:
                cell_barcode_counts[cell] += 1
            n_cell_barcodes += 1

            if options.subset_reads:
                if n_reads > options.subset_reads:
                    break

    U.info("Starting - whitelist determination")

    if options.method == "umis":
        # the count for a cell is its number of distinct umis
        for cell in cell_barcode_umis:
            cell_barcode_counts[cell] = len(cell_barcode_umis[cell])

    if options.cell_number and options.cell_number > len(cell_barcode_counts):
        raise ValueError(
            "--set-cell-number option specifies more cell barcodes than the "
            "number of observed cell barcodes. This may be because "
            "--subset-reads was set to a value too low to capture reads from "
            "all cells. %s cell barcodes observed from %s parsed reads. "
            "Expected>= %s cell barcodes" % (
                len(cell_barcode_counts), n_reads, options.cell_number))

    cell_whitelist, true_to_false_map = umi_methods.getCellWhitelist(
        cell_barcode_counts,
        options.expect_cells,
        options.cell_number,
        options.error_correct_threshold,
        options.plot_prefix)

    U.info("Writing out whitelist")
    for barcode in sorted(cell_whitelist):

        if true_to_false_map:
            corrected = sorted(true_to_false_map[barcode])
            corrected_barcodes = ",".join(corrected)
            corrected_barcode_counts = ",".join(
                str(cell_barcode_counts[x]) for x in corrected)
        else:
            corrected_barcodes, corrected_barcode_counts = "", ""

        # columns: barcode, corrected barcodes, count, corrected counts
        options.stdout.write("%s\t%s\t%s\t%s\n" % (
            barcode, corrected_barcodes, cell_barcode_counts[barcode],
            corrected_barcode_counts))

    U.info("Parsed %i reads" % n_reads)
    U.info("%i reads matched the barcode pattern" % n_cell_barcodes)
    U.info("Found %i unique cell barcodes" % len(cell_barcode_counts))

    U.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Deduplicates the reads of the input SAM/BAM: bundles of reads yielded by
    ``umi_methods.get_bundles`` are reduced with
    ``network.ReadDeduplicator`` (or written out unchanged per UMI when
    ``--ignore-umi`` is set).  With ``--output-stats`` three tables
    (``_per_umi_per_position.tsv``, ``_per_umi.tsv`` and
    ``_edit_distance.tsv``) are written next to the given prefix.

    Raises:
        ValueError: on unsupported or inconsistent options (input on stdin,
            ``--output-stats`` together with ``--ignore-umi``, ``--per-gene``
            without a gene source, invalid ``--skip-tags-regex``, unusable
            ``--multimapping-detection-method`` or unknown umi extraction
            method).
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option(
        "-o", "--out-sam", dest="out_sam", action="store_true",
        help="Output alignments in sam format [default=%default]",
        default=False)
    parser.add_option("--ignore-umi", dest="ignore_umi", action="store_true",
                      help="Ignore UMI and dedup"
                      " only on position", default=False)
    parser.add_option("--umi-separator", dest="umi_sep", type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag", dest="umi_tag", type="string",
                      help="tag containing umi", default='RX')
    parser.add_option("--extract-umi-method", dest="get_umi_method",
                      type="choice", choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? [default=%default]")
    parser.add_option("--subset", dest="subset", type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]", default=False)
    parser.add_option("--soft-clip-threshold", dest="soft", type="float",
                      help="number of bases clipped from 5' end before"
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int", default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome", default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional", "percentile",
                               "unique", "cluster"),
                      default="directional",
                      help="method to use for umi deduping [default=%default]")
    parser.add_option("--output-stats", dest="stats", type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option(
        "--whole-contig", dest="whole_contig", action="store_true",
        default=False,
        help="Read whole contig before outputting bundles: guarantees that no reads"
        "are missed, but increases memory usage")
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method", type="choice",
                      choices=("NH", "X0", "XT"), default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi "
                            "[default=%default]"))
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]", default=0)
    parser.add_option(
        "--read-length", dest="read_length", action="store_true",
        default=False,
        help=("use read length in addition to position and UMI"
              "to identify possible duplicates [default=%default]"))
    parser.add_option("--per-contig", dest="per_contig", action="store_true",
                      default=False,
                      help=("dedup per contig (field 3 in BAM; RNAME),"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--per-gene", dest="per_gene", action="store_true",
                      default=False,
                      help=("Deduplicate per gene,"
                            "e.g for transcriptome where contig = transcript"
                            "must also provide a transript to gene map with"
                            "--gene-transcript-map [default=%default]"))
    parser.add_option("--gene-transcript-map", dest="gene_transcript_map",
                      type="string",
                      help="file mapping transcripts to genes (tab separated)",
                      default=None)
    parser.add_option("--gene-tag", dest="gene_tag", type="string",
                      help=("Deduplicate per gene where gene is"
                            "defined by this bam tag [default=%default]"),
                      default=None)
    # The default is an alternation of the two prefixes.  The previous
    # default, "^[__|Unassigned]", was a character class and therefore
    # matched any tag starting with one of the characters "_|Unasigned".
    parser.add_option(
        "--skip-tags-regex", dest="skip_regex", type="string",
        help=("Used with --gene-tag. "
              "Ignore reads where the gene-tag matches this regex"),
        default="^(__|Unassigned)")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.random_seed:
        np.random.seed(options.random_seed)

    if options.stdin == sys.stdin:
        raise ValueError("Input on standard in not currently supported")
    in_name = options.stdin.name
    options.stdin.close()

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    in_mode = "r" if options.in_sam else "rb"
    out_mode = "wh" if options.out_sam else "wb"

    if options.stats:
        if options.ignore_umi:
            raise ValueError("'--output-stats' and '--ignore-umi' options"
                             " cannot be used together")

    if options.per_gene:
        # a gene can come from a transcript map or from a bam tag
        if not options.gene_transcript_map and not options.gene_tag:
            raise ValueError(
                "--per-gene option requires --gene-transcript-map "
                "or --gene-tag")

    try:
        re.compile(options.skip_regex)
    except re.error:
        raise ValueError("skip-regex '%s' is not a "
                         "valid regex" % options.skip_regex)

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = umi_methods.TwoPassPairWriter(infile, outfile)

    nInput, nOutput = 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" % (
                        options.detection_method,
                        ",".join([x for x in bam_features
                                  if bam_features[x]])))

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = umi_methods.random_read_generator(
            infile.filename, chrom=options.chrom, umi_getter=umi_getter)

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(
                infile, metacontig2contig, metatag)
            gene_tag = metatag
        else:
            inreads = infile.fetch()
            gene_tag = options.gene_tag

    # NOTE(review): the local ``gene_tag`` computed above is not used; the
    # call below passes ``options.gene_tag``.  Kept as found - confirm
    # whether the metacontig tag should be passed when a transcript map is
    # given.

    # defined up front so the footer below works when no bundle is yielded
    read_events = collections.Counter()

    for bundle, read_events, status in umi_methods.get_bundles(
            inreads,
            ignore_umi=options.ignore_umi,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            gene_tag=options.gene_tag,
            skip_regex=options.skip_regex,
            whole_contig=options.whole_contig,
            read_length=options.read_length,
            detection_method=options.detection_method,
            umi_getter=umi_getter,
            all_reads=False,
            return_read2=False,
            return_unmapped=False):

        nInput += sum(bundle[umi]["count"] for umi in bundle)

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        if options.stats:
            # generate pre-dudep stats
            average_distance = umi_methods.get_average_umi_distance(
                bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            # null expectation: same number of umis drawn by read_gn
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = umi_methods.get_average_umi_distance(
                random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:
            # set up ReadCluster functor with methods specific to
            # specified options.method
            processor = network.ReadDeduplicator(options.method)

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts = processor(
                bundle=bundle, threshold=options.threshold)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:
                # collect pre-dudupe stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dudupe stats
                post_cluster_umis = [umi_getter(x) for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = umi_methods.get_average_umi_distance(
                    post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = umi_methods.get_average_umi_distance(
                    random_umis)
                post_cluster_stats_null.append(average_distance_null)

    outfile.close()

    if options.stats:

        # generate the stats dataframe
        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # tally the counts per umi per position
        pre_counts = collections.Counter(stats_pre_df["counts"])
        post_counts = collections.Counter(stats_post_df["counts"])
        counts_index = sorted(set(pre_counts).union(post_counts))

        with U.openFile(options.stats + "_per_umi_per_position.tsv",
                        "w") as outf:
            outf.write("counts\tinstances_pre\tinstances_post\n")
            for count in counts_index:
                values = (count, pre_counts[count], post_counts[count])
                outf.write("\t".join(map(str, values)) + "\n")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df, agg_post_df, how='left',
                          left_index=True, right_index=True,
                          sort=True, suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        agg_df = agg_df.fillna(0).astype(int)

        agg_df.index = [x.decode() for x in agg_df.index]
        agg_df.index.name = 'UMI'

        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(max(map(max, [pre_cluster_stats,
                                   post_cluster_stats,
                                   pre_cluster_stats_null,
                                   post_cluster_stats_null])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster,
                               minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        # TS - set lowest bin (-1) to "Single_UMI".  The labels are built up
        # front instead of patching the frame afterwards through chained
        # indexing (``df[col][0] = ...``), which may write to a temporary
        # copy rather than to the frame.
        edit_distance_labels = ["Single_UMI"] + list(cluster_bins)[1:]

        edit_distance_df = pd.DataFrame({
            "unique": tallyCounts(pre_cluster_binned, max_ed),
            "unique_null": tallyCounts(pre_cluster_null_binned, max_ed),
            options.method: tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method: tallyCounts(
                post_cluster_null_binned, max_ed),
            "edit_distance": edit_distance_labels})

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False, sep="\t")

    # write footer and output benchmark information.
    U.info(
        "%s" % ", ".join(["%s: %s" % (x[0], x[1])
                          for x in read_events.most_common()]))

    U.info("Number of reads out: %i" % nOutput)

    U.Stop()
def main(argv=None):
    """Script entry point: count unique UMI groups per gene (and per cell).

    Parses command line options in sys.argv, unless *argv* is given.

    Reads are bundled with ``umi_methods.get_bundles``; for every bundle the
    UMIs are grouped with ``network.UMIClusterer`` and the number of groups
    is written to a temporary file, which is then re-formatted onto
    ``options.stdout`` (long format, or wide format when
    ``--wide-format-cell-counts`` is set together with per-cell counting).

    Raises:
        ValueError: if the input would have to be read from standard in.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group = U.OptionGroup(parser, "count-specific options")

    # register the option on the group (not on the bare parser) so that it
    # is listed under "count-specific options" in the help output
    group.add_option("--wide-format-cell-counts",
                     dest="wide_format_cell_counts",
                     action="store_true",
                     default=False,
                     help=("output the cell counts in a wide format "
                           "(rows=genes, columns=cells)"))

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False)

    options.per_gene = True  # hardcodes counting to per-gene only

    U.validateSamOptions(options, group=False)

    if options.random_seed:
        np.random.seed(options.random_seed)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    infile = pysam.Samfile(in_name, in_mode)

    nInput, nOutput, input_reads = 0, 0, 0

    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(
                infile, metacontig2contig, metatag)
        else:
            inreads = infile.fetch()

    bundle_iterator = umi_methods.get_bundles(
        options,
        only_count_reads=True,
        metacontig_contig=metacontig2contig)

    # counts are first written to a tempfile and then re-formatted to stdout;
    # the try/finally guarantees the tempfile is removed even on failure
    tmpfilename = U.getTempFilename(dir=options.tmpdir)

    try:
        with U.openFile(tmpfilename, mode="w") as tmpfile:

            for bundle, key, status in bundle_iterator(inreads):
                if status == "single_read":
                    continue

                gene, cell = key

                umis = bundle.keys()
                counts = {umi: bundle[umi]["count"] for umi in umis}

                nInput += sum(counts.values())

                while nInput >= input_reads + 1000000:
                    input_reads += 1000000
                    U.info("Parsed %i input reads" % input_reads)

                # set up UMIClusterer functor with methods specific to
                # specified options.method
                # NOTE(review): a fresh clusterer is built for every bundle,
                # as before; hoisting it is only safe if it keeps no
                # per-bundle state - confirm against network.UMIClusterer
                processor = network.UMIClusterer(options.method)

                # group the umis
                groups = processor(
                    umis,
                    counts,
                    threshold=options.threshold)

                gene_count = len(groups)

                if options.per_cell:
                    tmpfile.write("%s\n" % "\t".join(
                        (gene, cell.decode(), str(gene_count))))
                else:
                    tmpfile.write("%s\n" % "\t".join(
                        (gene, str(gene_count))))

                nOutput += gene_count

        if options.per_cell:

            # gene -> {cell -> count (kept as the string read from file)}
            gene_counts_dict = {}
            genes = set()
            cells = set()

            with U.openFile(tmpfilename, mode="r") as inf:
                for line in inf:
                    gene, cell, gene_count = line.strip().split("\t")
                    genes.add(gene)
                    cells.add(cell)
                    gene_counts_dict.setdefault(gene, {})[cell] = gene_count

            if options.wide_format_cell_counts:  # write out in wide format

                # the column order is the same for every row: sort once
                sorted_cells = sorted(cells)

                options.stdout.write(
                    "%s\t%s\n" % ("gene", "\t".join(sorted_cells)))

                for gene in sorted(genes):
                    cell_counts = gene_counts_dict[gene]
                    # cells never seen for this gene are reported as 0
                    counts = [cell_counts.get(cell, 0)
                              for cell in sorted_cells]
                    options.stdout.write(
                        "%s\t%s\n" % (gene, "\t".join(map(str, counts))))

            else:  # write out in long format
                options.stdout.write(
                    "%s\t%s\t%s\n" % ("gene", "cell", "count"))

                for gene in sorted(genes):
                    for cell in sorted(gene_counts_dict[gene]):
                        options.stdout.write("%s\t%s\t%s\n" % (
                            gene, cell, gene_counts_dict[gene][cell]))
        else:
            options.stdout.write("%s\t%s\n" % ("gene", "count"))

            with U.openFile(tmpfilename, mode="r") as inf:
                for line in inf:
                    options.stdout.write(line)

    finally:
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)

    # output reads events and benchmark information.
    for event in bundle_iterator.read_events.most_common():
        U.info("%s: %s" % (event[0], event[1]))

    U.info("Number of (post deduplication) reads counted: %i" % nOutput)

    U.Stop()
def main(argv=None):
    """Script entry point: deduplicate reads using their UMIs.

    Parses command line options in sys.argv, unless *argv* is given.

    Reads are bundled with ``sam_methods.get_bundles``; each bundle is
    deduplicated with ``network.ReadDeduplicator`` (or written out unchanged
    with ``--ignore-umi``) and, unless ``--no-sort-output`` is set, the
    result is sorted with ``pysam.sort``.  With ``--output-stats`` three
    tables are written using the given prefix: ``_per_umi_per_position.tsv``,
    ``_per_umi.tsv`` and ``_edit_distance.tsv``.

    Raises:
        ValueError: if the input would have to be read from standard in, if
            ``--output-stats`` is combined with ``--ignore-umi``, or if the
            requested multimapping detection method is not usable with the
            input BAM.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])
    group = U.OptionGroup(parser, "dedup-specific options")

    group.add_option("--output-stats", dest="stats", type="string",
                     default=False,
                     help="Specify location to output stats")

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options, group=False)

    if options.random_seed:
        np.random.seed(options.random_seed)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    # when the output is to be sorted, reads are first written to a tempfile
    # (out_name) and the sorted result goes to sorted_out_name
    if options.stdout != sys.stdout:
        if options.no_sort_output:
            out_name = options.stdout.name
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = options.stdout.name
        options.stdout.close()
    else:
        if options.no_sort_output:
            out_name = "-"
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = "-"

    if not options.no_sort_output:
        # need to determine the output format for sort
        if options.out_sam:
            sort_format = "sam"
        else:
            sort_format = "bam"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    if options.stats and options.ignore_umi:
        raise ValueError("'--output-stats' and '--ignore-umi' options"
                         " cannot be used together")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = sam_methods.TwoPassPairWriter(infile, outfile)

    nInput, nOutput, input_reads, output_reads = 0, 0, 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" % (
                        options.detection_method, ",".join(
                            [x for x in bam_features if bam_features[x]])))

    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.per_contig and options.gene_transcript_map:
            metacontig2contig = sam_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = sam_methods.metafetcher(
                infile, metacontig2contig, metatag)
        else:
            inreads = infile.fetch()

    # set up ReadCluster functor with methods specific to
    # specified options.method
    processor = network.ReadDeduplicator(options.method)

    bundle_iterator = sam_methods.get_bundles(
        options,
        metacontig_contig=metacontig2contig)

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        # source of randomly drawn UMIs for the "null" distance distribution
        read_gn = umi_methods.random_read_generator(
            infile.filename, chrom=options.chrom,
            barcode_getter=bundle_iterator.barcode_getter)

    for bundle, key, status in bundle_iterator(inreads):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        while nOutput >= output_reads + 100000:
            output_reads += 100000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        if options.stats:
            # generate pre-dedup stats
            average_distance = umi_methods.get_average_umi_distance(
                bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = umi_methods.get_average_umi_distance(
                random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:
            # dedup using umis and write out deduped bam
            reads, umis, umi_counts = processor(
                bundle=bundle,
                threshold=options.threshold)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:

                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
                post_cluster_umis = [
                    bundle_iterator.barcode_getter(x)[0] for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = umi_methods.get_average_umi_distance(
                    post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = umi_methods.get_average_umi_distance(
                    random_umis)
                post_cluster_stats_null.append(average_distance_null)

    outfile.close()

    if not options.no_sort_output:
        # sort the output
        pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
        os.unlink(out_name)  # delete the tempfile

    if options.stats:

        # generate the stats dataframe
        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # tally the counts per umi per position
        pre_counts = collections.Counter(stats_pre_df["counts"])
        post_counts = collections.Counter(stats_post_df["counts"])
        counts_index = list(
            set(pre_counts.keys()).union(set(post_counts.keys())))
        counts_index.sort()

        with U.openFile(options.stats + "_per_umi_per_position.tsv",
                        "w") as outf:
            outf.write("counts\tinstances_pre\tinstances_post\n")
            for count in counts_index:
                values = (count, pre_counts[count], post_counts[count])
                outf.write("\t".join(map(str, values)) + "\n")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df, agg_post_df, how='left',
                          left_index=True, right_index=True,
                          sort=True, suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        agg_df = agg_df.fillna(0).astype(int)

        agg_df.index = [x.decode() for x in agg_df.index]
        agg_df.index.name = 'UMI'
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(max(map(max, [pre_cluster_stats,
                                   post_cluster_stats,
                                   pre_cluster_stats_null,
                                   post_cluster_stats_null])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster,
                               minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        edit_distance_df = pd.DataFrame(
            {"unique": tallyCounts(pre_cluster_binned, max_ed),
             "unique_null": tallyCounts(pre_cluster_null_binned, max_ed),
             options.method: tallyCounts(post_cluster_binned, max_ed),
             "%s_null" % options.method: tallyCounts(
                 post_cluster_null_binned, max_ed),
             "edit_distance": cluster_bins},
            columns=["unique", "unique_null", options.method,
                     "%s_null" % options.method, "edit_distance"])

        # TS - set lowest bin (-1) to "Single_UMI"
        # The column is cast to object so a string label can sit next to the
        # integer bins, and the value is set with a single .loc call: chained
        # indexing (df[col][0] = ...) may assign to a temporary copy and
        # leave the frame untouched.
        edit_distance_df["edit_distance"] = (
            edit_distance_df["edit_distance"].astype(object))
        edit_distance_df.loc[0, "edit_distance"] = "Single_UMI"

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False, sep="\t")

    # write footer and output benchmark information.
    U.info(
        "Reads: %s" % ", ".join(["%s: %s" % (x[0], x[1]) for x in
                                 bundle_iterator.read_events.most_common()]))

    U.info("Number of reads out: %i" % nOutput)

    if not options.ignore_umi:  # otherwise processor has not been used
        U.info("Total number of positions deduplicated: %i" %
               processor.UMIClusterer.positions)
        if processor.UMIClusterer.positions > 0:
            U.info("Mean number of unique UMIs per position: %.2f" %
                   (float(processor.UMIClusterer.total_umis_per_position) /
                    processor.UMIClusterer.positions))
            U.info("Max. number of unique UMIs per position: %i" %
                   processor.UMIClusterer.max_umis_per_position)
        else:
            U.warn("The BAM did not contain any valid "
                   "reads/read pairs for deduplication")

    U.Stop()
def getUserDefinedBarcodes(whitelist_tsv, whitelist_tsv2=None,
                           getErrorCorrection=False,
                           deriveErrorCorrection=False,
                           threshold=1):
    '''
    Read the whitelisted barcodes and, optionally, an error-correction map.

    whitelist_tsv: tab-separated file with whitelisted barcodes. First
    field should be whitelist barcodes. Second field [optional] should
    be comma-separated barcodes which are to be corrected to the
    barcode in the first field. Lines starting with '#' and blank lines
    are ignored.

    whitelist_tsv2: as above but for read2s. When given, the whitelist is
    every concatenation of a barcode from whitelist_tsv with a barcode
    from whitelist_tsv2.

    getErrorCorrection: extract the second field in whitelist_tsv and
    return a map of non-whitelist:whitelist. Cannot be combined with
    whitelist_tsv2 (AssertionError).

    deriveErrorCorrection: return a map of non-whitelist:whitelist
    using a simple edit distance threshold. Takes precedence over
    getErrorCorrection. Whitelist barcodes may then only contain the
    bases A, C, G and T (any other character raises KeyError).

    threshold: number of error positions substituted when deriving the
    error barcodes.

    Returns a tuple (whitelist, false_to_true_map): the set of whitelist
    barcodes and the error-correction dictionary, or None in place of the
    dictionary if neither error-correction option was requested. Derived
    error barcodes which can be assigned to more than one whitelist
    barcode are mapped to None.
    '''

    base2errors = {"A": ["T", "C", "G", "N"],
                   "T": ["A", "C", "G", "N"],
                   "C": ["T", "A", "G", "N"],
                   "G": ["T", "C", "A", "N"]}

    whitelist = []

    if getErrorCorrection or deriveErrorCorrection:
        false_to_true_map = {}
    else:
        false_to_true_map = None

    def barcodeFieldsGenerator(tsv):
        ''' yield the tab-separated fields of every barcode line'''
        with U.openFile(tsv, "r") as inf:
            for line in inf:
                if line.startswith('#'):
                    continue
                line = line.strip()
                if not line:
                    # a blank line would otherwise give an empty barcode
                    continue
                yield line.split("\t")

    def singleBarcodeGenerator(whitelist_tsv):
        for fields in barcodeFieldsGenerator(whitelist_tsv):
            yield fields[0]

    def pairedBarcodeGenerator(whitelist_tsv, whitelist_tsv2):
        whitelist1 = list(singleBarcodeGenerator(whitelist_tsv))
        whitelist2 = list(singleBarcodeGenerator(whitelist_tsv2))

        for w1, w2 in itertools.product(whitelist1, whitelist2):
            yield w1 + w2

    def whitelistBarcodeGenerator():
        if whitelist_tsv2:
            return pairedBarcodeGenerator(whitelist_tsv, whitelist_tsv2)
        return singleBarcodeGenerator(whitelist_tsv)

    if deriveErrorCorrection:

        for whitelist_barcode in whitelistBarcodeGenerator():
            whitelist.append(whitelist_barcode)

            # for every possible combination of positions for error(s)
            for positions in itertools.product(
                    range(len(whitelist_barcode)), repeat=threshold):

                m_bases = [base2errors[whitelist_barcode[x]]
                           for x in positions]

                # for every possible combination of errors
                for m in itertools.product(*m_bases):
                    error_barcode = list(whitelist_barcode)

                    # add errors
                    for pos, error_base in zip(positions, m):
                        error_barcode[pos] = error_base

                    error_barcode = "".join(error_barcode)

                    if error_barcode not in false_to_true_map:
                        false_to_true_map[error_barcode] = whitelist_barcode
                        continue

                    true_barcode = false_to_true_map[error_barcode]

                    # with threshold > 1 the same error barcode is reached
                    # from the same whitelist barcode through several
                    # position/base combinations: this is not an ambiguity
                    if true_barcode == whitelist_barcode:
                        continue

                    # error barcode has already been seen for another
                    # barcode so must be within threshold edit distance
                    # of >1 whitelisted barcodes
                    # (don't report multiple times for the same barcode)
                    if true_barcode:
                        U.info("Error barcode %s can be assigned to more than "
                               "one possible true barcode: %s or %s" % (
                                   error_barcode,
                                   true_barcode,
                                   whitelist_barcode))
                    false_to_true_map[error_barcode] = None

    elif getErrorCorrection:
        assert not whitelist_tsv2, ("Can only extract errors from the whitelist "
                                    "if a single whitelist is given")

        for fields in barcodeFieldsGenerator(whitelist_tsv):
            whitelist_barcode = fields[0]
            whitelist.append(whitelist_barcode)

            # the second field is optional: no errors listed for this barcode
            if len(fields) < 2:
                continue

            for error_barcode in fields[1].split(","):
                if error_barcode:
                    false_to_true_map[error_barcode] = whitelist_barcode

    else:  # no error correction
        whitelist = list(whitelistBarcodeGenerator())

    return set(whitelist), false_to_true_map