def __call__(self, umis, counts):
    '''Cluster UMIs and return the resulting groups as lists.

    umis is an iterable of UMI sequences; counts is a dictionary that
    maps UMIs to their counts. Both are passed through unchanged to
    get_adj_list, get_connected_components and get_groups.

    Returns a list with one list per group yielded by get_groups.
    A warning is issued (via U.warn) when the UMIs do not all share the
    same length; processing continues regardless.
    '''

    # A set of lengths is empty-safe: min()/max() on an empty sequence
    # raise ValueError, so the length check must not run for empty input.
    umi_lengths = {len(umi) for umi in umis}

    if len(umi_lengths) > 1:
        U.warn("not all umis are the same length(!):  %d - %d" % (
            min(umi_lengths), max(umi_lengths)))

    adj_list = self.get_adj_list(umis, counts)
    clusters = self.get_connected_components(umis, adj_list, counts)

    return [list(group)
            for group in self.get_groups(clusters, adj_list, counts)]
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Opens the input SAM/BAM named by ``options.stdin``, iterates over the
    bundles produced by ``sam_methods.get_bundles`` and writes, per
    bundle, either one read per UMI (``--ignore-umi``) or the reads
    returned by ``network.ReadDeduplicator``. Unless
    ``options.no_sort_output`` is set, reads are first written to a
    temporary file which is then sorted into the final destination with
    ``pysam.sort`` and removed. With ``--output-stats`` three TSV files
    are written using the given value as filename prefix.

    Raises ValueError if input would come from standard in, if
    ``--output-stats`` is combined with ``--ignore-umi``, or if the
    requested multimapping detection method cannot be used with the BAM.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])
    group = U.OptionGroup(parser, "dedup-specific options")

    group.add_option("--output-stats", dest="stats", type="string",
                     default=False,
                     help="Specify location to output stats")

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options, group=False)

    if options.random_seed:
        np.random.seed(options.random_seed)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    # Reject the incompatible option pair before a temporary output
    # filename is requested or stdout is closed, so that a bad command
    # line fails without those side effects.
    if options.stats and options.ignore_umi:
        raise ValueError("'--output-stats' and '--ignore-umi' options"
                         " cannot be used together")

    if options.stdout != sys.stdout:
        if options.no_sort_output:
            out_name = options.stdout.name
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = options.stdout.name
        options.stdout.close()
    else:
        if options.no_sort_output:
            out_name = "-"
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = "-"

    if not options.no_sort_output:
        # need to determine the output format for sort
        if options.out_sam:
            sort_format = "sam"
        else:
            sort_format = "bam"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = sam_methods.TwoPassPairWriter(infile, outfile)

    nInput, nOutput, input_reads, output_reads = 0, 0, 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" % (
                        options.detection_method,
                        ",".join(
                            [x for x in bam_features if bam_features[x]])))

    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)

    else:
        if options.per_contig and options.gene_transcript_map:
            metacontig2contig = sam_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = sam_methods.metafetcher(
                infile, metacontig2contig, metatag)
        else:
            inreads = infile.fetch()

    # set up ReadCluster functor with methods specific to
    # specified options.method
    processor = network.ReadDeduplicator(options.method)

    bundle_iterator = sam_methods.get_bundles(
        options,
        metacontig_contig=metacontig2contig)

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        read_gn = umi_methods.random_read_generator(
            infile.filename, chrom=options.chrom,
            barcode_getter=bundle_iterator.barcode_getter)

    for bundle, key, status in bundle_iterator(inreads):

        nInput += sum(bundle[umi]["count"] for umi in bundle)

        # progress messages: one per 100,000 reads written and one per
        # 1,000,000 reads parsed
        while nOutput >= output_reads + 100000:
            output_reads += 100000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        if options.stats:
            # generate pre-dedup stats
            average_distance = umi_methods.get_average_umi_distance(
                bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            # null expectation: same number of UMIs drawn by read_gn
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = umi_methods.get_average_umi_distance(
                random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts = processor(
                bundle=bundle,
                threshold=options.threshold)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:

                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
                post_cluster_umis = [
                    bundle_iterator.barcode_getter(x)[0] for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = umi_methods.get_average_umi_distance(
                    post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = umi_methods.get_average_umi_distance(
                    random_umis)
                post_cluster_stats_null.append(average_distance_null)

    outfile.close()

    if not options.no_sort_output:
        # sort the output
        pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
        os.unlink(out_name)  # delete the tempfile

    if options.stats:

        # generate the stats dataframe
        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # tally the counts per umi per position
        pre_counts = collections.Counter(stats_pre_df["counts"])
        post_counts = collections.Counter(stats_post_df["counts"])
        counts_index = list(
            set(pre_counts.keys()).union(set(post_counts.keys())))
        counts_index.sort()
        with U.openFile(options.stats + "_per_umi_per_position.tsv",
                        "w") as outf:
            outf.write("counts\tinstances_pre\tinstances_post\n")
            for count in counts_index:
                values = (count, pre_counts[count], post_counts[count])
                outf.write("\t".join(map(str, values)) + "\n")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df, agg_post_df, how='left',
                          left_index=True, right_index=True,
                          sort=True, suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        agg_df = agg_df.fillna(0).astype(int)

        # NOTE(review): .decode() presumes the UMIs in the index are
        # bytes objects - confirm against the barcode getter.
        agg_df.index = [x.decode() for x in agg_df.index]
        agg_df.index.name = 'UMI'
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(max(map(max, [pre_cluster_stats,
                                   post_cluster_stats,
                                   pre_cluster_stats_null,
                                   post_cluster_stats_null])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster,
                               minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        # TS - set lowest bin (-1) to "Single_UMI"
        # The label column is built up front rather than patched in with
        # chained indexing (df[col][0] = ...), which assigns to a
        # temporary under pandas copy-on-write and would leave the -1
        # in place.
        edit_distance_labels = ["Single_UMI"] + list(cluster_bins)[1:]

        edit_distance_df = pd.DataFrame(
            {"unique": tallyCounts(pre_cluster_binned, max_ed),
             "unique_null": tallyCounts(pre_cluster_null_binned, max_ed),
             options.method: tallyCounts(post_cluster_binned, max_ed),
             "%s_null" % options.method: tallyCounts(
                 post_cluster_null_binned, max_ed),
             "edit_distance": edit_distance_labels},
            columns=["unique", "unique_null", options.method,
                     "%s_null" % options.method, "edit_distance"])

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False, sep="\t")

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join(
        ["%s: %s" % (x[0], x[1])
         for x in bundle_iterator.read_events.most_common()]))

    U.info("Number of reads out: %i" % nOutput)

    if not options.ignore_umi:  # otherwise processor has not been used
        U.info("Total number of positions deduplicated: %i" %
               processor.UMIClusterer.positions)
        if processor.UMIClusterer.positions > 0:
            U.info("Mean number of unique UMIs per position: %.2f" %
                   (float(processor.UMIClusterer.total_umis_per_position) /
                    processor.UMIClusterer.positions))
            U.info("Max. number of unique UMIs per position: %i" %
                   processor.UMIClusterer.max_umis_per_position)
        else:
            U.warn("The BAM did not contain any valid "
                   "reads/read pairs for deduplication")

    U.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Opens the input SAM/BAM named by ``options.stdin``, iterates over the
    bundles produced by ``sam_methods.get_bundles`` and groups the UMIs
    of each bundle with ``network.UMIClusterer``. With ``--output-bam``
    every read is written with a ``UG`` tag (numeric group id) and the
    tag named by ``--umi-group-tag`` (first UMI of the group); with
    ``--group-out`` one TSV row per grouped read is written as well.

    Raises ValueError if input would come from standard in, or if an
    output file is given without ``--output-bam``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])
    group = U.OptionGroup(parser, "group-specific options")

    group.add_option("--group-out", dest="tsv", type="string",
                     help="Outfile name for file mapping read id to read group",
                     default=None)

    group.add_option("--output-bam", dest="output_bam", action="store_true",
                     default=False,
                     help=("output a bam file with read groups tagged using the UG tag"
                           "[default=%default]"))

    parser.add_option("--umi-group-tag", dest="umi_group_tag",
                      type="string", help="tag for the outputted umi group",
                      default='BX')

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options, group=True)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        # Validate with an explicit exception (an assert is stripped
        # under ``python -O``) and do so before a temporary filename is
        # requested, so a bad command line fails without side effects.
        if not options.output_bam:
            raise ValueError(
                "To output a bam you must include --output-bam option")

        if options.no_sort_output:
            out_name = options.stdout.name
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = options.stdout.name
        options.stdout.close()
    else:
        # Without --output-bam no BAM is opened, sorted or unlinked
        # below, so do not request a temporary filename that nothing
        # would ever clean up.
        if options.no_sort_output or not options.output_bam:
            out_name = "-"
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = "-"

    if not options.no_sort_output:
        # need to determine the output format for sort
        if options.out_sam:
            sort_format = "sam"
        else:
            sort_format = "bam"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join(
            ["read_id", "contig", "position", "gene", "umi", "umi_count",
             "final_umi", "final_umi_count", "unique_id"]))

    nInput, nOutput, unique_id, input_reads, output_reads = 0, 0, 0, 0, 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    output_unmapped = options.unmapped_reads in ["use", "output"]

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)

    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = sam_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = sam_methods.metafetcher(
                infile, metacontig2contig, metatag)
            # genes are looked up via the metacontig tag from here on
            gene_tag = metatag

        else:
            inreads = infile.fetch(until_eof=output_unmapped)

    bundle_iterator = sam_methods.get_bundles(
        options,
        all_reads=True,
        return_read2=True,
        return_unmapped=output_unmapped,
        metacontig_contig=metacontig2contig)

    # set up UMIClusterer functor with methods specific to
    # specified options.method
    processor = network.UMIClusterer(options.method)

    for bundle, key, status in bundle_iterator(inreads):

        # write out read2s and unmapped/chimeric (if these options are set)
        if status == 'single_read':
            # bundle is just a single read here
            nInput += 1

            if outfile:
                outfile.write(bundle)

            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        # progress messages: one per 10,000 reads written and one per
        # 1,000,000 reads parsed
        while nOutput >= output_reads + 10000:
            output_reads += 10000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        # group the umis
        groups = processor(
            counts,
            threshold=options.threshold)

        for umi_group in groups:
            # the first UMI of each group labels the whole group
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.set_tag('UG', unique_id)
                        read.set_tag(options.umi_group_tag, top_umi)
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        # NOTE(review): .decode() presumes UMIs are bytes
                        # objects - confirm against the barcode getter.
                        mapping_outfile.write("%s\n" % "\t".join(map(str, (
                            read.query_name, read.reference_name,
                            sam_methods.get_read_position(
                                read, options.soft_clip_threshold)[1],
                            gene,
                            umi.decode(),
                            counts[umi],
                            top_umi.decode(),
                            group_count,
                            unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()
        if not options.no_sort_output:
            # sort the output
            pysam.sort("-o", sorted_out_name, "-O", sort_format,
                       "--no-PG", out_name)
            os.unlink(out_name)  # delete the tempfile

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join(
        ["%s: %s" % (x[0], x[1])
         for x in bundle_iterator.read_events.most_common()]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))

    U.info("Total number of positions deduplicated: %i" %
           processor.positions)
    if processor.positions > 0:
        U.info("Mean number of unique UMIs per position: %.2f" %
               (float(processor.total_umis_per_position) /
                processor.positions))
        U.info("Max. number of unique UMIs per position: %i" %
               processor.max_umis_per_position)
    else:
        U.warn("The BAM did not contain any valid "
               "reads/read pairs for deduplication")

    U.Stop()
def __call__(self, inreads):
    '''Iterate over reads and yield ``(payload, key, status)`` tuples.

    Reads that are passed through without grouping are yielded as
    ``(read, None, "single_read")``. Grouped reads are accumulated in
    ``self.reads_dict`` through ``self.update_dicts`` and yielded as
    ``(self.reads_dict[p][k], k, "bundle")`` once ``self.check_output``
    reports that position/gene ``p`` can be flushed; whatever remains is
    yielded after the input is exhausted.

    Counters describing what happened to each read are kept in
    ``self.read_events``.
    '''

    for read in inreads:

        if read.is_read2:
            if self.return_read2:
                if not read.is_unmapped or (
                        read.is_unmapped and self.return_unmapped):
                    yield read, None, "single_read"
            continue
        else:
            self.read_events['Input Reads'] += 1

        # only ever dealing with read1s from here

        if self.options.paired:
            if read.is_paired:
                self.read_events['Read pairs'] += 1
            else:
                self.read_events['Unpaired reads'] += 1

                # if paired end input and read1 is unpaired...

                # skip, or
                if self.options.unpaired_reads == "discard":
                    continue

                # yield without grouping, or
                elif self.options.unpaired_reads == "output":
                    yield read, None, "single_read"
                    # As in the chimeric-pair branch below: a read that
                    # has been passed through must not also be grouped,
                    # otherwise it is emitted a second time in a bundle.
                    continue

                # Use read pair; TLEN will be 0
                elif self.options.unpaired_reads == "use":
                    pass

        if read.is_unmapped:
            if self.options.paired:
                if read.mate_is_unmapped:
                    self.read_events['Both unmapped'] += 1
                else:
                    self.read_events['Read 1 unmapped'] += 1
            else:
                self.read_events['Single end unmapped'] += 1

            # if read1 is unmapped, yield immediately or skip read
            if self.return_unmapped:
                # NOTE(review): 'Input Reads' was already incremented
                # for this read at the top of the loop - confirm whether
                # counting it twice is intended.
                self.read_events['Input Reads'] += 1
                yield read, None, "single_read"
            continue

        if self.options.paired and read.mate_is_unmapped:
            if not read.is_unmapped:
                self.read_events['Read 2 unmapped'] += 1
            # if paired end input and read2 is unmapped, skip unless
            # options.unmapped_reads == "use", in which case TLEN will be 0
            if self.options.unmapped_reads != "use":
                if self.return_unmapped:
                    yield read, None, "single_read"
                continue

        if read.is_paired and (
                read.reference_name != read.next_reference_name):
            self.read_events['Chimeric read pair'] += 1

            # if paired end input and read2 is mapped to another contig...

            # skip, or
            if self.options.chimeric_pairs == "discard":
                continue

            # yield without grouping, or
            elif self.options.chimeric_pairs == "output":
                yield read, None, "single_read"
                continue

            # Use read pair; TLEN will be 0
            elif self.options.chimeric_pairs == "use":
                pass

        if self.options.subset:
            if random.random() >= self.options.subset:
                self.read_events['Randomly excluded'] += 1
                continue

        if self.options.mapping_quality:
            if read.mapq < self.options.mapping_quality:
                self.read_events['< MAPQ threshold'] += 1
                continue

        # get the umi +/- cell barcodes
        if self.options.ignore_umi:
            if self.options.per_cell:
                umi, cell = self.barcode_getter(read)
                umi = ""
            else:
                umi, cell = "", ""
        else:
            try:
                umi, cell = self.barcode_getter(read)
            except KeyError:
                error_msg = "Read skipped, missing umi and/or cell tag"
                # warn for the first such read only; later ones are
                # just counted
                if self.read_events[error_msg] == 0:
                    # pysam renamed .tostring -> to_string in 0.14
                    # .tostring requires access to the parent AlignmentFile
                    try:
                        formatted_read = read.to_string()
                    except AttributeError:
                        formatted_read = read.query_name
                    U.warn("At least one read is missing UMI and/or "
                           "cell tag(s): %s" % formatted_read)
                self.read_events[error_msg] += 1
                continue

        self.current_chr = read.reference_name

        if self.options.per_gene:

            if self.options.per_contig:

                if self.metacontig_contig:
                    transcript = read.reference_name
                    gene = self.contig_metacontig[transcript]
                else:
                    gene = read.reference_name

            elif self.options.gene_tag:

                try:
                    assigned = read.get_tag(self.options.assigned_tag)
                    gene = read.get_tag(self.options.gene_tag)
                except KeyError:
                    self.read_events['Read skipped, no tag'] += 1
                    continue

                if gene == "":
                    if self.read_events[
                            'Read skipped - gene string is empty'] == 0:
                        U.warn("Assigned gene is empty string. First such "
                               "read:\n%s" % read.to_string())
                    self.read_events[
                        'Read skipped - gene string is empty'] += 1
                    continue

                if re.search(self.options.skip_regex, assigned):
                    self.read_events[
                        'Read skipped - assigned tag matches skip_regex'] += 1
                    continue

            # in per-gene mode the gene doubles as the "position"
            pos = gene
            key = pos

            if self.last_chr:
                do_output, out_keys = self.check_output()
            else:
                do_output = False

            if do_output:
                for p in out_keys:
                    for k in sorted(self.reads_dict[p].keys()):
                        yield self.reads_dict[p][k], k, "bundle"

                    del self.reads_dict[p]

            self.last_chr = self.current_chr
            self.last_pos = pos

        else:

            # The start is stored on the instance: self.start is read
            # just below when self.last_pos is updated, so unpacking
            # into a throw-away local would leave it stale.
            # NOTE(review): check_output is presumed to read self.start
            # as well - confirm.
            self.start, pos, is_spliced = get_read_position(
                read, self.options.soft_clip_threshold)

            do_output, out_keys = self.check_output()

            if do_output:
                for p in out_keys:
                    for k in sorted(self.reads_dict[p].keys()):
                        yield self.reads_dict[p][k], k, "bundle"

                    del self.reads_dict[p]
                    if p in self.read_counts:
                        del self.read_counts[p]

            self.last_pos = self.start
            self.last_chr = self.current_chr

            if self.options.read_length:
                r_length = read.query_length
            else:
                r_length = 0

            key = (read.is_reverse, self.options.spliced and is_spliced,
                   self.options.paired * read.tlen, r_length)

        # update dictionaries
        key = (key, cell)
        self.update_dicts(read, pos, key, umi)

        if self.metacontig_contig:
            # keep track of observed contigs for each gene
            self.observed_contigs[gene].add(transcript)

    # yield remaining bundles
    for p in sorted(self.reads_dict.keys()):
        for k in sorted(self.reads_dict[p].keys()):
            yield self.reads_dict[p][k], k, "bundle"
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads alignments template by template (``chunk_bam``), pairs every
    alignment with a mate from the same template (``pick_mate``), copies
    the tags listed in ``--tags`` from read1 to read2 (``copy_tags``)
    and writes each pair once, read1 followed by read2. Alignments
    without a mate, or that are neither read1 nor read2, are skipped
    with a warning and counted.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])
    group = U.OptionGroup(parser, "RSEM preparation specific options")

    group.add_option("--tags", dest="tags", type="string",
                     default="UG,BX",
                     help="Comma-seperated list of tags to transfer from read1 to read2")
    group.add_option("--sam", dest="sam", action="store_true",
                     default=False,
                     help="input and output SAM rather than BAM")

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False,
                              add_umi_grouping_options=False,
                              add_sam_options=False)

    skipped_stats = Counter()

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        in_name = "-"

    if options.sam:
        mode = ""
    else:
        mode = "b"

    inbam = pysam.AlignmentFile(in_name, "r" + mode)

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    outbam = pysam.AlignmentFile(out_name, "w" + mode, template=inbam)

    options.tags = options.tags.split(",")

    for template in chunk_bam(inbam):

        assert len(set(r.query_name for r in template)) == 1

        # Index the template's alignments by read1/read2 and then by
        # (contig, position, is_primary). Note the third key element is
        # ``not is_secondary``, i.e. True for primary alignments.
        current_template = {True: defaultdict(list),
                            False: defaultdict(list)}

        for read in template:
            key = (read.reference_name, read.pos, not read.is_secondary)
            current_template[read.is_read1][key].append(read)

        output = set()

        for read in template:

            # First look for a read that has the same primary/secondary
            # status as read (i.e. secondary mate for secondary read, and
            # primary mate for primary read). Because the keys above hold
            # ``not is_secondary``, the same status is found under
            # ``not read.is_secondary``.
            # NOTE(review): presumes pick_mate looks mate_key up in
            # current_template unchanged - confirm.
            mate_key = (read.next_reference_name,
                        read.next_reference_start,
                        not read.is_secondary)
            mate = pick_mate(read, current_template, mate_key)

            # If none was found then look for the opposite (primary mate
            # of secondary read or secondary mate of primary read)
            if mate is None:
                mate_key = (read.next_reference_name,
                            read.next_reference_start,
                            read.is_secondary)
                mate = pick_mate(read, current_template, mate_key)

            # If we still don't have a mate, then there can't be one?
            if mate is None:
                skipped_stats["no_mate"] += 1
                U.warn("Alignment {} has no mate -- skipped".format(
                    "\t".join(map(str, [read.query_name, read.flag,
                                        read.reference_name,
                                        int(read.pos)]))))
                continue

            # because we might want to make changes to the read, but not
            # have those changes reflected if we need the read again, we
            # copy the read. This is only way I can find to do this.
            read = pysam.AlignedSegment().from_dict(read.to_dict(),
                                                    read.header)
            mate = pysam.AlignedSegment().from_dict(mate.to_dict(),
                                                    read.header)

            # Make it so that if our read is secondary, the mate is also
            # secondary. We don't make the mate primary if the read is
            # primary because we would otherwise end up with multiple
            # primary alignments.
            if read.is_secondary:
                mate.is_secondary = True

            # In a situation where there is already one mate for each
            # read, then we will come across each pair twice - once when
            # we scan read1 and once when we scan read2. Thus we need to
            # make sure we don't output something already output.
            if read.is_read1:

                mate = copy_tags(options.tags, read, mate)
                output_key = str(read) + str(mate)

                if output_key not in output:
                    output.add(output_key)
                    outbam.write(read)
                    outbam.write(mate)
                    skipped_stats["pairs_output"] += 1

            elif read.is_read2:

                read = copy_tags(options.tags, mate, read)
                output_key = str(mate) + str(read)

                if output_key not in output:
                    output.add(output_key)
                    outbam.write(mate)
                    outbam.write(read)
                    skipped_stats["pairs_output"] += 1

            else:
                skipped_stats["skipped_not_read12"] += 1
                U.warn("Alignment {} is neither read1 nor read2 -- skipped".
                       format("\t".join(map(str, [read.query_name,
                                                  read.flag,
                                                  read.reference_name,
                                                  int(read.pos)]))))
                continue

    # the output is only closed explicitly when it is a named file
    if not out_name == "-":
        outbam.close()

    U.info("Total pairs output: {}, Pairs skipped - no mates: {},"
           " Pairs skipped - not read1 or 2: {}".format(
               skipped_stats["pairs_output"],
               skipped_stats["no_mate"],
               skipped_stats["skipped_not_read12"]))
    U.Stop()