def write_mates(self):
    """Write the mates of buffered read1s found on the current chromosome.

    Fetches ``self.chrom`` from ``self.infile`` again and, for each read that
    is mapped, has a mapped mate and is not read1, builds the key
    (query name, reference name, reference start).  Reads whose key is in
    ``self.read1s`` are written to ``self.outfile`` and the key is removed
    from the buffer.  When ``self.read2tags`` is not None, the stored
    (unique_id, umi) pair for the key is consumed and appended to the read
    as the 'UG' and 'FU' tags before writing.
    """
    if self.chrom is not None:
        U.debug("Dumping %i mates for contig %s" % (len(self.read1s),
                                                    self.chrom))

    candidates = self.infile.fetch(reference=self.chrom,
                                   multiple_iterators=True)
    for mate in candidates:
        # only mapped read2s with a mapped mate can match a buffered read1
        if mate.is_unmapped or mate.mate_is_unmapped or mate.is_read1:
            continue

        mate_key = (mate.query_name, mate.reference_name,
                    mate.reference_start)
        if mate_key not in self.read1s:
            continue

        if self.read2tags is not None:
            # look up and discard the stored tags in one step
            unique_id, umi = self.read2tags.pop(mate_key)
            mate.tags += [('UG', unique_id), ('FU', umi)]

        self.outfile.write(mate)
        self.read1s.remove(mate_key)

    U.debug("%i mates remaining" % len(self.read1s))
def write_mates(self):
    """Write the mates of buffered read1s found on the current chromosome.

    Fetches the contig with id ``self.chrom`` from ``self.infile`` again
    and, for each read that is mapped, has a mapped mate and is not read1,
    builds the key (query name, reference id, reference start).  Reads whose
    key is in ``self.read1s`` are written to ``self.outfile`` and the key is
    removed from the buffer.
    """
    if self.chrom is not None:
        contig_name = self.infile.get_reference_name(self.chrom)
        U.debug("Dumping %i mates for contig %s" % (len(self.read1s),
                                                    contig_name))

    candidates = self.infile.fetch(tid=self.chrom, multiple_iterators=True)
    for mate in candidates:
        # only mapped read2s with a mapped mate can match a buffered read1
        if mate.is_unmapped or mate.mate_is_unmapped or mate.is_read1:
            continue

        mate_key = (mate.query_name, mate.reference_id, mate.reference_start)
        if mate_key not in self.read1s:
            continue

        self.outfile.write(mate)
        self.read1s.remove(mate_key)

    U.debug("%i mates remaining" % len(self.read1s))
def write_mates(self):
    """Flush to the output any mates of buffered read1s on this chromosome.

    The contig with id ``self.chrom`` is re-read from ``self.infile``.  A
    read is considered only if it is mapped, its mate is mapped and it is
    not read1; it is then identified by the tuple
    (query name, reference id, reference start).  If that tuple is held in
    ``self.read1s`` the read is written to ``self.outfile`` and the tuple is
    removed from the buffer.
    """
    if self.chrom is not None:
        pending = len(self.read1s)
        contig = self.infile.get_reference_name(self.chrom)
        U.debug("Dumping %i mates for contig %s" % (pending, contig))

    for candidate in self.infile.fetch(tid=self.chrom,
                                       multiple_iterators=True):
        skip = (candidate.is_unmapped
                or candidate.mate_is_unmapped
                or candidate.is_read1)
        if skip:
            continue

        lookup = (candidate.query_name,
                  candidate.reference_id,
                  candidate.reference_start)
        if lookup in self.read1s:
            self.outfile.write(candidate)
            self.read1s.remove(lookup)

    U.debug("%i mates remaining" % len(self.read1s))
def main(argv=None):
    """Script main: deduplicate reads in a SAM/BAM file.

    Parses command line options in sys.argv, unless *argv* is given, reads
    bundles of reads from the input via ``umi_methods.get_bundles``, writes
    the retained reads to the output and, when ``--output-stats`` is set,
    writes three tab separated stats tables using that value as filename
    prefix.

    Raises:
        ValueError: if input is on standard in, if ``--output-stats`` is
            combined with ``--ignore-umi``, if ``--per-gene`` is given
            without ``--gene-transcript-map`` or ``--gene-tag``, if the
            ``--skip-tags-regex`` value does not compile, if the requested
            multimapping detection tag is not available in the input, or
            if the UMI extraction method is unknown.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true",
                      help="Output alignments in sam format "
                      "[default=%default]",
                      default=False)
    parser.add_option("--ignore-umi", dest="ignore_umi",
                      action="store_true",
                      help="Ignore UMI and dedup"
                      " only on position", default=False)
    parser.add_option("--umi-separator", dest="umi_sep", type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag", dest="umi_tag", type="string",
                      help="tag containing umi", default='RX')
    parser.add_option("--extract-umi-method", dest="get_umi_method",
                      type="choice", choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? "
                      "[default=%default]")
    parser.add_option("--subset", dest="subset", type="float",
                      help="Use only a fraction of reads, specified by "
                      "subset",
                      default=None)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]",
                      default=False)
    parser.add_option("--soft-clip-threshold", dest="soft", type="float",
                      help="number of bases clipped from 5' end before"
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int", default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome", default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional", "percentile",
                               "unique", "cluster"),
                      default="directional",
                      help="method to use for umi deduping "
                      "[default=%default]")
    parser.add_option("--output-stats", dest="stats", type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option("--whole-contig", dest="whole_contig",
                      action="store_true", default=False,
                      help="Read whole contig before outputting bundles: "
                      "guarantees that no reads"
                      "are missed, but increases memory usage")
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method", type="choice",
                      choices=("NH", "X0", "XT"), default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi "
                            "[default=%default]"))
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option("--read-length", dest="read_length",
                      action="store_true", default=False,
                      help=("use read length in addition to position and UMI"
                            "to identify possible duplicates "
                            "[default=%default]"))
    parser.add_option("--per-contig", dest="per_contig",
                      action="store_true", default=False,
                      help=("dedup per contig (field 3 in BAM; RNAME),"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--per-gene", dest="per_gene", action="store_true",
                      default=False,
                      help=("Deduplicate per gene,"
                            "e.g for transcriptome where contig = transcript"
                            "must also provide a transript to gene map with"
                            "--gene-transcript-map [default=%default]"))
    parser.add_option("--gene-transcript-map", dest="gene_transcript_map",
                      type="string",
                      help="file mapping transcripts to genes (tab separated)",
                      default=None)
    parser.add_option("--gene-tag", dest="gene_tag", type="string",
                      help=("Deduplicate per gene where gene is"
                            "defined by this bam tag [default=%default]"),
                      default=None)
    # The default must be a group, not a character class: "^[__|Unassigned]"
    # would skip every tag that merely starts with one of the characters
    # "_", "|", "U", "n", "a", "s", "i", "g", "e" or "d".
    parser.add_option("--skip-tags-regex", dest="skip_regex", type="string",
                      help=("Used with --gene-tag. "
                            "Ignore reads where the gene-tag matches this "
                            "regex"),
                      default="^(__|Unassigned)")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.random_seed:
        np.random.seed(options.random_seed)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    in_mode = "r" if options.in_sam else "rb"
    out_mode = "wh" if options.out_sam else "wb"

    if options.stats and options.ignore_umi:
        raise ValueError("'--output-stats' and '--ignore-umi' options"
                         " cannot be used together")

    # --per-gene needs a way to find the gene: either a transcript to gene
    # map or a bam tag (the option dest is gene_tag; there is no gene_map)
    if options.per_gene:
        if not options.gene_transcript_map and not options.gene_tag:
            raise ValueError(
                "--per-gene option requires --gene-transcript-map "
                "or --gene-tag")

    try:
        re.compile(options.skip_regex)
    except re.error:
        raise ValueError("skip-regex '%s' is not a "
                         "valid regex" % options.skip_regex)

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = umi_methods.TwoPassPairWriter(infile, outfile)

    nInput, nOutput = 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" % (
                        options.detection_method,
                        ",".join([x for x in bam_features
                                  if bam_features[x]])))

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id,
                             sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        read_gn = umi_methods.random_read_generator(
            infile.filename, chrom=options.chrom, umi_getter=umi_getter)

    # the tag used to group reads by gene: the user supplied tag unless the
    # reads are fetched per meta-contig, in which case it is the meta tag
    gene_tag = options.gene_tag

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    elif options.per_gene and options.gene_transcript_map:
        metacontig2contig = umi_methods.getMetaContig2contig(
            infile, options.gene_transcript_map)
        metatag = "MC"
        inreads = umi_methods.metafetcher(infile, metacontig2contig, metatag)
        gene_tag = metatag
    else:
        inreads = infile.fetch()

    # stays empty if the input yields no bundles, so the footer still works
    read_events = collections.Counter()

    for bundle, read_events, status in umi_methods.get_bundles(
            inreads,
            ignore_umi=options.ignore_umi,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            gene_tag=gene_tag,
            skip_regex=options.skip_regex,
            whole_contig=options.whole_contig,
            read_length=options.read_length,
            detection_method=options.detection_method,
            umi_getter=umi_getter,
            all_reads=False,
            return_read2=False,
            return_unmapped=False):

        nInput += sum(bundle[umi]["count"] for umi in bundle)

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        if options.stats:
            # generate pre-dedup stats
            average_distance = umi_methods.get_average_umi_distance(
                bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = umi_methods.get_average_umi_distance(
                random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:
            # set up ReadCluster functor with methods specific to
            # specified options.method
            processor = network.ReadDeduplicator(options.method)

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts = processor(bundle=bundle,
                                                threshold=options.threshold)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:
                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
                post_cluster_umis = [umi_getter(x) for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = umi_methods.get_average_umi_distance(
                    post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = umi_methods.get_average_umi_distance(
                    random_umis)
                post_cluster_stats_null.append(average_distance_null)

    outfile.close()

    if options.stats:
        # generate the stats dataframe
        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # tally the counts per umi per position
        pre_counts = collections.Counter(stats_pre_df["counts"])
        post_counts = collections.Counter(stats_post_df["counts"])
        counts_index = sorted(set(pre_counts).union(post_counts))

        with U.openFile(options.stats + "_per_umi_per_position.tsv",
                        "w") as outf:
            outf.write("counts\tinstances_pre\tinstances_post\n")
            for count in counts_index:
                values = (count, pre_counts[count], post_counts[count])
                outf.write("\t".join(map(str, values)) + "\n")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df, agg_post_df, how='left',
                          left_index=True, right_index=True,
                          sort=True, suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        agg_df = agg_df.fillna(0).astype(int)

        agg_df.index = [x.decode() for x in agg_df.index]
        agg_df.index.name = 'UMI'
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(max(map(max, [pre_cluster_stats,
                                   post_cluster_stats,
                                   pre_cluster_stats_null,
                                   post_cluster_stats_null])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster,
                               minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        # TS - set lowest bin (-1) to "Single_UMI". The label is put in
        # place before the frame is built rather than assigned afterwards
        # through chained indexing, which is not guaranteed to write back
        # to the frame.
        edit_distance_labels = ["Single_UMI"] + list(cluster_bins)[1:]

        edit_distance_df = pd.DataFrame({
            "unique": tallyCounts(pre_cluster_binned, max_ed),
            "unique_null": tallyCounts(pre_cluster_null_binned, max_ed),
            options.method: tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method: tallyCounts(
                post_cluster_null_binned, max_ed),
            "edit_distance": edit_distance_labels})

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False, sep="\t")

    # write footer and output benchmark information.
    U.info("%s" % ", ".join(
        ["%s: %s" % (x[0], x[1]) for x in read_events.most_common()]))
    U.info("Number of reads out: %i" % nOutput)
    U.Stop()
def main(argv=None):
    """Script main: group reads in a SAM/BAM file by UMI.

    Parses command line options in sys.argv, unless *argv* is given, reads
    bundles of reads from the input via ``umi_methods.get_bundles``, groups
    the UMIs of each bundle with ``network.UMIClusterer`` and then, per
    read, optionally writes the read tagged with its group id ('UG') and
    the group's first UMI (``--umi-group-tag``) to the output bam
    (``--output-bam``) and/or writes a row to the tab separated mapping
    file (``--group-out``).

    Raises:
        ValueError: if input is on standard in, if ``--per-gene`` is given
            without ``--gene-transcript-map``, or if the UMI extraction
            method is unknown.
        AssertionError: if an output file is given without ``--output-bam``.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true",
                      help="Output alignments in sam format "
                      "[default=%default]",
                      default=False)
    parser.add_option("--umi-separator", dest="umi_sep", type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag", dest="umi_tag", type="string",
                      help="tag containing umi", default='RX')
    parser.add_option("--umi-group-tag", dest="umi_group_tag",
                      type="string",
                      help="tag for the outputted umi group", default='BX')
    parser.add_option("--extract-umi-method", dest="get_umi_method",
                      type="choice", choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? "
                      "[default=%default]")
    parser.add_option("--subset", dest="subset", type="float",
                      help="Use only a fraction of reads, specified by "
                      "subset",
                      default=None)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]",
                      default=False)
    parser.add_option("--soft-clip-threshold", dest="soft", type="float",
                      help="number of bases clipped from 5' end before"
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int", default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome", default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional", "unique",
                               "cluster"),
                      default="directional",
                      help="method to use for umi deduping "
                      "[default=%default]")
    parser.add_option("--per-contig", dest="per_contig",
                      action="store_true", default=False,
                      help=("dedup per contig (field 3 in BAM; RNAME),"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--per-gene", dest="per_gene", action="store_true",
                      default=False,
                      help=("Deduplicate per gene,"
                            "e.g for transcriptome where contig = transcript"
                            "must also provide a transript to gene map with"
                            "--gene-transcript-map [default=%default]"))
    parser.add_option("--gene-transcript-map", dest="gene_transcript_map",
                      type="string",
                      help="file mapping transcripts to genes (tab separated)",
                      default=None)
    parser.add_option("--gene-tag", dest="gene_tag", type="string",
                      help=("Deduplicate per gene where gene is"
                            "defined by this bam tag [default=%default]"),
                      default=None)
    parser.add_option("--read-length", dest="read_length",
                      action="store_true", default=False,
                      help=("use read length in addition to position and UMI"
                            "to identify possible duplicates "
                            "[default=%default]"))
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option("--output-unmapped", dest="output_unmapped",
                      action="store_true", default=False,
                      help=("Retain all unmapped reads in output"
                            "[default=%default]"))
    parser.add_option("--group-out", dest="tsv", type="string",
                      help="Outfile name for file mapping read id to read "
                      "group",
                      default=None)
    parser.add_option("--output-bam", dest="output_bam",
                      action="store_true", default=False,
                      help=("output a bam file with read groups tagged using "
                            "the UG tag"
                            "[default=%default]"))
    # The default must be a group, not a character class: "^[__|Unassigned]"
    # would skip every tag that merely starts with one of the characters
    # "_", "|", "U", "n", "a", "s", "i", "g", "e" or "d".
    parser.add_option("--skip-tags-regex", dest="skip_regex", type="string",
                      help=("Used with --gene-tag. "
                            "Ignore reads where the gene-tag matches this "
                            "regex"),
                      default="^(__|Unassigned)")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        out_name = "-"

    in_mode = "r" if options.in_sam else "rb"
    out_mode = "wh" if options.out_sam else "wb"

    if options.per_gene:
        if not options.gene_transcript_map:
            raise ValueError(
                "--per-gene option requires --gene-transcript-map")

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join(
            ["read_id", "contig", "position", "gene", "umi", "umi_count",
             "final_umi", "final_umi_count", "unique_id"]))

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id,
                             sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    nInput, nOutput, unique_id = 0, 0, 0

    # the tag used to group reads by gene: the user supplied tag unless the
    # reads are fetched per meta-contig, in which case it is the meta tag
    gene_tag = options.gene_tag

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    elif options.per_gene and options.gene_transcript_map:
        metacontig2contig = umi_methods.getMetaContig2contig(
            infile, options.gene_transcript_map)
        metatag = "MC"
        inreads = umi_methods.metafetcher(infile, metacontig2contig, metatag)
        gene_tag = metatag
    else:
        inreads = infile.fetch(until_eof=options.output_unmapped)

    # stays None if the input yields no bundles; checked in the footer
    read_events = None

    for bundle, read_events, status in umi_methods.get_bundles(
            inreads,
            ignore_umi=False,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            gene_tag=gene_tag,
            skip_regex=options.skip_regex,
            read_length=options.read_length,
            umi_getter=umi_getter,
            all_reads=True,
            return_read2=True,
            return_unmapped=options.output_unmapped):

        # write out read2s and unmapped if option set
        if status == 'single_read':
            # bundle is just a single read here; there is nowhere to write
            # it when no output bam was requested
            if outfile:
                outfile.write(bundle)
            nInput += 1
            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        # set up UMIClusterer functor with methods specific to
        # specified options.method
        processor = network.UMIClusterer(options.method)

        # group the umis
        groups = processor(umis, counts, threshold=options.threshold)

        for umi_group in groups:
            # the first umi of the group is reported as the group's umi
            top_umi = umi_group[0]
            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                for read in bundle[umi]['read']:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.tags += [('UG', unique_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        mapping_outfile.write("%s\n" % "\t".join(map(str, (
                            read.query_name,
                            read.reference_name,
                            umi_methods.get_read_position(
                                read, options.soft)[1],
                            gene,
                            umi.decode(),
                            counts[umi],
                            top_umi.decode(),
                            group_count,
                            unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    event_counts = [] if read_events is None else read_events.most_common()
    U.info("Reads: %s" % ", ".join(
        ["%s: %s" % (x[0], x[1]) for x in event_counts]))
    U.info("Number of reads out: %i, Number of groups: %i" % (nOutput,
                                                              unique_id))
    U.Stop()
def main(argv=None):
    """Script main: group reads in a SAM/BAM file by UMI.

    Parses command line options in sys.argv, unless *argv* is given, reads
    bundles of reads from the input via ``umi_methods.get_bundles``, groups
    the UMIs of each bundle with ``network.ReadClusterer`` and then, per
    read, optionally writes the read tagged with its group id ('UG') and
    the group's first UMI (``--umi-group-tag``) to the output bam
    (``--output-bam``) and/or writes a row to the tab separated mapping
    file (``--group-out``).

    Raises:
        ValueError: if input is on standard in or if the UMI extraction
            method is unknown.
        AssertionError: if an output file is given without ``--output-bam``.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true",
                      help="Output alignments in sam format "
                      "[default=%default]",
                      default=False)
    parser.add_option("--umi-separator", dest="umi_sep", type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag", dest="umi_tag", type="string",
                      help="tag containing umi", default='RX')
    parser.add_option("--umi-group-tag", dest="umi_group_tag",
                      type="string",
                      help="tag for the outputted umi group", default='BX')
    parser.add_option("--extract-umi-method", dest="get_umi_method",
                      type="choice", choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? "
                      "[default=%default]")
    parser.add_option("--subset", dest="subset", type="float",
                      help="Use only a fraction of reads, specified by "
                      "subset",
                      default=None)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]",
                      default=False)
    parser.add_option("--soft-clip-threshold", dest="soft", type="float",
                      help="number of bases clipped from 5' end before"
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int", default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome", default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional", "unique",
                               "cluster"),
                      default="directional",
                      help="method to use for umi deduping "
                      "[default=%default]")
    parser.add_option("--per-contig", dest="per_contig",
                      action="store_true", default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--whole-contig", dest="whole_contig",
                      action="store_true", default=False,
                      help="Read whole contig before outputting bundles: "
                      "guarantees that no reads"
                      "are missed, but increases memory usage")
    parser.add_option("--read-length", dest="read_length",
                      action="store_true", default=False,
                      help=("use read length in addition to position and UMI"
                            "to identify possible duplicates "
                            "[default=%default]"))
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option("--group-out", dest="tsv", type="string",
                      help="Outfile name for file mapping read id to read "
                      "group",
                      default=None)
    parser.add_option("--output-bam", dest="output_bam",
                      action="store_true", default=False,
                      help=("output a bam file with read groups tagged using "
                            "the UG tag"
                            "[default=%default]"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        out_name = "-"

    in_mode = "r" if options.in_sam else "rb"

    # "wh" rather than "w": SAM output should carry the header taken from
    # the template, so that the output can be read back without the input
    out_mode = "wh" if options.out_sam else "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
        if options.paired:
            outfile = umi_methods.TwoPassPairWriter(infile, outfile,
                                                    tags=True)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write(
            "read_id\tcontig\tposition\tumi\tumi_count\tfinal_umi\tfinal_umi_count\tunique_id\n")

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id,
                             sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    nInput, nOutput, unique_id = 0, 0, 0

    read_events = collections.Counter()

    for bundle, read_events in umi_methods.get_bundles(
            infile,
            read_events,
            ignore_umi=False,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            chrom=options.chrom,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            whole_contig=options.whole_contig,
            read_length=options.read_length,
            umi_getter=umi_getter,
            all_reads=True):

        nInput += sum(bundle[umi]["count"] for umi in bundle)

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        # set up ReadCluster functor with methods specific to
        # specified options.method
        processor = network.ReadClusterer(options.method)

        bundle, groups, counts = processor(bundle=bundle,
                                           threshold=options.threshold,
                                           stats=True,
                                           deduplicate=False)

        for umi_group in groups:
            # the first umi of the group is reported as the group's umi
            top_umi = umi_group[0]
            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                for read in bundle[umi]['read']:
                    if outfile:
                        if options.paired:
                            # if paired, we need to supply the tags to
                            # add to the paired read
                            outfile.write(read, unique_id, top_umi)
                        else:
                            # Add the 'UG' tag to the read
                            read.tags += [('UG', unique_id)]
                            read.tags += [(options.umi_group_tag, top_umi)]
                            outfile.write(read)

                    if options.tsv:
                        mapping_outfile.write("%s\n" % "\t".join(map(str, (
                            read.query_name,
                            read.reference_name,
                            umi_methods.get_read_position(
                                read, options.soft)[1],
                            umi.decode(),
                            counts[umi],
                            top_umi.decode(),
                            group_count,
                            unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join(
        ["%s: %s" % (x[0], x[1]) for x in read_events.most_common()]))
    U.info("Number of reads out: %i, Number of groups: %i" % (nOutput,
                                                              unique_id))
    U.Stop()
def main(argv=None):
    """Script main: deduplicate aligned reads on position and UMI.

    Parses command line options from ``sys.argv`` unless *argv* is given,
    iterates over the read bundles produced by ``get_bundles`` for the input
    SAM/BAM file, and writes the reads retained for each bundle to the
    output file.  When ``--output-stats PREFIX`` is given, the tables
    ``PREFIX_per_umi_per_position.tsv``, ``PREFIX_per_umi.tsv`` and
    ``PREFIX_edit_distance.tsv`` are written as well; ``--further-stats``
    additionally writes ``PREFIX_topologies.tsv`` and ``PREFIX_nodes.tsv``.

    Raises ``ValueError`` if the input is standard in, if incompatible
    options are combined (``--output-stats`` with ``--ignore-umi``;
    ``--further-stats`` without ``--output-stats`` or with a method other
    than 'cluster'/'adjacency'), or if the requested multimapping detection
    tag is not reported as available for the input file.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format", default=False)
    parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true",
                      help="Output alignments in sam format", default=False)
    parser.add_option("--ignore-umi", dest="ignore_umi", action="store_true",
                      help="Ignore UMI and dedup only on position",
                      default=False)
    # Parsed as a float by the option parser so that a malformed value is
    # rejected with a usage error instead of failing later in float().
    # The default (1.1) is simply passed through to get_bundles.
    parser.add_option("--subset", dest="subset", type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=1.1)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                           " one",
                      default=False)
    parser.add_option("--soft-clip-threshold", dest="soft", type="float",
                      help="number of bases clipped from 5' end before"
                           " read is counted as spliced",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int",
                      help="Edit distance threshold at which to join two UMIs"
                           " when clustering",
                      default=1)
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="Use second-in-pair position when deduping")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional-adjacency",
                               "percentile", "unique", "cluster"),
                      default="directional-adjacency",
                      help="method to use for umi deduping")
    parser.add_option("--output-stats", dest="stats", type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option("--further-stats", dest="further_stats",
                      action="store_true", default=False,
                      help="Output further stats")
    parser.add_option("--per-contig", dest="per_contig", action="store_true",
                      default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--whole-contig", dest="whole_contig",
                      action="store_true", default=False,
                      help="Read whole contig before outputting bundles: "
                           "guarantees that no reads are missed, but "
                           "increases memory usage")
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method", type="choice",
                      choices=("NH", "X0", "XT"),
                      default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi"))
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained",
                      default=0)
    parser.add_option("--read-length", dest="read_length",
                      action="store_true", default=False,
                      help=("use read length in addition to position and UMI"
                            " to identify possible duplicates"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    # The input is re-opened by name below, so it must be a named file.
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    # "-" is handed to pysam as the output name when writing to stdout.
    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    in_mode = "r" if options.in_sam else "rb"
    out_mode = "w" if options.out_sam else "wb"

    # validate option combinations before any file is opened
    if options.stats and options.ignore_umi:
        raise ValueError("'--output-stats' and '--ignore-umi' options"
                         " cannot be used together")

    if options.further_stats:
        if not options.stats:
            raise ValueError("'--further-stats' options requires "
                             "'--output-stats' option")
        if options.method not in ["cluster", "adjacency"]:
            raise ValueError("'--further-stats' only enabled with 'cluster' "
                             "and 'adjacency' methods")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = TwoPassPairWriter(infile, outfile)

    nInput, nOutput = 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" % (
                        options.detection_method, ",".join(
                            [x for x in bam_features if bam_features[x]])))

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = random_read_generator(infile.filename, chrom=options.chrom)

    for bundle in get_bundles(infile,
                              ignore_umi=options.ignore_umi,
                              subset=options.subset,
                              quality_threshold=options.mapping_quality,
                              paired=options.paired,
                              chrom=options.chrom,
                              spliced=options.spliced,
                              soft_clip_threshold=options.soft,
                              per_contig=options.per_contig,
                              whole_contig=options.whole_contig,
                              read_length=options.read_length,
                              detection_method=options.detection_method):

        # each bundle maps a UMI to a record with "count" and "read" entries
        nInput += sum(bundle[umi]["count"] for umi in bundle)

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        if options.stats:
            # generate pre-dedup stats, plus a null value computed from the
            # same number of UMIs drawn from the random read generator
            average_distance = get_average_umi_distance(bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = get_average_umi_distance(random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            # position-only dedup: write the read stored for every UMI
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:
            # set up ClusterAndReducer functor with methods specific to
            # specified options.method
            processor = ClusterAndReducer(options.method)

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts, topologies, nodes = processor(
                bundle, options.threshold,
                options.stats, options.further_stats)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:
                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    bundle[UMI]['count'] for UMI in bundle)

                # collect post-dedup stats; the UMI is taken as the text
                # after the last "_" in the read name
                post_cluster_umis = [
                    x.query_name.split("_")[-1] for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = get_average_umi_distance(post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = get_average_umi_distance(random_umis)
                post_cluster_stats_null.append(average_distance_null)

                if options.further_stats:
                    for c_type, count in topologies.most_common():
                        topology_counts[c_type] += count
                    for c_type, count in nodes.most_common():
                        node_counts[c_type] += count

    outfile.close()

    if options.stats:

        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # generate histograms of counts per UMI at each position
        UMI_counts_df_pre = pd.DataFrame(stats_pre_df.pivot_table(
            columns=stats_pre_df["counts"], values="counts", aggfunc=len))
        UMI_counts_df_post = pd.DataFrame(stats_post_df.pivot_table(
            columns=stats_post_df["counts"], values="counts", aggfunc=len))

        UMI_counts_df_pre.columns = ["instances"]
        UMI_counts_df_post.columns = ["instances"]

        UMI_counts_df = pd.merge(UMI_counts_df_pre, UMI_counts_df_post,
                                 how='left', left_index=True,
                                 right_index=True, sort=True,
                                 suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        UMI_counts_df = UMI_counts_df.fillna(0).astype(int)

        UMI_counts_df.to_csv(
            options.stats + "_per_umi_per_position.tsv", sep="\t")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df, agg_post_df, how='left',
                          left_index=True, right_index=True,
                          sort=True, suffixes=["_pre", "_post"])

        # TS - see comment above regarding missing values
        agg_df = agg_df.fillna(0).astype(int)
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(max(map(max, [pre_cluster_stats,
                                   post_cluster_stats,
                                   pre_cluster_stats_null,
                                   post_cluster_stats_null])))

        cluster_bins = list(range(-1, max_ed + 2))

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            # minlength matches len(cluster_bins) so all columns align
            return np.bincount(binned_cluster,
                               minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        # TS - set lowest bin (-1) to "Single_UMI".  The label is placed in
        # the column when the frame is built; assigning through
        # df[col][0] afterwards is chained assignment and is not guaranteed
        # to modify the frame.
        edit_distance_labels = ["Single_UMI"] + cluster_bins[1:]

        edit_distance_df = pd.DataFrame({
            "unique": tallyCounts(pre_cluster_binned, max_ed),
            "unique_null": tallyCounts(pre_cluster_null_binned, max_ed),
            options.method: tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method: tallyCounts(
                post_cluster_null_binned, max_ed),
            "edit_distance": edit_distance_labels})

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False, sep="\t")

        if options.further_stats:
            with U.openFile(options.stats + "_topologies.tsv", "w") as outf:
                outf.write(
                    "\n".join(["\t".join((x, str(y)))
                               for x, y in topology_counts.most_common()]) +
                    "\n")

            with U.openFile(options.stats + "_nodes.tsv", "w") as outf:
                outf.write(
                    "\n".join(["\t".join(map(str, (x, y)))
                               for x, y in node_counts.most_common()]) + "\n")

    # write footer and output benchmark information.
    U.info("Number of reads in: %i, Number of reads out: %i" %
           (nInput, nOutput))
    U.Stop()
def main(argv=None):
    """Script main: deduplicate aligned reads on position and UMI.

    Parses command line options from ``sys.argv`` unless *argv* is given,
    iterates over the read bundles produced by ``get_bundles`` for the input
    SAM/BAM file, and writes the reads retained for each bundle to the
    output file.  When ``--output-stats PREFIX`` is given, the tables
    ``PREFIX_per_umi_per_position.tsv``, ``PREFIX_per_umi.tsv`` and
    ``PREFIX_edit_distance.tsv`` are written as well; ``--further-stats``
    additionally writes ``PREFIX_topologies.tsv`` and ``PREFIX_nodes.tsv``.

    Raises ``ValueError`` if the input is standard in, if incompatible
    options are combined (``--output-stats`` with ``--ignore-umi``;
    ``--further-stats`` without ``--output-stats`` or with a method other
    than 'cluster'/'adjacency'), or if the requested multimapping detection
    tag is not reported as available for the input file.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format", default=False)
    parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true",
                      help="Output alignments in sam format", default=False)
    parser.add_option("--ignore-umi", dest="ignore_umi", action="store_true",
                      help="Ignore UMI and dedup only on position",
                      default=False)
    # Parsed as a float by the option parser so that a malformed value is
    # rejected with a usage error instead of failing later in float().
    # The default (1.1) is simply passed through to get_bundles.
    parser.add_option("--subset", dest="subset", type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=1.1)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                           " one",
                      default=False)
    parser.add_option("--soft-clip-threshold", dest="soft", type="float",
                      help="number of bases clipped from 5' end before"
                           " read is counted as spliced",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int",
                      help="Edit distance threshold at which to join two UMIs"
                           " when clustering",
                      default=1)
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="Use second-in-pair position when deduping")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional-adjacency",
                               "percentile", "unique", "cluster"),
                      default="directional-adjacency",
                      help="method to use for umi deduping")
    parser.add_option("--output-stats", dest="stats", type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option("--further-stats", dest="further_stats",
                      action="store_true", default=False,
                      help="Output further stats")
    parser.add_option("--per-contig", dest="per_contig", action="store_true",
                      default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--whole-contig", dest="whole_contig",
                      action="store_true", default=False,
                      help="Read whole contig before outputting bundles: "
                           "guarantees that no reads are missed, but "
                           "increases memory usage")
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method", type="choice",
                      choices=("NH", "X0", "XT"),
                      default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi"))
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained",
                      default=0)
    parser.add_option("--read-length", dest="read_length",
                      action="store_true", default=False,
                      help=("use read length in addition to position and UMI"
                            " to identify possible duplicates"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    # The input is re-opened by name below, so it must be a named file.
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    # "-" is handed to pysam as the output name when writing to stdout.
    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    in_mode = "r" if options.in_sam else "rb"
    out_mode = "w" if options.out_sam else "wb"

    # validate option combinations before any file is opened
    if options.stats and options.ignore_umi:
        raise ValueError("'--output-stats' and '--ignore-umi' options"
                         " cannot be used together")

    if options.further_stats:
        if not options.stats:
            raise ValueError("'--further-stats' options requires "
                             "'--output-stats' option")
        if options.method not in ["cluster", "adjacency"]:
            raise ValueError("'--further-stats' only enabled with 'cluster' "
                             "and 'adjacency' methods")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = TwoPassPairWriter(infile, outfile)

    nInput, nOutput = 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" % (
                        options.detection_method, ",".join(
                            [x for x in bam_features if bam_features[x]])))

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = random_read_generator(infile.filename, chrom=options.chrom)

    for bundle in get_bundles(infile,
                              ignore_umi=options.ignore_umi,
                              subset=options.subset,
                              quality_threshold=options.mapping_quality,
                              paired=options.paired,
                              chrom=options.chrom,
                              spliced=options.spliced,
                              soft_clip_threshold=options.soft,
                              per_contig=options.per_contig,
                              whole_contig=options.whole_contig,
                              read_length=options.read_length,
                              detection_method=options.detection_method):

        # each bundle maps a UMI to a record with "count" and "read" entries
        nInput += sum(bundle[umi]["count"] for umi in bundle)

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        if options.stats:
            # generate pre-dedup stats, plus a null value computed from the
            # same number of UMIs drawn from the random read generator
            average_distance = get_average_umi_distance(bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = get_average_umi_distance(random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            # position-only dedup: write the read stored for every UMI
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:
            # set up ClusterAndReducer functor with methods specific to
            # specified options.method
            processor = ClusterAndReducer(options.method)

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts, topologies, nodes = processor(
                bundle, options.threshold,
                options.stats, options.further_stats)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:
                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    bundle[UMI]['count'] for UMI in bundle)

                # collect post-dedup stats; the UMI is taken as the text
                # after the last "_" in the read name
                post_cluster_umis = [
                    x.query_name.split("_")[-1] for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = get_average_umi_distance(post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = get_average_umi_distance(random_umis)
                post_cluster_stats_null.append(average_distance_null)

                if options.further_stats:
                    for c_type, count in topologies.most_common():
                        topology_counts[c_type] += count
                    for c_type, count in nodes.most_common():
                        node_counts[c_type] += count

    outfile.close()

    if options.stats:

        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # generate histograms of counts per UMI at each position
        UMI_counts_df_pre = pd.DataFrame(stats_pre_df.pivot_table(
            columns=stats_pre_df["counts"], values="counts", aggfunc=len))
        UMI_counts_df_post = pd.DataFrame(stats_post_df.pivot_table(
            columns=stats_post_df["counts"], values="counts", aggfunc=len))

        UMI_counts_df_pre.columns = ["instances"]
        UMI_counts_df_post.columns = ["instances"]

        UMI_counts_df = pd.merge(UMI_counts_df_pre, UMI_counts_df_post,
                                 how='left', left_index=True,
                                 right_index=True, sort=True,
                                 suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        UMI_counts_df = UMI_counts_df.fillna(0).astype(int)

        UMI_counts_df.to_csv(
            options.stats + "_per_umi_per_position.tsv", sep="\t")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df, agg_post_df, how='left',
                          left_index=True, right_index=True,
                          sort=True, suffixes=["_pre", "_post"])

        # TS - see comment above regarding missing values
        agg_df = agg_df.fillna(0).astype(int)
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(max(map(max, [pre_cluster_stats,
                                   post_cluster_stats,
                                   pre_cluster_stats_null,
                                   post_cluster_stats_null])))

        cluster_bins = list(range(-1, max_ed + 2))

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            # minlength matches len(cluster_bins) so all columns align
            return np.bincount(binned_cluster,
                               minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        # TS - set lowest bin (-1) to "Single_UMI".  The label is placed in
        # the column when the frame is built; assigning through
        # df[col][0] afterwards is chained assignment and is not guaranteed
        # to modify the frame.
        edit_distance_labels = ["Single_UMI"] + cluster_bins[1:]

        edit_distance_df = pd.DataFrame({
            "unique": tallyCounts(pre_cluster_binned, max_ed),
            "unique_null": tallyCounts(pre_cluster_null_binned, max_ed),
            options.method: tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method: tallyCounts(
                post_cluster_null_binned, max_ed),
            "edit_distance": edit_distance_labels})

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False, sep="\t")

        if options.further_stats:
            with U.openFile(options.stats + "_topologies.tsv", "w") as outf:
                outf.write(
                    "\n".join(["\t".join((x, str(y)))
                               for x, y in topology_counts.most_common()]) +
                    "\n")

            with U.openFile(options.stats + "_nodes.tsv", "w") as outf:
                outf.write(
                    "\n".join(["\t".join(map(str, (x, y)))
                               for x, y in node_counts.most_common()]) + "\n")

    # write footer and output benchmark information.
    U.info("Number of reads in: %i, Number of reads out: %i" %
           (nInput, nOutput))
    U.Stop()