def correct_umis(final_results, collapsing_threshold):
    """
    Corrects umi barcodes within same cell/tag groups.

    Args:
        final_results (dict): Dict of dict of Counters with mapping results.
        collapsing_threshold (int): Max distance between umis.

    Returns:
        final_results (dict): Same as input but with corrected umis.
        corrected_umis (int): How many umis have been corrected.
    """
    print('Correcting umis')
    corrected_umis = 0
    # The clusterer carries no per-group state, so build it once instead of
    # re-instantiating it for every cell/TAG pair (hoisted loop invariant).
    umi_clusterer = network.UMIClusterer()
    for cell_barcode in final_results:
        for TAG in final_results[cell_barcode]:
            # Bind the Counter once; mutating the alias mutates final_results.
            umi_counts = final_results[cell_barcode][TAG]
            if len(umi_counts) <= 1:
                # Zero or one umi: nothing can be collapsed.
                continue
            UMIclusters = umi_clusterer(
                umi_counts.keys(),
                umi_counts,
                collapsing_threshold)
            for umi_cluster in UMIclusters:
                # Each cluster is a list whose first element is the dominant
                # (highest-count) barcode.
                if len(umi_cluster) > 1:  # This means we got a correction
                    major_umi = umi_cluster[0]
                    for minor_umi in umi_cluster[1:]:
                        corrected_umis += 1
                        # Fold the minor umi's count into the dominant umi.
                        umi_counts[major_umi] += umi_counts.pop(minor_umi)
    return (final_results, corrected_umis)
def correct_umis(final_results, collapsing_threshold, top_cells, max_umis):
    """
    Corrects umi barcodes within same cell/tag groups.

    Args:
        final_results (dict): Dict of dict of Counters with mapping results.
        collapsing_threshold (int): Max distance between umis.
        top_cells (set): Set of cells to go through.
        max_umis (int): Maximum UMIs to consider for one cluster.

    Returns:
        final_results (dict): Same as input but with corrected umis.
        corrected_umis (int): How many umis have been corrected.
        aberrant_umi_count_cells (set): Set of uncorrected cells.
    """
    print("Correcting umis")
    corrected_umis = 0
    aberrant_umi_count_cells = set()
    # Build the clusterer once: it holds no per-cell state, so constructing
    # it inside the loop only wasted work (hoisted loop invariant).
    umi_clusterer = network.UMIClusterer()
    for cell_barcode in top_cells:
        for TAG in final_results[cell_barcode]:
            umi_counts = final_results[cell_barcode][TAG]
            n_umis = len(umi_counts)
            if 1 < n_umis <= max_umis:
                UMIclusters = umi_clusterer(umi_counts, collapsing_threshold)
                (new_res, temp_corrected_umis) = update_umi_counts(
                    UMIclusters, umi_counts)
                final_results[cell_barcode][TAG] = new_res
                corrected_umis += temp_corrected_umis
            elif n_umis > max_umis:
                # Too many distinct umis to collapse reliably; leave the cell
                # uncorrected and report it to the caller.
                aberrant_umi_count_cells.add(cell_barcode)
    return (final_results, corrected_umis, aberrant_umi_count_cells)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a gene/umi count table from stdin and writes, per gene, the number
    of umi groups remaining after clustering (the deduplicated count).
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv, add_group_dedup_options=False)

    nInput, nOutput = 0, 0  # reads seen / deduplicated groups written

    # set the method with which to extract umis from reads
    umi_getter = partial(umi_methods.get_umi_read_string, sep=options.umi_sep)

    # header row of the tab-separated output table
    options.stdout.write("%s\t%s\n" % ("gene", "count"))

    # set up UMIClusterer functor with methods specific to
    # specified options.method
    processor = network.UMIClusterer(options.method)

    for gene, counts in umi_methods.get_gene_count_tab(
            options.stdin,
            umi_getter=umi_getter):

        umis = counts.keys()
        nInput += sum(counts.values())

        # group the umis; one group == one (post-dedup) molecule
        groups = processor(umis, counts, threshold=options.threshold)
        gene_count = len(groups)
        options.stdout.write("%s\t%i\n" % (gene, gene_count))
        nOutput += gene_count

    # NOTE(review): this reports the deduplicated count; nInput is
    # accumulated above but never reported.
    U.info("Number of reads counted: %i" % nOutput)

    U.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Groups reads by umi, optionally tagging a BAM output (UG tag + umi group
    tag) and/or writing a read-id -> group mapping tsv.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group = U.OptionGroup(parser, "group-specific options")

    group.add_option("--group-out", dest="tsv", type="string",
                     help="Outfile name for file mapping read id to read group",
                     default=None)

    group.add_option("--output-bam", dest="output_bam", action="store_true",
                     default=False,
                     help=("output a bam file with read groups tagged using the UG tag"
                           "[default=%default]"))

    group.add_option("--output-unmapped", dest="output_unmapped",
                     action="store_true",
                     default=False,
                     help=("Retain all unmapped reads in output[default=%default]"))

    parser.add_option("--umi-group-tag", dest="umi_group_tag",
                      type="string", help="tag for the outputted umi group",
                      default='BX')

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options)

    # pysam needs a named, seekable file
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        if options.no_sort_output:
            out_name = options.stdout.name
        else:
            # write to a tempfile first, sort into the real target at the end
            out_name = U.getTempFilename()
            sorted_out_name = options.stdout.name
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        if options.no_sort_output:
            out_name = "-"
        else:
            out_name = U.getTempFilename()
            sorted_out_name = "-"

    if not options.no_sort_output:  # need to determine the output format for sort
        if options.out_sam:
            sort_format = "sam"
        else:
            sort_format = "bam"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join(
            ["read_id", "contig", "position", "gene", "umi", "umi_count",
             "final_umi", "final_umi_count", "unique_id"]))

    nInput, nOutput, unique_id, input_reads, output_reads = 0, 0, 0, 0, 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.per_gene and options.gene_transcript_map:
            # iterate per meta-contig (gene) rather than per transcript
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag
        else:
            inreads = infile.fetch(until_eof=options.output_unmapped)

    bundle_iterator = umi_methods.get_bundles(
        options,
        all_reads=True,
        return_read2=True,
        return_unmapped=options.output_unmapped,
        metacontig_contig=metacontig2contig)

    # set up UMIClusterer functor with methods specific to specified
    # options.method. Hoisted out of the bundle loop: it depends only on
    # options.method, so a single instance serves every bundle.
    processor = network.UMIClusterer(options.method)

    for bundle, key, status in bundle_iterator(inreads):

        # write out read2s and unmapped (if these options are set)
        if status == 'single_read':
            # bundle is just a single read here
            nInput += 1

            if outfile:
                outfile.write(bundle)

            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        # progress logging in fixed-size increments
        while nOutput >= output_reads + 10000:
            output_reads += 10000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        # group the umis
        groups = processor(umis, counts, threshold=options.threshold)

        for umi_group in groups:
            # first umi in a group is the dominant (representative) one
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.tags += [('UG', unique_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        mapping_outfile.write("%s\n" % "\t".join(map(str, (
                            read.query_name, read.reference_name,
                            umi_methods.get_read_position(
                                read, options.soft_clip_threshold)[1],
                            gene,
                            umi.decode(),
                            counts[umi],
                            top_umi.decode(),
                            group_count,
                            unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()

        if not options.no_sort_output:
            # sort the output
            pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
            os.unlink(out_name)  # delete the tempfile

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join(
        ["%s: %s" % (x[0], x[1]) for x in
         bundle_iterator.read_events.most_common()]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Older group implementation: clusters umis per bundle and writes a tagged
    BAM and/or a read-id -> group tsv.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true",
                      help="Output alignments in sam format [default=%default]",
                      default=False)
    parser.add_option("--umi-separator", dest="umi_sep",
                      type="string", help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag", dest="umi_tag",
                      type="string", help="tag containing umi",
                      default='RX')
    parser.add_option("--umi-group-tag", dest="umi_group_tag",
                      type="string", help="tag for the outputted umi group",
                      default='BX')
    parser.add_option("--extract-umi-method", dest="get_umi_method",
                      type="choice",
                      choices=("read_id", "tag"), default="read_id",
                      help="where is the read UMI encoded? [default=%default]")
    parser.add_option("--subset", dest="subset", type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]",
                      default=False)
    parser.add_option("--soft-clip-threshold", dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int",
                      default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional", "unique",
                               "cluster"),
                      default="directional",
                      help="method to use for umi deduping [default=%default]")
    parser.add_option("--per-contig", dest="per_contig", action="store_true",
                      default=False,
                      help=("dedup per contig (field 3 in BAM; RNAME),"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--per-gene", dest="per_gene", action="store_true",
                      default=False,
                      help=("Deduplicate per gene,"
                            "e.g for transcriptome where contig = transcript"
                            "must also provide a transript to gene map with"
                            "--gene-transcript-map [default=%default]"))
    parser.add_option("--gene-transcript-map", dest="gene_transcript_map",
                      type="string",
                      help="file mapping transcripts to genes (tab separated)",
                      default=None)
    parser.add_option("--gene-tag", dest="gene_tag",
                      type="string",
                      help=("Deduplicate per gene where gene is"
                            "defined by this bam tag [default=%default]"),
                      default=None)
    parser.add_option("--read-length", dest="read_length",
                      action="store_true",
                      default=False,
                      help=("use read length in addition to position and UMI"
                            "to identify possible duplicates [default=%default]"))
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option("--output-unmapped", dest="output_unmapped",
                      action="store_true",
                      default=False,
                      help=("Retain all unmapped reads in output[default=%default]"))
    parser.add_option("--group-out", dest="tsv", type="string",
                      help="Outfile name for file mapping read id to read group",
                      default=None)
    parser.add_option("--output-bam", dest="output_bam", action="store_true",
                      default=False,
                      help=("output a bam file with read groups tagged using the UG tag"
                            "[default=%default]"))
    parser.add_option("--skip-tags-regex", dest="skip_regex",
                      type="string",
                      help=("Used with --gene-tag. "
                            "Ignore reads where the gene-tag matches this regex"),
                      default="^[__|Unassigned]")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    # pysam needs a named, seekable file
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    if options.per_gene:
        if not options.gene_transcript_map:
            raise ValueError(
                "--per-gene option requires --gene-transcript-map")

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join(
            ["read_id", "contig", "position", "gene", "umi", "umi_count",
             "final_umi", "final_umi_count", "unique_id"]))

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    nInput, nOutput, unique_id = 0, 0, 0

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
        gene_tag = options.gene_tag
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag
        else:
            inreads = infile.fetch(until_eof=options.output_unmapped)
            gene_tag = options.gene_tag

    # set up UMIClusterer functor with methods specific to specified
    # options.method. Hoisted out of the bundle loop: it depends only on
    # options.method, so a single instance serves every bundle.
    processor = network.UMIClusterer(options.method)

    for bundle, read_events, status in umi_methods.get_bundles(
            inreads,
            ignore_umi=False,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            gene_tag=gene_tag,
            skip_regex=options.skip_regex,
            read_length=options.read_length,
            umi_getter=umi_getter,
            all_reads=True,
            return_read2=True,
            return_unmapped=options.output_unmapped):

        # write out read2s and unmapped if option set
        if status == 'single_read':
            # bundle is just a single read here.
            # BUG FIX: outfile is None when --output-bam is not given; guard
            # the write (matches the newer group implementation) instead of
            # crashing with AttributeError.
            nInput += 1

            if outfile:
                outfile.write(bundle)

            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        # group the umis
        groups = processor(umis, counts, threshold=options.threshold)

        for umi_group in groups:
            # first umi in a group is the dominant (representative) one
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.tags += [('UG', unique_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        mapping_outfile.write("%s\n" % "\t".join(map(str, (
                            read.query_name, read.reference_name,
                            umi_methods.get_read_position(
                                read, options.soft)[1],
                            gene,
                            umi.decode(),
                            counts[umi],
                            top_umi.decode(),
                            group_count,
                            unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join(
        ["%s: %s" % (x[0], x[1]) for x in read_events.most_common()]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Counts deduplicated reads per gene (and optionally per cell), writing
    either a long or wide-format table to stdout.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group = U.OptionGroup(parser, "count-specific options")

    # CONSISTENCY FIX: register the option on the declared option group
    # (it was previously added straight to the parser, leaving the group
    # empty). Parsing behaviour is unchanged; only --help grouping improves.
    group.add_option("--wide-format-cell-counts",
                     dest="wide_format_cell_counts",
                     action="store_true",
                     default=False,
                     help=("output the cell counts in a wide format "
                           "(rows=genes, columns=cells)"))

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv, add_group_dedup_options=False)

    options.per_gene = True  # hardcodes counting to per-gene only

    U.validateSamOptions(options, group=False)

    if options.random_seed:
        np.random.seed(options.random_seed)

    # pysam needs a named, seekable file
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    infile = pysam.Samfile(in_name, in_mode)

    # write out to tempfile and then sort to stdout
    tmpfilename = U.getTempFilename(dir=options.tmpdir)
    tmpfile = U.openFile(tmpfilename, mode="w")

    nInput, nOutput, input_reads = 0, 0, 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.gene_transcript_map:
            # iterate per meta-contig (gene) rather than per transcript
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag
        else:
            inreads = infile.fetch()

    bundle_iterator = umi_methods.get_bundles(
        options,
        only_count_reads=True,
        metacontig_contig=metacontig2contig)

    # set up UMIClusterer functor with methods specific to specified
    # options.method. Hoisted out of the bundle loop: one instance serves
    # every bundle.
    processor = network.UMIClusterer(options.method)

    for bundle, key, status in bundle_iterator(inreads):

        if status == "single_read":
            continue

        gene, cell = key

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        # progress logging in 1M-read increments
        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        # group the umis
        groups = processor(umis, counts, threshold=options.threshold)
        gene_count = len(groups)

        if options.per_cell:
            tmpfile.write("%s\n" % "\t".join(
                (gene, cell.decode(), str(gene_count))))
        else:
            tmpfile.write("%s\n" % "\t".join((gene, str(gene_count))))
        nOutput += gene_count

    tmpfile.close()

    if options.per_cell:

        gene_counts_dict = {}

        # re-read the tempfile to assemble the per-gene/per-cell table
        with U.openFile(tmpfilename, mode="r") as inf:
            genes = set()
            cells = set()
            for line in inf:
                gene, cell, gene_count = line.strip().split("\t")
                genes.add(gene)
                cells.add(cell)
                if gene not in gene_counts_dict:
                    gene_counts_dict[gene] = {}

                gene_counts_dict[gene][cell] = gene_count

        if options.wide_format_cell_counts:  # write out in wide format

            options.stdout.write(
                "%s\t%s\n" % ("gene", "\t".join(sorted(cells))))

            for gene in sorted(genes):
                counts = []
                for cell in sorted(cells):
                    if cell in gene_counts_dict[gene]:
                        counts.append(gene_counts_dict[gene][cell])
                    else:
                        counts.append(0)  # absent gene/cell pair
                options.stdout.write(
                    "%s\t%s\n" % (gene, "\t".join(map(str, counts))))

        else:  # write out in long format
            options.stdout.write("%s\t%s\t%s\n" % ("gene", "cell", "count"))
            for gene in sorted(genes):
                for cell in sorted(list(gene_counts_dict[gene].keys())):
                    options.stdout.write("%s\t%s\t%s\n" % (
                        gene, cell, gene_counts_dict[gene][cell]))
    else:
        options.stdout.write("%s\t%s\n" % ("gene", "count"))
        with U.openFile(tmpfilename, mode="r") as inf:
            for line in inf:
                options.stdout.write(line)

    os.unlink(tmpfilename)  # remove the tempfile

    # output reads events and benchmark information.
    for event in bundle_iterator.read_events.most_common():
        U.info("%s: %s" % (event[0], event[1]))

    U.info("Number of (post deduplication) reads counted: %i" % nOutput)

    U.Stop()
import umi_tools.network as network
from itertools import product

# Small umi count table exercised against every clustering method below.
input_data = {
    "ACGT": 456,
    "AAAT": 90,
    "ACAT": 72,
    "TCGT": 2,
    "CCGT": 2,
    "ACAG": 1
}

# Expected clustering of input_data for each method.
output_data = {
    "unique": [['ACAG'], ['ACGT'], ['ACAT'], ['CCGT'], ['TCGT'], ['AAAT']],
    "percentile": [['ACAG'], ['ACGT'], ['ACAT'], ['CCGT'], ['TCGT'], ['AAAT']],
    "cluster": [['ACGT', 'AAAT', 'ACAT', 'CCGT', 'TCGT', 'ACAG']],
    "adjacency": [['ACGT', 'CCGT', 'TCGT'], ['AAAT'], ['ACAT', 'ACAG']],
    "directional": [['ACGT', 'ACAT', 'TCGT', 'CCGT', 'ACAG'], ['AAAT']]
}

methods = ["unique", "percentile", "cluster", "adjacency", "directional"]

# Run each method at edit-distance threshold 1 and compare the observed
# grouping against the expectation.
for method in methods:
    uc = network.UMIClusterer(method)
    observed = uc(input_data.keys(), input_data, threshold=1)
    expected = output_data[method]
    assert observed == expected, \
        "failed on method %s\n %s is not %s" % (method, observed, expected)
# NOTE(review): notebook chunk; cbc1_filtered, library2_clean, N_READS, pd,
# np and nk are presumably defined in earlier cells -- confirm upstream.
library1_df = pd.DataFrame.from_dict(cbc1_filtered, orient='index')
library1_df.to_csv('library1_min_' + repr(N_READS) + '_reads.csv', '\t')

# Count barcode occurrences for replicate 2 and keep barcodes seen at least
# N_READS times.
barcodes2, barcodes_counts2 = np.unique(library2_clean, return_counts=True)
cbc2 = dict(zip(barcodes2, barcodes_counts2))
cbc2_filtered = {k: v for k, v in cbc2.items() if v >= N_READS}
print('Replicate 2 retaining ' + repr(len(cbc2_filtered)) + ' out of ' +
      repr(len(cbc2)) + ' barcodes')
library2_df = pd.DataFrame.from_dict(cbc2_filtered, orient='index')
library2_df.to_csv('library2_min_' + repr(N_READS) + '_reads.csv', '\t')

# In[65]:

###Here we apply UMIClusterer to collapse barcodes and generate a whitelist
uc = nk.UMIClusterer()
CBclusters = uc(cbc1_filtered, threshold=5)
cbFinal = dict()
for l in CBclusters:
    # first element of each cluster is the representative barcode; fold all
    # member counts into it
    cbFinal[l[0]] = 0
    for x in l:
        cbFinal[l[0]] += cbc1_filtered[x]
whitelist1 = pd.DataFrame.from_dict(cbFinal, orient='index')
whitelist1.to_csv('whitelist1_test.csv', '\t')

# Repeat the collapse for replicate 2.
uc = nk.UMIClusterer()
CBclusters = uc(cbc2_filtered, threshold=5)
cbFinal = dict()
for l in CBclusters:
    # NOTE(review): chunk appears truncated here; the replicate-2
    # accumulation presumably continues in the original notebook.
    cbFinal[l[0]] = 0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a count table from stdin and writes deduplicated umi counts per
    gene (and per cell when --per-cell is given).
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])

    group = U.OptionGroup(parser, "count_tab-specific options")

    group.add_option("--barcode-separator", dest="bc_sep",
                     type="string", help="separator between read id and UMI "
                     " and (optionally) the cell barcode", default="_")

    group.add_option("--per-cell", dest="per_cell", action="store_true",
                     help="Readname includes cell barcode as well as UMI in "
                     "format: read[sep]UMI[sep]CB")

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv, add_group_dedup_options=False,
                              add_sam_options=False)

    nInput, nOutput = 0, 0  # reads seen / umi groups written

    # set the method with which to extract umis (and cell barcodes) from reads
    if options.per_cell:
        bc_getter = partial(sam_methods.get_cell_umi_read_string,
                            sep=options.bc_sep)
    else:
        bc_getter = partial(sam_methods.get_umi_read_string,
                            sep=options.bc_sep)

    # header row for the output table
    if options.per_cell:
        options.stdout.write("%s\t%s\t%s\n" % ("cell", "gene", "count"))
    else:
        options.stdout.write("%s\t%s\n" % ("gene", "count"))

    # set up UMIClusterer functor with methods specific to
    # specified options.method
    processor = network.UMIClusterer(options.method)

    for gene, counts in sam_methods.get_gene_count_tab(
            options.stdin,
            bc_getter=bc_getter):

        for cell in counts.keys():
            nInput += sum(counts[cell].values())

            # group the umis; the clusterer operates directly on the
            # umi -> count mapping (removed an unused `umis` local here)
            groups = processor(counts[cell], threshold=options.threshold)

            gene_count = len(groups)

            if options.per_cell:
                options.stdout.write("%s\t%s\t%i\n" % (cell, gene,
                                                       gene_count))
            else:
                options.stdout.write("%s\t%i\n" % (gene, gene_count))
            nOutput += gene_count

    U.info("Number of reads counted: %i" % nOutput)

    U.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Older count implementation: writes deduplicated read counts per gene to
    stdout.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option("--umi-separator", dest="umi_sep",
                      type="string", help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag", dest="umi_tag",
                      type="string", help="tag containing umi",
                      default='RX')
    parser.add_option("--extract-umi-method", dest="get_umi_method",
                      type="choice",
                      choices=("read_id", "tag"), default="read_id",
                      help="where is the read UMI encoded? [default=%default]")
    parser.add_option("--subset", dest="subset", type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int",
                      default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional", "percentile",
                               "unique", "cluster"),
                      default="directional",
                      help="method to use for umi deduping [default=%default]")
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option("--per-contig", dest="per_contig", action="store_true",
                      default=False,
                      help=("dedup per contig (field 3 in BAM; RNAME),"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--per-gene", dest="per_gene", action="store_true",
                      default=False,
                      help=("Deduplicate per gene,"
                            "e.g for transcriptome where contig = transcript"
                            "must also provide a transript to gene map with"
                            "--gene-transcript-map [default=%default]"))
    parser.add_option("--gene-transcript-map", dest="gene_transcript_map",
                      type="string",
                      help="file mapping transcripts to genes (tab separated)",
                      default=None)
    parser.add_option("--gene-tag", dest="gene_tag",
                      type="string",
                      help=("Deduplicate per gene where gene is"
                            "defined by this bam tag [default=%default]"),
                      default=None)
    parser.add_option("--skip-tags-regex", dest="skip_regex",
                      type="string",
                      help=("Used with --gene-tag. "
                            "Ignore reads where the gene-tag matches this regex"),
                      default="^[__|Unassigned]")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.random_seed:
        np.random.seed(options.random_seed)

    # pysam needs a named, seekable file
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.per_gene:
        if not options.gene_transcript_map and not options.gene_tag:
            raise ValueError(
                "--per-gene option requires --gene-transcript-map "
                "or --gene-tag")

    # validate the skip regex up-front so a bad pattern fails fast
    try:
        re.compile(options.skip_regex)
    except re.error:
        raise ValueError("skip-regex '%s' is not a "
                         "valid regex" % options.skip_regex)

    infile = pysam.Samfile(in_name, in_mode)

    nInput, nOutput = 0, 0

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    # BUG FIX: default gene_tag before the branch so it is defined on every
    # path (it was previously unset when --chrom was used), and pass the
    # computed value to get_gene_count below. The original computed gene_tag
    # (including the "MC" metatag) but then passed options.gene_tag, so the
    # metacontig tag was silently ignored.
    gene_tag = options.gene_tag

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag
        else:
            inreads = infile.fetch()

    # header row of the tab-separated output table
    options.stdout.write("%s\t%s\n" % ("gene", "count"))

    # set up UMIClusterer functor with methods specific to specified
    # options.method. Hoisted out of the per-gene loop: one instance serves
    # every gene.
    processor = network.UMIClusterer(options.method)

    for gene, bundle, read_events in umi_methods.get_gene_count(
            inreads,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            per_contig=options.per_contig,
            gene_tag=gene_tag,
            skip_regex=options.skip_regex,
            umi_getter=umi_getter):

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        # group the umis
        groups = processor(umis, counts, threshold=options.threshold)

        gene_count = len(groups)
        options.stdout.write("%s\t%i\n" % (gene, gene_count))
        nOutput += gene_count

    # output reads events and benchmark information.
    # NOTE(review): read_events is the loop variable from the final
    # iteration; this raises NameError if no genes were yielded (pre-existing
    # behaviour, unchanged here).
    for event in read_events.most_common():
        U.info("%s: %s" % (event[0], event[1]))

    U.info("Number of reads counted: %i" % nOutput)

    U.Stop()