def build_uniques_associations(pysam_iterator, cell_barcodes=None):
    """Index uniquely mapped reads that belong to the requested cell barcodes.

    A read is kept when it is mapped, its ``NH`` tag equals 1 and its ``XC``
    tag is one of ``cell_barcodes``.

    Args:
        pysam_iterator: Source of reads; it is passed through
            ``initialize_iterator`` before being consumed.
        cell_barcodes: Non-empty iterable of hashable barcodes to keep.

    Returns:
        A ``(uniques_hashtable, uniques)`` tuple. ``uniques_hashtable`` holds
        one dict per barcode, populated by ``update_uniq_hashtable``;
        ``uniques`` maps each kept query name to
        ``(gene, reference_name, reference_start)``, where ``gene`` is the
        ``GE`` tag value or ``None`` when the read has no ``GE`` tag.

    Note:
        If ``cell_barcodes`` is falsy or not an ``Iterable`` a message is
        logged and the process is terminated through ``exit()``.
    """
    reads = initialize_iterator(pysam_iterator)
    if not cell_barcodes or not isinstance(cell_barcodes, Iterable):
        logger.info('Cell barcode iterable not present! Quitting the program...')
        exit()

    # Seed one (initially empty) bucket per requested barcode.
    uniques_hashtable = {cb: {} for cb in cell_barcodes}
    uniques = {}

    for read in reads:
        if read.is_unmapped or read.get_tag("NH") != 1:
            continue
        xc = read.get_tag("XC")
        # Test membership against the seeded table instead of re-scanning
        # ``cell_barcodes``: constant-time per read, and still correct when
        # the caller passed a one-shot iterator that the seeding step above
        # has already exhausted.
        # NOTE(review): assumes update_uniq_hashtable never adds top-level
        # barcode keys other than the ``xc`` it is given -- confirm.
        if xc not in uniques_hashtable:
            continue
        qname = read.query_name
        xm = read.get_tag("XM")
        refname = read.reference_name
        refpos = read.reference_start
        gene = read.get_tag("GE") if read.has_tag("GE") else None
        update_uniq_hashtable(uniques_hashtable, xc, xm, qname)
        uniques[qname] = (gene, refname, refpos)

    return uniques_hashtable, uniques
def umi_dist(pysam_iterator):
    """Summarise how ``XM`` tag values are spread over mapped reads.

    Mapped reads are split by their ``NH`` tag: reads with ``NH == 1``
    contribute their ``XM`` value to the "unique" pool, all other mapped
    reads contribute their query name to a per-``XM`` list.

    Args:
        pysam_iterator: Source of reads; it is passed through
            ``initialize_iterator`` before being consumed.

    Returns:
        A ``(uniq_dist, multi_dist, map_dist)`` tuple of ``Counter`` objects:

        * ``uniq_dist`` -- occurrence count of an ``XM`` value among
          ``NH == 1`` reads -> number of ``XM`` values with that count.
        * ``multi_dist`` -- number of distinct query names seen for an
          ``XM`` value among the remaining reads -> number of ``XM`` values.
        * ``map_dist`` -- number of records (query names, duplicates
          included) seen for an ``XM`` value among the remaining reads ->
          number of ``XM`` values.
    """
    single_hit_umis = []
    multi_hit_names = {}

    for read in initialize_iterator(pysam_iterator):
        if read.is_unmapped:
            continue
        hits = read.get_tag('NH')
        umi = read.get_tag('XM')
        name = read.query_name
        if hits == 1:
            single_hit_umis.append(umi)
        else:
            multi_hit_names.setdefault(umi, []).append(name)

    uniq_dist = Counter(Counter(single_hit_umis).values())

    # Per-UMI totals: all records vs. distinct query names.
    records_per_umi = {
        umi: len(names) for umi, names in multi_hit_names.items()
    }
    reads_per_umi = {
        umi: len(np.unique(names)) for umi, names in multi_hit_names.items()
    }

    map_dist = Counter(records_per_umi.values())
    multi_dist = Counter(reads_per_umi.values())
    return uniq_dist, multi_dist, map_dist
def get_reads(pysam_iter):
    """Return the distinct query names of all mapped reads.

    Args:
        pysam_iter: Source of reads; it is passed through
            ``initialize_iterator`` before being consumed.

    Returns:
        The keys view of a ``Counter`` built from the query names of every
        read whose ``is_unmapped`` flag is false.
    """
    mapped_names = [
        read.query_name
        for read in initialize_iterator(pysam_iter)
        if not read.is_unmapped
    ]
    return Counter(mapped_names).keys()
def tag_annotated_generator(pysam_iterator, tag='', includes=True):
    """Yield reads according to whether they carry ``tag``.

    Takes an iterable of type pysam.alignedSegment as input.

    Args:
        pysam_iterator: Source of reads; it is passed through
            ``initialize_iterator`` before being consumed.
        tag: Tag name handed to each read's ``has_tag``.
        includes: When truthy, yield only the reads that have ``tag``;
            when falsy, yield only the reads that lack it.

    Yields:
        The reads selected by the rule above, in input order.
    """
    for read in initialize_iterator(pysam_iterator):
        tagged = read.has_tag(tag)
        if (tagged and includes) or (not tagged and not includes):
            yield read
def tag_based_generator(pysam_iterator, tag=None, values=None, include=True):
    """Yield reads selected by the value of one of their tags.

    Args:
        pysam_iterator: Source of reads; it is passed through
            ``initialize_iterator`` before being consumed.
        tag: Tag name handed to each read's ``has_tag`` / ``get_tag``.
        values: Container tested with ``in`` against the tag value.
        include: When truthy, yield the tagged reads whose value is in
            ``values``; when falsy, yield the tagged reads whose value is
            not in ``values``.

    Yields:
        The selected reads, in input order. With the layout below, reads
        that do not carry ``tag`` at all are never yielded.

    NOTE(review): the nesting was reconstructed from a flattened source. The
    ``else`` is paired here with the ``in values`` test, mirroring
    ``field_based_generator``; confirm it was not meant to pair with the
    ``has_tag`` test instead.
    """
    reads = initialize_iterator(pysam_iterator)
    for r in reads:
        # Reads without the tag are skipped outright.
        if r.has_tag(tag):
            if r.get_tag(tag) in values:
                # Value matches: emit only in "include" mode.
                if include:
                    yield r
            else:
                # Value does not match: emit only in "exclude" mode.
                if not include:
                    yield r
def build_r2g_table(pysam_iterator):
    """Collect the ``GE`` tag values recorded for each query name.

    Args:
        pysam_iterator: Source of reads; it is passed through
            ``initialize_iterator`` before being consumed.

    Returns:
        A dict mapping query name to the list of ``GE`` tag values seen for
        that name, in encounter order (duplicates are kept). Reads without a
        ``GE`` tag are ignored.
    """
    genes_by_read = {}
    for read in initialize_iterator(pysam_iterator):
        if not read.has_tag('GE'):
            continue
        gene = read.get_tag('GE')
        genes_by_read.setdefault(read.query_name, []).append(gene)
    return genes_by_read
def field_based_generator(pysam_iterator, field=None, values=None, include=True):
    """Yield reads selected by the value of one of their attributes.

    Args:
        pysam_iterator: Source of reads; it is passed through
            ``initialize_iterator`` before being consumed.
        field: Attribute name looked up on each read with ``getattr``.
        values: Container tested with ``in`` against the attribute value.
        include: When truthy, yield the reads whose attribute value is in
            ``values``; when falsy, yield the reads whose value is not.

    Yields:
        The selected reads, in input order.
    """
    for read in initialize_iterator(pysam_iterator):
        matched = getattr(read, field) in values
        if matched and include:
            yield read
        elif not matched and not include:
            yield read
def total_alignments(pysam_iterator, include='mapped'):
    """Count the reads of a given category.

    Args:
        pysam_iterator: Source of reads; it is passed through
            ``initialize_iterator`` before being consumed.
        include: Which reads to count:

            * ``'unmapped'`` -- every read, with no filtering at all
              (NOTE(review): presumably "mapped plus unmapped"; confirm).
            * ``'mapped'`` -- reads whose ``is_unmapped`` flag is false.
            * ``'unique'`` -- mapped reads whose ``NH`` tag equals 1.
            * ``'multi'`` -- mapped reads whose ``NH`` tag differs from 1.

    Returns:
        The count as an ``int``, or ``None`` when ``include`` is not one of
        the values listed above.
    """
    reads = initialize_iterator(pysam_iterator)

    # Count lazily: building a list of every alignment just to take its
    # length keeps the whole input alive in memory for no benefit.
    if include == 'unmapped':
        return sum(1 for _ in reads)
    if include == 'mapped':
        return sum(1 for r in reads if not r.is_unmapped)
    if include == 'unique':
        return sum(
            1 for r in reads
            if not r.is_unmapped and r.get_tag('NH') == 1
        )
    if include == 'multi':
        return sum(
            1 for r in reads
            if not r.is_unmapped and r.get_tag('NH') != 1
        )
    return None
def build_multimapping_hashtable(pysam_iterator, cell_barcode=None, region='all'):
    """Collect the mapped reads whose ``NH`` tag is not 1.

    Args:
        pysam_iterator: Source of reads; it is passed through
            ``initialize_iterator`` before being consumed.
        cell_barcode: When truthy, keep only reads whose ``XC`` tag equals
            this value; when falsy, reads are not filtered by ``XC``.
        region: ``'all'`` keeps every candidate, ``'gene'`` keeps only those
            with a truthy ``GE`` tag value, ``'ref'`` keeps only those
            without one. Any other value keeps nothing.

    Returns:
        The dict that ``update_multimap`` was asked to fill for every kept
        read (empty when nothing was kept).
    """
    table = {}
    for aln in initialize_iterator(pysam_iterator):
        if aln.is_unmapped:
            continue

        # Tags are read before any filtering, in this order, so a read that
        # lacks one of them fails exactly where it always did.
        name = aln.query_name
        barcode = aln.get_tag("XC")
        umi = aln.get_tag("XM")
        chrom = aln.reference_name
        start = aln.reference_start
        gene = aln.get_tag("GE") if aln.has_tag("GE") else None

        if aln.get_tag("NH") == 1:
            continue
        if cell_barcode and cell_barcode != barcode:
            continue

        if region == 'all':
            wanted = True
        elif region == 'gene':
            wanted = bool(gene)
        elif region == 'ref':
            wanted = not gene
        else:
            wanted = False

        if wanted:
            update_multimap(table, name, barcode, umi, gene, chrom, start)
    return table
def get_non_isolated_umis(pysam_iterator):
    """Split the ``XM`` values of ``NH > 1`` reads into two groups.

    An ``XM`` value seen on mapped reads with ``NH > 1`` is "non-isolated"
    when it is attached to more than one distinct query name, or when it is
    attached to a single query name but also appears on a mapped read with
    ``NH == 1``. Every other such value is "isolated".

    Args:
        pysam_iterator: Source of reads; it is passed through
            ``initialize_iterator`` before being consumed.

    Returns:
        A ``(non_isolated, isolated)`` tuple of lists of ``XM`` values.
    """
    names_by_umi = {}
    single_hit_umis = []

    for read in initialize_iterator(pysam_iterator):
        if read.is_unmapped:
            continue
        umi = read.get_tag('XM')
        hits = read.get_tag('NH')
        if hits > 1:
            names_by_umi.setdefault(umi, []).append(read.query_name)
        elif hits == 1:
            single_hit_umis.append(umi)

    seen_as_unique = Counter(single_hit_umis).keys()

    non_isolated = []
    single_read_umis = []
    for umi, names in names_by_umi.items():
        if len(np.unique(names)) > 1:
            non_isolated.append(umi)
        else:
            single_read_umis.append(umi)

    # Single-read UMIs that also occur on a uniquely mapped read count as
    # non-isolated; whatever is left over is isolated.
    shared = list(set(single_read_umis).intersection(seen_as_unique))
    non_isolated += shared
    isolated = list(set(single_read_umis) - set(shared))
    return non_isolated, isolated