예제 #1
0
def build_uniques_associations(pysam_iterator, cell_barcodes=None):
    """Collect uniquely mapped reads that belong to the given cell barcodes.

    A read is kept when it is mapped, its ``NH`` tag equals 1 and its ``XC``
    tag is one of ``cell_barcodes``.

    Args:
        pysam_iterator: Source of reads; passed through
            ``initialize_iterator`` before being consumed.
        cell_barcodes: Non-empty iterable of hashable barcodes. It is
            traversed exactly once, so one-shot iterables (generators) are
            supported.

    Returns:
        A ``(uniques_hashtable, uniques)`` tuple:

        * ``uniques_hashtable`` -- dict with one entry per barcode, seeded
          with an empty dict and filled in by ``update_uniq_hashtable``
          (its exact layout is defined by that helper).
        * ``uniques`` -- dict mapping query name to
          ``(gene, reference_name, reference_start)``, where ``gene`` is the
          ``GE`` tag value or ``None`` when the read has no ``GE`` tag.

    Raises:
        SystemExit: If ``cell_barcodes`` is falsy or not an ``Iterable``.
    """
    reads = initialize_iterator(pysam_iterator)

    if not cell_barcodes or not isinstance(cell_barcodes, Iterable):
        logger.info('Cell barcode iterable not present! Quitting the program...')
        # Same effect as the interactive ``exit()`` helper, without relying
        # on the ``site`` module having injected it.
        raise SystemExit

    # Seed one bucket per barcode, preserving the caller's ordering.
    uniques_hashtable = {cb: {} for cb in cell_barcodes}
    uniques = {}

    # Snapshot the barcodes for membership tests. Re-testing against
    # ``cell_barcodes`` itself would find nothing if it was a generator
    # (already exhausted by the seeding above) and would be a linear scan
    # per read if it was a list.
    barcodes = frozenset(uniques_hashtable)

    for read in reads:
        if read.is_unmapped or read.get_tag("NH") != 1:
            continue

        xc = read.get_tag("XC")
        if xc not in barcodes:
            continue

        qname = read.query_name
        xm = read.get_tag("XM")
        gene = read.get_tag("GE") if read.has_tag("GE") else None

        update_uniq_hashtable(uniques_hashtable, xc, xm, qname)
        uniques[qname] = (gene, read.reference_name, read.reference_start)

    return uniques_hashtable, uniques
예제 #2
0
def umi_dist(pysam_iterator):
    """Summarise how often each ``XM`` tag value occurs among mapped reads.

    Mapped reads are split by their ``NH`` tag: ``NH == 1`` versus everything
    else. Both ``NH`` and ``XM`` are read with ``get_tag`` on every mapped
    read.

    Returns:
        A ``(uniq_dist, multi_dist, map_dist)`` tuple of ``Counter`` objects,
        each mapping a count ``k`` to the number of ``XM`` values having it:

        * ``uniq_dist`` -- ``k`` is the number of ``NH == 1`` alignments
          carrying the ``XM`` value.
        * ``multi_dist`` -- ``k`` is the number of distinct query names among
          the ``NH != 1`` alignments carrying the ``XM`` value.
        * ``map_dist`` -- ``k`` is the number of ``NH != 1`` alignment
          records carrying the ``XM`` value.
    """
    unique_xms = []
    qnames_by_xm = {}

    for aln in initialize_iterator(pysam_iterator):
        if aln.is_unmapped:
            continue

        hits = aln.get_tag('NH')
        xm_value = aln.get_tag('XM')
        name = aln.query_name

        if hits == 1:
            unique_xms.append(xm_value)
        else:
            qnames_by_xm.setdefault(xm_value, []).append(name)

    uniq_dist = Counter(Counter(unique_xms).values())

    # Per XM value: raw alignment records vs. distinct reads behind them.
    map_dist = Counter(len(names) for names in qnames_by_xm.values())
    multi_dist = Counter(
        len(np.unique(names)) for names in qnames_by_xm.values())

    return uniq_dist, multi_dist, map_dist
예제 #3
0
def get_reads(pysam_iter):
    """Return the distinct query names of all mapped reads.

    The result is the keys view of a ``Counter`` built over the query names,
    so each name appears once, in order of first occurrence.
    """
    alignments = initialize_iterator(pysam_iter)
    name_counts = Counter(
        aln.query_name for aln in alignments if not aln.is_unmapped)
    return name_counts.keys()
예제 #4
0
def tag_annotated_generator(pysam_iterator, tag='', includes=True):
    """Yield reads depending on whether they carry ``tag``.

    Takes an iterable of reads (anything accepted by
    ``initialize_iterator``) whose items expose ``has_tag``.

    With a truthy ``includes`` only reads that have ``tag`` are yielded;
    with a falsy ``includes`` only reads that lack it are yielded.
    """
    for aln in initialize_iterator(pysam_iterator):
        tagged = aln.has_tag(tag)
        if (tagged and includes) or (not tagged and not includes):
            yield aln
예제 #5
0
def tag_based_generator(pysam_iterator, tag=None, values=None, include=True):
    """Yield reads filtered by the value of ``tag``.

    Reads that do not carry ``tag`` are never yielded, whatever ``include``
    is. For the remaining reads:

    * truthy ``include`` -- yield reads whose tag value is in ``values``;
    * falsy ``include`` -- yield reads whose tag value is not in ``values``.

    ``values`` must support the ``in`` operator; the ``None`` default is only
    a placeholder and will fail on the first tagged read.
    """
    for aln in initialize_iterator(pysam_iterator):
        if not aln.has_tag(tag):
            continue

        matched = aln.get_tag(tag) in values
        keep = matched if include else not matched
        if keep:
            yield aln
예제 #6
0
def build_r2g_table(pysam_iterator):
    """Map each query name to the list of its ``GE`` tag values.

    Reads without a ``GE`` tag are ignored. A query name that appears on
    several tagged records collects one list entry per record, in encounter
    order (duplicates are kept).
    """
    table = {}
    for aln in initialize_iterator(pysam_iterator):
        if not aln.has_tag('GE'):
            continue
        gene = aln.get_tag('GE')
        table.setdefault(aln.query_name, []).append(gene)
    return table
예제 #7
0
def field_based_generator(pysam_iterator,
                          field=None,
                          values=None,
                          include=True):
    """Yield reads filtered by the value of an attribute.

    For every read, the attribute named ``field`` is looked up with
    ``getattr`` and tested against ``values`` with ``in``:

    * truthy ``include`` -- yield reads whose attribute is in ``values``;
    * falsy ``include`` -- yield reads whose attribute is not in ``values``.

    The ``None`` defaults for ``field`` and ``values`` are placeholders;
    both must be supplied for the generator to work.
    """
    for aln in initialize_iterator(pysam_iterator):
        matched = getattr(aln, field) in values
        keep = matched if include else not matched
        if keep:
            yield aln
예제 #8
0
def total_alignments(pysam_iterator, include='mapped'):
    """Count alignment records of a given category.

    Records are counted while streaming, so the alignments are never held
    in memory all at once.

    Args:
        pysam_iterator: Source of reads; passed through
            ``initialize_iterator`` before being consumed.
        include: Which records to count:

            * ``'unmapped'`` -- every record, unmapped ones included;
            * ``'mapped'`` -- records that are not unmapped;
            * ``'unique'`` -- mapped records whose ``NH`` tag equals 1;
            * ``'multi'`` -- mapped records whose ``NH`` tag differs from 1.

    Returns:
        The number of matching records, or ``None`` when ``include`` is not
        one of the values above.
    """
    reads = initialize_iterator(pysam_iterator)

    if include == 'unmapped':
        return sum(1 for _ in reads)
    if include == 'mapped':
        return sum(1 for r in reads if not r.is_unmapped)
    if include == 'unique':
        return sum(
            1 for r in reads
            if not r.is_unmapped and r.get_tag('NH') == 1)
    if include == 'multi':
        return sum(
            1 for r in reads
            if not r.is_unmapped and r.get_tag('NH') != 1)

    # Unknown mode: keep the historical "no answer" result.
    return None
예제 #9
0
def build_multimapping_hashtable(pysam_iterator, cell_barcode=None, region='all'):
    """Collect multi-mapped reads into a table via ``update_multimap``.

    A mapped read is recorded when all of the following hold:

    * its ``NH`` tag is not 1;
    * ``cell_barcode`` is falsy, or equals the read's ``XC`` tag;
    * ``region`` is ``'all'``, or ``'gene'`` and the read has a truthy
      ``GE`` tag value, or ``'ref'`` and it has none.

    Any other ``region`` value selects nothing.

    Returns:
        The dict populated by ``update_multimap`` (its layout is defined by
        that helper).
    """
    table = {}

    for aln in initialize_iterator(pysam_iterator):
        if aln.is_unmapped:
            continue

        name = aln.query_name
        barcode = aln.get_tag("XC")
        xm_value = aln.get_tag("XM")
        ref_name = aln.reference_name
        ref_pos = aln.reference_start
        gene = aln.get_tag("GE") if aln.has_tag("GE") else None

        # Uniquely mapped reads are out of scope here.
        if aln.get_tag("NH") == 1:
            continue

        # Optional restriction to a single cell barcode.
        if cell_barcode and not cell_barcode == barcode:
            continue

        if region == 'all':
            selected = True
        elif region == 'gene':
            selected = bool(gene)
        elif region == 'ref':
            selected = not gene
        else:
            selected = False

        if selected:
            update_multimap(table, name, barcode, xm_value, gene, ref_name, ref_pos)

    return table
예제 #10
0
def get_non_isolated_umis(pysam_iterator):
    """Split the ``XM`` values of multi-mapped reads into two groups.

    Only mapped reads are considered. ``XM`` values seen on reads with
    ``NH > 1`` are classified as:

    * non-isolated -- the value is carried by more than one distinct query
      name among those reads, or it also occurs on a read with ``NH == 1``;
    * isolated -- everything else.

    Returns:
        A ``(non_isolated, isolated)`` tuple of lists of ``XM`` values.
    """
    multi_qnames = {}
    unique_xms = []

    for aln in initialize_iterator(pysam_iterator):
        if aln.is_unmapped:
            continue

        xm_value = aln.get_tag('XM')
        hits = aln.get_tag('NH')

        if hits > 1:
            multi_qnames.setdefault(xm_value, []).append(aln.query_name)
        elif hits == 1:
            unique_xms.append(xm_value)

    seen_as_unique = Counter(unique_xms).keys()

    non_isolated = []
    single_read = []
    for xm_value, names in multi_qnames.items():
        # More than one distinct read shares this value.
        if len(np.unique(names)) > 1:
            non_isolated.append(xm_value)
        else:
            single_read.append(xm_value)

    # Single-read values that also appear on a uniquely mapped read.
    shared = list(set(single_read).intersection(seen_as_unique))

    non_isolated += shared
    isolated = list(set(single_read) - set(shared))

    return non_isolated, isolated