Example #1
def count_mapped_bp(args, tempdir, genes):
    """ Count number of bp mapped to each gene across pangenomes.
    Return number covered genes and average gene depth per species.
    Result contains only covered species, but being a defaultdict,
    would yield 0 for any uncovered species, which is appropriate.
    """
    bam_path = f"{tempdir}/pangenomes.bam"
    bamfile = AlignmentFile(bam_path, "rb")
    covered_genes = {}

    # loop over alignments, sum values per gene
    for aln in bamfile.fetch(until_eof=True):
        gene_id = bamfile.getrname(aln.reference_id)
        gene = genes[gene_id]
        gene["aligned_reads"] += 1
        if keep_read(aln, args.aln_mapid, args.aln_readq, args.aln_mapq, args.aln_cov):
            gene["mapped_reads"] += 1
            gene["depth"] += len(aln.query_alignment_sequence) / float(gene["length"])
            covered_genes[gene_id] = gene

    tsprint("Pangenome count_mapped_bp:  total aligned reads: %s" % sum(g["aligned_reads"] for g in genes.values()))
    tsprint("Pangenome count_mapped_bp:  total mapped reads: %s" % sum(g["mapped_reads"] for g in genes.values()))

    # Filter to genes with non-zero depth, then group by species
    nonzero_gene_depths = defaultdict(list)
    for g in covered_genes.values():
        gene_depth = g["depth"]
        if gene_depth > 0:  # This should always pass, because args.aln_cov is always > 0.
            species_id = g["species_id"]
            nonzero_gene_depths[species_id].append(gene_depth)

    # Compute number of covered genes per species, and average gene depth.
    num_covered_genes = defaultdict(int)
    mean_coverage = defaultdict(float)
    for species_id, non_zero_depths in nonzero_gene_depths.items():
        num_covered_genes[species_id] = len(non_zero_depths)
        mean_coverage[species_id] = np.mean(non_zero_depths)

    return num_covered_genes, mean_coverage, covered_genes
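This snippet relies on names from its enclosing module (pysam's AlignmentFile, collections.defaultdict, numpy as np, and the helpers keep_read and tsprint). A minimal sketch of the per-gene records it expects to mutate; the field names are inferred from the body above, and all values are illustrative:

# Hypothetical gene table: one record per gene, keyed by gene id.
genes = {
    "gene_1": {"species_id": "species_A", "length": 900,
               "depth": 0.0, "aligned_reads": 0, "mapped_reads": 0},
    "gene_2": {"species_id": "species_A", "length": 1200,
               "depth": 0.0, "aligned_reads": 0, "mapped_reads": 0},
}
# Given a pangenomes.bam under tempdir, the call would then be:
# num_covered_genes, mean_coverage, covered_genes = count_mapped_bp(args, tempdir, genes)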
Example #2
def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence,
             cb_histogram, cb_cutoff, no_scale_evidence, subsample, sparse):
    ''' Count up evidence for tagged molecules
    '''
    from pysam import AlignmentFile

    from io import StringIO
    import pandas as pd

    from utils import weigh_evidence

    logger.info('Reading optional files')

    gene_map = None
    if genemap:
        with open(genemap) as fh:
            try:
                gene_map = dict(p.strip().split() for p in fh)
            except ValueError:
                logger.error('Incorrectly formatted gene_map, need to be tsv.')
                sys.exit()

    if positional:
        tuple_template = '{0},{1},{2},{3}'
    else:
        tuple_template = '{0},{1},{3}'

    if not cb_cutoff:
        cb_cutoff = 0

    if cb_histogram and cb_cutoff == "auto":
        cb_cutoff = guess_depth_cutoff(cb_histogram)

    cb_cutoff = int(cb_cutoff)

    cb_hist = None
    filter_cb = False
    if cb_histogram:
        cb_hist = pd.read_table(cb_histogram,
                                index_col=0,
                                header=-1,
                                squeeze=True)
        total_num_cbs = cb_hist.shape[0]
        cb_hist = cb_hist[cb_hist > cb_cutoff]
        logger.info('Keeping {} out of {} cellular barcodes.'.format(
            cb_hist.shape[0], total_num_cbs))
        filter_cb = True

    parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')

    if subsample:
        logger.info(
            'Creating reservoir of subsampled reads ({} per cell)'.format(
                subsample))
        start_sampling = time.time()

        reservoir = collections.defaultdict(list)
        cb_hist_sampled = 0 * cb_hist
        cb_obs = 0 * cb_hist

        track = stream_bamfile(sam)
        current_read = 'none_observed_yet'
        for i, aln in enumerate(track):
            if aln.qname == current_read:
                continue

            current_read = aln.qname
            match = parser_re.match(aln.qname)
            CB = match.group('CB')

            if CB not in cb_hist.index:
                continue

            cb_obs[CB] += 1
            if len(reservoir[CB]) < subsample:
                reservoir[CB].append(i)
                cb_hist_sampled[CB] += 1
            else:
                s = pd.np.random.randint(0, cb_obs[CB])
                if s < subsample:
                    reservoir[CB][s] = i

        index_filter = set(itertools.chain.from_iterable(reservoir.values()))
        sampling_time = time.time() - start_sampling
        logger.info('Sampling done - {:.3}s'.format(sampling_time))

    evidence = collections.defaultdict(int)

    logger.info('Tallying evidence')
    start_tally = time.time()

    sam_mode = 'r' if sam.endswith(".sam") else 'rb'
    sam_file = AlignmentFile(sam, mode=sam_mode)
    targets = [x["SN"] for x in sam_file.header["SQ"]]
    track = sam_file.fetch(until_eof=True)
    count = 0
    unmapped = 0
    kept = 0
    nomatchcb = 0
    current_read = 'none_observed_yet'
    count_this_read = True
    for i, aln in enumerate(track):
        if count and not count % 100000:
            logger.info("Processed %d alignments, kept %d." % (count, kept))
            logger.info("%d were filtered for being unmapped." % unmapped)
            if filter_cb:
                logger.info(
                    "%d were filtered for not matching known barcodes." %
                    nomatchcb)
        count += 1

        if aln.is_unmapped:
            unmapped += 1
            continue

        if aln.qname != current_read:
            current_read = aln.qname
            if subsample and i not in index_filter:
                count_this_read = False
                continue
            else:
                count_this_read = True
        else:
            if not count_this_read:
                continue

        match = parser_re.match(aln.qname)
        CB = match.group('CB')
        if filter_cb:
            if CB not in cb_hist.index:
                nomatchcb += 1
                continue

        MB = match.group('MB')

        txid = sam_file.getrname(aln.reference_id)
        if gene_map:
            target_name = gene_map[txid]

        else:
            target_name = txid

        e_tuple = tuple_template.format(CB, target_name, aln.pos, MB)

        # Scale evidence by number of hits
        if no_scale_evidence:
            evidence[e_tuple] += 1.0
        else:
            evidence[e_tuple] += weigh_evidence(aln.tags)
        kept += 1

    tally_time = time.time() - start_tally
    logger.info('Tally done - {:.3}s, {:,} alns/min'.format(
        tally_time, int(60. * count / tally_time)))
    logger.info('Collapsing evidence')

    logger.info('Writing evidence')
    with tempfile.NamedTemporaryFile('w+t') as out_handle:
        for key in evidence:
            line = '{},{}\n'.format(key, evidence[key])
            out_handle.write(line)

        out_handle.flush()
        out_handle.seek(0)
        evidence_table = pd.read_csv(out_handle, header=None)

    del evidence

    evidence_query = 'evidence >= %f' % minevidence
    if positional:
        evidence_table.columns = ['cell', 'gene', 'umi', 'pos', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(
            ['cell', 'gene'])['umi', 'pos'].size()

    else:
        evidence_table.columns = ['cell', 'gene', 'umi', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(
            ['cell', 'gene'])['umi'].size()

    expanded = collapsed.unstack().T

    if gene_map:
        # This Series is just for sorting the index
        genes = pd.Series(index=set(gene_map.values()))
        genes = genes.sort_index()
        # Now genes is assigned to a DataFrame
        genes = expanded.ix[genes.index]

    else:
        # make data frame have a complete accounting of transcripts
        targets = pd.Series(index=set(targets))
        targets = targets.sort_index()
        expanded = expanded.reindex(targets.index.values, fill_value=0)
        genes = expanded

    genes.replace(pd.np.nan, 0, inplace=True)

    logger.info('Output results')

    if subsample:
        cb_hist_sampled.to_csv('ss_{}_'.format(subsample) +
                               os.path.basename(cb_histogram),
                               sep='\t')

    if output_evidence_table:
        # The evidence rows now live in evidence_table (the temporary file and
        # the raw dict are gone), so write that table out directly.
        evidence_table.to_csv(output_evidence_table, index=False, header=False)

    if sparse:
        pd.Series(genes.index).to_csv(out + ".rownames", index=False)
        pd.Series(genes.columns.values).to_csv(out + ".colnames", index=False)
        with open(out, "w+b") as out_handle:
            scipy.io.mmwrite(out_handle, scipy.sparse.csr_matrix(genes))
    else:
        genes.to_csv(out)
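Every tagcount variant on this page recovers the cellular barcode and UMI from the read name with the same regex. A self-contained illustration; the read name is made up but follows the :CELL_...:UMI_... convention the pattern expects:

import re

parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')
name = 'HWI-D00119:50:H7AP8ADXX:1:1101:2100:2202:CELL_ACGTACGT:UMI_GGCATTAC'
match = parser_re.match(name)
print(match.group('CB'), match.group('MB'))  # -> ACGTACGT GGCATTAC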
Example #3
File: umis.py Project: Teichlab/umis
def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence):
    ''' Count up evidence for tagged molecules
    '''
    from pysam import AlignmentFile
    from cStringIO import StringIO
    import pandas as pd

    from utils import weigh_evidence

    logger.info('Reading optional files')

    gene_map = None
    if genemap:
        with open(genemap) as fh:
            gene_map = dict(p.strip().split() for p in fh)

    if positional:
        tuple_template = '{0},{1},{2},{3}'
    else:
        tuple_template = '{0},{1},{3}'

    parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')

    logger.info('Tallying evidence')
    start_tally = time.time()

    evidence = collections.defaultdict(int)

    sam_file = AlignmentFile(sam, mode='r')
    track = sam_file.fetch(until_eof=True)
    for i, aln in enumerate(track):
        if aln.is_unmapped:
            continue

        match = parser_re.match(aln.qname)
        CB = match.group('CB')
        MB = match.group('MB')

        txid = sam_file.getrname(aln.reference_id)
        if gene_map:
            target_name = gene_map[txid]

        else:
            target_name = txid

        e_tuple = tuple_template.format(CB, target_name, aln.pos, MB)

        # Scale evidence by number of hits
        evidence[e_tuple] += weigh_evidence(aln.tags)

    tally_time = time.time() - start_tally
    logger.info('Tally done - {:.3}s, {:,} alns/min'.format(tally_time, int(60. * i / tally_time)))
    logger.info('Collapsing evidence')

    buf = StringIO()
    for key in evidence:
        line = '{},{}\n'.format(key, evidence[key])
        buf.write(line)

    buf.seek(0)
    evidence_table = pd.read_csv(buf)
    evidence_query = 'evidence >= %f' % minevidence
    if positional:
        evidence_table.columns = ['cell', 'gene', 'umi', 'pos', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi', 'pos'].size()

    else:
        evidence_table.columns = ['cell', 'gene', 'umi', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi'].size()

    expanded = collapsed.unstack().T

    if gene_map:
        # This Series is just for sorting the index
        genes = pd.Series(index=set(gene_map.values()))
        genes = genes.sort_index()
        # Now genes is assigned to a DataFrame
        genes = expanded.ix[genes.index]

    else:
        genes = expanded

    genes.replace(pd.np.nan, 0, inplace=True)

    logger.info('Output results')

    if output_evidence_table:
        import shutil
        buf.seek(0)
        with open(output_evidence_table, 'w') as etab_fh:
            shutil.copyfileobj(buf, etab_fh)

    genes.to_csv(out)
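weigh_evidence is imported from the project's utils module and is not shown on this page. Consistent with the "Scale evidence by number of hits" comment, a minimal sketch would divide each alignment's contribution by its NH (number of reported hits) tag; treat this as an assumption, not the project's exact implementation:

def weigh_evidence(tags):
    # tags is pysam's list of (tag, value) pairs for one alignment.
    # An alignment reported NH times contributes 1/NH, so a multi-mapping
    # read adds up to at most one unit of evidence across its hits.
    return 1.0 / dict(tags).get('NH', 1)

print(weigh_evidence([('NH', 4), ('AS', 76)]))  # -> 0.25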
Example #4
File: umis.py Project: lylamha/umis
def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence,
             cb_histogram, cb_cutoff, no_scale_evidence):
    ''' Count up evidence for tagged molecules
    '''
    from pysam import AlignmentFile
    from cStringIO import StringIO
    import pandas as pd

    from utils import weigh_evidence

    logger.info('Reading optional files')

    gene_map = None
    if genemap:
        with open(genemap) as fh:
            gene_map = dict(p.strip().split() for p in fh)

    if positional:
        tuple_template = '{0},{1},{2},{3}'
    else:
        tuple_template = '{0},{1},{3}'

    cb_set = set()
    if cb_histogram:
        with open(cb_histogram) as fh:
            cb_map = dict(p.strip().split() for p in fh)
            cb_set = set([k for k, v in cb_map.items() if int(v) > cb_cutoff])
            logger.info('Keeping %d out of %d cellular barcodes.' %
                        (len(cb_set), len(cb_map)))

    parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')

    logger.info('Tallying evidence')
    start_tally = time.time()

    evidence = collections.defaultdict(int)

    sam_mode = 'r' if sam.endswith(".sam") else 'rb'
    sam_file = AlignmentFile(sam, mode=sam_mode)
    track = sam_file.fetch(until_eof=True)
    count = 0
    kept = 0
    for i, aln in enumerate(track):
        count += 1
        if not count % 100000:
            logger.info("Processed %d alignments, kept %d." % (count, kept))

        if aln.is_unmapped:
            continue

        match = parser_re.match(aln.qname)
        CB = match.group('CB')
        if cb_set and CB not in cb_set:
            continue
        MB = match.group('MB')

        txid = sam_file.getrname(aln.reference_id)
        if gene_map:
            target_name = gene_map[txid]

        else:
            target_name = txid

        e_tuple = tuple_template.format(CB, target_name, aln.pos, MB)

        # Scale evidence by number of hits
        if no_scale_evidence:
            evidence[e_tuple] += 1.0
        else:
            evidence[e_tuple] += weigh_evidence(aln.tags)
        kept += 1

    tally_time = time.time() - start_tally
    logger.info('Tally done - {:.3}s, {:,} alns/min'.format(
        tally_time, int(60. * i / tally_time)))
    logger.info('Collapsing evidence')

    buf = StringIO()
    for key in evidence:
        line = '{},{}\n'.format(key, evidence[key])
        buf.write(line)

    buf.seek(0)
    evidence_table = pd.read_csv(buf)
    evidence_query = 'evidence >= %f' % minevidence
    if positional:
        evidence_table.columns = ['cell', 'gene', 'umi', 'pos', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(
            ['cell', 'gene'])['umi', 'pos'].size()

    else:
        evidence_table.columns = ['cell', 'gene', 'umi', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(
            ['cell', 'gene'])['umi'].size()

    expanded = collapsed.unstack().T

    if gene_map:
        # This Series is just for sorting the index
        genes = pd.Series(index=set(gene_map.values()))
        genes = genes.sort_index()
        # Now genes is assigned to a DataFrame
        genes = expanded.ix[genes.index]

    else:
        genes = expanded

    genes.replace(pd.np.nan, 0, inplace=True)

    logger.info('Output results')

    if output_evidence_table:
        import shutil
        buf.seek(0)
        with open(output_evidence_table, 'w') as etab_fh:
            shutil.copyfileobj(buf, etab_fh)

    genes.to_csv(out)
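This variant parses the barcode histogram by hand rather than with pandas. A self-contained illustration of the format it assumes (one barcode and its read count per line, whitespace-separated; the contents are made up):

from io import StringIO

cb_cutoff = 100
fh = StringIO("ACGTACGT\t5234\nTTGCAGGA\t12\n")  # stands in for open(cb_histogram)
cb_map = dict(p.strip().split() for p in fh)
cb_set = set(k for k, v in cb_map.items() if int(v) > cb_cutoff)
print(cb_set)  # -> {'ACGTACGT'}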
Example #5
def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
                 cb_cutoff, subsample, parse_tags, gene_tags, umi_matrix):
    ''' Count up evidence for tagged molecules, this implementation assumes the
    alignment file is coordinate sorted
    '''
    from pysam import AlignmentFile

    from io import StringIO
    import pandas as pd

    from utils import weigh_evidence

    if sam.endswith(".sam"):
        logger.error(
            "To use the fasttagcount subcommand, the alignment file must be a "
            "coordinate sorted, indexed BAM file.")
        sys.exit(1)

    logger.info('Reading optional files')

    gene_map = None
    if genemap:
        with open(genemap) as fh:
            try:
                gene_map = dict(p.strip().split() for p in fh)
            except ValueError:
                logger.error('Incorrectly formatted gene_map, need to be tsv.')
                sys.exit()

    if positional:
        tuple_template = '{0},{1},{2},{3}'
    else:
        tuple_template = '{0},{1},{3}'

    if not cb_cutoff:
        cb_cutoff = 0

    if cb_histogram and cb_cutoff == "auto":
        cb_cutoff = guess_depth_cutoff(cb_histogram)

    cb_cutoff = int(cb_cutoff)

    cb_hist = None
    filter_cb = False
    if cb_histogram:
        cb_hist = pd.read_table(cb_histogram,
                                index_col=0,
                                header=-1,
                                squeeze=True)
        total_num_cbs = cb_hist.shape[0]
        cb_hist = cb_hist[cb_hist > cb_cutoff]
        logger.info('Keeping {} out of {} cellular barcodes.'.format(
            cb_hist.shape[0], total_num_cbs))
        filter_cb = True

    parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')

    if subsample:
        logger.info(
            'Creating reservoir of subsampled reads ({} per cell)'.format(
                subsample))
        start_sampling = time.time()

        reservoir = collections.defaultdict(list)
        cb_hist_sampled = 0 * cb_hist
        cb_obs = 0 * cb_hist

        track = stream_bamfile(sam)
        current_read = 'none_observed_yet'
        for i, aln in enumerate(track):
            if aln.qname == current_read:
                continue

            current_read = aln.qname

            if parse_tags:
                CB = aln.get_tag('CR')
            else:
                match = parser_re.match(aln.qname)
                CB = match.group('CB')

            if CB not in cb_hist.index:
                continue

            cb_obs[CB] += 1
            if len(reservoir[CB]) < subsample:
                reservoir[CB].append(i)
                cb_hist_sampled[CB] += 1
            else:
                s = pd.np.random.randint(0, cb_obs[CB])
                if s < subsample:
                    reservoir[CB][s] = i

        index_filter = set(itertools.chain.from_iterable(reservoir.values()))
        sam_file.close()
        sampling_time = time.time() - start_sampling
        logger.info('Sampling done - {:.3}s'.format(sampling_time))

    evidence = collections.defaultdict(lambda: collections.defaultdict(float))
    bare_evidence = collections.defaultdict(float)
    logger.info('Tallying evidence')
    start_tally = time.time()

    sam_mode = 'r' if sam.endswith(".sam") else 'rb'
    sam_file = AlignmentFile(sam, mode=sam_mode)
    transcript_map = collections.defaultdict(set)
    sam_transcripts = [x["SN"] for x in sam_file.header["SQ"]]
    if gene_map:
        for transcript, gene in gene_map.items():
            if transcript in sam_transcripts:
                transcript_map[gene].add(transcript)
    else:
        for transcript in sam_transcripts:
            transcript_map[transcript].add(transcript)
    missing_transcripts = set()
    alignments_processed = 0
    unmapped = 0
    kept = 0
    nomatchcb = 0
    current_read = 'none_observed_yet'
    current_transcript = None
    count_this_read = True
    transcripts_processed = 0
    genes_processed = 0
    cells = list(cb_hist.index)
    targets_seen = set()

    if umi_matrix:
        bare_evidence_handle = open(umi_matrix, "w")
        bare_evidence_handle.write(",".join(["gene"] + cells) + "\n")

    with open(out, "w") as out_handle:
        out_handle.write(",".join(["gene"] + cells) + "\n")
        for gene, transcripts in transcript_map.items():
            for transcript in transcripts:
                for aln in sam_file.fetch(transcript):
                    alignments_processed += 1

                    if aln.is_unmapped:
                        unmapped += 1
                        continue

                    if gene_tags and not aln.has_tag('GX'):
                        unmapped += 1
                        continue

                    if aln.qname != current_read:
                        current_read = aln.qname
                        # NOTE: `i` here is left over from the sampling pass
                        # above, which streamed the whole file; this per-transcript
                        # fetch loop does not re-enumerate alignments.
                        if subsample and i not in index_filter:
                            count_this_read = False
                            continue
                        else:
                            count_this_read = True
                    else:
                        if not count_this_read:
                            continue

                    if parse_tags:
                        CB = aln.get_tag('CR')
                    else:
                        match = parser_re.match(aln.qname)
                        CB = match.group('CB')

                    if filter_cb:
                        if CB not in cb_hist.index:
                            nomatchcb += 1
                            continue

                    if parse_tags:
                        MB = aln.get_tag('UM')
                    else:
                        MB = match.group('MB')

                    if gene_tags:
                        target_name = aln.get_tag('GX').split(',')[0]
                    else:
                        txid = sam_file.getrname(aln.reference_id)
                        if gene_map:
                            if txid in gene_map:
                                target_name = gene_map[txid]
                            else:
                                missing_transcripts.add(txid)
                                continue
                        else:
                            target_name = txid
                    targets_seen.add(target_name)

                    # Scale evidence by number of hits
                    evidence[CB][MB] += weigh_evidence(aln.tags)
                    bare_evidence[CB] += weigh_evidence(aln.tags)
                    kept += 1
                transcripts_processed += 1
                if not transcripts_processed % 1000:
                    logger.info("%d genes processed." % genes_processed)
                    logger.info("%d transcripts processed." %
                                transcripts_processed)
                    logger.info("%d alignments processed." %
                                alignments_processed)

            earray = []
            for cell in cells:
                umis = [
                    1 for _, v in evidence[cell].items() if v >= minevidence
                ]
                earray.append(str(sum(umis)))
            out_handle.write(",".join([gene] + earray) + "\n")
            earray = []
            if umi_matrix:
                for cell in cells:
                    earray.append(str(int(bare_evidence[cell])))
                bare_evidence_handle.write(",".join([gene] + earray) + "\n")

            evidence = collections.defaultdict(
                lambda: collections.defaultdict(int))
            bare_evidence = collections.defaultdict(int)
            genes_processed += 1

    if umi_matrix:
        bare_evidence_handle.close()

    # fill dataframe with missing values, sort and output
    df = pd.read_csv(out, index_col=0, header=0)
    targets = pd.Series(index=set(transcript_map.keys()))
    targets = targets.sort_index()
    df = df.reindex(targets.index.values, fill_value=0)
    df = df.sort_index()
    df.to_csv(out)

    if umi_matrix:
        df = pd.read_csv(umi_matrix, index_col=0, header=0)
        df = df.reindex(targets.index.values, fill_value=0)
        df = df.sort_index()
        df.to_csv(umi_matrix)
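The subsample branch shared by the fuller variants is per-cell reservoir sampling (Algorithm R). Stripped of the barcode bookkeeping, the core idea reduces to this sketch:

import random

def reservoir_sample(stream, k):
    # Keep a uniform random sample of k items from a stream of unknown length:
    # fill the reservoir with the first k items, then replace a random slot
    # with probability k/(n+1) for the (n+1)-th item.
    reservoir = []
    for n, item in enumerate(stream):
        if len(reservoir) < k:
            reservoir.append(item)
        else:
            s = random.randint(0, n)  # inclusive, so uniform over the n+1 items seen
            if s < k:
                reservoir[s] = item
    return reservoir

print(reservoir_sample(range(10000), 5))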
Example #6
File: umis.py Project: roryk/umis
def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence,
             cb_histogram, cb_cutoff, no_scale_evidence, subsample):
    ''' Count up evidence for tagged molecules
    '''
    from pysam import AlignmentFile

    from io import StringIO
    import pandas as pd

    from utils import weigh_evidence

    logger.info('Reading optional files')

    gene_map = None
    if genemap:
        with open(genemap) as fh:
            try:
                gene_map = dict(p.strip().split() for p in fh)
            except ValueError:
                logger.error('Incorrectly formatted gene_map, need to be tsv.')
                sys.exit()

    if positional:
        tuple_template = '{0},{1},{2},{3}'
    else:
        tuple_template = '{0},{1},{3}'

    if not cb_cutoff:
        cb_cutoff = 0

    if cb_histogram and cb_cutoff == "auto":
        cb_cutoff = guess_depth_cutoff(cb_histogram)

    cb_cutoff = int(cb_cutoff)

    cb_hist = None
    filter_cb = False
    if cb_histogram:
        cb_hist = pd.read_table(cb_histogram, index_col=0, header=-1, squeeze=True)
        total_num_cbs = cb_hist.shape[0]
        cb_hist = cb_hist[cb_hist > cb_cutoff]
        logger.info('Keeping {} out of {} cellular barcodes.'.format(cb_hist.shape[0], total_num_cbs))
        filter_cb = True

    parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')

    if subsample:
        logger.info('Creating reservoir of subsampled reads ({} per cell)'.format(subsample))
        start_sampling  = time.time()

        reservoir = collections.defaultdict(list)
        cb_hist_sampled = 0 * cb_hist
        cb_obs = 0 * cb_hist

        sam_mode = 'r' if sam.endswith(".sam") else 'rb'
        sam_file = AlignmentFile(sam, mode=sam_mode)
        track = sam_file.fetch(until_eof=True)
        current_read = 'none_observed_yet'
        for i, aln in enumerate(track):
            if aln.qname == current_read:
                continue

            current_read = aln.qname
            match = parser_re.match(aln.qname)
            CB = match.group('CB')

            if CB not in cb_hist.index:
                continue

            cb_obs[CB] += 1
            if len(reservoir[CB]) < subsample:
                reservoir[CB].append(i)
                cb_hist_sampled[CB] += 1
            else:
                s = pd.np.random.randint(0, cb_obs[CB])
                if s < subsample:
                    reservoir[CB][s] = i

        index_filter = set(itertools.chain.from_iterable(reservoir.values()))
        sam_file.close()
        sampling_time = time.time() - start_sampling
        logger.info('Sampling done - {:.3}s'.format(sampling_time))

    evidence = collections.defaultdict(int)

    logger.info('Tallying evidence')
    start_tally = time.time()

    sam_mode = 'r' if sam.endswith(".sam") else 'rb'
    sam_file = AlignmentFile(sam, mode=sam_mode)
    track = sam_file.fetch(until_eof=True)
    count = 0
    unmapped = 0
    kept = 0
    nomatchcb = 0
    current_read = 'none_observed_yet'
    count_this_read = True
    for i, aln in enumerate(track):
        count += 1
        if not count % 100000:
            logger.info("Processed %d alignments, kept %d." % (count, kept))
            logger.info("%d were filtered for being unmapped." % unmapped)
            if filter_cb:
                logger.info("%d were filtered for not matching known barcodes."
                            % nomatchcb)

        if aln.is_unmapped:
            unmapped += 1
            continue

        if aln.qname != current_read:
            current_read = aln.qname
            if subsample and i not in index_filter:
                count_this_read = False
                continue
            else:
                count_this_read = True
        else:
            if not count_this_read:
                continue

        match = parser_re.match(aln.qname)
        CB = match.group('CB')
        if filter_cb:
            if CB not in cb_hist.index:
                nomatchcb += 1
                continue

        MB = match.group('MB')

        txid = sam_file.getrname(aln.reference_id)
        if gene_map:
            target_name = gene_map[txid]

        else:
            target_name = txid

        e_tuple = tuple_template.format(CB, target_name, aln.pos, MB)

        # Scale evidence by number of hits
        if no_scale_evidence:
            evidence[e_tuple] += 1.0
        else:
            evidence[e_tuple] += weigh_evidence(aln.tags)
        kept += 1

    tally_time = time.time() - start_tally
    logger.info('Tally done - {:.3}s, {:,} alns/min'.format(tally_time, int(60. * count / tally_time)))
    logger.info('Collapsing evidence')

    buf = StringIO()
    for key in evidence:
        line = '{},{}\n'.format(key, evidence[key])
        buf.write(unicode(line))

    buf.seek(0)
    evidence_table = pd.read_csv(buf)
    evidence_query = 'evidence >= %f' % minevidence
    if positional:
        evidence_table.columns = ['cell', 'gene', 'umi', 'pos', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi', 'pos'].size()

    else:
        evidence_table.columns = ['cell', 'gene', 'umi', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi'].size()

    expanded = collapsed.unstack().T

    if gene_map:
        # This Series is just for sorting the index
        genes = pd.Series(index=set(gene_map.values()))
        genes = genes.sort_index()
        # Now genes is assigned to a DataFrame
        genes = expanded.ix[genes.index]

    else:
        genes = expanded

    genes.replace(pd.np.nan, 0, inplace=True)

    logger.info('Output results')

    if subsample:
        cb_hist_sampled.to_csv('ss_{}_'.format(subsample) + os.path.basename(cb_histogram), sep='\t')

    if output_evidence_table:
        import shutil
        buf.seek(0)
        with open(output_evidence_table, 'w') as etab_fh:
            shutil.copyfileobj(buf, etab_fh)

    genes.to_csv(out)
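The collapse step is identical in every tagcount variant: filter the evidence table by minevidence, count the surviving (cell, gene, umi) rows per (cell, gene), then pivot genes onto rows. A toy run with made-up values:

import pandas as pd

evidence_table = pd.DataFrame({
    'cell': ['A', 'A', 'A', 'B'],
    'gene': ['g1', 'g1', 'g2', 'g1'],
    'umi':  ['u1', 'u2', 'u1', 'u1'],
    'evidence': [1.0, 0.4, 2.0, 1.0],
})
collapsed = evidence_table.query('evidence >= 1.0').groupby(['cell', 'gene'])['umi'].size()
expanded = collapsed.unstack().T
print(expanded)
# One row per gene, one column per cell; (A, g1, u2) fell below the cutoff,
# and NaN marks cells with no surviving UMIs for a gene.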
Example #7
File: umis.py Project: vals/umis
def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram, 
                 cb_cutoff, subsample, parse_tags, gene_tags, umi_matrix):
    ''' Count up evidence for tagged molecules, this implementation assumes the
    alignment file is coordinate sorted
    '''
    from pysam import AlignmentFile

    from io import StringIO
    import pandas as pd

    from utils import weigh_evidence

    if sam.endswith(".sam"):
        logger.error("To use the fasttagcount subcommand, the alignment file must be a "
                     "coordinate sorted, indexed BAM file.")
        sys.exit(1)

    logger.info('Reading optional files')

    gene_map = None
    if genemap:
        with open(genemap) as fh:
            try:
                gene_map = dict(p.strip().split() for p in fh)
            except ValueError:
                logger.error('Incorrectly formatted gene_map, need to be tsv.')
                sys.exit()

    if positional:
        tuple_template = '{0},{1},{2},{3}'
    else:
        tuple_template = '{0},{1},{3}'

    if not cb_cutoff:
        cb_cutoff = 0

    if cb_histogram and cb_cutoff == "auto":
        cb_cutoff = guess_depth_cutoff(cb_histogram)

    cb_cutoff = int(cb_cutoff)

    cb_hist = None
    filter_cb = False
    if cb_histogram:
        cb_hist = pd.read_csv(cb_histogram, index_col=0, header=-1, squeeze=True, sep="\t")
        total_num_cbs = cb_hist.shape[0]
        cb_hist = cb_hist[cb_hist > cb_cutoff]
        logger.info('Keeping {} out of {} cellular barcodes.'.format(cb_hist.shape[0], total_num_cbs))
        filter_cb = True

    parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')

    if subsample:
        logger.info('Creating reservoir of subsampled reads ({} per cell)'.format(subsample))
        start_sampling  = time.time()

        reservoir = collections.defaultdict(list)
        cb_hist_sampled = 0 * cb_hist
        cb_obs = 0 * cb_hist

        track = stream_bamfile(sam)
        current_read = 'none_observed_yet'
        for i, aln in enumerate(track):
            if aln.qname == current_read:
                continue

            current_read = aln.qname

            if parse_tags:
                CB = aln.get_tag('CR')
            else:
                match = parser_re.match(aln.qname)
                CB = match.group('CB')

            if CB not in cb_hist.index:
                continue

            cb_obs[CB] += 1
            if len(reservoir[CB]) < subsample:
                reservoir[CB].append(i)
                cb_hist_sampled[CB] += 1
            else:
                s = pd.np.random.randint(0, cb_obs[CB])
                if s < subsample:
                    reservoir[CB][s] = i

        index_filter = set(itertools.chain.from_iterable(reservoir.values()))
        sam_file.close()
        sampling_time = time.time() - start_sampling
        logger.info('Sampling done - {:.3}s'.format(sampling_time))

    evidence = collections.defaultdict(lambda: collections.defaultdict(float))
    bare_evidence = collections.defaultdict(float)
    logger.info('Tallying evidence')
    start_tally = time.time()

    sam_mode = 'r' if sam.endswith(".sam") else 'rb'
    sam_file = AlignmentFile(sam, mode=sam_mode)
    transcript_map = collections.defaultdict(set)
    sam_transcripts = [x["SN"] for x in sam_file.header["SQ"]]
    if gene_map:
        for transcript, gene in gene_map.items():
            if transcript in sam_transcripts:
                transcript_map[gene].add(transcript)
    else:
        for transcript in sam_transcripts:
            transcript_map[transcript].add(transcript)
    missing_transcripts = set()
    alignments_processed = 0
    unmapped = 0
    kept = 0
    nomatchcb = 0
    current_read = 'none_observed_yet'
    current_transcript = None
    count_this_read = True
    transcripts_processed = 0
    genes_processed = 0
    cells = list(cb_hist.index)
    targets_seen = set()

    if umi_matrix:
        bare_evidence_handle = open(umi_matrix, "w")
        bare_evidence_handle.write(",".join(["gene"] + cells) + "\n")

    with open(out, "w") as out_handle:
        out_handle.write(",".join(["gene"] + cells) + "\n")
        for gene, transcripts in transcript_map.items():
            for transcript in transcripts:
                for aln in sam_file.fetch(transcript):
                    alignments_processed += 1

                    if aln.is_unmapped:
                        unmapped += 1
                        continue

                    if gene_tags and not aln.has_tag('GX'):
                        unmapped += 1
                        continue

                    if aln.qname != current_read:
                        current_read = aln.qname
                        # NOTE: `i` here is left over from the sampling pass
                        # above, which streamed the whole file; this per-transcript
                        # fetch loop does not re-enumerate alignments.
                        if subsample and i not in index_filter:
                            count_this_read = False
                            continue
                        else:
                            count_this_read = True
                    else:
                        if not count_this_read:
                            continue

                    if parse_tags:
                        CB = aln.get_tag('CR')
                    else:
                        match = parser_re.match(aln.qname)
                        CB = match.group('CB')

                    if filter_cb:
                        if CB not in cb_hist.index:
                            nomatchcb += 1
                            continue

                    if parse_tags:
                        MB = aln.get_tag('UM')
                    else:
                        MB = match.group('MB')

                    if gene_tags:
                        target_name = aln.get_tag('GX').split(',')[0]
                    else:
                        txid = sam_file.getrname(aln.reference_id)
                        if gene_map:
                            if txid in gene_map:
                                target_name = gene_map[txid]
                            else:
                                missing_transcripts.add(txid)
                                continue
                        else:
                            target_name = txid
                    targets_seen.add(target_name)

                    # Scale evidence by number of hits
                    evidence[CB][MB] += weigh_evidence(aln.tags)
                    bare_evidence[CB] += weigh_evidence(aln.tags)
                    kept += 1
                transcripts_processed += 1
                if not transcripts_processed % 1000:
                    logger.info("%d genes processed." % genes_processed)
                    logger.info("%d transcripts processed." % transcripts_processed)
                    logger.info("%d alignments processed." % alignments_processed)

            earray = []
            for cell in cells:
                umis = [1 for _, v in evidence[cell].items() if v >= minevidence]
                earray.append(str(sum(umis)))
            out_handle.write(",".join([gene] + earray) + "\n")
            earray = []
            if umi_matrix:
                for cell in cells:
                    earray.append(str(int(bare_evidence[cell])))
                bare_evidence_handle.write(",".join([gene] + earray) + "\n")

            evidence = collections.defaultdict(lambda: collections.defaultdict(int))
            bare_evidence = collections.defaultdict(int)
            genes_processed += 1

    if umi_matrix:
        bare_evidence_handle.close()

    # fill dataframe with missing values, sort and output
    df = pd.read_csv(out, index_col=0, header=0)
    targets = pd.Series(index=set(transcript_map.keys()))
    targets = targets.sort_index()
    df = df.reindex(targets.index.values, fill_value=0)
    df = df.sort_index()
    df.to_csv(out)

    if umi_matrix:
        df = pd.read_csv(umi_matrix, index_col=0, header=0)
        df = df.reindex(targets.index.values, fill_value=0)
        df = df.sort_index()
        df.to_csv(umi_matrix)
Example #8
File: umis.py Project: vals/umis
def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence,
             cb_histogram, cb_cutoff, no_scale_evidence, subsample, sparse,
             parse_tags, gene_tags):
    ''' Count up evidence for tagged molecules
    '''
    from pysam import AlignmentFile

    from io import StringIO
    import pandas as pd

    from utils import weigh_evidence

    logger.info('Reading optional files')

    gene_map = None
    if genemap:
        with open(genemap) as fh:
            try:
                gene_map = dict(p.strip().split() for p in fh)
            except ValueError:
                logger.error('Incorrectly formatted gene_map, need to be tsv.')
                sys.exit()

    if positional:
        tuple_template = '{0},{1},{2},{3}'
    else:
        tuple_template = '{0},{1},{3}'

    if not cb_cutoff:
        cb_cutoff = 0

    if cb_histogram and cb_cutoff == "auto":
        cb_cutoff = guess_depth_cutoff(cb_histogram)

    cb_cutoff = int(cb_cutoff)

    cb_hist = None
    filter_cb = False
    if cb_histogram:
        cb_hist = pd.read_csv(cb_histogram, index_col=0, header=-1, squeeze=True, sep="\t")
        total_num_cbs = cb_hist.shape[0]
        cb_hist = cb_hist[cb_hist > cb_cutoff]
        logger.info('Keeping {} out of {} cellular barcodes.'.format(cb_hist.shape[0], total_num_cbs))
        filter_cb = True

    parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')

    if subsample:
        logger.info('Creating reservoir of subsampled reads ({} per cell)'.format(subsample))
        start_sampling  = time.time()

        reservoir = collections.defaultdict(list)
        cb_hist_sampled = 0 * cb_hist
        cb_obs = 0 * cb_hist

        track = stream_bamfile(sam)
        current_read = 'none_observed_yet'
        for i, aln in enumerate(track):
            if aln.qname == current_read:
                continue

            current_read = aln.qname

            if parse_tags:
                CB = aln.get_tag('CR')
            else:
                match = parser_re.match(aln.qname)
                CB = match.group('CB')

            if CB not in cb_hist.index:
                continue

            cb_obs[CB] += 1
            if len(reservoir[CB]) < subsample:
                reservoir[CB].append(i)
                cb_hist_sampled[CB] += 1
            else:
                s = pd.np.random.randint(0, cb_obs[CB])
                if s < subsample:
                    reservoir[CB][s] = i

        index_filter = set(itertools.chain.from_iterable(reservoir.values()))
        sam_file.close()
        sampling_time = time.time() - start_sampling
        logger.info('Sampling done - {:.3}s'.format(sampling_time))

    evidence = collections.defaultdict(int)

    logger.info('Tallying evidence')
    start_tally = time.time()

    sam_mode = 'r' if sam.endswith(".sam") else 'rb'
    sam_file = AlignmentFile(sam, mode=sam_mode)
    targets = [x["SN"] for x in sam_file.header["SQ"]]
    track = sam_file.fetch(until_eof=True)
    count = 0
    unmapped = 0
    kept = 0
    nomatchcb = 0
    current_read = 'none_observed_yet'
    count_this_read = True
    missing_transcripts = set()
    for i, aln in enumerate(track):
        if count and not count % 1000000:
            logger.info("Processed %d alignments, kept %d." % (count, kept))
            logger.info("%d were filtered for being unmapped." % unmapped)
            if filter_cb:
                logger.info("%d were filtered for not matching known barcodes."
                            % nomatchcb)
        count += 1

        if aln.is_unmapped:
            unmapped += 1
            continue

        if gene_tags and not aln.has_tag('GX'):
            unmapped += 1
            continue

        if aln.qname != current_read:
            current_read = aln.qname
            if subsample and i not in index_filter:
                count_this_read = False
                continue
            else:
                count_this_read = True
        else:
            if not count_this_read:
                continue

        if parse_tags:
            CB = aln.get_tag('CR')
        else:
            match = parser_re.match(aln.qname)
            CB = match.group('CB')

        if filter_cb:
            if CB not in cb_hist.index:
                nomatchcb += 1
                continue

        if parse_tags:
            MB = aln.get_tag('UM')
        else:
            MB = match.group('MB')

        if gene_tags:
            target_name = aln.get_tag('GX').split(',')[0]
        else:
            txid = sam_file.getrname(aln.reference_id)
            if gene_map:
                if txid in gene_map:
                    target_name = gene_map[txid]
                else:
                    missing_transcripts.add(txid)
                    target_name = txid
            else:
                target_name = txid

        e_tuple = tuple_template.format(CB, target_name, aln.pos, MB)

        # Scale evidence by number of hits
        if no_scale_evidence:
            evidence[e_tuple] += 1.0
        else:
            evidence[e_tuple] += weigh_evidence(aln.tags)

        kept += 1

    tally_time = time.time() - start_tally
    if missing_transcripts:
        logger.warn('The following transcripts were missing gene_ids, so we added them as the transcript ids: %s' % str(missing_transcripts))
    logger.info('Tally done - {:.3}s, {:,} alns/min'.format(tally_time, int(60. * count / tally_time)))
    logger.info('Collapsing evidence')

    logger.info('Writing evidence')
    with tempfile.NamedTemporaryFile('w+t') as out_handle:
        for key in evidence:
            line = '{},{}\n'.format(key, evidence[key])
            out_handle.write(line)

        out_handle.flush()
        out_handle.seek(0)

        evidence_table = pd.read_csv(out_handle, header=None)

    del evidence

    evidence_query = 'evidence >= %f' % minevidence
    if positional:
        evidence_table.columns = ['cell', 'gene', 'umi', 'pos', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi', 'pos'].size()

    else:
        evidence_table.columns = ['cell', 'gene', 'umi', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi'].size()

    expanded = collapsed.unstack().T

    if gene_map:
        # This Series is just for sorting the index
        genes = pd.Series(index=set(gene_map.values()))
        genes = genes.sort_index()
        # Now genes is assigned to a DataFrame
        genes = expanded.ix[genes.index]

    elif gene_tags:
        expanded = expanded.sort_index()
        genes = expanded

    else:
        # make data frame have a complete accounting of transcripts
        targets = pd.Series(index=set(targets))
        targets = targets.sort_index()
        expanded = expanded.reindex(targets.index.values, fill_value=0)
        genes = expanded

    genes.fillna(0, inplace=True)
    genes = genes.astype(int)
    genes.index.name = "gene"

    logger.info('Output results')

    if subsample:
        cb_hist_sampled.to_csv('ss_{}_'.format(subsample) + os.path.basename(cb_histogram), sep='\t')

    if output_evidence_table:
        # The evidence rows now live in evidence_table (the temporary file and
        # the raw dict are gone), so write that table out directly.
        evidence_table.to_csv(output_evidence_table, index=False, header=False)

    if sparse:
        pd.Series(genes.index).to_csv(out + ".rownames", index=False, header=False)
        pd.Series(genes.columns.values).to_csv(out + ".colnames", index=False, header=False)
        with open(out, "w+b") as out_handle:
            scipy.io.mmwrite(out_handle, scipy.sparse.csr_matrix(genes))
    else:
        genes.to_csv(out)
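The sparse branch writes the count matrix in Matrix Market format alongside separate row- and column-name files. A toy equivalent of that branch; the file names are placeholders:

import pandas as pd
import scipy.io
import scipy.sparse

genes = pd.DataFrame([[0, 2], [1, 0]],
                     index=['g1', 'g2'], columns=['cellA', 'cellB'])
pd.Series(genes.index).to_csv('counts.mtx.rownames', index=False, header=False)
pd.Series(genes.columns.values).to_csv('counts.mtx.colnames', index=False, header=False)
with open('counts.mtx', 'w+b') as out_handle:
    scipy.io.mmwrite(out_handle, scipy.sparse.csr_matrix(genes))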
Example #9
def pesam2bed(input_file,
              chrlen_file,
              output_file,
              logfile,
              errfile,
              filter_function=check_good_read):
    '''
    Converts an aligned BAM file into a BED file of fragments, filtering out
    reads that do not pass the filtering criteria.

    Input:
        input_file - aligned (unsorted) paired-end BAM
        chrlen_file - TSV file of format chromosome<TAB>length
        output_file - output file (BED file with start and end corresponding to both ends of the PE fragment)
        logfile, errfile - logging files
        filter_function - function that accepts an aligned read (pysam.AlignedSegment) and returns True, False, or a weight for the fragment
                            Default: utils.check_good_read
                            Note: the function can return a weight, to accommodate reads from repetitive sequences

    Output:
        None (written into output file)

    See also:
        check_good_read
    '''

    samfile = AlignmentFile(input_file)

    take_read = [filter_function(x) for x in samfile]
    # check_good_read also checks that this is R1...
    samfile.reset()

    logfile.write("%d out of %d fragments passed filtering\n" %
                  (sum(take_read), len(take_read) // 2))
    logfile.flush()
    chrlens = genomic_utils.get_chr_lens(chrlen_file)

    outfile = open(output_file, 'w')

    count = 0
    for i, r in enumerate(samfile):
        count += 1
        if count % 1000000 == 0:
            logfile.write("pesam2bed: %s, Read # %d\n" % (input_file, count))
            logfile.flush()

        read_1 = r
        if take_read[i] == 0:
            continue
        if not read_1.is_reverse:
            rstart = read_1.reference_start

            start = max(0, rstart)
            rend = rstart + read_1.template_length

            end = min(rend, chrlens[samfile.getrname(read_1.reference_id)])
            if start > end:
                continue
            try:
                outfile.write(samfile.getrname(read_1.reference_id))
                outfile.write('\t')
                outfile.write('%d\t%d\t%s\t%d\t+\n' %
                              (start, end, read_1.qname, take_read[i] * 100))
            except ValueError:
                errfile.write(
                    "***** pesam2bed failed with the following value\n")
                errfile.write(samfile.getrname(read_1.reference_id))
                errfile.write(' %d\t%d\t%s\t%d\t-\n' %
                              (start, end, read_1.qname, take_read[i] * 100))
                errfile.flush()
                sys.exit(-1)

        else:
            rstart = read_1.reference_end - abs(read_1.template_length)

            start = max(0, rstart)
            rend = read_1.reference_end
            end = min(rend, chrlens[samfile.getrname(read_1.reference_id)])
            if start > end:
                continue
            try:
                outfile.write(samfile.getrname(read_1.reference_id))
                outfile.write('\t')
                outfile.write('%d\t%d\t%s\t%d\t-\n' %
                              (start, end, read_1.qname, take_read[i] * 100))
            except ValueError:
                errfile.write(
                    "***** pesam2bed failed with the following value\n")
                errfile.write(samfile.getrname(read_1.reference_id))
                errfile.write(' %d\t%d\t%s\t%d\t-\n' %
                              (start, end, read_1.qname, take_read[i] * 100))
                errfile.flush()
                sys.exit(-1)

    outfile.close()
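The fragment arithmetic above can be checked in isolation; a tiny sketch with made-up coordinates showing that the forward- and reverse-strand branches recover the same fragment:

# Forward-strand R1: the fragment runs from reference_start for template_length bases.
rstart, tlen = 100, 250
start, end = max(0, rstart), rstart + tlen
assert (start, end) == (100, 350)

# Reverse-strand R1: the fragment ends at reference_end and extends back |TLEN| bases.
reference_end, tlen = 350, -250
start, end = max(0, reference_end - abs(tlen)), reference_end
assert (start, end) == (100, 350)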