示例#1
0
def main(args, outs):
    """Build raw and filtered gene-barcode matrices for one molecule chunk.

    Opens the raw molecule file for reading (restricted by ``chunk_start``
    and ``chunk_len``), builds the raw matrices from it, filters them to the
    barcodes loaded from ``args.filtered_barcodes`` and writes:

    * the raw matrices as H5, MEX and a barcode-summary H5,
    * the filtered matrices as H5 and MEX,
    * the filtered molecules file (via ``write_filtered_molecules``),
    * the summary returned by ``write_filtered_molecules`` as JSON.

    Args:
        args: Stage inputs. Reads ``raw_molecules``, ``chunk_start``,
            ``chunk_len`` and ``filtered_barcodes``.
        outs: Stage outputs. Reads the destination paths
            ``raw_matrices_h5``, ``raw_matrices_mex``,
            ``barcode_summary_h5``, ``filtered_matrices_h5``,
            ``filtered_matrices_mex``, ``filtered_molecules`` and
            ``summary``.
    """
    # Open the reader as a context manager (the writer below already relies
    # on MoleculeCounter supporting this) so that the handle is released on
    # both the success path and when any step in between raises.
    with cr_mol_counter.MoleculeCounter.open(
            args.raw_molecules,
            'r',
            start=int(args.chunk_start),
            length=int(args.chunk_len)) as molecule_counter:

        filtered_bcs_per_genome = cr_utils.load_barcode_csv(
            args.filtered_barcodes)

        raw_matrices = cr_matrix.GeneBCMatrices.build_from_mol_counter(
            molecule_counter)
        filtered_matrices = raw_matrices.filter_barcodes(
            filtered_bcs_per_genome)

        raw_matrices.save_h5(outs.raw_matrices_h5)
        raw_matrices.save_mex(outs.raw_matrices_mex)
        raw_matrices.save_barcode_summary_h5(outs.barcode_summary_h5)

        filtered_matrices.save_h5(outs.filtered_matrices_h5)
        filtered_matrices.save_mex(outs.filtered_matrices_mex)

        genome_ids = molecule_counter.get_ref_column('genome_ids')

        # The reader must stay open here: write_filtered_molecules consumes
        # it while filling the output counter.
        with cr_mol_counter.MoleculeCounter.open(outs.filtered_molecules,
                                                 'w') as ctr_out:
            summary = write_filtered_molecules(molecule_counter, ctr_out,
                                               genome_ids,
                                               filtered_bcs_per_genome)

    with open(outs.summary, 'w') as f:
        tk_json.dump_numpy(summary, f, pretty=True)
示例#2
0
def main(args, outs):
    """Subsample one molecule-info chunk and save the resulting matrices.

    Builds subsampled raw matrices from the requested slice of the molecule
    info file, filters them to the barcodes in ``args.filtered_barcodes``,
    records the subsampled duplication metrics in a reporter, and writes the
    reporter plus the raw and filtered matrices.

    Returns immediately, without writing anything, when
    ``args.subsample_info`` has no ``'subsample_rate'`` entry (or it is
    ``None``).

    Args:
        args: Stage inputs. Reads ``subsample_info`` (a dict with
            ``subsample_rate``, ``subsample_type``, ``target_rpc`` and
            ``all_target_rpc``), ``molecule_info``, ``chunk_start``,
            ``chunk_len`` and ``filtered_barcodes``.
        outs: Stage outputs. Reads ``chunked_reporter`` and sets
            ``subsampled_matrices`` to a dict holding the paths of the two
            matrix files written here.
    """
    # Seed NumPy's global RNG with a fixed value (presumably so that the
    # subsampling below is reproducible across runs).
    np.random.seed(0)

    subsample_info = args.subsample_info
    subsample_rate = subsample_info.get('subsample_rate')
    if subsample_rate is None:
        # No subsampling rate supplied for this chunk: nothing to do.
        return

    mol_counter = MoleculeCounter.open(args.molecule_info,
                                       'r',
                                       start=int(args.chunk_start),
                                       length=int(args.chunk_len))
    try:
        # Subsample the matrices. subsample_result is passed in as an output
        # dict; its 'mapped_reads' entry is read back below.
        subsample_result = {}
        subsampled_raw_mats = cr_matrix.GeneBCMatrices.build_from_mol_counter(
            mol_counter,
            subsample_rate=subsample_rate,
            subsample_result=subsample_result)

        # Filter the subsampled matrices
        filtered_bcs_per_genome = cr_utils.load_barcode_csv(
            args.filtered_barcodes)
        subsampled_filt_mats = subsampled_raw_mats.filter_barcodes(
            filtered_bcs_per_genome)

        # Calculations for subsampled duplication rate
        reporter = cr_report.Reporter(
            genomes=map(str, mol_counter.get_ref_column('genome_ids')),
            subsample_types=cr_constants.ALL_SUBSAMPLE_TYPES,
            subsample_depths=subsample_info['all_target_rpc'])

        reporter.subsampled_duplication_frac_cb(
            subsampled_raw_mats,
            mol_counter,
            subsample_rate,
            subsample_info['subsample_type'],
            subsample_info['target_rpc'],
            subsample_result['mapped_reads'],
        )
    finally:
        # Release the molecule info handle even if any step above raised.
        mol_counter.close()

    reporter.save(outs.chunked_reporter)

    outs.subsampled_matrices = {}
    outs.subsampled_matrices['raw_matrices'] = martian.make_path(
        'raw_matrices.h5')
    outs.subsampled_matrices['filtered_matrices'] = martian.make_path(
        'filtered_matrices.h5')

    subsampled_raw_mats.save_h5(outs.subsampled_matrices['raw_matrices'])
    subsampled_filt_mats.save_h5(outs.subsampled_matrices['filtered_matrices'])
示例#3
0
def main(args, outs):
    """Write a molecule info file from one chunk of barcode-grouped BAM reads.

    Reads are grouped by ``(gem group, barcode)`` using
    ``cr_utils.barcode_sort_key_no_umi``. Within each group, reads that are
    secondary, are read 2, are flagged by
    ``cr_utils.is_read_low_support_umi``, are not confidently mapped to a
    feature, or carry no UMI are skipped. The remaining reads are tallied
    per ``(UMI, library index, feature index)`` and appended to the molecule
    info columns, ordered by feature index within the barcode.

    A per-library count of usable reads (counted reads whose barcode is in
    the cell-associated set) is stored under
    ``cr_mol_counter.LIBRARIES_METRIC``.

    Args:
        args: Stage inputs. Reads ``barcode_whitelist``, ``reference_path``,
            ``feature_reference``, ``chunk_input`` and ``filtered_barcodes``.
        outs: Stage outputs. ``outs.output`` is the molecule info file that
            is created.
    """
    outs.coerce_strings()

    # Barcode whitelist plus a sequence -> whitelist position lookup
    whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_to_idx = OrderedDict(
        (seq, position) for position, seq in enumerate(whitelist))

    # Feature reference
    feature_ref = rna_feature_ref.from_transcriptome_and_csv(
        args.reference_path, args.feature_reference)

    # Library info is read from the BAM itself
    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    library_info = rna_library.get_bam_library_info(in_bam)

    # Cell-associated barcodes: per genome, and as one combined set
    filtered_bcs_by_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    filtered_bc_union = cr_utils.get_cell_associated_barcode_set(
        args.filtered_barcodes)

    barcode_info = MoleculeCounter.build_barcode_info(filtered_bcs_by_genome,
                                                      library_info, whitelist)

    # Output molecule info file
    mc = MoleculeCounter.open(outs.output,
                              mode='w',
                              feature_ref=feature_ref,
                              barcodes=whitelist,
                              library_info=library_info,
                              barcode_info=barcode_info)

    col_dtype = MoleculeCounter.get_column_dtype
    usable_reads_key = cr_mol_counter.USABLE_READS_METRIC

    # One zeroed metrics dict per library, keyed by the stringified index
    lib_metrics = {}
    for lib_idx in xrange(len(library_info)):
        lib_metrics[str(lib_idx)] = {usable_reads_key: 0}

    # Record read-counts per molecule. Note that UMIs are not contiguous
    # in the input because no sorting was done after UMI correction.

    last_gem_group = None
    last_barcode_idx = None

    grouped_reads = itertools.groupby(in_bam,
                                      key=cr_utils.barcode_sort_key_no_umi)
    for (gem_group, barcode_seq), group_reads in grouped_reads:
        if barcode_seq is None:
            continue

        barcode_idx = barcode_to_idx[barcode_seq]

        # Check the expected sort order of the input BAM. Before any read
        # has been counted the "last" values are still None; the comparison
        # then relies on Python 2 ordering None below every other value.
        assert gem_group >= last_gem_group
        if gem_group == last_gem_group:
            assert barcode_idx >= last_barcode_idx

        formatted_bc = cr_utils.format_barcode_seq(barcode_seq, gem_group)
        is_cell_barcode = formatted_bc in filtered_bc_union

        # (umi, library index, feature index) -> number of reads
        counts = defaultdict(int)

        for read in group_reads:
            # ignore read2 to avoid double-counting. the mapping + annotation
            # should be equivalent.
            if (read.is_secondary or read.is_read2
                    or cr_utils.is_read_low_support_umi(read)
                    or not cr_utils.is_read_conf_mapped_to_feature(read)):
                continue

            umi_seq = cr_utils.get_read_umi(read)
            if umi_seq is None:
                continue

            umi_bits = col_dtype('umi').itemsize * 8
            umi_int = MoleculeCounter.compress_umi_seq(umi_seq, umi_bits)

            feature_ids = cr_utils.get_read_gene_ids(read)
            assert len(feature_ids) == 1
            feature_int = feature_ref.id_map[feature_ids[0]].index

            library_idx = cr_utils.get_read_library_index(read)

            counts[(umi_int, library_idx, feature_int)] += 1

            if is_cell_barcode:
                lib_metrics[str(library_idx)][usable_reads_key] += 1

            # Only advanced once a read of this group has been counted
            last_gem_group = gem_group
            last_barcode_idx = barcode_idx

        # Flush this barcode: one row per distinct molecule key
        n_molecules = len(counts)

        gg_int = col_dtype('gem_group').type(gem_group)
        mc.append_column('gem_group', np.repeat(gg_int, n_molecules))
        bc_int = col_dtype('barcode_idx').type(barcode_idx)
        mc.append_column('barcode_idx', np.repeat(bc_int, n_molecules))

        # Sort by feature for fast matrix construction; the same permutation
        # is applied to every remaining column so rows stay aligned.
        feature_ints = np.fromiter((key[2] for key in counts.iterkeys()),
                                   dtype=col_dtype('feature_idx'),
                                   count=n_molecules)
        order = np.argsort(feature_ints)
        mc.append_column('feature_idx', feature_ints[order])
        del feature_ints

        for column, key_pos in (('library_idx', 1), ('umi', 0)):
            column_ints = np.fromiter(
                (key[key_pos] for key in counts.iterkeys()),
                dtype=col_dtype(column),
                count=n_molecules)[order]
            mc.append_column(column, column_ints)
            del column_ints

        count_ints = np.fromiter(counts.itervalues(),
                                 dtype=col_dtype('count'),
                                 count=n_molecules)[order]
        mc.append_column('count', count_ints)
        del count_ints

    in_bam.close()

    mc.set_metric(cr_mol_counter.LIBRARIES_METRIC, dict(lib_metrics))

    mc.save()
示例#4
0
def split(args):
    """Define the chunks (subsampling definition x molecule-info slice).

    For every subsample type and target reads-per-cell depth whose required
    subsampling rate does not exceed 1.0, one subsampling definition is
    created. Each definition is then paired with every
    ``NUM_MOLECULE_INFO_ENTRIES_PER_CHUNK``-sized slice of the molecule info
    file to form a chunk.

    Args:
        args: Stage inputs. Reads ``filtered_barcodes`` and
            ``molecule_info``.

    Returns:
        A dict with a ``'chunks'`` list and, unless there are no
        cell-associated barcodes, a ``'join'`` dict carrying the join memory
        request. When there are no cells, or no subsampling definition
        qualifies, a single chunk with an empty ``subsample_info`` is
        returned.
    """
    # Number of distinct cell-associated barcodes across all genomes
    filtered_bcs_per_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    cell_barcodes = set()
    for _, genome_bcs in filtered_bcs_per_genome.iteritems():
        cell_barcodes.update(genome_bcs)
    n_cells = len(cell_barcodes)

    if n_cells == 0:
        empty_chunk = {
            'chunk_start': 0,
            'chunk_len': 0,
            'subsample_info': {},
        }
        return {'chunks': [empty_chunk]}

    # Pull everything needed from the molecule info while it is open
    with MoleculeCounter.open(args.molecule_info, 'r') as mol_counter:
        n_molecule_info_entries = mol_counter.nrows()
        barcode_whitelist = mol_counter.get_barcode_whitelist()
        gem_groups = mol_counter.get_gem_groups()
        raw_reads = mol_counter.get_total_raw_reads()
        mapped_reads = mol_counter.get_total_conf_mapped_filtered_bc_reads()

    raw_rpc = tk_stats.robust_divide(raw_reads, n_cells)
    mapped_read_frac = tk_stats.robust_divide(mapped_reads, raw_reads)

    # Extra target depths at each decile of the raw reads per cell
    subsampling_deciles = [
        round(decile * raw_rpc) for decile in np.arange(0.1, 1.1, 0.1)
    ] if raw_reads > 0 else []

    # All target depths
    target_rpcs = cr_constants.SUBSAMPLE_READS_PER_CELL + subsampling_deciles

    # Raw-read targets are scaled by the mapped fraction to obtain a
    # mapped-read target; mapped-read targets are used unscaled.
    type_multipliers = [
        (cr_constants.RAW_SUBSAMPLE_TYPE, mapped_read_frac),
        (cr_constants.MAPPED_SUBSAMPLE_TYPE, 1.0),
    ]

    subsamplings = []  # subsample info definitions
    for subsample_type, rpc_multiplier in type_multipliers:
        for target_rpc in target_rpcs:
            target_mapped_reads = int(
                float(target_rpc) * float(n_cells) * rpc_multiplier)
            subsample_rate = tk_stats.robust_divide(target_mapped_reads,
                                                    mapped_reads)

            # Depths that would need more reads than are available are skipped
            if subsample_rate > 1.0:
                continue

            subsamplings.append({
                'subsample_type': subsample_type,
                'target_rpc': target_rpc,
                'subsample_rate': subsample_rate,
                'all_target_rpc': target_rpcs,
            })

    # Each chunk needs to store the entire gene-bc matrix and a piece of the
    # mol info h5
    matrix_mem_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        barcode_whitelist, gem_groups)
    chunk_len = cr_constants.NUM_MOLECULE_INFO_ENTRIES_PER_CHUNK
    chunk_mem_gb = matrix_mem_gb + MoleculeCounter.estimate_mem_gb(chunk_len)
    join_mem_gb = matrix_mem_gb

    # Split the molecule info h5 into equi-RAM chunks, once per definition
    chunks = [
        {
            'chunk_start': str(chunk_start),
            'chunk_len': str(
                min(n_molecule_info_entries - chunk_start, chunk_len)),
            'subsample_info': subsample_info,
            '__mem_gb': chunk_mem_gb,
        }
        for subsample_info in subsamplings
        for chunk_start in xrange(0, n_molecule_info_entries, chunk_len)
    ]

    if not chunks:
        # Always hand back at least one (empty) chunk
        chunks.append({
            'chunk_start': '0',
            'chunk_len': '0',
            'subsample_info': {},
        })

    return {'chunks': chunks, 'join': {'__mem_gb': join_mem_gb}}
示例#5
0
def main(args, outs):
    """Write per-molecule read counts for one chunk of sorted BAM reads.

    Reads are grouped by ``(gem group, barcode, gene ids)`` using
    ``cr_utils.barcode_sort_key``. Secondary reads, read 2 and reads without
    a UMI are skipped. Every other read is attributed to a molecule keyed by
    ``(umi, gene index)`` and increments one of the ``'reads'``,
    ``'unmapped_reads'`` or ``'nonconf_mapped_reads'`` columns, plus the
    UMI-corrected, barcode-corrected and unique-read-position columns where
    applicable. Reads that are not confidently mapped to the transcriptome
    use a gene index of ``len(gene_index.get_genes())``.

    The number of confidently mapped reads from cell-associated barcodes is
    accumulated per gem group and stored under
    ``cr_mol_counter.GEM_GROUPS_METRIC``.

    Args:
        args: Stage inputs. Reads ``chunk_input``, ``reference_path``,
            ``filtered_barcodes`` and ``align``.
        outs: Stage outputs. ``outs.output`` is the molecule file created.
    """
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.chunk_input)

    counter = cr_mol_counter.MoleculeCounter.open(outs.output, mode='w')

    # Column name -> slot in each molecule's per-column tally array
    mol_data_keys = cr_mol_counter.MoleculeCounter.get_data_columns()
    column_index = dict(
        (key, position) for position, key in enumerate(mol_data_keys))
    n_columns = len(column_index)

    genes_index_path = cr_utils.get_reference_genes_index(args.reference_path)
    gene_index = cr_reference.GeneIndex.load_pickle(genes_index_path)
    genomes = cr_utils.get_reference_genomes(args.reference_path)
    genome_index = cr_reference.get_genome_index(genomes)
    # Gene index used for reads not confidently mapped to any gene
    none_gene_id = len(gene_index.get_genes())

    # store reference index columns
    # NOTE - these must be cast to str first, as unicode is not supported
    counter.set_ref_column('genome_ids', [str(genome) for genome in genomes])
    counter.set_ref_column('gene_ids',
                           [str(gene.id) for gene in gene_index.genes])
    counter.set_ref_column('gene_names',
                           [str(gene.name) for gene in gene_index.genes])

    # Union of the cell-associated barcodes of all genomes
    filtered_bcs_per_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    filtered_bcs = set()
    for _, genome_bcs in filtered_bcs_per_genome.iteritems():
        filtered_bcs.update(genome_bcs)

    def _new_gem_group_metrics():
        return {cr_mol_counter.GG_CONF_MAPPED_FILTERED_BC_READS_METRIC: 0}

    def _new_molecule():
        return np.zeros(n_columns, dtype=np.uint64)

    gg_metrics = collections.defaultdict(_new_gem_group_metrics)

    grouped_reads = itertools.groupby(in_bam, key=cr_utils.barcode_sort_key)
    for (gem_group, barcode, gene_ids), reads_iter in grouped_reads:
        if barcode is None or gem_group is None:
            continue

        # Membership is tested with the uncompressed gem group
        formatted_bc = cr_utils.format_barcode_seq(barcode, gem_group)
        is_cell_barcode = formatted_bc in filtered_bcs

        molecules = collections.defaultdict(_new_molecule)

        compressed_barcode = \
            cr_mol_counter.MoleculeCounter.compress_barcode_seq(barcode)
        compressed_gem_group = \
            cr_mol_counter.MoleculeCounter.compress_gem_group(gem_group)

        # Molecule key -> set of (tid, pos) already seen for that molecule
        read_positions = collections.defaultdict(set)

        for read in reads_iter:
            umi = cr_utils.get_read_umi(read)
            # ignore read2 to avoid double-counting. the mapping + annotation
            # should be equivalent.
            if read.is_secondary or umi is None or read.is_read2:
                continue

            raw_umi = cr_utils.get_read_raw_umi(read)
            raw_bc, _ = cr_utils.split_barcode_seq(
                cr_utils.get_read_raw_barcode(read))
            proc_bc, _ = cr_utils.split_barcode_seq(
                cr_utils.get_read_barcode(read))

            if cr_utils.is_read_conf_mapped_to_transcriptome(
                    read, cr_utils.get_high_conf_mapq(args.align)):
                assert len(gene_ids) == 1

                mol_key = (umi, gene_index.gene_id_to_int(gene_ids[0]))
                map_type = 'reads'

                # A position counts as unique the first time it is seen for
                # this molecule
                read_pos = (read.tid, read.pos)
                seen_positions = read_positions[mol_key]
                uniq_read_pos = read_pos not in seen_positions
                seen_positions.add(read_pos)

                if is_cell_barcode:
                    gg_metrics[int(compressed_gem_group)][
                        cr_mol_counter.
                        GG_CONF_MAPPED_FILTERED_BC_READS_METRIC] += 1
            else:
                mol_key = (umi, none_gene_id)
                uniq_read_pos = False
                if read.is_unmapped:
                    map_type = 'unmapped_reads'
                else:
                    map_type = 'nonconf_mapped_reads'

            molecule = molecules[mol_key]
            molecule[column_index[map_type]] += 1
            molecule[column_index['umi_corrected_reads']] += int(
                raw_umi != umi)
            molecule[column_index['barcode_corrected_reads']] += int(
                raw_bc != proc_bc)
            molecule[column_index['conf_mapped_uniq_read_pos']] += int(
                uniq_read_pos)

        # Emit this group's molecules in sorted (umi, gene index) order
        for (umi, gene_id), molecule in sorted(molecules.items()):
            genome = cr_utils.get_genome_from_str(
                gene_index.int_to_gene_id(gene_id), genomes)
            genome_id = cr_reference.get_genome_id(genome, genome_index)
            data_columns = dict(
                (key, molecule[position])
                for key, position in column_index.iteritems())
            counter.add(
                barcode=compressed_barcode,
                gem_group=compressed_gem_group,
                umi=cr_mol_counter.MoleculeCounter.compress_umi_seq(umi),
                gene=gene_id,
                genome=genome_id,
                **data_columns)

    in_bam.close()

    counter.set_metric(cr_mol_counter.GEM_GROUPS_METRIC, dict(gg_metrics))

    counter.save()