Exemplo n.º 1
0
    def save_barcode_summary_h5(self, filename):
        """ Write a minimal barcode summary h5 directly, bypassing the reporter.

        NOTE: only use this if all genomes have the same set of barcodes,
        i.e. a raw matrix.

        One confidently-mapped, deduped read-count column is stored per genome,
        plus a 'multi'-prefixed column holding the element-wise sum over all
        genomes.

        Args:
            filename: destination handed to cr_utils.write_h5.

        Raises:
            ValueError: if a genome's per-barcode counts differ in length from
                the barcode list taken from the first matrix.
        """
        columns = {}
        barcodes = None
        summed_reads = None

        for genome, matrix in self.matrices.iteritems():
            # The barcode column is taken from the first matrix encountered.
            if barcodes is None:
                barcodes = np.array(matrix.bcs)
                columns[cr_constants.H5_BC_SEQUENCE_COL] = barcodes

            genome_key = cr_utils.format_barcode_summary_h5_key(
                genome,
                cr_constants.TRANSCRIPTOME_REGION,
                cr_constants.CONF_MAPPED_DEDUPED_READ_TYPE)
            genome_reads = matrix.get_reads_per_bc()

            if len(genome_reads) != len(barcodes):
                raise ValueError('Cannot write barcode summary since different genomes have different number of barcodes!')
            columns[genome_key] = genome_reads

            # Running total across genomes; start from a copy so the
            # per-genome column stored above is never modified in place.
            if summed_reads is not None:
                summed_reads += genome_reads
            else:
                summed_reads = genome_reads.copy()

        # The 'multi'-prefixed (aka total) counts are what the web summary
        # uses to display the barcode rank plot.
        multi_key = cr_utils.format_barcode_summary_h5_key(
            cr_constants.MULTI_REFS_PREFIX,
            cr_constants.TRANSCRIPTOME_REGION,
            cr_constants.CONF_MAPPED_DEDUPED_READ_TYPE)
        columns[multi_key] = summed_reads

        cr_utils.write_h5(filename, columns)
Exemplo n.º 2
0
def plot_barcode_rank(chart, sample_properties, sample_data):
    """ Generate the RNA counter barcode rank plot.

    Args:
        chart: chart object forwarded to _plot_counter_barcode_rank.
        sample_properties: mapping; its 'genomes' entry selects the summary key.
        sample_data: object exposing barcode_summary, cell_barcodes and
            counter_barcode_rank_plot_data().

    Returns:
        The result of _plot_counter_barcode_rank, or None when the genomes,
        barcode summary or cell barcodes are missing, the genome list is
        empty, or the expected key is absent from the barcode summary.
    """
    genomes = sample_properties.get('genomes')
    if genomes is None:
        return None
    if sample_data.barcode_summary is None or sample_data.cell_barcodes is None:
        return None
    if len(genomes) == 0:
        return None

    # UMI counts per BC across all genomes present: a single genome is used
    # by name, several genomes are addressed through the 'multi' prefix.
    genome = genomes[0] if len(genomes) <= 1 else lib_constants.MULTI_REFS_PREFIX

    gex_prefix = rna_library.get_library_type_metric_prefix(
        lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)
    key = cr_utils.format_barcode_summary_h5_key(
        gex_prefix,
        genome,
        cr_constants.TRANSCRIPTOME_REGION,
        cr_constants.CONF_MAPPED_DEDUPED_READ_TYPE)

    if key not in sample_data.barcode_summary:
        # Not guaranteed to exist, depending on pipeline
        return None

    counts_per_bc, plot_segments = sample_data.counter_barcode_rank_plot_data(key)
    return _plot_counter_barcode_rank(chart, counts_per_bc, plot_segments)
Exemplo n.º 3
0
def plot_barcode_rank(chart, sample_properties, sample_data):
    """ Generate the RNA counter barcode rank plot.

    Args:
        chart: chart object forwarded to _plot_barcode_rank.
        sample_properties: mapping; its 'genomes' entry selects the summary key.
        sample_data: object exposing barcode_summary and num_cells.

    Returns:
        The result of _plot_barcode_rank, or None when the genomes or barcode
        summary are missing, the genome list is empty, or the expected key is
        absent from the barcode summary.
    """
    genomes = sample_properties.get('genomes')
    if genomes is None or sample_data.barcode_summary is None:
        return None
    if len(genomes) == 0:
        return None

    # UMI counts per BC across all genomes present
    has_multiple_genomes = len(genomes) > 1
    if has_multiple_genomes:
        genome = cr_constants.MULTI_REFS_PREFIX
    else:
        genome = genomes[0]

    key = cr_utils.format_barcode_summary_h5_key(
        genome,
        cr_constants.TRANSCRIPTOME_REGION,
        cr_constants.CONF_MAPPED_DEDUPED_READ_TYPE)

    summary = sample_data.barcode_summary
    if key not in summary:
        # Not guaranteed to exist, depending on pipeline
        return None

    counts_per_bc = summary[key][:]
    return _plot_barcode_rank(chart, counts_per_bc, sample_data.num_cells)
Exemplo n.º 4
0
def plot_barcode_rank(chart, sample_properties, sample_data):
    """ Generate the RNA counter barcode rank plot.

    The per-barcode counts of every genome are concatenated into a single
    array before plotting.

    Args:
        chart: chart object forwarded to _plot_barcode_rank.
        sample_properties: mapping; its 'genomes' entry lists the genomes to read.
        sample_data: object exposing barcode_summary and num_cells.

    Returns:
        The result of _plot_barcode_rank, or None when the genomes or barcode
        summary are missing, the genome list is empty, or any genome's key is
        absent from the barcode summary.
    """
    genomes = sample_properties.get('genomes')
    if genomes is None or sample_data.barcode_summary is None:
        return None
    if len(genomes) == 0:
        return None

    summary = sample_data.barcode_summary
    per_genome_counts = []
    for genome in genomes:
        key = cr_utils.format_barcode_summary_h5_key(
            genome,
            cr_constants.TRANSCRIPTOME_REGION,
            cr_constants.CONF_MAPPED_DEDUPED_READ_TYPE)
        if key not in summary:
            # Not guaranteed to exist, depending on pipeline
            return None
        per_genome_counts.append(summary[key][:])

    return _plot_barcode_rank(chart,
                              np.concatenate(per_genome_counts),
                              sample_data.num_cells)
Exemplo n.º 5
0
 def save_barcode_summary_h5(self, filename):
     """ Write a minimal barcode summary h5 directly, bypassing the reporter.

     NOTE: only use this if all genomes have the same set of barcodes,
     i.e. a raw matrix.

     Args:
         filename: destination handed to cr_utils.write_h5.

     Raises:
         ValueError: if a genome's per-barcode counts differ in length from
             the barcode list taken from the first matrix.
     """
     columns = {}
     barcodes = None

     for genome, matrix in self.matrices.iteritems():
         # The barcode column is taken from the first matrix encountered.
         if barcodes is None:
             barcodes = np.array(matrix.bcs)
             columns[cr_constants.H5_BC_SEQUENCE_COL] = barcodes

         genome_key = cr_utils.format_barcode_summary_h5_key(
             genome,
             cr_constants.TRANSCRIPTOME_REGION,
             cr_constants.CONF_MAPPED_DEDUPED_READ_TYPE)
         genome_reads = matrix.get_reads_per_bc()

         if len(genome_reads) != len(barcodes):
             raise ValueError(
                 'Cannot write barcode summary since different genomes have different number of barcodes!'
             )
         columns[genome_key] = genome_reads

     cr_utils.write_h5(filename, columns)
Exemplo n.º 6
0
    def report(self, genome, barcode_summary_h5, recovered_cells, cell_bc_seqs):
        """ Compute summary metrics for this matrix restricted to the cell barcodes.

        Args:
            genome: genome name used to build barcode summary h5 keys.
            barcode_summary_h5: container of per-barcode read counts; it is
                tested with `in` and indexed by key.
            recovered_cells: not used by this method.
            cell_bc_seqs: barcode sequences of the cell-associated barcodes.

        Returns:
            dict mapping metric names to values; empty when cell_bc_seqs is empty.
        """
        metrics = {}

        cell_mat = self.select_barcodes_by_seq(cell_bc_seqs)
        cell_indices = self.bcs_to_ints(cell_bc_seqs)
        num_cells = len(cell_bc_seqs)

        # Don't compute metrics if no cells detected
        if num_cells == 0:
            return metrics

        # Matrix density: nonzero entries over total entries
        nonzero_entries = cell_mat.m.getnnz()
        all_entries = cell_mat.m.shape[0] * cell_mat.m.shape[1]
        metrics['filtered_gene_bc_matrix_density'] = tk_stats.robust_divide(
            nonzero_entries, all_entries)

        # Top genes by read count
        gene_reads = cell_mat.get_reads_per_gene()
        metrics['filtered_bcs_top_genes_with_reads'] = dict(
            (cell_mat.int_to_gene_id(idx), int(n))
            for idx, n in cell_mat._topN(gene_reads))

        # Top genes by number of barcodes reaching the per-barcode read threshold
        bcs_per_gene = cell_mat._sum(
            cell_mat.m >= cr_constants.MIN_READS_PER_BARCODE, axis=1)
        metrics['filtered_bcs_top_genes_with_unique_bcs'] = dict(
            (cell_mat.int_to_gene_id(idx), int(n))
            for idx, n in cell_mat._topN(bcs_per_gene))

        # Total genes and counts
        metrics['filtered_bcs_total_unique_genes_detected'] = np.count_nonzero(gene_reads)
        metrics['filtered_bcs_total_counts'] = int(gene_reads.sum())

        def _per_barcode_stats(values):
            """Mean, median, coefficient of variation and IQR of `values`."""
            avg = np.mean(values)
            spread = np.std(values)
            stats = {}
            stats['mean'] = avg
            stats['median'] = np.median(values)
            stats['cv'] = tk_stats.robust_divide(float(spread), float(avg))
            stats['iqr'] = np.percentile(values, 75) - np.percentile(values, 25)
            return stats

        # Unique genes per bc
        genes_per_bc = cell_mat._sum(
            cell_mat.m >= cr_constants.MIN_READS_PER_GENE, axis=0)
        for stat_name, stat_value in _per_barcode_stats(genes_per_bc).iteritems():
            metrics['filtered_bcs_%s_unique_genes_detected' % stat_name] = stat_value

        # Counts per bc
        counts_per_bc = cell_mat._sum(cell_mat.m, axis=0)
        for stat_name, stat_value in _per_barcode_stats(counts_per_bc).iteritems():
            metrics['filtered_bcs_%s_counts' % stat_name] = stat_value

        # Cumulative fraction of counts going to top bcs
        cell_total = cell_mat.m.sum()
        raw_total = self.m.sum()
        metrics['filtered_bcs_cum_frac'] = tk_stats.robust_divide(cell_total, raw_total)

        # cDNA PCR duplication in top bcs
        dupe_key = cr_utils.format_barcode_summary_h5_key(
            genome,
            cr_constants.TRANSCRIPTOME_REGION,
            cr_constants.CONF_MAPPED_BC_READ_TYPE)
        if dupe_key not in barcode_summary_h5:
            candidate_reads = 0
            deduped_reads = 0
        else:
            candidate_reads = barcode_summary_h5[dupe_key][list(cell_indices)].sum()
            deduped_reads = cell_total
        dupe_metric = 'filtered_bcs_%s_dupe_reads_frac' % cr_constants.CDNA_PCR_DUPE_TYPE
        metrics[dupe_metric] = 1 - tk_stats.robust_divide(deduped_reads, candidate_reads)

        # Reads per top bc for the various read types (computed over top bcs)
        for read_type in cr_constants.MATRIX_REPORT_READ_TYPES:
            if read_type in cr_constants.MATRIX_USE_MATRIX_FOR_READ_TYPE:
                # These read types are counted from the matrix itself.
                reads_in_cells = cell_total
                reads_overall = raw_total
            else:
                type_key = cr_utils.format_barcode_summary_h5_key(
                    genome, cr_constants.TRANSCRIPTOME_REGION, read_type)
                if type_key not in barcode_summary_h5:
                    reads_in_cells = 0
                    reads_overall = 0
                else:
                    reads_in_cells = barcode_summary_h5[type_key][list(cell_indices)].sum()
                    reads_overall = barcode_summary_h5[type_key][()].sum()

            # (n_reads)/(n_bcs) over the top bcs
            metrics['filtered_bcs_%s_reads_per_filtered_bc' % read_type] = \
                tk_stats.robust_divide(reads_in_cells, num_cells)
            # Cumulative fraction of reads going to top bcs
            metrics['filtered_bcs_%s_reads_cum_frac' % read_type] = \
                tk_stats.robust_divide(reads_in_cells, reads_overall)

        return metrics
Exemplo n.º 7
0
def _report_genome_agnostic_metrics(matrix, barcode_summary_h5,
                                    recovered_cells, cell_bc_seqs, total_reads,
                                    total_conf_mapped_reads, library_prefix):
    """Report metrics that are computed across all barcodes and all genomes.

    Args:
        matrix: feature-barcode matrix; a view of it is taken before use.
        barcode_summary_h5: barcode summary container, tested with `in` and
            indexed by key; its 'bc_sequence' entry is read.
        recovered_cells: number of cells to compare the cell-barcode count
            against, or None, in which case both difference-from-recovered-cells
            metrics are reported as 0.
        cell_bc_seqs: mapping whose values are iterables of cell barcode
            sequences; the union over all values is used.
        total_reads: read total used for the raw reads-per-cell and
            usable-reads-fraction metrics.
        total_conf_mapped_reads: read total used for the conf-mapped
            reads-per-cell metrics.
        library_prefix: library-type prefix used to build barcode summary keys.

    Returns:
        dict mapping metric names to values.
    """
    d = {}

    matrix = matrix.view()

    genomes = matrix.get_genomes()
    if len(genomes) == 0:
        # For genomeless features, use "multi" in place of the genome prefix
        genomes = [lib_constants.MULTI_REFS_PREFIX]

    # Get number of cell bcs across all genomes
    cell_bcs_union = reduce(lambda a, x: a | set(x), cell_bc_seqs.itervalues(),
                            set())
    n_cell_bcs_union = len(cell_bcs_union)

    d['filtered_bcs_transcriptome_union'] = n_cell_bcs_union
    d['%s_filtered_bcs' % lib_constants.MULTI_REFS_PREFIX] = n_cell_bcs_union

    # Report reads/cell across all genomes
    mean_reads_per_cell = tk_stats.robust_divide(total_reads, n_cell_bcs_union)
    d['%s_%s_total_raw_reads_per_filtered_bc' %
      (lib_constants.MULTI_REFS_PREFIX,
       cr_constants.TRANSCRIPTOME_REGION)] = mean_reads_per_cell
    # Create a feature-barcode dual whose name makes sense
    d['reads_per_cell'] = mean_reads_per_cell

    conf_mapped_reads_per_cell = tk_stats.robust_divide(
        total_conf_mapped_reads, n_cell_bcs_union)
    d['%s_%s_total_conf_mapped_reads_per_filtered_bc' %
      (lib_constants.MULTI_REFS_PREFIX,
       cr_constants.TRANSCRIPTOME_REGION)] = conf_mapped_reads_per_cell

    # Split the matrix by genome
    if genomes[0] != lib_constants.MULTI_REFS_PREFIX:
        genome_matrices = OrderedDict(
            ((g, matrix.select_features_by_genome(g)) for g in genomes))
    else:
        # Genomeless feature types
        genome_matrices = OrderedDict(
            ((lib_constants.MULTI_REFS_PREFIX, matrix), ))

    # Total UMI counts across all matrices and all filtered barcodes
    total_umi_counts = 0
    for mat in genome_matrices.itervalues():
        total_umi_counts += mat.select_barcodes_by_seq(cell_bcs_union).sum()

    # Deviation from cell load
    if recovered_cells is None:
        d['%s_filtered_bcs_difference_from_recovered_cells' %
          lib_constants.MULTI_REFS_PREFIX] = 0
        d['%s_filtered_bcs_relative_difference_from_recovered_cells' %
          lib_constants.MULTI_REFS_PREFIX] = 0
    else:
        d['%s_filtered_bcs_difference_from_recovered_cells' %
          lib_constants.MULTI_REFS_PREFIX] = int(n_cell_bcs_union) - int(
              recovered_cells)
        d['%s_filtered_bcs_relative_difference_from_recovered_cells' %
          lib_constants.MULTI_REFS_PREFIX] = tk_stats.robust_divide(
              n_cell_bcs_union - recovered_cells, recovered_cells)

    # Read totals per read type are summed over every genome, so they are the
    # same no matter which genome the metric is labelled with: compute them
    # once instead of once per genome.
    reads_by_type = {}
    for read_type in cr_constants.MATRIX_REPORT_READ_TYPES:
        if read_type in cr_constants.MATRIX_USE_MATRIX_FOR_READ_TYPE:
            n_reads = total_umi_counts
        else:
            # Note the extra underscore after the library_type prefix.
            # This is induced by the Reporter framework.
            h5_keys = [
                cr_utils.format_barcode_summary_h5_key(
                    library_prefix, g, cr_constants.TRANSCRIPTOME_REGION,
                    read_type) for g in genomes
            ]
            h5_keys = [x for x in h5_keys if x in barcode_summary_h5]
            n_reads = sum(
                np.array(barcode_summary_h5[h5_key]).sum()
                for h5_key in h5_keys)
        reads_by_type[read_type] = n_reads

    # Duplicate these metrics across genomes for backwards-compat
    for genome in genomes:
        d['%s_total_raw_reads_per_filtered_bc' % genome] = mean_reads_per_cell
        d['%s_total_conf_mapped_reads_per_filtered_bc' %
          genome] = conf_mapped_reads_per_cell

        for read_type in cr_constants.MATRIX_REPORT_READ_TYPES:
            metric = '%s_total_%s_reads_per_filtered_bc' % (genome, read_type)
            d[metric] = tk_stats.robust_divide(reads_by_type[read_type],
                                               n_cell_bcs_union)

    # Report frac reads in cells across all genomes
    total_conf_mapped_reads_in_cells = 0
    total_conf_mapped_barcoded_reads = 0

    # The barcode-summary indices of the cell barcodes do not depend on the
    # genome, so look them up once rather than on every iteration.
    cell_bc_indices = _get_barcode_summary_h5_indices(barcode_summary_h5,
                                                      cell_bcs_union)
    for genome in genome_matrices:
        h5_key = '%s_%s_%s_%s_reads' % (library_prefix, genome,
                                        cr_constants.TRANSCRIPTOME_REGION,
                                        cr_constants.CONF_MAPPED_BC_READ_TYPE)
        # NOTE(review): unlike the usable-reads loop below, this lookup is not
        # guarded by a membership test - confirm the key is always present.
        cmb_reads = barcode_summary_h5[h5_key][:]
        total_conf_mapped_reads_in_cells += cmb_reads[cell_bc_indices].sum()
        total_conf_mapped_barcoded_reads += cmb_reads.sum()
    frac_reads_in_cells = tk_stats.robust_divide(
        total_conf_mapped_reads_in_cells, total_conf_mapped_barcoded_reads)
    d['multi_filtered_bcs_conf_mapped_barcoded_reads_cum_frac'] = frac_reads_in_cells
    # Create a feature-barcode dual whose name makes sense
    d['feature_reads_in_cells'] = frac_reads_in_cells

    # Compute fraction of reads usable (conf mapped, barcoded, filtered barcode)
    # Boolean mask over the barcode summary rows: True where the row's barcode
    # is one of the cell barcodes.
    in_cell_barcodes_vectorized = np.vectorize(
        lambda x: x in cell_bcs_union)
    filtered_bc_h5_row = in_cell_barcodes_vectorized(
        np.array(barcode_summary_h5['bc_sequence']))

    usable_reads = 0

    for genome in genomes:
        h5_key = cr_utils.format_barcode_summary_h5_key(
            library_prefix, genome, cr_constants.TRANSCRIPTOME_REGION,
            cr_constants.CONF_MAPPED_BC_READ_TYPE)

        if h5_key not in barcode_summary_h5:
            continue

        # Multiplying by the boolean mask zeroes out non-cell barcodes.
        usable_reads += (filtered_bc_h5_row *
                         np.array(barcode_summary_h5[h5_key])).sum()

    # Fraction reads usable
    frac_reads_usable = tk_stats.robust_divide(usable_reads, total_reads)
    d['%s_transcriptome_usable_reads_frac' %
      lib_constants.MULTI_REFS_PREFIX] = frac_reads_usable
    # Create a feature barcoding dual whose name makes sense
    d['frac_feature_reads_usable'] = frac_reads_usable

    # Usable reads
    d['%s_usable_reads' % lib_constants.MULTI_REFS_PREFIX] = usable_reads

    # Usable reads per cell
    reads_usable_per_cell = tk_stats.robust_divide(usable_reads,
                                                   n_cell_bcs_union)
    d['%s_usable_reads_per_filtered_bc' %
      lib_constants.MULTI_REFS_PREFIX] = reads_usable_per_cell
    # Create a feature barcoding dual whose name makes sense
    d['feature_reads_usable_per_cell'] = reads_usable_per_cell

    # Compute matrix density
    filtered_mat = matrix.select_barcodes_by_seq(cell_bcs_union)
    total_nonzero_entries = filtered_mat.get_num_nonzero()
    filtered_shape = filtered_mat.get_shape()
    total_entries = filtered_shape[0] * filtered_shape[1]
    d['%s_filtered_gene_bc_matrix_density' %
      lib_constants.MULTI_REFS_PREFIX] = tk_stats.robust_divide(
          total_nonzero_entries, total_entries)

    return d
Exemplo n.º 8
0
def _report(matrix, genome, barcode_summary_h5, recovered_cells, cell_bc_seqs,
            library_prefix):
    """Compute summary metrics for one genome restricted to the cell barcodes.

    Args:
        matrix: feature-barcode matrix; a view of it is taken before use.
        genome: genome name used to build barcode summary h5 keys.
        barcode_summary_h5: container of per-barcode read counts; it is tested
            with `in` and indexed by key.
        recovered_cells: not used by this function.
        cell_bc_seqs: barcode sequences of the cell-associated barcodes.
        library_prefix: library-type prefix used to build barcode summary keys.

    Returns:
        dict mapping metric names to values; empty when cell_bc_seqs is empty.
    """
    metrics = {}
    matrix = matrix.view()

    cell_mat = matrix.select_barcodes_by_seq(cell_bc_seqs)
    cell_mat_shape = cell_mat.get_shape()
    cell_indices = _get_barcode_summary_h5_indices(barcode_summary_h5,
                                                   cell_bc_seqs)
    num_cells = len(cell_bc_seqs)

    # Don't compute metrics if no cells detected
    if num_cells == 0:
        return metrics

    # Matrix density: nonzero entries over total entries
    nonzero_entries = cell_mat.get_num_nonzero()
    all_entries = cell_mat_shape[0] * cell_mat_shape[1]
    metrics['filtered_gene_bc_matrix_density'] = tk_stats.robust_divide(
        nonzero_entries, all_entries)

    # Top genes by count; never ask for more genes than exist
    gene_counts = cell_mat.sum(axis=1)
    top_n = min(cr_constants.TOP_N, len(gene_counts))
    metrics['filtered_bcs_top_genes_with_reads'] = dict(
        (cell_mat.int_to_feature_id(idx), int(n))
        for idx, n in cr_matrix.top_n(gene_counts, top_n))

    # Top genes by number of barcodes reaching the per-barcode count threshold
    bcs_per_gene = cell_mat.count_ge(
        axis=1, threshold=cr_constants.MIN_COUNTS_PER_BARCODE)
    metrics['filtered_bcs_top_genes_with_unique_bcs'] = dict(
        (cell_mat.int_to_feature_id(idx), int(n))
        for idx, n in cr_matrix.top_n(bcs_per_gene, top_n))

    # Total genes and counts
    metrics['filtered_bcs_total_unique_genes_detected'] = np.count_nonzero(gene_counts)
    metrics['filtered_bcs_total_counts'] = int(gene_counts.sum())

    def _per_barcode_stats(values):
        """Mean, median, coefficient of variation and IQR of `values`."""
        avg = np.mean(values)
        spread = np.std(values)
        stats = {}
        stats['mean'] = avg
        stats['median'] = np.median(values)
        stats['cv'] = tk_stats.robust_divide(float(spread), float(avg))
        stats['iqr'] = np.percentile(values, 75) - np.percentile(values, 25)
        return stats

    # Unique genes per bc
    genes_per_bc = cell_mat.count_ge(
        axis=0, threshold=cr_constants.MIN_COUNTS_PER_GENE)
    for stat_name, stat_value in _per_barcode_stats(genes_per_bc).iteritems():
        metrics['filtered_bcs_%s_unique_genes_detected' % stat_name] = stat_value

    # Counts per bc
    bc_counts = cell_mat.sum(axis=0)
    for stat_name, stat_value in _per_barcode_stats(bc_counts).iteritems():
        metrics['filtered_bcs_%s_counts' % stat_name] = stat_value

    # Cumulative fraction of counts going to top bcs
    cell_umis = cell_mat.sum()
    raw_umis = matrix.sum()
    metrics['filtered_bcs_cum_frac'] = tk_stats.robust_divide(cell_umis, raw_umis)

    # cDNA PCR duplication in top bcs
    dupe_key = cr_utils.format_barcode_summary_h5_key(
        library_prefix,
        genome,
        cr_constants.TRANSCRIPTOME_REGION,
        cr_constants.CONF_MAPPED_BC_READ_TYPE)
    if dupe_key not in barcode_summary_h5:
        candidate_reads = 0
        deduped_reads = 0
    else:
        candidate_reads = barcode_summary_h5[dupe_key][:][cell_indices].sum()
        deduped_reads = cell_umis
    dupe_metric = 'filtered_bcs_%s_dupe_reads_frac' % cr_constants.CDNA_PCR_DUPE_TYPE
    metrics[dupe_metric] = 1 - tk_stats.robust_divide(deduped_reads, candidate_reads)

    # Reads per top bc for the various read types (computed over top bcs)
    for read_type in cr_constants.MATRIX_REPORT_READ_TYPES:
        if read_type in cr_constants.MATRIX_USE_MATRIX_FOR_READ_TYPE:
            # These read types are counted from the matrix itself.
            reads_in_cells = cell_umis
            reads_overall = raw_umis
        else:
            type_key = cr_utils.format_barcode_summary_h5_key(
                library_prefix, genome, cr_constants.TRANSCRIPTOME_REGION,
                read_type)
            if type_key not in barcode_summary_h5:
                reads_in_cells = 0
                reads_overall = 0
            else:
                type_counts = barcode_summary_h5[type_key][:]
                reads_in_cells = type_counts[cell_indices].sum()
                reads_overall = type_counts.sum()

        # (n_reads)/(n_bcs) over the top bcs
        metrics['filtered_bcs_%s_reads_per_filtered_bc' % read_type] = \
            tk_stats.robust_divide(reads_in_cells, num_cells)
        # Cumulative fraction of reads going to top bcs
        metrics['filtered_bcs_%s_reads_cum_frac' % read_type] = \
            tk_stats.robust_divide(reads_in_cells, reads_overall)

    return metrics