def empirical_dispersion(y, threshold=1e-4):
    """Estimate empirical dispersion"""
    assert threshold > 0
    from cellranger.analysis.stats import summarize_columns
    (mu, var) = summarize_columns(y.T)
    alpha_est = np.maximum(
        (var.squeeze() - mu.squeeze()) / (np.square(mu.squeeze() + 1e-100)),
        threshold)
    return alpha_est
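The estimator above is the standard method-of-moments fit for the negative binomial mean-variance relation Var = mu + alpha * mu^2, floored at `threshold`. A minimal dense sketch of the same computation, assuming only numpy (the real `summarize_columns` operates on a sparse matrix):

import numpy as np

def empirical_dispersion_dense(y, threshold=1e-4):
    # y: features x barcodes, dense; per-feature mean and variance stand in
    # for summarize_columns(y.T)
    mu = y.mean(axis=1)
    var = y.var(axis=1)
    # method of moments: Var = mu + alpha * mu^2  =>  alpha = (Var - mu) / mu^2
    return np.maximum((var - mu) / np.square(mu + 1e-100), threshold)

counts = np.random.poisson(5.0, size=(100, 50)).astype(float)
alpha = empirical_dispersion_dense(counts)  # Poisson-like data sits near the floor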
Example #2
def normalize_and_transpose(matrix):
    matrix.tocsc()  # CountMatrix.tocsc() is assumed to convert the underlying matrix in place

    m = analysis_stats.normalize_by_umi(matrix)

    # Use log counts
    m.data = np.log2(1 + m.data)

    # Transpose
    m = m.T

    # compute centering (mean) and scaling (stdev)
    (c, v) = analysis_stats.summarize_columns(m)
    # TODO: Inputs to this function shouldn't have zero variance columns
    v[np.where(v == 0.0)] = 1.0

    s = np.sqrt(v)
    return (m, c, s)
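The `(m, c, s)` triple is returned so that standardization can be applied implicitly during factorization rather than by densifying the sparse matrix; `irlb` below is assumed to accept `center=`/`scale=` for exactly this. A dense sketch of the standardization the triple encodes:

import numpy as np

m = np.log2(1 + np.random.poisson(2.0, size=(20, 8)).astype(float))  # barcodes x features
c = m.mean(axis=0)          # per-feature center
v = m.var(axis=0)
v[v == 0.0] = 1.0           # guard zero-variance columns, as in the code above
s = np.sqrt(v)              # per-feature scale
standardized = (m - c) / s  # what irlb applies implicitly via center/scale
assert np.allclose(standardized.mean(axis=0), 0.0)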
Example #3
def run_lsa(matrix, lsa_features=None, lsa_bcs=None, n_lsa_components=None, random_state=None, discardPC=0, min_count_threshold=0):
    """ Run a LSA on the matrix using the IRLBA matrix factorization algorithm.  Prior to the LSA analysis, the counts
    are transformed by an inverse document frequency operation.

    If desired, only a subset of features (e.g. sample rows) can be selected for LSA analysis.  Each feature is ranked
    by its dispersion relative to other features that have a similar mean count.  The top `lsa_features` as ranked by
    this method will then be used for the LSA.

    One can also select to subset number of barcodes to use (e.g. sample columns), but in this case they are simply
    randomly sampled.

    Additionally one can choose to discard first N PCs (ranked by singular values/ variance explained). In this mode,
    the method automatically discovers N + n_lsa_components components

    Args:
        matrix (CountMatrix): The matrix to perform LSA on.
        lsa_features (int): Number of features to subset from matrix and use in LSA. The top lsa_features ranked by
                            dispersion are used.
        lsa_bcs (int): Number of barcodes to randomly sample for the matrix.
        n_lsa_components (int): How many LSA components should be used.
        random_state (int): The seed for the RNG
        discardPC (int): Number of leading components to discard.
        min_count_threshold (int): The minimum sum of each row/column for that row/column to be passed to LSA
                                   (this filter is prior to any subsetting that occurs).
    Returns:
        An LSA object
    """
 
    if random_state is None:
        random_state = analysis_constants.RANDOM_STATE
    np.random.seed(0)  # NOTE: the barcode subsampling below uses this global seed; random_state is passed only to irlb

    # Threshold the rows/columns of the matrix; this raises an error if an empty matrix results.
    thresholded_matrix, _, thresholded_features = matrix.select_axes_above_threshold(min_count_threshold)

    # If requested, we can subsample some of the barcodes to get a smaller matrix for LSA
    lsa_bc_indices = np.arange(thresholded_matrix.bcs_dim)
    if lsa_bcs is None:
        lsa_bcs = thresholded_matrix.bcs_dim
        lsa_bc_indices = np.arange(thresholded_matrix.bcs_dim)
    elif lsa_bcs < thresholded_matrix.bcs_dim:
        lsa_bc_indices = np.sort(np.random.choice(np.arange(thresholded_matrix.bcs_dim), size=lsa_bcs, replace=False))
    elif lsa_bcs > thresholded_matrix.bcs_dim:
        msg = ("You requested {} barcodes but the matrix after thresholding only "
               "included {}, so the smaller amount is being used.").format(lsa_bcs, thresholded_matrix.bcs_dim)
        print(msg)
        lsa_bcs = thresholded_matrix.bcs_dim
        lsa_bc_indices = np.arange(thresholded_matrix.bcs_dim)

    # If requested, select fewer features to use by selecting the features with highest normalized dispersion
    if lsa_features is None:
        lsa_features = thresholded_matrix.features_dim
    elif lsa_features > thresholded_matrix.features_dim:
        msg = ("You requested {} features but the matrix after thresholding only included {} features,"
               "so the smaller amount is being used.").format(lsa_features, thresholded_matrix.features_dim)
        print(msg)
        lsa_features = thresholded_matrix.features_dim
    # Calc mean and variance of counts after normalizing
    # But don't transform to log space, in order to preserve the mean-variance relationship
    m = analysis_stats.normalize_by_umi(thresholded_matrix)
    # Get mean and variance of rows
    (mu, var) = analysis_stats.summarize_columns(m.T)
    dispersion = analysis_stats.get_normalized_dispersion(mu.squeeze(), var.squeeze())  # TODO set number of bins?
    lsa_feature_indices = np.argsort(dispersion)[-lsa_features:]

    # Now determine how many components.
    if n_lsa_components is None:
        n_lsa_components = analysis_constants.LSA_N_COMPONENTS_DEFAULT

    # increment number of components if we discard PCs
    n_lsa_components += discardPC

    likely_matrix_rank = min(lsa_features, lsa_bcs)
    if likely_matrix_rank < n_lsa_components:
        print(("There are fewer nonzero features or barcodes ({}) than requested "
               "LSA components ({}); reducing the number of components.").format(likely_matrix_rank, n_lsa_components))
        n_lsa_components = likely_matrix_rank

    if (likely_matrix_rank * 0.5) <= float(n_lsa_components):
        print("Requested number of LSA components is large relative to the matrix size, an exact approach to matrix factorization may be faster.")

    # perform idf transform, which is suited for lsa
    lsa_mat = thresholded_matrix.select_barcodes(lsa_bc_indices).select_features(lsa_feature_indices)
    lsa_norm_mat = normalize_and_transpose(lsa_mat)
    (u, d, v, _, _) = irlb(lsa_norm_mat, n_lsa_components, random_state=random_state)

    # project the matrix to complete the transform: X --> X*v = u*d
    full_norm_mat = normalize_and_transpose(matrix)
    # Get a coordinate map so we know which columns in the old matrix correspond to columns in the new
    org_cols_used = get_original_columns_used(thresholded_features, lsa_feature_indices)
    transformed_irlba_matrix = full_norm_mat[:, org_cols_used].dot(v)[:, discardPC:]
    irlba_components = np.zeros((n_lsa_components - discardPC, matrix.features_dim))
    irlba_components[:, org_cols_used] = v.T[discardPC:, :]

    # calc proportion of variance explained
    variance_explained = np.square(d)[discardPC:] / np.sum(lsa_norm_mat.data**2)
    n_lsa_components = n_lsa_components - discardPC

    features_selected = np.array([f.id for f in matrix.feature_ref.feature_defs])[org_cols_used]

    # sanity check dimensions
    assert transformed_irlba_matrix.shape == (matrix.bcs_dim, n_lsa_components)
    assert irlba_components.shape == (n_lsa_components, matrix.features_dim)
    assert variance_explained.shape == (n_lsa_components,)

    return LSA(transformed_irlba_matrix, irlba_components, variance_explained, dispersion, features_selected)
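The docstring above says the counts get an inverse-document-frequency transform before factorization (via the normalization routine, whose internals are not shown here). As a hedged sketch only, a standard TF-IDF on a sparse features x barcodes count matrix looks like this; the exact variant cellranger uses may differ:

import numpy as np
import scipy.sparse as sp

def tfidf_transform(counts):
    # counts: sparse features x barcodes matrix
    counts = counts.tocsc()
    bc_sums = np.asarray(counts.sum(axis=0)).ravel()
    bc_sums[bc_sums == 0] = 1.0                        # avoid divide-by-zero on empty barcodes
    tf = counts.multiply(1.0 / bc_sums[None, :])       # term frequency per barcode
    df = np.asarray((counts > 0).sum(axis=1)).ravel()  # barcodes containing each feature
    idf = np.log(1.0 + counts.shape[1] / (1.0 + df))   # smoothed inverse document frequency
    return sp.csc_matrix(tf.multiply(idf[:, None]))

X_idf = tfidf_transform(sp.random(100, 30, density=0.1, format='csc'))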
Example #4
def run_lsa(matrix,
            lsa_features=None,
            lsa_bcs=None,
            n_lsa_components=None,
            random_state=None):
    if lsa_features is None:
        lsa_features = matrix.features_dim
    if lsa_bcs is None:
        lsa_bcs = matrix.bcs_dim
    if n_lsa_components is None:
        n_lsa_components = analysis_constants.LSA_N_COMPONENTS_DEFAULT
    if n_lsa_components > lsa_features:
        print("There are fewer nonzero features than LSA components; reducing the number of components.")
        n_lsa_components = lsa_features
    if random_state is None:
        random_state = analysis_constants.RANDOM_STATE

    np.random.seed(0)

    # perform idf transform, which is suited for lsa
    full_norm_mat = normalize_and_transpose(matrix)

    # initialize LSA subsets
    lsa_bc_indices = np.arange(matrix.bcs_dim)
    lsa_feature_indices = np.arange(matrix.features_dim)

    # Calc mean and variance of counts after normalizing
    # Don't transform to log space in LSA
    # Dispersion is not exactly meaningful after idf transform. This is retained simply to follow PCA code
    m = analysis_stats.normalize_by_idf(matrix)
    (mu, var) = analysis_stats.summarize_columns(m.T)
    dispersion = analysis_stats.get_normalized_dispersion(
        mu.squeeze(), var.squeeze())  # TODO set number of bins?

    lsa_feature_indices = np.argsort(dispersion)[-lsa_features:]

    if lsa_bcs < matrix.bcs_dim:
        lsa_bc_indices = np.sort(
            np.random.choice(np.arange(matrix.bcs_dim),
                             size=lsa_bcs,
                             replace=False))

    lsa_mat, _, lsa_features_nonzero = matrix.select_barcodes(
        lsa_bc_indices).select_features(
            lsa_feature_indices).select_nonzero_axes()
    lsa_feature_nonzero_indices = lsa_feature_indices[lsa_features_nonzero]

    if lsa_mat.features_dim < 2 or lsa_mat.bcs_dim < 2:
        print "Matrix is too small for further downsampling - num_lsa_bcs and num_lsa_features will be ignored."
        lsa_mat, _, lsa_features_nonzero = matrix.select_nonzero_axes()
        lsa_feature_nonzero_indices = lsa_features_nonzero

    lsa_norm_mat = normalize_and_transpose(lsa_mat)

    (u, d, v, _, _) = irlb(lsa_norm_mat,
                           n_lsa_components,
                           random_state=random_state)

    # project the matrix to complete the transform: X --> X*v = u*d
    transformed_irlba_matrix = full_norm_mat[:, lsa_feature_nonzero_indices].dot(v)
    irlba_components = np.zeros((n_lsa_components, matrix.features_dim))
    irlba_components[:, lsa_feature_nonzero_indices] = v.T

    # calc proportion of variance explained
    variance_explained = np.square(d) / np.sum(lsa_norm_mat.data**2)

    features_selected = np.array([
        f.id for f in matrix.feature_ref.feature_defs
    ])[lsa_feature_nonzero_indices]

    # sanity check dimensions
    assert transformed_irlba_matrix.shape == (matrix.bcs_dim, n_lsa_components)
    assert irlba_components.shape == (n_lsa_components, matrix.features_dim)
    assert variance_explained.shape == (n_lsa_components, )

    return LSA(transformed_irlba_matrix, irlba_components, variance_explained,
               dispersion, features_selected)
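Both run_lsa variants finish the transform with the SVD identity X v = u d: projecting the full normalized matrix onto the right singular vectors v reproduces the scores u*d on the fitted subset. A small dense check of that identity, with numpy's SVD standing in for irlb:

import numpy as np

X = np.random.rand(50, 20)
u, d, vt = np.linalg.svd(X, full_matrices=False)
v = vt.T
k = 5
# projecting onto the top-k right singular vectors reproduces the scores u*d
assert np.allclose(X @ v[:, :k], u[:, :k] * d[:k])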
Example #5
def run_plsa(matrix,
             temp_dir,
             plsa_features=None,
             plsa_bcs=None,
             n_plsa_components=None,
             random_state=None,
             threads=1,
             min_count_threshold=0):
    """ Run a PLSA on the matrix using the IRLBA matrix factorization algorithm.  Prior to the PLSA analysis, the
    matrix is not normalized at all.

    If desired, only a subset of features (e.g. sample rows) can be selected for PLSA analysis.  Each feature is ranked
    by its dispersion relative to other features that have a similar mean count.  The top `plsa_features` as ranked by
    this method will then be used for the PLSA.

    One *cannot* select to subset number of barcodes to use because of the intricacies of PLSA. It is still available as
    an optional input to match the API for lsa and pca subroutines included in this package.

    Args:
        matrix (CountMatrix): The matrix to perform PLSA on.
        plsa_features (int): Number of features to subset from matrix and use in PLSA. The top plsa_features ranked by
                            dispersion are used.
        plsa_bcs (int): Ignored; PLSA does not support barcode subsetting (accepted only for API compatibility).
        n_plsa_components (int): How many PLSA components should be used.
        random_state (int): The seed for the RNG
        min_count_threshold (int): The minimum sum of each row/column for that row/column to be passed to PLSA
                                   (this filter is prior to any subsetting that occurs).
    Returns:
        A PLSA object
    """

    if not os.path.exists(temp_dir):
        raise Exception(
            'Temporary directory does not exist; it is needed to run the plsa binary. Aborting.'
        )

    if random_state is None:
        random_state = analysis_constants.RANDOM_STATE
    np.random.seed(0)

    # Threshold the rows/columns of the matrix; this raises an error if an empty matrix results.
    thresholded_matrix, thresholded_bcs, thresholded_features = matrix.select_axes_above_threshold(
        min_count_threshold)

    # If requested, we can subsample some of the barcodes to get a smaller matrix for PLSA
    if plsa_bcs is not None:
        print("PLSA does not support subsetting barcodes; the plsa_bcs argument is ignored.")
    plsa_bcs = thresholded_matrix.bcs_dim
    plsa_bc_indices = np.arange(thresholded_matrix.bcs_dim)

    # If requested, select fewer features to use by selecting the features with highest normalized dispersion
    if plsa_features is None:
        plsa_features = thresholded_matrix.features_dim
    elif plsa_features > thresholded_matrix.features_dim:
        msg = (
            "You requested {} features but the matrix after thresholding only included {} features, "
            "so the smaller amount is being used.").format(
                plsa_features, thresholded_matrix.features_dim)
        print(msg)
        plsa_features = thresholded_matrix.features_dim
    # Calc mean and variance of counts after normalizing
    # But don't transform to log space, in order to preserve the mean-variance relationship
    m = analysis_stats.normalize_by_umi(thresholded_matrix)
    # Get mean and variance of rows
    (mu, var) = analysis_stats.summarize_columns(m.T)
    dispersion = analysis_stats.get_normalized_dispersion(
        mu.squeeze(), var.squeeze())  # TODO set number of bins?
    plsa_feature_indices = np.argsort(dispersion)[-plsa_features:]

    # Now determine how many components.
    if n_plsa_components is None:
        n_plsa_components = analysis_constants.PLSA_N_COMPONENTS_DEFAULT

    likely_matrix_rank = min(plsa_features, plsa_bcs)
    if likely_matrix_rank < n_plsa_components:
        print((
            "There are fewer nonzero features or barcodes ({}) than requested "
            "PLSA components ({}); reducing the number of components.").format(
                likely_matrix_rank, n_plsa_components))
        n_plsa_components = likely_matrix_rank

    if (likely_matrix_rank * 0.5) <= float(n_plsa_components):
        print(
            "Requested number of PLSA components is large relative to the matrix size; an exact approach to matrix factorization may be faster."
        )

    plsa_mat = thresholded_matrix.select_barcodes(
        plsa_bc_indices).select_features(plsa_feature_indices)

    # Write out sparse matrix without transforms
    # code picked up from save_mex
    plsa_mat.tocoo()  # CountMatrix.tocoo() is assumed to convert the underlying matrix in place
    out_matrix_fn = os.path.join(temp_dir, 'matrix.mtx')
    with open(out_matrix_fn, 'w') as stream:
        stream.write('%%MatrixMarket matrix {0} {1} {2}\n%%\n'.format(
            'coordinate', 'integer', 'general'))
        stream.write('%i %i %i\n' %
                     (plsa_mat.m.shape[0], plsa_mat.m.shape[1], plsa_mat.m.nnz))
        # write row, col, val in 1-based indexing
        for r, c, d in zip(plsa_mat.m.row + 1, plsa_mat.m.col + 1,
                           plsa_mat.m.data):
            stream.write("%i %i %i\n" % (r, c, d))

    del plsa_mat

    # Run plsa module, reading in sparse matrix
    # Iters and tol are designed for 15PCs
    proc = tk_subproc.Popen(
        [PLSA_BINPATH, out_matrix_fn, temp_dir,
         '--topics', str(n_plsa_components),
         '--iter', str(3000),
         '--tol', str(0.002),
         '--nt', str(threads)],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    stdout_data, stderr_data = proc.communicate()
    if proc.returncode != 0:
        print(stdout_data)
        raise Exception(
            "%s returned error code %d while running the plsa binary: %s" %
            (proc, proc.returncode, stderr_data))

    # Read back data
    transformed_plsa_em_matrix_file = os.path.join(temp_dir,
                                                   "transformed_matrix.csv")
    n_components_file = os.path.join(temp_dir, "components.csv")
    variance_explained_file = os.path.join(temp_dir, "topic_relevance.csv")
    org_rows_used = get_original_columns_used(thresholded_bcs, plsa_bc_indices)
    transformed_plsa_em_matrix = np.zeros((matrix.bcs_dim, n_plsa_components))
    transformed_plsa_em_matrix[org_rows_used, :] = np.genfromtxt(
        transformed_plsa_em_matrix_file, delimiter=",").astype('float64')
    org_cols_used = get_original_columns_used(thresholded_features,
                                              plsa_feature_indices)
    plsa_em_components = np.zeros((n_plsa_components, matrix.features_dim))
    plsa_em_components[:, org_cols_used] = np.genfromtxt(
        n_components_file, delimiter=",").astype('float64')
    variance_explained = np.genfromtxt(variance_explained_file,
                                       delimiter=",").astype('float64')

    # reorder components by variance explained as PLSA binary gives arbitrary order
    new_order = list(range(n_plsa_components))
    variance_explained, new_order = zip(
        *sorted(zip(variance_explained, new_order), reverse=True))
    variance_explained = np.array(variance_explained)
    new_order = list(new_order)
    plsa_em_components = plsa_em_components[new_order, :]
    transformed_plsa_em_matrix = transformed_plsa_em_matrix[:, new_order]

    # delete files
    cr_io.remove(transformed_plsa_em_matrix_file, allow_nonexisting=True)
    cr_io.remove(n_components_file, allow_nonexisting=True)
    cr_io.remove(variance_explained_file, allow_nonexisting=True)
    cr_io.remove(out_matrix_fn, allow_nonexisting=True)

    features_selected = np.array(
        [f.id for f in matrix.feature_ref.feature_defs])[org_cols_used]

    # sanity check dimensions
    assert transformed_plsa_em_matrix.shape == (matrix.bcs_dim,
                                                n_plsa_components)
    assert plsa_em_components.shape == (n_plsa_components, matrix.features_dim)
    assert variance_explained.shape == (n_plsa_components, )

    return PLSA(transformed_plsa_em_matrix, plsa_em_components,
                variance_explained, dispersion, features_selected)
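The zip/sorted idiom used above to reorder topics by relevance can be expressed more directly with np.argsort; a minimal equivalent sketch:

import numpy as np

variance_explained = np.array([0.1, 0.5, 0.2])
components = np.random.rand(3, 8)    # topics x features
transformed = np.random.rand(10, 3)  # barcodes x topics

new_order = np.argsort(variance_explained)[::-1]  # descending relevance
variance_explained = variance_explained[new_order]
components = components[new_order, :]
transformed = transformed[:, new_order]
assert variance_explained[0] == 0.5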
Example #6
def run_plsa(matrix,
             temp_dir,
             plsa_features=None,
             plsa_bcs=None,
             n_plsa_components=None,
             random_state=None,
             threads=1):
    if not os.path.exists(temp_dir):
        raise Exception(
            'Temporary directory does not exist; it is needed to run the plsa binary. Aborting.'
        )

    if plsa_features is None:
        plsa_features = matrix.features_dim
    if plsa_bcs is None:
        plsa_bcs = matrix.bcs_dim
    if n_plsa_components is None:
        n_plsa_components = analysis_constants.PLSA_N_COMPONENTS_DEFAULT
    if n_plsa_components > plsa_features:
        print("There are fewer nonzero features than PLSA components; reducing the number of components.")
        n_plsa_components = plsa_features
    if random_state is None:
        random_state = analysis_constants.RANDOM_STATE

    np.random.seed(random_state)

    # initialize PLSA subsets
    plsa_bc_indices = np.arange(matrix.bcs_dim)
    plsa_feature_indices = np.arange(matrix.features_dim)

    # NOTE: This is retained simply to follow PCA code
    # Calc mean and variance of counts after normalizing
    # Don't transform to log space in PLSA
    # Dispersion is not exactly meaningful after idf transform.
    m = analysis_stats.normalize_by_idf(matrix)
    (mu, var) = analysis_stats.summarize_columns(m.T)
    dispersion = analysis_stats.get_normalized_dispersion(
        mu.squeeze(), var.squeeze())  # TODO set number of bins?

    plsa_feature_indices = np.argsort(dispersion)[-plsa_features:]

    if plsa_bcs < matrix.bcs_dim:
        plsa_bc_indices = np.sort(
            np.random.choice(np.arange(matrix.bcs_dim),
                             size=plsa_bcs,
                             replace=False))

    plsa_mat, _, plsa_features_nonzero = matrix.select_barcodes(
        plsa_bc_indices).select_features(
            plsa_feature_indices).select_nonzero_axes()
    plsa_feature_nonzero_indices = plsa_feature_indices[plsa_features_nonzero]

    if plsa_mat.features_dim < 2 or plsa_mat.bcs_dim < 2:
        print "Matrix is too small for further downsampling - num_plsa_bcs and num_plsa_features will be ignored."
        plsa_mat, _, plsa_features_nonzero = matrix.select_nonzero_axes()
        plsa_feature_nonzero_indices = plsa_features_nonzero

    ### Write out sparse matrix without transforms
    plsa_mat.tocoo()
    out_matrix_fn = os.path.join(temp_dir, 'matrix.mtx')
    sp_io.mmwrite(out_matrix_fn,
                  plsa_mat.m,
                  field='integer',
                  symmetry='general')

    ### Run plsa module, reading in sparse matrix
    proc = tk_subproc.Popen([
        PLSA_BINPATH,
        out_matrix_fn,
        temp_dir,
        '--topics',
        str(n_plsa_components),
        '--nt',
        str(threads),
    ],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    stdout_data, stderr_data = proc.communicate()
    if proc.returncode != 0:
        print(stdout_data)
        raise Exception(
            "%s returned error code %d while running the plsa binary: %s" %
            (proc, proc.returncode, stderr_data))

    ### Read back data
    transformed_plsa_em_matrix_file = os.path.join(temp_dir,
                                                   "transformed_matrix.csv")
    n_components_file = os.path.join(temp_dir, "components.csv")
    variance_explained_file = os.path.join(temp_dir, "topic_relevance.csv")
    transformed_plsa_em_matrix = np.genfromtxt(transformed_plsa_em_matrix_file,
                                               delimiter=",").astype('float64')
    plsa_em_components = np.zeros((n_plsa_components, matrix.features_dim))
    plsa_em_components[:, plsa_feature_nonzero_indices] = np.genfromtxt(
        n_components_file, delimiter=",").astype('float64')
    variance_explained = np.genfromtxt(variance_explained_file,
                                       delimiter=",").astype('float64')

    ### reorder components by variance explained as PLSA binary gives arbitrary order
    new_order = list(range(n_plsa_components))
    variance_explained, new_order = zip(
        *sorted(zip(variance_explained, new_order), reverse=True))
    variance_explained = np.array(variance_explained)
    new_order = list(new_order)
    plsa_em_components = plsa_em_components[new_order, :]
    transformed_plsa_em_matrix = transformed_plsa_em_matrix[:, new_order]

    ### delete files
    cr_io.remove(transformed_plsa_em_matrix_file, allow_nonexisting=True)
    cr_io.remove(n_components_file, allow_nonexisting=True)
    cr_io.remove(variance_explained_file, allow_nonexisting=True)
    cr_io.remove(out_matrix_fn, allow_nonexisting=True)

    features_selected = np.array([
        f.id for f in matrix.feature_ref.feature_defs
    ])[plsa_feature_nonzero_indices]

    # sanity check dimensions
    assert transformed_plsa_em_matrix.shape == (matrix.bcs_dim,
                                                n_plsa_components)
    assert plsa_em_components.shape == (n_plsa_components, matrix.features_dim)
    assert variance_explained.shape == (n_plsa_components, )

    return PLSA(transformed_plsa_em_matrix, plsa_em_components,
                variance_explained, dispersion, features_selected)
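tk_subproc.Popen is assumed to be a thin wrapper over the standard library's subprocess; the Popen/communicate/returncode pattern in both run_plsa variants is equivalent to this hedged sketch (the binary path and flags here are placeholders, not the real CLI):

import subprocess

def run_binary(binpath, args):
    proc = subprocess.run([binpath] + list(args),
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE)
    if proc.returncode != 0:
        print(proc.stdout.decode(errors='replace'))
        raise RuntimeError("%s exited with code %d: %s" %
                           (binpath, proc.returncode,
                            proc.stderr.decode(errors='replace')))
    return proc.stdout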
Example #7
def run_pca(matrix,
            pca_features=None,
            pca_bcs=None,
            n_pca_components=None,
            random_state=None,
            min_count_threshold=0):
    """ Run a PCA on the matrix using the IRLBA matrix factorization algorithm.  Prior to the PCA analysis, the
    matrix is modified so that all barcodes/columns have the same counts, and then the counts are transformed
    by a log2(1+X) operation.

    If desired, only a subset of features (e.g. sample rows) can be selected for PCA analysis.  Each feature is ranked
    by its dispersion relative to other features that have a similar mean count.  The top `pca_features` as ranked by
    this method will then be used for the PCA.

    One can also select to subset number of barcodes to use (e.g. sample columns), but in this case they are simply
    randomly sampled.

    Args:
        matrix (CountMatrix): The matrix to perform PCA on.
        pca_features (int): Number of features to subset from matrix and use in PCA. The top pca_features ranked by
                            dispersion are used.
        pca_bcs (int): Number of barcodes to randomly sample for the matrix.
        n_pca_components (int): How many PCA components should be used.
        random_state (int): The seed for the RNG
        min_count_threshold (int): The minimum sum of each row/column for that row/column to be passed to PCA
                                   (this filter is prior to any subsetting that occurs).
    Returns:
        A PCA object
    """
    if random_state is None:
        random_state = analysis_constants.RANDOM_STATE
    np.random.seed(0)

    # Threshold the rows/columns of the matrix; this raises an error if an empty matrix results.
    thresholded_matrix, _, thresholded_features = matrix.select_axes_above_threshold(
        min_count_threshold)

    # If requested, we can subsample some of the barcodes to get a smaller matrix for PCA
    pca_bc_indices = np.arange(thresholded_matrix.bcs_dim)
    if pca_bcs is None:
        pca_bcs = thresholded_matrix.bcs_dim
        pca_bc_indices = np.arange(thresholded_matrix.bcs_dim)
    elif pca_bcs < thresholded_matrix.bcs_dim:
        pca_bc_indices = np.sort(
            np.random.choice(np.arange(thresholded_matrix.bcs_dim),
                             size=pca_bcs,
                             replace=False))
    elif pca_bcs > thresholded_matrix.bcs_dim:
        msg = (
            "You requested {} barcodes but the matrix after thresholding only "
            "included {}, so the smaller amount is being used.").format(
                pca_bcs, thresholded_matrix.bcs_dim)
        print(msg)
        pca_bcs = thresholded_matrix.bcs_dim
        pca_bc_indices = np.arange(thresholded_matrix.bcs_dim)

    # If requested, select fewer features to use by selecting the features with highest normalized dispersion
    if pca_features is None:
        pca_features = thresholded_matrix.features_dim
    elif pca_features > thresholded_matrix.features_dim:
        msg = (
            "You requested {} features but the matrix after thresholding only included {} features, "
            "so the smaller amount is being used.").format(
                pca_features, thresholded_matrix.features_dim)
        print(msg)
        pca_features = thresholded_matrix.features_dim
    # Calc mean and variance of counts after normalizing
    # But don't transform to log space, in order to preserve the mean-variance relationship
    m = analysis_stats.normalize_by_umi(thresholded_matrix)
    # Get mean and variance of rows
    (mu, var) = analysis_stats.summarize_columns(m.T)
    dispersion = analysis_stats.get_normalized_dispersion(
        mu.squeeze(), var.squeeze())  # TODO set number of bins?
    pca_feature_indices = np.argsort(dispersion)[-pca_features:]

    # Now determine how many components.
    if n_pca_components is None:
        n_pca_components = analysis_constants.PCA_N_COMPONENTS_DEFAULT
    likely_matrix_rank = min(pca_features, pca_bcs)
    if likely_matrix_rank < n_pca_components:
        if min_count_threshold == DEFAULT_RUNPCA_THRESHOLD:
            # Kick back to the run_pca stage so it can retry with no threshold; this is for historical reasons
            raise MatrixRankTooSmallException()
        else:
            print((
                "There are fewer nonzero features or barcodes ({}) than requested "
                "PCA components ({}); reducing the number of components."
            ).format(likely_matrix_rank, n_pca_components))
            n_pca_components = likely_matrix_rank

    if (likely_matrix_rank * 0.5) <= float(n_pca_components):
        print(
            "Requested number of PCA components is large relative to the matrix size; an exact approach to matrix factorization may be faster."
        )

    # Note, after subsetting it is possible some rows/cols in pca_mat have counts below the threshold.
    # However, we are not performing a second thresholding as in practice subsetting is not used and we explain
    # that thresholding occurs prior to subsetting in the doc string.
    pca_mat = thresholded_matrix.select_barcodes(
        pca_bc_indices).select_features(pca_feature_indices)
    (pca_norm_mat, pca_center, pca_scale) = normalize_and_transpose(pca_mat)
    (u, d, v, _, _) = irlb(pca_norm_mat,
                           n_pca_components,
                           center=pca_center.squeeze(),
                           scale=pca_scale.squeeze(),
                           random_state=random_state)

    # make sure to project the matrix before centering, to avoid densification
    (full_norm_mat, full_center, full_scale) = normalize_and_transpose(matrix)
    sparsefuncs.inplace_column_scale(
        full_norm_mat, 1 / full_scale.squeeze())  # can have some zeros here
    # Get a coordinate map so we know which columns in the old matrix correspond to columns in the new
    org_cols_used = get_original_columns_used(thresholded_features,
                                              pca_feature_indices)
    transformed_irlba_matrix = full_norm_mat[:, org_cols_used].dot(v) - (
        full_center / full_scale)[:, org_cols_used].dot(v)
    irlba_components = np.zeros((n_pca_components, matrix.features_dim))
    irlba_components[:, org_cols_used] = v.T

    # calc proportion of variance explained
    # each feature has variance=1, mean=0 after normalization
    variance_sum = len(pca_feature_indices)
    variance_explained = np.square(d) / ((len(pca_bc_indices) - 1) * variance_sum)
    features_selected = np.array(
        [f.id for f in matrix.feature_ref.feature_defs])[org_cols_used]

    # Now project back up the dispersion to return.
    full_dispersion = np.empty(matrix.features_dim)
    full_dispersion[:] = np.nan
    full_dispersion[thresholded_features] = dispersion

    # sanity check dimensions
    assert transformed_irlba_matrix.shape == (matrix.bcs_dim, n_pca_components)
    assert irlba_components.shape == (n_pca_components, matrix.features_dim)
    assert variance_explained.shape == (n_pca_components, )

    return PCA(transformed_irlba_matrix, irlba_components, variance_explained,
               full_dispersion, features_selected)
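A dense sanity check of run_pca's variance-explained formula: after standardizing columns, each feature contributes sample variance 1, so the total variance equals the number of features and component i explains d_i^2 / (n - 1) of it. numpy's SVD stands in for irlb here:

import numpy as np

X = np.random.rand(200, 10)
Xs = (X - X.mean(axis=0)) / X.std(axis=0, ddof=1)  # standardize columns
_, d, _ = np.linalg.svd(Xs, full_matrices=False)
variance_explained = np.square(d) / ((Xs.shape[0] - 1) * Xs.shape[1])
assert np.isclose(variance_explained.sum(), 1.0)   # the full SVD captures all variance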