def setUpClass(self):
    """Load the 10X test data once and cache several views of it.

    Stores on ``self`` the dense and sparse results of ``data.load_10X``,
    the ``to_numpy()`` and ``sparse.to_coo()`` conversions, and the index
    and columns of the dense load.
    """
    dense = data.load_10X(sparse=False)
    sparse_frame = data.load_10X(sparse=True)

    self.X_dense = dense
    self.X_sparse = sparse_frame
    self.X_numpy = dense.to_numpy()
    self.X_coo = sparse_frame.sparse.to_coo()
    self.cell_names = dense.index
    self.gene_names = dense.columns
def test_10X():
    """Check ``data.load_10X`` output shape, type and gene labels, plus
    the error messages for invalid labels and invalid input paths."""
    expected_shape = (100, 100)

    # default arguments
    X = data.load_10X()
    assert X.shape == expected_shape
    assert isinstance(X, pd.SparseDataFrame)
    assert X.columns[0] == "Arl8b"

    # gene ids, dense output
    X = data.load_10X(gene_labels='id', sparse=False)
    assert X.shape == expected_shape
    assert isinstance(X, pd.DataFrame)
    assert not isinstance(X, pd.SparseDataFrame)
    assert X.columns[0] == "ENSMUSG00000030105"

    # symbol and id combined
    X = data.load_10X(gene_labels='both')
    assert X.shape == expected_shape
    assert isinstance(X, pd.SparseDataFrame)
    assert X.columns[0] == "Arl8b (ENSMUSG00000030105)"

    assert_raise_message(
        ValueError,
        "gene_labels='invalid' not recognized. "
        "Choose from ['symbol', 'id', 'both']",
        data.load_10X,
        gene_labels='invalid')

    # a zip file is not an acceptable directory argument
    zip_path = os.path.join(data.data_dir, "test_10X.zip")
    assert_raise_message(
        FileNotFoundError,
        "{} is not a directory".format(zip_path),
        scprep.io.load_10X,
        zip_path)

    # a directory without the expected files is rejected
    assert_raise_message(
        FileNotFoundError,
        "'matrix.mtx', 'genes.tsv', and 'barcodes.tsv' must be present "
        "in {}".format(data.data_dir),
        scprep.io.load_10X,
        data.data_dir)
def test_gene_expression_filter_sample_label():
    """Filtered data and returned sample labels must have equal length."""
    X = data.load_10X(sparse=False)
    gene_idx = np.arange(10)
    labels_in = pd.DataFrame(np.arange(X.shape[0]), index=X.index)

    filtered = scprep.filter.filter_gene_set_expression(
        X, gene_idx, percentile=90, sample_labels=labels_in)
    X_filtered, labels_out = filtered

    assert len(labels_out) == X_filtered.shape[0]
def test_combine_batches_uncommon_genes():
    """Combining batches with differing columns warns about subsetting."""
    X = data.load_10X()
    n_kept = X.shape[1] // 2
    Y = X.iloc[:, :n_kept]

    expected_warning = (
        "Input data has inconsistent column names. "
        "Subsetting to {} common columns.".format(Y.shape[1])
    )
    assert_warns_message(
        UserWarning,
        expected_warning,
        scprep.utils.combine_batches,
        [X, Y],
        ['x', 'y'])
def test_combine_batches():
    """Compare ``combine_batches`` against ``pd.concat`` of the data with
    its own first half, with and without appending to cell names."""
    X = data.load_10X()

    def _first_half(mat):
        # rows 0 .. n // 2 - 1 of ``mat``
        return scprep.utils.select_rows(mat, np.arange(mat.shape[0] // 2))

    Y = pd.concat([X, _first_half(X)])

    Y2, sample_labels = scprep.utils.combine_batches(
        [X, _first_half(X)], batch_labels=[0, 1])
    assert utils.assert_matrix_class_equivalent(Y, Y2)
    utils.assert_all_equal(Y, Y2)
    assert np.all(Y.index == Y2.index)
    expected_labels = np.concatenate(
        [np.repeat(0, X.shape[0]), np.repeat(1, X.shape[0] // 2)])
    assert np.all(sample_labels == expected_labels)

    Y2, sample_labels = scprep.utils.combine_batches(
        [X, _first_half(X)],
        batch_labels=[0, 1],
        append_to_cell_names=True)
    # the last two characters of each new name are "_<batch label>"
    stripped_names = np.array([name[:-2] for name in Y2.index])
    assert np.all(Y.index == stripped_names)
    suffixes = np.array([name[-2:] for name in Y2.index], dtype=str)
    assert np.all(
        np.core.defchararray.add("_", sample_labels.astype(str)) == suffixes)

    def transform(X):
        combined = scprep.utils.combine_batches(
            [X, _first_half(X)], batch_labels=[0, 1])
        return combined[0]

    matrix.test_matrix_types(
        X,
        utils.assert_transform_equals,
        matrix._indexable_matrix_types,
        Y=Y,
        transform=transform,
        check=utils.assert_all_equal)
def test_combine_batches_errors():
    """Check the warning and error messages of ``combine_batches``."""
    X = data.load_10X()
    combine = scprep.utils.combine_batches
    half_rows = X.shape[0] // 2
    half_cols = X.shape[1] // 2

    # append_to_cell_names with non-DataFrame input only warns
    assert_warns_message(
        UserWarning,
        "append_to_cell_names only valid for pd.DataFrame input. "
        "Got coo_matrix",
        combine,
        [X.to_coo(), X.iloc[:half_rows].to_coo()],
        batch_labels=[0, 1],
        append_to_cell_names=True)

    # mixed input classes
    assert_raise_message(
        TypeError,
        "Expected data all of the same class. Got SparseDataFrame, coo_matrix",
        combine,
        [X, X.iloc[:half_rows].to_coo()],
        batch_labels=[0, 1])

    # differing numbers of columns
    assert_raise_message(
        ValueError,
        "Expected data all with the same number of columns. "
        "Got {}, {}".format(X.shape[1], half_cols),
        combine,
        [X, scprep.utils.select_cols(X, np.arange(half_cols))],
        batch_labels=[0, 1])

    # data and batch_labels of different lengths
    assert_raise_message(
        ValueError,
        "Expected data (2) and batch_labels (1) to be the same length.",
        combine,
        [X, scprep.utils.select_rows(X, np.arange(half_rows))],
        batch_labels=[0])

    # unsupported element type
    assert_raise_message(
        ValueError,
        "Expected data to contain pandas DataFrames, "
        "scipy sparse matrices or numpy arrays. Got str",
        combine,
        ["hello", "world"],
        batch_labels=[0, 1])
def test_10X_HDF5():
    """Load the HDF5 test file with the default and ``h5py`` backends and
    compare against ``data.load_10X``; then check the error messages."""
    X = data.load_10X()
    h5_file = os.path.join(data.data_dir, "test_10X.h5")

    def _check_matches_reference(loaded):
        assert isinstance(loaded, pd.SparseDataFrame)
        assert np.sum(np.sum(X != loaded)) == 0
        np.testing.assert_array_equal(X.columns, loaded.columns)
        np.testing.assert_array_equal(X.index, loaded.index)

    # tables backend
    _check_matches_reference(scprep.io.load_10X_HDF5(h5_file))
    # h5py backend
    _check_matches_reference(
        scprep.io.load_10X_HDF5(h5_file, backend='h5py'))

    # each entry: (expected message, offending keyword argument)
    error_cases = [
        ("Genome invalid not found in {}. "
         "Available genomes: GRCh38".format(h5_file),
         {"genome": "invalid"}),
        ("Expected backend in ['tables', 'h5py']. Got invalid",
         {"backend": "invalid"}),
        ("gene_labels='invalid' not recognized. "
         "Choose from ['symbol', 'id', 'both']",
         {"gene_labels": 'invalid'}),
    ]
    for message, bad_kwargs in error_cases:
        assert_raise_message(
            ValueError,
            message,
            scprep.io.load_10X_HDF5,
            filename=h5_file,
            **bad_kwargs)
def test_deprecated():
    """Each listed ``scprep.utils`` function must emit a FutureWarning
    pointing at its ``scprep.select`` replacement."""
    X = data.load_10X()

    # (function name, extra positional args, keyword args)
    calls = [
        ("select_cols", ([1, 2, 3],), {}),
        ("select_rows", ([1, 2, 3],), {}),
        ("get_gene_set", (), {"starts_with": "D"}),
        ("get_cell_set", (), {"starts_with": "A"}),
        ("subsample", (), {"n": 10}),
    ]
    for name, extra_args, kwargs in calls:
        assert_warns_message(
            FutureWarning,
            "`scprep.utils.{0}` is deprecated. Use "
            "`scprep.select.{0}` instead.".format(name),
            getattr(scprep.utils, name),
            X,
            *extra_args,
            **kwargs)
def test_10X_HDF5_cellranger3():
    """Load the cellranger3 HDF5 test file with explicit and automatic
    backend selection and compare against ``data.load_10X``."""
    X = data.load_10X()
    h5_file = os.path.join(data.data_dir, "test_10X_cellranger3.h5")

    def _check_matches_reference(loaded):
        assert scprep.utils.is_sparse_dataframe(loaded)
        assert np.sum(np.sum(X != loaded)) == 0
        np.testing.assert_array_equal(X.columns, loaded.columns)
        np.testing.assert_array_equal(X.index, loaded.index)

    # explicit tables backend
    _check_matches_reference(
        scprep.io.load_10X_HDF5(h5_file, backend="tables"))

    # explicit h5py backend
    _check_matches_reference(
        scprep.io.load_10X_HDF5(h5_file, backend="h5py"))

    # automatic tables backend: h5py is masked in sys.modules during the load
    with mock.patch.dict(sys.modules, {"h5py": None}):
        loaded = scprep.io.load_10X_HDF5(h5_file)
    _check_matches_reference(loaded)

    # automatic h5py backend: tables is masked in sys.modules during the load
    with mock.patch.dict(sys.modules, {"tables": None}):
        loaded = scprep.io.load_10X_HDF5(h5_file)
    _check_matches_reference(loaded)
def test_mean_difference():
    """Check ``mean_difference`` extrema on the 10X data, its agreement
    across matrix types, and the column-mismatch error."""
    X = scprep.filter.filter_empty_genes(data.load_10X())

    Y = scprep.stats.mean_difference(X.iloc[:20], X.iloc[20:100])
    assert np.allclose(np.max(Y), 16.8125)
    assert np.allclose(np.min(Y), -0.5625)

    def _split_mean_difference(X, **kwargs):
        # same 20 / 80 row split as above, via select_rows
        head = scprep.select.select_rows(X, idx=np.arange(20))
        tail = scprep.select.select_rows(X, idx=np.arange(20, 100))
        return scprep.stats.mean_difference(head, tail, **kwargs)

    matrix.test_all_matrix_types(
        X,
        utils.assert_transform_equals,
        Y=Y,
        transform=_split_mean_difference,
        check=utils.assert_all_close,
    )

    expected_error = (
        "Expected X and Y to have the same number of columns. "
        "Got shapes {}, {}".format(X.shape, X.iloc[:, :10].shape)
    )
    utils.assert_raises_message(
        ValueError,
        expected_error,
        scprep.stats.mean_difference,
        X,
        X.iloc[:, :10],
    )
def test_check_numeric_inplace():
    """Check ``check_numeric`` with ``copy=False`` across matrix types and
    the TypeErrors it raises for unsupported inputs."""
    X = data.load_10X()

    inplace_types = (
        matrix._scipy_matrix_types
        + matrix._numpy_matrix_types
        + matrix._pandas_dense_matrix_types
        + [matrix.SparseDataFrame]
    )
    matrix.test_matrix_types(
        X,
        utils.assert_transform_unchanged,
        inplace_types,
        transform=scprep.sanitize.check_numeric,
        copy=False,
    )

    if matrix._pandas_0:
        # NOTE(review): block extent reconstructed from a flattened source
        matrix._ignore_pandas_sparse_warning()
        utils.assert_raises_message(
            TypeError,
            "pd.SparseDataFrame does not support "
            "copy=False. Please use copy=True.",
            scprep.sanitize.check_numeric,
            data=matrix.SparseDataFrame_deprecated(X),
            copy=False,
        )
        matrix._reset_warnings()

    class TypeErrorClass(object):
        # astype takes no ``copy`` keyword
        def astype(self, dtype):
            return

    X = TypeErrorClass()
    utils.assert_raises_message(
        TypeError,
        "astype() got an unexpected keyword argument 'copy'",
        scprep.sanitize.check_numeric,
        data=X,
        copy=None,
    )
def test_10X_HDF5_cellranger3():
    """Load the cellranger3 HDF5 test file with each backend choice and
    compare against ``data.load_10X``.

    The final case removes ``scprep.io.hdf5.tables`` for the duration of
    the load. The attribute is restored in a ``finally`` clause so that a
    failing assertion cannot leave the module without it for later tests.
    """
    X = data.load_10X()
    h5_file = os.path.join(data.data_dir, "test_10X_cellranger3.h5")

    def _check_matches_reference(loaded):
        assert isinstance(loaded, pd.SparseDataFrame)
        assert np.sum(np.sum(X != loaded)) == 0
        np.testing.assert_array_equal(X.columns, loaded.columns)
        np.testing.assert_array_equal(X.index, loaded.index)

    # automatic tables backend
    _check_matches_reference(scprep.io.load_10X_HDF5(h5_file))

    # explicit tables backend
    _check_matches_reference(
        scprep.io.load_10X_HDF5(h5_file, backend='tables'))

    # explicit h5py backend
    _check_matches_reference(
        scprep.io.load_10X_HDF5(h5_file, backend='h5py'))

    # automatic h5py backend
    tables = scprep.io.hdf5.tables
    del scprep.io.hdf5.tables
    try:
        _check_matches_reference(scprep.io.load_10X_HDF5(h5_file))
    finally:
        # always put the attribute back, even when a check fails
        scprep.io.hdf5.tables = tables
def test_10X_HDF5():
    """Load the HDF5 test file with each backend choice and compare
    against ``data.load_10X``.

    The final case removes ``scprep.io.hdf5.tables`` for the duration of
    the load. The attribute is restored in a ``finally`` clause so that a
    failing assertion cannot leave the module without it for later tests.
    """
    X = data.load_10X()
    h5_file = os.path.join(data.data_dir, "test_10X.h5")

    def _check_matches_reference(loaded):
        assert scprep.utils.is_sparse_dataframe(loaded)
        assert np.sum(np.sum(X != loaded)) == 0
        np.testing.assert_array_equal(X.columns, loaded.columns)
        np.testing.assert_array_equal(X.index, loaded.index)

    # automatic tables backend
    _check_matches_reference(scprep.io.load_10X_HDF5(h5_file))

    # explicit tables backend
    _check_matches_reference(
        scprep.io.load_10X_HDF5(h5_file, backend="tables"))

    # explicit h5py backend
    _check_matches_reference(
        scprep.io.load_10X_HDF5(h5_file, backend="h5py"))

    # automatic h5py backend
    tables = scprep.io.hdf5.tables
    del scprep.io.hdf5.tables
    try:
        _check_matches_reference(scprep.io.load_10X_HDF5(h5_file))
    finally:
        # always put the attribute back, even when a check fails
        scprep.io.hdf5.tables = tables
def test_check_numeric_copy():
    """Run ``check_numeric`` with ``copy=True`` over all matrix types,
    checked with ``utils.assert_transform_unchanged``."""
    reference = data.load_10X()
    transform_kwargs = dict(
        transform=scprep.sanitize.check_numeric,
        copy=True,
    )
    matrix.test_all_matrix_types(
        reference, utils.assert_transform_unchanged, **transform_kwargs)
def test_10X_zip_url():
    """``load_10X_zip`` given a URL must match ``data.load_10X``."""
    url = (
        "https://github.com/KrishnaswamyLab/scprep/raw/master/"
        "data/test_data/test_10X.zip"
    )
    reference = data.load_10X()
    from_zip = scprep.io.load_10X_zip(url)

    assert scprep.utils.is_sparse_dataframe(from_zip)
    n_mismatched = np.sum(np.sum(reference != from_zip))
    assert n_mismatched == 0
    np.testing.assert_array_equal(reference.columns, from_zip.columns)
    np.testing.assert_array_equal(reference.index, from_zip.index)
def test_remove_empty_cells_sample_label():
    """Sample labels returned by ``remove_empty_cells`` must be those of
    the rows whose sum is positive."""
    X = data.load_10X(sparse=False)
    labels_in = np.arange(X.shape[0])
    expected_labels = labels_in[X.sum(1) > 0]

    X_filtered, labels_out = scprep.filter.remove_empty_cells(
        X, sample_labels=labels_in)

    assert len(labels_out) == X_filtered.shape[0]
    assert np.all(labels_out == expected_labels)
def test_10X_zip(filename):
    """``load_10X_zip`` on ``filename`` (resolved relative to
    ``data.data_dir``) must match ``data.load_10X``."""
    reference = data.load_10X()
    zip_path = os.path.join(data.data_dir, filename)
    from_zip = scprep.io.load_10X_zip(zip_path)

    assert scprep.utils.is_sparse_dataframe(from_zip)
    n_mismatched = np.sum(np.sum(reference != from_zip))
    assert n_mismatched == 0
    np.testing.assert_array_equal(reference.columns, from_zip.columns)
    np.testing.assert_array_equal(reference.index, from_zip.index)
def test_remove_empty_cells():
    """``remove_empty_cells`` keeps all columns, leaves no zero-sum rows,
    and gives the same result for every matrix type."""
    X = data.load_10X(sparse=False)
    X_filtered = scprep.filter.remove_empty_cells(X)

    assert X.shape[1] == X_filtered.shape[1]
    row_totals = X_filtered.sum(1)
    assert not np.any(row_totals == 0)

    matrix.test_all_matrix_types(
        X,
        utils.assert_transform_equals,
        Y=X_filtered,
        transform=scprep.filter.remove_empty_cells)
def test_download_url():
    """Download the test matrix by URL and compare it with ``data.load_10X``.

    The downloaded file is removed in a ``finally`` clause so that a failed
    download, load or assertion does not leave it in the working directory.
    """
    X = data.load_10X()
    local_file = "url_test.mtx.gz"
    try:
        scprep.io.download.download_url(
            "https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_10X/matrix.mtx.gz",
            local_file,
        )
        # the loaded matrix is transposed before comparison with X
        Y = scprep.io.load_mtx(local_file).T
        assert (X.sparse.to_coo() - Y).nnz == 0
    finally:
        # the file may not exist if the download itself failed
        if os.path.exists(local_file):
            os.remove(local_file)
def test_library_size_filter_error():
    """An unrecognised ``keep_cells`` value raises a ValueError."""
    X = data.load_10X(sparse=True)
    expected_message = (
        "Expected `keep_cells` in ['above', 'below']. Got invalid"
    )
    assert_raise_message(
        ValueError,
        expected_message,
        scprep.filter.filter_library_size,
        X,
        100,
        keep_cells='invalid')
def test_remove_rare_genes():
    """``remove_rare_genes`` keeps all rows, leaves no column with a sum
    below 5, and gives the same result for every matrix type."""
    X = data.load_10X(sparse=False)
    X_filtered = scprep.filter.remove_rare_genes(X)

    assert X.shape[0] == X_filtered.shape[0]
    column_totals = X_filtered.sum(0)
    assert not np.any(column_totals < 5)

    matrix.test_all_matrix_types(
        X,
        utils.assert_transform_equals,
        Y=X_filtered,
        transform=scprep.filter.remove_rare_genes)
def test_10X():
    """Check ``load_10X`` output shape, type and gene labels, agreement
    with the cellranger3 test directory, and the error messages."""
    expected_shape = (100, 100)

    # default arguments
    X = data.load_10X()
    assert X.shape == expected_shape
    assert scprep.utils.is_sparse_dataframe(X)
    assert X.columns[0] == "Arl8b"

    # gene ids, dense output
    X = data.load_10X(gene_labels="id", sparse=False)
    assert X.shape == expected_shape
    assert isinstance(X, pd.DataFrame)
    assert not scprep.utils.is_sparse_dataframe(X)
    assert X.columns[0] == "ENSMUSG00000030105"

    # symbol and id combined
    X = data.load_10X(gene_labels="both")
    assert X.shape == expected_shape
    assert scprep.utils.is_sparse_dataframe(X)
    assert X.columns[0] == "Arl8b (ENSMUSG00000030105)"

    # cellranger3 directory must give the same labels
    cellranger3_dir = os.path.join(data.data_dir, "test_10X_cellranger3")
    X_cellranger3 = scprep.io.load_10X(cellranger3_dir, gene_labels="both")
    np.testing.assert_array_equal(X.index, X_cellranger3.index)
    np.testing.assert_array_equal(X.columns, X_cellranger3.columns)
    np.testing.assert_array_equal(X.index, X_cellranger3.index)

    utils.assert_raises_message(
        ValueError,
        "gene_labels='invalid' not recognized. "
        "Choose from ['symbol', 'id', 'both']",
        data.load_10X,
        gene_labels="invalid",
    )

    # a zip file is not an acceptable directory argument
    zip_path = os.path.join(data.data_dir, "test_10X.zip")
    utils.assert_raises_message(
        FileNotFoundError,
        "{} is not a directory".format(zip_path),
        scprep.io.load_10X,
        zip_path,
    )

    # a directory without the expected files is rejected
    utils.assert_raises_message(
        FileNotFoundError,
        "'matrix.mtx(.gz)', '[genes/features].tsv(.gz)', and "
        "'barcodes.tsv(.gz)' must be present "
        "in {}".format(data.data_dir),
        scprep.io.load_10X,
        data.data_dir,
    )
def test_gene_expression_filter_warning():
    """Check the warnings and errors of ``filter_gene_set_expression`` and
    the zero-column warning of ``select_cols``.

    The unused local ``gene_outside_range`` has been removed; every call
    and expected message is unchanged.
    """
    X = data.load_10X(sparse=True)
    genes = np.arange(10)
    no_genes = 'not_a_gene'

    # a fractional percentile triggers a warning
    assert_warns_message(
        UserWarning,
        "`percentile` expects values between 0 and 100."
        "Got 0.9. Did you mean 90.0?",
        scprep.filter.filter_gene_set_expression,
        X, genes, percentile=0.90, keep_cells='below')

    # cutoff and percentile are mutually exclusive
    assert_raise_message(
        ValueError,
        "Only one of `cutoff` and `percentile` should be given.",
        scprep.filter.filter_gene_set_expression,
        X, genes, percentile=0.90, cutoff=50)

    # unrecognised keep_cells value
    assert_raise_message(
        ValueError,
        "Expected `keep_cells` in ['above', 'below']. "
        "Got neither",
        scprep.filter.filter_gene_set_expression,
        X, genes, percentile=90.0, keep_cells='neither')

    assert_warns_message(
        UserWarning,
        "`percentile` expects values between 0 and 100."
        "Got 0.9. Did you mean 90.0?",
        scprep.filter.filter_gene_set_expression,
        X, genes, percentile=0.90, keep_cells='below')

    # neither cutoff nor percentile given
    assert_raise_message(
        ValueError,
        "One of either `cutoff` or `percentile` must be given.",
        scprep.filter.filter_gene_set_expression,
        X, genes, cutoff=None, percentile=None)

    # a gene label absent from the columns
    assert_raise_message(
        KeyError,
        "the label [not_a_gene] is not in the [columns]",
        scprep.filter.filter_gene_set_expression,
        X, no_genes, percentile=90.0, keep_cells='below')

    # the mask ``X.sum(axis=0) < 0`` is expected to select no columns
    assert_warns_message(
        UserWarning,
        "Selecting 0 columns",
        scprep.utils.select_cols, X, (X.sum(axis=0) < 0))
def test_library_size_filter():
    """``filter_library_size`` with cutoff 100 keeps all columns, leaves no
    row with a sum of 100 or less, and agrees across matrix types."""
    X = data.load_10X(sparse=True)
    X_filtered = scprep.filter.filter_library_size(X, 100)

    assert X.shape[1] == X_filtered.shape[1]
    row_totals = X_filtered.sum(1)
    assert not np.any(row_totals <= 100)

    filter_at_100 = partial(scprep.filter.filter_library_size, cutoff=100)
    matrix.test_all_matrix_types(
        X,
        utils.assert_transform_equals,
        Y=X_filtered,
        transform=filter_at_100)
def test_differential_expression_error():
    """Check the error and warning messages of ``differential_expression``
    and ``differential_expression_by_cluster``."""
    X = data.load_10X()
    diff_exp = scprep.stats.differential_expression

    # unrecognised direction
    utils.assert_raises_message(
        ValueError,
        "Expected `direction` in ['up', 'down', 'both']. "
        "Got invalid",
        diff_exp,
        X,
        X,
        direction="invalid",
    )

    # unrecognised measure
    utils.assert_raises_message(
        ValueError,
        "Expected `measure` in ['difference', 'emd', 'ttest', 'ranksum']. "
        "Got invalid",
        diff_exp,
        X,
        X,
        measure="invalid",
    )

    # second argument is a single row rather than a matrix
    single_row = X.iloc[0]
    utils.assert_raises_message(
        ValueError,
        "Expected `X` and `Y` to be matrices. "
        "Got shapes {}, {}".format(X.shape, single_row.shape),
        diff_exp,
        X,
        single_row,
    )

    # gene_names of the wrong length; both functions share the message
    n_names = X.shape[0] // 2
    gene_names_message = (
        "Expected gene_names to have length {}. "
        "Got {}".format(X.shape[0], n_names)
    )
    utils.assert_raises_message(
        ValueError,
        gene_names_message,
        diff_exp,
        X.sparse.to_coo(),
        X.sparse.to_coo(),
        gene_names=np.arange(n_names),
    )
    utils.assert_raises_message(
        ValueError,
        gene_names_message,
        scprep.stats.differential_expression_by_cluster,
        X.sparse.to_coo(),
        np.random.choice(2, X.shape[0], replace=True),
        gene_names=np.arange(n_names),
    )

    # inconsistent columns only warn
    utils.assert_warns_message(
        UserWarning,
        "Input data has inconsistent column names. "
        "Subsetting to 20 common columns.",
        diff_exp,
        X,
        X.iloc[:, :20],
    )
def test_download_zip():
    """Download and extract the zipped test data, then compare the loaded
    result with ``data.load_10X``.

    The extraction directory is removed in a ``finally`` clause so that a
    failed download, load or assertion does not leave it on disk.
    """
    X = data.load_10X()
    extract_dir = "zip_test"
    try:
        scprep.io.download.download_and_extract_zip(
            "https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_10X.zip",
            extract_dir,
        )
        Y = scprep.io.load_10X("zip_test/test_10X")
        assert np.all(X == Y)
        assert np.all(X.index == Y.index)
        assert np.all(X.columns == Y.columns)
    finally:
        # ignore_errors: the directory may not exist if the download failed,
        # and cleanup must not mask the original error
        shutil.rmtree(extract_dir, ignore_errors=True)
def test_combine_batches_rangeindex():
    """With a reset index on the input, ``combine_batches`` must return a
    RangeIndex and preserve the values of both batches."""
    X = data.load_10X().reset_index(drop=True)
    Y = X.iloc[:X.shape[0] // 2]

    data_combined, _labels = scprep.utils.combine_batches([X, Y], ["x", "y"])
    sorted_cols = np.sort(X.columns)

    assert isinstance(data_combined.index, pd.RangeIndex)
    assert np.all(np.sort(data_combined.columns) == sorted_cols)

    # compare by sorted column order: first 100 rows against X, rest against Y
    first_batch = data_combined.iloc[:100][sorted_cols].to_numpy()
    assert np.all(first_batch == X[sorted_cols].to_numpy())
    second_batch = data_combined.iloc[100:][sorted_cols].to_numpy()
    assert np.all(second_batch == Y[sorted_cols].to_numpy())
def test_unzip_destination():
    """Unzip a copy of the test archive into an explicit destination and
    compare the loaded result with ``data.load_10X``.

    The test asserts that the archive copy no longer exists after
    ``unzip``. Cleanup runs in a ``finally`` clause so that neither the
    copied archive nor the extraction directory is left behind on failure.
    """
    X = data.load_10X()
    filename = os.path.join(data.data_dir, "test_10X.zip")
    tmp_filename = "zip_extract_test.zip"
    destination = "zip_test"
    try:
        shutil.copyfile(filename, tmp_filename)
        scprep.io.download.unzip(tmp_filename, destination=destination)
        assert not os.path.isfile(tmp_filename)
        Y = scprep.io.load_10X("zip_test/test_10X")
        assert np.all(X == Y)
        assert np.all(X.index == Y.index)
        assert np.all(X.columns == Y.columns)
    finally:
        # the copy only survives if unzip failed or did not delete it
        if os.path.isfile(tmp_filename):
            os.remove(tmp_filename)
        # ignore_errors: cleanup must not mask the original error
        shutil.rmtree(destination, ignore_errors=True)
def test_plot_histogram():
    """Call the library-size and gene-set-expression plots, then check the
    error raised for an invalid ``ax`` argument."""
    X = data.load_10X()
    scprep.plot.plot_library_size(X, cutoff=1000, log=True)

    _fig, ax = plt.subplots()
    d_genes = scprep.utils.get_gene_set(X, starts_with="D")
    scprep.plot.plot_gene_set_expression(
        X, genes=d_genes, percentile=90, log='y', ax=ax)

    assert_raise_message(
        TypeError,
        "Expected ax as a matplotlib.axes.Axes. Got ",
        scprep.plot.plot_library_size,
        X,
        ax="invalid")
def test_differential_expression_by_cluster():
    """Each per-cluster result must equal a direct
    ``differential_expression`` call on that cluster versus the rest."""
    options = dict(measure="difference", direction="up")
    X = data.load_10X()

    # fixed seed so the random cluster assignment is reproducible
    np.random.seed(42)
    clusters = np.random.choice(4, X.shape[0], replace=True)

    result = scprep.stats.differential_expression_by_cluster(
        X, clusters, **options)

    for cluster in range(4):
        inside = scprep.select.select_rows(X, idx=clusters == cluster)
        outside = scprep.select.select_rows(X, idx=clusters != cluster)
        expected = scprep.stats.differential_expression(
            inside, outside, **options)
        assert np.all(result[cluster] == expected)