def test_removing_columns_Y():
    """Delete both Y columns, one at a time, and verify the matrix state.

    After each deletion the X matrix, the row labels and the X column
    labels must still match the defaults; only Y and its column labels
    are expected to shrink. The final state is passed to
    ``check_saving_and_loading``.
    """
    # Start from the default DatasetMatrix fixture.
    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    # Drop Y column 0; a single Y column should be left.
    matrix.delete_column_Y(0)

    want_X = default_matrix_X()
    remaining_Y = numpy.array([
        [102],
        [202],
        [302],
        [402]])
    want_Y = scipy.sparse.csr_matrix(remaining_Y)

    assert DatasetMatrix.sparse_equal(want_X, matrix.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, matrix.Y) is True
    assert default_row_labels() == matrix.row_labels
    assert default_column_labels_X() == matrix.column_labels_X
    assert ['coly1'] == matrix.column_labels_Y

    # Drop the only Y column left; Y becomes a 4x0 matrix.
    matrix.delete_column_Y(0)

    want_X = default_matrix_X()
    want_Y = scipy.sparse.csr_matrix((4, 0))

    assert DatasetMatrix.sparse_equal(want_X, matrix.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, matrix.Y) is True
    assert default_row_labels() == matrix.row_labels
    assert default_column_labels_X() == matrix.column_labels_X
    assert [] == matrix.column_labels_Y

    tmp_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__removing_columns_Y')
    check_saving_and_loading(matrix, tmp_folder)
def test_exds_build():
    """Build the default exds without saving and verify its train/test split.

    Checks performed:

    * the row counts of the train and test partitions (4 and 12 out of
      16 rows);
    * ``train_rows`` and ``test_rows`` are disjoint and together cover
      every row index;
    * every row of the full matrix, for both X and Y, equals the row at
      the corresponding position in the train or test matrix.
    """
    folder = testutil.ensure_empty_tmp_subfolder(
        'test_exds_repository__test_build')
    definition = default_exds_definition(folder)
    exds = definition.create_exds()
    exds.build(finalize_and_save=False)

    # Make sure 'training_set_size = 0.25' has been properly taken into
    # account.
    assert 16 == exds.total_row_count
    assert 4 == len(exds.train_rows)
    assert 12 == len(exds.test_rows)
    assert 4 == exds.matrix_train.X.get_shape()[0]
    assert 4 == exds.matrix_train.Y.get_shape()[0]
    assert 12 == exds.matrix_test.X.get_shape()[0]
    assert 12 == exds.matrix_test.Y.get_shape()[0]

    # Reconstruct the list of all row indices, to make sure the split is
    # consistent.
    all_rows = set(exds.train_rows) | set(exds.test_rows)
    assert set(range(16)) == all_rows
    assert 0 == len(set(exds.train_rows) & set(exds.test_rows))

    # Ensure that any row of exds.matrix is found either in
    # exds.matrix_train or exds.matrix_test, for X and then for Y.
    matrices = (
        ('X', exds.matrix.X, exds.matrix_train.X, exds.matrix_test.X),
        ('Y', exds.matrix.Y, exds.matrix_train.Y, exds.matrix_test.Y),
    )
    for label, full_matrix, train_matrix, test_matrix in matrices:
        # Iterate over every row: range() excludes its upper bound, so the
        # bound must be the total row count, not the last row index.
        for row in range(exds.total_row_count):
            original_row = full_matrix.getrow(row)
            if row in exds.train_rows:
                # The position of `row` within train_rows is its row index
                # in the train matrix.
                train_row = train_matrix.getrow(exds.train_rows.index(row))
                assert DatasetMatrix.sparse_equal(
                    original_row, train_row) is True
            elif row in exds.test_rows:
                test_row = test_matrix.getrow(exds.test_rows.index(row))
                assert DatasetMatrix.sparse_equal(
                    original_row, test_row) is True
            else:
                raise AssertionError(
                    "Row {} not found in neither train nor test {} "
                    "matrices".format(row, label))
def test_generating_the_datasetmatrix__wordcount():
    """Create a dataset matrix from the RCV1v2 source and compare it.

    The X and Y matrices and all three label lists of the generated
    matrix are compared against the ``default_*`` expectations.
    """
    source = RCV1v2DatasetSource(default_configuration())
    generated = source.create_dataset_matrix('rcv1v2_test')

    # X must equal the expected document-term matrix.
    want_X = default_document_term_matrix()
    got_X = generated.X
    assert DatasetMatrix.sparse_equal(want_X, got_X) is True

    # Y must equal the expected document-topic matrix.
    want_Y = default_document_topic_matrix()
    got_Y = generated.Y
    assert DatasetMatrix.sparse_equal(want_Y, got_Y) is True

    # Labels: rows are document IDs, X columns are words, Y columns are
    # topics.
    assert default_all_documentIDs__as_row_labels() == generated.row_labels
    assert default_words() == generated.column_labels_X
    assert default_topics() == generated.column_labels_Y
def test_keeping_rows():
    """Exercise ``keep_rows``: empty selection, two rows, then one row.

    Row selections apply to X, Y and the row labels together, while the
    column labels must stay at their defaults. The final state is passed
    to ``check_saving_and_loading``.
    """
    # Start from the default DatasetMatrix fixture.
    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    # An empty selection must be rejected.
    with pytest.raises(ValueError):
        matrix.keep_rows([])

    # Retain only rows 1 and 3.
    matrix.keep_rows([1, 3])

    kept_X = numpy.array([
        [5, 6, 7, 8],
        [13, 14, 15, 16]])
    kept_Y = numpy.array([
        [201, 202],
        [401, 402]])
    want_X = scipy.sparse.csr_matrix(kept_X)
    want_Y = scipy.sparse.csr_matrix(kept_Y)

    assert DatasetMatrix.sparse_equal(want_X, matrix.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, matrix.Y) is True
    assert ['row1', 'row3'] == matrix.row_labels
    assert default_column_labels_X() == matrix.column_labels_X
    assert default_column_labels_Y() == matrix.column_labels_Y

    # Of the two rows left, retain index 0 (the one labeled 'row1').
    matrix.keep_rows([0])

    kept_X = numpy.array([
        [5, 6, 7, 8]])
    kept_Y = numpy.array([
        [201, 202]])
    want_X = scipy.sparse.csr_matrix(kept_X)
    want_Y = scipy.sparse.csr_matrix(kept_Y)

    assert DatasetMatrix.sparse_equal(want_X, matrix.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, matrix.Y) is True
    assert ['row1'] == matrix.row_labels
    assert default_column_labels_X() == matrix.column_labels_X
    assert default_column_labels_Y() == matrix.column_labels_Y

    tmp_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__keeping_rows')
    check_saving_and_loading(matrix, tmp_folder)
def test_removing_columns_X():
    """Delete two X columns in sequence and verify the matrix state.

    After each deletion the Y matrix, the row labels and the Y column
    labels must still match the defaults; only X and its column labels
    are expected to shrink. The final state is passed to
    ``check_saving_and_loading``.
    """
    # Start from the default DatasetMatrix fixture.
    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    # Drop X column 2 (the third one).
    matrix.delete_column_X(2)

    remaining_X = numpy.array([
        [1, 2, 4],
        [5, 6, 8],
        [9, 10, 12],
        [13, 14, 16]])
    want_X = scipy.sparse.csr_matrix(remaining_X)
    want_Y = default_matrix_Y()

    assert DatasetMatrix.sparse_equal(want_X, matrix.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, matrix.Y) is True
    assert default_row_labels() == matrix.row_labels
    assert ['colx0', 'colx1', 'colx3'] == matrix.column_labels_X
    assert default_column_labels_Y() == matrix.column_labels_Y

    # Index 2 is now the last X column (originally 'colx3'); drop it too.
    matrix.delete_column_X(2)

    remaining_X = numpy.array([
        [1, 2],
        [5, 6],
        [9, 10],
        [13, 14]])
    want_X = scipy.sparse.csr_matrix(remaining_X)
    want_Y = default_matrix_Y()

    assert DatasetMatrix.sparse_equal(want_X, matrix.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, matrix.Y) is True
    assert default_row_labels() == matrix.row_labels
    assert ['colx0', 'colx1'] == matrix.column_labels_X
    assert default_column_labels_Y() == matrix.column_labels_Y

    tmp_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__removing_columns_X')
    check_saving_and_loading(matrix, tmp_folder)
def test_selecting_columns_X():
    """Exercise ``select_columns_X``: empty selection, two columns, one column.

    Each successful selection returns a datasetmatrix that replaces the
    one under test. Y, the row labels and the Y column labels must stay
    at their defaults. The final state is passed to
    ``check_saving_and_loading``.
    """
    # Start from the default DatasetMatrix fixture.
    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    # An empty selection must be rejected.
    with pytest.raises(ValueError):
        matrix.select_columns_X([])

    # Derive a new datasetmatrix whose X holds only columns 1 and 2.
    matrix = matrix.select_columns_X([1, 2], 'test_matrix_selected_colsX')

    selected_X = numpy.array([
        [2, 3],
        [6, 7],
        [10, 11],
        [14, 15]])
    want_X = scipy.sparse.csr_matrix(selected_X)
    want_Y = default_matrix_Y()

    assert DatasetMatrix.sparse_equal(want_X, matrix.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, matrix.Y) is True
    assert default_row_labels() == matrix.row_labels
    assert ['colx1', 'colx2'] == matrix.column_labels_X
    assert default_column_labels_Y() == matrix.column_labels_Y

    # From that result, pick X column 0 (labeled 'colx1').
    matrix = matrix.select_columns_X([0], 'test_matrix_selected_colsX_2')

    selected_X = numpy.array([
        [2],
        [6],
        [10],
        [14]])
    want_X = scipy.sparse.csr_matrix(selected_X)
    want_Y = default_matrix_Y()

    assert DatasetMatrix.sparse_equal(want_X, matrix.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, matrix.Y) is True
    assert default_row_labels() == matrix.row_labels
    assert ['colx1'] == matrix.column_labels_X
    assert default_column_labels_Y() == matrix.column_labels_Y

    tmp_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__selecting_colsX')
    check_saving_and_loading(matrix, tmp_folder)
def test_removing_rows():
    """Delete rows until the matrix is empty, verifying each stage.

    A row deletion applies to X, Y and the row labels together, while
    the column labels must stay at their defaults. The matrix is passed
    to ``check_saving_and_loading`` once with two rows left and again
    with none.
    """
    # Start from the default DatasetMatrix fixture.
    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    # Drop row 2 (the third one); X and Y lose it simultaneously.
    matrix.delete_row(2)

    remaining_X = numpy.array([
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [13, 14, 15, 16]])
    remaining_Y = numpy.array([
        [101, 102],
        [201, 202],
        [401, 402]])
    want_X = scipy.sparse.csr_matrix(remaining_X)
    want_Y = scipy.sparse.csr_matrix(remaining_Y)

    assert DatasetMatrix.sparse_equal(want_X, matrix.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, matrix.Y) is True
    assert ["row0", "row1", "row3"] == matrix.row_labels
    assert default_column_labels_X() == matrix.column_labels_X
    assert default_column_labels_Y() == matrix.column_labels_Y

    # Drop row 0 (the first one).
    matrix.delete_row(0)

    remaining_X = numpy.array([
        [5, 6, 7, 8],
        [13, 14, 15, 16]])
    remaining_Y = numpy.array([
        [201, 202],
        [401, 402]])
    want_X = scipy.sparse.csr_matrix(remaining_X)
    want_Y = scipy.sparse.csr_matrix(remaining_Y)

    assert DatasetMatrix.sparse_equal(want_X, matrix.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, matrix.Y) is True
    assert ["row1", "row3"] == matrix.row_labels
    assert default_column_labels_X() == matrix.column_labels_X
    assert default_column_labels_Y() == matrix.column_labels_Y

    tmp_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__removing_rows')
    check_saving_and_loading(matrix, tmp_folder)

    # Unfinalize before deleting further rows.
    matrix.unfinalize()

    # Drop the two rows that are left, leaving 0-row matrices.
    matrix.delete_row(0)
    matrix.delete_row(0)

    want_X = scipy.sparse.csr_matrix((0, 4))
    want_Y = scipy.sparse.csr_matrix((0, 2))

    assert DatasetMatrix.sparse_equal(want_X, matrix.X) is True
    assert DatasetMatrix.sparse_equal(want_Y, matrix.Y) is True
    assert [] == matrix.row_labels
    assert default_column_labels_X() == matrix.column_labels_X
    assert default_column_labels_Y() == matrix.column_labels_Y

    tmp_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__removing_rows')
    check_saving_and_loading(matrix, tmp_folder)