def test_relpath(self):
    """Test if specifying datapaths relative to the schema works."""
    df = pd.DataFrame(np.random.randint(low=1, high=10, size=(10, 2)),
                      columns="a b".split())
    schema = {'data': {"dataframe_rules": {"drop_duplicates": False}}}
    with DummyProjectFactory(schema, df) as project:
        loaded = project.load_dataset("data")
        self.assertDataFrameEqual(loaded, df)
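# For context on the test above: DummyProjectFactory is assumed to write the
# dataframe out to a CSV file and serialize the schema dict into a YAML data
# dictionary next to it, recording the dataset path relative to that file.
# The on-disk layout sketched below is an illustrative assumption, not
# something shown in this section:
#
#   data:
#     path: data.csv              # relative to the schema file
#     dataframe_rules:
#       drop_duplicates: false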
def test_exclude_cols(self):
    """Test if importing data with excluded columns works."""
    filepath = op.join(op.abspath(op.dirname(__file__)), "testdata",
                       "iris.csv")
    df = pd.read_csv(filepath)
    specs = {"data": {'exclude_columns': ['Species']}}
    with DummyProjectFactory(specs, df) as project:
        loaded = project.load_dataset("data")
        self.assertNotIn('Species', loaded.columns)
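# The next test references ``_dummy_postproc``, a module-level helper not
# shown in this section. Below is a minimal sketch of what such a column
# postprocessor might look like, inferred purely from how the test uses it
# (it takes a pandas Series and returns one, with the "setosa" label gone);
# the name ``_example_postproc`` and its body are illustrative assumptions:
def _example_postproc(series):
    """Stand-in for _dummy_postproc: rename the 'setosa' label."""
    return series.replace("setosa", "renamed")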
def test_column_postprocessors(self):
    """Test if postprocessors work on column data properly."""
    filepath = op.join(op.abspath(op.dirname(__file__)), "testdata",
                       "iris.csv")
    df = pd.read_csv(filepath)
    col_rules = {'Species': {'postprocessors': [_dummy_postproc]}}
    schema = {"data": {'column_rules': col_rules}}
    with DummyProjectFactory(schema, df) as project:
        loaded = project.load_dataset("data")
        processed = loaded['Species']
        self.assertNotIn("setosa", processed.unique())
def test_integer_col_na_values(self):
    """Test if the Loader can load columns with integers and NAs.

    This is necessary because NaNs cannot be represented by integers.
    """
    x = map(str, range(20))
    x[13] = ""
    df = pd.DataFrame.from_dict(dict(a=x, b=x))
    specs = dict(delimiter=',', dtypes={'a': int, 'b': int})
    schema = dict(data=specs)
    with DummyProjectFactory(schema, df) as project:
        df = project.load_dataset("data")
        # The NA at position 13 forces both integer columns up to float.
        self.assertEqual(df['a'].dtype, float)
        self.assertEqual(df['b'].dtype, float)
def test_min_dropna_on_cols(self):
    """Test if specifying a minimum value for a column also drops the NaNs."""
    x1 = np.random.rand(10, 2) / 10
    x2 = np.random.rand(10, 2)
    x = np.r_[x1, x2]
    np.random.shuffle(x)
    df = pd.DataFrame(x, columns="col_a col_b".split())
    df.loc[3, "col_a"] = np.nan
    df.loc[7, "col_b"] = np.nan
    schema = {'data': {"column_rules": {"col_a": {"min": 0.1}}}}
    with DummyProjectFactory(schema, df) as project:
        loaded = project.load_dataset("data")
        self.assertFalse(np.any(pd.isnull(loaded)))
        self.assertGreater(loaded['col_a'].min(), 0.1)
def test_index_col(self):
    """Test if specifying the index_col works."""
    df = pd.read_csv(self.expected_specs['iris']['filepath_or_buffer'])
    specs = {
        "data": {
            'index_col': 'Species',
            'dataframe_rules': {'drop_duplicates': False}
        }
    }
    with DummyProjectFactory(specs, df) as project:
        df = project.load_dataset('data')
        for specie in ['setosa', 'versicolor', 'virginica']:
            # .loc is the label-based equivalent of the deprecated .ix
            self.assertEqual(df.loc[specie].shape[0], 50)
def test_indexcol_not_in_usecols(self):
    """Test if the specified index column is added to the usecols argument."""
    df = pd.read_csv(self.data_specs['iris']['path'])
    schema = {
        'data': {
            'index_col': 'Species',
            'use_columns': ['Sepal Length', 'Petal Width']
        }
    }
    with DummyProjectFactory(schema, df) as project:
        df = project.load_dataset("data")
        self.assertEqual(df.index.name, "Species")
        self.assertItemsEqual(df.columns, ['Sepal Length', 'Petal Width'])
def test_multiindex(self):
    """Test if providing a list of index columns in the schema returns a
    properly multiindexed dataframe."""
    orgdf = pd.read_table(
        self.expected_specs['person_activity']['filepath_or_buffer'])
    index_cols = ['sequence_name', 'tag']
    schema = {"data": {'index_col': index_cols, 'delimiter': '\t'}}
    with DummyProjectFactory(schema, orgdf, sep="\t") as project:
        df = project.load_dataset('data')
        self.assertTrue(isinstance(df.index, pd.MultiIndex))
        self.assertEqual(len(df.index.levels), 2)
        # Each index level should contain exactly the unique values of the
        # corresponding source column.
        for col in index_cols:
            x = orgdf[col].unique().tolist()
            y = df.index.get_level_values(col).unique().tolist()
            self.assertItemsEqual(x, y)
def test_load_dataset_wrong_dtypes_in_spec(self):
    """Test if the Loader can safely load columns that have a wrongly
    specified data type in the schema."""
    # Make a file with two columns, both specified as integers in the
    # dtypes, but one of which contains random strings.
    x = np.random.randint(0, 10, size=(100, 2))
    dframe = pd.DataFrame(x, columns=['a', 'b'])
    _ix = np.random.randint(0, 100, size=(5,))
    # Use .loc instead of chained assignment, which is unreliable and
    # triggers SettingWithCopyWarning.
    dframe.loc[_ix, 'b'] = "aa"
    specs = dict(delimiter=',', dtypes={'a': int, 'b': int})
    schema = dict(data=specs)
    with DummyProjectFactory(schema, dframe) as project:
        with warnings.catch_warnings(record=True) as catcher:
            dframe = project.load_dataset("data")
            assert len(catcher) == 1
            assert issubclass(catcher[-1].category, UserWarning)
def test_min_nan(self):
    """Test if the minimum rules work when data contains NaNs."""
    s = pd.Series(np.random.rand(10))
    s.loc[3] = np.nan
    schema = {
        'data': {
            'header': None,
            'column_rules': {'0': {'min': 0.2}}
        }
    }
    with DummyProjectFactory(schema, s) as project:
        df = project.load_dataset("data")
        self.assertFalse(np.any(pd.isnull(df[0])))
def test_global_na_reps(self):
    """Test if specifying a global NA value for a dataset works."""
    df = pd.DataFrame(np.random.rand(10, 10))
    ix = np.random.randint(0, df.shape[0], size=(5,))
    ix = np.unique(ix)
    # Plant the NA marker on the diagonal at the sampled positions.
    for i in ix:
        df.iloc[i, i] = "foobar"
    schema = {
        "data": {
            'na_values': "foobar",
            'dataframe_rules': {
                'drop_na': False,
                'drop_duplicates': False
            }
        }
    }
    with DummyProjectFactory(schema, df) as project:
        df = project.load_dataset("data")
        self.assertEqual(pd.isnull(df).sum().sum(), ix.shape[0])
def test_na_reps_list(self):
    """Test if NA values work when specified as a list."""
    df = pd.DataFrame(np.random.rand(10, 2))
    ix = np.random.randint(0, df.shape[0], size=(5,))
    ix = np.unique(ix)
    df.iloc[ix, 0] = "foo"
    df.iloc[ix, 1] = "bar"
    schema = {
        "data": {
            'na_values': ["foo", "bar"],
            'dataframe_rules': {
                'drop_na': False,
                'drop_duplicates': False
            }
        }
    }
    with DummyProjectFactory(schema, df) as project:
        df = project.load_dataset("data")
        self.assertEqual(pd.isnull(df).sum().sum(), ix.shape[0] * 2)
def test_nrows_shuffling(self):
    """Test if the shuffle parameter works with the nrows parameter."""
    X = np.c_[np.arange(10), np.arange(10)]
    ix = range(5) + "a b c d e".split()
    df = pd.DataFrame(X, index=ix)
    schema = {
        'data': {
            "index_col": "index",
            'nrows': {
                'count': 5,
                "shuffle": True
            }
        }
    }
    with DummyProjectFactory(schema, df, index_label="index") as project:
        df = project.load_dataset("data")
        # Only the first five (integer-labelled) rows should be read, and
        # their order should be shuffled.
        for row_label in "a b c d e".split():
            self.assertNotIn(row_label, df.index)
        self.assertFalse(np.all(df.index == range(5)))
def test_index_column_rules(self):
    """Test if column rules specified for index columns are enforced."""
    schema = {
        'data': {
            'index_col': 'Species',
            'dataframe_rules': {'drop_duplicates': False},
            'column_rules': {'Species': {'regex': '.*e.*'}}
        }
    }
    df = pd.read_csv(self.data_specs['iris']['path'])
    with DummyProjectFactory(schema, df) as project:
        df = project.load_dataset("data")
        self.assertEqual(df.index.name.lower(), 'species')
        # "virginica" is the only species name without an "e" in it.
        self.assertNotIn("virginica", df.index.unique())
def test_min_max_datetime(self):
    """Test if min/max rules work on datetime columns."""
    dates = pd.date_range("01/01/2015", "12/31/2015")
    x = np.random.rand(365)
    df = pd.DataFrame.from_dict(dict(day=dates, data=x))
    schema = {
        'data': {
            'parse_dates': 'day',
            'column_rules': {
                'day': {
                    'min': '04/01/2015',
                    'max': "11/30/2015"
                }
            }
        }
    }
    with DummyProjectFactory(schema, df) as project:
        newdf = project.load_dataset('data')
        self.assertEqual(newdf['day'].min(), pd.to_datetime("04/01/2015"))
        self.assertEqual(newdf['day'].max(), pd.to_datetime("11/30/2015"))
def test_index_column_exclude(self):
    """Test if values are excluded from the index column when specified."""
    df = pd.DataFrame.from_dict({
        'index': np.arange(10),
        'col_a': np.arange(10)
    })
    schema = {
        'data': {
            'index_col': 'index',
            'column_rules': {'index': {'exclude': [1, 2]}}
        }
    }
    with DummyProjectFactory(schema, df) as project:
        df = project.load_dataset("data")
        self.assertItemsEqual(df.shape, (8, 1))
        self.assertEqual(df.index.name, "index")
        self.assertNotIn(1, df.index)
        self.assertNotIn(2, df.index)