def test_colnames_as_callable(self):
    """Test if column names work when specified as a callable."""
    specs = deepcopy(self.basespecs['iris'])
    rename = lambda name: "_".join(part.lower() for part in name.split())
    specs['column_names'] = rename
    expected = {'column_names': rename}
    validator = SchemaValidator(specification=specs)
    validator.get_parser_args()
    self.assertKwargsEqual(validator.df_rules, expected)
def test_colnames_as_dict(self):
    """Test if the column names work when specified as a dictionary."""
    specs = deepcopy(self.basespecs['iris'])
    mapping = {'Sepal Length': 'slength',
               'Sepal Width': 'swidth',
               'Petal Width': 'pwidth',
               'Petal Length': 'plength',
               'Species': 'spcs'}
    specs['column_names'] = mapping
    expected = {'column_names': mapping}
    validator = SchemaValidator(specification=specs)
    validator.get_parser_args()
    self.assertKwargsEqual(validator.df_rules, expected)
def test_colnames_as_dict(self):
    """Test if the column names work when specified as a dictionary."""
    specs = deepcopy(self.basespecs['iris'])
    # Map the original CSV headers to the short names we expect back.
    renames = {'Sepal Length': 'slength',
               'Sepal Width': 'swidth',
               'Petal Width': 'pwidth',
               'Petal Length': 'plength',
               'Species': 'spcs'}
    specs['column_names'] = renames
    validator = SchemaValidator(specification=specs)
    validator.get_parser_args()
    self.assertKwargsEqual(validator.df_rules, {'column_names': renames})
def test_exclude_columns(self):
    """Check that excluded columns are absent from the loaded dataframe."""
    specs = deepcopy(self.basespecs['iris'])
    specs['exclude_columns'] = ['Sepal Length', 'Petal Width']
    parser_args = SchemaValidator(specification=specs).get_parser_args()
    frame = pd.read_csv(**parser_args)
    self.assertItemsEqual(frame.columns,
                          ['Petal Length', 'Sepal Width', 'Species'])
def test_converter(self):
    """Test if the SeriesValidator properly applies converters."""
    schema = deepcopy(self.basespecs['iris'])
    schema['converters'] = {'Sepal Width': lambda x: int(float(x))}
    validator = SchemaValidator(specification=schema)
    filtered = pd.read_csv(**validator.get_parser_args())['Sepal Width']
    # ``np.int`` was a deprecated alias for the builtin ``int`` and was
    # removed in NumPy 1.24; compare against the integer dtype family
    # instead so the test works on both old and new NumPy.
    self.assertTrue(np.issubdtype(filtered.dtype, np.integer))
def test_multifile_dataset_schema(self):
    """Test if a dataset schema with multiple files works properly."""
    duplicate_iris_path = self.basespecs['iris']['path'].replace("iris",
                                                                 "iris2")
    # Copy the file
    dframe = pd.read_csv(self.basespecs['iris']['path'])
    dframe.to_csv(duplicate_iris_path, index=False)
    # Create the new schema
    schema = {'nrows': [150, 150],
              'path': [duplicate_iris_path, self.basespecs['iris']['path']]}
    # ``dict.items`` works on both Python 2 and 3; ``iteritems`` is
    # Python-2-only and raises AttributeError on Python 3.
    for key, value in self.basespecs['iris'].items():
        if key not in schema:
            schema[key] = value
    try:
        validator = SchemaValidator(specification=schema)
        self.assertTrue(validator.is_multifile)
        self.assertItemsEqual(validator.filepath, schema['path'])
        self.assertItemsEqual(validator.nrows, schema['nrows'])
        validated_args = validator.get_parser_args()
        self.assertTrue(isinstance(validated_args, list))
        self.assertEqual(len(validated_args), 2)
    finally:
        # Always remove the duplicated fixture file, even on failure.
        os.unlink(duplicate_iris_path)
def test_random_rows_selection(self):
    """Test if the validator correctly produces the function argument
    required for selecting a range of rows from a dataset."""
    # Work on a copy so the shared class-level fixture is not mutated,
    # which would leak the 'nrows' override into other tests.
    specs = deepcopy(self.basespecs['iris'])
    specs['nrows'] = {'range': [25, 75]}
    validator = SchemaValidator(specification=specs)
    parser_args = validator.get_parser_args()
    self.assertEqual(parser_args['skiprows'], 25)
    self.assertEqual(parser_args['nrows'], 50)
def test_header(self):
    """Test if the header option works."""
    specs = deepcopy(self.basespecs['iris'])
    specs['header'] = 1
    validator = SchemaValidator(specification=specs)
    frame = pd.read_csv(**validator.get_parser_args())
    # With header=1 the first data row becomes the column labels.
    self.assertItemsEqual(frame.columns,
                          ['5.1', '3.5', '1.4', '0.2', 'setosa'])
def test_validator_specfile_name_iris(self):
    """Test if the validator works when providing specfile and name for
    the iris dataset.
    """
    validator = SchemaValidator(specfile=self.specfile, name="iris")
    parser_args = validator.get_parser_args()
    self.assertKwargsEqual(parser_args, self.ideal_iris_parser_args)
def test_index(self):
    """Test if specifying the index_col works."""
    schema = deepcopy(self.basespecs['iris'])
    schema['index_col'] = "Species"
    validator = SchemaValidator(specification=schema)
    args = validator.get_parser_args()
    self.assertItemsEqual(args['index_col'], "Species")
def test_parse_dates_list(self):
    """Test if arguments to `parse_dates` are put into a list."""
    schema = deepcopy(self.basespecs['person_activity'])
    # Collapse the spec to a single scalar; the validator must re-wrap it.
    schema['parse_dates'] = schema['parse_dates'][0]
    validator = SchemaValidator(specification=schema)
    args = validator.get_parser_args()
    self.assertTrue(isinstance(args['parse_dates'], list))
    frame = pd.read_csv(**args)
    self.assertEqual(frame['date'].dtype, np.dtype('<M8[ns]'))
def test_validator_with_specdict_iris(self):
    """Check if the validator works when only the specification is
    supplied as a dictionary for the iris dataset.
    """
    validator = SchemaValidator(specification=self.basespecs['iris'])
    self.assertFalse(validator.is_multifile)
    parser_args = validator.get_parser_args()
    self.assertKwargsEqual(parser_args, self.ideal_iris_parser_args)
def test_pandas_defaults_empty_specs(self):
    """Test if the validator falls back to pandas defaults for empty specs.
    """
    csv_path = op.join(op.abspath(op.dirname(__file__)), "testdata",
                       "iris.csv")
    schema = dict(path=csv_path)
    validator = SchemaValidator(specification=schema)
    # Loading with plain pandas defaults must match the validator output.
    expected = pd.read_csv(csv_path)
    actual = pd.read_csv(**validator.get_parser_args())
    self.assertDataFrameEqual(expected, actual)
def test_validator_with_specdist_activity(self):
    """Check if the validator works when only the specification is
    supplied as a dictionary for the person activity dataset.
    """
    specs = self.basespecs['person_activity']
    validator = SchemaValidator(specification=specs)
    self.assertFalse(validator.is_multifile)
    parser_args = validator.get_parser_args()
    self.assertKwargsEqual(parser_args, self.ideal_activity_parser_args)
def setUpClass(cls):
    """Load the base specifications and pre-parse the fixture datasets."""
    cls.maxDiff = None
    with open(TEST_DATA_DICT, 'r') as fileobj:
        basespecs = yaml.load(fileobj, Loader=yaml.CLoader)
    # Fix the paths in basespecs so they are absolute relative to this
    # test module. ``dict.items`` replaces the Python-2-only
    # ``iteritems`` and works on both major versions.
    for _, specs in basespecs.items():
        rlpth = specs['path']
        specs['path'] = op.join(op.abspath(op.dirname(__file__)), rlpth)
    cls._basespecs = basespecs
    iris_validator = SchemaValidator(specification=cls._basespecs['iris'])
    pa_validator = SchemaValidator(
        specification=cls._basespecs['person_activity'])
    iris_dframe = pd.read_csv(**iris_validator.get_parser_args())
    pa_dframe = pd.read_csv(**pa_validator.get_parser_args())
    cls.iris_dframe = iris_dframe
    cls.pa_dframe = pa_dframe
def test_validator_specfile_name_activity(self):
    """Test if the validator works when providing specfile and name for
    the activity dataset.
    """
    validator = SchemaValidator(specfile=self.specfile,
                                name="person_activity")
    parser_args = validator.get_parser_args()
    self.assertKwargsEqual(parser_args, self.ideal_activity_parser_args)
def setUpClass(cls):
    """Load the base specifications and pre-parse the fixture datasets."""
    cls.maxDiff = None
    with open(TEST_DATA_DICT, 'r') as fileobj:
        basespecs = yaml.load(fileobj, Loader=Loader)
    # Fix the paths in basespecs so they are absolute relative to this
    # test module. ``dict.items`` replaces the Python-2-only
    # ``iteritems`` and works on both major versions.
    for _, specs in basespecs.items():
        rlpth = specs['path']
        specs['path'] = op.join(op.abspath(op.dirname(__file__)), rlpth)
    cls._basespecs = basespecs
    iris_validator = SchemaValidator(specification=cls._basespecs['iris'])
    pa_validator = SchemaValidator(
        specification=cls._basespecs['person_activity'])
    iris_dframe = pd.read_csv(**iris_validator.get_parser_args())
    pa_dframe = pd.read_csv(**pa_validator.get_parser_args())
    cls.iris_dframe = iris_dframe
    cls.pa_dframe = pa_dframe
def test_multiindex(self):
    """Test if validator accepts list of index columns for multiindexing."""
    schema = deepcopy(self.basespecs['person_activity'])
    schema['index_col'] = ['tag', 'sequence_name']
    validator = SchemaValidator(specification=schema)
    args = validator.get_parser_args()
    self.assertItemsEqual(args['index_col'], ['tag', 'sequence_name'])
def test_colnames_as_list(self):
    """Test if the column names option works when provided as a list."""
    specs = deepcopy(self.basespecs['iris'])
    specs['header'] = 0
    new_names = ['a', 'b', 'c', 'd', 'e']
    specs['column_names'] = new_names
    validator = SchemaValidator(specification=specs)
    frame = pd.read_csv(**validator.get_parser_args())
    self.assertItemsEqual(frame.columns, new_names)
def test_index(self):
    """Test if specifying the index_col works."""
    schema = deepcopy(self.basespecs['iris'])
    schema['index_col'] = "Species"
    # The index column no longer needs column rules once it is an index.
    del schema['column_rules']['Species']
    validator = SchemaValidator(specification=schema)
    args = validator.get_parser_args()
    self.assertItemsEqual(args['index_col'], "Species")
def test_usecols(self):
    """Test if inferring the usecols argument works."""
    schema = deepcopy(self.basespecs['iris'])
    wanted = ['Petal Length', 'Sepal Width', 'Species']
    schema['use_columns'] = wanted
    validator = SchemaValidator(specification=schema)
    frame = pd.read_csv(**validator.get_parser_args())
    # Every requested column is present, nothing else survives.
    for column in wanted:
        self.assertIn(column, frame)
    self.assertNotIn("Petal Width", frame)
    self.assertNotIn("Sepal Length", frame)
    self.assertEqual(frame.shape[1], 3)
def test_na_values(self):
    """Test if adding NA values in the schema works properly."""
    bad_iris_path = op.join(op.abspath(op.dirname(__file__)), "testdata",
                            "bad_iris.csv")
    specs = deepcopy(self.basespecs['iris'])
    specs['path'] = bad_iris_path
    species_rules = specs['column_rules']['Species']
    species_rules['unique_values'].append('unknown')
    species_rules['na_values'] = ['unknown']
    validator = SchemaValidator(specification=specs)
    args = validator.get_parser_args()
    self.assertDictEqual(args.get("na_values"), {'Species': ['unknown']})
def test_validator_with_specfile_spec(self):
    """Check if the validator works when the specfile and specification
    are both provided.
    """
    # This is necessary because the validator might have to write
    # specifications to the dictionary.
    validator = SchemaValidator(specification=self.basespecs['iris'],
                                specfile=self.specfile)
    self.assertFalse(validator.is_multifile)
    parser_args = validator.get_parser_args()
    self.assertKwargsEqual(parser_args, self.ideal_iris_parser_args)
def setUpClass(cls):
    """Load base specs, pre-parse the fixtures, and define Species rules."""
    cls.maxDiff = None
    with open(TEST_DATA_DICT, 'r') as fileobj:
        basespecs = yaml.load(fileobj, Loader=Loader)
    # Fix the paths in basespecs so they are absolute relative to this
    # test module. ``dict.items`` replaces the Python-2-only
    # ``iteritems`` and works on both major versions.
    for _, specs in basespecs.items():
        rlpth = specs['path']
        specs['path'] = op.join(op.abspath(op.dirname(__file__)), rlpth)
    cls._basespecs = basespecs
    iris_validator = SchemaValidator(specification=cls._basespecs['iris'])
    pa_validator = SchemaValidator(
        specification=cls._basespecs['person_activity'])
    iris_dframe = pd.read_csv(**iris_validator.get_parser_args())
    pa_dframe = pd.read_csv(**pa_validator.get_parser_args())
    cls.iris_dframe = iris_dframe
    cls.pa_dframe = pa_dframe
    cls.species_rules = {'unique_values': ['setosa', 'virginica',
                                           'versicolor'],
                         'drop_duplicates': False, 'drop_na': False}
def test_colnames_as_callable(self):
    """Test that a callable column-name translator is applied on clean()."""
    translator = lambda x: "_".join([s.lower() for s in x.split()])
    # Operate on a copy so the shared fixture is not mutated in place,
    # which would leak the 'column_names' override into other tests.
    specs = deepcopy(self.basespecs['iris'])
    specs['column_names'] = translator
    schema_val = SchemaValidator(specification=specs)
    parser_args = schema_val.get_parser_args()
    df = pd.read_csv(**parser_args)
    rules = {}
    rules.update(schema_val.df_rules)
    df_val = DataFrameValidator(data=df, rules=rules)
    data = df_val.clean()
    ideal = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
             'species']
    self.assertItemsEqual(data.columns, ideal)
def test_colnames_as_list(self):
    """Test if the column names option works when provided as a list."""
    specs = deepcopy(self.basespecs['iris'])
    specs['header'] = 0
    new_names = ['a', 'b', 'c', 'd', 'e']
    specs['column_names'] = new_names
    validator = SchemaValidator(specification=specs)
    frame = pd.read_csv(**validator.get_parser_args())
    frame_rules = dict(validator.df_rules)
    cleaned = DataFrameValidator(data=frame, rules=frame_rules).clean()
    self.assertItemsEqual(cleaned.columns, new_names)
def setUpClass(cls):
    """Load base specs, pre-parse the fixtures, and define Species rules."""
    cls.maxDiff = None
    with open(TEST_DATA_DICT, 'r') as fileobj:
        basespecs = yaml.load(fileobj, Loader=Loader)
    # Fix the paths in basespecs so they are absolute relative to this
    # test module. ``dict.items`` replaces the Python-2-only
    # ``iteritems`` and works on both major versions.
    for _, specs in basespecs.items():
        rlpth = specs['path']
        specs['path'] = op.join(op.abspath(op.dirname(__file__)), rlpth)
    cls._basespecs = basespecs
    iris_validator = SchemaValidator(specification=cls._basespecs['iris'])
    pa_validator = SchemaValidator(
        specification=cls._basespecs['person_activity'])
    iris_dframe = pd.read_csv(**iris_validator.get_parser_args())
    pa_dframe = pd.read_csv(**pa_validator.get_parser_args())
    cls.iris_dframe = iris_dframe
    cls.pa_dframe = pa_dframe
    cls.species_rules = {
        'unique_values': ['setosa', 'virginica', 'versicolor'],
        'drop_duplicates': False,
        'drop_na': False
    }
def test_colnames_as_dict(self):
    """Test if column names gotten from SchemaValidator are implemented."""
    namemap = {'Sepal Length': 'slength', 'Sepal Width': 'swidth',
               'Petal Width': 'pwidth', 'Petal Length': 'plength',
               'Species': 'spcs'}
    # Operate on a copy so the shared fixture is not mutated in place,
    # which would leak the 'column_names' override into other tests.
    specs = deepcopy(self.basespecs['iris'])
    specs['column_names'] = namemap
    schema_val = SchemaValidator(specification=specs)
    parser_args = schema_val.get_parser_args()
    df = pd.read_csv(**parser_args)
    rules = {}
    rules.update(schema_val.df_rules)
    df_val = DataFrameValidator(data=df, rules=rules)
    data = df_val.clean()
    # list(...) keeps the assertion valid on Python 3, where values()
    # returns a view rather than a list.
    self.assertItemsEqual(data.columns, list(namemap.values()))
def test_colnames_as_callable(self):
    """Test that a callable column-name translator is applied on clean()."""
    translator = lambda x: "_".join([s.lower() for s in x.split()])
    # Operate on a copy so the shared fixture is not mutated in place,
    # which would leak the 'column_names' override into other tests.
    specs = deepcopy(self.basespecs['iris'])
    specs['column_names'] = translator
    schema_val = SchemaValidator(specification=specs)
    parser_args = schema_val.get_parser_args()
    df = pd.read_csv(**parser_args)
    rules = {}
    rules.update(schema_val.df_rules)
    df_val = DataFrameValidator(data=df, rules=rules)
    data = df_val.clean()
    ideal = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
             'species']
    self.assertItemsEqual(data.columns, ideal)
def test_colnames_as_dict(self):
    """Test if column names gotten from SchemaValidator are implemented."""
    namemap = {'Sepal Length': 'slength', 'Sepal Width': 'swidth',
               'Petal Width': 'pwidth', 'Petal Length': 'plength',
               'Species': 'spcs'}
    # Operate on a copy so the shared fixture is not mutated in place,
    # which would leak the 'column_names' override into other tests.
    specs = deepcopy(self.basespecs['iris'])
    specs['column_names'] = namemap
    schema_val = SchemaValidator(specification=specs)
    parser_args = schema_val.get_parser_args()
    df = pd.read_csv(**parser_args)
    rules = {}
    rules.update(schema_val.df_rules)
    df_val = DataFrameValidator(data=df, rules=rules)
    data = df_val.clean()
    # list(...) keeps the assertion valid on Python 3, where values()
    # returns a view rather than a list.
    self.assertItemsEqual(data.columns, list(namemap.values()))
def test_global_na_values(self):
    """Test if specifying a global NA value for a dataset works."""
    tempdir = tempfile.mkdtemp()
    df = pd.DataFrame(np.random.rand(10, 10))
    ix = np.random.randint(0, df.shape[0], size=(5,))
    ix = np.unique(ix)
    # Plant the NA marker at the diagonal positions in ``ix``. Iterating
    # the array directly avoids the Python-2-only ``xrange``.
    for i in ix:
        df.iloc[i, i] = "foobar"
    fpath = op.join(tempdir, "test_na.csv")
    df.to_csv(fpath, index=False)
    schema = {'path': fpath, 'na_values': "foobar"}
    try:
        validator = SchemaValidator(specification=schema)
        parser_args = validator.get_parser_args()
        self.assertEqual(parser_args['na_values'], "foobar")
        df = pd.read_csv(**parser_args)
        self.assertEqual(pd.isnull(df).sum().sum(), ix.shape[0])
    finally:
        shutil.rmtree(tempdir)
def test_global_na_values(self):
    """Test if specifying a global NA value for a dataset works."""
    tempdir = tempfile.mkdtemp()
    df = pd.DataFrame(np.random.rand(10, 10))
    ix = np.random.randint(0, df.shape[0], size=(5,))
    ix = np.unique(ix)
    # Plant the NA marker at the diagonal positions in ``ix``. Iterating
    # the array directly avoids the Python-2-only ``xrange``.
    for i in ix:
        df.iloc[i, i] = "foobar"
    fpath = op.join(tempdir, "test_na.csv")
    df.to_csv(fpath, index=False)
    schema = {'path': fpath, 'na_values': "foobar"}
    try:
        validator = SchemaValidator(specification=schema)
        parser_args = validator.get_parser_args()
        self.assertEqual(parser_args['na_values'], "foobar")
        df = pd.read_csv(**parser_args)
        self.assertEqual(pd.isnull(df).sum().sum(), ix.shape[0])
    finally:
        shutil.rmtree(tempdir)
def test_timestamp_cols_combine(self):
    """Test if the schema for combining datetime columns works."""
    tempdir = tempfile.mkdtemp()
    outpath = op.join(tempdir, "data.csv")
    stamps = pd.date_range('1/1/2011', periods=72, freq='H')
    # Split each timestamp into its date and time halves.
    pairs = [str(ts).split() for ts in stamps]
    date = [pair[0] for pair in pairs]
    time = [pair[1] for pair in pairs]
    frame = pd.DataFrame({'Date': date, 'Time': time,
                          'X': np.random.rand(len(date),)})
    frame.to_csv(outpath, index=False)
    specs = dict(path=outpath,
                 parse_dates={'Date_Time': ['Date', 'Time']})
    validator = SchemaValidator(specification=specs)
    try:
        loaded = pd.read_csv(**validator.get_parser_args())
        x = " ".join((date[0], time[0]))
        self.assertEqual(loaded['Date_Time'].dtype,
                         np.datetime64(x, 'ns').dtype)
    finally:
        shutil.rmtree(tempdir)
def test_timestamp_cols_combine(self):
    """Test if the schema for combining datetime columns works."""
    tempdir = tempfile.mkdtemp()
    outpath = op.join(tempdir, "data.csv")
    stamps = pd.date_range('1/1/2011', periods=72, freq='H')
    # Split each timestamp into its date and time halves.
    pairs = [str(ts).split() for ts in stamps]
    date = [pair[0] for pair in pairs]
    time = [pair[1] for pair in pairs]
    frame = pd.DataFrame({'Date': date, 'Time': time,
                          'X': np.random.rand(len(date),)})
    frame.to_csv(outpath, index=False)
    specs = dict(path=outpath,
                 parse_dates={'Date_Time': ['Date', 'Time']})
    validator = SchemaValidator(specification=specs)
    try:
        loaded = pd.read_csv(**validator.get_parser_args())
        x = " ".join((date[0], time[0]))
        self.assertEqual(loaded['Date_Time'].dtype,
                         np.datetime64(x, 'ns').dtype)
    finally:
        shutil.rmtree(tempdir)