def load_dataset(self, dataset_name): """Load and return the dataset. :param dataset_name: Name of the dataset :type dataset_name: str :return: A pandas DataFrame containing the dataset. :rtype: pandas.DataFrame """ validator = self.validators[dataset_name] column_rules = self.column_rules.get(dataset_name, {}) df_rules = self.df_rules.get(dataset_name, {}) args = validator.get_parser_args() if isinstance(args, dict): df = self._load(args) df_validator = DataFrameValidator(data=df, rules=df_rules, column_rules=column_rules) return df_validator.clean() else: dfs = [] for argset in args: self._update_parser(argset) _df = self.parser(**argset) df_validator = DataFrameValidator(data=_df, column_rules=column_rules) dfs.append(df_validator.clean()) return pd.concat(dfs, axis=0)
def test_column_rules(self):
    """Test if the DataFrame validator reads and enforces the column rules
    properly.
    """
    dframe_val = DataFrameValidator(
        data=self.iris_dframe.copy(),
        column_rules=self.basespecs['iris']['column_rules'])
    cleaned = dframe_val.clean()
    self.assertDataFrameEqual(cleaned, self.iris_dframe.drop_duplicates())
    dframe_val = DataFrameValidator(
        data=self.pa_dframe.copy(),
        column_rules=self.basespecs['person_activity']['column_rules'])
    cleaned = dframe_val.clean()
    self.assertDataFrameEqual(cleaned, self.pa_dframe.drop_duplicates())
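# For illustration, a column_rules mapping of the kind these tests read from
# the schema might look like the following. Only rule names actually
# exercised in this section ('unique_values', 'exclude') are shown; treat
# the exact spelling and the import path as assumptions based on the tests,
# not an authoritative listing of pysemantic's rule set.
from pysemantic.validator import DataFrameValidator

example_column_rules = {
    'Species': {
        'unique_values': ['setosa', 'versicolor', 'virginica'],
        'exclude': ['virginica'],
    },
}
# DataFrameValidator(data=df, column_rules=example_column_rules).clean()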
def load_dataset(self, dataset_name): """Load and return a dataset. :param dataset_name: Name of the dataset :type dataset_name: str :return: A pandas DataFrame containing the dataset. :rtype: pandas.DataFrame :Example: >>> demo_project = Project('pysemantic_demo') >>> iris = demo_project.load_dataset('iris') >>> type(iris) pandas.core.DataFrame """ validator = self.validators[dataset_name] column_rules = self.column_rules.get(dataset_name, {}) df_rules = self.df_rules.get(dataset_name, {}) parser_args = validator.get_parser_args() df_rules.update(validator.df_rules) logger.info("Attempting to load dataset {} with args:".format( dataset_name)) if validator.is_spreadsheet: parser_args.pop('usecols', None) logger.info(json.dumps(parser_args, cls=TypeEncoder)) if isinstance(parser_args, dict): with ParseErrorHandler(parser_args, self) as handler: df = handler.load() if df is None: raise ParserArgumentError("No valid parser arguments were " + "inferred from the schema.") if validator.is_spreadsheet and isinstance(validator.sheetname, list): df = pd.concat(df.itervalues(), axis=0) logger.info("Success!") df_validator = DataFrameValidator(data=df, rules=df_rules, column_rules=column_rules) logger.info("Commence cleaning dataset:") logger.info("DataFrame rules:") logger.info(json.dumps(df_rules, cls=TypeEncoder)) logger.info("Column rules:") logger.info(json.dumps(column_rules, cls=TypeEncoder)) return df_validator.clean() else: dfs = [] for argset in parser_args: with ParseErrorHandler(argset, self) as handler: _df = handler.load() df_validator = DataFrameValidator(data=_df, column_rules=column_rules) dfs.append(df_validator.clean()) df = pd.concat(dfs, axis=0) return df.set_index(np.arange(df.shape[0]))
def test_unique_values(self):
    """Test if the validator checks for the unique values."""
    validator = DataFrameValidator(
        data=self.iris_dframe,
        column_rules={'Species': self.species_rules})
    cleaned = validator.clean()
    self.assertItemsEqual(cleaned.Species.unique(),
                          ['setosa', 'versicolor', 'virginica'])
def test_drop_duplicates(self):
    """Test if the DataFrameValidator is dropping duplicates properly."""
    col_rules = self.basespecs['iris'].get('column_rules')
    data = self.iris_dframe.copy()
    _data = pd.concat((data, data))
    validator = DataFrameValidator(data=_data, column_rules=col_rules)
    cleaned = validator.clean()
    self.assertDataFrameEqual(cleaned, data.drop_duplicates())
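# The test above relies on duplicate dropping being the default behaviour of
# DataFrameValidator.clean(). Judging from test_column_exclude_rules below,
# it can apparently be switched off via the DataFrame-level rules. A minimal
# sketch, assuming 'drop_duplicates' is the relevant rule name and that
# pysemantic.validator is the import path:
import pandas as pd
from pysemantic.validator import DataFrameValidator

frame = pd.DataFrame({'Species': ['setosa', 'setosa']})
validator = DataFrameValidator(data=frame, rules={'drop_duplicates': False})
assert validator.clean().shape[0] == 2  # both duplicate rows survive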
def load_dataset(self, dataset_name): """Load and return a dataset. :param dataset_name: Name of the dataset :type dataset_name: str :return: A pandas DataFrame containing the dataset. :rtype: pandas.DataFrame :Example: >>> demo_project = Project('pysemantic_demo') >>> iris = demo_project.load_dataset('iris') >>> type(iris) pandas.core.DataFrame """ validator = self.validators[dataset_name] column_rules = self.column_rules.get(dataset_name, {}) df_rules = self.df_rules.get(dataset_name, {}) parser_args = validator.get_parser_args() df_rules.update(validator.df_rules) logger.info("Attempting to load dataset {} with args:".format( dataset_name)) logger.info(json.dumps(parser_args, cls=TypeEncoder)) if isinstance(parser_args, dict): df = self._load(parser_args) if validator.is_spreadsheet and isinstance(validator.sheetname, list): df = pd.concat(df.itervalues(), axis=0) logger.info("Success!") df_validator = DataFrameValidator(data=df, rules=df_rules, column_rules=column_rules) logger.info("Commence cleaning dataset:") logger.info("DataFrame rules:") logger.info(json.dumps(df_rules, cls=TypeEncoder)) logger.info("Column rules:") logger.info(json.dumps(column_rules, cls=TypeEncoder)) return df_validator.clean() else: dfs = [] for argset in parser_args: self._update_parser(argset) _df = self.parser(**argset) df_validator = DataFrameValidator(data=_df, column_rules=column_rules) dfs.append(df_validator.clean()) df = pd.concat(dfs, axis=0) return df.set_index(np.arange(df.shape[0]))
def test_column_exclude_rules(self):
    """Test if the validator drops values excluded from columns."""
    col_rules = deepcopy(self.basespecs['iris']['column_rules'])
    col_rules['Species']['exclude'] = ['virginica', 'versicolor']
    dframe_val = DataFrameValidator(data=self.iris_dframe.copy(),
                                    column_rules=col_rules,
                                    rules={'drop_duplicates': False})
    cleaned_species = dframe_val.clean()['Species']
    self.assertItemsEqual(cleaned_species.unique().tolist(), ['setosa'])
    self.assertEqual(cleaned_species.shape[0], 50)
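# A minimal sketch of how an 'exclude' rule could be enforced on a column:
# rows carrying an excluded value are dropped outright. The `_apply_exclude`
# helper below is hypothetical, for illustration only, and is not part of
# pysemantic's API.
import pandas as pd

def _apply_exclude(df, column, excluded):
    # Keep only rows whose value in `column` is not in the excluded set.
    return df[~df[column].isin(excluded)]

# On the iris data, _apply_exclude(iris, 'Species',
# ['virginica', 'versicolor']) would leave the 50 'setosa' rows, matching
# the assertions above.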
def test_colnames_as_list(self):
    """Test if the column names option works when provided as a list."""
    schema = deepcopy(self.basespecs['iris'])
    schema['header'] = 0
    ideal = ['a', 'b', 'c', 'd', 'e']
    schema['column_names'] = ideal
    validator = SchemaValidator(specification=schema)
    df = pd.read_csv(**validator.get_parser_args())
    rules = {}
    rules.update(validator.df_rules)
    df_val = DataFrameValidator(data=df, rules=rules)
    data = df_val.clean()
    self.assertItemsEqual(data.columns, ideal)
def test_colnames_as_callable(self):
    """Test if the column names option works when provided as a callable."""
    translator = lambda x: "_".join([s.lower() for s in x.split()])
    self.basespecs['iris']['column_names'] = translator
    schema_val = SchemaValidator(specification=self.basespecs['iris'])
    parser_args = schema_val.get_parser_args()
    df = pd.read_csv(**parser_args)
    rules = {}
    rules.update(schema_val.df_rules)
    df_val = DataFrameValidator(data=df, rules=rules)
    data = df_val.clean()
    ideal = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
             'species']
    self.assertItemsEqual(data.columns, ideal)
def test_bad_unique_values(self):
    """Test if the validator drops values not specified in the schema."""
    # Add some bogus values
    noise = np.random.choice(['lily', 'petunia'], size=(50,))
    species = np.hstack((self.iris_dframe.Species.values, noise))
    np.random.shuffle(species)
    species = pd.Series(species)
    validator = DataFrameValidator(
        data=pd.DataFrame({'Species': species}),
        column_rules={'Species': self.species_rules})
    cleaned = validator.clean()
    self.assertItemsEqual(cleaned.Species.unique(),
                          ['setosa', 'versicolor', 'virginica'])
def test_colnames_as_dict(self):
    """Test if column names obtained from the SchemaValidator are applied."""
    namemap = {'Sepal Length': 'slength', 'Sepal Width': 'swidth',
               'Petal Width': 'pwidth', 'Petal Length': 'plength',
               'Species': 'spcs'}
    self.basespecs['iris']['column_names'] = namemap
    schema_val = SchemaValidator(specification=self.basespecs['iris'])
    parser_args = schema_val.get_parser_args()
    df = pd.read_csv(**parser_args)
    rules = {}
    rules.update(schema_val.df_rules)
    df_val = DataFrameValidator(data=df, rules=rules)
    data = df_val.clean()
    self.assertItemsEqual(data.columns, namemap.values())
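# The three tests above exercise the three accepted shapes of the
# 'column_names' schema option: a positional list, a {current: new} mapping,
# and a callable applied to each name. A compact illustration (all other
# schema keys omitted):
schema = {}
schema['column_names'] = ['a', 'b', 'c', 'd', 'e']            # by position
schema['column_names'] = {'Sepal Length': 'slength'}          # by mapping
schema['column_names'] = lambda name: name.lower().replace(' ', '_')  # callable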
def load_dataset(self, dataset_name): """Load and return a dataset. :param dataset_name: Name of the dataset :type dataset_name: str :return: A pandas DataFrame containing the dataset. :rtype: pandas.DataFrame :Example: >>> demo_project = Project('pysemantic_demo') >>> iris = demo_project.load_dataset('iris') >>> type(iris) pandas.core.DataFrame """ if dataset_name not in self.validators: self._init_validate(dataset_name) validator = self.validators[dataset_name] column_rules = self.column_rules.get(dataset_name, {}) df_rules = self.df_rules.get(dataset_name, {}) parser_args = validator.get_parser_args() df_rules.update(validator.df_rules) logger.info("Attempting to load dataset {} with args:".format( dataset_name)) if validator.is_spreadsheet: parser_args.pop('usecols', None) logger.info(json.dumps(parser_args, cls=TypeEncoder)) if isinstance(parser_args, dict): if validator.is_mysql or validator.is_postgresql: if not ( parser_args.get('table_name') or parser_args.get('query')): raise ParserArgumentError( "No table_name or query was provided for the " "postgres configuration.") elif validator.sql_validator.chunksize is not None: df = self._sql_iterator(parser_args) else: df = self._sql_read(parser_args) else: with ParseErrorHandler(parser_args, self) as handler: df = handler.load() if df is None: raise ParserArgumentError("No valid parser arguments were " + "inferred from the schema.") if validator.is_spreadsheet and isinstance(validator.sheetname, list): df = pd.concat(df.itervalues(), axis=0) logger.info("Success!") df_validator = DataFrameValidator(data=df, rules=df_rules, column_rules=column_rules) logger.info("Commence cleaning dataset:") logger.info("DataFrame rules:") logger.info(json.dumps(df_rules, cls=TypeEncoder)) logger.info("Column rules:") logger.info(json.dumps(column_rules, cls=TypeEncoder)) return df_validator.clean() else: dfs = [] for argset in parser_args: with ParseErrorHandler(argset, self) as handler: _df = handler.load() df_validator = DataFrameValidator(data=_df, column_rules=column_rules) dfs.append(df_validator.clean()) df = pd.concat(dfs, axis=0) return df.set_index(np.arange(df.shape[0]))