def test_init(input_obj):
    """A Duplicator can be built on top of any supported DataSet backing."""
    if isinstance(input_obj, list):
        dataset = DataSet(input_obj, pandas=False)
    else:
        dataset = DataSet(input_obj)
    duper = Duplicator(dataset, percentage=50)
    assert isinstance(duper, Duplicator)
    assert duper.num_rows > 0
def test_append(input_obj, rows, shape, kwargs):
    """Appending rows grows the records to the expected shape."""
    data = DataSet(input_obj, **kwargs)
    data.append(rows)
    if data.data_type in ('numpy', 'pandas'):
        # Array-like backings expose a shape tuple directly.
        assert data.records.shape == shape
    else:
        # List backing: check row count and row width separately.
        assert len(data.records) == shape[0]
        assert len(data.records[0]) == shape[1]
def test_sample(input_obj, percentage, columns, kwargs):
    """Sampling returns one item drawn from the original input/columns."""
    data = DataSet(input_obj, **kwargs)
    sample = data.sample(percentage, columns=columns)
    assert len(sample) == 1
    if columns and not kwargs:
        # Column sample from a dict-style input: value is one of its keys.
        assert sample[0] in input_obj[0]
    elif columns:
        # Positional column index.
        assert sample[0] in (0, 1, 2)
    elif isinstance(sample, pd.DataFrame):
        # Row sample: convert the single-row frame back to a dict and compare.
        assert list(sample.T.to_dict().values())[0] in input_obj
    else:
        assert sample[0] in input_obj
def test_set_value(input_obj, cols, val):
    """set_value writes ``val`` into every targeted column."""
    if isinstance(input_obj, list):
        dataset = DataSet(input_obj, pandas=False)
    else:
        dataset = DataSet(input_obj)
    noizer = NoiseMaker(dataset, columns=cols, percentage=50, noise=['random'])
    noizer.set_value(val)
    records = noizer.dataset.records
    data_type = noizer.dataset.data_type
    for col in noizer.columns:
        if data_type == 'pandas':
            # At least one row in the column must now hold the value.
            assert records[records.iloc[:, col] == val].shape[0] >= 1
        elif data_type == 'numpy':
            assert val in records[:, col]
        else:
            assert val in [r[col] for r in records]
def test_output_sql(input_obj, kwargs):
    """to_output writes the records into the configured SQLite database."""
    data = DataSet(input_obj, **kwargs)
    assert isinstance(data, DataSet)
    db_uri = kwargs.get('db_uri')
    assert data.db_uri == db_uri
    data.to_output()
    db = dataset_db.connect(db_uri)
    rows = list(db.query('select * from test;'))
    assert len(rows) == 2
    # removing id (first column) before comparing against the input record
    assert sorted(list(rows[0].values())[1:]) == sorted(
        list(input_obj[0].values()))
    # Clean up the on-disk database file created by the test.
    db_path = db_uri.replace('sqlite:///', '')
    if os.path.exists(db_path):
        os.remove(db_path)
def test_run_strategy(input_obj, cols, noise, limits):
    """NoiseMaker.run_strategy mutates records away from the original input.

    Arguments:
        input_obj: list, DataFrame or ndarray fixture.
        cols: columns to target.
        noise: noise strategies to apply.
        limits: limits forwarded to the NoiseMaker.
    """
    if isinstance(input_obj, list):
        dataset = DataSet(input_obj, pandas=False)
    else:
        dataset = DataSet(input_obj)
    noizer = NoiseMaker(dataset,
                        **{'columns': cols, 'percentage': 50, 'noise': noise},
                        limits=limits)
    noizer.run_strategy()
    assert isinstance(noizer, NoiseMaker)
    # Fix: collections.Iterable was removed in Python 3.10; the ABC lives in
    # collections.abc.
    assert isinstance(noizer.columns, collections.abc.Iterable)
    if dataset.data_type == 'pandas':
        assert not dataset.records.equals(dataset.input)
    elif dataset.data_type == 'numpy':
        assert not np.array_equal(dataset.records, dataset.input)
    else:
        assert dataset.records != dataset.input
def test_init(input_obj, cols):
    """A Fuzzer initializes with normalized percentage and iterable columns."""
    dataset = DataSet(input_obj)
    fuzzer = Fuzzer(dataset, **{'columns': cols, 'percentage': 50})
    assert isinstance(fuzzer, Fuzzer)
    # Fix: collections.Iterable was removed in Python 3.10; the ABC lives in
    # collections.abc.
    assert isinstance(fuzzer.columns, collections.abc.Iterable)
    assert fuzzer.num_rows > 0
    # Percentage is stored as a fraction.
    assert fuzzer.percentage == .5
def test_build_strategy(input_dict, output, percent, cols):
    """build_strategy turns a config dict into the right strategy object."""
    dataset = DataSet(np.random.rand(20, 5))
    strategy_obj = build_strategy(input_dict, dataset)
    assert isinstance(strategy_obj, output)
    assert strategy_obj.percentage == percent
    if cols:
        assert len(strategy_obj.columns) == cols
def test_run_strategy(input_obj, cols):
    """Fuzzer.run_strategy mutates records away from the original input."""
    if isinstance(input_obj, list):
        dataset = DataSet(input_obj, pandas=False)
    else:
        dataset = DataSet(input_obj)
    fuzzer = Fuzzer(dataset, **{'columns': cols, 'percentage': 50})
    fuzzer.run_strategy()
    assert isinstance(fuzzer, Fuzzer)
    # Fix: collections.Iterable was removed in Python 3.10; the ABC lives in
    # collections.abc.
    assert isinstance(fuzzer.columns, collections.abc.Iterable)
    # TODO: need to add test that types are not equal as sometimes they test as equals
    if dataset.data_type == 'pandas':
        assert not dataset.records.equals(dataset.input)
    elif dataset.data_type == 'numpy':
        assert not np.array_equal(dataset.records, dataset.input)
    else:
        assert dataset.records != dataset.input
def test_init_from_obj(input_obj):
    """DataSet built from an in-memory object copies records and keeps the
    original reference, and supports len/iteration/indexing."""
    data = DataSet(input_obj)
    if isinstance(input_obj, list):
        assert data.records.equals(pd.DataFrame(input_obj))
        assert data.original == input_obj
    elif isinstance(input_obj, pd.DataFrame):
        assert data.records.equals(input_obj)
        assert data.input.equals(input_obj)
        assert data.original.equals(input_obj)
    else:
        assert np.array_equal(data.original, input_obj)
        assert np.array_equal(data.records, input_obj)
        assert np.array_equal(data.input, input_obj)
        # records must be a copy; original keeps the caller's object identity.
        assert data.records is not input_obj
        assert data.original is input_obj
    assert len(data) == 2
    # Fix: collections.Iterable was removed in Python 3.10; the ABC lives in
    # collections.abc.
    assert isinstance(data, collections.abc.Iterable)
    count = 0
    for line in data:
        if data.data_type == 'pandas':
            assert data[count].equals(data.records.iloc[count, :])
        elif data.data_type == 'numpy':
            assert np.array_equal(data[count], data.records[count, :])
        else:
            assert data[count] == line
        count += 1
    assert count == 2
def test_helper_methods():
    """Each fuzz_* helper picks one of its known fuzzing functions."""
    fuzzer = Fuzzer(DataSet([[1, 2, 3]]))
    assert fuzzer.fuzz_random() in [sql, metachars, files, delimiter, emoji]
    str_choices = [add_format, change_encoding, to_bytes, insert_boms]
    assert fuzzer.fuzz_str() in str_choices
    assert fuzzer.fuzz_numeric() in [nanify, bigints, hexify]
def test_output(input_obj, output, output_type, kwargs):
    """to_output converts the records into the configured output type."""
    data = DataSet(input_obj, output=output, **kwargs)
    if output.startswith('file://'):
        assert data.output_filename == output.replace('file://', '')
    result = data.to_output()
    assert isinstance(result, output_type)
    if output_type == list and isinstance(input_obj, list):
        assert input_obj == result
    elif output_type == list:
        # ndarray input round-trips through .tolist().
        assert input_obj.tolist() == result
    elif output_type == pd.DataFrame:
        assert result.shape == (2, 3)
    elif output_type == np.ndarray and not kwargs:
        assert np.array_equal(result[0, :], [1, 2, 5])
    elif output_type == np.ndarray:
        # Record-style array keeps dict rows.
        assert result[0] == {'a': 1, 'b': 2, 'd': 5}
    elif output_type == str:
        # File output: the returned path must exist on disk.
        assert os.path.exists(result)
def test_init_no_pandas(input_obj, shape):
    """With pandas disabled, DataSet stores records as plain lists."""
    if isinstance(input_obj, str):
        input_obj = 'file://{}'.format(input_obj)
    data = DataSet(input_obj, pandas=False)
    assert isinstance(data.records, list)
    assert isinstance(data.input, list)
    rows, cols = shape
    assert len(data.records) == rows
    assert len(data.records[0]) == cols
    assert data.original == input_obj
def test_init_sql(input_obj, kwargs):
    """SQL-backed DataSet records the uri/query and picks the right backing."""
    data = DataSet(input_obj, **kwargs)
    assert isinstance(data, DataSet)
    assert data.db_uri == kwargs.get('db_uri')
    assert data.query == kwargs.get('query')
    # Default (no pandas flag) loads into a DataFrame; pandas=False gives lists.
    if kwargs.get('pandas') is None:
        assert isinstance(data.input, pd.DataFrame)
    else:
        assert isinstance(data.input, list)
def test_init(input_obj, cols):
    """A NoiseMaker initializes correctly and requires a 'noise' option."""
    dataset = DataSet(input_obj)
    noizer = NoiseMaker(dataset,
                        **{'columns': cols, 'percentage': 50,
                           'noise': ['random']})
    assert isinstance(noizer, NoiseMaker)
    # Fix: collections.Iterable was removed in Python 3.10; the ABC lives in
    # collections.abc.
    assert isinstance(noizer.columns, collections.abc.Iterable)
    assert noizer.num_rows > 0
    # Percentage is stored as a fraction.
    assert noizer.percentage == .5
    # Omitting the 'noise' option must raise.
    with pytest.raises(Exception):
        NoiseMaker(dataset, **{'columns': cols, 'percentage': 50})
def test_duplicate_with_noise(input_obj):
    """Duplicating with noise changes records and leaves no exact duplicates."""
    if isinstance(input_obj, list):
        dataset = DataSet(input_obj, pandas=False)
    else:
        dataset = DataSet(input_obj)
    duper = Duplicator(dataset, percentage=50, add_noise=True)
    duper.run_strategy()
    assert isinstance(duper, Duplicator)
    if dataset.data_type == 'pandas':
        assert not dataset.records.equals(dataset.input)
        # Noise distorted every duplicate, so no row repeats exactly.
        assert not dataset.records.duplicated().any()
    elif dataset.data_type == 'numpy':
        assert not np.array_equal(np.unique(dataset.records, axis=0),
                                  dataset.input)
        assert not np.array_equal(dataset.records, dataset.input)
    else:
        counter = collections.Counter(str(r) for r in dataset.records)
        # No stringified row may appear more than once.
        assert not any(
            [val if val[1] > 1 else None for val in counter.most_common()])
        assert dataset.records != dataset.input
def test_init_from_file(input_file, shape):
    """DataSet built from a file:// uri loads into a DataFrame."""
    filename = 'file://{}'.format(input_file)
    data = DataSet(filename)
    assert data.records is not input_file
    assert data.input is not input_file
    assert data.data_type == 'pandas'
    assert isinstance(data.records, pd.DataFrame)
    assert isinstance(data.input, pd.DataFrame)
    assert data.records.shape == shape
    assert data.input.shape == shape
    # The original keeps the uri; input_filename strips the scheme prefix.
    assert data.original == filename
    assert data.input_filename == filename.replace('file://', '')
def test_duplicate(input_obj):
    """Plain duplication adds exact duplicate rows and grows the dataset."""
    if isinstance(input_obj, list):
        dataset = DataSet(input_obj, pandas=False)
    else:
        dataset = DataSet(input_obj)
    duper = Duplicator(dataset, percentage=50)
    duper.run_strategy()
    assert isinstance(duper, Duplicator)
    if dataset.data_type == 'pandas':
        assert not dataset.records.equals(dataset.input)
        # Exact duplicates must exist after duplication.
        assert dataset.records.duplicated().any()
        assert len(dataset) > dataset.input.shape[0]
    elif dataset.data_type == 'numpy':
        # De-duplicated records collapse back to the original rows.
        unique = np.unique(dataset.records, axis=0)
        assert sorted(unique.ravel()) == sorted(dataset.input.ravel())
        assert not np.array_equal(dataset.records, dataset.input)
        assert len(dataset) > dataset.input.shape[0]
    else:
        counter = collections.Counter(str(r) for r in dataset.records)
        # Some stringified row must appear more than once.
        assert any(
            [val if val[1] > 1 else None for val in counter.most_common()])
        assert dataset.records != dataset.input
        assert len(dataset) > len(dataset.input)
def fuzz_from_parser(parser): """ Fuzz using parser input. This will generate a `dataset.Dataset` from `parser.input`, apply any defined strategies and call `dataset.to_output`. Arguments: parser (`parsers.StrategyCLIParser` or `parsers.StrategyYAMLParser`): strategy parser Returns: dataset.to_output() """ dataset = DataSet(parser.input, output=parser.output, db_uri=parser.db_uri, query=parser.query, table=parser.table) for strategy in parser.strategies: strategy_obj = build_strategy(strategy, dataset) try: strategy_obj.run_strategy() except Exception: logging.exception('Error running strategy: %s', strategy) return dataset.to_output()
def noise(self, sample): """ Adds noise to the duplicate rows Parameteres: sample (list or obj): `dataset.Dataset.sample` Returns sample (list or obj): distorted rows TODO: - implement more noise options than just random """ sample_dataset = DataSet(sample.copy()) columns = sample_dataset.sample(self.percentage, columns=True) if sample_dataset.data_type == 'pandas': sample_dataset.records = \ sample_dataset.records.reset_index(drop=True) for column in columns: col = sample_dataset.column_idx(column) col_type = sample_dataset.column_dtype(col) func = None if 'float' in str(col_type): func = generate_random_float elif 'int' in str(col_type): func = generate_random_int if func: kwargs = { 'low': self.dataset.column_agg(col, min), 'high': self.dataset.column_agg(col, max) } if kwargs.get('low') == kwargs.get('high'): kwargs['high'] += 1 sample = self.apply_func_to_column(lambda x: func(x, **kwargs), col) elif col_type in [object, str]: sample = self.apply_func_to_column(messy_spaces, col, dataset=sample_dataset) return sample_dataset.records
def test_column_dtype(input_obj, column, col_type, kwargs):
    """column_dtype reports the expected dtype, by name or index."""
    data = DataSet(input_obj, **kwargs)
    if isinstance(column, str):
        # Resolve a column name to its positional index first.
        column = data.column_idx(column)
    assert data.column_dtype(column) == col_type
def test_init_errors(input_obj, error):
    """Bad inputs raise the expected exception type at construction."""
    with pytest.raises(error):
        DataSet(input_obj)
def test_output_errors(input_obj, output):
    """Unsupported output targets raise NotImplementedError on to_output."""
    with pytest.raises(NotImplementedError):
        DataSet(input_obj, output=output).to_output()
def test_bad_io(input_obj, output, kwargs):
    """Broken input/output combinations fail somewhere in init or output."""
    with pytest.raises(Exception):
        if 'data' in input_obj:
            # Path-like inputs get the file scheme prefix.
            input_obj = 'file://{}'.format(input_obj)
        data = DataSet(input_obj, output=output, **kwargs)
        data.to_output()
def test_column_agg(input_obj, column, agg, result, kwargs):
    """column_agg applies the aggregate to the column and matches ``result``."""
    data = DataSet(input_obj, **kwargs)
    if isinstance(column, str):
        # Resolve a column name to its positional index first.
        column = data.column_idx(column)
    assert data.column_agg(column, agg) == result