def test_evaluate_tables_from_demo():
    tables = load_demo(metadata=False)

    new_meta = Metadata()
    new_meta.add_table('users', data=tables['users'], primary_key='user_id')
    new_meta.add_table('sessions', data=tables['sessions'], primary_key='session_id',
                       parent='users', foreign_key='user_id')

    transactions_fields = {
        'timestamp': {
            'type': 'datetime',
            'format': '%Y-%m-%d'
        }
    }
    new_meta.add_table('transactions', tables['transactions'],
                       fields_metadata=transactions_fields,
                       primary_key='transaction_id', parent='sessions')

    sdv = SDV()
    sdv.fit(new_meta, tables=tables)

    sampled = sdv.sample_all()

    table_scores = dict()
    for table in new_meta.get_tables():
        table_scores[table] = evaluate(
            sampled[table], real=tables[table], metadata=new_meta, table_name=table)

    evaluate(sampled, real=tables, metadata=new_meta)
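# A minimal single-table sketch of the same evaluate() call, assuming the legacy
# SDV 0.x API used throughout these snippets (sdv.evaluation.evaluate, sdv.tabular).
# The demo dataset and model choice are illustrative, not taken from the test above.
import pandas as pd
from sdv.demo import load_tabular_demo
from sdv.tabular import GaussianCopula
from sdv.evaluation import evaluate

real = load_tabular_demo()           # single-table demo data
model = GaussianCopula()
model.fit(real)
synthetic = model.sample(len(real))

print(evaluate(synthetic, real))                   # one aggregated score
print(evaluate(synthetic, real, aggregate=False))  # per-metric breakdown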
def test_raises_error(self):
    """If the table names in both datasets are not equal, an error is raised."""
    # Setup
    real = {'a': None, 'b': None}
    synth = {'a': None, 'x': None}
    metrics = []
    descriptors = []

    expected_error_message = "real and synthetic dataset must have the same tables"

    try:
        # Run
        evaluate(real, synth, metrics=metrics, descriptors=descriptors)
    except AssertionError as error:
        # Check
        assert error.args[0] == expected_error_message
def _score_dataset(dataset, datasets_path, output):
    start = datetime.now()
    try:
        if datasets_path is None:
            metadata, tables = load_demo(dataset, metadata=True)
        else:
            metadata = Metadata(os.path.join(datasets_path, dataset, 'metadata.json'))
            tables = metadata.load_tables()

        sdv = SDV()
        LOGGER.info('Modeling dataset %s', dataset)
        sdv.fit(metadata, tables)

        LOGGER.info('Sampling dataset %s', dataset)
        sampled = sdv.sample_all(10)

        LOGGER.info('Evaluating dataset %s', dataset)
        score = evaluate(sampled, metadata=metadata)

        LOGGER.info('%s: %s - ELAPSED: %s', dataset, score, datetime.now() - start)
        output.update({
            'dataset': dataset,
            'score': score,
        })

    except Exception as ex:
        error = '{}: {}'.format(type(ex).__name__, str(ex))
        LOGGER.error('%s: %s - ELAPSED: %s', dataset, error, datetime.now() - start)
        output.update({'dataset': dataset, 'error': error})
def test_evaluate_dict_instance(self, descriptors_table_mock):
    """evaluate with dict instances"""
    # Setup
    descriptors_table_mock.return_value = pd.DataFrame({'foo': [1, 0]})

    # Run
    real = {'a': [1, 0], 'b': [1, 0]}
    synth = {'a': [0, 1], 'b': [0, 1]}
    result = evaluate(real, synth)

    # Asserts
    assert descriptors_table_mock.call_count == 2
    assert descriptors_table_mock.call_args_list == [
        call([1, 0], [0, 1], DESCRIPTORS.values()),
        call([1, 0], [0, 1], DESCRIPTORS.values())
    ]

    pd.testing.assert_series_equal(
        result,
        pd.Series({
            'mse': 1.0,
            'rmse': 1.0,
            'r2_score': -float("Inf")
        })
    )
def eval_model(self, original_data, new_data):
    with warnings.catch_warnings():
        # ignore all caught warnings
        warnings.filterwarnings("ignore")
        # loaded = CTGAN.load(self.ctgan_model_path)
        # new_data = loaded.sample(nr_samples)
        evaluation = evaluate(new_data, original_data)  # renamed from `eval` to avoid shadowing the builtin
        print(f"evaluation of the model:\n {evaluation}")
        return evaluation
def _sample_uniform_regression(self):
    """
    Convert the continuous column self.train_data[self.task.target] into a discrete
    binned distribution spanning the min to the max value of the column, run the
    uniform classification sampling method on it to get class_to_sample_size
    (bin_to_sample_size in this case), then use uniform_bin_draw iteratively to
    sample one row at a time.
    """
    sampling_method = self.task.sampling_method_id
    bins = self.task.regression_bins
    original_data = self.train_data
    self.train_data = self.train_data.copy()
    self.train_data[self.task.target] = pd.cut(
        x=self.train_data[self.task.target], bins=bins)
    # synthetic_data, sampling_method, score_aggregate = self._sample_uniform_classification()
    class_to_sample_size = self._get_class_to_sample_size()
    self.train_data = original_data
    dtype = self.train_data[self.task.target].dtypes

    def int_uniform_bin_draw(interval):
        left = interval.left + 1 if interval.left.is_integer() else interval.left
        return random.randint(math.ceil(left), math.floor(interval.right))

    def float_uniform_bin_draw(interval):
        return random.uniform(interval.left, interval.right)

    def uniform_bin_draw(interval):
        if pd.api.types.is_integer_dtype(dtype):
            return int_uniform_bin_draw(interval)
        else:
            return float_uniform_bin_draw(interval)

    rows = []
    for class_name, sample_size in class_to_sample_size.items():
        for i in range(sample_size):
            target_value = uniform_bin_draw(class_name)
            conditions = {self.task.target: target_value}
            data = self.generator.sample(1, conditions=conditions)  # get 1 sample
            data[self.task.target] = data[self.task.target].astype(dtype)
            rows.append(data)

    synthetic_data = pd.concat(rows)
    assert (self.train_data.dtypes.to_list() == synthetic_data.dtypes.to_list())
    score_aggregate = evaluate(synthetic_data, self.train_data, aggregate=True)
    return synthetic_data, sampling_method, score_aggregate
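# Standalone sketch (not part of the class above) of the binning idea that
# _sample_uniform_regression relies on: pd.cut turns the continuous target into
# Interval-valued "classes", and a uniform draw inside one interval turns a bin
# back into a concrete target value. The toy values below are hypothetical.
import math
import random
import pandas as pd

target = pd.Series([1.0, 2.5, 4.0, 7.5, 10.0])
binned = pd.cut(target, bins=3)      # Interval categories spanning min..max
interval = binned.cat.categories[0]  # e.g. Interval(0.991, 4.0]

float_value = random.uniform(interval.left, interval.right)                       # float targets
int_value = random.randint(math.ceil(interval.left), math.floor(interval.right))  # integer targets
print(interval, float_value, int_value)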
def test_single_table(self, descriptors_mock):
    # Setup
    descriptors_mock.return_value = pd.DataFrame([
        {'a': 1, 'b': 2, 'c': 3},
        {'a': 2, 'b': 4, 'c': 6},
    ])
    real = pd.DataFrame([{'a': 'value'}])
    synth = 'synth data'
    metric_1 = MagicMock(return_value=0, __name__='metric_1')
    metric_2 = MagicMock(return_value=1, __name__='metric_2')
    metrics = [metric_1, metric_2]
    descriptors = ['descriptor_1', 'descriptors_2']

    expected_result = pd.Series({
        'metric_1': 0,
        'metric_2': 1
    })

    # Run
    result = evaluate(real, synth, metrics=metrics, descriptors=descriptors)

    # Check
    assert result.equals(expected_result)

    descriptors_mock.assert_called_once_with(real, synth, descriptors)

    call_args_list = metric_1.call_args_list
    assert len(call_args_list) == 1
    args, kwargs = call_args_list[0]
    assert kwargs == {}
    assert len(args) == 2
    assert args[0].equals(pd.Series({'a': 1, 'b': 2, 'c': 3}, name=0))
    assert args[1].equals(pd.Series({'a': 2, 'b': 4, 'c': 6}, name=1))

    call_args_list = metric_2.call_args_list
    assert len(call_args_list) == 1
    args, kwargs = call_args_list[0]
    assert kwargs == {}
    assert len(args) == 2
    assert args[0].equals(pd.Series({'a': 1, 'b': 2, 'c': 3}, name=0))
    assert args[1].equals(pd.Series({'a': 2, 'b': 4, 'c': 6}, name=1))
def _sample_original(self):
    assert (len(self.sample_method_info) == 2)
    train_data_size = self.train_data.shape[0]
    step_size = train_data_size // task.ORIGINAL_STEPS
    steps = int(self.sample_method_info[1]) + 1
    sample_size = step_size * steps
    synthetic_data = self.generator.sample(sample_size)
    score_aggregate = evaluate(synthetic_data, self.train_data, aggregate=True)
    # score_column = make_score_column(score_aggregate)
    sampling_method_info = "original " + str(sample_size) + "/" + str(train_data_size)
    return synthetic_data, sampling_method_info, score_aggregate
def _sample_uniform_classification(self):
    sampling_method = "uniform"
    target = self.task.target
    class_to_sample_size = self._get_class_to_sample_size()

    all_sampled_data = []
    for class_name, sample_size in class_to_sample_size.items():
        if sample_size > 0:
            conditions = {target: class_name}
            data = self.generator.sample(sample_size, conditions=conditions)
            all_sampled_data.append(data)

    if len(all_sampled_data):
        synthetic_data = pd.concat(all_sampled_data)
        score_aggregate = evaluate(synthetic_data, self.train_data, aggregate=True)
        return synthetic_data, sampling_method, score_aggregate
    else:
        return None, sampling_method, None
def eval(data_pars=None, compute_pars=None, out_pars=None, **kw):
    """
    Return metrics of the model when fitted.
    """
    global model, session
    from sdv.evaluation import evaluate

    # data_pars['train'] = True
    Xval, yval = get_dataset(data_pars, task_type="eval")
    if model.model_pars['model_class'] in IMBLEARN_MODELS:
        Xnew, ynew = transform((Xval, yval), data_pars, compute_pars, out_pars)
    else:
        Xnew = transform(Xval, data_pars, compute_pars, out_pars)
    # log(data_pars)

    mpars = compute_pars.get("metrics_pars", {'aggregate': True})
    if model.model_pars['model_class'] in SDV_MODELS:
        evals = evaluate(Xnew, Xval, **mpars)
        return evals
    else:
        return None
def pd_augmentation_sdv(df, col=None, pars={}):
    '''
    Using SDV Variational Autoencoders, the function augments more data into the dataset
    params:
            df   : (pandas dataframe) original dataframe
            col  : column name for data enhancement
            pars : (dict - optional) contains:
                n_samples       : (int - optional) number of samples to add, default is 10%
                primary_key     : (String - optional) the primary key of the dataframe
                aggregate       : (boolean - optional) if False, prints SDV metrics, else it averages them
                path_model_save : location where the trained model is saved
                path_model_load : saved model location to skip training
                path_newdata    : location where the newly generated data is saved
    returns:
            df_new : (pandas dataframe) df with more augmented data
            col    : (list of strings) same columns
    '''
    n_samples       = pars.get('n_samples', max(1, int(len(df) * 0.10)))  ## Add 10% or 1 sample by default
    primary_key     = pars.get('colid', None)  ### Custom can be created on the fly
    metrics_type    = pars.get('aggregate', False)
    path_model_save = pars.get('path_model_save', 'data/output/ztmp/')
    model_name      = pars.get('model_name', "TVAE")

    # model fitting
    if 'path_model_load' in pars:
        model = load(pars['path_model_load'])
    else:
        log('##### Training Started #####')

        model = {'TVAE': TVAE, 'CTGAN': CTGAN, 'PAR': PAR}[model_name]
        if model_name == 'PAR':
            model = model(entity_columns=pars['entity_columns'],
                          context_columns=pars['context_columns'],
                          sequence_index=pars['sequence_index'])
        else:
            model = model(primary_key=primary_key)

        model.fit(df)
        log('##### Training Finished #####')
        try:
            save(model, path_model_save)
            log('model saved at: ', path_model_save)
        except:
            log('saving model failed: ', path_model_save)

    log('##### Generating Samples #############')
    new_data = model.sample(n_samples)
    log_pd(new_data, n=7)

    log('######### Evaluation Results #########')
    if metrics_type == True:
        evals = evaluate(new_data, df, aggregate=True)
        log(evals)
    else:
        evals = evaluate(new_data, df, aggregate=False)
        log_pd(evals, n=7)

    # appending new data
    df_new = df.append(new_data)
    log(str(len(df_new) - len(df)) + ' new data added')

    if 'path_newdata' in pars:
        new_data.to_parquet(pars['path_newdata'] + '/features.parquet')
        log('###### df augmentation save on disk', pars['path_newdata'])

    log('###### augmentation complete ######')
    return df_new, col
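# Hedged usage sketch for pd_augmentation_sdv above, assuming its module (with log,
# log_pd, save, TVAE, CTGAN, PAR) is importable. The toy DataFrame and pars values
# are illustrative only.
import pandas as pd

df = pd.DataFrame({
    'age':    [23, 35, 41, 52, 29, 44],
    'income': [30000, 52000, 61000, 75000, 41000, 58000],
})

df_new, col = pd_augmentation_sdv(df, pars={
    'n_samples': 3,        # add 3 synthetic rows instead of the 10% default
    'model_name': 'TVAE',  # or 'CTGAN'; 'PAR' additionally needs entity/context/sequence keys
    'aggregate': True,     # log a single averaged score from evaluate()
})
print(len(df_new) - len(df), 'rows added')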
        embedding_dim=proposal['embedding_dim'],
        generator_dim=(proposal['gen'], proposal['gen']),
        discriminator_dim=(proposal['dim_gen'], proposal['dim_gen']),
        batch_size=proposal['batch_size'] * 10,
        epochs=proposal['epochs'])

    # Fit the CopulaGAN
    print("fit")
    model.fit(real)

    print("sample")
    # Sample 500 rows of synthetic data
    synth_data = model.sample(500, max_retries=300)

    # Evaluate the synthetic data against the real data
    score = evaluate(synthetic_data=synth_data, real_data=real)
    print(score)

    # If the new hyperparameters beat the best ones, store them along with the score
    if score > best_score:
        best_params = proposal
        best_score = score

    # Record the hyperparameters and score
    tuner.record(proposal, score)

    # except:
    #     print(f"error on tuner proposal {_}")

## TRAINING LOOP END ##

print('Best score obtained: ', best_score)
print('Best parameters: ', best_params)
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sdv.evaluation import evaluate
# warnings.filterwarnings('ignore')

# Generate the test dataset
blobs_params = dict(random_state=0, n_samples=50, n_features=2)
dataset = make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=0.5, **blobs_params)[0]
dataset = pd.DataFrame(data=dataset, columns=['col1', 'col2'])

# Create the generator with the desired parameters
# (synthetic_generator is defined elsewhere in this project)
generator_params = {}
generator = synthetic_generator('GaussianCopula', generator_params)

# Train the model
generator.fit(dataset)

# Generate synthetic data
n_muestras_nuevas = 10
synthetic_dataset = generator.sample(n_muestras_nuevas)

# Evaluate the result
print(evaluate(synthetic_dataset, dataset, aggregate=False).to_string(), end='\n')

# Plot the results
ax = dataset.plot.scatter('col1', 'col2', c='#00ff00', label='Original data')
synthetic_dataset.plot.scatter('col1', 'col2', c='#ff0000', ax=ax, label='Synthetic data')
ax.legend()
plt.show()
def benchmark(config='', dmin=5, dmax=6):
    from pmlb import fetch_data, classification_dataset_names
    from sdv.evaluation import evaluate

    for classification_dataset in classification_dataset_names[dmin:dmax]:
        X, y = fetch_data(classification_dataset, return_X_y=True)
        X_train_full, X_test, y_train_full, y_test = train_test_split(
            X, y, test_size=0.05, random_state=2021)
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train_full, y_train_full, random_state=2021)

        def post_process_fun(y):
            return int(y)

        def pre_process_fun(y):
            return int(y)

        # y = y.astype('uint8')
        num_classes = len(np.unique(y))
        print(np.unique(y))

        model_pars = {
            'model_pars': {
                'original_dim': X.shape[1],
                'class_num': num_classes,
                'intermediate_dim': 64,
                'intermediate_dim_2': 16,
                'latent_dim': 3,
                'Lambda1': 1,
                'batch_size': 256,
                'Lambda2': 200,
                'Alpha': 0.075
            },
            'post_process_fun': post_process_fun,  ### After prediction
            'pre_process_pars': {
                'y_norm_fun': pre_process_fun,     ### Before training
                ### Pipeline for data processing
                'pipe_list': [
                    #### coly target processing
                    {'uri': 'source/prepro.py::pd_coly', 'pars': {},
                     'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly'},
                    {'uri': 'source/prepro.py::pd_colnum_bin', 'pars': {},
                     'cols_family': 'colnum', 'cols_out': 'colnum_bin', 'type': ''},
                    {'uri': 'source/prepro.py::pd_colcat_bin', 'pars': {},
                     'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': ''},
                ],
            }
        }

        log(f'{classification_dataset} Metrics: ------------')
        column = [f'col_{i}' for i in range(X.shape[1])]
        real_df = pd.DataFrame(X_test, columns=column)

        ##### VAEMDN
        vae, vae_enc, vae_dec = VAEMDN(model_pars=model_pars['model_pars'])
        vae.fit([X_train_full, y_train_full], epochs=50)
        vae_data = vae.predict([X_test, y_test])
        vae_df = pd.DataFrame(vae_data, columns=column)
        evl_vae = evaluate(real_df, vae_df, metrics=['LogisticDetection', 'CSTest', 'KSTest'])
        log(f'Evaluation on VAE: {evl_vae}')

        log("##### AE")
        basic_ae, ae_enc, ae_dec = AUTOENCODER_BASIC(X.shape[1])
        basic_ae.fit(X_train_full, X_train_full, epochs=50)
        basic_data = basic_ae.predict(X_test)
        basic_df = pd.DataFrame(basic_data, columns=column)
        evl_ae = evaluate(real_df, basic_df, metrics=['LogisticDetection', 'CSTest', 'KSTest'])
        log(f'Evaluation on Basic_AE: {evl_ae}')
def pd_vae_augmentation(df, col=None, pars=None, n_samples=None, primary_key=None, aggregate=True):
    from sdv.demo import load_tabular_demo
    from sdv.tabular import TVAE
    from sdv.evaluation import evaluate

    # add 10% more samples
    if n_samples is None:
        if len(df) >= 10:
            log('samples amount not specified, adding 10%')
            n_samples = len(df) // 10
        else:
            log('dataframe too small, adding only 1')
            n_samples = 1

    # model fitting
    model = TVAE(primary_key=primary_key)
    model.fit(df)

    # generating new samples
    new_data = model.sample(n_samples)

    # log the evaluations
    evals = evaluate(new_data, df, aggregate=aggregate)
    log('######### Evaluation Results #########')
    if aggregate:
        log(evals)
    else:
        log_pd(evals, n=7)

    # appending new data
    df_new = df.append(new_data)
    return df_new, col