Example #1
def test_evaluate_tables_from_demo():
    tables = load_demo(metadata=False)

    new_meta = Metadata()
    new_meta.add_table('users', data=tables['users'], primary_key='user_id')
    new_meta.add_table('sessions',
                       data=tables['sessions'],
                       primary_key='session_id',
                       parent='users',
                       foreign_key='user_id')
    transactions_fields = {
        'timestamp': {
            'type': 'datetime',
            'format': '%Y-%m-%d'
        }
    }
    new_meta.add_table('transactions',
                       tables['transactions'],
                       fields_metadata=transactions_fields,
                       primary_key='transaction_id',
                       parent='sessions')

    sdv = SDV()
    sdv.fit(new_meta, tables=tables)

    sampled = sdv.sample_all()

    table_scores = dict()
    for table in new_meta.get_tables():
        table_scores[table] = evaluate(sampled[table],
                                       real=tables[table],
                                       metadata=new_meta,
                                       table_name=table)

    evaluate(sampled, real=tables, metadata=new_meta)
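For context, here is a minimal single-table sketch of the sdv.evaluation.evaluate call that all of these examples rely on. It assumes an SDV 0.x install where sdv.demo.load_tabular_demo and sdv.tabular.GaussianCopula are available; the demo dataset and model choice are only for illustration.

from sdv.demo import load_tabular_demo
from sdv.evaluation import evaluate
from sdv.tabular import GaussianCopula

real = load_tabular_demo()

model = GaussianCopula()
model.fit(real)
synthetic = model.sample(len(real))

# One aggregated score (higher is better)
print(evaluate(synthetic, real))

# Per-metric breakdown instead of a single number
print(evaluate(synthetic, real, aggregate=False))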
Example #2
    def test_raises_error(self):
        """If the table names in both datasets are not equal, an error is raised."""
        # Setup
        real = {'a': None, 'b': None}
        synth = {'a': None, 'x': None}
        metrics = []
        descriptors = []
        expected_error_message = "real and synthetic dataset must have the same tables"

        try:
            # Run
            evaluate(real, synth, metrics=metrics, descriptors=descriptors)
        except AssertionError as error:
            # Check
            assert error.args[0] == expected_error_message
        else:
            raise AssertionError('evaluate did not raise for mismatched table names')
Example #3
def _score_dataset(dataset, datasets_path, output):
    start = datetime.now()

    try:
        if datasets_path is None:
            metadata, tables = load_demo(dataset, metadata=True)
        else:
            metadata = Metadata(
                os.path.join(datasets_path, dataset, 'metadata.json'))
            tables = metadata.load_tables()

        sdv = SDV()
        LOGGER.info('Modeling dataset %s', dataset)
        sdv.fit(metadata, tables)

        LOGGER.info('Sampling dataset %s', dataset)
        sampled = sdv.sample_all(10)

        LOGGER.info('Evaluating dataset %s', dataset)
        score = evaluate(sampled, metadata=metadata)

        LOGGER.info('%s: %s - ELAPSED: %s', dataset, score,
                    datetime.now() - start)
        output.update({
            'dataset': dataset,
            'score': score,
        })

    except Exception as ex:
        error = '{}: {}'.format(type(ex).__name__, str(ex))
        LOGGER.error('%s: %s - ELAPSED: %s', dataset, error,
                     datetime.now() - start)
        output.update({'dataset': dataset, 'error': error})
Example #4
    def test_evaluate_dict_instance(self, descriptors_table_mock):
        """evaluate with dict instances"""

        # Setup
        descriptors_table_mock.return_value = pd.DataFrame({'foo': [1, 0]})

        # Run
        real = {'a': [1, 0], 'b': [1, 0]}

        synth = {'a': [0, 1], 'b': [0, 1]}

        result = evaluate(real, synth)

        # Asserts
        assert descriptors_table_mock.call_count == 2

        assert descriptors_table_mock.call_args_list == [
            call([1, 0], [0, 1], DESCRIPTORS.values()),
            call([1, 0], [0, 1], DESCRIPTORS.values())
        ]

        pd.testing.assert_series_equal(
            result,
            pd.Series({
                'mse': 1.0,
                'rmse': 1.0,
                'r2_score': -float("Inf")
            }))
Example #5
    def eval_model(self, original_data, new_data):
        with warnings.catch_warnings():
            # ignore all warnings raised during evaluation
            warnings.filterwarnings("ignore")
            # loaded = CTGAN.load(self.ctgan_model_path)
            # new_data = loaded.sample(nr_samples)
            scores = evaluate(new_data, original_data)  # avoid shadowing the built-in eval
            print(f"evaluation of the model:\n {scores}")
            return scores
Example #6
    def _sample_uniform_regression(self):
        """
        convert self.train_data[self.task.target] continuous column into a
        discrete binned distribution from max to min value of the continuous columns

        then run uniform classification sampling method on it to get the class_to_sample_size
        (bin_to_sample_size in this case)

        then use uniform_bin_draw iteratively and sample iteratively
        """
        sampling_method = self.task.sampling_method_id
        bins = self.task.regression_bins
        original_data = self.train_data
        self.train_data = self.train_data.copy()
        self.train_data[self.task.target] = pd.cut(
            x=self.train_data[self.task.target], bins=bins)
        #synthetic_data, sampling_method, score_aggregate = self._sample_uniform_classification()
        class_to_sample_size = self._get_class_to_sample_size()

        self.train_data = original_data
        dtype = self.train_data[self.task.target].dtypes

        def int_uniform_bin_draw(interval):
            left = interval.left + 1 if interval.left.is_integer() else interval.left
            return random.randint(math.ceil(left), math.floor(interval.right))

        def float_uniform_bin_draw(interval):
            return random.uniform(interval.left, interval.right)

        def uniform_bin_draw(interval):
            if pd.api.types.is_integer_dtype(dtype):
                return int_uniform_bin_draw(interval)
            else:
                return float_uniform_bin_draw(interval)

        rows = []
        for class_name, sample_size in class_to_sample_size.items():
            for i in range(sample_size):
                target_value = uniform_bin_draw(class_name)
                conditions = {self.task.target: target_value}
                data = self.generator.sample(
                    1, conditions=conditions)  # get 1 sample
                data[self.task.target] = data[self.task.target].astype(dtype)
                rows.append(data)

        synthetic_data = pd.concat(rows)
        assert (self.train_data.dtypes.to_list() ==
                synthetic_data.dtypes.to_list())
        score_aggregate = evaluate(synthetic_data,
                                   self.train_data,
                                   aggregate=True)
        return synthetic_data, sampling_method, score_aggregate
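As a standalone illustration of the bin-then-draw idea described in the docstring above (independent of the class; the target values are made up), a short sketch:

import random

import pandas as pd

# hypothetical continuous target column
target = pd.Series([0.7, 1.2, 2.2, 3.4, 4.8, 5.1], name='y')

# discretize the continuous range into equal-width bins
binned = pd.cut(target, bins=3)

def uniform_bin_draw(interval):
    # draw a uniform float inside the bin (the float case above)
    return random.uniform(interval.left, interval.right)

# draw one value per bin
for interval in binned.cat.categories:
    print(interval, '->', round(uniform_bin_draw(interval), 3))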
Example #7
    def test_single_table(self, descriptors_mock):
        # Setup
        descriptors_mock.return_value = pd.DataFrame([
            {
                'a': 1,
                'b': 2,
                'c': 3
            },
            {
                'a': 2,
                'b': 4,
                'c': 6
            },

        ])

        real = pd.DataFrame([{'a': 'value'}])
        synth = 'synth data'

        metric_1 = MagicMock(return_value=0, __name__='metric_1')
        metric_2 = MagicMock(return_value=1, __name__='metric_2')

        metrics = [metric_1, metric_2]
        descriptors = ['descriptor_1', 'descriptor_2']

        expected_result = pd.Series({
            'metric_1': 0,
            'metric_2': 1
        })

        # Run
        result = evaluate(real, synth, metrics=metrics, descriptors=descriptors)

        # Check
        assert result.equals(expected_result)
        descriptors_mock.assert_called_once_with(real, synth, descriptors)

        call_args_list = metric_1.call_args_list
        assert len(call_args_list) == 1
        args, kwargs = call_args_list[0]
        assert kwargs == {}
        assert len(args) == 2
        assert args[0].equals(pd.Series({'a': 1, 'b': 2, 'c': 3}, name=0))
        assert args[1].equals(pd.Series({'a': 2, 'b': 4, 'c': 6}, name=1))

        call_args_list = metric_2.call_args_list
        assert len(call_args_list) == 1
        args, kwargs = call_args_list[0]
        assert kwargs == {}
        assert len(args) == 2
        assert args[0].equals(pd.Series({'a': 1, 'b': 2, 'c': 3}, name=0))
        assert args[1].equals(pd.Series({'a': 2, 'b': 4, 'c': 6}, name=1))
Example #8
    def _sample_original(self):
        assert len(self.sample_method_info) == 2
        train_data_size = self.train_data.shape[0]
        step_size = train_data_size // task.ORIGINAL_STEPS
        steps = int(self.sample_method_info[1]) + 1
        sample_size = step_size * steps
        synthetic_data = self.generator.sample(sample_size)
        score_aggregate = evaluate(synthetic_data,
                                   self.train_data,
                                   aggregate=True)
        # score_column = make_score_column(score_aggregate)

        sampling_method_info = "original " + str(sample_size) + "/" + str(
            train_data_size)
        return synthetic_data, sampling_method_info, score_aggregate
Example #9
    def _sample_uniform_classification(self):
        sampling_method = "uniform"
        target = self.task.target
        class_to_sample_size = self._get_class_to_sample_size()
        all_sampled_data = []
        for class_name, sample_size in class_to_sample_size.items():
            if sample_size > 0:
                conditions = {target: class_name}
                data = self.generator.sample(sample_size,
                                             conditions=conditions)
                all_sampled_data.append(data)
        if all_sampled_data:
            synthetic_data = pd.concat(all_sampled_data)
            score_aggregate = evaluate(synthetic_data,
                                       self.train_data,
                                       aggregate=True)

            return synthetic_data, sampling_method, score_aggregate
        else:
            return None, sampling_method, None
Example #10
def eval(data_pars=None, compute_pars=None, out_pars=None, **kw):
    """
       Return metrics of the model when fitted.
    """
    global model, session
    from sdv.evaluation import evaluate

    # data_pars['train'] = True
    Xval, yval = get_dataset(data_pars, task_type="eval")

    if model.model_pars['model_class'] in IMBLEARN_MODELS:
        Xnew, ynew = transform((Xval, yval), data_pars, compute_pars, out_pars)
    else:
        Xnew = transform(Xval, data_pars, compute_pars, out_pars)

    # log(data_pars)
    mpars = compute_pars.get("metrics_pars", {'aggregate': True})

    if model.model_pars['model_class'] in SDV_MODELS:
        evals = evaluate(Xnew, Xval, **mpars)
        return evals
    else:
        return None
Example #11
def pd_augmentation_sdv(df, col=None, pars=None):
    '''
    Using SDV Variational Autoencoders, the function augments the dataset with synthetic rows.
    params:
            df          : (pandas dataframe) original dataframe
            col         : column name for data enhancement
            pars        : (dict - optional) contains:
                n_samples      : (int - optional) number of samples to add, default is 10%
                primary_key    : (String - optional) the primary key of the dataframe
                aggregate      : (boolean - optional) if False, prints the per-metric SDV scores, else averages them into one score
                path_model_save: saving location if save_model is set to True
                path_model_load: saved model location, to skip training
                path_data_new  : location where the new data is saved
    returns:
            df_new      : (pandas dataframe) df with the augmented data appended
            col         : (list of strings) same columns
    '''
    pars            = {} if pars is None else pars
    n_samples       = pars.get('n_samples', max(1, int(len(df) * 0.10)))   ## Add 10% (at least 1 sample) by default
    primary_key     = pars.get('colid', None)  ### Custom can be created on the fly
    metrics_type    = pars.get('aggregate', False)
    path_model_save = pars.get('path_model_save', 'data/output/ztmp/')
    model_name      = pars.get('model_name', "TVAE")

    # model fitting
    if 'path_model_load' in pars:
        model = load(pars['path_model_load'])
    else:
        log('##### Training Started #####')

        model_class = {'TVAE': TVAE, 'CTGAN': CTGAN, 'PAR': PAR}[model_name]
        if model_name == 'PAR':
            model = model_class(entity_columns=pars['entity_columns'],
                                context_columns=pars['context_columns'],
                                sequence_index=pars['sequence_index'])
        else:
            model = model_class(primary_key=primary_key)
        model.fit(df)
        log('##### Training Finished #####')
        try:
            save(model, path_model_save)
            log('model saved at: ', path_model_save)
        except Exception:
            log('saving model failed: ', path_model_save)

    log('##### Generating Samples #############')
    new_data = model.sample(n_samples)
    log_pd( new_data, n=7)


    log('######### Evaluation Results #########')
    if metrics_type:
        evals = evaluate(new_data, df, aggregate=True)
        log(evals)
    else:
        evals = evaluate(new_data, df, aggregate=False)
        log_pd(evals, n=7)

    # appending new data (df.append is deprecated in recent pandas)
    df_new = pd.concat([df, new_data])
    log(str(len(df_new) - len(df)) + ' new rows added')

    if 'path_newdata' in pars:
        new_data.to_parquet(pars['path_newdata'] + '/features.parquet')
        log('###### df augmentation saved on disk', pars['path_newdata'])

    log('###### augmentation complete ######')
    return df_new, col
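A hypothetical call of the helper above on a toy DataFrame; df_demo, pars_demo and the 20-row sample size are made up for illustration, and log, log_pd and save are assumed to come from the same project module:

import pandas as pd

df_demo = pd.DataFrame({
    'id':     range(100),
    'amount': [i * 1.5 for i in range(100)],
    'label':  ['a', 'b'] * 50,
})

pars_demo = {
    'colid'      : 'id',    # used as the primary key
    'n_samples'  : 20,      # add 20 rows instead of the default 10%
    'aggregate'  : True,    # log one averaged score instead of the per-metric table
    'model_name' : 'TVAE',
}

df_new, cols = pd_augmentation_sdv(df_demo, col=list(df_demo.columns), pars=pars_demo)
print(len(df_new))  # expected: 120 rows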
Example #12
                      embedding_dim=proposal['embedding_dim'],
                      generator_dim=(proposal['gen'], proposal['gen']),
                      discriminator_dim=(proposal['dim_gen'],
                                         proposal['dim_gen']),
                      batch_size=proposal['batch_size'] * 10,
                      epochs=proposal['epochs'])

    # Fit the CopulaGAN
    print("fit")
    model.fit(real)
    print("sample")
    # Create 500 rows of synthetic data
    synth_data = model.sample(500, max_retries=300)

    # Evaluate the synthetic data against the real data
    score = evaluate(synthetic_data=synth_data, real_data=real)
    print(score)
    # If the new hyperparameters beat the best ones, store them along with the score
    if score > best_score:
        best_params = proposal
        best_score = score

    # Record the hyperparameters and score
    tuner.record(proposal, score)
    # except:
    #   print(f"error on tuner proposal {_}")

## TRAINING LOOP END ##

print('Best score obtained: ', best_score)
print('Best parameters: ', best_params)
Example #13
import warnings

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import make_blobs

from sdv.evaluation import evaluate

# warnings.filterwarnings('ignore')

# Generate the test dataset
blobs_params = dict(random_state=0, n_samples=50, n_features=2)
dataset = make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=0.5, **blobs_params)[0]
dataset = pd.DataFrame(data=dataset, columns=['col1', 'col2'])

# Create the generator with the desired parameters
generator_params = {}
generator = synthetic_generator('GaussianCopula', generator_params)

# Train the model
generator.fit(dataset)

# Generate synthetic data
n_muestras_nuevas = 10
synthetic_dataset = generator.sample(n_muestras_nuevas)

# Evaluate the result
print(evaluate(synthetic_dataset, dataset, aggregate=False).to_string(), end='\n')

# Plot the results
ax = dataset.plot.scatter('col1', 'col2', c='#00ff00', label='Original data')
synthetic_dataset.plot.scatter('col1', 'col2', c='#ff0000', ax=ax, label='Synthetic data')
ax.legend()
plt.show()
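synthetic_generator is a project-specific helper that is not shown in this snippet. A minimal sketch of what it might look like, assuming it simply maps a model name to the matching sdv.tabular class, is:

from sdv.tabular import CTGAN, CopulaGAN, GaussianCopula, TVAE

def synthetic_generator(model_name, model_params):
    """Hypothetical factory: return an untrained sdv.tabular model for the given name."""
    models = {
        'GaussianCopula': GaussianCopula,
        'CTGAN': CTGAN,
        'CopulaGAN': CopulaGAN,
        'TVAE': TVAE,
    }
    return models[model_name](**model_params)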

Example #14
def benchmark(config='', dmin=5, dmax=6):
    from pmlb import fetch_data, classification_dataset_names
    from sdv.evaluation import evaluate

    for classification_dataset in classification_dataset_names[dmin:dmax]:
        X, y = fetch_data(classification_dataset, return_X_y=True)

        X_train_full, X_test, y_train_full, y_test = train_test_split(
            X, y, test_size=0.05, random_state=2021)
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train_full, y_train_full, random_state=2021)

        def post_process_fun(y):
            return int(y)

        def pre_process_fun(y):
            return int(y)

        #####
        # y = y.astype('uint8')
        num_classes = len(np.unique(y))
        print(np.unique(y))
        model_pars = {
            'model_pars': {
                'original_dim': X.shape[1],
                'class_num': num_classes,
                'intermediate_dim': 64,
                'intermediate_dim_2': 16,
                'latent_dim': 3,
                'Lambda1': 1,
                'batch_size': 256,
                'Lambda2': 200,
                'Alpha': 0.075
            },
            'post_process_fun': post_process_fun,  ### After prediction
            'pre_process_pars': {
                'y_norm_fun': pre_process_fun,  ### Before training

                ### Pipeline for data processing ##############################
                'pipe_list': [  #### coly target processing
                    {
                        'uri': 'source/prepro.py::pd_coly',
                        'pars': {},
                        'cols_family': 'coly',
                        'cols_out': 'coly',
                        'type': 'coly'
                    },
                    {
                        'uri': 'source/prepro.py::pd_colnum_bin',
                        'pars': {},
                        'cols_family': 'colnum',
                        'cols_out': 'colnum_bin',
                        'type': ''
                    },
                    {
                        'uri': 'source/prepro.py::pd_colcat_bin',
                        'pars': {},
                        'cols_family': 'colcat',
                        'cols_out': 'colcat_bin',
                        'type': ''
                    },
                ],
            }
        }

        log(f'{classification_dataset} Metrics: ------------')
        column = [f'col_{i}' for i in range(X.shape[1])]
        real_df = pd.DataFrame(X_test, columns=column)

        ##### VAEMDN
        vae, vae_enc, vae_dec = VAEMDN(model_pars=model_pars['model_pars'])
        vae.fit([X_train_full, y_train_full], epochs=50)
        vae_data = vae.predict([X_test, y_test])
        vae_df = pd.DataFrame(vae_data, columns=column)
        evl_vae = evaluate(real_df,
                           vae_df,
                           metrics=['LogisticDetection', 'CSTest', 'KSTest'])
        log(f'Evaluation on VAE: {evl_vae}')

        log("##### AE")
        basic_ae, ae_enc, ae_dec = AUTOENCODER_BASIC(X.shape[1])
        basic_ae.fit(X_train_full, X_train_full, epochs=50)
        basic_data = basic_ae.predict(X_test)
        basic_df = pd.DataFrame(basic_data, columns=column)
        evl_ae = evaluate(real_df,
                          basic_df,
                          metrics=['LogisticDetection', 'CSTest', 'KSTest'])
        log(f'Evaluation on Basic_AE: {evl_ae}')
Example #15
def pd_vae_augmentation(df, col=None, pars=None, n_samples=None, primary_key=None, aggregate=True):

    import pandas as pd
    from sdv.tabular import TVAE
    from sdv.evaluation import evaluate

    # add 10% more samples by default
    if n_samples is None:
        if len(df) >= 10:
            log('samples amount not specified, adding 10%')
            n_samples = len(df) // 10
        else:
            log('dataframe too small, adding only 1')
            n_samples = 1

    # model fitting
    model = TVAE(primary_key=primary_key)
    model.fit(df)

    # generating new samples
    new_data = model.sample(n_samples)

    # log the evaluations
    evals = evaluate(new_data, df, aggregate=aggregate)
    log('######### Evaluation Results #########')
    if aggregate:
        log(evals)
    else:
        log_pd(evals, n=7)

    # appending new data (df.append is deprecated in recent pandas)
    df_new = pd.concat([df, new_data])

    return df_new, col

Example #16
            model.fit(df)
            log('##### Training Finished #####')
            try:
                 save(model, path_model_save)
                 log('model saved at: ', path_model_save)
            except Exception:
                 log('saving model failed: ', path_model_save)

    log('##### Generating Samples #############')
    new_data = model.sample(n_samples)
    log_pd( new_data, n=7)
    
   
    log('######### Evaluation Results #########')
    if metrics_type:
        evals = evaluate(new_data, df, aggregate=True)
        log(evals)
    else:
        evals = evaluate(new_data, df, aggregate=False)
        log_pd(evals, n=7)

    # appending new data (df.append is deprecated in recent pandas)
    df_new = pd.concat([df, new_data])
    log(str(len(df_new) - len(df)) + ' new rows added')

    if 'path_newdata' in pars:
        new_data.to_parquet(pars['path_newdata'] + '/features.parquet')
        log('###### df augmentation saved on disk', pars['path_newdata'])

    log('###### augmentation complete ######')
    return df_new, col