def independent_synthesis(json='adult', output_path='test.txt'):
    """Fit an IndependentSynthesizer on a dataset and sample 10 rows.

    The sample is printed, saved as CSV, and returned.

    Args:
        json: Name of the dataset passed to ``load_dataset``.
            (NOTE(review): the name shadows the stdlib ``json`` module;
            kept for backward compatibility with existing callers.)
        output_path: Destination file for the sampled rows
            (generalized from the previously hard-coded ``'test.txt'``).

    Returns:
        The sampled data (10 rows).
    """
    data, categorical_columns, ordinal_columns = load_dataset(json)
    synthesizer = IndependentSynthesizer()
    synthesizer.fit(data, categorical_columns, ordinal_columns)
    sampled = synthesizer.sample(10)
    print(sampled)
    # Persist the sample for manual inspection.
    np.savetxt(output_path, sampled, delimiter=',')
    return sampled
def independent_benchmark(json='adult'):
    """Benchmark IndependentSynthesizer: fit on train, sample 300 rows, score.

    Args:
        json: Name of the dataset passed to ``load_dataset`` in benchmark mode.
            (NOTE(review): the name shadows the stdlib ``json`` module;
            kept for backward compatibility with existing callers.)

    Returns:
        The scores produced by ``evaluate`` on the sampled data.
    """
    train, test, meta, categoricals, ordinals = load_dataset(json, benchmark=True)
    synthesizer = IndependentSynthesizer()
    synthesizer.fit(train, categoricals, ordinals)
    sampled = synthesizer.sample(300)
    print('Sampled Data for 300 records\n')
    scores = evaluate(train, test, sampled, meta)
    print('\nEvaluation Scores from evaluate function:\n')
    # BUG FIX: the header above was printed with nothing after it; actually
    # show the scores, matching the sibling benchCLBNSynthesizer helper.
    print(scores)
    return scores
def compute_benchmark(synthesizer, datasets=DEFAULT_DATASETS, iterations=3):
    """Compute the scores of a synthesizer over a list of datasets.

    The results are returned in a raw format as a ``pandas.DataFrame`` containing:
        - One row for each dataset+scoring method (for example, a classifier)
        - One column for each computed metric
        - The columns:
            - dataset
            - distance
            - name (of the scoring method)
            - iteration

    For example, evaluating a synthesizer on the ``adult`` and ``asia`` datasets
    with 2 iterations produces a table similar to this::

        dataset    name            iter distance accuracy  f1    syn_likelihood test_likelihood
          adult DecisionTree...       0      0.0     0.79  0.65             NaN             NaN
          adult AdaBoost...           0      0.0     0.85  0.67             NaN             NaN
          adult Logistic...           0      0.0     0.79  0.66             NaN             NaN
          adult MLP...                0      0.0     0.84  0.67             NaN             NaN
          adult DecisionTree...       1      0.0     0.80  0.66             NaN             NaN
          adult AdaBoost...           1      0.0     0.86  0.68             NaN             NaN
          adult Logistic...           1      0.0     0.79  0.65             NaN             NaN
          adult MLP...                1      0.0     0.84  0.64             NaN             NaN
           asia Bayesian ...          0      0.0      NaN   NaN           -2.23           -2.24
           asia Bayesian ...          1      0.0      NaN   NaN           -2.23           -2.24
    """
    results = list()
    for dataset_name in datasets:
        LOGGER.info('Evaluating dataset %s', dataset_name)
        train, test, meta, categoricals, ordinals = load_dataset(
            dataset_name, benchmark=True)

        for iteration in range(iterations):
            try:
                start = timer()
                synthesized = synthesizer(train, categoricals, ordinals)
                end = timer()
                scores = compute_scores(train, test, synthesized, meta)
                scores['dataset'] = dataset_name
                scores['iteration'] = iteration
                scores['exec_time (s)'] = end - start
                results.append(scores)
            except Exception:
                # Best-effort: one failing iteration must not abort the whole run.
                LOGGER.exception(
                    'Error computing scores for %s on dataset %s - iteration %s',
                    _get_synthesizer_name(synthesizer), dataset_name, iteration)

    if not results:
        # BUG FIX: pd.concat raises ValueError on an empty list; if every
        # iteration failed (all errors are caught above), return an empty frame.
        return pd.DataFrame()

    return pd.concat(results, sort=False)
def benchmark(synthesizer, datasets=DEFAULT_DATASETS, repeat=3):
    """Score ``synthesizer`` over every dataset, ``repeat`` times each.

    NOTE(review): a later ``benchmark`` definition in this file shadows
    this one at import time.

    Args:
        synthesizer: Callable taking ``(train, categoricals, ordinals)`` and
            returning synthesized data.
        datasets: Dataset names to evaluate on.
        repeat: Number of iterations per dataset.

    Returns:
        A single ``pandas.DataFrame`` concatenating all per-run scores,
        tagged with ``dataset`` and ``iter`` columns.
    """
    all_scores = []
    for dataset_name in datasets:
        LOGGER.info('Evaluating dataset %s', dataset_name)
        train, test, meta, categoricals, ordinals = load_dataset(
            dataset_name, benchmark=True)

        for run_index in range(repeat):
            sampled = synthesizer(train, categoricals, ordinals)
            run_scores = evaluate(train, test, sampled, meta)
            run_scores['dataset'] = dataset_name
            run_scores['iter'] = run_index
            all_scores.append(run_scores)

    return pd.concat(all_scores)
def benchCLBNSynthesizer():
    """Benchmark CLBNSynthesizer on the 'adult' dataset and save scores to CSV.

    Fits on the train split, samples 300 rows, scores them with ``evaluate``,
    prints the scores, and writes them to ``CLBNBench.csv`` tagged with the
    synthesizer name.
    """
    # Local imports keep this standalone benchmark helper self-contained.
    from sdgym.synthesizers import CLBNSynthesizer
    from sdgym.evaluate import evaluate
    from sdgym.data import load_dataset

    train, test, meta, categoricals, ordinals = load_dataset('adult', benchmark=True)
    synthesizer = CLBNSynthesizer()
    synthesizer.fit(train, categoricals, ordinals)
    sampled = synthesizer.sample(300)
    scores = evaluate(train, test, sampled, meta)
    print('\nEvaluation Scores from evaluate function:\n')
    print(scores)
    scores['Synth'] = 'CLBNSynthesizer'
    scores.to_csv('CLBNBench.csv')
def benchmark(synthesizer, datasets=DEFAULT_DATASETS, repeat=3, prefix='tmp'):
    """Benchmark ``synthesizer`` across datasets, checkpointing results to pickle.

    NOTE(review): this definition shadows the earlier ``benchmark`` in this file.

    Args:
        synthesizer: Callable taking ``(train, categoricals, ordinals)`` and
            returning synthesized data.
        datasets: Dataset names to evaluate on.
        repeat: Number of iterations per dataset.
        prefix: Filename prefix for the per-dataset pickle checkpoints.

    Returns:
        A single ``pandas.DataFrame`` concatenating all collected scores
        (empty if nothing could be evaluated).
    """
    print(datasets)
    results = list()
    for name in datasets:
        try:
            # BUG FIX: print() does not apply %-formatting, so the original
            # printed a literal '%s'; use an f-string instead.
            print(f'Evaluating dataset {name}')
            train, test, meta, categoricals, ordinals = load_dataset(
                name, benchmark=True)

            for iteration in range(repeat):
                synthesized = synthesizer(train, categoricals, ordinals)
                scores = evaluate(train, test, synthesized, meta)
                scores['dataset'] = name
                scores['iter'] = iteration
                results.append(scores)
                print(results)
                # Checkpoint after every iteration so partial results survive
                # a crash mid-benchmark.
                with open(f'{prefix}_{name}.pickle', 'wb') as f:
                    pickle.dump(results, f)
        except KeyError:
            # Best-effort: skip datasets whose metadata is missing expected keys.
            # BUG FIX: the old message ("Here is the KeyError") did not say
            # which dataset failed.
            print(f'KeyError while evaluating dataset {name}; skipping')
            continue

    if not results:
        # BUG FIX: pd.concat raises ValueError on an empty list.
        return pd.DataFrame()

    return pd.concat(results)
def _score_synthesizer_on_dataset(name, synthesizer, dataset_name, iteration, cache_dir):
    """Run one (synthesizer, dataset, iteration) benchmark cell and score it.

    Loads the benchmark splits, resolves a BaseSynthesizer subclass into a
    callable if needed, synthesizes data, and computes scores tagged with
    the dataset, iteration and synthesizer name. On any error the exception
    is logged and ``None`` is returned; if ``cache_dir`` is set, the scores
    are also written there as a CSV.
    """
    try:
        LOGGER.info('Evaluating %s on dataset %s; iteration %s; %s',
                    name, dataset_name, iteration, _used_memory())
        train, test, meta, categoricals, ordinals = load_dataset(
            dataset_name, benchmark=True)

        # A BaseSynthesizer class (rather than a callable) is instantiated
        # and its fit_sample bound method used as the synthesis function.
        is_synth_class = isinstance(synthesizer, type) and issubclass(
            synthesizer, BaseSynthesizer)
        if is_synth_class:
            synthesizer = synthesizer().fit_sample

        LOGGER.info('Running %s on dataset %s; iteration %s; %s',
                    name, dataset_name, iteration, _used_memory())
        synthesized = synthesizer(train, categoricals, ordinals)

        LOGGER.info('Scoring %s on dataset %s; iteration %s; %s',
                    name, dataset_name, iteration, _used_memory())
        scores = compute_scores(train, test, synthesized, meta)
        scores['dataset'] = dataset_name
        scores['iteration'] = iteration
        scores['synthesizer'] = name

        if cache_dir:
            cache_file = f'{name}_{dataset_name}_{iteration}.csv'
            scores.to_csv(os.path.join(cache_dir, cache_file))

        return scores

    except Exception:
        # Best-effort: log and fall through (implicitly returning None) so a
        # single failing cell does not abort the whole benchmark.
        LOGGER.exception('Error running %s on dataset %s; iteration %s',
                         name, dataset_name, iteration)

    finally:
        LOGGER.info('Finished %s on dataset %s; iteration %s; %s',
                    name, dataset_name, iteration, _used_memory())
def uniform_synthesis(json='adult'):
    """Fit a UniformSynthesizer on a dataset and return 10 sampled rows.

    Args:
        json: Name of the dataset passed to ``load_dataset``.

    Returns:
        The sampled data (10 rows).
    """
    data, categorical_columns, ordinal_columns = load_dataset(json)
    synthesizer = UniformSynthesizer()
    synthesizer.fit(data, categorical_columns, ordinal_columns)
    sampled = synthesizer.sample(10)
    # BUG FIX: the original ended with the bare expression `sampled`, which
    # discards the value and returns None; return it like the sibling helpers.
    return sampled
def identity_synthesis(json='adult'):
    """Fit an IdentitySynthesizer on a dataset and return 10 sampled rows.

    Args:
        json: Name of the dataset passed to ``load_dataset``.

    Returns:
        The sampled data (10 rows).
    """
    data, categorical_columns, ordinal_columns = load_dataset(json)
    synthesizer = IdentitySynthesizer()
    synthesizer.fit(data, categorical_columns, ordinal_columns)
    sampled = synthesizer.sample(10)
    # BUG FIX: the original ended with the bare expression `sampled`, which
    # discards the value and returns None; return it like the sibling helpers.
    return sampled
def clbn_synthesis(json='adult'):
    """Fit a CLBNSynthesizer on a dataset and return 10 sampled rows.

    Args:
        json: Name of the dataset passed to ``load_dataset``.

    Returns:
        The sampled data (10 rows).
    """
    dataset, cat_cols, ord_cols = load_dataset(json)
    model = CLBNSynthesizer()
    model.fit(dataset, cat_cols, ord_cols)
    return model.sample(10)
def independent_synthesis(json = 'adult'):
    # NOTE(review): this redefines independent_synthesis, shadowing the earlier
    # definition in this file (which additionally prints and saves the sample).
    # The later definition wins at import time.
    """Fit an IndependentSynthesizer on a dataset and return 10 sampled rows."""
    # `json` here shadows the stdlib module name; it is a dataset name string.
    data, categorical_columns, ordinal_columns = load_dataset(json)
    synthesizer = IndependentSynthesizer()
    synthesizer.fit(data, categorical_columns, ordinal_columns)
    sampled = synthesizer.sample(10)
    return sampled
def get_label_col(dataset_name):
    """Return the index of the column named 'label' in the dataset metadata.

    Args:
        dataset_name: Name of the dataset passed to ``load_dataset`` in
            benchmark mode.

    Returns:
        The zero-based column index, or -1 if no 'label' column exists.
    """
    _, _, meta, _, _ = load_dataset(dataset_name, benchmark=True)
    for position, column in enumerate(meta['columns']):
        if column['name'] == 'label':
            return position
    return -1