示例#1
0
def test_synthesizer_sample():
    """Fit on one categorical column and check the type returned by ``sample``.

    The model is trained for a single epoch on 100 random draws from
    ``{"a", "b", "c"}``; only the return type of ``sample`` is asserted.
    """
    categories = np.random.choice(["a", "b", "c"], 100)
    frame = pd.DataFrame({"discrete": categories})

    synthesizer = CTGANSynthesizer()
    synthesizer.fit(frame, ["discrete"], epochs=1)

    # The column name and value are passed as extra positional arguments;
    # presumably they condition sampling on discrete == "a" (confirm against
    # the CTGANSynthesizer.sample signature).
    result = synthesizer.sample(1000, "discrete", "a")
    assert isinstance(result, pd.DataFrame)
示例#2
0
def test_synthesizer_sample():
    """Smoke-test ``sample`` when a column name and value are supplied.

    Trains for one epoch on a single discrete column and asserts that the
    sampled output is a ``pandas.DataFrame``.
    """
    column = 'discrete'
    training_frame = pd.DataFrame({
        column: np.random.choice(['a', 'b', 'c'], 100),
    })

    model = CTGANSynthesizer()
    model.fit(training_frame, [column], epochs=1)

    drawn = model.sample(1000, column, 'a')
    assert isinstance(drawn, pd.DataFrame)
示例#3
0
文件: __main__.py 项目: elisim/CTGAN
def main():
    """Command-line entry point: read data, fit a synthesizer, write samples.

    Steps, in order:

    1. Parse the arguments with ``_parse_args()``.
    2. Read the input table as TSV (``args.tsv``) or CSV.
    3. Load an existing model from ``args.load`` or create a new one, then
       fit it on the data for ``args.epochs`` epochs.
    4. Optionally save the fitted model to ``args.save``.
    5. Sample ``args.num_samples`` rows (defaults to the input length) and
       write them out in the same format as the input.

    Raises:
        ValueError: if ``args.sample_condition_column`` is given without
            ``args.sample_condition_column_value``.
    """
    args = _parse_args()

    # Validate the argument combination up front with a real exception:
    # an ``assert`` is removed under ``python -O``, and checking only after
    # training would waste the whole fit on an invalid invocation.
    if (args.sample_condition_column is not None
            and args.sample_condition_column_value is None):
        raise ValueError(
            'sample_condition_column_value must be provided when '
            'sample_condition_column is set'
        )

    if args.tsv:
        data, discrete_columns = read_tsv(args.data, args.metadata)
    else:
        data, discrete_columns = read_csv(
            args.data, args.metadata, args.header, args.discrete
        )

    if args.load:
        model = CTGANSynthesizer.load(args.load)
    else:
        model = CTGANSynthesizer()
    # Note: the model is fitted even when it was loaded from disk.
    model.fit(data, discrete_columns, args.epochs)

    if args.save is not None:
        model.save(args.save)

    # A missing or zero sample count falls back to the size of the input data.
    num_samples = args.num_samples or len(data)

    sampled = model.sample(
        num_samples, args.sample_condition_column, args.sample_condition_column_value
    )

    if args.tsv:
        write_tsv(sampled, args.metadata, args.output)
    else:
        sampled.to_csv(args.output, index=False)
示例#4
0
    def test_sample(self):
        """Train for one epoch with fixed seeds and compare a sample to a
        stored reference value.

        Both the NumPy and TensorFlow seeds are set to 0 so the sampled
        output can be compared against a hard-coded array, up to
        ``self._vars['decimal']`` decimal places.
        """
        np.random.seed(0)
        tf.random.set_seed(0)

        batch_size = self._vars['batch_size']
        data, discrete = generate_data(batch_size)

        model = CTGANSynthesizer(batch_size=batch_size, pac=self._vars['pac'])
        self.assertIsNotNone(model)
        model.train(data, discrete, epochs=1)

        sampled_values = model.sample(self._n_samples).values
        reference = np.array([[0.4139329, 3.0]])
        np.testing.assert_almost_equal(
            sampled_values, reference, decimal=self._vars['decimal'])
示例#5
0
def test_ctgan_numpy():
    """Fit on a raw NumPy array and check that sampling returns an array.

    The discrete column is identified by its positional index (1) because
    the input carries no column names.
    """
    continuous_values = np.random.random(100)
    discrete_values = np.random.choice(["a", "b", "c"], 100)
    frame = pd.DataFrame({
        "continuous": continuous_values,
        "discrete": discrete_values,
    })

    synthesizer = CTGANSynthesizer()
    synthesizer.fit(frame.values, [1], epochs=1)

    result = synthesizer.sample(100)

    assert result.shape == (100, 2)
    assert isinstance(result, np.ndarray)
    # Every category seen in training must appear in column 1 of the output.
    observed = set(np.unique(result[:, 1]))
    assert observed == {"a", "b", "c"}
示例#6
0
def test_ctgan_dataframe():
    """Fit on a DataFrame and check the shape, type, columns and categories
    of the sampled output.
    """
    continuous_values = np.random.random(100)
    discrete_values = np.random.choice(["a", "b", "c"], 100)
    frame = pd.DataFrame({
        "continuous": continuous_values,
        "discrete": discrete_values,
    })

    synthesizer = CTGANSynthesizer()
    synthesizer.fit(frame, ["discrete"], epochs=1)

    result = synthesizer.sample(100)

    assert result.shape == (100, 2)
    assert isinstance(result, pd.DataFrame)
    assert set(result.columns) == {"continuous", "discrete"}
    # All three training categories are expected among the sampled values.
    observed = set(result["discrete"].unique())
    assert observed == {"a", "b", "c"}
示例#7
0
def test_ctgan_dataframe():
    """Train on a two-column DataFrame for one epoch and validate samples.

    Asserts that ``sample(100)`` returns a 100x2 DataFrame with the original
    column names and that the discrete column contains exactly the three
    training categories.
    """
    training_frame = pd.DataFrame({
        'continuous': np.random.random(100),
        'discrete': np.random.choice(['a', 'b', 'c'], 100)
    })
    categorical = ['discrete']

    model = CTGANSynthesizer()
    model.fit(training_frame, categorical, epochs=1)
    drawn = model.sample(100)

    assert drawn.shape == (100, 2)
    assert isinstance(drawn, pd.DataFrame)

    column_names = set(drawn.columns)
    assert column_names == {'continuous', 'discrete'}

    categories = set(drawn['discrete'].unique())
    assert categories == {'a', 'b', 'c'}
示例#8
0
    def test_model_to_disk(self):
        """Dump a trained model to disk, reload it, and compare the two.

        The comparison covers:

        * top-level attributes whose exact type is ``int``, ``float`` or
          ``tuple``;
        * the full ``__dict__`` of the conditional generator;
        * every attribute of the transformer, using a comparison suited to
          its type (pandas Series, list of tensors, or anything else);
        * the generator weights.
        """
        np.random.seed(0)
        tf.random.set_seed(0)
        data, discrete = generate_data(self._vars['batch_size'])

        model = CTGANSynthesizer(batch_size=self._vars['batch_size'],
                                 pac=self._vars['pac'])
        self.assertIsNotNone(model)
        model.train(data, discrete, epochs=1)
        model_path = os.path.join(self._current_dir, 'model_test.joblib')
        model.dump(model_path, overwrite=True)
        loaded_model = CTGANSynthesizer(file_path=model_path)
        self.assertIsNotNone(loaded_model)

        for attr, value in loaded_model.__dict__.items():
            # assertIn reports the missing attribute name on failure.
            self.assertIn(attr, model.__dict__)
            # Exact-type check: only plain int/float/tuple values are
            # compared here; subclasses and other types are skipped.
            if type(value) in [int, float, tuple]:
                self.assertEqual(value, model.__dict__[attr])

        np.testing.assert_equal(loaded_model._cond_generator.__dict__,
                                model._cond_generator.__dict__)

        for attr, value in loaded_model._transformer.__dict__.items():
            expected = model._transformer.__dict__[attr]
            if isinstance(value, pd.Series):
                pd.testing.assert_series_equal(value, expected)
            elif (isinstance(value, list) and value
                    and isinstance(value[0], tf.Tensor)):
                # The emptiness check keeps ``value[0]`` from raising
                # IndexError on an empty list; empty lists fall through to
                # the generic comparison below.
                tf.assert_equal(value, expected)
            else:
                np.testing.assert_equal(value, expected)

        np.testing.assert_equal(loaded_model._generator.get_weights(),
                                model._generator.get_weights())
def main():
    """Command-line entry point: read a table, fit a new model, write samples.

    The input is read as TSV when ``args.tsv`` is set and as CSV otherwise;
    the sampled output is written in the same format. When
    ``args.num_samples`` is falsy, as many rows as the input has are sampled.
    """
    args = _parse_args()
    use_tsv = args.tsv

    if not use_tsv:
        data, discrete_columns = read_csv(
            args.data, args.metadata, args.header, args.discrete)
    else:
        data, discrete_columns = read_tsv(args.data, args.metadata)

    synthesizer = CTGANSynthesizer()
    synthesizer.fit(data, discrete_columns, args.epochs)

    sampled = synthesizer.sample(args.num_samples or len(data))

    if not use_tsv:
        sampled.to_csv(args.output, index=False)
    else:
        write_tsv(sampled, args.metadata, args.output)
示例#10
0
    def _assert_train_equal(self, data, discrete):
        """Train for one epoch and compare model internals to stored values.

        Collects the transformer's output/conditional tensors and the
        generator/critic weights, then checks each array against
        ``self._expected_values``. The expected set is selected by whether
        ``discrete`` is empty (index 0) or not (index 1).
        """
        model = CTGANSynthesizer(
            batch_size=self._vars['batch_size'], pac=self._vars['pac'])
        self.assertIsNotNone(model)
        model.train(data, discrete, epochs=1)

        transformer = model._transformer
        outputs = {}
        outputs['output_tensor'] = [
            tensor.numpy() for tensor in transformer.output_tensor]
        outputs['cond_tensor'] = [
            tensor.numpy() for tensor in transformer.cond_tensor]
        outputs['gen_weights'] = model._generator.get_weights()
        outputs['crt_weights'] = model._critic.get_weights()

        idx = 1 if len(discrete) > 0 else 0
        for key, actual_arrays in outputs.items():
            for position, actual in enumerate(actual_arrays):
                # The expected value is looked up per item, so a group with
                # no arrays never touches the expected table.
                np.testing.assert_almost_equal(
                    actual, self._expected_values[idx][key][position],
                    decimal=self._vars['decimal'])
示例#11
0
def test_categorical_nan():
    """Check that NaN in a discrete column is kept as its own category.

    The training column cycles through ``NaN``, ``"b"`` and ``"c"``; the
    sampled column must contain exactly three distinct values, one of which
    is null.
    """
    # This must be a list (not a np.array) or NaN will be cast to a string.
    discrete_values = [np.nan, "b", "c"] * 10
    frame = pd.DataFrame({
        "continuous": np.random.random(30),
        "discrete": discrete_values,
    })

    synthesizer = CTGANSynthesizer()
    synthesizer.fit(frame, ["discrete"], epochs=1)

    result = synthesizer.sample(100)

    assert result.shape == (100, 2)
    assert isinstance(result, pd.DataFrame)
    assert set(result.columns) == {"continuous", "discrete"}

    # np.nan != np.nan, so the set cannot be compared to a literal; check
    # its size, the presence of a null, and the two string categories.
    observed = set(result["discrete"].unique())
    assert len(observed) == 3
    assert any(pd.isnull(item) for item in observed)
    assert {"b", "c"} <= observed
示例#12
0
def test_log_frequency():
    """Compare sampled category counts with and without ``log_frequency``.

    Training data is heavily skewed (950 "a", 25 "b", 25 "c"). With the
    default constructor, fewer than 6500 of 10000 samples may be "a"; with
    ``log_frequency=False`` passed to the constructor, more than 9000 must
    be "a".
    """
    skewed = np.repeat(["a", "b", "c"], [950, 25, 25])
    frame = pd.DataFrame({
        "continuous": np.random.random(1000),
        "discrete": skewed,
    })
    categorical = ["discrete"]

    # Default settings.
    default_model = CTGANSynthesizer()
    default_model.fit(frame, categorical, epochs=100)
    default_counts = default_model.sample(10000)["discrete"].value_counts()
    assert default_counts["a"] < 6500

    # log_frequency disabled at construction time.
    raw_model = CTGANSynthesizer(log_frequency=False)
    raw_model.fit(frame, categorical, epochs=100)
    raw_counts = raw_model.sample(10000)["discrete"].value_counts()
    assert raw_counts["a"] > 9000
示例#13
0
def test_log_frequency():
    """Compare sampled category counts with and without ``log_frequency``.

    The discrete column is 95% 'a'. A default fit must yield fewer than
    6500 'a' values out of 10000 samples; a fit with ``log_frequency=False``
    (passed to ``fit`` in this variant) must yield more than 9000.
    """
    training_frame = pd.DataFrame({
        'continuous': np.random.random(1000),
        'discrete': np.repeat(['a', 'b', 'c'], [950, 25, 25])
    })
    categorical = ['discrete']

    # Default fit.
    model = CTGANSynthesizer()
    model.fit(training_frame, categorical, epochs=100)
    drawn = model.sample(10000)
    a_count = drawn['discrete'].value_counts()['a']
    assert a_count < 6500

    # Fit with log_frequency switched off.
    model = CTGANSynthesizer()
    model.fit(training_frame, categorical, epochs=100, log_frequency=False)
    drawn = model.sample(10000)
    a_count = drawn['discrete'].value_counts()['a']
    assert a_count > 9000
示例#14
0
import numpy as np
import pandas as pd
import os
import sys
import tqdm
import pickle
import pathlib
from pathlib import Path


def get_domain_dims(DIR='us_import1'):
    """Load the pickled ``domain_dims`` object for one data directory.

    Args:
        DIR: Name of the sub-directory under ``./generated_data_v1`` that
            holds ``domain_dims.pkl``. The path is relative to the current
            working directory.

    Returns:
        Whatever object was pickled into ``domain_dims.pkl``.
    """
    pickle_path = f'./generated_data_v1/{DIR}/domain_dims.pkl'
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)


def convert_np_to_pd(data_np, domain_dims):
    """Wrap an array in a DataFrame whose columns are the ``domain_dims`` keys.

    Args:
        data_np: Array-like data passed to ``pd.DataFrame``.
        domain_dims: Mapping whose keys, in iteration order, become the
            column names.

    Returns:
        A ``(DataFrame, column_names)`` tuple, where ``column_names`` is the
        list of keys used as columns.
    """
    column_names = [*domain_dims.keys()]
    frame = pd.DataFrame(data_np, columns=column_names)
    return frame, column_names


# Script: load the stored positive samples, label their columns with the
# domain names, fit a synthesizer on the result and save it to disk.
real_data = np.load('./generated_data_v1/us_import1/pos_data.npy')
domain_dims = get_domain_dims()
data_df, columns = convert_np_to_pd(real_data, domain_dims)

# NOTE(review): CTGANSynthesizer is not imported in the visible part of this
# file; it must be brought into scope before this runs.
ctgan_obj = CTGANSynthesizer()
# Fit on the DataFrame built above. The previous code passed the undefined
# name ``data``, which raised NameError. Every column is passed as a
# discrete column.
ctgan_obj.fit(data_df, columns)
ctgan_obj.save('ctgan.pkl')