def fit_score_model(name, model_kwargs, train_data, test_data,
                    continuous_columns, sensitive_column, sample_rows,
                    store_samples):
    """Fit and score models using given params."""
    for index, kwargs in enumerate(model_kwargs):
        logger.info('Training TGAN Model %d/%d', index + 1, len(model_kwargs))

        tf.reset_default_graph()
        base_dir = os.path.join('experiments', name)
        output = os.path.join(base_dir, 'model_{}'.format(index))
        model = TGANModel(continuous_columns, sensitive_column,
                          output=output, **kwargs)
        model.fit(train_data)
        sampled_data = model.sample(sample_rows)

        if store_samples:
            dir_name = os.path.join(base_dir, 'data')
            if not os.path.isdir(dir_name):
                os.mkdir(dir_name)
            file_name = os.path.join(dir_name, 'model_{}.csv'.format(index))
            sampled_data.to_csv(file_name, index=False, header=True)

        score, p_rules = evaluate_classification(
            sampled_data, test_data, continuous_columns, sensitive_column)
        model_kwargs[index]['score'] = score
        model_kwargs[index]['p-rules (train/test/all)'] = p_rules

    return model_kwargs
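# A minimal usage sketch for fit_score_model (hedged: the DataFrames,
# column choices, and kwargs grid below are illustrative assumptions,
# not part of the original code).
model_grid = [
    {'max_epoch': 5, 'batch_size': 200},
    {'max_epoch': 5, 'batch_size': 500},
]
results = fit_score_model(
    name='adult',
    model_kwargs=model_grid,
    train_data=train_df,            # pandas.DataFrame with training rows
    test_data=test_df,              # held-out pandas.DataFrame
    continuous_columns=[0, 2, 4],   # indices of the continuous columns
    sensitive_column='sex',         # column used for the p-rule metric
    sample_rows=1000,
    store_samples=True,
)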
def build_and_train(params):
    tf.reset_default_graph()
    gen_layers = [int(params['gen_layer_sizes'])] * int(params['gen_num_layers'])
    print(gen_layers)
    crit_layers = [int(params['crit_layer_sizes'])] * int(params['crit_num_layers'])
    print(crit_layers)

    d = params.get('dataset')
    continuous_columns = d.info.get('continuous_columns')
    print('Batch Size: ' + str(params.get('batch_size')))

    savestr = str(np.random.randint(1, 999999))
    my_tgan = TGANModel(continuous_columns=continuous_columns,
                        batch_size=int(params.get('batch_size')),
                        z_dim=int(params.get('embedding_dim')),
                        learning_rate=params.get('learning_rate'),
                        num_gen_rnn=int(params.get('gen_num_layers')),
                        num_gen_feature=int(params.get('gen_layer_sizes')),
                        num_dis_layers=int(params.get('crit_num_layers')),
                        num_dis_hidden=int(params.get('crit_layer_sizes')),
                        max_epoch=EPOCHS,
                        steps_per_epoch=50,
                        restore_session=False,
                        output=savestr)

    print('Fitting a TGAN model for {0} epochs...'.format(EPOCHS))
    train_copy = d.train.copy()
    my_tgan.fit(train_copy)
    print('Successfully fitted a TGAN model')
    return my_tgan
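# Hedged example of calling build_and_train directly; the keys mirror
# those read inside the function, and the dataset object is assumed to
# expose .info and .train as in the surrounding scripts.
params = {
    'embedding_dim': 128,
    'gen_num_layers': 2, 'gen_layer_sizes': 256,
    'crit_num_layers': 2, 'crit_layer_sizes': 256,
    'learning_rate': 1e-6,
    'batch_size': 500,
    'dataset': load_data('ln'),  # same loader used in main() below
}
my_tgan = build_and_train(params)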
import pandas as pd

from tgan.model import TGANModel


def augment_tgan(csvfile):
    data = pd.read_csv(csvfile)
    cols = list(data)
    # treat every column except the last one as continuous
    cols_num = list()
    for i in range(len(cols) - 1):
        cols_num.append(i)
    tgan = TGANModel(cols_num)
    tgan.fit(data)
    # now create a number of samples (10% of the original data)
    num_samples = int(0.10 * len(data))
    samples = tgan.sample(num_samples)
    print(samples)
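# Hedged usage: any CSV whose last column is categorical should work
# here; the filename is an illustrative assumption.
augment_tgan('adult.csv')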
def test___init__(self):
    """On init, arguments are set as attributes."""
    # Setup
    continuous_columns = []

    # Run
    instance = TGANModel(continuous_columns)

    # Check
    assert instance.continuous_columns == continuous_columns
    assert instance.log_dir == 'output/logs'
    assert instance.model_dir == 'output/model'
    assert instance.max_epoch == 5
    assert instance.steps_per_epoch == 10000
    assert instance.batch_size == 200
    assert instance.z_dim == 200
    assert instance.gpu is None
    assert instance.save_checkpoints is True
    assert instance.restore_session is True
import os
import re

from tgan.model import TGANModel


def getSavePath(path, name):
    # NOTE: the original snippet started mid-function; the scan over the
    # existing files below is a reconstructed assumption. Only the final
    # comparison and the return statement come from the original.
    index = 0
    for filename in os.listdir(path):
        match = re.match(re.escape(name) + r'_(\d+)\.csv$', filename)
        if match:
            number = int(match.group(1))
            if (number > index):
                index = number + 1
    return path + "/" + name + "_" + str(index) + ".csv"


# dynamically generate names for the synthetic dataset
# (e.g. "synthetic_adult_1", "synthetic_adult_2")
pathToSave = getSavePath("Synthetic_data", "synthetic_adult")

# number of samples that we want to generate
num_samples = 400

# trained model location
model_path = 'models/Adult_2.pkl'

# load the TGAN model that was previously trained
tgan = TGANModel.load(model_path)

# after fitting, we can sample new synthetic data as a pandas.DataFrame
samples = tgan.sample(num_samples)
print(pathToSave)
print(samples)
samples.head()

# save the generated data as a CSV file without the index column
samples.to_csv(pathToSave, index=False)

# save the model; use force=True to overwrite
tgan.save(model_path, force=True)
# mark every column with more than 4 distinct values as continuous
continuous = []
for col in ori_data.columns:
    if ori_data[col].nunique() > 4:
        continuous.append(col)
continuous_columns = continuous

tgan = TGANModel(continuous_columns=continuous,
                 output='2) synthetic data generation/tGAN/bioresponse/0/',
                 gpu=0,
                 max_epoch=1,
                 steps_per_epoch=6000,
                 save_checkpoints=True,
                 restore_session=False,
                 batch_size=256,
                 z_dim=200,
                 noise=0.2,
                 l2norm=0.00001,
                 learning_rate=0.001,
                 num_gen_rnn=100,
                 num_gen_feature=100,
                 num_dis_layers=1,
                 num_dis_hidden=100,
                 optimizer='AdamOptimizer')

tgan.fit(fraud_data)

model_path = '2) synthetic data generation/tGAN/bioresponse/0/tGAN_bio_0_model.pkl'
tgan.save(model_path, force=True)  # force=True to overwrite

loaded_tgan = TGANModel.load(model_path)
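# Hedged follow-up: once reloaded, the model can be sampled like any
# fitted TGANModel; the row count here is an illustrative choice.
new_samples = loaded_tgan.sample(1000)
print(new_samples.head())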
def main(params=None, optim=True):
    if params is None:
        params = {
            # Regular parameters
            'training_set': 'ln',
            'eval': 'all',
            # NN Hyperparameters
            'embedding_dim': 128,
            'gen_num_layers': 2,
            'gen_layer_sizes': 256,
            'crit_num_layers': 2,
            'crit_layer_sizes': 256,
            'learning_rate': 10**-6,
            'batch_size': 500,
            'training_iter': 1,
        }
    if optim:
        # Overwrite NN hyperparameters with the stochastic variant from
        # the top of the file
        params.update(space)

    print('Starting TGAN main script with the following parameters:')
    for key in params:
        print(key, params[key])
    params['model'] = 'tgan'

    # Load dataset
    dataset = load_data(params.get('training_set'))
    params['dataset'] = dataset
    print('Successfully loaded dataset {0}'.format(params.get('training_set')))

    if params['model'] in dataset.samples:
        # Samples were already generated for this test setup
        # (identifier/dataset/model)
        samples = dataset.samples.get(params['model'])
    else:
        # Train the model and generate samples
        if optim:
            # Optimize or load a TGAN model
            filename = os.path.join(RESULT_DIR, params.get('training_set'),
                                    params.get('model') + '_optimized')
            if os.path.isfile(filename):
                my_tgan = TGANModel.load(filename)
                print('Successfully loaded old optimized TGAN model from {0}'.format(filename))
            else:
                best, trials = optimize(params, filename + '.json')
                best['dataset'] = dataset
                my_tgan = build_and_train(best)
                my_tgan.save(filename)
                print('Saved the optimized TGAN model at {0}'.format(filename))
        else:
            # Train or load a default TGAN model
            filename = os.path.join(RESULT_DIR, params.get('training_set'),
                                    params.get('model') + '_default')
            if os.path.isfile(filename):
                my_tgan = TGANModel.load(filename)
                print('Successfully loaded old TGAN model from {0}'.format(filename))
            else:
                my_tgan = build_and_train(params=params)
                my_tgan.save(filename)
                print('Saved the TGAN model at {0}'.format(filename))

        # Sample from the model
        print('Sampling from the TGAN model...')
        samples = sampler(my_tgan, params)
        save_samples(samples, params['training_set'],
                     model=params.get('model'), force=True)
        print('Saved the TGAN samples')

    # Evaluate the fitted model
    if params['eval'] == 'all':
        print('Starting MLE evaluation on samples...')
        discrete_columns, continuous_columns = dataset.get_columns()
        plot_predictions_by_dimension(real=dataset.train,
                                      samples=samples,
                                      data_test=dataset.test,
                                      discrete_columns=discrete_columns,
                                      continuous_columns=continuous_columns,
                                      dataset=params.get('training_set'),
                                      model=params.get('model'))

        print('Plotting marginals of real and sample data...')
        plot_marginals(dataset.train, samples,
                       params.get('training_set'), params.get('model'))

        print('Plotting association matrices...')
        diff = plot_association(dataset, samples,
                                params.get('training_set'),
                                params.get('model'))
        print(diff)

        alist = params.get('training_set').split(sep='-', maxsplit=1)
        dataset = alist[0]
        basepath = os.path.join(RESULT_DIR, *alist, params.get('model'))
        filepath = os.path.join(
            basepath,
            '{0}_{1}_c_marginals.png'.format(dataset, params.get('model')))
        save_json(diff, filepath)
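# Hedged entry point (an assumption: the original snippet did not show
# how main() is invoked):
if __name__ == '__main__':
    main()               # optimize hyperparameters via the `space` dict
    # main(optim=False)  # or train once with the default parameters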
import pandas as pd

from tgan.model import TGANModel

d = pd.read_csv('../data/berka/berka_cat.csv', sep=';')
d = d.drop(['trans_bank_partner', 'trans_account_partner'], axis=1)
continuous_columns = [0, 1, 2, 3, 9]

tgan = TGANModel(continuous_columns,
                 restore_session=False,
                 max_epoch=50,
                 steps_per_epoch=1000,
                 batch_size=1000)
tgan.fit(d)

model_path = 'demo/my_model'
tgan.save(model_path)
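# Hedged continuation: reload the saved model and draw synthetic rows,
# mirroring the load/sample pattern used elsewhere in this collection.
loaded = TGANModel.load(model_path)
berka_samples = loaded.sample(1000)  # row count is an illustrative choice
print(berka_samples.head())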
import json
import sys
import time

import pandas as pd

from tgan.model import TGANModel

# read the experiment configuration from the JSON file given on the CLI
with open(str(sys.argv[1]), 'r') as f:
    config = json.load(f)

df_train = pd.read_pickle(config['df_train'])
cont_columns = config['continuous_cols']

tgan = TGANModel(cont_columns,
                 batch_size=config['batch_size'],
                 z_dim=config['z_dim'],
                 num_gen_rnn=config['num_gen_rnn'],
                 num_gen_feature=config['num_gen_feature'],
                 num_dis_layers=config['num_dis_layers'],
                 num_dis_hidden=config['num_dis_hidden'],
                 learning_rate=config['learning_rate'],
                 noise=config['noise'],
                 max_epoch=config['max_epoch'],
                 steps_per_epoch=config['steps_per_epoch'])

model_path = config['model_path']
start_time = time.time()

# fit the TGAN
tgan.fit(df_train)
print("--- %s seconds ---" % (time.time() - start_time))

tgan.save(model_path, force=True)
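# A hedged example of the JSON config this script expects; every value
# below is an illustrative assumption matching the keys read above:
#
# {
#     "df_train": "data/train.pkl",
#     "continuous_cols": [0, 2, 4],
#     "batch_size": 200,
#     "z_dim": 200,
#     "num_gen_rnn": 100,
#     "num_gen_feature": 100,
#     "num_dis_layers": 1,
#     "num_dis_hidden": 100,
#     "learning_rate": 0.001,
#     "noise": 0.2,
#     "max_epoch": 5,
#     "steps_per_epoch": 10000,
#     "model_path": "models/tgan_run.pkl"
# }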
# model save location
model_path = 'models/Adult_2.pkl'

# the TGAN model needs to know which dataset columns are continuous
continuous_columns = [0, 2, 4, 10, 11, 12]

# set the network parameters: epochs, batch size, optimizer, etc.
tgan = TGANModel(continuous_columns,
                 output='output',
                 max_epoch=10,
                 steps_per_epoch=400,
                 save_checkpoints=True,
                 restore_session=False,
                 batch_size=200,
                 z_dim=200,
                 noise=0.2,
                 l2norm=0.00001,
                 learning_rate=0.001,
                 num_gen_rnn=100,
                 num_gen_feature=100,
                 num_dis_layers=1,
                 num_dis_hidden=100,
                 optimizer='AdamOptimizer')

# train phase (`data` and `num_samples` are assumed to be defined earlier)
tgan.fit(data)
print("Fitted model!!!")

# after fitting, we can sample new synthetic data as a pandas.DataFrame
samples = tgan.sample(num_samples)
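# Hedged completion: model_path is defined above but never used in the
# original snippet; presumably the fitted model would be saved there.
tgan.save(model_path, force=True)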