def main():
    """Run the command-line workflow: read data, fit, optionally save, sample, write.

    Steps, in order:
      1. Parse the command-line arguments with ``_parse_args``.
      2. Read the input table and its discrete column list, from TSV when
         ``args.tsv`` is set and from CSV otherwise.
      3. Obtain a ``CTGANSynthesizer`` (loaded from ``args.load`` when given,
         otherwise freshly constructed) and fit it for ``args.epochs``.
      4. Save the model when ``args.save`` is given.
      5. Draw ``args.num_samples`` rows (falling back to ``len(data)``),
         passing the condition column and value through to ``model.sample``.
      6. Write the sampled rows to ``args.output`` in the same format as the
         input.

    Raises:
        ValueError: If ``args.sample_condition_column`` is set but
            ``args.sample_condition_column_value`` is not.
    """
    args = _parse_args()

    if args.tsv:
        data, discrete_columns = read_tsv(args.data, args.metadata)
    else:
        data, discrete_columns = read_csv(
            args.data, args.metadata, args.header, args.discrete
        )

    if args.load:
        model = CTGANSynthesizer.load(args.load)
    else:
        model = CTGANSynthesizer()
    # NOTE(review): fit is applied to loaded models as well as fresh ones;
    # confirm this is the intended statement nesting.
    model.fit(data, discrete_columns, args.epochs)

    if args.save is not None:
        model.save(args.save)

    # A missing or zero sample count falls back to the size of the input data.
    num_samples = args.num_samples or len(data)

    # Explicit check instead of `assert`: assertions are removed under
    # `python -O`, which would let an incomplete condition reach the sampler.
    if (
        args.sample_condition_column is not None
        and args.sample_condition_column_value is None
    ):
        raise ValueError(
            "sample_condition_column_value must be provided when "
            "sample_condition_column is set"
        )

    sampled = model.sample(
        num_samples,
        args.sample_condition_column,
        args.sample_condition_column_value,
    )

    if args.tsv:
        write_tsv(sampled, args.metadata, args.output)
    else:
        sampled.to_csv(args.output, index=False)
import numpy as np
import pandas as pd
import os
import sys
import tqdm
import pickle
import pathlib
from pathlib import Path


def get_domain_dims(DIR='us_import1'):
    """Load the pickled ``domain_dims`` object for one generated data set.

    Args:
        DIR: Name of the sub-directory of ``./generated_data_v1`` that
            contains ``domain_dims.pkl``.

    Returns:
        The object stored in the pickle file. ``convert_np_to_pd`` calls
        ``.keys()`` on it, so it is presumably a dict keyed by column name
        (verify against the code that writes the file).
    """
    # CAUTION: pickle.load can execute arbitrary code; only point this at
    # files produced by a trusted pipeline.
    with open('./generated_data_v1/{}/domain_dims.pkl'.format(DIR), 'rb') as fh:
        domain_dims = pickle.load(fh)
    return domain_dims


def convert_np_to_pd(data_np, domain_dims):
    """Wrap a NumPy array in a DataFrame whose columns are the domain names.

    Args:
        data_np: Array-like data passed straight to ``pd.DataFrame``.
        domain_dims: Mapping whose keys, in iteration order, become the
            column labels.

    Returns:
        A ``(df, columns)`` tuple: the new DataFrame and the list of column
        names used to label it.
    """
    columns = list(domain_dims.keys())
    df = pd.DataFrame(data=data_np, columns=columns)
    return df, columns


# Script body: load the data, label its columns and train a synthesizer.
real_data = np.load('./generated_data_v1/us_import1/pos_data.npy')
domain_dims = get_domain_dims()
data_df, columns = convert_np_to_pd(real_data, domain_dims)

# NOTE(review): CTGANSynthesizer is not imported in the visible part of this
# file; it is assumed to be brought into scope elsewhere. Confirm.
ctgan_obj = CTGANSynthesizer()
# Fit on the DataFrame built above. The original passed an undefined name
# `data`, which raised NameError before any training happened. Every column
# is passed as a discrete column.
ctgan_obj.fit(data_df, columns)
ctgan_obj.save('ctgan.pkl')