Exemplo n.º 1
0
def optimize(space, file_path=None, max_evals=5):
    """Search GAN hyperparameters with hyperopt's TPE algorithm.

    Args:
        space: Dict-like search space handed to ``fmin``. Its ``'model'``
            entry selects which synthesizer module supplies
            ``build_and_train``, ``sampler`` and ``optim_loss``.
        file_path: Optional path; when given, the result of ``fmin`` is
            written there with ``save_json``.
        max_evals: Number of evaluations ``fmin`` is allowed to run.

    Returns:
        A ``(best, bayes_trials)`` tuple: the value returned by ``fmin`` and
        the ``Trials`` object that recorded every evaluation.

    Raises:
        ValueError: If ``space['model']`` is not a supported model.
    """
    model = space.get('model')
    if model == 'ctgan':
        from gan_thesis.models.ctgan.synthesizer import build_and_train, sampler, optim_loss
    elif model == 'tgan':
        from gan_thesis.models.tgan.synthesizer import build_and_train, sampler, optim_loss
    else:
        # Without this guard the training helpers stay unbound and the first
        # objective evaluation fails with an opaque NameError inside fmin.
        raise ValueError(
            "Unsupported model {0!r}; expected 'ctgan' or 'tgan'".format(model))

    def objective(params):
        """Objective function for GAN Hyperparameter Tuning"""
        my_gan = build_and_train(params)
        samples = sampler(my_gan, params)
        loss = optim_loss(samples.data, params)

        # The loss is stored on the sampled parameters so it travels with them
        # in the trial result below.
        params['loss'] = loss

        # Drop the references to the model and samples before the next trial.
        del my_gan, samples

        # Dictionary with information for evaluation
        return {'loss': loss, 'params': params, 'status': STATUS_OK}

    # Trials object to track progress
    bayes_trials = Trials()

    # The Trials object must be passed to fmin; otherwise the object returned
    # to the caller never records any evaluation.
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=bayes_trials)
    if file_path is not None:
        save_json(best, file_path)

    return best, bayes_trials
Exemplo n.º 2
0
def main(params=None, optim=False):
    """Train (or load) a WGAN-GP model, sample from it and evaluate the samples.

    Args:
        params: Run configuration dict. When ``None`` a default configuration
            is built. The dict is modified in place: ``'model'``,
            ``'dataset'`` and ``'log_directory'`` are set, and the
            module-level ``space`` is merged in when ``optim`` is true.
        optim: When true, load a previously optimized model from disk if one
            exists, otherwise run ``optimize`` and train on its result. When
            false, train directly with ``params``.
    """
    if params is None:
        params = {
            # Regular parameters
            'training_set': 'cat_mix_gauss-test1',
            'eval': 'all',
            # NN Hyperparameters
            'EPOCHS': EPOCHS,
            'embedding_dim': 128,
            'gen_num_layers': 2,
            'gen_layer_sizes': 256,
            'crit_num_layers': 2,
            'crit_layer_sizes': 256,
            'mode': 'wgan-gp',
            'gp_const': 10,
            'n_critic': 5,
            'batch_size': 500,
            'hard': False,
            'temp_anneal': False
        }

    if optim:
        # Overwrite NN hyperparameters with stochastic variant from top of file
        params.update(space)

    print('Starting wgan-gp main script with following parameters:')
    for key, value in params.items():
        print(key, value)
    params['model'] = 'wgan'

    # Load dataset
    training_set = params.get('training_set')
    print(training_set)
    dataset = load_data(training_set)
    params['dataset'] = dataset

    print('Successfully loaded dataset {0}'.format(training_set))

    # '<name>-<identifier>' becomes nested result directories <name>/<identifier>.
    alist = training_set.split(sep='-', maxsplit=1)

    basepath = os.path.join(RESULT_DIR, *alist, params.get('model'))
    filepath = os.path.join(
        basepath, '{0}_{1}_ass_diff.json'.format(alist[0],
                                                 params.get('model')))
    # A caller-supplied log directory is interpreted relative to basepath.
    if params.get('log_directory') is not None:
        params['log_directory'] = os.path.join(basepath,
                                               params['log_directory'])
    else:
        params['log_directory'] = basepath

    if optim:
        # Optimize or load wgan model
        filename = os.path.join(RESULT_DIR, training_set,
                                params.get('model') + '_optimized')
        if os.path.isfile(filename):
            my_wgan = load_model(filename)
            print(
                'Successfully loaded old optimized wgan model from {0}'.format(
                    filename))
        else:
            # NOTE(review): requires optimize() to support the 'wgan' model -
            # confirm against its implementation.
            best, _ = optimize(params, filename + '.json')
            # Every other build_and_train call here receives a dict carrying
            # 'dataset', so attach it to the optimizer result as well.
            best['dataset'] = dataset
            my_wgan = build_and_train(best)
            save_model(my_wgan, filename, force=True)
            print('Saved the optimized wgan model at {0}'.format(filename))
    else:
        # Train the wgan model; the freshly trained model is not persisted.
        my_wgan = build_and_train(params=params)

    # Sample from model
    print('Sampling from the wgan model...')
    samples = sampler(my_wgan, params)
    save_samples(samples, params['training_set'], model='wgan')
    print('Saved the wgan samples')

    # Evaluate fitted model
    if params['eval'] == 'all':
        print('Starting MLE evaluation on samples...')
        discrete_columns, continuous_columns = dataset.get_columns()
        plot_predictions_by_dimension(real=dataset.train,
                                      samples=samples,
                                      data_test=dataset.test,
                                      discrete_columns=discrete_columns,
                                      continuous_columns=continuous_columns,
                                      dataset=training_set,
                                      model='wgan')
        print('Plotting marginals of real and sample data...')
        plot_marginals(dataset.train, samples, training_set, 'wgan')
        print('Plotting association matrices...')
        diff = plot_association(dataset, samples, training_set,
                                params.get('model'))
        print(diff)
        save_json(diff, filepath)
Exemplo n.º 3
0
def main(params=None, optim=True):
    """Train (or load) a CTGAN model, sample from it and evaluate the samples.

    Args:
        params: Run configuration dict. When ``None`` a default configuration
            is built. The dict is modified in place: ``'model'`` and
            ``'dataset'`` are set, and the module-level ``space`` is merged in
            when ``optim`` is true.
        optim: When true, load a previously optimized model from disk if one
            exists, otherwise run ``optimize`` and train on its result. When
            false, load the default model from disk if present, otherwise
            train with ``params``.
    """
    if params is None:
        params = {
            # Regular parameters
            'training_set': 'mvn-test2',
            'eval': 'all',
            # NN Hyperparameters
            'embedding_dim': 128,
            'gen_num_layers': 2,
            'gen_layer_sizes': 256,
            'crit_num_layers': 2,
            'crit_layer_sizes': 256,
            'l2scale': 10**-6,
            'batch_size': 500
        }

    if optim:
        # Overwrite NN hyperparameters with stochastic variant from top of file
        params.update(space)

    print('Starting CTGAN main script with following parameters:')
    for key, value in params.items():
        print(key, value)
    params['model'] = 'ctgan'
    model_name = params['model']

    # Load dataset
    training_set = params.get('training_set')
    dataset = load_data(training_set)
    params['dataset'] = dataset
    print('Successfully loaded dataset {0}'.format(training_set))

    if model_name in dataset.samples:
        #  If we are here, we have already generated samples for this test setup (identifier/dataset/model)
        samples = dataset.samples.get(model_name)
    else:
        if optim:
            # Optimize or load CTGAN model
            filename = os.path.join(RESULT_DIR, training_set, model_name + '_optimized')
            if os.path.isfile(filename):
                my_ctgan = load_model(filename)
                print('Successfully loaded old optimized CTGAN model from {0}'.format(filename))
            else:
                best, _ = optimize(params, filename + '.json')
                best['dataset'] = dataset
                my_ctgan = build_and_train(best)
                save_model(my_ctgan, filename, force=True)
                print('Saved the optimized CTGAN model at {0}'.format(filename))
        else:
            # Train or load CTGAN model
            filename = os.path.join(RESULT_DIR, training_set, model_name + '_default')
            if os.path.isfile(filename):
                # The model must actually be loaded here; otherwise my_ctgan is
                # unbound when sampling below.
                my_ctgan = load_model(filename)
                print('Successfully loaded old CTGAN model from {0}'.format(filename))
            else:
                my_ctgan = build_and_train(params=params)
                # Persisting the default model is disabled, so do not report
                # it as saved.
                print('Trained the CTGAN model (not saved to {0})'.format(filename))

        # Sample from model
        print('Sampling from the CTGAN model...')
        samples = sampler(my_ctgan, params)
        save_samples(samples, params['training_set'], model=model_name, force=True)
        print('Saved the CTGAN samples')

    # Evaluate fitted model
    if params['eval'] == 'all':
        print('Starting MLE evaluation on samples...')
        discrete_columns, continuous_columns = dataset.get_columns()
        plot_predictions_by_dimension(real=dataset.train, samples=samples, data_test=dataset.test,
                                      discrete_columns=discrete_columns, continuous_columns=continuous_columns,
                                      dataset=training_set, model=model_name)
        print('Plotting marginals of real and sample data...')
        plot_marginals(dataset.train, samples, training_set, model_name)
        print('Plotting association matrices...')
        diff = plot_association(dataset, samples, training_set, model_name)
        print(diff)

        # '<name>-<identifier>' becomes nested result directories <name>/<identifier>.
        alist = training_set.split(sep='-', maxsplit=1)
        dataset_name = alist[0]
        basepath = os.path.join(RESULT_DIR, *alist, model_name)
        # The association difference is JSON, so it gets its own .json file
        # instead of a '*_c_marginals.png' name.
        filepath = os.path.join(basepath, '{0}_{1}_ass_diff.json'.format(dataset_name, model_name))

        save_json(diff, filepath)
Exemplo n.º 4
0
def _read_association_csv(file_path):
    """Read a stored association matrix and use its column names as the index."""
    matrix = pd.read_csv(file_path)
    # The first column holds the index written by DataFrame.to_csv.
    matrix = matrix.iloc[:, 1:]
    return matrix.set_index(matrix.columns)


def _model_association(complete_dataset, base_path, dataset, model):
    """Load the cached association matrix of ``model`` or compute and cache it."""
    file_path = os.path.join(
        base_path, '{0}_{1}_association.csv'.format(model, dataset))
    if os.path.exists(file_path):
        matrix = _read_association_csv(file_path)
        print('loaded {0} association matrix'.format(model.upper()))
        return matrix

    # Wrap the stored samples so that association() can consume them.
    samples_dataset = Dataset(None, None, complete_dataset.samples.get(model),
                              complete_dataset.info, None)
    matrix = association(samples_dataset)
    matrix.to_csv(file_path)
    return matrix


def plot_all_association(complete_dataset,
                         dataset,
                         force=True,
                         pass_tgan=True):
    """Plot the association matrices of the real data and each GAN side by side.

    Association matrices are read from CSV files in the result directory when
    present; otherwise they are computed with ``association`` and written
    there. The figure is saved as ``<name>_all_association.png`` and the
    association differences as ``<name>_euclidian_distance.json``.

    Args:
        complete_dataset: Object passed to ``association`` for the real data;
            its ``samples`` mapping and ``info`` attribute are used for the
            generated data.
        dataset: Dataset identifier, split once on ``'-'`` to build the nested
            result directory.
        force: When true, an existing figure file is removed before saving.
        pass_tgan: When true, TGAN is included as a fourth panel and in the
            saved differences.
    """
    alist = dataset.split(sep='-', maxsplit=1)
    base_path = os.path.join(RESULT_DIR, *alist)
    os.makedirs(base_path, exist_ok=True)

    real_path = os.path.join(base_path,
                             'real_{0}_association.csv'.format(dataset))
    if os.path.exists(real_path):
        association_real = _read_association_csv(real_path)
        print('loaded real association matrix')
    else:
        association_real = association(complete_dataset)
        association_real.to_csv(real_path)

    # Number of distinct column pairs, used to normalize the differences.
    n_col = len(association_real.columns.to_list())
    n_pairs = 0.5 * n_col * (n_col - 1)

    models = ['wgan', 'ctgan']
    if pass_tgan:
        models.append('tgan')

    diff = {}
    panels = [('Real', association_real)]
    for model in models:
        matrix = _model_association(complete_dataset, base_path, dataset,
                                    model)
        diff[model] = association_difference(association_real=association_real,
                                             association_samples=matrix)
        diff[model + '_norm'] = diff[model] / n_pairs
        panels.append((model.upper(), matrix))

    colormap = sns.diverging_palette(20, 220, n=256)
    # The np.bool alias was removed in NumPy 1.24; the builtin bool is equivalent.
    mask = np.triu(np.ones_like(association_real, dtype=bool))

    fig, axes = plt.subplots(1, len(panels), figsize=(5 * len(panels), 6))
    cbar_ax = fig.add_axes([.94, .5, .02, .4])

    last = len(panels) - 1
    for idx, (ax, (title, matrix)) in enumerate(zip(axes, panels)):
        ax.set_title(title)
        ax.set_aspect('equal')
        # Only the right-most panel draws the shared colour bar.
        show_cbar = idx == last
        chart = sns.heatmap(matrix,
                            vmin=-1,
                            vmax=1,
                            mask=mask,
                            annot=False,
                            cmap=colormap,
                            ax=ax,
                            cbar=show_cbar,
                            cbar_ax=cbar_ax if show_cbar else None)
        if idx == 0:
            chart.set_yticklabels(labels=chart.get_yticklabels(), rotation=0)

    plt.subplots_adjust(wspace=0.1)
    plt.tight_layout()

    # Output files are named after the part of the identifier before the '-'.
    short_name = alist[0]
    figure_path = os.path.join(base_path,
                               '{0}_all_association.png'.format(short_name))
    if os.path.isfile(figure_path) and force:
        os.remove(figure_path)

    plt.savefig(figure_path)
    plt.close()

    json_path = os.path.join(base_path,
                             '{0}_euclidian_distance.json'.format(short_name))
    save_json(diff, json_path)