def main(_):
  # Problem parameters
  num_contexts = 2000

  # Data type in {linear, sparse_linear, mushroom, financial, jester,
  # statlog, adult, covertype, census, wheel}
  data_type = 'mushroom'

  # Create dataset
  sampled_vals = sample_data(data_type, num_contexts)
  dataset, opt_rewards, opt_actions, num_actions, context_dim = sampled_vals

  # Define hyperparameters and algorithms
  hparams = tf.contrib.training.HParams(num_actions=num_actions)

  hparams_linear = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim,
      a0=6, b0=6, lambda_prior=0.25, initial_pulls=2)

  hparams_rms = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      optimizer='RMS', reset_lr=True, lr_decay_rate=0.5, training_freq=50,
      training_epochs=100, p=0.95, q=3)

  hparams_dropout = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      optimizer='RMS', reset_lr=True, lr_decay_rate=0.5, training_freq=50,
      training_epochs=100, use_dropout=True, keep_prob=0.80)

  hparams_bbb = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      optimizer='RMS', use_sigma_exp_transform=True, cleared_times_trained=10,
      initial_training_steps=100, noise_sigma=0.1, reset_lr=False,
      training_freq=50, training_epochs=100)

  hparams_nlinear = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      reset_lr=True, lr_decay_rate=0.5, training_freq=1,
      training_freq_network=50, training_epochs=100,
      a0=6, b0=6, lambda_prior=0.25)

  hparams_nlinear2 = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      reset_lr=True, lr_decay_rate=0.5, training_freq=10,
      training_freq_network=50, training_epochs=100,
      a0=6, b0=6, lambda_prior=0.25)

  hparams_pnoise = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      optimizer='RMS', reset_lr=True, lr_decay_rate=0.5, training_freq=50,
      training_epochs=100, noise_std=0.05, eps=0.1, d_samples=300)

  hparams_alpha_div = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      optimizer='RMS', use_sigma_exp_transform=True, cleared_times_trained=10,
      initial_training_steps=100, noise_sigma=0.1, reset_lr=False,
      training_freq=50, training_epochs=100,
      alpha=1.0, k=20, prior_variance=0.1)

  hparams_gp = tf.contrib.training.HParams(
      num_actions=num_actions, num_outputs=num_actions,
      context_dim=context_dim, reset_lr=False, learn_embeddings=True,
      max_num_points=1000, show_training=False, freq_summary=1000,
      batch_size=512, keep_fixed_after_max_obs=True, training_freq=50,
      initial_pulls=2, training_epochs=100, lr=0.01, buffer_s=-1,
      initial_lr=0.001, lr_decay_rate=0.0, optimizer='RMS',
      task_latent_dim=5, activate_decay=False)

  algos = [
      UniformSampling('Uniform Sampling', hparams),
      UniformSampling('Uniform Sampling 2', hparams),
      FixedPolicySampling('fixed1', [0.75, 0.25], hparams),
      FixedPolicySampling('fixed2', [0.25, 0.75], hparams),
      PosteriorBNNSampling('RMS', hparams_rms, 'RMSProp'),
      PosteriorBNNSampling('Dropout', hparams_dropout, 'RMSProp'),
      PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),
      NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
      NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2),
      LinearFullPosteriorSampling('LinFullPost', hparams_linear),
      BootstrappedBNNSampling('BootRMS', hparams_rms),
      ParameterNoiseSampling('ParamNoise', hparams_pnoise),
      PosteriorBNNSampling('BBAlphaDiv', hparams_alpha_div, 'AlphaDiv'),
      PosteriorBNNSampling('MultitaskGP', hparams_gp, 'GP'),
  ]

  # Run contextual bandit problem
  t_init = time.time()
  results = run_contextual_bandit(context_dim, num_actions, dataset, algos)
  _, h_rewards = results

  # Display results
  display_results(algos, opt_rewards, opt_actions, h_rewards, t_init, data_type)
def main(_):
  # Problem parameters
  num_contexts = 20000
  nb_simulations = 2
  l_sizes = [50, 50]
  plt_dir = "plots/"
  dict_dir = "dicts/"

  # Data type in {linear, sparse_linear, mushroom, financial, jester,
  # statlog, adult, covertype, census, wheel}
  data_type = 'adult'

  # Create dataset
  sampled_vals = sample_data(data_type, num_contexts)
  dataset, opt_rewards, opt_actions, num_actions, context_dim = sampled_vals

  # Define hyperparameters and algorithms
  hparams = tf.contrib.training.HParams(num_actions=num_actions)

  hparams_linear = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim,
      a0=6, b0=6, lambda_prior=0.25, initial_pulls=2)

  hparams_dropout = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      optimizer='RMS', reset_lr=True, lr_decay_rate=0.5, training_freq=50,
      training_epochs=100, use_dropout=True, keep_prob=0.80)

  hparams_bbb = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      optimizer='RMS', use_sigma_exp_transform=True, cleared_times_trained=10,
      initial_training_steps=100, noise_sigma=0.1, reset_lr=False,
      training_freq=50, training_epochs=100)

  hparams_nlinear = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      reset_lr=True, lr_decay_rate=0.5, training_freq=1,
      training_freq_network=50, training_epochs=100,
      a0=6, b0=6, lambda_prior=0.25)

  hparams_nlinear2 = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      reset_lr=True, lr_decay_rate=0.5, training_freq=10,
      training_freq_network=50, training_epochs=100,
      a0=6, b0=6, lambda_prior=0.25)

  hparams_nlinear_finite_memory = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=64,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      reset_lr=True, lr_decay_rate=0.5, training_freq=1,
      training_freq_network=50, training_epochs=100,
      a0=6, b0=6, lambda_prior=1, mem=num_actions * 100,
      mu_prior_flag=1, sigma_prior_flag=1)

  hparams_nlinear_finite_memory_no_prior = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=64,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      reset_lr=True, lr_decay_rate=0.5, training_freq=1,
      training_freq_network=50, training_epochs=100,
      a0=6, b0=6, lambda_prior=1, mem=num_actions * 100,
      mu_prior_flag=0, sigma_prior_flag=0)

  hparams_nlinear_finite_memory_no_sig_prior = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=64,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      reset_lr=True, lr_decay_rate=0.5, training_freq=1,
      training_freq_network=50, training_epochs=100,
      a0=6, b0=6, lambda_prior=1, mem=num_actions * 100,
      mu_prior_flag=1, sigma_prior_flag=0)

  hparams_ucb = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      reset_lr=True, lr_decay_rate=0.5, optimizer='RMS', training_freq=1,
      training_freq_network=50, training_epochs=100, lambda_prior=0.25,
      delta=0.01, lamb=0.01, mu=1, S=1)

  algos = [
      # UniformSampling('Uniform Sampling', hparams),
      # FixedPolicySampling('fixed1', [0.75, 0.25], hparams),
      PosteriorBNNSampling('Dropout', hparams_dropout, 'RMSProp'),
      PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),
      NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
      # NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2),
      LinearFullPosteriorSampling('LinFullPost', hparams_linear),
      NeuralLinearPosteriorSamplingFiniteMemory(
          'NeuralLinearFiniteMemory', hparams_nlinear_finite_memory),
      NeuralLinearPosteriorSamplingFiniteMemory(
          'NeuralLinearFiniteMemory_noP', hparams_nlinear_finite_memory_no_prior),
      # NeuralLinearPosteriorSamplingFiniteMemory(
      #     'NeuralLinearFiniteMemory_noSigP',
      #     hparams_nlinear_finite_memory_no_sig_prior),
      # NeuralUCBSampling('NeuralUCB', hparams_ucb)
  ]

  regrets = {}
  rewards = {}
  for a in algos:
    regrets[a.name] = np.zeros((nb_simulations, num_contexts))
    rewards[a.name] = np.zeros(nb_simulations)
  rewards['opt_reward'] = np.zeros(nb_simulations)

  for k in range(nb_simulations):
    # Rebuild fresh algorithm instances for each simulation
    algos = [
        # UniformSampling('Uniform Sampling', hparams),
        # FixedPolicySampling('fixed1', [0.75, 0.25], hparams),
        PosteriorBNNSampling('Dropout', hparams_dropout, 'RMSProp'),
        PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),
        NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
        # NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2),
        LinearFullPosteriorSampling('LinFullPost', hparams_linear),
        NeuralLinearPosteriorSamplingFiniteMemory(
            'NeuralLinearFiniteMemory', hparams_nlinear_finite_memory),
        NeuralLinearPosteriorSamplingFiniteMemory(
            'NeuralLinearFiniteMemory_noP', hparams_nlinear_finite_memory_no_prior),
        # NeuralLinearPosteriorSamplingFiniteMemory(
        #     'NeuralLinearFiniteMemory_noSigP',
        #     hparams_nlinear_finite_memory_no_sig_prior),
        # NeuralUCBSampling('NeuralUCB', hparams_ucb)
    ]

    # Run contextual bandit problem
    t_init = time.time()
    results = run_contextual_bandit(context_dim, num_actions, dataset, algos)
    _, h_rewards = results

    # Display results
    display_results(algos, opt_rewards, opt_actions, h_rewards, t_init, data_type)

    for j, a in enumerate(algos):
      regrets[a.name][k, :] = np.cumsum(opt_rewards - h_rewards[:, j])
      rewards[a.name][k] = np.sum(h_rewards[:, j])
    rewards['opt_reward'][k] = np.sum(opt_rewards)

  save_plot(algos, regrets, data_type, num_contexts, plt_dir)
  np.save(dict_dir + 'dict_' + data_type + '.npy', rewards)
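# Sketch (assumption, not part of the original snippet): reloading the reward
# dictionary that the run above writes, e.g. dicts/dict_adult.npy. np.save
# pickles the dict, so allow_pickle=True and .item() are needed to recover it.
import numpy as np

rewards = np.load('dicts/dict_adult.npy', allow_pickle=True).item()
for name, per_run_totals in rewards.items():
  print(name, per_run_totals.mean())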
def main(_):
  np.random.seed(FLAGS.seed)
  tf.set_random_seed(FLAGS.seed)

  dt = datetime.datetime.now()
  timestr = '{}-{}-{}-{}'.format(dt.month, dt.day, dt.hour, dt.minute)
  FLAGS.logdir = os.path.join(FLAGS.logdir, timestr)

  # Problem parameters
  num_contexts = FLAGS.num_context

  # Data type in {linear, sparse_linear, mushroom, financial, jester,
  # statlog, adult, covertype, census, wheel}
  data_type = FLAGS.bandit

  # Create dataset
  sampled_vals = sample_data(data_type, num_contexts)
  dataset, opt_rewards, opt_actions, num_actions, context_dim = sampled_vals

  layer_sizes = [int(i) for i in FLAGS.layers.split(',')]

  # Define hyperparameters and algorithms
  hparams = tf.contrib.training.HParams(num_actions=num_actions)

  hparams_rms = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=layer_sizes, batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=2000, buffer_s=-1, initial_pulls=2,
      optimizer='RMS', reset_lr=True, lr_decay_rate=0.5, training_freq=50,
      training_epochs=100, p=0.95, q=20)

  hparams_dropout = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=layer_sizes, batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=2000, buffer_s=-1, initial_pulls=2,
      optimizer='RMS', reset_lr=True, lr_decay_rate=0.5, training_freq=50,
      training_epochs=100, use_dropout=True, keep_prob=0.80)

  hparams_bbb = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=layer_sizes, batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=True, freq_summary=2000, buffer_s=-1, initial_pulls=2,
      optimizer='RMS', use_sigma_exp_transform=True, cleared_times_trained=20,
      initial_training_steps=2000, noise_sigma=0.1, reset_lr=False,
      training_freq=50, training_epochs=100)

  hparams_gp = tf.contrib.training.HParams(
      num_actions=num_actions, num_outputs=num_actions,
      context_dim=context_dim, reset_lr=False, learn_embeddings=True,
      max_num_points=1000, show_training=False, freq_summary=2000,
      batch_size=512, keep_fixed_after_max_obs=True, training_freq=50,
      initial_pulls=2, training_epochs=100, lr=0.01, buffer_s=-1,
      initial_lr=0.001, lr_decay_rate=0.0, optimizer='RMS',
      task_latent_dim=5, activate_decay=False)

  hparams_fsvgd = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim,
      # activation=tf.nn.relu,
      layer_sizes=layer_sizes, batch_size=512, activate_decay=False,
      initial_lr=0.1, lr=FLAGS.lr, n_mm_sample=4, mm_n_particles=40,
      mm_jitter=FLAGS.mm_jitter,
      # max_grad_norm=5.0,
      show_training=True, freq_summary=2000, buffer_s=-1, initial_pulls=2,
      optimizer='Adam', use_sigma_exp_transform=True, cleared_times_trained=20,
      initial_training_steps=2000, noise_sigma=0.1, reset_lr=False,
      training_freq=50, training_epochs=100, n_particles=20,
      interp_batch_size=FLAGS.interp_batch_size,
      prior_variance=FLAGS.prior_variance)

  algos = [
      UniformSampling('Uniform Sampling', hparams),
      # UniformSampling('Uniform Sampling 2', hparams),
      # FixedPolicySampling('fixed1', [0.75, 0.25], hparams),
      # FixedPolicySampling('fixed2', [0.25, 0.75], hparams),
      PosteriorBNNSampling('fSVGD', hparams_fsvgd, 'SVGD'),
      # PosteriorBNNSampling('RMS', hparams_rms, 'RMSProp'),
      # PosteriorBNNSampling('Dropout', hparams_dropout, 'RMSProp'),
      PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),
      # NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
      # NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2),
      # LinearFullPosteriorSampling('LinFullPost', hparams_linear),
      BootstrappedBNNSampling('BootRMS', hparams_rms),
      # ParameterNoiseSampling('ParamNoise', hparams_pnoise),
      # PosteriorBNNSampling('BBAlphaDiv', hparams_alpha_div, 'AlphaDiv'),
      PosteriorBNNSampling('MultitaskGP', hparams_gp, 'GP'),
  ]

  # Run contextual bandit problem
  t_init = time.time()
  results = run_contextual_bandit(context_dim, num_actions, dataset, algos,
                                  opt_rewards)
  _, h_rewards = results

  # Display results
  display_results(algos, opt_rewards, opt_actions, h_rewards, t_init, data_type)
def run_iter():
  # Data type in {linear, sparse_linear, mushroom, financial, jester,
  # statlog, adult, covertype, census, wheel}
  data_type = FLAGS.dataset

  # Create dataset
  sampled_vals = sample_data(data_type, num_contexts)
  dataset, opt_rewards, opt_actions, num_actions, context_dim = sampled_vals

  # Define hyperparameters and algorithms
  hparams = tf.contrib.training.HParams(num_actions=num_actions)

  hparams_linear = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim,
      a0=6, b0=6, lambda_prior=0.25, initial_pulls=2)

  hparams_rms = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      optimizer='RMS', reset_lr=True, lr_decay_rate=0.5, training_freq=50,
      training_epochs=100, p=0.95, q=3)

  hparams_bbb = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      optimizer='RMS', use_sigma_exp_transform=True, cleared_times_trained=10,
      initial_training_steps=100, noise_sigma=0.1, reset_lr=False,
      training_freq=50, training_epochs=100)

  hparams_nlinear = tf.contrib.training.HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      reset_lr=True, lr_decay_rate=0.5, training_freq=1,
      training_freq_network=50, training_epochs=100,
      a0=6, b0=6, lambda_prior=0.25)

  hparams_luga = tf.contrib.training.HParams(
      num_actions=num_actions, num_contexts=num_contexts,
      context_dim=context_dim, activation=tf.nn.relu, latent_dim=50,
      batch_size=512, initial_lr=2e-4, show_training=False, lr_decay=False,
      freq_summary=10000, buffer_s=-1, initial_pulls=2, training_freq=20,
      training_epochs=40, lambda_prior=0.25, show_loss=False,
      kl=1.0, recon=1.0, psigma=1.0, glnoise=False)

  hparams_sivi1 = tf.contrib.training.HParams(
      num_actions=num_actions, num_contexts=num_contexts,
      context_dim=context_dim, activation=tf.nn.relu, latent_dim=50,
      batch_size=512, initial_lr=1e-3, show_training=False, verbose=False,
      lr_decay=False, freq_summary=10000, buffer_s=-1, initial_pulls=2,
      training_freq=20, training_epochs=40, lambda_prior=0.25,
      show_loss=False, kl=1.0, recon=1.0, two_decoder=False,
      glnoise=False, psigma=1.25)

  hparams_lusi_abl_km = tf.contrib.training.HParams(
      num_actions=num_actions, num_contexts=num_contexts,
      context_dim=context_dim, activation=tf.nn.relu, latent_dim=50,
      batch_size=512, initial_lr=1e-3, show_training=False, verbose=False,
      lr_decay=False, freq_summary=10000, buffer_s=-1, initial_pulls=2,
      training_freq=20, training_epochs=40, lambda_prior=0.25,
      show_loss=False, km=1, onez=0, recon=1.0, two_decoder=False,
      glnoise=False, psigma=1.25)

  hparams_luga_abl_km = tf.contrib.training.HParams(
      num_actions=num_actions, num_contexts=num_contexts,
      context_dim=context_dim, activation=tf.nn.relu, latent_dim=50,
      batch_size=512, initial_lr=2e-4, show_training=False, lr_decay=False,
      freq_summary=10000, buffer_s=-1, initial_pulls=2, training_freq=20,
      training_epochs=40, lambda_prior=0.25, show_loss=False,
      km=1, onez=0, recon=1.0, psigma=1.0, glnoise=False)

  algos = [
      UniformSampling('Uniform Sampling', hparams),  # 1
      PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),  # 2
      NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),  # 3
      LinearFullPosteriorSampling('LinFullPost', hparams_linear),  # 4
      PiposteriorBNNSampling('DGF', hparams_bbb, 'DGF'),  # 5
      VariationalSampling_v4('LU_Gaussian', hparams_luga),  # 6
      # A smaller learning rate like 3e-4 or 1e-4 will work better on the
      # 'mushroom' dataset for LU_SIVI and LU_Gaussian
      VariationalSamplingSivi_dgf_v7("LU_SIVI", hparams_sivi1),  # 7
      # For Ablation Study
      VariationalSampling_abl('LU_Gaussian_Ablation_multi_z_1m',
                              hparams_luga_abl_km),
      VariationalSamplingSivi_dgf_abl("LU_SIVI_Ablation_multi_z_1m",
                                      hparams_lusi_abl_km)
  ]

  t_init = time.time()
  results = run_contextual_bandit(context_dim, num_actions, dataset, algos)
  _, h_rewards = results
  display_results(algos, opt_rewards, opt_actions, h_rewards, t_init, data_type)

  opt_rewards = opt_rewards.reshape([-1, 1])
  regret_i = opt_rewards - h_rewards
  return regret_i
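# Sketch (assumption, not from the original file): run_iter() returns an array
# of per-step regret with shape (num_contexts, num_algos), so repeated runs can
# be averaged and accumulated into mean cumulative regret per algorithm.
# `n_repeats` is a hypothetical repetition count.
import numpy as np

n_repeats = 5
mean_regret = np.mean([run_iter() for _ in range(n_repeats)], axis=0)
cumulative_regret = np.cumsum(mean_regret, axis=0)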
for layer_size in layer_sizes:
  for batch_size in batch_sizes:
    for optimizer in optimizers:
      print(batch_size, layer_size, optimizer,
            'NG_bs%s_ls%ix50_%s' % (batch_size, len(layer_size), optimizer))
      neural_greedy_protos.append(
          lambda param=[batch_size, layer_size, optimizer]: PosteriorBNNSampling(
              'NG_bs%s_ls%ix50_%s' % (param[0], len(param[1]), param[2]),
              tf.contrib.training.HParams(
                  num_actions=num_actions, context_dim=context_dim,
                  init_scale=0.3, activation=tf.nn.relu, layer_sizes=param[1],
                  batch_size=param[0], activate_decay=True, initial_lr=0.1,
                  max_grad_norm=5.0, show_training=False, freq_summary=1000,
                  buffer_s=-1, initial_pulls=2, optimizer=param[2],
                  reset_lr=True, lr_decay_rate=0.5, training_freq=50,
                  training_epochs=50),
              'RMSProp'))

print(len(neural_greedy_protos))

random_proto = lambda: UniformSampling('Uniform Sampling', hparams)
linThompson_proto = lambda: LinearFullPosteriorSampling('linThompson',
                                                        hparams_linear)
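# Sketch (assumption, not from the original file): each *_proto above is a
# zero-argument factory, so a fresh, untrained algorithm instance can be built
# for every simulation by calling it, mirroring how other snippets rebuild
# `algos` per run. `dataset`, `context_dim` and `num_actions` are assumed to be
# set up as in the surrounding experiment code.
algos = [proto() for proto in
         [random_proto, linThompson_proto] + neural_greedy_protos]
results = run_contextual_bandit(context_dim, num_actions, dataset, algos)
_, h_rewards = results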
class BanditTrackerTF(AbstractTracker):

  def __init__(self, ontology):
    super(BanditTrackerTF, self).__init__(ontology)
    self.bc = BertClient(check_version=False, check_length=False,
                         ip="compute-0-1.local")
    self.num_actions = 2  # Possible actions: update state, do not update state
    self.context_dim = 2049  # Concatenation of all features

    # Define hyperparameters to use in the contextual bandit algorithm
    hparams_linear = tf.contrib.training.HParams(
        num_actions=self.num_actions, context_dim=self.context_dim,
        a0=6, b0=6, lambda_prior=0.25, initial_pulls=2)

    hparams_dropout = tf.contrib.training.HParams(
        num_actions=self.num_actions, context_dim=self.context_dim,
        init_scale=0.3, activation=tf.nn.relu, layer_sizes=[50],
        batch_size=512, activate_decay=True, initial_lr=0.1,
        max_grad_norm=5.0, show_training=False, freq_summary=1000,
        buffer_s=-1, initial_pulls=2, optimizer='RMS', reset_lr=True,
        lr_decay_rate=0.5, training_freq=50, training_epochs=100,
        use_dropout=True, keep_prob=0.80)

    self.food_dataset, self.food_opt_rewards, self.food_opt_actions, _, _ = \
        self.get_dataset(pickle.load(open(
            "/home/l.fischer/DSTC2_Baselines/training_data/train_data_food_v2",
            "rb")))
    self.area_dataset, self.area_opt_rewards, self.area_opt_actions, _, _ = \
        self.get_dataset(pickle.load(open(
            "/home/l.fischer/DSTC2_Baselines/training_data/train_data_area_v2",
            "rb")))
    self.price_dataset, self.price_opt_rewards, self.price_opt_actions, _, _ = \
        self.get_dataset(pickle.load(open(
            "/home/l.fischer/DSTC2_Baselines/training_data/train_data_pricerange_v2",
            "rb")))

    self.food_algo = PosteriorBNNSampling('Dropout_food', hparams_dropout,
                                          'RMSProp')
    self.area_algo = PosteriorBNNSampling('Dropout_area', hparams_dropout,
                                          'RMSProp')
    self.price_algo = PosteriorBNNSampling('Dropout_price', hparams_dropout,
                                           'RMSProp')

    self.train()

  def is_word_in_ontology(self, word, slot_type="food"):
    """
    Returns a boolean saying whether a given word is present in the ontology
    :param word: The word to check if it's in the ontology
    :param slot_type: The type of slot in which to check if the word is present
    :return: a Boolean saying whether a given word is present in the ontology
    """
    if slot_type == "food":
      return int(word in self.ontology["informable"]["food"])
    elif slot_type == "area":
      return int(word in self.ontology["informable"]["area"])
    else:
      return int(word in self.ontology["informable"]["pricerange"])

  def get_dataset(self, data_object):
    # Convert to numpy arrays
    data_object["features"] = normalize(np.array(data_object["features"]),
                                        norm="l1")
    data_object["labels"] = np.array(data_object["labels"])

    rewards = np.array([(0, 1) if label else (1, 0)
                        for label in data_object["labels"]])

    num_actions = 2  # Actions are: update state, do not update state
    context_dim = 2049
    # noise_stds = [0.01 * (i + 1) for i in range(num_actions)]

    betas = np.random.uniform(-1, 1, (context_dim, num_actions))
    betas /= np.linalg.norm(betas, axis=0)

    # rewards = np.random.randint(2, size=(10000, 2))
    opt_actions = np.argmax(rewards, axis=1)
    opt_rewards = np.array(
        [rewards[i, act] for i, act in enumerate(opt_actions)])

    return (np.hstack((data_object["features"], rewards)),
            opt_rewards, opt_actions, num_actions, context_dim)

  def train(self):
    # Instantiate Contextual Bandit Object
    food_bandit = ContextualBandit(self.context_dim, self.num_actions)
    food_bandit.feed_data(self.food_dataset)

    # Training food bandit classifier
    print("Training food")
    for i in tqdm(range(self.food_dataset.shape[0])):
      context = food_bandit.context(i)
      action = self.food_algo.action(context)
      reward = food_bandit.reward(i, action)
      self.food_algo.update(context, action, reward)

    # Instantiate Contextual Bandit Object
    area_bandit = ContextualBandit(self.context_dim, self.num_actions)
    area_bandit.feed_data(self.area_dataset)

    # Training area bandit classifier
    print("Training area")
    for i in tqdm(range(self.area_dataset.shape[0])):
      context = area_bandit.context(i)
      action = self.area_algo.action(context)
      reward = area_bandit.reward(i, action)
      self.area_algo.update(context, action, reward)

    # Instantiate Contextual Bandit Object
    price_bandit = ContextualBandit(self.context_dim, self.num_actions)
    price_bandit.feed_data(self.price_dataset)

    # Training price bandit classifier
    print("Training price")
    for i in tqdm(range(self.price_dataset.shape[0])):
      context = price_bandit.context(i)
      action = self.price_algo.action(context)
      reward = price_bandit.reward(i, action)
      self.price_algo.update(context, action, reward)

    print("Training Complete")

  def addTurn(self, turn):
    """
    Adds a turn to this tracker
    :param turn: The turn to process and add
    :return: A hypothesis of the current state of the dialog
    """
    hyps = copy.deepcopy(self.hyps)
    goal_stats = defaultdict(lambda: defaultdict(float))

    # Obtaining the best hypothesis from the ASR module
    best_asr_hyp = turn['input']["live"]['asr-hyps'][0]["asr-hyp"]

    # English stopwords set with punctuation
    stop = stopwords.words('english') + list(string.punctuation)

    # Tokenize the best hypothesis on the whitespaces
    tkns = word_tokenize(best_asr_hyp)

    # Remove stop words and also shingle the tokens
    processed_hyp = [
        word for word in tkns if word not in stop
    ]  # + [tup[0] + " " + tup[1] for tup in ngrams(tkns, 2)]

    # Manually change "moderately"/"affordable" to "moderate" and
    # "cheaper" to "cheap"
    for idx, word in enumerate(processed_hyp):
      if word == "moderately" or word == "affordable":
        processed_hyp[idx] = "moderate"
      if word == "cheaper":
        processed_hyp[idx] = "cheap"

    if processed_hyp:
      # Create an embedding of the user utterance using BERT
      sentence_embedding = np.array(self.bc.encode([best_asr_hyp]))[0]

      # Iterate through all the words in the user utterance to obtain the
      # features needed
      for word in processed_hyp:
        # Create an embedding of the word being iterated, using BERT
        word_embedding = np.array(self.bc.encode([word]))[0]

        # Check whether the current word is present in the ontology,
        # in one of the slot types
        word_in_food_ontology = [
            self.is_word_in_ontology(word, slot_type="food")
        ]
        word_in_area_ontology = [
            self.is_word_in_ontology(word, slot_type="area")
        ]
        word_in_price_ontology = [
            self.is_word_in_ontology(word, slot_type="price")
        ]

        # Concatenate the features together (the result is a vector of size 2049)
        food_features = np.concatenate(
            (word_embedding, sentence_embedding, word_in_food_ontology))
        area_features = np.concatenate(
            (word_embedding, sentence_embedding, word_in_area_ontology))
        price_features = np.concatenate(
            (word_embedding, sentence_embedding, word_in_price_ontology))

        # Decide whether the current word should update one (or more) of
        # the slot types
        update_food_slot = self.food_algo.action(food_features)
        update_area_slot = self.area_algo.action(area_features)
        update_price_slot = self.price_algo.action(price_features)

        if update_food_slot:
          goal_stats["food"][word] += 1.0
        if update_area_slot:
          goal_stats["area"][word] += 1.0
        if update_price_slot:
          goal_stats["pricerange"][word] += 1.0

    # Pick top values for each slot
    super(BanditTrackerTF, self).fill_goal_labels(goal_stats, hyps)
    super(BanditTrackerTF, self).fill_joint_goals(hyps)

    self.hyps = hyps
    return self.hyps
def main(_):
  # Create dataset
  data_type = "job_bank"
  num_contexts = 2000
  num_actions = 2
  context_dim = 2

  dataset = np.empty((num_contexts, 4), dtype=np.float)
  opt_actions = np.empty(num_contexts, dtype=np.int)
  opt_rewards = np.empty(num_contexts, dtype=np.float)

  for iter in range(num_contexts):
    ctx = context_bandit_gen_context()
    all_probs = [context_bandit_prob(ctx, a) for a in range(num_actions)]
    optimal = np.argmax(all_probs)
    rewards = [context_bandit_reward(ctx, a) for a in range(num_actions)]
    dataset[iter, :] = np.array(ctx.tolist() + rewards)
    opt_actions[iter] = optimal
    opt_rewards[iter] = all_probs[optimal]

  hparams = HParams(num_actions=num_actions)

  hparams_linear = HParams(num_actions=num_actions, context_dim=context_dim,
                           a0=6, b0=6, lambda_prior=0.25, initial_pulls=2)

  hparams_rms = HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      optimizer='RMS', reset_lr=True, lr_decay_rate=0.5, training_freq=50,
      training_epochs=100, p=0.95, q=3, verbose=False)

  hparams_dropout = HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      optimizer='RMS', reset_lr=True, lr_decay_rate=0.5, training_freq=50,
      training_epochs=100, use_dropout=True, keep_prob=0.80, verbose=False)

  hparams_bbb = HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      optimizer='RMS', use_sigma_exp_transform=True, cleared_times_trained=10,
      initial_training_steps=100, noise_sigma=0.1, reset_lr=False,
      training_freq=50, training_epochs=100, verbose=False)

  hparams_nlinear = HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      reset_lr=True, lr_decay_rate=0.5, training_freq=1,
      training_freq_network=50, training_epochs=100,
      a0=6, b0=6, lambda_prior=0.25, verbose=False)

  hparams_nlinear2 = HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      reset_lr=True, lr_decay_rate=0.5, training_freq=10,
      training_freq_network=50, training_epochs=100,
      a0=6, b0=6, lambda_prior=0.25, verbose=False)

  hparams_pnoise = HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      optimizer='RMS', reset_lr=True, lr_decay_rate=0.5, training_freq=50,
      training_epochs=100, noise_std=0.05, eps=0.1, d_samples=300,
      verbose=False)

  hparams_alpha_div = HParams(
      num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
      activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
      activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
      show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
      optimizer='RMS', use_sigma_exp_transform=True, cleared_times_trained=10,
      initial_training_steps=100, noise_sigma=0.1, reset_lr=False,
      training_freq=50, training_epochs=100,
      alpha=1.0, k=20, prior_variance=0.1, verbose=False)

  hparams_gp = HParams(
      num_actions=num_actions, num_outputs=num_actions,
      context_dim=context_dim, reset_lr=False, learn_embeddings=True,
      max_num_points=1000, show_training=False, freq_summary=1000,
      batch_size=512, keep_fixed_after_max_obs=True, training_freq=50,
      initial_pulls=2, training_epochs=100, lr=0.01, buffer_s=-1,
      initial_lr=0.001, lr_decay_rate=0.0, optimizer='RMS',
      task_latent_dim=5, activate_decay=False, verbose=False)

  algos = [
      UniformSampling('Uniform Sampling', hparams),
      FixedPolicySampling('Fixed 1', [0.75, 0.25], hparams),
      FixedPolicySampling('Fixed 2', [0.25, 0.75], hparams),
      PosteriorBNNSampling('RMS', hparams_rms, 'RMSProp'),
      PosteriorBNNSampling('Dropout', hparams_dropout, 'RMSProp'),
      PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),
      NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
      NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2),
      LinearFullPosteriorSampling('LinFullPost', hparams_linear),
      BootstrappedBNNSampling('BootRMS', hparams_rms),
      ParameterNoiseSampling('ParamNoise', hparams_pnoise),
      PosteriorBNNSampling('BBAlphaDiv', hparams_alpha_div, 'AlphaDiv'),
      PosteriorBNNSampling('MultitaskGP', hparams_gp, 'GP'),
  ]

  _, h_rewards, times = run_contextual_bandit(context_dim, num_actions,
                                              dataset, algos)
  display_results(algos, opt_rewards, opt_actions, h_rewards, times, data_type)
def main(argv):
  opts = get_options()
  print("Parameters: {}".format(opts))

  address = ('localhost', opts.ipc_port)  # family is deduced to be 'AF_INET'
  listener = Listener(address, authkey=b'bandit')
  conn = listener.accept()
  multiprocessing.current_process().authkey = b'bandit'
  print('connection accepted from', listener.last_accepted)

  # Create contextual bandit
  bandit = IPCBandit(conn)

  if opts.algorithm == "uniform":
    policy_parameters = tf.contrib.training.HParams(
        num_actions=bandit.num_actions)
    policy = UniformSampling('Uniform Sampling', policy_parameters)

  elif opts.algorithm == "linear":
    policy_parameters = tf.contrib.training.HParams(
        num_actions=bandit.num_actions, context_dim=bandit.context_dim,
        a0=6, b0=6, lambda_prior=0.25, initial_pulls=2)
    policy = LinearFullPosteriorSampling('LinFullPost', policy_parameters)

  elif opts.algorithm == "rms":
    policy_parameters = tf.contrib.training.HParams(
        num_actions=bandit.num_actions, context_dim=bandit.context_dim,
        init_scale=0.3, activation=tf.nn.relu, layer_sizes=[50],
        batch_size=512, activate_decay=True, initial_lr=0.1,
        max_grad_norm=5.0, show_training=False, freq_summary=1000,
        buffer_s=-1, initial_pulls=2, optimizer='RMS', reset_lr=True,
        lr_decay_rate=0.5, training_freq=50, training_epochs=100,
        p=0.95, q=3)
    policy = PosteriorBNNSampling('RMS', policy_parameters, 'RMSProp')

  elif opts.algorithm == "bootrms":
    policy_parameters = tf.contrib.training.HParams(
        num_actions=bandit.num_actions, context_dim=bandit.context_dim,
        init_scale=0.3, activation=tf.nn.relu, layer_sizes=[50],
        batch_size=512, activate_decay=True, initial_lr=0.1,
        max_grad_norm=5.0, show_training=False, freq_summary=1000,
        buffer_s=-1, initial_pulls=2, optimizer='RMS', reset_lr=True,
        lr_decay_rate=0.5, training_freq=50, training_epochs=100,
        p=0.95, q=3)
    policy = BootstrappedBNNSampling('BootRMS', policy_parameters)

  elif opts.algorithm == "dropout":
    policy_parameters = tf.contrib.training.HParams(
        num_actions=bandit.num_actions, context_dim=bandit.context_dim,
        init_scale=0.3, activation=tf.nn.relu, layer_sizes=[50],
        batch_size=512, activate_decay=True, initial_lr=0.1,
        max_grad_norm=5.0, show_training=False, freq_summary=1000,
        buffer_s=-1, initial_pulls=2, optimizer='RMS', reset_lr=True,
        lr_decay_rate=0.5, training_freq=50, training_epochs=100,
        use_dropout=True, keep_prob=0.80)
    policy = PosteriorBNNSampling('Dropout', policy_parameters, 'RMSProp')

  elif opts.algorithm == "bbb":
    policy_parameters = tf.contrib.training.HParams(
        num_actions=bandit.num_actions, context_dim=bandit.context_dim,
        init_scale=0.3, activation=tf.nn.relu, layer_sizes=[50],
        batch_size=512, activate_decay=True, initial_lr=0.1,
        max_grad_norm=5.0, show_training=False, freq_summary=1000,
        buffer_s=-1, initial_pulls=2, optimizer='RMS',
        use_sigma_exp_transform=True, cleared_times_trained=10,
        initial_training_steps=100, noise_sigma=0.1, reset_lr=False,
        training_freq=50, training_epochs=100)
    policy = PosteriorBNNSampling('BBB', policy_parameters, 'Variational')

  elif opts.algorithm == "neurallinear":
    policy_parameters = tf.contrib.training.HParams(
        num_actions=bandit.num_actions, context_dim=bandit.context_dim,
        init_scale=0.3, activation=tf.nn.relu, layer_sizes=[50],
        batch_size=512, activate_decay=True, initial_lr=0.1,
        max_grad_norm=5.0, show_training=False, freq_summary=1000,
        buffer_s=-1, initial_pulls=2, reset_lr=True, lr_decay_rate=0.5,
        training_freq=1, training_freq_network=50, training_epochs=100,
        a0=6, b0=6, lambda_prior=0.25)
    policy = NeuralLinearPosteriorSampling('NeuralLinear', policy_parameters)

  elif opts.algorithm == "neurallinear2":
    policy_parameters = tf.contrib.training.HParams(
        num_actions=bandit.num_actions, context_dim=bandit.context_dim,
        init_scale=0.3, activation=tf.nn.relu, layer_sizes=[50],
        batch_size=512, activate_decay=True, initial_lr=0.1,
        max_grad_norm=5.0, show_training=False, freq_summary=1000,
        buffer_s=-1, initial_pulls=2, reset_lr=True, lr_decay_rate=0.5,
        training_freq=10, training_freq_network=50, training_epochs=100,
        a0=6, b0=6, lambda_prior=0.25)
    policy = NeuralLinearPosteriorSampling('NeuralLinear2', policy_parameters)

  elif opts.algorithm == "pnoise":
    policy_parameters = tf.contrib.training.HParams(
        num_actions=bandit.num_actions, context_dim=bandit.context_dim,
        init_scale=0.3, activation=tf.nn.relu, layer_sizes=[50],
        batch_size=512, activate_decay=True, initial_lr=0.1,
        max_grad_norm=5.0, show_training=False, freq_summary=1000,
        buffer_s=-1, initial_pulls=2, optimizer='RMS', reset_lr=True,
        lr_decay_rate=0.5, training_freq=50, training_epochs=100,
        noise_std=0.05, eps=0.1, d_samples=300)
    policy = ParameterNoiseSampling('ParamNoise', policy_parameters)

  elif opts.algorithm == "alpha_div":
    policy_parameters = tf.contrib.training.HParams(
        num_actions=bandit.num_actions, context_dim=bandit.context_dim,
        init_scale=0.3, activation=tf.nn.relu, layer_sizes=[50],
        batch_size=512, activate_decay=True, initial_lr=0.1,
        max_grad_norm=5.0, show_training=False, freq_summary=1000,
        buffer_s=-1, initial_pulls=2, optimizer='RMS',
        use_sigma_exp_transform=True, cleared_times_trained=10,
        initial_training_steps=100, noise_sigma=0.1, reset_lr=False,
        training_freq=50, training_epochs=100,
        alpha=1.0, k=20, prior_variance=0.1)
    policy = PosteriorBNNSampling('BBAlphaDiv', policy_parameters, 'AlphaDiv')

  elif opts.algorithm == "gp":
    policy_parameters = tf.contrib.training.HParams(
        num_actions=bandit.num_actions, num_outputs=bandit.num_actions,
        context_dim=bandit.context_dim, reset_lr=False, learn_embeddings=True,
        max_num_points=1000, show_training=False, freq_summary=1000,
        batch_size=512, keep_fixed_after_max_obs=True, training_freq=50,
        initial_pulls=2, training_epochs=100, lr=0.01, buffer_s=-1,
        initial_lr=0.001, lr_decay_rate=0.0, optimizer='RMS',
        task_latent_dim=5, activate_decay=False)
    policy = PosteriorBNNSampling('MultitaskGP', policy_parameters, 'GP')

  else:
    raise Exception("Misspecified bandit algorithm.")

  print(policy)

  # Run the contextual bandit process
  while True:
    context = bandit.context()
    if context is None:
      break

    action = policy.action(context)
    reward = bandit.pull(action)
    if reward is None:
      break

    policy.update(context, action, reward)

  conn.close()
  listener.close()
    reset_lr=True, lr_decay_rate=0.5, training_freq=50, training_epochs=100,
    noise_std=0.05, eps=0.1, d_samples=300,
    bootstrap=artificial_data_generator)

hparams_lineps = tf.contrib.training.HParams(
    num_actions=num_actions, context_dim=context_dim, lam=0.1, eps=0.05)

random_proto = lambda: UniformSampling('Uniform Sampling', hparams)
neural_greedy_proto = lambda: PosteriorBNNSampling('NeuralGreedy', hparams_rms,
                                                   'RMSProp')
neural_greedy_proto_bootstrapped = lambda: PosteriorBNNSampling(
    'NeuralGreedy_artificial_data', hparams_rms_bootstrapped, 'RMSProp')
bootstrap_proto = lambda: BootstrappedBNNSampling('BootRMS', hparams_rmsb)
bootstrap_proto_bootstrapped = lambda: BootstrappedBNNSampling(
    'BootRMS_artificial_data', hparams_rmsb_bootstrapped)
noise_proto = lambda: ParameterNoiseSampling('ParamNoise', hparams_pnoise)
noise_proto_bootstrapped = lambda: ParameterNoiseSampling(
    'ParamNoise_artificial_data', hparams_pnoise_bootstrapped)
dropout_proto = lambda: PosteriorBNNSampling('Dropout', hparams_dropout,
                                             'RMSProp')
dropout_proto_bootstrapped = lambda: PosteriorBNNSampling(
    'Dropout_artificial_data', hparams_dropout_bootstrapped, 'RMSProp')