def main(_):
    """Benchmark several bandit algorithms on the 'adult' dataset.

    Runs `nb_simulations` independent simulations, accumulates per-step
    cumulative regret and total reward per algorithm, then saves a regret
    plot and a dict of rewards to disk.

    Args:
      _: unused; absl/tf.app passes argv here.
    """
    # Problem parameters.
    num_contexts = 20000
    nb_simulations = 2
    l_sizes = [50, 50]  # hidden-layer sizes shared by the neural models
    plt_dir = "plots/"
    dict_dir = "dicts/"
    # Data type in {linear, sparse_linear, mushroom, financial, jester,
    # statlog, adult, covertype, census, wheel}
    data_type = 'adult'

    # Create dataset.
    sampled_vals = sample_data(data_type, num_contexts)
    dataset, opt_rewards, opt_actions, num_actions, context_dim = sampled_vals

    # Define hyperparameters and algorithms.
    hparams = tf.contrib.training.HParams(num_actions=num_actions)

    hparams_linear = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, a0=6, b0=6,
        lambda_prior=0.25, initial_pulls=2)

    hparams_dropout = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', reset_lr=True, lr_decay_rate=0.5, training_freq=50,
        training_epochs=100, use_dropout=True, keep_prob=0.80)

    hparams_bbb = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', use_sigma_exp_transform=True,
        cleared_times_trained=10, initial_training_steps=100, noise_sigma=0.1,
        reset_lr=False, training_freq=50, training_epochs=100)

    hparams_nlinear = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=50, training_epochs=100, a0=6, b0=6,
        lambda_prior=0.25)

    # Same as hparams_nlinear but with a slower linear-head update cadence.
    hparams_nlinear2 = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=10,
        training_freq_network=50, training_epochs=100, a0=6, b0=6,
        lambda_prior=0.25)

    hparams_nlinear_finite_memory = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=64,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=50, training_epochs=100, a0=6, b0=6,
        lambda_prior=1, mem=num_actions * 100, mu_prior_flag=1,
        sigma_prior_flag=1)

    # Finite-memory variant with both priors disabled.
    hparams_nlinear_finite_memory_no_prior = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=64,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=50, training_epochs=100, a0=6, b0=6,
        lambda_prior=1, mem=num_actions * 100, mu_prior_flag=0,
        sigma_prior_flag=0)

    # Finite-memory variant with only the sigma prior disabled.
    hparams_nlinear_finite_memory_no_sig_prior = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=64,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=50, training_epochs=100, a0=6, b0=6,
        lambda_prior=1, mem=num_actions * 100, mu_prior_flag=1,
        sigma_prior_flag=0)

    hparams_ucb = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, optimizer='RMS', training_freq=1,
        training_freq_network=50, training_epochs=100, lambda_prior=0.25,
        delta=0.01, lamb=0.01, mu=1, S=1)

    def make_algos():
        # The algorithms are stateful, so a fresh set must be instantiated
        # for every simulation.  This factory replaces the previous
        # copy-pasted duplicate of the list (once for bookkeeping, once
        # inside the simulation loop).
        return [
            # UniformSampling('Uniform Sampling', hparams),
            # FixedPolicySampling('fixed1', [0.75, 0.25], hparams),
            PosteriorBNNSampling('Dropout', hparams_dropout, 'RMSProp'),
            PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),
            NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
            # NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2),
            LinearFullPosteriorSampling('LinFullPost', hparams_linear),
            NeuralLinearPosteriorSamplingFiniteMemory(
                'NeuralLinearFiniteMemory', hparams_nlinear_finite_memory),
            NeuralLinearPosteriorSamplingFiniteMemory(
                'NeuralLinearFiniteMemory_noP',
                hparams_nlinear_finite_memory_no_prior),
            # NeuralLinearPosteriorSamplingFiniteMemory(
            #     'NeuralLinearFiniteMemory_noSigP',
            #     hparams_nlinear_finite_memory_no_sig_prior),
            # NeuralUCBSampling('NeuralUCB', hparams_ucb),
        ]

    # Allocate result containers keyed by algorithm name.
    regrets = {}
    rewards = {}
    for a in make_algos():
        regrets[a.name] = np.zeros((nb_simulations, num_contexts))
        rewards[a.name] = np.zeros(nb_simulations)
    rewards['opt_reward'] = np.zeros(nb_simulations)

    for k in range(nb_simulations):
        algos = make_algos()

        # Run contextual bandit problem.
        t_init = time.time()
        results = run_contextual_bandit(context_dim, num_actions, dataset,
                                        algos)
        _, h_rewards = results

        # Display results.
        display_results(algos, opt_rewards, opt_actions, h_rewards, t_init,
                        data_type)

        for j, a in enumerate(algos):
            # Cumulative regret trajectory and total reward for this run.
            regrets[a.name][k, :] = np.cumsum(opt_rewards - h_rewards[:, j])
            rewards[a.name][k] = np.sum(h_rewards[:, j])
        rewards['opt_reward'][k] = np.sum(opt_rewards)

    save_plot(algos, regrets, data_type, num_contexts, plt_dir)
    np.save(dict_dir + 'dict_' + data_type + '.npy', rewards)
def main(_):
    """Run the full roster of baseline bandit algorithms on 'mushroom'.

    Single simulation: samples the dataset, configures one HParams bundle
    per algorithm, runs the contextual-bandit loop, and prints results.

    Args:
      _: unused; absl/tf.app passes argv here.
    """
    # Problem parameters
    num_contexts = 2000

    # Data type in {linear, sparse_linear, mushroom, financial, jester,
    # statlog, adult, covertype, census, wheel}
    data_type = 'mushroom'

    # Create dataset
    sampled_vals = sample_data(data_type, num_contexts)
    dataset, opt_rewards, opt_actions, num_actions, context_dim = sampled_vals

    # Define hyperparameters and algorithms
    hparams = tf.contrib.training.HParams(num_actions=num_actions)

    hparams_linear = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, a0=6, b0=6,
        lambda_prior=0.25, initial_pulls=2)

    # Greedy NN trained with RMSProp; also reused by the bootstrapped model.
    hparams_rms = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', reset_lr=True, lr_decay_rate=0.5, training_freq=50,
        training_epochs=100, p=0.95, q=3)

    hparams_dropout = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', reset_lr=True, lr_decay_rate=0.5, training_freq=50,
        training_epochs=100, use_dropout=True, keep_prob=0.80)

    hparams_bbb = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', use_sigma_exp_transform=True,
        cleared_times_trained=10, initial_training_steps=100, noise_sigma=0.1,
        reset_lr=False, training_freq=50, training_epochs=100)

    hparams_nlinear = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=50, training_epochs=100, a0=6, b0=6,
        lambda_prior=0.25)

    # Same as hparams_nlinear, but the linear head updates every 10 steps.
    hparams_nlinear2 = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=10,
        training_freq_network=50, training_epochs=100, a0=6, b0=6,
        lambda_prior=0.25)

    hparams_pnoise = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', reset_lr=True, lr_decay_rate=0.5, training_freq=50,
        training_epochs=100, noise_std=0.05, eps=0.1, d_samples=300,
    )

    hparams_alpha_div = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', use_sigma_exp_transform=True,
        cleared_times_trained=10, initial_training_steps=100, noise_sigma=0.1,
        reset_lr=False, training_freq=50, training_epochs=100, alpha=1.0,
        k=20, prior_variance=0.1)

    hparams_gp = tf.contrib.training.HParams(
        num_actions=num_actions, num_outputs=num_actions,
        context_dim=context_dim, reset_lr=False, learn_embeddings=True,
        max_num_points=1000, show_training=False, freq_summary=1000,
        batch_size=512, keep_fixed_after_max_obs=True, training_freq=50,
        initial_pulls=2, training_epochs=100, lr=0.01, buffer_s=-1,
        initial_lr=0.001, lr_decay_rate=0.0, optimizer='RMS',
        task_latent_dim=5, activate_decay=False)

    # One instance per algorithm; all are run on the same dataset below.
    algos = [
        UniformSampling('Uniform Sampling', hparams),
        UniformSampling('Uniform Sampling 2', hparams),
        FixedPolicySampling('fixed1', [0.75, 0.25], hparams),
        FixedPolicySampling('fixed2', [0.25, 0.75], hparams),
        PosteriorBNNSampling('RMS', hparams_rms, 'RMSProp'),
        PosteriorBNNSampling('Dropout', hparams_dropout, 'RMSProp'),
        PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),
        NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
        NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2),
        LinearFullPosteriorSampling('LinFullPost', hparams_linear),
        BootstrappedBNNSampling('BootRMS', hparams_rms),
        ParameterNoiseSampling('ParamNoise', hparams_pnoise),
        PosteriorBNNSampling('BBAlphaDiv', hparams_alpha_div, 'AlphaDiv'),
        PosteriorBNNSampling('MultitaskGP', hparams_gp, 'GP'),
    ]

    # Run contextual bandit problem
    t_init = time.time()
    results = run_contextual_bandit(context_dim, num_actions, dataset, algos)
    _, h_rewards = results

    # Display results
    display_results(algos, opt_rewards, opt_actions, h_rewards, t_init,
                    data_type)
def run_iter():
    """Run one experiment iteration and return the per-step regret matrix.

    Dataset is chosen via FLAGS.dataset; `num_contexts` is read from the
    enclosing module scope (defined elsewhere in this file — not visible
    here, TODO confirm).

    Returns:
      regret_i: array of opt_rewards (reshaped to a column) minus h_rewards;
        presumably shape (num_contexts, num_algorithms) — verify against
        run_contextual_bandit.
    """
    # Data type in {linear, sparse_linear, mushroom, financial, jester,
    # statlog, adult, covertype, census, wheel}
    data_type = FLAGS.dataset

    # Create dataset
    sampled_vals = sample_data(data_type, num_contexts)
    dataset, opt_rewards, opt_actions, num_actions, context_dim = sampled_vals

    # Define hyperparameters and algorithms
    hparams = tf.contrib.training.HParams(num_actions=num_actions)

    hparams_linear = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, a0=6, b0=6,
        lambda_prior=0.25, initial_pulls=2)

    # NOTE(review): hparams_rms is defined but not used by any entry in
    # `algos` below — possibly left over from an earlier configuration.
    hparams_rms = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', reset_lr=True, lr_decay_rate=0.5, training_freq=50,
        training_epochs=100, p=0.95, q=3)

    # Shared by both the 'BBB' (Variational) and 'DGF' policies below.
    hparams_bbb = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', use_sigma_exp_transform=True,
        cleared_times_trained=10, initial_training_steps=100, noise_sigma=0.1,
        reset_lr=False, training_freq=50, training_epochs=100)

    hparams_nlinear = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=50, training_epochs=100, a0=6, b0=6,
        lambda_prior=0.25)

    hparams_luga = tf.contrib.training.HParams(
        num_actions=num_actions, num_contexts=num_contexts,
        context_dim=context_dim, activation=tf.nn.relu, latent_dim=50,
        batch_size=512, initial_lr=2e-4, show_training=False, lr_decay=False,
        freq_summary=10000, buffer_s=-1, initial_pulls=2, training_freq=20,
        training_epochs=40, lambda_prior=0.25, show_loss=False, kl=1.0,
        recon=1.0, psigma=1.0, glnoise=False)

    hparams_sivi1 = tf.contrib.training.HParams(
        num_actions=num_actions, num_contexts=num_contexts,
        context_dim=context_dim, activation=tf.nn.relu, latent_dim=50,
        batch_size=512, initial_lr=1e-3, show_training=False, verbose=False,
        lr_decay=False, freq_summary=10000, buffer_s=-1, initial_pulls=2,
        training_freq=20, training_epochs=40, lambda_prior=0.25,
        show_loss=False, kl=1.0, recon=1.0, two_decoder=False, glnoise=False,
        psigma=1.25)

    # Ablation variants: km/onez flags control the multi-z ablation setup.
    hparams_lusi_abl_km = tf.contrib.training.HParams(
        num_actions=num_actions, num_contexts=num_contexts,
        context_dim=context_dim, activation=tf.nn.relu, latent_dim=50,
        batch_size=512, initial_lr=1e-3, show_training=False, verbose=False,
        lr_decay=False, freq_summary=10000, buffer_s=-1, initial_pulls=2,
        training_freq=20, training_epochs=40, lambda_prior=0.25,
        show_loss=False, km=1, onez=0, recon=1.0, two_decoder=False,
        glnoise=False, psigma=1.25)

    hparams_luga_abl_km = tf.contrib.training.HParams(
        num_actions=num_actions, num_contexts=num_contexts,
        context_dim=context_dim, activation=tf.nn.relu, latent_dim=50,
        batch_size=512, initial_lr=2e-4, show_training=False, lr_decay=False,
        freq_summary=10000, buffer_s=-1, initial_pulls=2, training_freq=20,
        training_epochs=40, lambda_prior=0.25, show_loss=False, km=1, onez=0,
        recon=1.0, psigma=1.0, glnoise=False)

    algos = [
        UniformSampling('Uniform Sampling', hparams),  # 1
        PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),  # 2
        NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),  # 3
        LinearFullPosteriorSampling('LinFullPost', hparams_linear),  # 4
        PiposteriorBNNSampling('DGF', hparams_bbb, 'DGF'),  # 5
        VariationalSampling_v4('LU_Gaussian', hparams_luga),  # 6
        # A smaller learning rate like 3e-4 or 1e-4 will work better on the
        # 'mushroom' dataset for LU_SIVI and LU_Gaussian
        VariationalSamplingSivi_dgf_v7("LU_SIVI", hparams_sivi1),  # 7
        # For Ablation Study
        VariationalSampling_abl('LU_Gaussian_Ablation_multi_z_1m',
                                hparams_luga_abl_km),
        VariationalSamplingSivi_dgf_abl("LU_SIVI_Ablation_multi_z_1m",
                                        hparams_lusi_abl_km)
    ]

    t_init = time.time()
    results = run_contextual_bandit(context_dim, num_actions, dataset, algos)
    _, h_rewards = results

    display_results(algos, opt_rewards, opt_actions, h_rewards, t_init,
                    data_type)

    # Broadcast optimal rewards as a column against the per-algorithm
    # reward matrix to obtain per-step regret for every algorithm.
    opt_rewards = opt_rewards.reshape([-1, 1])
    regret_i = opt_rewards - h_rewards
    return regret_i
max_grad_norm=5.0, show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2, optimizer=param[2], reset_lr=True, lr_decay_rate=0.5, training_freq=50, training_epochs=50), 'RMSProp')) print(len(neural_greedy_protos)) random_proto = lambda: UniformSampling('Uniform Sampling', hparams) linThompson_proto = lambda: LinearFullPosteriorSampling('linThompson', hparams_linear) linUCB_proto = lambda: LinUCB('linUCB', hparams_linucb) linEps_proto = lambda: LinEpsilon('LinEpsilon', hparams_lineps) algo_protos = neural_greedy_protos + [linUCB_proto, linEps_proto, linThompson_proto, random_proto] # for algo_proto in algo_protos: # algo = algo_proto() # print(algo.name, algo.hparams) # print (algo_protos[0]==algo_protos[1]) # Run experiments several times save and plot results benchmarker = Benchmarker(algo_protos, dataset_proto, num_actions, context_dim, nb_contexts=num_contexts, test_name='NNparams_linear_test1_0_10') benchmarker.run_experiments(50) benchmarker.save_results('./results/')
def main(_):
    """Run the baseline algorithm roster on a synthetic 'job_bank' bandit.

    Generates `num_contexts` contexts with per-action reward probabilities
    from the context_bandit_* generators, then runs and displays all
    configured algorithms.

    Args:
      _: unused; absl/tf.app passes argv here.
    """
    # create dataset
    data_type = "job_bank"
    num_contexts = 2000
    num_actions = 2
    context_dim = 2
    # Fix: np.float / np.int were deprecated in NumPy 1.20 and removed in
    # 1.24; the builtin float/int are the documented replacements.
    dataset = np.empty((num_contexts, 4), dtype=float)
    opt_actions = np.empty(num_contexts, dtype=int)
    opt_rewards = np.empty(num_contexts, dtype=float)
    # Fix: loop variable renamed from `iter`, which shadowed the builtin.
    for i in range(num_contexts):
        ctx = context_bandit_gen_context()
        all_probs = [context_bandit_prob(ctx, a) for a in range(num_actions)]
        optimal = np.argmax(all_probs)
        rewards = [context_bandit_reward(ctx, a) for a in range(num_actions)]
        # Row layout: context features followed by one reward per action.
        dataset[i, :] = np.array(ctx.tolist() + rewards)
        opt_actions[i] = optimal
        opt_rewards[i] = all_probs[optimal]

    hparams = HParams(num_actions=num_actions)

    hparams_linear = HParams(
        num_actions=num_actions, context_dim=context_dim, a0=6, b0=6,
        lambda_prior=0.25, initial_pulls=2)

    # Greedy NN trained with RMSProp; also reused by the bootstrapped model.
    hparams_rms = HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', reset_lr=True, lr_decay_rate=0.5, training_freq=50,
        training_epochs=100, p=0.95, q=3, verbose=False)

    hparams_dropout = HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', reset_lr=True, lr_decay_rate=0.5, training_freq=50,
        training_epochs=100, use_dropout=True, keep_prob=0.80, verbose=False)

    hparams_bbb = HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', use_sigma_exp_transform=True,
        cleared_times_trained=10, initial_training_steps=100, noise_sigma=0.1,
        reset_lr=False, training_freq=50, training_epochs=100, verbose=False)

    hparams_nlinear = HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=50, training_epochs=100, a0=6, b0=6,
        lambda_prior=0.25, verbose=False)

    # Same as hparams_nlinear, but the linear head updates every 10 steps.
    hparams_nlinear2 = HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=10,
        training_freq_network=50, training_epochs=100, a0=6, b0=6,
        lambda_prior=0.25, verbose=False)

    hparams_pnoise = HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', reset_lr=True, lr_decay_rate=0.5, training_freq=50,
        training_epochs=100, noise_std=0.05, eps=0.1, d_samples=300,
        verbose=False)

    hparams_alpha_div = HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', use_sigma_exp_transform=True,
        cleared_times_trained=10, initial_training_steps=100, noise_sigma=0.1,
        reset_lr=False, training_freq=50, training_epochs=100, alpha=1.0,
        k=20, prior_variance=0.1, verbose=False)

    hparams_gp = HParams(
        num_actions=num_actions, num_outputs=num_actions,
        context_dim=context_dim, reset_lr=False, learn_embeddings=True,
        max_num_points=1000, show_training=False, freq_summary=1000,
        batch_size=512, keep_fixed_after_max_obs=True, training_freq=50,
        initial_pulls=2, training_epochs=100, lr=0.01, buffer_s=-1,
        initial_lr=0.001, lr_decay_rate=0.0, optimizer='RMS',
        task_latent_dim=5, activate_decay=False, verbose=False)

    algos = [
        UniformSampling('Uniform Sampling', hparams),
        FixedPolicySampling('Fixed 1', [0.75, 0.25], hparams),
        FixedPolicySampling('Fixed 2', [0.25, 0.75], hparams),
        PosteriorBNNSampling('RMS', hparams_rms, 'RMSProp'),
        PosteriorBNNSampling('Dropout', hparams_dropout, 'RMSProp'),
        PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),
        NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
        NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2),
        LinearFullPosteriorSampling('LinFullPost', hparams_linear),
        BootstrappedBNNSampling('BootRMS', hparams_rms),
        ParameterNoiseSampling('ParamNoise', hparams_pnoise),
        PosteriorBNNSampling('BBAlphaDiv', hparams_alpha_div, 'AlphaDiv'),
        PosteriorBNNSampling('MultitaskGP', hparams_gp, 'GP'),
    ]

    # NOTE(review): this run_contextual_bandit variant returns a third
    # `times` value that display_results consumes — confirm against the
    # imported implementation.
    _, h_rewards, times = run_contextual_bandit(context_dim, num_actions,
                                                dataset, algos)
    display_results(algos, opt_rewards, opt_actions, h_rewards, times,
                    data_type)
def main(argv):
    """Serve a single bandit policy over a local IPC connection.

    Listens on localhost:opts.ipc_port, builds the policy selected by
    opts.algorithm, then loops: receive context, choose action, receive
    reward, update the policy — until the remote side sends None.

    Args:
      argv: unused command-line arguments.

    Raises:
      Exception: if opts.algorithm does not name a known algorithm.
    """
    opts = get_options()
    print("Parameters: {}".format(opts))

    address = ('localhost', opts.ipc_port)  # family is deduced to be 'AF_INET'
    listener = Listener(address, authkey=b'bandit')
    conn = listener.accept()
    multiprocessing.current_process().authkey = b'bandit'
    print('connection accepted from', listener.last_accepted)

    # Create contextual bandit
    bandit = IPCBandit(conn)

    # Select and configure the policy implementation.
    if opts.algorithm == "uniform":
        policy_parameters = tf.contrib.training.HParams(
            num_actions=bandit.num_actions)
        policy = UniformSampling('Uniform Sampling', policy_parameters)
    elif opts.algorithm == "linear":
        policy_parameters = tf.contrib.training.HParams(
            num_actions=bandit.num_actions, context_dim=bandit.context_dim,
            a0=6, b0=6, lambda_prior=0.25, initial_pulls=2)
        policy = LinearFullPosteriorSampling('LinFullPost', policy_parameters)
    elif opts.algorithm == "rms":
        policy_parameters = tf.contrib.training.HParams(
            num_actions=bandit.num_actions, context_dim=bandit.context_dim,
            init_scale=0.3, activation=tf.nn.relu, layer_sizes=[50],
            batch_size=512, activate_decay=True, initial_lr=0.1,
            max_grad_norm=5.0, show_training=False, freq_summary=1000,
            buffer_s=-1, initial_pulls=2, optimizer='RMS', reset_lr=True,
            lr_decay_rate=0.5, training_freq=50, training_epochs=100, p=0.95,
            q=3)
        policy = PosteriorBNNSampling('RMS', policy_parameters, 'RMSProp')
    elif opts.algorithm == "bootrms":
        # Same hyperparameters as "rms", different sampling strategy.
        policy_parameters = tf.contrib.training.HParams(
            num_actions=bandit.num_actions, context_dim=bandit.context_dim,
            init_scale=0.3, activation=tf.nn.relu, layer_sizes=[50],
            batch_size=512, activate_decay=True, initial_lr=0.1,
            max_grad_norm=5.0, show_training=False, freq_summary=1000,
            buffer_s=-1, initial_pulls=2, optimizer='RMS', reset_lr=True,
            lr_decay_rate=0.5, training_freq=50, training_epochs=100, p=0.95,
            q=3)
        policy = BootstrappedBNNSampling('BootRMS', policy_parameters)
    elif opts.algorithm == "dropout":
        policy_parameters = tf.contrib.training.HParams(
            num_actions=bandit.num_actions, context_dim=bandit.context_dim,
            init_scale=0.3, activation=tf.nn.relu, layer_sizes=[50],
            batch_size=512, activate_decay=True, initial_lr=0.1,
            max_grad_norm=5.0, show_training=False, freq_summary=1000,
            buffer_s=-1, initial_pulls=2, optimizer='RMS', reset_lr=True,
            lr_decay_rate=0.5, training_freq=50, training_epochs=100,
            use_dropout=True, keep_prob=0.80)
        policy = PosteriorBNNSampling('Dropout', policy_parameters, 'RMSProp')
    elif opts.algorithm == "bbb":
        policy_parameters = tf.contrib.training.HParams(
            num_actions=bandit.num_actions, context_dim=bandit.context_dim,
            init_scale=0.3, activation=tf.nn.relu, layer_sizes=[50],
            batch_size=512, activate_decay=True, initial_lr=0.1,
            max_grad_norm=5.0, show_training=False, freq_summary=1000,
            buffer_s=-1, initial_pulls=2, optimizer='RMS',
            use_sigma_exp_transform=True, cleared_times_trained=10,
            initial_training_steps=100, noise_sigma=0.1, reset_lr=False,
            training_freq=50, training_epochs=100)
        policy = PosteriorBNNSampling('BBB', policy_parameters, 'Variational')
    elif opts.algorithm == "neurallinear":
        policy_parameters = tf.contrib.training.HParams(
            num_actions=bandit.num_actions, context_dim=bandit.context_dim,
            init_scale=0.3, activation=tf.nn.relu, layer_sizes=[50],
            batch_size=512, activate_decay=True, initial_lr=0.1,
            max_grad_norm=5.0, show_training=False, freq_summary=1000,
            buffer_s=-1, initial_pulls=2, reset_lr=True, lr_decay_rate=0.5,
            training_freq=1, training_freq_network=50, training_epochs=100,
            a0=6, b0=6, lambda_prior=0.25)
        policy = NeuralLinearPosteriorSampling('NeuralLinear',
                                               policy_parameters)
    elif opts.algorithm == "neurallinear2":
        # As "neurallinear" but the linear head updates every 10 steps.
        policy_parameters = tf.contrib.training.HParams(
            num_actions=bandit.num_actions, context_dim=bandit.context_dim,
            init_scale=0.3, activation=tf.nn.relu, layer_sizes=[50],
            batch_size=512, activate_decay=True, initial_lr=0.1,
            max_grad_norm=5.0, show_training=False, freq_summary=1000,
            buffer_s=-1, initial_pulls=2, reset_lr=True, lr_decay_rate=0.5,
            training_freq=10, training_freq_network=50, training_epochs=100,
            a0=6, b0=6, lambda_prior=0.25)
        policy = NeuralLinearPosteriorSampling('NeuralLinear2',
                                               policy_parameters)
    elif opts.algorithm == "pnoise":
        policy_parameters = tf.contrib.training.HParams(
            num_actions=bandit.num_actions, context_dim=bandit.context_dim,
            init_scale=0.3, activation=tf.nn.relu, layer_sizes=[50],
            batch_size=512, activate_decay=True, initial_lr=0.1,
            max_grad_norm=5.0, show_training=False, freq_summary=1000,
            buffer_s=-1, initial_pulls=2, optimizer='RMS', reset_lr=True,
            lr_decay_rate=0.5, training_freq=50, training_epochs=100,
            noise_std=0.05, eps=0.1, d_samples=300,
        )
        policy = ParameterNoiseSampling('ParamNoise', policy_parameters)
    elif opts.algorithm == "alpha_div":
        policy_parameters = tf.contrib.training.HParams(
            num_actions=bandit.num_actions, context_dim=bandit.context_dim,
            init_scale=0.3, activation=tf.nn.relu, layer_sizes=[50],
            batch_size=512, activate_decay=True, initial_lr=0.1,
            max_grad_norm=5.0, show_training=False, freq_summary=1000,
            buffer_s=-1, initial_pulls=2, optimizer='RMS',
            use_sigma_exp_transform=True, cleared_times_trained=10,
            initial_training_steps=100, noise_sigma=0.1, reset_lr=False,
            training_freq=50, training_epochs=100, alpha=1.0, k=20,
            prior_variance=0.1)
        policy = PosteriorBNNSampling('BBAlphaDiv', policy_parameters,
                                      'AlphaDiv')
    elif opts.algorithm == "gp":
        policy_parameters = tf.contrib.training.HParams(
            num_actions=bandit.num_actions, num_outputs=bandit.num_actions,
            context_dim=bandit.context_dim, reset_lr=False,
            learn_embeddings=True, max_num_points=1000, show_training=False,
            freq_summary=1000, batch_size=512, keep_fixed_after_max_obs=True,
            training_freq=50, initial_pulls=2, training_epochs=100, lr=0.01,
            buffer_s=-1, initial_lr=0.001, lr_decay_rate=0.0, optimizer='RMS',
            task_latent_dim=5, activate_decay=False)
        policy = PosteriorBNNSampling('MultitaskGP', policy_parameters, 'GP')
    else:
        raise Exception("Misspecified bandit algorithm.")

    print(policy)

    # Run the contextual bandit process
    # A None context or None reward is the remote side's end-of-stream
    # signal; either one terminates the serving loop.
    while True:
        context = bandit.context()
        if context is None:
            break
        action = policy.action(context)
        reward = bandit.pull(action)
        if reward is None:
            break
        policy.update(context, action, reward)

    conn.close()
    listener.close()
def main(_):
    """Run repeated contextual-bandit experiments on the 'financial' dataset.

    Builds one HParams config per algorithm variant, then runs `Nruns`
    independent simulations of `run_contextual_bandit`, accumulating the
    total reward per algorithm in `res`, and finally reports aggregate
    results via `display_final_results`.

    NOTE(review): source formatting was lost; structure below is
    reconstructed from the statement order — confirm loop nesting against
    the original file.
    """
    # Problem parameters
    num_contexts = 3500#2000
    tfn=200  # training frequency of the network (training_freq_network)
    MEMSIZE = 700#num_contexts/10   # finite-memory buffer size for the FiniteMemory variants
    # Data type in {linear, sparse_linear, mushroom, financial, jester,
    # statlog, adult, covertype, census, wheel}
    data_type = 'financial'

    # Create dataset
    sampled_vals = sample_data(data_type, num_contexts)
    dataset, opt_rewards, opt_actions, num_actions, context_dim = sampled_vals

    # Define hyperparameters and algorithms
    hparams = tf.contrib.training.HParams(num_actions=num_actions)
    hparams_linear = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim,
        a0=6, b0=6, lambda_prior=0.25, initial_pulls=2)
    hparams_rms = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', reset_lr=True, lr_decay_rate=0.5,
        training_freq=50, training_epochs=100, p=0.95, q=3)
    hparams_dropout = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', reset_lr=True, lr_decay_rate=0.5,
        training_freq=50, training_epochs=100,
        use_dropout=True, keep_prob=0.80)
    hparams_bbb = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', use_sigma_exp_transform=True,
        cleared_times_trained=10, initial_training_steps=100,
        noise_sigma=0.1, reset_lr=False, training_freq=50,
        training_epochs=100)
    # NeuralLinear: linear posterior sampling on top of a trained representation.
    # The "2" variants only differ in training_freq (10 instead of 1).
    hparams_nlinear = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=tfn, training_epochs=100,
        a0=6, b0=6, lambda_prior=0.25)
    hparams_nlinear2 = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=10,
        training_freq_network=tfn, training_epochs=100,
        a0=6, b0=6, lambda_prior=0.25)
    # Finite-memory variants: mem=MEMSIZE bounds the replay buffer;
    # mu_prior_flag / sigma_prior_flag toggle use of the mean / covariance priors.
    hparams_nlinear_finite_memory = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=tfn, training_epochs=100,
        a0=6, b0=6, lambda_prior=1,
        mem=MEMSIZE, mu_prior_flag=1, sigma_prior_flag=1, )
    hparams_nlinear_finite_memory2 = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=10,
        training_freq_network=tfn, training_epochs=100,
        a0=6, b0=6, lambda_prior=1,
        mem=MEMSIZE, mu_prior_flag=1, sigma_prior_flag=1, )
    hparams_nlinear_finite_memory_no_prior = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=tfn, training_epochs=100,
        a0=6, b0=6, lambda_prior=1,
        mem=MEMSIZE, mu_prior_flag=0, sigma_prior_flag=0, )
    hparams_nlinear_finite_memory2_no_prior = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=10,
        training_freq_network=tfn, training_epochs=100,
        a0=6, b0=6, lambda_prior=1,
        mem=MEMSIZE, mu_prior_flag=0, sigma_prior_flag=0, )
    hparams_nlinear_finite_memory_no_sig_prior = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=tfn, training_epochs=100,
        a0=6, b0=6, lambda_prior=1,
        mem=MEMSIZE, mu_prior_flag=1, sigma_prior_flag=0, )
    hparams_nlinear_finite_memory2_no_sig_prior = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=10,
        training_freq_network=tfn, training_epochs=100,
        a0=6, b0=6, lambda_prior=1,
        mem=MEMSIZE, mu_prior_flag=1, sigma_prior_flag=0, )
    hparams_pnoise = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', reset_lr=True, lr_decay_rate=0.5,
        training_freq=50, training_epochs=100,
        noise_std=0.05, eps=0.1, d_samples=300, )
    hparams_alpha_div = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', use_sigma_exp_transform=True,
        cleared_times_trained=10, initial_training_steps=100,
        noise_sigma=0.1, reset_lr=False, training_freq=50,
        training_epochs=100, alpha=1.0, k=20, prior_variance=0.1)
    hparams_gp = tf.contrib.training.HParams(
        num_actions=num_actions, num_outputs=num_actions,
        context_dim=context_dim, reset_lr=False, learn_embeddings=True,
        max_num_points=1000, show_training=False, freq_summary=1000,
        batch_size=512, keep_fixed_after_max_obs=True, training_freq=50,
        initial_pulls=2, training_epochs=100, lr=0.01, buffer_s=-1,
        initial_lr=0.001, lr_decay_rate=0.0, optimizer='RMS',
        task_latent_dim=5, activate_decay=False)

    Nruns=50          # number of independent simulations
    par=0             # 0 = sequential runs, otherwise use joblib Parallel
    NAgents = 10      # must match the number of (uncommented) algos below
    res=[[]for i in xrange(NAgents)]  # per-algorithm list of total rewards, one entry per run
    #
    for i in xrange(Nruns):
        print(i)
        # Algorithms are re-instantiated every run so each simulation starts fresh.
        algos = [
            UniformSampling('Uniform Sampling', hparams),
            # UniformSampling('Uniform Sampling 2', hparams),
            # FixedPolicySampling('fixed1', [0.75, 0.25], hparams),
            # FixedPolicySampling('fixed2', [0.25, 0.75], hparams),
            # PosteriorBNNSampling('RMS', hparams_rms, 'RMSProp'),
            # PosteriorBNNSampling('Dropout', hparams_dropout, 'RMSProp'),
            # PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),
            NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
            NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2),
            NeuralLinearPosteriorSamplingFiniteMemory('NeuralLinearFiniteMemory', hparams_nlinear_finite_memory),
            NeuralLinearPosteriorSamplingFiniteMemory('NeuralLinearFiniteMemory2', hparams_nlinear_finite_memory2),
            NeuralLinearPosteriorSamplingFiniteMemory('NeuralLinearFiniteMemory_noP', hparams_nlinear_finite_memory_no_prior),
            NeuralLinearPosteriorSamplingFiniteMemory('NeuralLinearFiniteMemory2_noP', hparams_nlinear_finite_memory2_no_prior),
            NeuralLinearPosteriorSamplingFiniteMemory('NeuralLinearFiniteMemory_noSigP', hparams_nlinear_finite_memory_no_sig_prior),
            NeuralLinearPosteriorSamplingFiniteMemory('NeuralLinearFiniteMemory2_noSigP', hparams_nlinear_finite_memory2_no_sig_prior),
            LinearFullPosteriorSampling('LinFullPost', hparams_linear),
            # BootstrappedBNNSampling('BootRMS', hparams_rms),
            # ParameterNoiseSampling('ParamNoise', hparams_pnoise),
            # PosteriorBNNSampling('BBAlphaDiv', hparams_alpha_div, 'AlphaDiv'),
            # PosteriorBNNSampling('MultitaskGP', hparams_gp, 'GP'),
        ]
        if par==0:
            # Run contextual bandit problem
            t_init = time.time()
            results = run_contextual_bandit(context_dim, num_actions, dataset, algos)
            _, h_rewards = results
            # Display results
            display_results(algos, opt_rewards, opt_actions, h_rewards, t_init, data_type)
            # Append Results: total reward of algorithm j over this run
            for j, a in enumerate(algos):
                res[j].append((np.sum(h_rewards[:, j])))
        else:
            # NOTE(review): `num_cores` and `Run` are not defined in this view —
            # presumably module-level; verify before enabling par != 0.
            par_res = Parallel(n_jobs=num_cores)(
                delayed(Run)(context_dim, num_actions, dataset, algos,
                             opt_rewards, opt_actions, data_type)
                for i in xrange(Nruns))
            # NOTE(review): `j` here enumerates runs, not algorithms, yet it
            # indexes `res` (sized NAgents) and `rr[j]` — looks like it should
            # loop over algorithms within each run's result; confirm intent.
            for j, rr in enumerate(par_res):
                res[j].append(rr[j])
        # Drop the algorithm objects to free memory between runs, keeping the
        # final instances alive for display_final_results.
        # NOTE(review): the sibling driver uses (Nruns - 1) here; confirm the
        # -3 offset is intentional.
        if i<(Nruns-3):
            algos=None
    display_final_results(algos,opt_rewards, res, data_type)
def main(argv):
    """Run repeated contextual-bandit experiments on the 'statlog' dataset.

    Compares NeuralLinear, three finite-memory variants, and a linear
    Thompson-sampling baseline over `Nruns` simulations, accumulating total
    rewards and per-context mistake counts, then reports via
    `display_final_results`.

    NOTE(review): source formatting was lost; structure below is
    reconstructed from the statement order — confirm loop nesting against
    the original file.
    """
    # Problem parameters
    num_contexts = 4000
    tfn = 400        # training frequency of the network
    tfe = tfn * 2    # training epochs
    data_type = 'statlog'
    l_sizes = [50]   # hidden layer sizes shared by the neural variants
    outdir = "./"

    # Create dataset
    sampled_vals = sample_data(data_type, num_contexts)
    # NOTE(review): this sample_data variant also returns a vocab_processor —
    # presumably a text-dataset path; unused below.
    dataset, opt_rewards, opt_actions, num_actions, context_dim, vocab_processor = sampled_vals

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # Define hyperparameters and algorithms
    hparams = tf.contrib.training.HParams(num_actions=num_actions)
    hparams_linear = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim,
        a0=6, b0=6, lambda_prior=0.25, initial_pulls=2)
    hparams_txt = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        batch_size=64, initial_lr=0.1, show_training=False,
        freq_summary=1000, buffer_s=-1, initial_pulls=2, reset_lr=True,
        training_freq=1, training_freq_network=tfn, training_epochs=tfe,
        a0=6, b0=6, lambda_prior=0.25)
    hparams_nlinear = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=64,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=tfn, training_epochs=tfe,
        a0=6, b0=6, lambda_prior=0.25)
    hparams_epsilon = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=64,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=tfn, training_epochs=tfe, epsilon=0.1)
    # Finite-memory variants: mem bounds the replay buffer;
    # mu_prior_flag / sigma_prior_flag toggle use of the mean / covariance priors.
    hparams_nlinear_finite_memory = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=64,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=tfn, training_epochs=tfe,
        a0=6, b0=6, lambda_prior=1,
        mem=num_actions * 100, mu_prior_flag=1, sigma_prior_flag=1)
    hparams_nlinear_finite_memory_no_prior = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=64,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=tfn, training_epochs=tfe,
        a0=6, b0=6, lambda_prior=1,
        mem=num_actions * 100, mu_prior_flag=0, sigma_prior_flag=0)
    hparams_nlinear_finite_memory_no_sig_prior = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=64,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=tfn, training_epochs=tfe,
        a0=6, b0=6, lambda_prior=1,
        mem=num_actions * 100, mu_prior_flag=1, sigma_prior_flag=0)

    Nruns = 10
    n_algs = 5   # must match len(algos) below
    # res[i_alg, t] counts, across runs, how often algorithm i_alg picked a
    # non-optimal action at context t.
    res = np.zeros((n_algs, num_contexts))
    totalreward = [0 for i in xrange(n_algs)]   # mean total reward per algorithm
    rewards = [[] for i in xrange(n_algs)]      # per-run total rewards per algorithm
    for i_run in xrange(Nruns):
        # Algorithms are re-instantiated every run so each simulation starts fresh.
        algos = [
            NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
            NeuralLinearPosteriorSamplingFiniteMemory(
                'NeuralLinearFiniteMemory', hparams_nlinear_finite_memory),
            NeuralLinearPosteriorSamplingFiniteMemory(
                'NeuralLinearFiniteMemory_noP',
                hparams_nlinear_finite_memory_no_prior),
            NeuralLinearPosteriorSamplingFiniteMemory(
                'NeuralLinearFiniteMemory_noSigP',
                hparams_nlinear_finite_memory_no_sig_prior),
            LinearFullPosteriorSampling('LinFullPost', hparams_linear),
        ]
        results = run_contextual_bandit(context_dim, num_actions, dataset, algos)
        h_actions, h_rewards = results
        for j, a in enumerate(algos):
            print(np.sum(h_rewards[:, j]))
            # Accumulate the run's total reward into the running mean.
            totalreward[j] += ((np.sum(h_rewards[:, j])) / Nruns)
            rewards[j].append((np.sum(h_rewards[:, j])))
        # Transpose h_actions (per-context rows) into per-algorithm action lists.
        actions = [[] for i in xrange(len(h_actions[0]))]
        for aa in h_actions:
            for i, a in enumerate(aa):
                actions[i].append(a)
        for i_alg in xrange(len(algos)):
            # Elementwise mismatch against the optimal actions (1 where wrong).
            # NOTE(review): relies on opt_actions being a numpy array so the
            # comparison broadcasts elementwise — confirm.
            res[i_alg, :] += 1 * ((actions[i_alg] != opt_actions))
        # Drop the algorithm objects to free memory between runs, keeping the
        # last run's instances alive for display_final_results.
        if i_run < (Nruns - 1):
            algos = None
    display_final_results(algos, opt_rewards, opt_actions, rewards, data_type)
def main(_):
    """Run `nExperiment` contextual-bandit experiments on the 'moon' dataset.

    Only LinFullPost and NeuralGreedy are active (everything else is
    commented out); per-step rewards of each are written to CSV files and
    summarized via `display_results`.

    NOTE(review): source formatting was lost; structure below is
    reconstructed from the statement order — confirm loop nesting against
    the original file.
    """
    # Problem parameters
    num_contexts = 40000
    # parameters of finite
    tfn = 400        # training frequency of the network
    tfe = tfn * 2    # training epochs (unused by the active algos below)
    data_type = 'statlog'
    l_sizes = [50, 50]
    outdir = "./"
    # Data type in {linear, sparse_linear, mushroom, financial, jester,
    # statlog, adult, covertype, census, wheel}
    # NOTE(review): this overwrites the 'statlog' assignment above — the
    # earlier assignment is dead code; confirm 'moon' is the intended dataset.
    data_type = 'moon'
    nExperiment = 2

    # Create dataset
    sampled_vals = sample_data(data_type, num_contexts)
    dataset, opt_rewards, opt_actions, num_actions, context_dim = sampled_vals

    # Define hyperparameters and algorithms
    hparams = tf.contrib.training.HParams(num_actions=num_actions)
    hparams_linear = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim,
        a0=6, b0=6, lambda_prior=0.25, initial_pulls=2)
    hparams_rms = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', reset_lr=True, lr_decay_rate=0.5,
        training_freq=50, training_epochs=100, p=0.95, q=3)
    hparams_dropout = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50, 50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', reset_lr=True, lr_decay_rate=0.5,
        training_freq=50, training_epochs=100,
        use_dropout=True, keep_prob=0.80)
    hparams_bbb = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50, 50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', use_sigma_exp_transform=True,
        cleared_times_trained=10, initial_training_steps=100,
        noise_sigma=0.1, reset_lr=False, training_freq=50,
        training_epochs=100)
    hparams_nlinear = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50, 50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=50, training_epochs=100,
        a0=6, b0=6, lambda_prior=0.25)
    hparams_nlinear2 = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=10,
        training_freq_network=50, training_epochs=100,
        a0=6, b0=6, lambda_prior=0.25)
    # Finite-memory variants: mem bounds the replay buffer;
    # mu_prior_flag / sigma_prior_flag toggle use of the mean / covariance priors.
    hparams_nlinear_finite_memory = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=64,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=50, training_epochs=100,
        a0=6, b0=6, lambda_prior=1,
        mem=num_actions * 100, mu_prior_flag=1, sigma_prior_flag=1)
    hparams_nlinear_finite_memory_no_prior = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=64,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=50, training_epochs=100,
        a0=6, b0=6, lambda_prior=1,
        mem=num_actions * 100, mu_prior_flag=0, sigma_prior_flag=0)
    hparams_nlinear_finite_memory_no_sig_prior = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=l_sizes, batch_size=64,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, training_freq=1,
        training_freq_network=50, training_epochs=100,
        a0=6, b0=6, lambda_prior=1,
        mem=num_actions * 100, mu_prior_flag=1, sigma_prior_flag=0)
    hparams_pnoise = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', reset_lr=True, lr_decay_rate=0.5,
        training_freq=50, training_epochs=100,
        noise_std=0.05, eps=0.1, d_samples=300, )
    hparams_alpha_div = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        optimizer='RMS', use_sigma_exp_transform=True,
        cleared_times_trained=10, initial_training_steps=100,
        noise_sigma=0.1, reset_lr=False, training_freq=50,
        training_epochs=100, alpha=1.0, k=20, prior_variance=0.1)
    hparams_gp = tf.contrib.training.HParams(
        num_actions=num_actions, num_outputs=num_actions,
        context_dim=context_dim, reset_lr=False, learn_embeddings=True,
        max_num_points=1000, show_training=False, freq_summary=1000,
        batch_size=512, keep_fixed_after_max_obs=True, training_freq=50,
        initial_pulls=2, training_epochs=100, lr=0.01, buffer_s=-1,
        initial_lr=0.001, lr_decay_rate=0.0, optimizer='RMS',
        task_latent_dim=5, activate_decay=False)
    # Greedy and UCB configs are identical except for the consuming class.
    hparams_greedy = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, optimizer='RMS',
        training_freq=50, training_freq_network=50, training_epochs=100,
        lambda_prior=0.25, delta=0.01, lamb=0.01, mu=1, S=1)
    hparams_ucb = tf.contrib.training.HParams(
        num_actions=num_actions, context_dim=context_dim, init_scale=0.3,
        activation=tf.nn.relu, layer_sizes=[50], batch_size=512,
        activate_decay=True, initial_lr=0.1, max_grad_norm=5.0,
        show_training=False, freq_summary=1000, buffer_s=-1, initial_pulls=2,
        reset_lr=True, lr_decay_rate=0.5, optimizer='RMS',
        training_freq=50, training_freq_network=50, training_epochs=100,
        lambda_prior=0.25, delta=0.01, lamb=0.01, mu=1, S=1)

    # Run contextual bandit problem
    t_init = time.time()
    for i in range(nExperiment):
        # Only LinFullPost (column 0) and NeuralGreedy (column 1) are active;
        # the CSV filenames below depend on this ordering.
        algos = [
            #UniformSampling('Uniform Sampling', hparams),
            #UniformSampling('Uniform Sampling 2', hparams),
            #FixedPolicySampling('fixed1', [0.75, 0.25], hparams),
            #FixedPolicySampling('fixed2', [0.25, 0.75], hparams),
            #PosteriorBNNSampling('RMS', hparams_rms, 'RMSProp'),
            #PosteriorBNNSampling('Dropout', hparams_dropout, 'RMSProp'),
            #PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),
            #NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
            #NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2),
            LinearFullPosteriorSampling('LinFullPost', hparams_linear),
            #BootstrappedBNNSampling('BootRMS', hparams_rms),
            #NeuralLinearPosteriorSamplingFiniteMemory('NeuralLinearFiniteMemory', hparams_nlinear_finite_memory),
            #NeuralLinearPosteriorSamplingFiniteMemory('NeuralLinearFiniteMemory_noP', hparams_nlinear_finite_memory_no_prior),
            #NeuralLinearPosteriorSamplingFiniteMemory('NeuralLinearFiniteMemory_noSigP', hparams_nlinear_finite_memory_no_sig_prior)
            #ParameterNoiseSampling('ParamNoise', hparams_pnoise),
            #PosteriorBNNSampling('BBAlphaDiv', hparams_alpha_div, 'AlphaDiv'),
            #PosteriorBNNSampling('MultitaskGP', hparams_gp, 'GP'),hparams_ucb
            #NeuralUCBSampling('NeuralUCB', hparams_ucb)
            NeuralGreedy('NeuralGreedy', hparams_greedy)
        ]
        results = run_contextual_bandit(context_dim, num_actions, dataset, algos)
        _, h_rewards = results
        # Persist per-step rewards of each active algorithm for this experiment.
        np.savetxt("resultLin" + str(i) + ".csv", h_rewards[:, 0], delimiter=',')
        np.savetxt("resultMoon" + str(i) + ".csv", h_rewards[:, 1], delimiter=',')
        # Display results
        display_results(algos, opt_rewards, opt_actions, h_rewards, t_init,
                        data_type)