def mujoco_gail():
    return dict(
        policy_network=mlp(num_hidden=100, num_layers=2),
        classifier_network=mlp(num_hidden=100, num_layers=2),
        max_iters=4001,
        timesteps_per_batch=1000,
        max_kl=0.01,
        cg_iters=10,
        gamma=0.995,
        lam=0.97,
        entcoeff=0.0,
        cg_damping=0.1,
        vf_stepsize=1e-3,
        vf_iters=5,
        expert_trajs_path='./expert_trajs',
        num_expert_trajs=25,
        g_step=1,
        d_step=5,
        classifier_entcoeff=1e-3,
        num_particles=1,
        d_stepsize=0.01,
        normalize_observations=True,
        observation_dependent_var=True,
        use_classifier_logsumexp=False,
        use_reward_logsumexp=False,
        use_svgd=False,
    )
def main():
    ishumanFirstPlayer = int(sys.argv[1])
    ishumanCut = int(sys.argv[2])
    iterNo = int(sys.argv[3])

    env = gym.make('shannon_switching-v0')
    env.configureEnvironment(computerType="minMax",
                             ishumanFirstPlayer=ishumanFirstPlayer,
                             ishumanCut=ishumanCut,
                             iterNo=iterNo)
    print("Computer Type: ", "minMax")
    print("ishumanFirstPlayer ", ishumanFirstPlayer)
    print("ishumanCut", ishumanCut)
    print("iterNo", iterNo)

    act = deepq.learn(
        env,
        network=models.mlp(num_hidden=20, num_layers=3),
        lr=5e-4,
        total_timesteps=50,
        buffer_size=5000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=1,
        param_noise=False,
        prioritized_replay=True,
        load_path='model/minMax/shannon_switching_{}_{}_{}.pkl'.format(
            ishumanFirstPlayer, ishumanCut, iterNo - 1) if iterNo > 0 else None)

    print("Saving model to model/minMax/shannon_switching_{}_{}_{}.pkl".format(
        ishumanFirstPlayer, ishumanCut, iterNo))
    act.save("model/minMax/shannon_switching_{}_{}_{}.pkl".format(
        ishumanFirstPlayer, ishumanCut, iterNo))
    act.save_act("model/minMax/shannon_switching_train_{}_{}_{}.pkl".format(
        ishumanFirstPlayer, ishumanCut, iterNo))
    return act
def main(): env = gym.make("mediator-v0") act = learn(env, network=models.mlp(num_layers=3, num_hidden=128, activation=tf.tanh, layer_norm=False), lr=1e-3, total_timesteps=10000, buffer_size=5000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=25, batch_size=128, print_freq=100, learning_starts=1000, gamma=0.1, target_network_update_freq=100, param_noise=True, callback=callback) print("Saving model to mediator_model.pkl") act.save("mediator-v0_model_00.pkl")
def load_policy(model_path,
                input_dim,
                output_dim,
                num_hidden,
                num_layers,
                init_logstd=1.,
                discrete=False,
                beta=1.0):
    observation_space = Box(low=-np.inf, high=np.inf, shape=(input_dim, ))
    if discrete:
        action_space = Discrete(n=output_dim)
    else:
        action_space = Box(low=-np.inf, high=np.inf, shape=(output_dim, ))

    tf.reset_default_graph()
    config = tf.ConfigProto(allow_soft_placement=True,
                            inter_op_parallelism_threads=8,
                            intra_op_parallelism_threads=8,
                            device_count={'CPU': 8})
    config.gpu_options.allow_growth = True
    sess = U.make_session(make_default=True, config=config)

    network = mlp(num_hidden=num_hidden, num_layers=num_layers)
    policy_train = build_policy(observation_space,
                                action_space,
                                network,
                                trainable_variance=True,
                                state_dependent_variance=True,
                                beta=beta,
                                init_logstd=init_logstd)()
    U.initialize()
    policy_train.load(model_path)
    return policy_train
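# A minimal usage sketch for load_policy above, assuming numpy is imported as
# np (as in the function itself). The checkpoint path and the observation and
# action dimensions are placeholders, and the call assumes the returned policy
# object exposes the usual baselines-style step() interface.
policy = load_policy(model_path='./checkpoints/policy_final',
                     input_dim=11,
                     output_dim=3,
                     num_hidden=100,
                     num_layers=2)
obs_batch = np.zeros((1, 11), dtype=np.float32)
action, value, state, neglogp = policy.step(obs_batch)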
def main(learning_rate):
    # Reset the default graph to avoid conflicts with existing parameters;
    # note this is not recommended if you intend to reuse parameters.
    tf.reset_default_graph()
    graph = tf.get_default_graph()
    #print(graph.get_operations())

    env = PowerDynSimEnv(case_files_array, dyn_config_file, rl_config_file,
                         java_port)

    #model = deepq.models.mlp([128,128])
    act = deepq.learn(env,
                      network=models.mlp(num_layers=2,
                                         num_hidden=128,
                                         activation=tf.nn.relu),
                      lr=learning_rate,
                      total_timesteps=900000,
                      buffer_size=50000,
                      checkpoint_freq=1000,
                      exploration_fraction=0.1,
                      exploration_final_eps=0.02,
                      print_freq=10,
                      callback=callback)
    print("Saving final model to power_model_multistep498_508_lr_%s_90w.pkl" %
          (str(learning_rate)))
    act.save(savedModel + "/" + model_name + "_lr_%s_90w.pkl" %
             (str(learning_rate)))
def main():
    random.seed(10)
    env = gym.make('Scaling-v0')
    act = deepq.learn(env,
                      network=models.mlp(num_hidden=20, num_layers=1),
                      train_freq=4,
                      buffer_size=1000,
                      exploration_fraction=1.0,
                      exploration_final_eps=1e-5,
                      total_timesteps=200000,
                      prioritized_replay=True,
                      checkpoint_freq=None,
                      print_freq=1)

    # Play the trained model using a shorter change rate.
    env.change_rate = 100
    frames = 1000
    play(act, env, frames)

    # Play on a sine-curve input.
    env.change_rate = 1
    env.scaling_env_options['input'] = INPUTS['SINE_CURVE']
    play(act, env, frames)
def sample_strategy_from_mixed(env, str_set, mix_str, identity):
    if not isinstance(mix_str, np.ndarray):
        raise ValueError("mix_str in sample func is not a numpy array.")
    if not len(str_set) == len(mix_str):
        raise ValueError(
            "Length of mixed strategies does not match number of strategies.")

    picked_str = np.random.choice(str_set, p=mix_str)
    if not fp.isInName('.pkl', name=picked_str):
        raise ValueError('The strategy picked is not a pickle file.')

    if identity == 0:  # pick a defender's strategy
        path = DIR + 'defender_strategies/'
    elif identity == 1:
        path = DIR + 'attacker_strategies/'
    else:
        raise ValueError("identity is neither 0 nor 1!")

    if not fp.isExist(path + picked_str):
        raise ValueError('The strategy picked does not exist!')

    #TODO: assign nn info from game
    act = deepq.learn(env,
                      network=models.mlp(num_hidden=256, num_layers=1),
                      total_timesteps=0,
                      load_path=path + picked_str)
    return act
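# A minimal usage sketch for sample_strategy_from_mixed above. The strategy
# file names and mixture weights are placeholders, assumed to correspond to
# pickled defender strategies already present under DIR + 'defender_strategies/',
# and env is assumed to be the shared game environment created elsewhere.
str_set = ['def_str_epoch1.pkl', 'def_str_epoch2.pkl']
mix_str = np.array([0.3, 0.7])
def_act = sample_strategy_from_mixed(env, str_set, mix_str, identity=0)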
def main(): env = gym.make("sparse-v0") act = deepq.learn(env, network=models.mlp(num_layers=2, num_hidden=128, activation=tf.nn.relu), total_timesteps=0, load_path=dirs) while True: obs, screen_obs = env.reset_with_render() done = False episode_rew = 0 converted = converter(screen_obs) my_plot = plt.imshow(converted) while not done: obs, rew, done, _, screen_obs = env.step_with_render(act(obs)[0]) #obs, rew, done, _ , screen_obs = env.step_with_render(env.action_space.sample()) converted = converter(screen_obs) plt.ion() my_plot.autoscale() my_plot.set_data(converted) plt.pause(.1) plt.draw() plt.show() print("action: ", act(obs)[0]) episode_rew += rew print("Episode reward", episode_rew)
def main(test_episodes=20, test_steps=50):
    env = env_search_control()
    print(env.observation_space)
    print(env.action_space)
    act = deepq.learn(env,
                      network=models.mlp(num_layers=1, num_hidden=64),
                      total_timesteps=0,
                      total_episodes=0,
                      total_steps=0,
                      load_path="assembly_model_fuzzy_final.pkl")
    episode_rewards = []
    episode_states = []
    for i in range(test_episodes):
        obs, done = env.reset()
        episode_rew = 0
        episode_obs = []
        logger.info(
            "================== The {} episode start !!! ===================".
            format(i))
        for j in range(test_steps):
            obs, rew, done, _ = env.step(act(obs[None])[0], j)
            episode_rew += rew
            episode_obs.append(obs)
        episode_rewards.append(cp.deepcopy(episode_rew))
        episode_states.append(cp.deepcopy(episode_obs))
        print("Episode reward", episode_rew)
    np.save('../data/test_episode_reward_fuzzy_final_new', episode_rewards)
    np.save('../data/test_episode_state_fuzzy_final_new', episode_states)
def main():
    # setup environment
    ishumanFirstPlayer = int(sys.argv[1])
    ishumanCut = int(sys.argv[2])
    iterNo = int(sys.argv[3])

    env = gym.make('shannon_switching-v0')
    env.configureEnvironment(computerType="selfPlayZero",
                             ishumanFirstPlayer=ishumanFirstPlayer,
                             ishumanCut=ishumanCut,
                             iterNo=iterNo)
    print("Computer Type: ", "selfPlayZero")
    print("ishumanFirstPlayer ", ishumanFirstPlayer)
    print("ishumanCut", ishumanCut)
    print("iterNo", iterNo)
    # input("Press Enter to continue...")

    # train network
    act = deepq.learn(
        env,
        network=models.mlp(num_hidden=25, num_layers=8),
        lr=5e-4,
        total_timesteps=20,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=1,
        param_noise=False,
        prioritized_replay=True,
        load_path='model/selfPlayZero/shannon_switching_{}_{}_{}.pkl'.format(
            ishumanFirstPlayer, ishumanCut, iterNo - 1) if iterNo > 0 else None)

    print("Saving model to model/selfPlayZero/shannon_switching_{}_{}_{}.pkl".
          format(ishumanFirstPlayer, ishumanCut, iterNo))
    act.save("model/selfPlayZero/shannon_switching_{}_{}_{}.pkl".format(
        ishumanFirstPlayer, ishumanCut, iterNo))
def train(args, extra_args):
    env_type = 'steam'
    env_id = 'dota2'
    print('env_type: {}'.format(env_type))

    alg_kwargs = dict(network=models.mlp(num_hidden=128, num_layers=1),
                      lr=1e-3,
                      buffer_size=10000,
                      total_timesteps=500000,
                      exploration_fraction=1.0,
                      exploration_initial_eps=0.1,
                      exploration_final_eps=0.1,
                      train_freq=4,
                      target_network_update_freq=1000,
                      gamma=0.999,
                      batch_size=32,
                      prioritized_replay=True,
                      prioritized_replay_alpha=0.6,
                      experiment_name='test',
                      dueling=True)
    alg_kwargs.update(extra_args)

    env = DotaEnvironment()

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))

    pool_size = multiprocessing.cpu_count()
    with multiprocessing.Pool(processes=pool_size) as pool:
        model = learn(env=env, **alg_kwargs)

    return model, env
def mlp_sokoban(observation, output_dim=1):
    net = observation
    net = mlp(num_layers=2, num_hidden=64, activation=tf.tanh,
              layer_norm=False)(net)
    output = slim.fully_connected(net, output_dim, activation_fn=None)
    return output
def mlp_bsuite(observation, output_dim=1, output_rescale=1.):
    """Similar to the bsuite default agent network."""
    net = observation
    net = mlp(num_layers=2, num_hidden=50, activation=tf.nn.relu,
              layer_norm=False)(net)
    output = slim.fully_connected(net, output_dim,
                                  activation_fn=None) * output_rescale
    return output
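# A minimal sketch (assuming TF1-style graph mode, consistent with the
# slim/mlp usage above) of running a batch of observations through
# mlp_bsuite. The observation dimensionality of 8 and the batch size of 4
# are arbitrary placeholders.
import numpy as np
import tensorflow as tf

obs_ph = tf.placeholder(tf.float32, shape=[None, 8], name='obs')
value_op = mlp_bsuite(obs_ph, output_dim=1, output_rescale=1.)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    values = sess.run(value_op,
                      feed_dict={obs_ph: np.zeros((4, 8), dtype=np.float32)})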
def mujoco():
    return dict(network=mlp(num_hidden=32, num_layers=2),
                timesteps_per_batch=1024,
                max_kl=0.01,
                cg_iters=10,
                cg_damping=0.1,
                gamma=0.99,
                lam=0.98,
                vf_iters=5,
                vf_stepsize=1e-3,
                normalize_observations=True)
def load_action(path, game):
    env = game.env
    num_layers = game.num_layers
    num_hidden = game.num_hidden
    act = deepq.learn(env,
                      network=models.mlp(num_layers=num_layers,
                                         num_hidden=num_hidden),
                      total_timesteps=0,
                      load_path=path)
    return act
def training_hado_att(game, transfer=False):
    param = game.param
    mix_str_def = game.hado_str(identity=0, param=param)

    if len(mix_str_def) != len(game.def_str):
        raise ValueError(
            "The length of mix_str_def and def_str does not match while retraining"
        )

    env = game.env
    env.reset_everything()
    env.set_training_flag(1)

    env.defender.set_mix_strategy(mix_str_def)
    env.defender.set_str_set(game.def_str)

    param_path = os.getcwd() + '/network_parameters/param.json'
    param = jp.load_json_data(param_path)

    if transfer:
        lr = param['trans_lr']
        total_timesteps = param['trans_timesteps']
        ex_frac = param['trans_exploration_fraction']
        ex_final_eps = param['trans_exploration_final_eps']
    else:
        lr = param['lr']
        total_timesteps = param['total_timesteps']
        ex_frac = param['exploration_fraction']
        ex_final_eps = param['exploration_final_eps']

    learner = Learner(retrain=True, freq=param['retrain_freq'])

    #TODO: add epoch???
    with learner.graph.as_default():
        with learner.sess.as_default():
            act_att, _ = learner.learn_multi_nets(
                env,
                network=models.mlp(num_hidden=param['num_hidden'],
                                   num_layers=param['num_layers']),
                lr=lr,
                total_timesteps=total_timesteps,
                exploration_fraction=ex_frac,
                exploration_final_eps=ex_final_eps,
                print_freq=param['print_freq'],
                param_noise=param['param_noise'],
                gamma=param['gamma'],
                prioritized_replay=param['prioritized_replay'],
                checkpoint_freq=param['checkpoint_freq'],
                scope='att_str_retrain' + str(0) + '.pkl' + '/',
                load_path=os.getcwd() + '/retrain_att/' + 'att_str_retrain' +
                str(0) + '.pkl')

    # print("Saving attacker's model to pickle.")
    # act_att.save(os.getcwd() + '/retrain_att/' + 'att_str_retrain' + str(epoch) + ".pkl",
    #              'att_str_epoch' + str(epoch) + '.pkl' + '/')

    learner.sess.close()
def dr():
    return dict(
        network=mlp(num_hidden=128, num_layers=3),
        timesteps_per_batch=144 * 100,
        max_kl=0.01,
        gamma=0.995,
        lam=0.95,
        ent_coef=0.01,
        activation=tf.nn.relu,
        normalize_observations=True,
        value_network='copy',
    )
def training_def(game, mix_str_att, epoch, retrain=False):
    if len(mix_str_att) != len(game.att_str):
        raise ValueError(
            "The length of mix_str_att and att_str does not match while retraining"
        )

    print("training_def mix_str_att is ", mix_str_att)

    # env = copy.deepcopy(game.env)
    env = game.env
    env.reset_everything()
    env.set_training_flag(0)

    env.attacker.set_mix_strategy(mix_str_att)
    env.attacker.set_str_set(game.att_str)

    param_path = os.getcwd() + '/network_parameters/param.json'
    param = jp.load_json_data(param_path)

    if retrain:
        scope = 'def_str_retrain' + str(0) + '.pkl' + '/'
    else:
        scope = 'def_str_epoch' + str(epoch) + '.pkl' + '/'

    learner = Learner()
    with learner.graph.as_default():
        with learner.sess.as_default():
            act_def, d_BD = learner.learn_multi_nets(
                env,
                network=models.mlp(num_hidden=param['num_hidden'],
                                   num_layers=param['num_layers']),
                lr=param['lr'],
                total_timesteps=param['total_timesteps_def'],
                exploration_fraction=param['exploration_fraction'],
                exploration_final_eps=param['exploration_final_eps'],
                print_freq=param['print_freq'],
                param_noise=param['param_noise'],
                gamma=param['gamma'],
                prioritized_replay=param['prioritized_replay'],
                checkpoint_freq=param['checkpoint_freq'],
                scope=scope,
                epoch=epoch)

            print("Saving defender's model to pickle.")
            if retrain:
                act_def.save(
                    os.getcwd() + '/retrain_def/' + 'def_str_retrain' +
                    str(0) + '.pkl',
                    'def_str_retrain' + str(0) + '.pkl' + '/')
            else:
                act_def.save(DIR_def + "def_str_epoch" + str(epoch) + ".pkl",
                             "def_str_epoch" + str(epoch) + '.pkl' + '/')

    learner.sess.close()
    return d_BD
def sample_strategy_from_mixed(env, str_set, mix_str, identity, str_dict=None):
    if not isinstance(mix_str, np.ndarray):
        raise ValueError("mix_str in sample func is not a numpy array.")
    if not len(str_set) == len(mix_str):
        raise ValueError(
            "Length of mixed strategies does not match number of strategies.")

    # if np.sum(mix_str) != 1:
    #     mix_str = mix_str/np.sum(mix_str)

    picked_str = np.random.choice(str_set, p=mix_str)
    # print('current str:', picked_str)

    #TODO: modification for fast sampling.
    if str_dict is not None:
        return str_dict[picked_str]

    if not fp.isInName('.pkl', name=picked_str):
        raise ValueError('The strategy picked is not a pickle file.')

    if identity == 0:  # pick a defender's strategy
        path = DIR + 'defender_strategies/'
    elif identity == 1:
        path = DIR + 'attacker_strategies/'
    else:
        raise ValueError("identity is neither 0 nor 1!")

    # print(path + picked_str)
    if not fp.isExist(path + picked_str):
        raise ValueError('The strategy picked does not exist!')

    if "epoch1.pkl" in picked_str:
        act = fp.load_pkl(path + picked_str)
        return act

    flag = env.training_flag
    env.set_training_flag(identity)

    param_path = os.getcwd() + '/network_parameters/param.json'
    param = jp.load_json_data(param_path)

    act = learn(env,
                network=models.mlp(num_hidden=param['num_hidden'],
                                   num_layers=param['num_layers']),
                total_timesteps=0,
                load_path=path + picked_str,
                scope=picked_str + '/')

    env.set_training_flag(flag)
    return act
def airhockey():
    return dict(
        network=mlp(num_hidden=32, num_layers=2),
        timesteps_per_batch=1024,
        max_kl=0.01,
        cg_iters=10,
        cg_damping=0.1,
        gamma=0.5**(1. / 16),
        lam=0.5**(1. / 8),
        vf_iters=5,
        vf_stepsize=1e-3,
        normalize_observations=False,
    )
def classic_control():
    return dict(
        network=mlp(num_hidden=32, num_layers=2),
        timesteps_per_batch=1024,
        #epsilon=0.01,
        #cg_iters=10,
        #cg_damping=0.1,
        #gamma=0.99,
        #lam=0.98,
        #vf_iters=5,
        #vf_stepsize=1e-3,
        normalize_observations=True,
    )
def lucia_env():
    return dict(
        network=mlp(num_hidden=64, num_layers=5),
        timesteps_per_batch=1024,
        max_kl=0.005,
        cg_iters=10,
        cg_damping=0.1,
        gamma=0.98,
        lam=0.98,
        vf_iters=5,
        vf_stepsize=1e-3,
        normalize_observations=False,
    )
def ev():
    return dict(
        network=mlp(num_hidden=64, num_layers=3),
        timesteps_per_batch=7000,
        max_kl=0.01,
        max_sf=2.0,
        gamma=0.99,
        lam=0.95,
        ent_coef=0.1,
        activation=tf.nn.relu,
        normalize_observations=True,
        value_network='copy',
    )
def roboschool():
    return dict(
        network=mlp(num_hidden=64, num_layers=2),
        timesteps_per_batch=512,
        max_kl=0.01,
        cg_iters=10,
        cg_damping=0.1,
        gamma=0.99,
        lam=0.98,
        vf_iters=10,
        vf_stepsize=1e-3,
        normalize_observations=True,
    )
def mujoco():
    return dict(
        network=mlp(num_hidden=32, num_layers=2),
        timesteps_per_batch=1024,
        max_kl=0.01,
        cg_iters=10,
        cg_damping=0.1,
        gamma=0.99,
        lam=0.98,
        vf_iters=5,
        vf_stepsize=1e-3,
        normalize_observations=True,
    )
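# A minimal sketch of how a defaults dict such as mujoco() above is typically
# consumed: unpacked as keyword arguments into the algorithm's learn() call.
# The environment id and total_timesteps are placeholders, and the exact set
# of keywords accepted by trpo_mpi.learn may differ between baselines versions.
import gym
from baselines.trpo_mpi import trpo_mpi

env = gym.make('HalfCheetah-v2')
model = trpo_mpi.learn(env=env, total_timesteps=int(1e6), **mujoco())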
def sample_both_strategies(env, att_str_set, att_mix_str, def_str_set,
                           def_mix_str):
    if not len(att_str_set) == len(att_mix_str):
        raise ValueError(
            "Length of mixed strategies does not match number of strategies for the attacker."
        )
    if not len(def_str_set) == len(def_mix_str):
        raise ValueError(
            "Length of mixed strategies does not match number of strategies for the defender."
        )

    att_picked_str = np.random.choice(att_str_set, p=att_mix_str)
    def_picked_str = np.random.choice(def_str_set, p=def_mix_str)

    if not fp.isInName('.pkl', name=def_picked_str):
        raise ValueError('The strategy picked is not a pickle file for the defender.')
    if not fp.isInName('.pkl', name=att_picked_str):
        raise ValueError('The strategy picked is not a pickle file for the attacker.')

    path_def = DIR + 'defender_strategies/'
    path_att = DIR + 'attacker_strategies/'

    if not fp.isExist(path_def + def_picked_str):
        raise ValueError('The strategy picked does not exist for the defender!')
    if not fp.isExist(path_att + att_picked_str):
        raise ValueError('The strategy picked does not exist for the attacker!')

    act_att = deepq.learn(env,
                          network=models.mlp(num_hidden=256, num_layers=1),
                          total_timesteps=0,
                          load_path=path_att + att_picked_str)
    act_def = deepq.learn(env,
                          network=models.mlp(num_hidden=256, num_layers=1),
                          total_timesteps=0,
                          load_path=path_def + def_picked_str)
    return act_att, act_def
def mujoco():
    return dict(
        network=mlp(num_hidden=256, num_layers=5),
        nsteps=2048,
        nminibatches=8,
        lam=0.95,
        gamma=0.99,
        noptepochs=10,
        log_interval=1,
        ent_coef=0.0,
        lr=lambda f: 5e-5 * f,
        cliprange=0.2,
        #value_network='copy'
    )
def main(): env = gym.make("MountainCar-v0") # Enabling layer_norm here is import for parameter space noise! act = deepq.learn(env, network=models.mlp(num_hidden=64, num_layers=1), lr=1e-3, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.1, print_freq=10, param_noise=True) print("Saving model to mountaincar_model.pkl") act.save("mountaincar_model.pkl")
def robotics():
    return dict(
        network=mlp(num_hidden=32, num_layers=2),
        timesteps_per_batch=1024,
        epsilon=0.01,
        cg_iters=10,
        cg_damping=0.1,
        gamma=0.99,
        lam=0.98,
        vf_iters=5,
        vf_stepsize=1e-3,
        normalize_observations=True,
        entcoeff=0.0,
    )
def rand_str_generator(env, game):
    # Generate randomly initialized networks for the attacker and the defender.
    num_layers = game.num_layers
    num_hidden = game.num_hidden
    act_att = deepq.learn(env,
                          network=models.mlp(num_hidden=num_hidden,
                                             num_layers=num_layers - 3),
                          total_timesteps=0)
    act_def = deepq.learn(env,
                          network=models.mlp(num_hidden=num_hidden,
                                             num_layers=num_layers - 3),
                          total_timesteps=0)

    print("Saving attacker's model to pickle. Epoch in name is equal to 1.")
    act_att.save(DIR_att + "att_str_epoch" + str(1) + ".pkl")
    game.att_str.append("att_str_epoch" + str(1) + ".pkl")

    print("Saving defender's model to pickle. Epoch in name is equal to 1.")
    act_def.save(DIR_def + "def_str_epoch" + str(1) + ".pkl")
    game.def_str.append("def_str_epoch" + str(1) + ".pkl")
def main(): env = gym.make("MountainCar-v0") act = deepq.learn(env, network=models.mlp(num_layers=1, num_hidden=64), total_timesteps=0, load_path='mountaincar_model.pkl') while True: obs, done = env.reset(), False episode_rew = 0 while not done: env.render() obs, rew, done, _ = env.step(act(obs[None])[0]) episode_rew += rew print("Episode reward", episode_rew)
def main(): env = gym.make("MountainCar-v0") # Enabling layer_norm here is import for parameter space noise! act = deepq.learn( env, network=models.mlp(num_hidden=64, num_layers=1), lr=1e-3, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.1, print_freq=10, param_noise=True ) print("Saving model to mountaincar_model.pkl") act.save("mountaincar_model.pkl")
def main(): env = gym.make("MountainCar-v0") act = deepq.learn( env, network=models.mlp(num_layers=1, num_hidden=64), total_timesteps=0, load_path='mountaincar_model.pkl' ) while True: obs, done = env.reset(), False episode_rew = 0 while not done: env.render() obs, rew, done, _ = env.step(act(obs[None])[0]) episode_rew += rew print("Episode reward", episode_rew)