def main(_):
    """Entry point: prepare data/sample directories, then train or sample.

    Reads all settings from the module-level ``conf`` flags object; the
    excluded keys below do not participate in the model-directory name.
    """
    model_dir = util.get_model_dir(
        conf,
        ['data_dir', 'sample_dir', 'max_epoch', 'test_step', 'save_step',
         'is_train', 'random_seed', 'log_level', 'display',
         'runtime_base_dir', 'occlude_start_row', 'num_generated_images'])
    util.preprocess_conf(conf)
    validate_parameters(conf)

    # color-mnist reuses the raw mnist data directory.
    dataset_name = 'mnist' if conf.data == 'color-mnist' else conf.data
    data_path = os.path.join(conf.runtime_base_dir, conf.data_dir, dataset_name)
    sample_path = os.path.join(conf.runtime_base_dir, conf.sample_dir,
                               conf.data, model_dir)

    util.check_and_create_dir(data_path)
    util.check_and_create_dir(sample_path)

    dataset = get_dataset(data_path, conf.q_levels)

    with tf.Session() as sess:
        network = Network(sess, conf, dataset.height, dataset.width,
                          dataset.channels)
        stat = Statistic(sess, conf.data, conf.runtime_base_dir, model_dir,
                         tf.trainable_variables())
        stat.load_model()

        if conf.is_train:
            train(dataset, network, stat, sample_path)
        else:
            generate(network, dataset.height, dataset.width, sample_path)
def main(_):
    """Entry point: build environment, exploration strategy, NAF networks
    and statistics, then run the agent.

    All configuration comes from the module-level ``conf`` flags object.
    """
    model_dir = get_model_dir(
        conf, ['is_train', 'random_seed', 'monitor', 'display', 'log_level'])
    preprocess_conf(conf)

    with tf.Session() as sess:
        # NAF requires continuous observation and action spaces.
        env = gym.make(conf.env_name)
        env.seed(conf.random_seed)

        assert isinstance(env.observation_space, gym.spaces.Box), \
            "observation space must be continuous"
        assert isinstance(env.action_space, gym.spaces.Box), \
            "action space must be continuous"

        # Exploration strategy, selected by name from the config.
        strategy_builders = {
            'ou': lambda: OUExploration(env, sigma=conf.noise_scale),
            'brownian': lambda: BrownianExploration(env, conf.noise_scale),
            'linear_decay': lambda: LinearDecayExploration(env),
        }
        if conf.noise not in strategy_builders:
            raise ValueError('Unkown exploration strategy: %s' % conf.noise)
        strategy = strategy_builders[conf.noise]()

        # Constructor arguments common to the prediction and target networks.
        shared_args = {
            'sess': sess,
            'input_shape': env.observation_space.shape,
            'action_size': env.action_space.shape[0],
            'hidden_dims': conf.hidden_dims,
            'use_batch_norm': conf.use_batch_norm,
            'use_seperate_networks': conf.use_seperate_networks,
            'hidden_w': conf.hidden_w,
            'action_w': conf.action_w,
            'hidden_fn': conf.hidden_fn,
            'action_fn': conf.action_fn,
            'w_reg': conf.w_reg,
        }

        logger.info("Creating prediction network...")
        pred_network = Network(scope='pred_network', **shared_args)

        logger.info("Creating target network...")
        target_network = Network(scope='target_network', **shared_args)
        # Target tracks the prediction network with soft updates of rate tau.
        target_network.make_soft_update_from(pred_network, conf.tau)

        # Checkpoint/summary bookkeeping for the prediction network weights.
        stat = Statistic(sess, conf.env_name, model_dir,
                         pred_network.variables, conf.update_repeat)

        agent = NAF(sess, env, strategy, pred_network, target_network, stat,
                    conf.discount, conf.batch_size, conf.learning_rate,
                    conf.max_steps, conf.update_repeat, conf.max_episodes)
        agent.run(conf.monitor, conf.display, conf.is_train)
def main(_):
    """Entry point: build environment, exploration strategy, NAF networks
    and statistics, then run the agent.

    All configuration comes from the module-level ``conf`` flags object.

    Fixes over the previous revision:
    - use the public ``env.seed(...)`` instead of the private ``env._seed(...)``
      (the underscore method bypasses gym wrappers and is not part of the API);
    - drop dead commented-out code and restore ``conf.is_train`` in place of a
      hard-coded ``True`` (the ``is_train`` flag defaults to True, so default
      behavior is unchanged, but the flag works again).
    """
    model_dir = get_model_dir(
        conf, ['is_train', 'random_seed', 'monitor', 'display', 'log_level'])
    preprocess_conf(conf)

    with tf.Session() as sess:
        # NAF requires continuous observation and action spaces.
        env = gym.make(conf.env_name)
        env.seed(conf.random_seed)

        assert isinstance(env.observation_space, gym.spaces.Box), \
            "observation space must be continuous"
        assert isinstance(env.action_space, gym.spaces.Box), \
            "action space must be continuous"

        # Exploration strategy, selected by name from the config.
        if conf.noise == 'ou':
            strategy = OUExploration(env, sigma=conf.noise_scale)
        elif conf.noise == 'brownian':
            strategy = BrownianExploration(env, conf.noise_scale)
        elif conf.noise == 'linear_decay':
            strategy = LinearDecayExploration(env)
        else:
            raise ValueError('Unkown exploration strategy: %s' % conf.noise)

        # Constructor arguments common to the prediction and target networks.
        shared_args = {
            'sess': sess,
            'input_shape': env.observation_space.shape,
            'action_size': env.action_space.shape[0],
            'hidden_dims': conf.hidden_dims,
            'use_batch_norm': conf.use_batch_norm,
            'use_seperate_networks': conf.use_seperate_networks,
            'hidden_w': conf.hidden_w,
            'action_w': conf.action_w,
            'hidden_fn': conf.hidden_fn,
            'action_fn': conf.action_fn,
            'w_reg': conf.w_reg,
        }

        logger.info("Creating prediction network...")
        pred_network = Network(scope='pred_network', **shared_args)

        logger.info("Creating target network...")
        target_network = Network(scope='target_network', **shared_args)
        # Target tracks the prediction network with soft updates of rate tau.
        target_network.make_soft_update_from(pred_network, conf.tau)

        # Checkpoint/summary bookkeeping for the prediction network weights.
        stat = Statistic(sess, conf.env_name, model_dir,
                         pred_network.variables, conf.update_repeat)

        agent = NAF(sess, env, strategy, pred_network, target_network, stat,
                    conf.discount, conf.batch_size, conf.learning_rate,
                    conf.max_steps, conf.update_repeat, conf.max_episodes)
        agent.run(conf.monitor, conf.display, conf.is_train)
def main(_):
    """Entry point: train a SoftPolicyGradient (SAC-style) agent with an
    early-exit convergence check on recent episode returns.

    Reads configuration from the module-level ``conf`` flags object.
    NOTE(review): relies on module-level ``all_epi_rewards`` (episode-return
    history) and ``time_begin`` (wall-clock start) defined elsewhere in the
    file — confirm both exist at module scope.

    Improvements over the previous revision: dead commented-out code removed,
    misleading local names fixed (the "min_5" value is actually the minimum
    over the last 20 episodes), and magic constants documented. Runtime
    behavior and all printed/saved strings are unchanged.
    """
    model_dir, data_dir = get_dirs(conf, ['exp_name'])
    preprocess_conf(conf, model_dir)

    env = gym.make(conf.env_name)
    env.seed(conf.random_seed)
    state_shape = env.observation_space.shape
    # Discrete spaces expose a count; continuous (Box) spaces expose a shape.
    if type(env.action_space) is gym.spaces.Discrete:
        action_shape = env.action_space.n
    else:
        action_shape = env.action_space.shape[0]

    # Replay buffer for off-policy training.
    buffer = ReplayBuffer2(conf.buffer_size)

    # Cap TF thread pools to keep CPU usage predictable on shared machines.
    config = tf.ConfigProto(intra_op_parallelism_threads=8,
                            inter_op_parallelism_threads=8)
    with tf.Session(config=config) as sess:
        agent = SoftPolicyGradient(sess, conf, state_shape, action_shape)

        # Checkpoint/summary bookkeeping.
        stat = Statistic(sess, conf, model_dir, data_dir)
        if conf.load_model:
            stat.load_model()

        def var_print():
            # Debug aid: dump every global TF variable.
            for var in tf.global_variables():
                print(var)

        print("printing vars:------------------------------------------------")
        var_print()
        print(
            "printing vars::------------------------------------------------")

        # Warm-up: no gradient updates before this many environment steps.
        start_steps = 1000

        episode, global_step, local_step = 0, 0, 0
        epi_rewards = 0
        total_Q, Q_loss, pi_loss = [], [], []
        state = env.reset()

        while global_step < conf.max_steps:
            # Interact with the environment (stochastic policy sample).
            action = agent.sampling_actions(
                [state], is_deterministic=False)[0]  # [-inf, inf]
            next_state, reward, done, info = env.step(
                action_converter(env, action))
            global_step += 1
            local_step += 1
            # Episode return accumulates the RAW reward; the buffer stores
            # the SCALED reward — order matters here.
            epi_rewards += reward
            reward *= conf.reward_scale
            buffer.add_transition(state, action, reward, next_state, done)
            state = next_state

            # Train step (only after warm-up and once a batch is available).
            if buffer.size() >= conf.batch_size and global_step >= start_steps:
                for i in range(conf.num_train_steps):
                    transitions = buffer.get_transitions(conf.batch_size)
                    Q, single_Q_loss, single_pi_loss = agent.trainer(
                        transitions)
                    total_Q.append(np.mean(Q))
                    Q_loss.append(single_Q_loss)
                    pi_loss.append(single_pi_loss)

            # Periodic deterministic evaluation.
            if global_step % conf.eval_interval == 0:
                ave_epi_rewards = np.mean(eval_step(env, agent))
                stat.save_step(global_step, ave_epi_rewards)
                print('\n[Evaluation] averaged_epi_rewards: %.3f'
                      % ave_epi_rewards)

            if done:
                # End of episode: record stats, check convergence, reset.
                all_epi_rewards.append(epi_rewards)
                stat.save_step(global_step, epi_rewards, np.mean(total_Q),
                               np.mean(Q_loss), np.mean(pi_loss))

                # Worst return over (up to) the last 20 episodes; despite the
                # printed label "min_5_epi_rew", the window is 20.
                num_episodes = len(all_epi_rewards)
                window_start = max(num_episodes - 20, 0)
                window_end = num_episodes
                recent_min_return = min(
                    all_epi_rewards[window_start:window_end])

                print(
                    'Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f \tmin_5_epi_rew %.1f' %
                    (episode + 1, epi_rewards, np.mean(pi_loss),
                     np.mean(Q_loss), recent_min_return))

                # Converged when >3 recent episodes all beat this return.
                threshold = -500.0
                if ((window_end - window_start) > 3
                        and recent_min_return > threshold):
                    time_end = time.time()
                    print("SHI hyperParams have made algo converge (",
                          threshold, ") in ",
                          (time_end - time_begin) / 1.0, " s")
                    stat.save_step(global_step, epi_rewards, np.mean(total_Q),
                                   np.mean(Q_loss), np.mean(pi_loss))
                    stat.save_model(global_step)
                    sys.exit()

                episode += 1
                local_step = 0
                epi_rewards = 0
                total_Q, Q_loss, pi_loss = [], [], []
                state = env.reset()
def main(_):
    """Entry point: train a SoftPolicyGradient (SAC-style) agent with a tqdm
    progress bar, periodic evaluation, and per-episode statistics.

    Reads configuration from the module-level ``conf`` flags object.
    """
    model_dir, data_dir = get_dirs(conf, ['env_name'])
    preprocess_conf(conf, model_dir)

    env = gym.make(conf.env_name)
    # env.seed(conf.random_seed)
    state_shape = env.observation_space.shape
    # Discrete spaces expose a count; continuous (Box) spaces expose a shape.
    if type(env.action_space) is gym.spaces.Discrete:
        action_shape = env.action_space.n
    else:
        action_shape = env.action_space.shape[0]

    # Replay buffer for off-policy training.
    buffer = ReplayBuffer2(conf.buffer_size)

    # Let TF place ops flexibly and grow GPU memory on demand.
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Agent under training.
        agent = SoftPolicyGradient(sess, conf, state_shape, action_shape)
        # Checkpoint/summary bookkeeping.
        stat = Statistic(sess, conf, model_dir, data_dir)
        if conf.load_model:
            stat.load_model()

        episode, global_step, local_step = 0, 0, 0
        epi_rewards = 0
        total_Q, Q_loss, pi_loss = [], [], []
        state = env.reset()
        pbar = tqdm(total=conf.max_steps, dynamic_ncols=True)

        while global_step < conf.max_steps:
            # Interact with the environment (stochastic policy sample).
            action = agent.sampling_actions([state], is_deterministic=False)[0]  # [-inf, inf]
            next_state, reward, done, info = env.step(action_converter(env, action))
            global_step += 1
            local_step += 1
            # Episode return accumulates the RAW reward; the buffer stores
            # the SCALED reward — order matters here.
            epi_rewards += reward
            reward *= conf.reward_scale
            buffer.add_transition(state, action, reward, next_state, done)
            state = next_state

            # Train step: one batch is enough to start updating.
            if buffer.size() >= conf.batch_size:
                for i in range(conf.num_train_steps):
                    transitions = buffer.get_transitions(conf.batch_size)
                    Q, single_Q_loss, single_pi_loss = agent.trainer(transitions)
                    total_Q.append(np.mean(Q))
                    Q_loss.append(single_Q_loss)
                    pi_loss.append(single_pi_loss)

            # Periodic evaluation.
            if global_step % conf.eval_interval == 0:
                ave_epi_rewards = np.mean(eval_step(env, agent))
                stat.save_step(global_step, ave_epi_rewards)
                print('\n[Evaluation] averaged_epi_rewards: %.3f' % ave_epi_rewards)

            if done:
                # End of episode: save stats, report progress, reset counters.
                # NOTE(review): if no training has happened yet, the np.mean
                # calls below see empty lists and yield nan — confirm intended.
                stat.save_step(global_step, epi_rewards, np.mean(total_Q), np.mean(Q_loss), np.mean(pi_loss))
                pbar.update(local_step)
                pbar.set_description('Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f'
                                     % (episode+1, epi_rewards, np.mean(pi_loss), np.mean(Q_loss)))
                print()
                episode += 1
                local_step = 0
                epi_rewards = 0
                total_Q, Q_loss, pi_loss = [], [], []
                state = env.reset()

        pbar.close()
# Command-line flags for training limits.
flags.DEFINE_integer('max_steps', 200, 'maximum # of steps for each episode')
flags.DEFINE_integer('update_repeat', 5, 'maximum # of q-learning updates for each step')
flags.DEFINE_integer('max_episodes', 1000, 'maximum # of episodes to train')

# Debug
flags.DEFINE_boolean('is_train', True, 'training or testing')
flags.DEFINE_integer('random_seed', 123, 'random seed')
flags.DEFINE_boolean('monitor', False, 'monitor the training or not')
flags.DEFINE_boolean('display', False, 'display the game screen or not')
flags.DEFINE_string('log_level', 'INFO', 'log level [DEBUG, INFO, WARNING, ERROR, CRITICAL]')

conf = flags.FLAGS

# ['is_train', 'random_seed', 'monitor', 'display', 'log_level'])
preprocess_conf(conf)

# NOTE(review): `env` first holds the environment id string, then is rebound
# to the gym environment object below — consider a separate `env_name`.
env = 'GazeboModularScara4DOF-v3'

# set random seed (hard-coded 123 here; ignores the random_seed flag)
tf.set_random_seed(123)
np.random.seed(123)

with tf.Session() as sess:
    # environment
    env = gym.make(env)
    # NOTE(review): `_seed` is gym's private method; the public API is
    # `env.seed(...)` — confirm this private call is intentional.
    env._seed(123)
    # learn (env,
    #         sess,
    #         conf.noise,
    #         conf.noise_scale,
    #         conf.hidden_dims,