import sys
import time

import gym
import numpy as np
import tensorflow as tf

# Project-level helpers (conf, get_dirs, preprocess_conf, ReplayBuffer2,
# SoftPolicyGradient, Statistic, eval_step, action_converter) are assumed to be
# imported from the rest of the repository.


# Variant with a warm-up period, debug variable printing, and early stopping
# once recent episode returns exceed a convergence threshold.
def main(_):
    model_dir, data_dir = get_dirs(conf, ['exp_name'])
    # exp_start_time = datetime.datetime.now().strftime("%A_%b%d-%H%M%S")
    # data_dir = "logs/" + conf.exp_name + "_" + exp_start_time
    preprocess_conf(conf, model_dir)

    env = gym.make(conf.env_name)
    env.seed(conf.random_seed)
    state_shape = env.observation_space.shape
    if isinstance(env.action_space, gym.spaces.Discrete):
        action_shape = env.action_space.n
    else:
        action_shape = env.action_space.shape[0]

    # replay buffer
    buffer = ReplayBuffer2(conf.buffer_size)

    # building agent
    # config = tf.ConfigProto(allow_soft_placement=True)
    # config.gpu_options.allow_growth = True
    config = tf.ConfigProto(intra_op_parallelism_threads=8,
                            inter_op_parallelism_threads=8)
    with tf.Session(config=config) as sess:
        # agent
        agent = SoftPolicyGradient(sess, conf, state_shape, action_shape)

        # statistic
        stat = Statistic(sess, conf, model_dir, data_dir)
        if conf.load_model:
            stat.load_model()

        # debug: list all global variables
        def var_print():
            for var in tf.global_variables():
                print(var)

        print("printing vars:------------------------------------------------")
        var_print()
        print("printing vars:------------------------------------------------")

        start_steps = 1000  # warm-up steps collected before training begins
        episode, global_step, local_step = 0, 0, 0
        epi_rewards = 0
        all_epi_rewards = []  # per-episode returns, used for the convergence check below
        total_Q, Q_loss, pi_loss = [], [], []
        state = env.reset()
        time_begin = time.time()  # wall-clock start, used in the convergence report

        # pbar = tqdm(total=conf.max_steps, dynamic_ncols=True)
        while global_step < conf.max_steps:
            # interaction with environment
            action = agent.sampling_actions(
                [state], is_deterministic=False)[0]  # [-inf, inf]
            next_state, reward, done, info = env.step(
                action_converter(env, action))
            global_step += 1
            local_step += 1
            epi_rewards += reward
            reward *= conf.reward_scale
            buffer.add_transition(state, action, reward, next_state, done)
            state = next_state

            # train step
            if buffer.size() >= conf.batch_size and global_step >= start_steps:
                for i in range(conf.num_train_steps):
                    transitions = buffer.get_transitions(conf.batch_size)
                    Q, single_Q_loss, single_pi_loss = agent.trainer(transitions)
                    total_Q.append(np.mean(Q))
                    Q_loss.append(single_Q_loss)
                    pi_loss.append(single_pi_loss)

            # evaluate step
            if global_step % conf.eval_interval == 0:
                ave_epi_rewards = np.mean(eval_step(env, agent))
                stat.save_step(global_step, ave_epi_rewards)
                print('\n[Evaluation] averaged_epi_rewards: %.3f' % ave_epi_rewards)

            if done:
                # save step
                all_epi_rewards.append(epi_rewards)
                stat.save_step(global_step, epi_rewards, np.mean(total_Q),
                               np.mean(Q_loss), np.mean(pi_loss))
                # pbar.update(local_step)

                # minimum return over the last (up to) 20 episodes
                num_episodes = len(all_epi_rewards)
                window_start = max(num_episodes - 20, 0)
                min_recent_epi_rewards = min(all_epi_rewards[window_start:])
                # pbar.set_description(
                #     'Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f avg_epi_rew %.1f' %
                #     (episode + 1, epi_rewards, np.mean(pi_loss), np.mean(Q_loss),
                #      np.mean(all_epi_rewards[window_start:])))
                print('Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f '
                      '\tmin_recent_epi_rew %.1f' %
                      (episode + 1, epi_rewards, np.mean(pi_loss), np.mean(Q_loss),
                       min_recent_epi_rewards))

                # early stopping: require at least 4 episodes in the window, all of
                # them with a return above the threshold
                threshold = -500.0
                if (num_episodes - window_start) > 3 and min_recent_epi_rewards > threshold:
                    time_end = time.time()
                    print("SHI hyperparameters made the algorithm converge "
                          "(threshold %s) in %.1f s" % (threshold, time_end - time_begin))
                    stat.save_step(global_step, epi_rewards, np.mean(total_Q),
                                   np.mean(Q_loss), np.mean(pi_loss))
                    stat.save_model(global_step)
                    sys.exit()

                episode += 1
                local_step = 0
                epi_rewards = 0
                total_Q, Q_loss, pi_loss = [], [], []
                state = env.reset()
from tqdm import tqdm  # this variant additionally reports progress with tqdm


# Variant that trains as soon as the replay buffer holds one batch and shows
# progress with a tqdm bar instead of the early-stopping check above.
def main(_):
    model_dir, data_dir = get_dirs(conf, ['env_name'])
    preprocess_conf(conf, model_dir)

    env = gym.make(conf.env_name)
    # env.seed(conf.random_seed)
    state_shape = env.observation_space.shape
    if isinstance(env.action_space, gym.spaces.Discrete):
        action_shape = env.action_space.n
    else:
        action_shape = env.action_space.shape[0]

    # replay buffer
    buffer = ReplayBuffer2(conf.buffer_size)

    # building agent
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # agent
        agent = SoftPolicyGradient(sess, conf, state_shape, action_shape)

        # statistic
        stat = Statistic(sess, conf, model_dir, data_dir)
        if conf.load_model:
            stat.load_model()

        episode, global_step, local_step = 0, 0, 0
        epi_rewards = 0
        total_Q, Q_loss, pi_loss = [], [], []
        state = env.reset()

        pbar = tqdm(total=conf.max_steps, dynamic_ncols=True)
        while global_step < conf.max_steps:
            # interaction with environment
            action = agent.sampling_actions(
                [state], is_deterministic=False)[0]  # [-inf, inf]
            next_state, reward, done, info = env.step(
                action_converter(env, action))
            global_step += 1
            local_step += 1
            epi_rewards += reward
            reward *= conf.reward_scale
            buffer.add_transition(state, action, reward, next_state, done)
            state = next_state

            # train step
            if buffer.size() >= conf.batch_size:
                for i in range(conf.num_train_steps):
                    transitions = buffer.get_transitions(conf.batch_size)
                    Q, single_Q_loss, single_pi_loss = agent.trainer(transitions)
                    total_Q.append(np.mean(Q))
                    Q_loss.append(single_Q_loss)
                    pi_loss.append(single_pi_loss)

            # evaluate step
            if global_step % conf.eval_interval == 0:
                ave_epi_rewards = np.mean(eval_step(env, agent))
                stat.save_step(global_step, ave_epi_rewards)
                print('\n[Evaluation] averaged_epi_rewards: %.3f' % ave_epi_rewards)

            if done:
                # save step
                stat.save_step(global_step, epi_rewards, np.mean(total_Q),
                               np.mean(Q_loss), np.mean(pi_loss))
                pbar.update(local_step)
                pbar.set_description(
                    'Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f' %
                    (episode + 1, epi_rewards, np.mean(pi_loss), np.mean(Q_loss)))
                print()

                episode += 1
                local_step = 0
                epi_rewards = 0
                total_Q, Q_loss, pi_loss = [], [], []
                state = env.reset()

        pbar.close()
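# Both versions of main() call two helpers that are not defined in this section:
# action_converter and eval_step. The sketch below is only an assumption about
# what they might look like for a Gym-style environment; the names and call
# signatures match the calls above, but the bodies (including the num_episodes
# default) are hypothetical rather than the repository's actual implementations.
def action_converter(env, action):
    """Hypothetical sketch: map the raw policy output to a valid env action."""
    if isinstance(env.action_space, gym.spaces.Discrete):
        # Treat the output as per-action scores and pick the highest one.
        return int(np.argmax(action))
    # Squash the unbounded output into [-1, 1], then rescale to the Box bounds.
    low, high = env.action_space.low, env.action_space.high
    return low + (np.tanh(action) + 1.0) * 0.5 * (high - low)


def eval_step(env, agent, num_episodes=5):
    """Hypothetical sketch: run deterministic episodes and return their returns."""
    returns = []
    for _ in range(num_episodes):
        state, done, total = env.reset(), False, 0.0
        while not done:
            action = agent.sampling_actions([state], is_deterministic=True)[0]
            state, reward, done, _ = env.step(action_converter(env, action))
            total += reward
        returns.append(total)
    return returns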