def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render,
          param_noise, actor, critic, normalize_returns, normalize_observations,
          critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma,
          clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps,
          batch_size, memory, tau=0.01, eval_env=None,
          param_noise_adaption_interval=50, restore=True):
    rank = MPI.COMM_WORLD.Get_rank()

    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    # max_action = env.action_space.high
    # logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape,
                 (env.action_space.shape[0],), gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 observation_range=(env.observation_space.low[0], env.observation_space.high[0]),
                 action_noise=action_noise, param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg, actor_lr=actor_lr,
                 critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up saving only for a single worker.
    savingModelPath = "/home/joel/Documents/saved_models_OpenAI_gym/"
    if rank == 0:
        saver = tf.train.Saver(keep_checkpoint_every_n_hours=1)
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        # See https://github.com/openai/baselines/issues/162#issuecomment-397356482 and
        # https://www.tensorflow.org/api_docs/python/tf/train/import_meta_graph
        if restore:  # NOTE: restoring doesn't actually work yet.
            logger.info("Restoring from saved model")
            saver = tf.train.import_meta_graph(savingModelPath + "ddpg_test_model.meta")
            saver.restore(sess, tf.train.latest_checkpoint(savingModelPath))
        else:
            logger.info("Starting from scratch!")
            # This should happen here and not in the agent, right?
            sess.run(tf.global_variables_initializer())

        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0
        t_rollout = 0
        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            start_time_epoch = time.time()
            for cycle in range(nb_epoch_cycles):
                start_time_cycle = time.time()
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):  # while (not done):
                    start_time_rollout = time.time()

                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    logging.debug("q-value of selected action: {}".format(q))
                    # np.set_printoptions(precision=3)
                    logging.debug("selected (unscaled) action: " + str(action))  # e.g. [ 0.04 -0.662 -0.538  0.324]
                    # Scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]).
                    target = scale_range(action, -1, 1, env.action_space.low, env.action_space.high)

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert target.shape == env.action_space.shape
                    new_obs, r, done, info = env.step(target)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done or t_rollout >= nb_rollout_steps - 1:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        agent.reset()
                        obs = env.reset()

                    logger.info('runtime rollout-step {0}.{1}.{2}: {3}s'.format(
                        epoch, cycle, t_rollout, time.time() - start_time_rollout))
                # end for rollout_steps

                # Train.
                logging.info("Training the agent")
                start_time_train = time.time()
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):  # e.g. 50 iterations
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()  # e.g. 0.7446093559265137
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    logging.debug("critic loss: {}".format(cl))  # e.g. 25.988863
                    logging.debug("actor loss: {}".format(al))   # e.g. -0.008966461
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        # Scale as in the rollout above; as far as DDPG is concerned,
                        # every action is in [-1, 1].
                        eval_target = scale_range(eval_action, -1, 1,
                                                  eval_env.action_space.low, eval_env.action_space.high)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(eval_target)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r
                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

                logger.info('runtime training actor & critic: {}s'.format(time.time() - start_time_train))

                # Saving the trained model.
                if saver is not None:
                    logger.info("saving the trained model")
                    start_time_save = time.time()
                    saver.save(sess, savingModelPath + "ddpg_test_model")
                    logger.info('runtime saving: {}s'.format(time.time() - start_time_save))
                done = False
                logger.info('runtime epoch-cycle {0}: {1}s'.format(cycle, time.time() - start_time_cycle))
            # end for epoch_cycles

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable-length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
                logging.info("\t{0} : {1}".format(key, combined_stats[key]))
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

            # Saving the trained model once per epoch.
            if saver is not None:
                logger.info("saving the trained model")
                start_time_save = time.time()
                saver.save(sess, savingModelPath + "ddpg_model_epochSave", global_step=epoch)
                logger.info('runtime saving: {}s'.format(time.time() - start_time_save))

            logger.info('runtime epoch {0}: {1}s'.format(epoch, time.time() - start_time_epoch))
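# The rollouts above map the policy's [-1, 1] actions onto the env's bounds
# via scale_range(), which is not defined in this snippet. A minimal sketch of
# such a helper, assuming a plain linear rescaling per action dimension (the
# name and signature follow the call site; the body is an assumption):
import numpy as np

def scale_range(x, x_min, x_max, y_min, y_max):
    """Linearly map x from [x_min, x_max] to [y_min, y_max], elementwise."""
    x = np.asarray(x, dtype=np.float64)
    return y_min + (x - x_min) * (y_max - y_min) / (x_max - x_min)

# Example: scale_range(0.0, -1, 1, low, high) gives the midpoint (low + high) / 2.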
reward_scale=reward_scale)

max_iteration = 1
step_number = []
success = []
reason = {1: 0, 2: 0, 3: 0}
with U.single_threaded_session() as sess:
    agent.initialize(sess)
    # sess.graph.finalize()
    iteration = 0
    success_number = 0
    while iteration < max_iteration:
        iteration += 1
        print(iteration)
        agent.reset()
        obs = env.reset()
        saver = tf.train.Saver()
        saver.restore(
            tf.get_default_session(),
            '/home/projectvenom/Documents/AIPilot/AIPilot-ProjectVenom-master/model_SMode/model_SMode_exp3/Exp3_SMode_best')
        done = False
        step = 0
        while not done:
            state, reward, done, dis = env.step(actiongenerator(obs))
            obs = state  # carry the new observation forward for the next action
            # print("reward: ", reward)
            # print("done: ", done)
            print("distance: ", dis['distance'])
def train(env_id, env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale,
          render, param_noise, actor, critic, normalize_returns,
          normalize_observations, critic_l2_reg, actor_lr, critic_lr,
          action_noise, popart, gamma, clip_norm, nb_train_steps,
          nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01,
          eval_env=None, param_noise_adaption_interval=50, my_render=True,
          eg_explore=True, reward_param_scaling=1.0, reward_param_thr=70,
          reward_param_type='const'):
    # Save data. ####################################
    full_path = txt_path + '_etc_RL.txt'
    file = open(full_path, 'w')
    print('Start training for env: ' + env_id)

    # Change to your dir of choice for saving.
    save_path = os.getcwd()
    print('Save data at ' + save_path + '. Change to your desired path.')
    dump_name = 'sav_ddpg_' + env_id + '.reward_' + reward_param_type + '_' + str(reward_param_scaling) + '.pkl'
    append_num = 0
    while os.path.exists(os.path.join(save_path, dump_name)):
        dump_name = 'sav_ddpg_' + env_id + '.reward_' + reward_param_type + '_' + str(reward_param_scaling) + '.' + str(append_num) + '.pkl'
        append_num += 1

    rank = MPI.COMM_WORLD.Get_rank()
    print('second rank is ', rank)
    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions. ##############danny
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver(max_to_keep=1)
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0
        epoch = 0
        start_time = time.time()

        epoch_com_sav = []
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        count_fwj = 0  # counts how many times step() has been played
        for epoch in range(nb_epochs):
            print(nb_epochs)
            # Collect data for saving/plotting.
            save_data = {'act': [], 'obs': [], 'qpos': [],
                         'rew': [],       # reward for this episode
                         'freq_com': [],  # communication frequency
                         'act_ts': [], 'obs_ts': [], 'qpos_ts': [],
                         'rew_ts': [],       # reward for this episode (test)
                         'freq_com_ts': [],  # communication frequency (test)
                         'comm_r_factor': reward_param_scaling,
                         'eplen_ts': []      # length of test episodes
                         }
            # Decay the exploration.
            e_greed = 0.5 - 0.1 * np.log10((t % 10000) + 1)
            explore_switch = (t < 20000 and eg_explore and e_greed > 0)
            print('total steps: ' + str(t) + ', eps greedy rate: ' + str(e_greed) + ', explore is ' + str(explore_switch))
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                # Init u_old; don't forget to change the test branch as well.
                u_old = 1.0 * env.action_space.sample() / max_action
                num_no_com = 0
                for t_rollout in range(nb_rollout_steps):
                    count_fwj += 1
                    print('test played is ###################', count_fwj)

                    # Predict next action (param version).
                    if len(obs) != 6:
                        obs.append(obs[2] - obs[0])
                        obs.append(obs[3] - obs[1])
                    a_raw, q = agent.pi(np.concatenate([obs, u_old], axis=0), apply_noise=False, compute_Q=True)
                    print('value of q is', q)
                    if count_fwj % 50000 == 0:
                        saver.save(sess, './home/test_3_ar.ckpt', global_step=1)
                    a0 = a_raw[0]
                    a1 = a_raw[1]

                    # Eps-greedy: flip the coin (eps decays over the first 10k updates).
                    dice_greed = np.random.uniform()
                    if explore_switch and dice_greed < e_greed:
                        com = (np.random.uniform() > 0.5)
                    else:
                        com = (a0 > a1)

                    # Pick the action according to the com switch.
                    if com:
                        r_com = 0.0
                        action = np.copy(a_raw[2:])  # no communication
                        num_no_com += 1              # no communication
                    else:
                        if reward_param_type == 'const':
                            r_com = 1.  # constant reward
                        elif reward_param_type == 'linear':
                            r_com = (1.0 / (nb_rollout_steps - reward_param_thr)) * (nb_rollout_steps - num_no_com)  # linear interp reward
                        elif reward_param_type == 'inv':
                            r_com = 1.0 / (1.0 + (np.maximum(num_no_com - reward_param_thr, 0)))  # inverse decay reward
                        else:
                            raise ValueError('no such reward type: ' + reward_param_type)
                        r_com = reward_param_scaling * r_com
                        # action = np.copy(u_old)
                        action = np.copy(a_raw[2:])
                        num_no_com += 1

                    assert action.shape == env.action_space.shape
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(action)
                    print(done)
                    file.write(str(new_obs) + ',q_value_is,' + str(q) + ',step_reward,' + str(r) + ',action used,' + str(max_action * action) + '\n')
                    t += 1
                    if rank == 0 and render:
                        pass  # rendering disabled in this variant
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(a_raw)
                    epoch_qs.append(q)
                    agent.store_transition(np.concatenate([obs, u_old], axis=0), a_raw, r + r_com,
                                           np.concatenate([np.squeeze(new_obs), action], axis=0), done)
                    obs = np.squeeze(new_obs)
                    save_data['act'].append(np.array(action))
                    save_data['obs'].append(np.array(obs))
                    if hasattr(env.unwrapped, 'data'):
                        save_data['qpos'].append(np.array(env.unwrapped.data.qpos))
                    u_old = np.copy(action)

                    if done:
                        # Episode done.
                        epoch_com_sav.append(np.asarray(1.0 * num_no_com / episode_step))
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        print('one game finished, count is =================================', env.count)
                        file.write('count is,' + str(env.count))
                        file.write('done is,' + str(done))
                        file.write('long term reward is,' + str(env.long_term_reward))
                        file.write('#' * 12 + 'one game finished\n')
                        agent.reset()
                        obs = env.reset()
                # end of nb_rollout loop
                print('communication savings: ' + str(num_no_com))  # check com number

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r
                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stuff.
            save_data['rew'].append(np.mean(epoch_episode_rewards))
            save_data['freq_com'].append(np.mean(epoch_com_sav))
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

        # ===============================================
        # Test the fully-trained agent.
        env = env.unwrapped
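# The r_com schedule above pays a communication reward that is constant,
# linearly interpolated, or inversely decayed in the number of steps since the
# last communication. A standalone sketch of the three schedules, reusing the
# snippet's parameter names (the default values here are illustrative):
import numpy as np

def comm_reward(reward_param_type, num_no_com, nb_rollout_steps=200, reward_param_thr=70):
    if reward_param_type == 'const':
        return 1.0  # constant reward
    if reward_param_type == 'linear':
        # Linear interpolation: shrinks as num_no_com grows.
        return (nb_rollout_steps - num_no_com) / (nb_rollout_steps - reward_param_thr)
    if reward_param_type == 'inv':
        # Inverse decay once num_no_com exceeds the threshold.
        return 1.0 / (1.0 + np.maximum(num_no_com - reward_param_thr, 0))
    raise ValueError('no such reward type: %s' % reward_param_type)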
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render,
          param_noise, actor, critic, normalize_returns, normalize_observations,
          critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma,
          clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps,
          batch_size, memory, tau=0.01, eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    # print(np.abs(env.action_space.low))
    # print(np.abs(env.action_space.high))
    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    print(env.action_space)
    print(env.observation_space)
    # logger.info('scaling actions by {} before executing in env'.format(max_action))
    if load_memory:
        memory = pickle.load(open("/home/vaisakhs_shaj/Desktop/BIG-DATA/memory1000000.pickle", "rb"))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    '''
    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None
    '''
    saver = tf.train.Saver()

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=10)
    with U.make_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()
        agent.reset()
        if restore:
            filename = "/home/vaisakhs_shaj/Desktop/MODEL/tfSteps" + str(120000) + ".model"
            saver.restore(sess, filename)
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()

        # Run a single episode with the restored policy.
        tr = 0
        s = 0
        while True:
            action = agent.pi(obs, apply_noise=False, compute_Q=False)[0]
            obs, r, done, info = env.step(action)
            tr = tr + r
            s = s + 1
            print(r)
            if done:
                print(tr)
                obs = env.reset()
                tr = 0
                print(s)
                break
def main():
    with U.single_threaded_session() as sess:
        batch_size = 64
        current_noise_type = 'adaptive-param_0.2'
        _, stddev = current_noise_type.split('_')
        param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                             desired_action_stddev=float(stddev))
        param_noise_adaption_interval = 2
        env = gym.make("Pendulum-v0")
        nb_actions = env.action_space.shape[-1]
        layer_norm = True

        # Configure components.
        memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                        observation_shape=env.observation_space.shape)
        critic = Critic(layer_norm=layer_norm)
        actor = Actor(nb_actions, layer_norm=layer_norm)

        # Seed everything to make things reproducible.
        seed = int(1000000 * np.random.rand())
        logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
        tf.set_random_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        env.seed(seed)

        max_action = env.action_space.high
        logger.info('scaling actions by {} before executing in env'.format(max_action))
        agent = DDPG(actor, critic, memory, env.observation_space.shape,
                     env.action_space.shape, batch_size=batch_size,
                     param_noise=param_noise)
        logger.info('Using agent with the following configuration:')
        logger.info(str(agent.__dict__.items()))

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()
        agent.reset()
        obs = env.reset()

        for t in itertools.count():
            episode_rewards = []  # per-step rewards of the current episode
            done = False
            while not done:
                env.render()
                # Take action and update exploration to the newest value.
                action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                new_obs, rew, done, _ = env.step(max_action * action)
                # Book-keeping.
                agent.store_transition(obs, action, rew, new_obs, done)
                obs = new_obs
                episode_rewards.append(rew)
                if done:
                    agent.reset()
                    obs = env.reset()

            nb_train_steps = 100
            epoch_adaptive_distances = []
            epoch_critic_losses = []
            epoch_actor_losses = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            if t % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", round(np.mean(episode_rewards), 1))
                logger.record_tabular('train/loss_actor', round(np.mean(epoch_actor_losses)))
                logger.record_tabular('train/loss_critic', round(np.mean(epoch_critic_losses)))
                logger.record_tabular('train/param_noise_distance', round(np.mean(epoch_adaptive_distances)))
                logger.dump_tabular()
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render,
          param_noise, actor, critic, normalize_returns, normalize_observations,
          critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma,
          clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps,
          batch_size, memory, teacher, tau=0.01, eval_env=True,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()
    t = datetime.now().strftime('%H-%M')
    PATH = 'results/ddpg-{}'.format(t)

    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0
        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        agent.restore_model(PATH)
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action. This variant steps with the raw action;
                    # as far as DDPG is concerned, every action is in [-1, 1].
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(action)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(eval_action)
                        eval_env.background = get_q_background(eval_env, agent.q, eval_action)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r
                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = mpi_mean(np.mean(episode_rewards_history))
            combined_stats['rollout/episode_steps'] = mpi_mean(epoch_episode_steps)
            combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)

            # Train statistics.
            combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = mpi_mean(epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = mpi_mean(np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q'] = mpi_mean(eval_qs)
                combined_stats['eval/episodes'] = mpi_mean(len(eval_episode_rewards))

            # Total statistics.
            combined_stats['total/duration'] = mpi_mean(duration)
            combined_stats['total/steps_per_second'] = mpi_mean(float(t) / float(duration))
            combined_stats['total/episodes'] = mpi_mean(episodes)
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            agent.save_model(PATH, epoch)

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
def evaluate(env, nb_episodes, reward_scale, render, param_noise, action_noise,
             actor, critic, memory, critic_l2_reg, normalize_returns=False,
             normalize_observations=True, weight_file=None):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 action_noise=action_noise, param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    with U.single_threaded_session() as sess:
        agent.initialize(sess)
        if weight_file:
            saver = tf.train.Saver(actor.trainable_vars + critic.trainable_vars)
            saver.restore(sess, weight_file)
            agent.actor_optimizer.sync()
            agent.critic_optimizer.sync()
        # sess.graph.finalize()
        agent.reset()
        obs = env.reset()

        total_reward = 0.0
        max_steps = 2000
        for ep in range(nb_episodes):
            i = 0
            done = False
            episode_reward = 0.0
            while not done and i < max_steps:
                action, q, all_actions, sample = agent.pi(obs, apply_noise=False, compute_Q=True)
                assert action.shape == env.action_space.shape
                assert max_action.shape == action.shape
                obs, r, done, info = env.step(max_action * action)
                episode_reward += r
                # env.render()
                # print('Action:{}, reward:{}'.format(action, r))
                # time.sleep(0.1)
                i += 1
            total_reward += episode_reward
            logger.info("Episode:{}, reward:{}, steps:{}".format(ep, episode_reward, i))
            if done:
                obs = env.reset()

        logger.info("Average reward:{}, total reward:{}, episodes:{}".format(
            total_reward / nb_episodes, total_reward, nb_episodes))
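# evaluate() above restores only the actor/critic trainable variables, so the
# checkpoint must have been written by a Saver built over the same variable
# list. A hedged sketch of the matching save-side call (the path is a
# placeholder, not from the source):
#
#     saver = tf.train.Saver(actor.trainable_vars + critic.trainable_vars)
#     saver.save(sess, '/path/to/ddpg_weights.ckpt')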
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render,
          param_noise, actor, critic, normalize_returns, normalize_observations,
          critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma,
          clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps,
          batch_size, memory, tau=0.01, eval_env=None,
          param_noise_adaption_interval=50):
    """
    Runs the training of the Deep Deterministic Policy Gradient (DDPG) model

    DDPG: https://arxiv.org/pdf/1509.02971.pdf

    :param env: (Gym Environment) the environment
    :param nb_epochs: (int) the number of training epochs
    :param nb_epoch_cycles: (int) the number of cycles within each epoch
    :param render_eval: (bool) enable rendering of the evaluation environment
    :param reward_scale: (float) the value the reward should be scaled by
    :param render: (bool) enable rendering of the environment
    :param param_noise: (AdaptiveParamNoiseSpec) the parameter noise type (can be None)
    :param actor: (TensorFlow Tensor) the actor model
    :param critic: (TensorFlow Tensor) the critic model
    :param normalize_returns: (bool) should the critic output be normalized
    :param normalize_observations: (bool) should the observation be normalized
    :param critic_l2_reg: (float) l2 regularizer coefficient
    :param actor_lr: (float) the actor learning rate
    :param critic_lr: (float) the critic learning rate
    :param action_noise: (ActionNoise) the action noise type (can be None)
    :param popart: (bool) enable pop-art normalization of the critic output
        (https://arxiv.org/pdf/1602.07714.pdf)
    :param gamma: (float) the discount rate
    :param clip_norm: (float) clip the gradients (disabled if None)
    :param nb_train_steps: (int) the number of training steps
    :param nb_rollout_steps: (int) the number of rollout steps
    :param nb_eval_steps: (int) the number of evaluation steps
    :param batch_size: (int) the size of the batch for learning the policy
    :param memory: (Memory) the replay buffer
    :param tau: (float) the soft update coefficient (keep old values, between 0 and 1)
    :param eval_env: (Gym Environment) the evaluation environment (can be None)
    :param param_noise_adaption_interval: (int) apply param noise every N steps
    """
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 param_noise=param_noise, action_noise=action_noise,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 enable_popart=popart, normalize_observations=normalize_observations,
                 batch_size=batch_size, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        tf.train.Saver()

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with tf_util.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        step = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for _ in range(nb_epoch_cycles):
                # Perform rollouts.
                for _ in range(nb_rollout_steps):
                    # Predict next action.
                    action, q_value = agent.policy(obs, apply_noise=True, compute_q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    # Scale for execution in env (as far as DDPG is concerned,
                    # every action is in [-1, 1]).
                    new_obs, reward, done, _ = env.step(max_action * action)
                    step += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += reward
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q_value)
                    agent.store_transition(obs, action, reward, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    critic_loss, actor_loss = agent.train()
                    epoch_critic_losses.append(critic_loss)
                    epoch_actor_losses.append(actor_loss)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for _ in range(nb_eval_steps):
                        eval_action, eval_q = agent.policy(eval_obs, apply_noise=False, compute_q=True)
                        # Scale for execution in env (as far as DDPG is concerned,
                        # every action is in [-1, 1]).
                        eval_obs, eval_r, eval_done, _ = eval_env.step(max_action * eval_action)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r
                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable-length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(step) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(scalar):
                """
                Check and return the input if it is a scalar; otherwise raise ValueError

                :param scalar: (Any) the object to check
                :return: (Number) the scalar if the input is a scalar
                """
                if isinstance(scalar, np.ndarray):
                    assert scalar.size == 1
                    return scalar[0]
                elif np.isscalar(scalar):
                    return scalar
                else:
                    raise ValueError('expected scalar, got %s' % scalar)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = step

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as file_handler:
                        pickle.dump(env.get_state(), file_handler)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as file_handler:
                        pickle.dump(eval_env.get_state(), file_handler)
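# The as_scalar()/allreduce pattern above sums every stat across MPI workers
# and divides by the world size, i.e. a distributed mean. A standalone sketch
# of the same pattern, assuming mpi4py (run with e.g. `mpirun -n 4 python demo.py`):
from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
local_stats = np.array([float(comm.Get_rank()), 1.0])  # per-worker stat values
summed = comm.allreduce(local_stats)                   # elementwise sum over all workers
mean_stats = summed / comm.Get_size()                  # the averaged stats every worker sees
print(mean_stats)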
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render,
          param_noise, actor, critic, normalize_returns, normalize_observations,
          critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma,
          clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps,
          batch_size, memory, tau=0.01, eval_env=None,
          param_noise_adaption_interval=50, **kwargs):
    # print("kwargs:", kwargs)
    rank = MPI.COMM_WORLD.Get_rank()
    print("rank:", rank)

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        # --------- AMEND: For saving and restoring the model. added by xlv ---------
        if kwargs['restore'] and kwargs['restore_path'] is not None:
            logger.info("Restoring from saved model")
            saver = tf.train.import_meta_graph(kwargs['restore_path'] + "trained_model.meta")
            saver.restore(sess, tf.train.latest_checkpoint(kwargs['restore_path']))
        else:
            logger.info("Starting from scratch!")
            sess.run(tf.global_variables_initializer())
        # ----------------------------------------------------------------------------
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = eval_obs = env.reset()
        # if eval_env is not None:
        #     eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0
        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0

        # Every 30 epochs, plot statistics and save them.
        nb_epochs_unit = 30
        ddpg_rewards = []
        eval_ddpg_rewards = []
        ddpg_suc_percents = []
        eval_suc_percents = []
        # ---- AMEND: added by xlv to calculate success percent ----
        suc_num = 0
        episode_num = 0
        # -----------------------------------------------------------
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    # Scale for execution in env (as far as DDPG is concerned,
                    # every action is in [-1, 1]).
                    # new_obs, r, done, info = env.step(max_action * action)
                    new_obs, r, done, suc, info = env.step(max_action * action)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        # --- AMEND: added by xlv to calculate success percent ---
                        episode_num += 1
                        if suc:
                            suc_num += 1
                        # ---------------------------------------------------------
                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                # eval_episode_rewards = []
                # eval_qs = []
                # if eval_env is not None:
                #     eval_episode_reward = 0.
                #     for t_rollout in range(nb_eval_steps):
                #         eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                #         eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                #         if render_eval:
                #             eval_env.render()
                #         eval_episode_reward += eval_r
                #         eval_qs.append(eval_q)
                #         if eval_done:
                #             eval_obs = eval_env.reset()
                #             eval_episode_rewards.append(eval_episode_reward)
                #             eval_episode_rewards_history.append(eval_episode_reward)
                #             eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable-length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            # if eval_env is not None:
            #     combined_stats['eval/return'] = eval_episode_rewards
            #     combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            #     combined_stats['eval/Q'] = eval_qs
            #     combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

            # ---------------- plot statistics every nb_epochs_unit ----------------
            ddpg_rewards.append(np.mean(episode_rewards_history))
            if (epoch + 1) % nb_epochs_unit == 0:
                ddpg_suc_percents.append(suc_num / episode_num)

                # ---------- Evaluate for 5 iterations ----------
                nb_eval_epochs = 5
                nb_eval_epoch_cycles = 5
                eval_episode_num = 0
                eval_suc_num = 0
                eval_episode_reward = 0
                eval_episode_step = 0
                eval_epoch_episode_rewards = []
                eval_epoch_episode_steps = []
                for i_epoch in range(nb_eval_epochs):
                    logger.log("********** Start Evaluation. Iteration %i ************" % i_epoch)
                    for i_cycle in range(nb_eval_epoch_cycles):
                        for t_rollout in range(nb_rollout_steps):
                            eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                            assert eval_action.shape == env.action_space.shape
                            # Scale for execution in env (as far as DDPG is
                            # concerned, every action is in [-1, 1]).
                            eval_obs, eval_r, eval_done, eval_suc, eval_info = env.step(max_action * eval_action)
                            eval_episode_reward += eval_r
                            eval_episode_step += 1
                            if eval_done:
                                eval_obs = env.reset()
                                eval_epoch_episode_rewards.append(eval_episode_reward)
                                eval_episode_rewards_history.append(eval_episode_reward)
                                eval_epoch_episode_steps.append(eval_episode_step)
                                eval_episode_reward = 0
                                eval_episode_step = 0
                                eval_episode_num += 1
                                if eval_suc:
                                    eval_suc_num += 1
                    logger.record_tabular("Eval_EpRewMean", np.mean(eval_episode_rewards_history))
                    logger.record_tabular("Eval_EpNumUntilNow", eval_episode_num)
                    logger.record_tabular("Eval_EpNumSuc", eval_suc_num)
                    logger.record_tabular("Eval_EpSucPercent", eval_suc_num / eval_episode_num)
                    logger.dump_tabular()
                eval_ddpg_rewards.append(np.mean(eval_episode_rewards_history))
                eval_suc_percents.append(eval_suc_num / eval_episode_num)
                # -----------------------------------------------------------------

                # --------------------- plotting and saving ---------------------
                if saver is not None:
                    logger.info("saving the trained model")
                    start_time_save = time.time()
                    if epoch + 1 == nb_epochs:
                        saver.save(sess, kwargs['MODEL_DIR'] + "/trained_model")
                    else:
                        saver.save(sess, kwargs['MODEL_DIR'] + "/iter_" + str((epoch + 1) // nb_epochs_unit))

                plot_performance(range(len(ddpg_rewards)), ddpg_rewards,
                                 ylabel=r'avg reward per DDPG learning step',
                                 xlabel='ddpg iteration',
                                 figfile=os.path.join(kwargs['FIGURE_DIR'], 'ddpg_reward'),
                                 title='TRAIN')
                plot_performance(range(len(ddpg_suc_percents)), ddpg_suc_percents,
                                 ylabel=r'overall success percentage per algorithm step under DDPG',
                                 xlabel='algorithm iteration',
                                 figfile=os.path.join(kwargs['FIGURE_DIR'], 'success_percent'),
                                 title="TRAIN")
                plot_performance(range(len(eval_ddpg_rewards)), eval_ddpg_rewards,
                                 ylabel=r'avg reward per DDPG eval step',
                                 xlabel='ddpg iteration',
                                 figfile=os.path.join(kwargs['FIGURE_DIR'], 'eval_ddpg_reward'),
                                 title='EVAL')
                plot_performance(range(len(eval_suc_percents)), eval_suc_percents,
                                 ylabel=r'overall eval success percentage per algorithm step under DDPG',
                                 xlabel='algorithm iteration',
                                 figfile=os.path.join(kwargs['FIGURE_DIR'], 'eval_success_percent'),
                                 title="EVAL")

                # Save data which is accumulated UNTIL iter i.
                with open(kwargs['RESULT_DIR'] + '/ddpg_reward_' + 'iter_' + str((epoch + 1) // nb_epochs_unit) + '.pickle', 'wb') as f2:
                    pickle.dump(ddpg_rewards, f2)
                with open(kwargs['RESULT_DIR'] + '/success_percent_' + 'iter_' + str((epoch + 1) // nb_epochs_unit) + '.pickle', 'wb') as fs:
                    pickle.dump(ddpg_suc_percents, fs)
                # Save evaluation data accumulated until iter i.
                with open(kwargs['RESULT_DIR'] + '/eval_ddpg_reward_' + 'iter_' + str((epoch + 1) // nb_epochs_unit) + '.pickle', 'wb') as f_er:
                    pickle.dump(eval_ddpg_rewards, f_er)
                with open(kwargs['RESULT_DIR'] + '/eval_success_percent_' + 'iter_' + str((epoch + 1) // nb_epochs_unit) + '.pickle', 'wb') as f_es:
                    pickle.dump(eval_suc_percents, f_es)
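# plot_performance() above comes from the surrounding codebase. A minimal
# sketch of a compatible helper, assuming matplotlib (the signature is
# inferred from the call sites above; the body is an assumption):
import matplotlib
matplotlib.use('Agg')  # headless backend, e.g. on a training server
import matplotlib.pyplot as plt

def plot_performance(x, y, ylabel, xlabel, figfile, title=''):
    fig, ax = plt.subplots()
    ax.plot(list(x), y)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    fig.savefig(figfile + '.png')  # figfile is passed without an extension
    plt.close(fig)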
class DDPGAgent(Policy):
    def __init__(self, env, agent_index, sess, action_range=(-1., 1.),
                 reward_scale=0.1, critic_l2_reg=1e-2, actor_lr=1e-4,
                 critic_lr=1e-3, popart=False, gamma=0.975, clip_norm=10,
                 batch_size=64, memory_size=1e6, tau=0.01,
                 normalize_returns=False, normalize_observations=False,
                 noise_type="adaptive-param_0.1", layer_norm=True, nb_layers=2,
                 nb_neurons=64, activation='tanh', **network_kwargs):
        super(DDPGAgent, self).__init__(agent_index)
        # self.sess = sess
        self.nb_actions = env.action_space[agent_index].n
        print('agent action_space ' + str(env.action_space[agent_index].n))
        self.state_size = env.observation_space[agent_index].shape
        self.action_range = action_range

        with tf.variable_scope('ddpg_' + str(agent_index)):
            critic = Critic(name='critic_' + str(agent_index),
                            layer_norm=layer_norm, nb_layers=nb_layers,
                            nb_neurons=nb_neurons)
            actor = Actor(self.nb_actions, name='actor_' + str(agent_index),
                          layer_norm=layer_norm, nb_neurons=nb_neurons,
                          activation=activation)
            memory = Memory(limit=int(memory_size),
                            action_shape=(self.nb_actions,),
                            observation_shape=self.state_size)

            action_noise = None
            param_noise = None
            if noise_type is not None:
                for current_noise_type in noise_type.split(','):
                    current_noise_type = current_noise_type.strip()
                    if current_noise_type == 'none':
                        pass
                    elif 'adaptive-param' in current_noise_type:
                        _, stddev = current_noise_type.split('_')
                        param_noise = AdaptiveParamNoiseSpec(
                            initial_stddev=float(stddev),
                            desired_action_stddev=float(stddev))
                    elif 'normal' in current_noise_type:
                        _, stddev = current_noise_type.split('_')
                        action_noise = NormalActionNoise(
                            mu=np.zeros(self.nb_actions),
                            sigma=float(stddev) * np.ones(self.nb_actions))
                    elif 'ou' in current_noise_type:
                        _, stddev = current_noise_type.split('_')
                        action_noise = OrnsteinUhlenbeckActionNoise(
                            mu=np.zeros(self.nb_actions),
                            sigma=float(stddev) * np.ones(self.nb_actions),
                            dt=env.world.dt, theta=0.1)
                    else:
                        raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

            self.agent = DDPG(actor, critic, memory, self.state_size,
                              (self.nb_actions,), action_range=self.action_range,
                              gamma=gamma, tau=tau,
                              normalize_returns=normalize_returns,
                              normalize_observations=normalize_observations,
                              batch_size=batch_size, action_noise=action_noise,
                              param_noise=param_noise,
                              critic_l2_reg=critic_l2_reg, actor_lr=actor_lr,
                              critic_lr=critic_lr, enable_popart=popart,
                              clip_norm=clip_norm, reward_scale=reward_scale)

        logger.info('Using agent with the following configuration:')
        logger.info(str(self.agent.__dict__.items()))
        self.agent.initialize(sess)
        self.agent.reset()

    def action(self, obs, apply_noise=False, compute_Q=False):
        if compute_Q:
            return self.agent.pi(obs, apply_noise=apply_noise, compute_Q=compute_Q)
        return self.agent.pi(obs, apply_noise=apply_noise, compute_Q=compute_Q)[0]

    def reset(self):
        return self.agent.reset()
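# Hypothetical usage of DDPGAgent above in a multi-agent loop. env is assumed
# to expose per-agent observation/action spaces as the class expects; the
# names below are illustrative, not from the source:
#
#     with U.single_threaded_session() as sess:
#         agents = [DDPGAgent(env, i, sess, noise_type='ou_0.2')
#                   for i in range(len(env.action_space))]
#         obs_n = env.reset()
#         act_n = [a.action(o, apply_noise=True) for a, o in zip(agents, obs_n)]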
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, callback=None, pretrained='none'): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Copy an env for evaluation env_eval = copy.deepcopy(env.env) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() # load pretrained agent if possible if pretrained == 'none': logger.info('Training from scratch...') else: logger.info('Loading pretrained model from {}'.format(pretrained)) #assert os.path.exists(pretrained) saver.restore(sess, pretrained) agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 total_time = 0 start_time = time.time() total_time_record = [] epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() #epochxposdict = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 total_time += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) total_time_record.append(total_time) #epochxposdict.append(info['pos'][0]) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. 
if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: # eval for one episode eval_episode_reward = 0.0 eval_done = False eval_obs = eval_env.reset() while not eval_done: eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action) eval_episode_reward += eval_r eval_qs.append(eval_q) eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) """ eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) eval_episode_reward = 0. """ # Log stats. epoch_train_duration = time.time() - epoch_start_time duration = time.time() - start_time stats = agent.get_stats() combined_stats = {} for key in sorted(stats.keys()): combined_stats[key] = mpi_mean(stats[key]) # Rollout statistics. combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = mpi_mean( np.mean(episode_rewards_history)) combined_stats['rollout/episode_steps'] = mpi_mean( epoch_episode_steps) combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes) combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions) combined_stats['rollout/actions_std'] = mpi_std(epoch_actions) combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs) # Train statistics. combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses) combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = mpi_mean( epoch_adaptive_distances) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = mpi_mean(eval_episode_rewards) combined_stats['eval/return_history'] = mpi_mean( np.mean(eval_episode_rewards_history)) combined_stats['eval/Q'] = mpi_mean(eval_qs) combined_stats['eval/episodes'] = mpi_mean( len(eval_episode_rewards)) # Total statistics. 
combined_stats['total/duration'] = mpi_mean(duration) combined_stats['total/steps_per_second'] = mpi_mean( float(t) / float(duration)) combined_stats['total/episodes'] = mpi_mean(episodes) combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) # Call the callback if callback is not None: if callback(locals(), globals()): # callback returns a boolean value break # Evaluate the policy on env to record trajs eval_rewards, eval_steps, trajs_obs, trajs_actions = evaluate( env_eval, agent=agent) if callback is not None: callback.final_call(locals(), globals())
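# Note: the `evaluate` helper called above is not defined in this snippet. As a
# hedged sketch only: a plausible implementation rolls the deterministic policy
# out for a few episodes and records trajectories, matching the return shape
# (rewards, steps, trajs_obs, trajs_actions) used above. `n_episodes`,
# `max_steps`, and `max_action` are illustrative assumptions, not the author's.
import numpy as np

def evaluate(env, agent, n_episodes=10, max_steps=1000, max_action=1.0):
    rewards, steps, trajs_obs, trajs_actions = [], [], [], []
    for _ in range(n_episodes):
        obs, ep_reward, ep_steps = env.reset(), 0., 0
        ep_obs, ep_actions = [], []
        for _ in range(max_steps):
            # deterministic action: no exploration noise at eval time
            action, _ = agent.pi(obs, apply_noise=False, compute_Q=False)
            ep_obs.append(np.array(obs))
            ep_actions.append(np.array(action))
            obs, r, done, _ = env.step(max_action * action)
            ep_reward += r
            ep_steps += 1
            if done:
                break
        rewards.append(ep_reward)
        steps.append(ep_steps)
        trajs_obs.append(np.asarray(ep_obs))
        trajs_actions.append(np.asarray(ep_actions))
    return rewards, steps, trajs_obs, trajs_actions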
def train(env_id, env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, my_render=True, eg_explore=True,reward_param_scaling=1.0, reward_param_thr = 70, reward_param_type='const'): print('Start training for env: '+env_id) #change to your dir of choice for saving save_path = os.getcwd() print('Save data at '+save_path+'. Change to your desired path.') dump_name = 'sav_ddpg_'+env_id+'.reward_'+reward_param_type+'_'+str(reward_param_scaling)+'.pkl' append_num = 0 while os.path.exists(os.path.join(save_path,dump_name)): dump_name = 'sav_ddpg_'+env_id+'.reward_'+reward_param_type+'_'+str(reward_param_scaling)+'.'+str(append_num)+'.pkl' append_num+=1 rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_com_sav = [] epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): # collect data for saving plot save_data = {'act': [], 'obs': [], 'qpos':[], 'rew':[], # reward for this episode 'freq_com':[], # communication frequency 'act_ts': [], 'obs_ts': [], 'qpos_ts': [], 'rew_ts': [], # reward for this episode 'freq_com_ts': [], # communication frequency 'comm_r_factor':reward_param_scaling, 'eplen_ts':[] # len of test episodes } # decay the exploration e_greed = 0.5 - 0.1 * np.log10( (t%10000) + 1) explore_switch = (t < 20000 and eg_explore and e_greed > 0) print('total steps: '+str(t)+', eps greedy rate: '+str(e_greed)+', explore is '+str(explore_switch)) for cycle in range(nb_epoch_cycles): # Perform rollouts. # init u_old, don't forget to change test also u_old = 1.0 * env.action_space.sample() / max_action num_no_com = 0 for t_rollout in range(nb_rollout_steps): # Predict next action. 
# edit this to be param version a_raw, q = agent.pi(np.concatenate([obs,u_old],axis=0), apply_noise=True, compute_Q=True) a0 = a_raw[0] a1 = a_raw[1] # eps greedy, flip the coin # make eps decay first 10k updates dice_greed = np.random.uniform() if explore_switch and dice_greed < e_greed: com = ( np.random.uniform() > 0.5 ) else: com = (a0 > a1) # action according to com switch if com: r_com = 0.0 action = np.copy(a_raw[2:]) #motor cmd else: if reward_param_type=='const': r_com = 1. # const reward elif reward_param_type=='linear': r_com = (1.0 / (nb_rollout_steps - reward_param_thr)) * (nb_rollout_steps - num_no_com) # linear interp reward elif reward_param_type=='inv': r_com = 1.0 / (1.0 + (np.maximum(num_no_com - reward_param_thr, 0))) # inv decay reward else: print('no such reward type!') assert 1==0 r_com = reward_param_scaling * r_com action = np.copy(u_old) num_no_com += 1 assert action.shape == env.action_space.shape assert max_action.shape == action.shape new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: pass # env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(a_raw) epoch_qs.append(q) agent.store_transition(np.concatenate([obs,u_old],axis=0), a_raw, r+r_com, np.concatenate([np.squeeze(new_obs), action],axis=0) , done) obs = np.squeeze(new_obs) save_data['act'].append(np.array(action)) save_data['obs'].append(np.array(obs)) if hasattr(env.unwrapped, 'data'): save_data['qpos'].append(np.array(env.unwrapped.data.qpos)) u_old = np.copy(action) if done: # Episode done. epoch_com_sav.append(np.asarray(1.0*num_no_com/episode_step)) epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() print('communication savings: ' + str(num_no_com)) # check com number # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) eval_episode_reward = 0. 
mpi_size = MPI.COMM_WORLD.Get_size() # log stuff save_data['rew'].append(np.mean(epoch_episode_rewards)) save_data['freq_com'].append(np.mean(epoch_com_sav)) duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = np.mean(eval_episode_rewards) combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = np.mean(eval_qs) combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s'%x) combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) ###=============================================== # test the fully-trained agent env = env.unwrapped print('*Final testing*') n_test = 1 n_ts_rollout = 500 # obs = env.env.reset() for i_test in range(n_test): if i_test%50==0: print('test iteration: '+str(i_test)) obs = env.reset() # take some actions # start with small during test time u_old = 0 * env.action_space.sample() / max_action num_no_com = 0 ts_step = 0 ts_reward = 0 for i_test_rollout in range(n_ts_rollout): # Predict next action. # edit this to be param version a_raw, q = agent.pi(np.concatenate([obs,u_old],axis=0), apply_noise=False, compute_Q=True) a0 = a_raw[0] a1 = a_raw[1] com = (a0 > a1) # action according to com switch if com: action = np.copy(a_raw[2:]) else: action = np.copy(u_old) num_no_com += 1 assert action.shape == env.action_space.shape new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) # print('Done: '+str(done)) ts_reward += r # do i really need to change this? change back to r only ts_step += 1 # record trajectory # save_data['rew'].append(np.array(r)) # need to change here, what's a good performance measure?
save_data['act_ts'].append(max_action *action) # record the actual u save_data['obs_ts'].append(np.array(obs)) u_old = np.copy(action) obs = np.copy(new_obs) # update obs # # store episode rew as performance measure # save_data['eplen_ts'].append(np.array(i_test_rollout+1)) # save_data['rew_ts'].append(np.array(ts_reward)) # save_data['freq_com_ts'].append(np.array(1.0*num_no_com/(i_test_rollout+1))) agent.reset() # doesn't matter if not stochastic # plot the trajectory ### states xs = np.asarray(save_data['obs_ts']) ths = np.arctan2(xs[:, 1], xs[:, 0]) ### control us = np.asarray(save_data['act_ts']) id_seg = 0 horz_plt = 500 plt.figure(figsize=[15, 20]) plt.subplot(211) plt.plot(ths[id_seg * horz_plt:(id_seg + 1) * horz_plt], label='th') plt.plot(xs[:, 2][id_seg * horz_plt:(id_seg + 1) * horz_plt], color='g', label='th_dot') plt.legend() plt.title('state plot') plt.subplot(212) plt.plot(us[id_seg * horz_plt:(id_seg + 1) * horz_plt], color='r') plt.title('control plot') plt.show()
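# Note: the arctan2 decode in the plot above assumes a Pendulum-style
# observation [cos(th), sin(th), th_dot]. The communication bonus r_com is the
# core idea of this variant: policy outputs a0/a1 act as a communicate/hold
# switch, and holding the previous command earns a shaped reward. A standalone
# restatement of that shaping with the same three modes as the snippet; the
# function name and signature are illustrative, not the author's:
def communication_bonus(num_no_com, nb_rollout_steps, thr=70, mode='const', scale=1.0):
    # reward for *not* sending a new motor command this step
    if mode == 'const':
        r = 1.0
    elif mode == 'linear':
        # linear interpolation over the rollout horizon, as in the snippet
        r = (1.0 / (nb_rollout_steps - thr)) * (nb_rollout_steps - num_no_com)
    elif mode == 'inv':
        # 1/(1+k) decay once the hold streak exceeds the threshold
        r = 1.0 / (1.0 + max(num_no_com - thr, 0))
    else:
        raise ValueError('no such reward type: {}'.format(mode))
    return scale * r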
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() #print(np.abs(env.action_space.low)) #print(np.abs(env.action_space.high)) #assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) if load_memory: memory=pickle.load(open("/home/vaisakhs_shaj/Desktop/BIG-DATA/memory1000000.pickle","rb")) ''' samps = memoryPrev.sample(batch_size=memoryPrev.nb_entries) print(len(samps['obs0'][1])) for i in range(memoryPrev.nb_entries): memory.append(samps['obs0'][i], samps['actions'][i], samps['rewards'][i], samps['obs1'][i], samps['terminals1'][i]) ''' print("=============memory loaded================") agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) envs = [make_env(seed) for seed in range(nproc)] envs = SubprocVecEnv(envs) ''' # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None ''' saver=tf.train.Saver() step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=10) with U.make_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() if restore: filename="/home/vaisakhs_shaj/Desktop/MODEL/tfSteps"+str(15000)+".model" saver.restore(sess,filename) print("loaded!!!!!!!!!!!!!") #p=[v.name for v in tf.all_variables()] #print(p) obs = envs.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_reward3 = 0. episode_step = 0 episode_step3 = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = deque(maxlen=10) epoch_episode_steps3 = deque(maxlen=10) epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 learning_starts = 10000 for epoch in range(nb_epochs): print("cycle-memory") print(max_action) for cycle in range(nb_epoch_cycles): print(cycle,"-",memory.nb_entries,end=" ") sys.stdout.flush() # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action = np.stack([agent.pi(obs[i], apply_noise=True, compute_Q=False)[0] for i in range(nproc)]) q = np.stack([agent.pi(obs[i], apply_noise=True, compute_Q=True)[1] for i in range(nproc)]) # action, q = agent.pi(obs, apply_noise=True, compute_Q=True) #assert action.shape == env.action_space.shape #print(i) # Execute next action in parallel. 
if rank == 0 and render: env.render() #assert max_action.shape == action.shape new_obs, r, done, info = envs.step(action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() #print(r) #print(r[1]) sys.stdout.flush() episode_reward += r[1] #episode_reward3 += r[2] episode_step += 1 #episode_step3 += 1 ''' if episode_step==300: e=episode_step re=episode_reward if episode_step>300: episode_step=e episode_reward=re ''' #print(episode_step) book_keeping_obs=obs obs = new_obs #print(envs[1]) #print(episode_reward) # Book-keeping in parallel. epoch_actions.append(np.mean(action)) epoch_qs.append(np.mean(q)) for i in range(nproc): agent.store_transition(book_keeping_obs[i], action[i], r[i], new_obs[i], done[i]) #print(done) if done[i]: # Episode done. #print("====done====",episode_reward) if i==1: epoch_episode_rewards.append(episode_reward) #print(epoch_episode_rewards) #episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. #episode_reward3 = 0 episode_step = 0 epoch_episodes += 1 episodes += 1 ''' if i==2: #epoch_episode_rewards.append(episode_reward3) #print(epoch_episode_rewards) episode_rewards_history.append(episode_reward3) epoch_episode_steps3.append(episode_step3) episode_reward3 = 0 episode_step3 = 0 ''' agent.reset() temp = envs.reset() obs[i]=temp[i] ''' Variables in TensorFlow only have values inside sessions. Once the session is over, the variables are lost. saver.save and saver.restore depend on the session and have to be inside the session. ''' # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) eval_episode_reward = 0. #print(episode_rewards_history) if (t)%20000 == 0: fname="/home/vaisakhs_shaj/Desktop/BIG-DATA/memoryStill"+str(memory.nb_entries)+".pickle" pickle.dump(memory,open(fname,"wb"),protocol=-1) if t % 5000 == 0: print("=======saving interim model==========") filename="/home/vaisakhs_shaj/Desktop/MODEL/tfSteps"+str(t)+".model" saver.save(sess,filename) mpi_size = MPI.COMM_WORLD.Get_size() # Log stats.
# XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps2'] = np.mean(epoch_episode_steps) combined_stats['rollout/episode_steps3'] = np.mean(epoch_episode_steps3) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = np.mean(eval_episode_rewards) combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = np.mean(eval_qs) combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s'%x) combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() print(logdir) if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
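# In the rollout above, agent.pi is called twice per worker observation (once
# for the action, once for the Q value), so the logged Q comes from a second,
# independently-noised forward pass that does not match the executed action,
# and each step pays double the inference cost. A single-pass sketch under the
# same per-observation agent.pi interface used by the snippet:
import numpy as np

results = [agent.pi(obs[i], apply_noise=True, compute_Q=True) for i in range(nproc)]
action = np.stack([a for a, _ in results])   # executed actions
q = np.stack([qv for _, qv in results])      # Q values of those same actions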
class DDPGEnvLearner(EnvLearner): def __init__(self, env_in): EnvLearner.__init__(self, env_in) # from baselines.ddpg.models import Actor, Critic # Parse noise_type action_noise = None param_noise = None noise_type = 'adaptive-param_0.2' layer_norm = True nb_actions = self.state_dim for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) # Configure components. self.buff_len = 10 self.buffer = deque(self.buff_init * self.buff_len, maxlen=self.buff_len) obs_space = (self.buff_init[0].size * self.buff_len, ) self.memory = Memory(limit=int(1e6), action_shape=env_in.observation_space.shape, observation_shape=obs_space) self.critic = models.Critic(layer_norm=layer_norm) self.actor = models.Actor(nb_actions, layer_norm=layer_norm) self.agent = DDPG(self.actor, self.critic, self.memory, obs_space, env_in.observation_space.shape, gamma=0.99, tau=0.01, normalize_returns=False, normalize_observations=True, batch_size=64, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=1e-2, actor_lr=1e-5, critic_lr=1e-5, enable_popart=False, clip_norm=None, reward_scale=1.) def initialize(self, session, load=False): self.sess = session if not load: self.sess.run(tf.global_variables_initializer()) self.agent.initialize(self.sess) def train(self, train, total_steps, valid=None, log_interval=10, early_stopping=-1, saver=None, save_str=None): G, yS, yR, yD, X, S, A = self.__prep_data__(train, batch_size=0) X = X[0] S = S[0] self.agent.reset() # max_action = self.env.action_space.high batch_size = 64 t = 0 episode_reward = 0 episode_step = 0 episodes = 0 epoch_episodes = 0 epoch_episode_rewards = [] nb_epoch_cycles = 10 nb_rollout_steps = 100 nb_epochs = int(len(train) / (nb_epoch_cycles * nb_rollout_steps)) nb_train_steps = total_steps param_noise_adaption_interval = 50 i = 0 for epoch in range(nb_epochs): start_time = time.time() for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. # (obs_in, action_in, _, new_obs_in, done, episode_step) = train[i] # obs = np.array([np.concatenate([obs_in/self.state_mul_const, # action_in/self.act_mul_const])]).flatten() obs = X[i] done = train[i][4] action, q = self.agent.pi(obs, apply_noise=True, compute_Q=True) r = -np.linalg.norm(S[i] / self.state_mul_const - action) / action.shape[0] # if not done and i < len(train): # new_obs = np.array([np.concatenate([new_obs_in / self.state_mul_const, # train[i][1] / self.act_mul_const])]).flatten() # else: # new_obs = np.array([np.concatenate([new_obs_in / self.state_mul_const, # np.zeros_like(action_in)])]).flatten() if i < len(train): new_obs = X[i + 1] else: new_obs = np.zeros_like(X[i]) t += 1 i += 1 episode_reward += r episode_step += 1 # Book-keeping. self.agent.store_transition(obs, action, r, new_obs, done) if done: # Episode done. 
epoch_episode_rewards.append(episode_reward) episode_reward = 0. epoch_episodes += 1 episodes += 1 self.agent.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if self.memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = self.agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = self.agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) self.agent.update_target_net() print('Epoch ' + str(epoch) + '/' + str(nb_epochs) + ' with avg rew of: ' + str(sum(epoch_episode_rewards) / len(epoch_episode_rewards)) + ' in ' + str(time.time() - start_time) + 's') if epoch % log_interval == 0 and epoch > 0: if saver is not None and save_str is not None: save_path = saver.save(self.sess, 'models/' + str(save_str) + '.ckpt') print("Model saved in path: %s" % save_path) if saver is not None and save_str is not None: save_path = saver.save(self.sess, 'models/' + str(save_str) + '.ckpt') print("Model saved in path: %s" % save_path) def step(self, obs_in, action_in, episode_step, save=True, buff=None): import copy obs = obs_in / self.state_mul_const action = action_in / self.act_mul_const if save: if episode_step == 0: self.buffer = deque(self.buff_init * self.buff_len, maxlen=self.buff_len) self.buffer.append( np.array([np.concatenate([obs, action])]).flatten()) else: if buff is None: buff = copy.copy(self.buffer) if episode_step == 0: buff = deque(self.buff_init * self.buff_len, maxlen=self.buff_len) buff.append(np.array([np.concatenate([obs, action])]).flatten()) if buff is not None: x = np.array([np.concatenate(buff).flatten()])[0] else: x = np.array([np.concatenate(self.buffer).flatten()])[0] new_obs, _ = self.agent.pi(x, apply_noise=True, compute_Q=True) return new_obs
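# DDPGEnvLearner feeds its networks a sliding window of the last buff_len
# (obs, action) frames, flattened into one fixed-size vector. A minimal
# restatement of that windowing, assuming 1-D numpy obs/action and a buff_init
# of one zero frame (as the constructor implies); sizes below are illustrative:
from collections import deque
import numpy as np

obs, action = np.zeros(4), np.zeros(2)        # example dimensions
frame = np.concatenate([obs, action])         # one (obs, action) frame
buff_len = 10
buffer = deque([np.zeros_like(frame)] * buff_len, maxlen=buff_len)
buffer.append(frame)                          # push newest frame, oldest drops out
x = np.concatenate(buffer).flatten()          # network input, shape (frame.size * buff_len,)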
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, load_network_id, latest, plot_info, tau=0.01, eval_env=None, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() if (load_network_id): agent.load_actor_critic(id=load_network_id) if (latest): agent.load_actor_critic(latest=True) obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_distances2target = [] epoch_episode_relative_alt = [] epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: logger.info('EPISODE OVER!') # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) epoch_episode_distances2target.append( info['dist2target']) epoch_episode_relative_alt.append(info['relative_alt']) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 if (episodes % 10 == 0): agent.save_actor_critic(id=episodes) if (episodes % 2 == 0 and plot_info): plot_information(epoch_episode_distances2target, epoch_episode_rewards, epoch_episode_relative_alt) plt.pause(0.1) agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. 
if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. # Update learning rates if (epoch % 5 == 0 and epoch > 0): agent.update_lr(agent.actor_lr * 0.65, agent.critic_lr * 0.65) logger.info('Finished training')
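# The block above multiplies both learning rates by 0.65 every 5 epochs, i.e.
# an exponential decay lr(e) = lr0 * 0.65**(e // 5). (agent.update_lr is this
# fork's own extension of the baselines DDPG class, not part of upstream.)
# A self-contained restatement of the schedule; the helper name is illustrative:
def decayed_lr(lr0, epoch, factor=0.65, every=5):
    return lr0 * factor ** (epoch // every)

# e.g. lr0=1e-4 gives 1e-4, 6.5e-5, 4.225e-5, ... at epochs 0, 5, 10, ...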
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, saved_model_basename, restore_model_name, crowdai_client, crowdai_token, reward_shaping, feature_embellishment, relative_x_pos, relative_z_pos, tau=0.01, eval_env=None, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. saved_model_dir = 'saved-models/' if saved_model_basename is None: saved_model_basename = ''.join( random.choices(string.ascii_lowercase + string.digits, k=8)) saved_model_path = saved_model_dir + saved_model_basename if restore_model_name: restore_model_path = restore_model_name if not pathlib.Path(restore_model_path + '.index').is_file(): restore_model_path = saved_model_dir + restore_model_name max_to_keep = 500 eval_reward_threshold_to_keep = 300 saver = tf.train.Saver(max_to_keep=max_to_keep) adam_optimizer_store = dict() adam_optimizer_store['actor_optimizer'] = dict() adam_optimizer_store['critic_optimizer'] = dict() #eval_episode_rewards_history = deque(maxlen=100) #episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: try: if restore_model_name: logger.info("Restoring from model at", restore_model_path) #saver.restore(sess, tf.train.latest_checkpoint(model_path)) saver.restore(sess, restore_model_path) else: logger.info("Creating new model") sess.run(tf.global_variables_initializer( )) # this should happen here and not in the agent right? except InvalidArgumentError as exc: if "Assign requires shapes of both tensors to match." in str(exc): print("Unable to restore model from {:s}.".format( restore_model_path)) print( "Chances are you're trying to restore a model with reward embellishment into an environment without reward embellishment (or vice versa). Unfortunately this isn't supported (yet)." ) print(exc.message) sys.exit() else: raise exc # Prepare everything. 
agent.initialize(sess) sess.graph.finalize() agent.reset() # restore adam optimizer try: if restore_model_name: logger.info("Restoring pkl file with adam state", restore_model_path) #saver.restore(sess, tf.train.latest_checkpoint(model_path)) adam_optimizer_store = pickle.load( open(restore_model_path + ".pkl", "rb")) agent.actor_optimizer.m = adam_optimizer_store[ 'actor_optimizer']['m'] agent.actor_optimizer.v = adam_optimizer_store[ 'actor_optimizer']['v'] agent.actor_optimizer.t = adam_optimizer_store[ 'actor_optimizer']['t'] agent.critic_optimizer.m = adam_optimizer_store[ 'critic_optimizer']['m'] agent.critic_optimizer.v = adam_optimizer_store[ 'critic_optimizer']['v'] agent.critic_optimizer.t = adam_optimizer_store[ 'critic_optimizer']['t'] if 'param_noise' in adam_optimizer_store: agent.param_noise = adam_optimizer_store['param_noise'] except: print("Unable to restore adam state from {:s}.".format( restore_model_path)) obs = env.reset() done = False episode_reward = 0. #episode_step = 0 #episodes = 0 #t = 0 #epoch_episode_steps = [] #epoch_episode_eval_rewards = [] #epoch_episode_eval_steps = [] #epoch_start_time = time.time() #epoch_actions = [] #epoch_episodes = 0 for epoch in range(nb_epochs): start_time = time.time() epoch_episode_rewards = [] epoch_qs = [] eval_episode_rewards = [] eval_qs = [] eval_steps = [] epoch_actor_losses = [] epoch_critic_losses = [] worth_keeping = False for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape #new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) new_obs, r, done, info = env.step(action) #t += 1 if rank == 0 and render: env.render() episode_reward += r #episode_step += 1 # Book-keeping. #epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) #episode_rewards_history.append(episode_reward) #epoch_episode_steps.append(episode_step) episode_reward = 0. #episode_step = 0 #epoch_episodes += 1 #episodes += 1 agent.reset() obs = env.reset() # Train. #epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() #epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Submit to crowdai competition. What a hack. 
#if crowdai_client is not None and crowdai_token is not None and eval_env is not None: crowdai_submit_count = 0 if crowdai_client is not None and crowdai_token is not None: eval_obs_dict = crowdai_client.env_create( crowdai_token, env_id="ProstheticsEnv") eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation( eval_obs_dict, reward_shaping=reward_shaping, reward_shaping_x=1., feature_embellishment=feature_embellishment, relative_x_pos=relative_x_pos, relative_z_pos=relative_z_pos) while True: action, _ = agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False) submit_action = prosthetics_env.openai_to_crowdai_submit_action( action) clipped_submit_action = np.clip(submit_action, 0., 1.) actions_equal = clipped_submit_action == submit_action if not np.all(actions_equal): logger.debug("crowdai_submit_count:", crowdai_submit_count) logger.debug(" openai-action:", action) logger.debug(" submit-action:", submit_action) crowdai_submit_count += 1 [eval_obs_dict, reward, done, info] = crowdai_client.env_step( clipped_submit_action.tolist(), True) #[eval_obs_dict, reward, done, info] = crowdai_client.env_step(agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False), True) eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation( eval_obs_dict, reward_shaping=reward_shaping, reward_shaping_x=1., feature_embellishment=feature_embellishment, relative_x_pos=relative_x_pos, relative_z_pos=relative_z_pos) if done: logger.debug("done: crowdai_submit_count:", crowdai_submit_count) eval_obs_dict = crowdai_client.env_reset() if not eval_obs_dict: break logger.debug( "done: eval_obs_dict exists after reset") eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation( eval_obs_dict, reward_shaping=reward_shaping, reward_shaping_x=1., feature_embellishment=feature_embellishment, relative_x_pos=relative_x_pos, relative_z_pos=relative_z_pos) crowdai_client.submit() return # kids, don't try any of these (expedient hacks) at home! if eval_env: eval_episode_reward_mean, eval_q_mean, eval_step_mean = evaluate_n_episodes( 3, eval_env, agent, nb_eval_steps, render_eval) if eval_episode_reward_mean >= eval_reward_threshold_to_keep: worth_keeping = True mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time if nb_epochs and nb_epoch_cycles and nb_train_steps > 0: #stats = agent.get_stats() #combined_stats = stats.copy() combined_stats = {} combined_stats['train/epoch_episode_reward_mean'] = np.mean( epoch_episode_rewards) #combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) #combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) #combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['train/epoch_Q_mean'] = np.mean(epoch_qs) combined_stats['train/epoch_loss_actor'] = np.mean( epoch_actor_losses) combined_stats['train/epoch_loss_critic'] = np.mean( epoch_critic_losses) #combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['train/epoch_duration'] = duration #combined_stats['epoch/steps_per_second'] = float(t) / float(duration) #combined_stats['total/episodes'] = episodes #combined_stats['rollout/episodes'] = epoch_episodes #combined_stats['rollout/actions_std'] = np.std(epoch_actions) #combined_stats['memory/rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss else: combined_stats = {} # Evaluation statistics.
if eval_env: combined_stats[ 'eval/epoch_episode_reward_mean'] = eval_episode_reward_mean # np.mean(eval_episode_rewards) #combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) #combined_stats['eval/epoch_episode_reward_std'] = np.std(eval_episode_rewards) combined_stats[ 'eval/epoch_Q_mean'] = eval_q_mean # np.mean(eval_qs) #combined_stats['eval/episodes'] = len(eval_episode_rewards) combined_stats[ 'eval/steps_mean'] = eval_step_mean # np.mean(eval_steps) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. #combined_stats['total/epochs'] = epoch + 1 #combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.info('') logger.info('Epoch', epoch) logger.dump_tabular() logdir = logger.get_dir() if worth_keeping and rank == 0 and nb_epochs and nb_epoch_cycles and nb_train_steps and nb_rollout_steps: logger.info( 'Saving model to', saved_model_dir + saved_model_basename + '-' + str(epoch)) saver.save(sess, saved_model_path, global_step=epoch, write_meta_graph=False) adam_optimizer_store['actor_optimizer'][ 'm'] = agent.actor_optimizer.m adam_optimizer_store['actor_optimizer'][ 'v'] = agent.actor_optimizer.v adam_optimizer_store['actor_optimizer'][ 't'] = agent.actor_optimizer.t adam_optimizer_store['critic_optimizer'][ 'm'] = agent.critic_optimizer.m adam_optimizer_store['critic_optimizer'][ 'v'] = agent.critic_optimizer.v adam_optimizer_store['critic_optimizer'][ 't'] = agent.critic_optimizer.t adam_optimizer_store['param_noise'] = agent.param_noise pickle.dump( adam_optimizer_store, open((saved_model_path + "-" + str(epoch) + ".pkl"), "wb")) old_epoch = epoch - max_to_keep if old_epoch >= 0: try: os.remove(saved_model_path + "-" + str(old_epoch) + ".pkl") except OSError: pass if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
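# Baselines' MpiAdam keeps its moment estimates (m, v) and step counter t in
# numpy arrays outside the TF graph, so a tf.train.Saver checkpoint alone
# restores the weights but silently resets the optimizer. The pickle dance
# above works around that; a compact round-trip using the same dict layout
# (a sketch, not the author's exact helpers):
import pickle

def save_optimizer_state(agent, path):
    state = {
        'actor_optimizer': {k: getattr(agent.actor_optimizer, k) for k in ('m', 'v', 't')},
        'critic_optimizer': {k: getattr(agent.critic_optimizer, k) for k in ('m', 'v', 't')},
        'param_noise': agent.param_noise,
    }
    with open(path, 'wb') as f:
        pickle.dump(state, f)

def load_optimizer_state(agent, path):
    with open(path, 'rb') as f:
        state = pickle.load(f)
    for name in ('actor_optimizer', 'critic_optimizer'):
        opt = getattr(agent, name)
        opt.m, opt.v, opt.t = (state[name][k] for k in ('m', 'v', 't'))
    if 'param_noise' in state:
        agent.param_noise = state['param_noise']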
def main(): args = parse_args() logger.configure() gamma = 0.99 tau = 0.01 normalize_returns = False normalize_observations = True batch_size = 64 action_noise = None stddev = 0.2 param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) critic_l2_reg = 1e-2 actor_lr = 1e-4 critic_lr = 1e-3 popart = False clip_norm = None reward_scale = 1. env = prosthetics_env.Wrapper(osim_env.ProstheticsEnv(visualize=False), frameskip=4, reward_shaping=True, reward_shaping_x=1, feature_embellishment=True, relative_x_pos=True, relative_z_pos=True) top_model_dir = 'top-models/' # create tf sessions and graphs sess_list = [] graph_list = [] for i in range(len(args.model_files)): graph_list.append(tf.Graph()) sess_list.append(tf.Session(graph=graph_list[i])) ddpg_agents = [] for i in range(len(args.model_files)): model_name = args.model_files[i] sess = sess_list[i] graph = graph_list[i] l_size = args.layer_sizes[i] with sess.as_default(): #with U.make_session(num_cpu=1, graph=g) as sess: with graph.as_default(): #tf.global_variables_initializer() # restore agents from model files and store in ddpg_agents print("Restoring from..." + model_name) # Configure components. memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(layer_norm=True, activation='relu', layer_sizes=[l_size, l_size]) actor = Actor(env.action_space.shape[-1], layer_norm=True, activation='relu', layer_sizes=[l_size, l_size]) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) # restore adam state and param noise restore_model_path = top_model_dir + model_name saver = tf.train.Saver(max_to_keep=500) # restore network weights saver.restore(sess, restore_model_path) adam_optimizer_store = pickle.load(open(restore_model_path + ".pkl", "rb")) agent.actor_optimizer.m = adam_optimizer_store['actor_optimizer']['m'] agent.actor_optimizer.v = adam_optimizer_store['actor_optimizer']['v'] agent.actor_optimizer.t = adam_optimizer_store['actor_optimizer']['t'] agent.critic_optimizer.m = adam_optimizer_store['critic_optimizer']['m'] agent.critic_optimizer.v = adam_optimizer_store['critic_optimizer']['v'] agent.critic_optimizer.t = adam_optimizer_store['critic_optimizer']['t'] if 'param_noise' in adam_optimizer_store: agent.param_noise = adam_optimizer_store['param_noise'] # intialize and prepare agent session. 
agent.initialize(sess) #sess.graph.finalize() agent.reset() ddpg_agents.append(agent) agent = BlendedAgent(ddpg_agents, sess_list, graph_list) if args.evaluation: # setup eval env eval_env = prosthetics_env.EvaluationWrapper(osim_env.ProstheticsEnv(visualize=False), frameskip=4, reward_shaping=True, reward_shaping_x=1, feature_embellishment=True, relative_x_pos=True, relative_z_pos=True) eval_env.change_model(model=('3D').upper(), prosthetic=True, difficulty=0, seed=0) eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval')) nb_eval_steps = 1000 # reward, mean_q, final_steps = evaluate_one_episode(eval_env, ddpg_agents, sess_list, graph_list, # nb_eval_steps=nb_eval_steps, # render=False) reward, mean_q, final_steps = evaluate_one_episode(eval_env, agent, nb_eval_steps, render=False) print("Reward: " + str(reward)) print("Mean Q: " + str(mean_q)) print("Final num steps: " + str(final_steps)) # Submit to crowdai competition. What a hack. :) # if crowdai_client is not None and crowdai_token is not None and eval_env is not None: crowdai_submit_count = 0 if args.crowdai_submit: remote_base = "http://grader.crowdai.org:1729" crowdai_client = Client(remote_base) eval_obs_dict = crowdai_client.env_create(args.crowdai_token, env_id="ProstheticsEnv") eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation( eval_obs_dict, reward_shaping=True, reward_shaping_x=1., feature_embellishment=True, relative_x_pos=True, relative_z_pos=True) while True: action, _ = agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False) submit_action = prosthetics_env.openai_to_crowdai_submit_action(action) clipped_submit_action = np.clip(submit_action, 0., 1.) actions_equal = clipped_submit_action == submit_action if not np.all(actions_equal): logger.debug("crowdai_submit_count:", crowdai_submit_count) logger.debug(" openai-action:", action) logger.debug(" submit-action:", submit_action) crowdai_submit_count += 1 [eval_obs_dict, reward, done, info] = crowdai_client.env_step(clipped_submit_action.tolist(), True) # [eval_obs_dict, reward, done, info] = crowdai_client.env_step(agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False), True) eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation( eval_obs_dict, reward_shaping=True, reward_shaping_x=1., feature_embellishment=True, relative_x_pos=True, relative_z_pos=True) if done: logger.debug("done: crowdai_submit_count:", crowdai_submit_count) eval_obs_dict = crowdai_client.env_reset() if not eval_obs_dict: break logger.debug("done: eval_obs_dict exists after reset") eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation( eval_obs_dict, reward_shaping=True, reward_shaping_x=1., feature_embellishment=True, relative_x_pos=True, relative_z_pos=True) crowdai_client.submit() for i in range(len(sess_list)): sess_list[i].close()
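# BlendedAgent is used above but not defined in this snippet. One plausible
# minimal implementation -- query each per-graph agent inside its own
# session/graph pair and average the actions -- is sketched here; the real
# class may weight or select among members differently:
import numpy as np

class BlendedAgent(object):
    def __init__(self, agents, sess_list, graph_list):
        self.agents = agents
        self.sess_list = sess_list
        self.graph_list = graph_list

    def pi(self, obs, apply_noise=False, compute_Q=False):
        actions, qs = [], []
        for agent, sess, graph in zip(self.agents, self.sess_list, self.graph_list):
            # each member lives in its own graph, so switch context per call
            with sess.as_default(), graph.as_default():
                a, q = agent.pi(obs, apply_noise=apply_noise, compute_Q=compute_Q)
            actions.append(a)
            qs.append(q)
        q_out = np.mean(qs) if compute_Q else None
        return np.mean(actions, axis=0), q_out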
def __init__( self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, # ddpg related params layer_norm=False, tau=0.001, normalize_returns=False, normalize_observations=True, batch_size=128, critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, popart=False, clip_norm=10., reward_scale=1.): sess = tf.get_default_session() act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, reuse=False) train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, reuse=True) # init DDPG critic = Critic(layer_norm=layer_norm) actor = Actor(ac_space.shape[-1], layer_norm=layer_norm) memory = Memory(limit=int(1e6), action_shape=ac_space.shape, observation_shape=ob_space.shape) ddpg_agent = DDPG(actor, critic, memory, ob_space.shape, ac_space.shape, gamma=0.99, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=None, param_noise=None, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) ddpg_agent.initialize(sess) ddpg_agent.reset() A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) OLDVPRED = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) CLIPRANGE = tf.placeholder(tf.float32, []) if use_annealing: DDPG_AC = tf.placeholder(tf.float32, (None, ) + ac_space.shape) DDPG_W = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) entropy = tf.reduce_mean(train_model.pd.entropy()) vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) vf_losses1 = tf.square(vpred - R) vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) if use_annealing: pi_mean = train_model.pi ac_loss = tf.reduce_mean(tf.square(pi_mean - DDPG_AC)) # loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # ----------------- DDPG ----------------- if use_ddpg: loss = pg_loss - entropy * ent_coef if use_annealing: loss = pg_loss - entropy * ent_coef + ac_loss * DDPG_W else: loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # ----------------- DDPG ----------------- with tf.variable_scope('model'): params = tf.trainable_variables() grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) _train = trainer.apply_gradients(grads) def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None, ddpg_acs=None, ddpg_w=0.): advs = returns - values advs = (advs - advs.mean()) / (advs.std() + 1e-8) if not use_annealing: td_map = { train_model.X: obs, A: actions, ADV: advs, R: returns, LR: lr, CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values } else: td_map = { train_model.X: obs, A: actions, ADV: advs, R: returns, LR: lr, CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values, DDPG_AC: 
ddpg_acs, DDPG_W: ddpg_w } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks if not use_annealing: return sess.run( [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], td_map)[:-1] else: return sess.run([ pg_loss, vf_loss, entropy, approxkl, clipfrac, ac_loss, _train ], td_map)[:-1] if not use_annealing: self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] else: self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac', 'ac_loss' ] def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.agent = ddpg_agent self.save = save self.load = load tf.global_variables_initializer().run(session=sess) #pylint: disable=E1101
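# The DDPG_W placeholder lets the caller anneal the imitation term
# ac_loss = mean((pi_mean - DDPG_AC)^2) from "follow the DDPG teacher" down to
# pure PPO as training progresses. The schedule itself lives in the training
# script and is not shown here; a typical linear ramp-down (illustrative only):
def ddpg_weight(update, total_updates, w_start=1.0, w_end=0.0):
    frac = min(float(update) / float(total_updates), 1.0)
    return w_start + frac * (w_end - w_start)

# usage sketch: model.train(lr, cliprange, obs, returns, masks, actions, values,
#                           neglogpacs, ddpg_acs=teacher_actions,
#                           ddpg_w=ddpg_weight(update, total_updates))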
def train(env, nb_epochs, nb_episodes, nb_epoch_cycles, episode_length, nb_train_steps, eval_freq, save_freq, nb_eval_episodes, actor, critic, memory, gamma, normalize_returns, normalize_observations, critic_l2_reg, action_noise, param_noise, popart, clip_norm, batch_size, reward_scale, action_repeat, full, exclude_centering_frame, visualize, fail_reward, num_processes, num_processes_to_wait, num_testing_processes, learning_session, min_buffer_length, integrator_accuracy=5e-5, max_env_traj=100, tau=0.01): """ Parameters ---------- nb_epochs : the number of epochs to train. nb_episodes : the number of episodes for each epoch. episode_length : the maximum number of steps for each episode. gamma : discount factor. tau : soft update coefficient. clip_norm : clip on the norm of the gradient. """ assert action_repeat > 0 assert nb_episodes >= num_processes # Get params from learning session checkpoint_dir = learning_session.checkpoint_dir log_dir = learning_session.log_dir training_step = learning_session.last_training_step # Initialize DDPG agent (target network and replay buffer) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=None, critic_l2_reg=critic_l2_reg, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale, training_step=training_step) # We need max_action because the NN output layer is a tanh. # So we must scale it back. max_action = env.action_space.high # Build Workers events = [Event() for _ in range(num_processes)] inputQs = [Queue() for _ in range(num_processes)] outputQ = Queue() # Split work among workers nb_episodes_per_worker = nb_episodes // num_processes workers = [ SamplingWorker(i, actor, critic, episode_length, nb_episodes_per_worker, action_repeat, max_action, gamma, tau, normalize_returns, batch_size, normalize_observations, param_noise, critic_l2_reg, popart, clip_norm, reward_scale, events[i], inputQs[i], outputQ, full, exclude_centering_frame, integrator_accuracy, max_env_traj, visualize, fail_reward) for i in range(num_processes) ] # Run the Workers for w in workers: w.start() # Create Round Robin tester tester = RoundRobinTester( num_testing_processes, actor, critic, episode_length, nb_eval_episodes, action_repeat, max_action, gamma, tau, normalize_returns, batch_size, normalize_observations, critic_l2_reg, popart, clip_norm, reward_scale, full, exclude_centering_frame, integrator_accuracy, max_env_traj, visualize, fail_reward) # Start training loop with U.single_threaded_session() as sess: agent.initialize(sess) writer = tf.summary.FileWriter(log_dir) writer.add_graph(sess.graph) # Initialize writer and statistics stats = EvaluationStatistics(tf_session=sess, tf_writer=writer) # setup saver saver = tf.train.Saver(max_to_keep=10, keep_checkpoint_every_n_hours=2) get_parameters = U.GetFlat(actor.trainable_vars) global_step = 0 obs = env.reset() agent.reset() # Processes waiting for a new sampling task waiting_indices = [i for i in range(num_processes)] for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # If we have sampling workers waiting, dispatch a sampling job if waiting_indices: actor_ws = get_parameters() # Run parallel sampling for i in waiting_indices: inputQs[i].put(('sample', actor_ws)) events[i].set() # Notify worker: sample baby, sample! 
waiting_indices.clear() # Collect results when ready for i in range(num_processes_to_wait): process_index, transitions = outputQ.get() waiting_indices.append(process_index) print('Collecting transition samples from Worker {}/{}'. format(i + 1, num_processes_to_wait)) for t in transitions: agent.store_transition(*t) # try to collect other samples if available for i in range(num_processes): try: process_index, transitions = outputQ.get_nowait() if process_index not in waiting_indices: waiting_indices.append(process_index) print('Collecting transition samples from Worker {}'. format(process_index)) for t in transitions: agent.store_transition(*t) except queue.Empty: # No sampling ready, keep on training. pass # Training phase if agent.memory.nb_entries > min_buffer_length: for _ in range(nb_train_steps): critic_loss, actor_loss = agent.train() agent.update_target_net() # Plot statistics stats.add_critic_loss(critic_loss, global_step) stats.add_actor_loss(actor_loss, global_step) global_step += 1 # Evaluation phase if cycle % eval_freq == 0: print("Cycle number: ", cycle + epoch * nb_epoch_cycles) print("Sending testing job...") actor_ws = get_parameters() # Send a testing job tester.test(actor_ws, global_step) # Print stats (if any) tester.log_stats(stats, logger) if cycle % save_freq == 0: # Save weights save_path = saver.save(sess, checkpoint_dir, global_step=global_step) print("Model saved in path: %s" % save_path) # Dump learning session learning_session.dump(agent.training_step) print("Learning session dumped to: %s" % str(learning_session.session_path)) else: print("Not enough entry in memory buffer") # Stop workers for i in range(num_processes): inputQs[i].put(('exit', None)) events[i].set() # Notify worker: exit! tester.close() # Stop testing workers env.close()
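# The trainer above talks to each SamplingWorker over an (event, inputQ,
# outputQ) triple: it pushes ('sample', actor_weights) or ('exit', None) and
# sets the event; the worker wakes, rolls out, and pushes (index, transitions)
# back. A skeleton of the worker side under those assumptions (the real
# SamplingWorker also rebuilds the env and actor network; rollout_fn here is a
# stand-in):
def worker_loop(index, event, inputQ, outputQ, rollout_fn):
    while True:
        event.wait()        # sleep until the trainer signals new work
        event.clear()
        cmd, payload = inputQ.get()
        if cmd == 'exit':
            break
        if cmd == 'sample':
            transitions = rollout_fn(payload)   # payload = flat actor weights
            outputQ.put((index, transitions))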
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. 
for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) eval_episode_reward = 0. mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s'%x) combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
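# -- Sketch: what `tau` does. --
# Every variant in this file passes tau=0.01 and calls agent.update_target_net();
# in DDPG that is the soft (Polyak) target update. A minimal numpy sketch of the
# rule theta_target <- tau * theta + (1 - tau) * theta_target:
import numpy as np

def soft_update(target_params, source_params, tau=0.01):
    # Polyak-average the source parameters into the targets, in place.
    for target, source in zip(target_params, source_params):
        target *= (1.0 - tau)
        target += tau * source

target, source = [np.zeros(3)], [np.ones(3)]
soft_update(target, source, tau=0.01)
print(target[0])  # -> [0.01 0.01 0.01]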
def train_return(env, param_noise, actor, critic, memory, nb_epochs=250, nb_epoch_cycles=20, reward_scale=1., render=False, normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, action_noise=None, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50, nb_rollout_steps=2048, batch_size=64, tau=0.01, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space.high agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) # Set up logging stuff only for a single worker. episode_rewards_history = deque(maxlen=100) sess = U.single_threaded_session() # create the session explicitly so `sess` is defined below; the caller is responsible for closing it # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): print('epoch number:', epoch) for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() return agent
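# -- Sketch: calling train_return. --
# Assumes the old baselines module layout (baselines.ddpg.memory / models);
# the environment id is illustrative. A session is created inside
# train_return, so no surrounding session context is required.
import gym
from baselines.ddpg.memory import Memory
from baselines.ddpg.models import Actor, Critic

env = gym.make('Pendulum-v0')
nb_actions = env.action_space.shape[-1]
memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
agent = train_return(env, param_noise=None,
                     actor=Actor(nb_actions, layer_norm=True),
                     critic=Critic(layer_norm=True),
                     memory=memory, nb_epochs=10)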
class DDPGAgent(BaseAgent): """A Deep Deterministic Policy Gradient implementation of an SC2 agent.""" def __init__(self): super(DDPGAgent, self).__init__() def setup(self, obs_shape, nb_actions, action_spec, noise_type, gamma=1., tau=0.01, layer_norm=True): super(DDPGAgent, self).setup(obs_shape, nb_actions, action_spec, noise_type, gamma, tau, layer_norm) self.action_spec_internal = action_spec self.obs_dim = obs_shape action_noise = None param_noise = None # Parse noise_type for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) # Configure components. self.memory = Memory(limit=int(500), action_shape=(nb_actions, ), observation_shape=obs_shape) self.critic = Critic(layer_norm=layer_norm, hidden_size=128) self.actor = Actor(nb_actions, layer_norm=layer_norm, hidden_size=128) tf.reset_default_graph() # max_action = env.action_space.high self.ddpg = DDPG(actor=self.actor, critic=self.critic, memory=self.memory, observation_shape=obs_shape, action_shape=(nb_actions, ), gamma=gamma, tau=tau, action_noise=action_noise, param_noise=param_noise) def step(self, obs): super(DDPGAgent, self).step(obs) acts, q = self.ddpg.pi(obs, apply_noise=True, compute_Q=True) # Map the tanh output from [-1, 1] onto [0, 1], inverting the sign (acts = 1 -> 0, acts = -1 -> 1) actions_z = (2 - (acts + 1)) / 2 return actions_z, q def reset(self): super(DDPGAgent, self).reset() self.ddpg.reset() def initialize(self, sess): super(DDPGAgent, self).initialize(sess) self.ddpg.initialize(sess) def store_transition(self, obs, action, r, new_obs, done): super(DDPGAgent, self).store_transition(obs, action, r, new_obs, done) self.ddpg.store_transition(obs, action, r, new_obs, done) def train(self): super(DDPGAgent, self).train() return self.ddpg.train() def adapt_param_noise(self): super(DDPGAgent, self).adapt_param_noise() return self.ddpg.adapt_param_noise() def backprop(self): super(DDPGAgent, self).backprop() self.ddpg.update_target_net() def get_memory_size(self): super(DDPGAgent, self).get_memory_size() return self.memory.nb_entries @property def action_spec(self): return self.action_spec_internal @property def obs_shape(self): return self.obs_dim
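# -- Note: noise_type spec strings accepted by DDPGAgent.setup above. --
# The parser splits on commas, so action noise and parameter noise can be combined:
#   'none'                      -> no exploration noise
#   'normal_0.1'                -> Gaussian action noise, sigma = 0.1
#   'ou_0.2'                    -> Ornstein-Uhlenbeck action noise, sigma = 0.2
#   'adaptive-param_0.2'        -> adaptive parameter-space noise
#   'ou_0.2,adaptive-param_0.2' -> both at once
# Illustrative call (obs_shape and action_spec values are placeholders):
#   agent = DDPGAgent()
#   agent.setup(obs_shape=(16,), nb_actions=4, action_spec=None,
#               noise_type='ou_0.2,adaptive-param_0.2')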
def train(env, nb_epochs, nb_episodes, episode_length, nb_train_steps, eval_freq, nb_eval_episodes, actor, critic, memory, gamma, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, clip_norm, batch_size, reward_scale, tau=0.01): """ Parameters ---------- nb_epochs : the number of epochs to train. nb_episodes : the number of episodes for each epoch. episode_length : the maximum number of steps for each episode. gamma : discount factor. tau : soft update coefficient. clip_norm : clip on the norm of the gradient. """ # Initialize DDPG agent (target network and replay buffer) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=None, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) # We need max_action because the NN output layer is a tanh. # So we must scale it back. max_action = env.action_space.high with U.single_threaded_session() as sess: agent.initialize(sess) # Setup summary writer writer = _setup_tf_summary() writer.add_graph(sess.graph) stats = EvaluationStatistics(tf_session=sess, tf_writer=writer) sess.graph.finalize() global_step = 0 obs = env.reset() agent.reset() for epoch in range(nb_epochs): for episode in range(nb_episodes): obs = env.reset() # Generate a trajectory for t in range(episode_length): # Select action a_t according to current policy and # exploration noise a_t, _ = agent.pi(obs, apply_noise=True, compute_Q=False) assert a_t.shape == env.action_space.shape # Execute action a_t and observe reward r_t and next state s_{t+1} new_obs, r_t, done, info = env.step(max_action * a_t) # Store transition in the replay buffer agent.store_transition(obs, a_t, r_t, new_obs, done) obs = new_obs if done: agent.reset() obs = env.reset() break # End episode # Training phase for t_train in range(nb_train_steps): critic_loss, actor_loss = agent.train() agent.update_target_net() # Plot statistics stats.add_critic_loss(critic_loss, global_step) stats.add_actor_loss(actor_loss, global_step) global_step += 1 # Evaluation phase if episode % eval_freq == 0: # Generate evaluation trajectories for eval_episode in range(nb_eval_episodes): obs = env.reset() for t in range(episode_length): env.render() # Select action a_t according to current policy and # exploration noise a_t, _ = agent.pi(obs, apply_noise=False, compute_Q=False) assert a_t.shape == env.action_space.shape # Execute action a_t and observe reward r_t and next state s_{t+1} obs, r_t, eval_done, info = env.step(max_action * a_t) stats.add_reward(r_t) if eval_done: obs = env.reset() break # Plot average reward stats.plot_reward(global_step)
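# -- Sketch: the EvaluationStatistics interface assumed above. --
# The class itself is not defined in this file; judging from the call sites
# (add_critic_loss / add_actor_loss / add_reward / plot_reward), it logs scalars
# to TensorBoard. A minimal TF1-style stand-in whose method names mirror those call sites:
import tensorflow as tf

class EvaluationStatisticsSketch(object):
    def __init__(self, tf_session, tf_writer):
        self.sess, self.writer = tf_session, tf_writer
        self._rewards = []

    def _scalar(self, tag, value, step):
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=float(value))])
        self.writer.add_summary(summary, step)

    def add_critic_loss(self, loss, step):
        self._scalar('train/critic_loss', loss, step)

    def add_actor_loss(self, loss, step):
        self._scalar('train/actor_loss', loss, step)

    def add_reward(self, r):
        self._rewards.append(r)

    def plot_reward(self, step):
        # Average the rewards accumulated since the last call, then reset.
        if self._rewards:
            self._scalar('eval/mean_reward', sum(self._rewards) / len(self._rewards), step)
            self._rewards = []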
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, perform=False, expert=None, save_networks=False, supervise=False, pre_epoch=60, actor_only=False, critic_only=False, both_ours_sup=False, gail=False, pofd=False): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale, expert=expert, save_networks=save_networks, supervise=supervise, actor_only=actor_only, critic_only=critic_only, both_ours_sup=both_ours_sup, gail=gail, pofd=pofd) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. network_saving_dir = os.path.join('./saved_networks', env.env.spec.id) + '/' if not os.path.exists(network_saving_dir): os.makedirs(network_saving_dir) agent.initialize(sess, saver, network_saving_dir, 10000, 30000) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() if expert is None: pretrain = False else: pretrain = True done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 small_buffer = [] big_buffer = [] for epoch in range(nb_epochs): if epoch >= pre_epoch and pretrain: pretrain = False logger.info('Stopped pretraining at epoch {}'.format(epoch)) for cycle in range(nb_epoch_cycles): if not perform: # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0.
episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train(pretrain) epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): old_eval_obs = eval_obs eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if perform: small_buffer.append([ old_eval_obs, eval_action, eval_r, eval_obs, eval_done ]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. if perform and len(small_buffer) > 0: big_buffer.append(small_buffer) small_buffer = [] if len(big_buffer ) > 0 and len(big_buffer) % 1000 == 0: expert_dir = os.path.join( './expert', env.env.spec.id) + '/' if not os.path.exists(expert_dir): os.makedirs(expert_dir) pwritefile = open( os.path.join(expert_dir, 'expert.pkl'), 'wb') pickle.dump(big_buffer, pwritefile, -1) pwritefile.close() logger.info('Expert data saved!') return # Log stats. epoch_train_duration = time.time() - epoch_start_time duration = time.time() - start_time combined_stats = {} if not perform: stats = agent.get_stats() for key in sorted(stats.keys()): combined_stats[key] = mpi_mean(stats[key]) # Rollout statistics. if not perform: combined_stats['rollout/return'] = mpi_mean( epoch_episode_rewards) combined_stats['rollout/return_history'] = mpi_mean( np.mean(episode_rewards_history)) combined_stats['rollout/episode_steps'] = mpi_mean( epoch_episode_steps) combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes) combined_stats['rollout/actions_mean'] = mpi_mean( epoch_actions) combined_stats['rollout/actions_std'] = mpi_std(epoch_actions) combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs) # Train statistics. combined_stats['train/loss_actor'] = mpi_mean( epoch_actor_losses) combined_stats['train/loss_critic'] = mpi_mean( epoch_critic_losses) combined_stats['train/param_noise_distance'] = mpi_mean( epoch_adaptive_distances) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = mpi_mean(eval_episode_rewards) combined_stats['eval/return_history'] = mpi_mean( np.mean(eval_episode_rewards_history)) combined_stats['eval/Q'] = mpi_mean(eval_qs) combined_stats['eval/episodes'] = mpi_mean( len(eval_episode_rewards)) if not perform: # Total statistics. 
combined_stats['total/duration'] = mpi_mean(duration) combined_stats['total/steps_per_second'] = mpi_mean( float(t) / float(duration)) combined_stats['total/episodes'] = mpi_mean(episodes) combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
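# -- Sketch: reading the expert data written by the `perform` branch above. --
# big_buffer is pickled as a list of trajectories, each a list of
# [obs, action, reward, next_obs, done] entries. The path below is illustrative.
import pickle

def load_expert_trajectories(path='./expert/SomeEnv-v1/expert.pkl'):
    with open(path, 'rb') as f:
        trajectories = pickle.load(f)
    n = sum(len(traj) for traj in trajectories)
    print('loaded {} trajectories / {} transitions'.format(len(trajectories), n))
    return trajectories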
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, save_path=None, restore_path=None, hindsight_mode=None): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. transitions = [] for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) transitions.append((obs, action, r, new_obs, done)) #agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # store regular transitions into replay memory for (obs, action, r, new_obs, done) in transitions: agent.store_transition(obs, action, r, new_obs, done) if hindsight_mode in ['final', 'future']: for (obs, action, r, new_obs, done) in replay_final(transitions, env.env): agent.store_transition(obs, action, r, new_obs, done) if hindsight_mode in ['future']: for (obs, action, r, new_obs, done) in replay_future(transitions, env.env): agent.store_transition(obs, action, r, new_obs, done) # store hindsight transitions. 
'''for i in range(3): # sample a random point in the trajectory idx = np.random.randint(0, len(transitions)) obs, action, r, new_obs, done = transitions[idx] # create a goal from that point goal = env.env.obs_to_goal(new_obs) for (obs, action, r, new_obs, done) in replay_with_goal(transitions[:idx+1], goal, env.env): agent.store_transition(obs, action, r, new_obs, done) obs, action, r, new_obs, done = transitions[-1] # store a "final" transition. goal = env.env.obs_to_goal(new_obs) for (obs, action, r, new_obs, done) in replay_with_goal(transitions, goal, env.env): agent.store_transition(obs, action, r, new_obs, done)''' # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. # Log stats. epoch_train_duration = time.time() - epoch_start_time duration = time.time() - start_time stats = agent.get_stats() combined_stats = {} for key in sorted(stats.keys()): combined_stats[key] = mpi_mean(stats[key]) # Rollout statistics. combined_stats['reward'] = mpi_mean(epoch_episode_rewards) # combined_stats['rollout/return_history'] = mpi_mean(np.mean(episode_rewards_history)) combined_stats['episode_steps'] = mpi_mean(epoch_episode_steps) combined_stats['episodes'] = mpi_sum(epoch_episodes) # combined_stats['actions_mean'] = mpi_mean(epoch_actions) combined_stats['actions_std'] = mpi_std(epoch_actions) combined_stats['Q_mean'] = mpi_mean(epoch_qs) # Train statistics. combined_stats['policy_loss'] = mpi_mean(epoch_actor_losses) combined_stats['value_loss'] = mpi_mean(epoch_critic_losses) combined_stats['param_noise_distance'] = mpi_mean( epoch_adaptive_distances) # Evaluation statistics. if eval_env is not None: combined_stats['eval/reward'] = mpi_mean(eval_episode_rewards) # combined_stats['eval/return_history'] = mpi_mean(np.mean(eval_episode_rewards_history)) combined_stats['eval/Q_mean'] = mpi_mean(eval_qs) # combined_stats['eval/episodes'] = mpi_mean(len(eval_episode_rewards)) # Total statistics. 
# combined_stats['total/duration'] = mpi_mean(duration) combined_stats['total/steps_per_second'] = mpi_mean( float(t) / float(duration)) # combined_stats['total/episodes'] = mpi_mean(episodes) # combined_stats['total/epochs'] = epoch + 1 # combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
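# -- Sketch: a plausible replay_final, which is called above but not defined here. --
# Based on the commented-out hindsight block, the 'final' strategy relabels a
# trajectory with the goal actually reached at its last step. obs_to_goal comes
# from the original code; compute_reward is a hypothetical goal-conditioned
# reward hook, not a name from this file.
def replay_final_sketch(transitions, env):
    if not transitions:
        return []
    _, _, _, final_obs, _ = transitions[-1]
    goal = env.obs_to_goal(final_obs)  # the goal achieved at episode end
    relabeled = []
    for obs, action, r, new_obs, done in transitions:
        r_new = env.compute_reward(new_obs, goal)  # hypothetical reward hook
        relabeled.append((obs, action, r_new, new_obs, done))
    return relabeled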
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, overwrite_memory, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, logdir, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, eval_jump, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, agentName=None, resume=0, max_to_keep=100): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver(max_to_keep=max_to_keep) else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) logF = open(os.path.join(logdir, 'log.txt'), 'a') logStats = open(os.path.join(logdir, 'log_stats.txt'), 'a') logReward = open(os.path.join(logdir, 'logReward.txt'), 'a') with U.single_threaded_session() as sess: # Prepare everything. if (resume == 0): agent.initialize(sess, max_to_keep=max_to_keep) else: #restore = "{}-{}".format(agentName,resume) agent.initialize(sess, path=os.path.abspath(logdir), restore=agentName, itr=resume, overwrite=overwrite_memory, max_to_keep=max_to_keep) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(resume, resume + nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. print("Epoch " + str(epoch) + " episodes " + str(episodes) + " steps " + str(episode_step) + " reward " + str(episode_reward)) epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. 
epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None and epoch % eval_jump == 0: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: print("Eval reward " + str(eval_episode_reward)) eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. # Log stats. epoch_train_duration = time.time() - epoch_start_time duration = time.time() - start_time stats = agent.get_stats() combined_stats = {} for key in sorted(stats.keys()): combined_stats[key] = mpi_mean(stats[key]) # Rollout statistics. combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = mpi_mean( np.mean(episode_rewards_history)) combined_stats['rollout/episode_steps'] = mpi_mean( epoch_episode_steps) combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes) combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions) combined_stats['rollout/actions_std'] = mpi_std(epoch_actions) combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs) # Train statistics. combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses) combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = mpi_mean( epoch_adaptive_distances) # Evaluation statistics. if eval_env is not None and epoch % eval_jump == 0: combined_stats['eval/return'] = mpi_mean(eval_episode_rewards) combined_stats['eval/return_history'] = mpi_mean( np.mean(eval_episode_rewards_history)) combined_stats['eval/Q'] = mpi_mean(eval_qs) combined_stats['eval/episodes'] = mpi_mean( len(eval_episode_rewards)) # Total statistics. 
combined_stats['total/duration'] = mpi_mean(duration) combined_stats['total/steps_per_second'] = mpi_mean( float(t) / float(duration)) combined_stats['total/episodes'] = mpi_mean(episodes) combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') # logdir = logger.get_dir() if rank == 0: logReward.write( str(epoch) + "," + str(combined_stats["rollout/return"]) + "\n") logReward.flush() logF.write(str(combined_stats["rollout/return"]) + "\n") json.dump(combined_stats, logStats) logF.flush() logStats.flush() # if not os.path.exists(os.path.abspath(logdir)): # os.makedirs(os.path.abspath(logdir), exist_ok=True) # print("logdir = ", logdir) # with open(os.path.join(logdir, "{}_{}".format(agentName, agent.itr.eval())), 'wb') as f: # pickle.dump(agent, f) agent.save(path=logdir, name=agentName, overwrite=overwrite_memory) logger.info("agent {} saved".format(agent.itr.eval())) if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
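# -- Sketch: loading logReward.txt back for analysis. --
# Each line written above is "epoch,rollout_return", which numpy parses directly:
import numpy as np

data = np.atleast_2d(np.loadtxt('logReward.txt', delimiter=','))
epochs, returns = data[:, 0], data[:, 1]
print('best mean return {:.2f} at epoch {}'.format(returns.max(), int(epochs[returns.argmax()])))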
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, save_model, tau=0.01, eval_env=None, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() if not os.path.exists(os.path.join(logger.get_dir(), 'model')): os.makedirs(os.path.join(logger.get_dir(), 'model')) else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.make_session( num_cpu=4) as sess: # U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. 
for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean( epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float( duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
def train(env, num_timesteps, nb_trials, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, test_interval, batch_size, memory, output, load_file, save=False, tau=0.01, evaluation=False, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. observation_range = [env.observation_space.low, env.observation_space.high] max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale, observation_range=observation_range) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None trial_return_history = deque(maxlen=100) eval_trial_return_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() #dir_path = os.path.dirname(os.path.realpath(__file__)) #tf.summary.FileWriter(dir_path, sess.graph) trial = 0 ts = 0 if load_file != '': saver.restore(sess, load_file) start_time = time.time() trial_returns = [] trial_steps = [] actions = [] qs = [] train_actor_losses = [] train_critic_losses = [] train_adaptive_distances = [] while True: test = (test_interval >= 0 and trial % (test_interval + 1) == test_interval) if not test: # Perform rollout. env.set_test(test=False) obs = env.reset() agent.reset() done = 0 trial_return = 0. trial_step = 0 while done == 0: # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) ts += 1 if rank == 0 and render: env.render() trial_return += r trial_step += 1 # Book-keeping. actions.append(action) qs.append(q) agent.store_transition( obs, action, r, new_obs, done == 2) # terminal indicator is 2 obs = new_obs # Train. if memory.nb_entries >= batch_size: for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if trial % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() train_adaptive_distances.append(distance) cl, al = agent.train() train_critic_losses.append(cl) train_actor_losses.append(al) agent.update_target_net() # Episode done. trial_steps.append(trial_step) trial_returns.append(trial_return) trial_return_history.append(trial_return) else: # Evaluate. eval_trial_return = 0. 
eval_trial_steps = 0 if evaluation: env.set_test(test=True) eval_obs = env.reset() agent.reset() eval_done = 0 while eval_done == 0: eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: env.render() eval_trial_return += eval_r eval_trial_steps += 1 # Episode done. eval_trial_return_history.append(eval_trial_return) # Log stats. duration = time.time() - start_time combined_stats = {} if memory.nb_entries > 0: # Log stats only if learning has happened stats = agent.get_stats() for key in sorted(stats.keys()): combined_stats[key] = mpi_mean(stats[key]) # Rollout statistics. combined_stats['rollout/Q_mean'] = mpi_mean(qs) combined_stats['rollout/actions_mean'] = mpi_mean(actions) combined_stats['rollout/actions_std'] = mpi_std(actions) combined_stats['rollout/trial_steps'] = mpi_mean(trial_steps) combined_stats['rollout/return'] = mpi_mean(trial_returns) combined_stats['rollout/return_history'] = mpi_mean(trial_return_history) # Train statistics. combined_stats['train/loss_actor'] = mpi_mean(train_actor_losses) combined_stats['train/loss_critic'] = mpi_mean(train_critic_losses) combined_stats['train/param_noise_distance'] = mpi_mean(train_adaptive_distances) # Evaluation statistics. if evaluation: combined_stats['eval/Q'] = mpi_mean(eval_q) combined_stats['eval/return'] = eval_trial_return combined_stats['eval/return_history'] = mpi_mean(eval_trial_return_history) combined_stats['eval/steps'] = eval_trial_steps # Total statistics. combined_stats['total/duration'] = mpi_mean(duration) combined_stats['total/steps_per_second'] = mpi_mean( float(ts) / float(duration)) combined_stats['total/trials'] = trial combined_stats['total/steps'] = ts for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if evaluation and hasattr(env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) # Reset statistics. trial_returns = [] trial_steps = [] actions = [] qs = [] train_actor_losses = [] train_critic_losses = [] train_adaptive_distances = [] # End of evaluate and log statistics # Check if this is the last trial trial += 1 if nb_trials and trial >= nb_trials: break if num_timesteps and ts >= num_timesteps: break # Saving policy and value function if save and saver and output != '': saver.save(sess, './%s' % output)
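# -- Note: this last variant assumes a tri-state `done` from env.step(): --
# 0 = episode still running, 2 = true terminal (only done == 2 is stored as
# terminal; see the store_transition call above). A sketch of a gym wrapper
# producing that convention; the 'TimeLimit.truncated' info key is an
# assumption about how the underlying env reports time-limit truncation.
import gym

class TriStateDoneWrapper(gym.Wrapper):
    def step(self, action):
        obs, r, done, info = self.env.step(action)
        if not done:
            code = 0
        elif info.get('TimeLimit.truncated', False):
            code = 1  # ended by the time limit; not a true terminal state
        else:
            code = 2  # genuine terminal state
        return obs, r, code, info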