def test(env, render_eval, reward_scale, param_noise, actor, critic,
         normalize_returns, normalize_observations, critic_l2_reg, actor_lr,
         critic_lr, action_noise, popart, gamma, clip_norm, nb_eval_steps,
         batch_size, memory, tau=0.01, eval_env=None,
         param_noise_adaption_interval=50, **kwargs):
    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high

    agent = DDPG(actor, critic, memory, env.observation_space.shape,
                 env.action_space.shape, False, gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    saver = tf.train.Saver()
    writer = imageio.get_writer('/tmp/0.mp4', fps=10)  # not used below; vid_writer/cam_writer are used instead

    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        # Restore.
        restore_dir = osp.join(kwargs["restore_dir"], "model")
        if restore_dir is not None:
            print('Restore path : ', restore_dir)
            checkpoint = tf.train.get_checkpoint_state(restore_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                saver.restore(U.get_session(), checkpoint.model_checkpoint_path)
                print("checkpoint loaded:", checkpoint.model_checkpoint_path)
                tokens = checkpoint.model_checkpoint_path.split("-")[-1]
                # Set global step.
                global_t = int(tokens)
                print(">>> global step set:", global_t)
            else:
                print(">>> no checkpoint file found")

        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []

        # Evaluate.
        eval_episode_rewards = []
        eval_episode_rewards_history = []
        eval_episode_success = []

        logdir = logger.get_dir()
        assert logdir is not None
        os.makedirs(osp.join(logdir, 'vis'), exist_ok=True)
        os.makedirs(osp.join(logdir, 'cam'), exist_ok=True)
        vidpath = osp.join(logdir, 'vis/0.mp4')
        campath = osp.join(logdir, 'cam/0.mp4')
        vid_writer = imageio.get_writer(vidpath, fps=10)
        cam_writer = imageio.get_writer(campath, fps=10)

        for i in range(100):
            print("Evaluating:%d" % (i + 1))
            eval_episode_reward = 0.
            eval_obs = eval_env.reset()
            eval_done = False

            while not eval_done:
                eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                # Scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]).
                eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)

                if render_eval:
                    eval_env.render(writer=vid_writer)
                    # Draw the bounding box on the camera observation.
                    box = eval_env.last_box
                    box = np.minimum(box, 63)
                    box = box.astype(np.int32)
                    img = eval_obs[1]
                    img[box[2], box[0]:box[1], :] = 0
                    img[box[3], box[0]:box[1], :] = 0
                    img[box[2]:box[3], box[0], :] = 0
                    img[box[2]:box[3], box[1], :] = 0
                    img = (img * 255.0).astype(np.uint8)
                    cam_writer.append_data(img)

                eval_episode_reward += eval_r

            print("episode reward::%f" % eval_episode_reward)
            eval_episode_rewards.append(eval_episode_reward)
            eval_episode_rewards_history.append(eval_episode_reward)
            eval_episode_success.append(eval_info["done"] == "goal reached")
            eval_episode_reward = 0.

        print("episode reward - mean:%.4f, var:%.4f, success:%.4f" %
              (np.mean(eval_episode_rewards), np.var(eval_episode_rewards),
               np.mean(eval_episode_success)))
        cam_writer.close()
        vid_writer.close()


def test(env, render_eval, reward_scale, param_noise, actor, critic,
         normalize_returns, normalize_observations, critic_l2_reg, actor_lr,
         critic_lr, action_noise, popart, gamma, clip_norm, nb_eval_steps,
         batch_size, memory, tau=0.01, eval_env=None,
         param_noise_adaption_interval=50, **kwargs):
    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high

    agent = DDPG(actor, critic, memory, env.observation_space.shape,
                 env.action_space.shape, gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    saver = tf.train.Saver()

    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        # Restore.
        restore_dir = osp.join(kwargs["restore_dir"], "model")
        if restore_dir is not None:
            print('Restore path : ', restore_dir)
            checkpoint = tf.train.get_checkpoint_state(restore_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                saver.restore(U.get_session(), checkpoint.model_checkpoint_path)
                print("checkpoint loaded:", checkpoint.model_checkpoint_path)
                tokens = checkpoint.model_checkpoint_path.split("-")[-1]
                # Set global step.
                global_t = int(tokens)
                print(">>> global step set:", global_t)
            else:
                print(">>> no checkpoint file found")

        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []

        # Evaluate.
        eval_episode_rewards = []
        eval_episode_rewards_history = []
        eval_episode_success = []

        for i in range(10):
            print("Evaluating:%d" % (i + 1))
            eval_episode_reward = 0.
            eval_obs = eval_env.reset()
            eval_done = False

            while not eval_done:
                eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                # Scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]).
                eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                if render_eval:
                    eval_env.render()
                    sleep(0.001)
                eval_episode_reward += eval_r

            print("ended", eval_info["done"])
            print("episode reward::%f" % eval_episode_reward)
            eval_episode_rewards.append(eval_episode_reward)
            eval_episode_rewards_history.append(eval_episode_reward)
            eval_episode_success.append(eval_info["done"] == "goal reached")
            eval_episode_reward = 0.

        print("episode reward - mean:%.4f, var:%.4f, success:%.4f" %
              (np.mean(eval_episode_rewards), np.var(eval_episode_rewards),
               np.mean(eval_episode_success)))


def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render,
          param_noise, actor, critic, normalize_returns, normalize_observations,
          critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma,
          clip_norm, nb_train_steps, nb_rollout_steps, batch_size, memory,
          tau=0.05, eval_env=None, param_noise_adaption_interval=50,
          nb_eval_episodes=20, **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=4)

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high

    dologging = kwargs.get("dologging", True)
    tf_sum_logging = kwargs.get("tf_sum_logging", False)
    invert_grad = kwargs.get("invert_grad", False)
    actor_reg = kwargs.get("actor_reg", False)

    if dologging:
        logger.info('scaling actions by {} before executing in env'.format(max_action))

    if kwargs['look_ahead']:
        look_ahead = True
        look_ahead_planner = Planning_with_memories(
            skillset=kwargs['my_skill_set'],
            env=env,
            num_samples=kwargs['num_samples'])
        exploration = LinearSchedule(
            schedule_timesteps=int(nb_epochs * nb_epoch_cycles),
            initial_p=1.0,
            final_p=kwargs['exploration_final_eps'])
    else:
        look_ahead = False

    agent = DDPG(actor, critic, memory, env.observation_space.shape,
                 env.action_space.shape, gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale,
                 inverting_grad=invert_grad, actor_reg=actor_reg)
    if dologging:
        logger.debug('Using agent with the following configuration:')
        logger.debug(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    # NB: rank != -1 is always true, so every MPI worker builds a saver here.
    if rank != -1:
        saver = tf.train.Saver(keep_checkpoint_every_n_hours=2,
                               max_to_keep=5,
                               save_relative_paths=True)
        save_freq = kwargs["save_freq"]
    else:
        saver = None

    global_t = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)

    with U.single_threaded_session() as sess:
        # Set summary saver.
        if dologging and tf_sum_logging and rank == 0:
            tf.summary.histogram("actor_grads", agent.actor_grads)
            tf.summary.histogram("critic_grads", agent.critic_grads)
            for var in actor.trainable_vars:
                tf.summary.histogram(var.name, var)
            for var in critic.trainable_vars:
                tf.summary.histogram(var.name, var)
            tf.summary.histogram("actions_out", agent.actor_tf)
            tf.summary.histogram("critic_out", agent.critic_tf)
            tf.summary.histogram("target_Q", agent.target_Q)
            summary_var = tf.summary.merge_all()
            writer_t = tf.summary.FileWriter(osp.join(logger.get_dir(), 'train'), sess.graph)
        else:
            summary_var = tf.no_op()

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        # Restore.
        if kwargs['skillset']:
            # Restore skills.
            my_skill_set = kwargs['my_skill_set']
            my_skill_set.restore_skillset(sess=sess)

        if kwargs["restore_dir"] is not None:
            restore_dir = osp.join(kwargs["restore_dir"], "model")
            print('Restore path : ', restore_dir)
            model_checkpoint_path = read_checkpoint_local(restore_dir)
            if model_checkpoint_path:
                saver.restore(U.get_session(), model_checkpoint_path)
                print("checkpoint loaded:", model_checkpoint_path)
                logger.info("checkpoint loaded:" + str(model_checkpoint_path))
                tokens = model_checkpoint_path.split("-")[-1]
                # Set global step.
                global_t = int(tokens)
                print(">>> global step set:", global_t)

        agent.reset()
        obs = env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        # Skill bookkeeping.
        skill_done = True
        num_skill_steps = 0
        paction = None

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0

        # Containers for hindsight experience replay.
        if kwargs["her"]:
            states, actions = [], []

        print("Ready to go!")
        for epoch in range(global_t, nb_epochs):
            # Stat containers.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            eval_episode_rewards = []
            eval_qs = []
            eval_episode_success = []

            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(int(nb_rollout_steps / MPI.COMM_WORLD.Get_size())):
                    # Predict next action: with some exploration probability,
                    # query the look-ahead planner for a skill; otherwise ask the agent.
                    if kwargs['look_ahead'] and (np.random.rand() < exploration.value(epoch * nb_epoch_cycles + cycle)):
                        if skill_done:
                            paction, planner_info = look_ahead_planner.create_plan(env, obs)
                            skill_done = False
                            num_skill_steps = 0
                        primitives_prob = paction[:my_skill_set.len]
                        primitive_id = np.argmax(primitives_prob)
                        action = my_skill_set.pi(
                            primitive_id=primitive_id,
                            obs=obs.copy(),
                            primitive_params=paction[my_skill_set.len:].copy())
                        num_skill_steps += 1
                    else:
                        action, q = agent.pi(obs, apply_noise=True, compute_Q=True)

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    # Scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]).
                    new_obs, r, done, info = env.step(max_action * action)

                    if kwargs['look_ahead'] and (
                            num_skill_steps == kwargs['commit_for']
                            or my_skill_set.termination(
                                new_obs, primitive_id,
                                primitive_params=paction[my_skill_set.len:].copy())):
                        skill_done = True

                    t += 1
                    if rank == 0 and render:
                        env.render()
                        sleep(0.1)
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    # epoch_qs.append(q)  # q is not always available when a skill action was taken
                    agent.store_transition(obs, action, r, new_obs, done)

                    # Store info for hindsight.
                    states.append(obs.copy())
                    actions.append(action.copy())

                    obs = new_obs
                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        if kwargs["her"]:
                            # Create hindsight experience replay.
                            her_states, her_rewards = env.apply_hindsight(
                                states, actions, new_obs.copy())

                            # Store HER transitions: her_states: n+1, her_rewards: n.
                            for her_i in range(len(her_states) - 2):
                                agent.store_transition(her_states[her_i],
                                                       actions[her_i],
                                                       her_rewards[her_i],
                                                       her_states[her_i + 1],
                                                       False)
                            # Store last transition.
                            agent.store_transition(her_states[-2], actions[-1],
                                                   her_rewards[-1],
                                                   her_states[-1], True)

                            # Refresh the storage containers.
                            del states, actions
                            states, actions = [], []

                        agent.reset()
                        obs = env.reset()

                # Train.
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al, current_summary = agent.train(summary_var)
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                    if dologging and tf_sum_logging and rank == 0:
                        writer_t.add_summary(
                            current_summary,
                            epoch * nb_epoch_cycles * nb_train_steps +
                            cycle * nb_train_steps + t_train)

            # Evaluate.
            if (eval_env is not None) and rank == 0:
                for _ in range(nb_eval_episodes):
                    eval_episode_reward = 0.
                    eval_obs = eval_env.reset()
                    eval_obs_start = eval_obs.copy()
                    eval_done = False
                    while not eval_done:
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        # Scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]).
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                        if render_eval:
                            sleep(0.1)
                            print("Render!")
                            eval_env.render()
                            print("rendered!")
                        eval_episode_reward += eval_r
                        eval_qs.append(eval_q)

                    eval_episode_rewards.append(eval_episode_reward)
                    eval_episode_rewards_history.append(eval_episode_reward)
                    eval_episode_success.append(eval_info["done"] == "goal reached")
                    if eval_info["done"] == "goal reached":
                        logger.info(
                            "success, training epoch:%d,starting config:" % epoch,
                            eval_obs_start, 'final state', eval_obs)

            if dologging and rank == 0:
                print("Logging!")
                # Log stats.
                epoch_train_duration = time.time() - epoch_start_time
                duration = time.time() - start_time
                stats = agent.get_stats()
                combined_stats = {}
                for key in sorted(stats.keys()):
                    combined_stats[key] = normal_mean(stats[key])

                # Rollout statistics.
                combined_stats['rollout/return'] = normal_mean(epoch_episode_rewards)
                if len(episode_rewards_history) > 0:
                    combined_stats['rollout/return_history'] = normal_mean(np.mean(episode_rewards_history))
                else:
                    combined_stats['rollout/return_history'] = 0.
                combined_stats['rollout/episode_steps'] = normal_mean(epoch_episode_steps)
                combined_stats['rollout/episodes'] = np.sum(epoch_episodes)
                combined_stats['rollout/actions_mean'] = normal_mean(epoch_actions)
                combined_stats['rollout/actions_std'] = normal_std(epoch_actions)
                combined_stats['rollout/Q_mean'] = normal_mean(epoch_qs)

                # Train statistics.
                combined_stats['train/loss_actor'] = normal_mean(epoch_actor_losses)
                combined_stats['train/loss_critic'] = normal_mean(epoch_critic_losses)
                combined_stats['train/param_noise_distance'] = normal_mean(epoch_adaptive_distances)
                if kwargs['look_ahead']:
                    combined_stats['train/exploration'] = exploration.value(epoch * nb_epoch_cycles + cycle)

                # Evaluation statistics.
                if eval_env is not None:
                    combined_stats['eval/return'] = normal_mean(eval_episode_rewards)
                    combined_stats['eval/success'] = normal_mean(eval_episode_success)
                    if len(eval_episode_rewards_history) > 0:
                        combined_stats['eval/return_history'] = normal_mean(np.mean(eval_episode_rewards_history))
                    else:
                        combined_stats['eval/return_history'] = 0.
                    combined_stats['eval/Q'] = normal_mean(eval_qs)
                    combined_stats['eval/episodes'] = normal_mean(len(eval_episode_rewards))

                # Total statistics.
                combined_stats['total/duration'] = normal_mean(duration)
                combined_stats['total/steps_per_second'] = normal_mean(float(t) / float(duration))
                combined_stats['total/episodes'] = normal_mean(episodes)
                combined_stats['total/epochs'] = epoch + 1
                combined_stats['total/steps'] = t

                for key in sorted(combined_stats.keys()):
                    logger.record_tabular(key, combined_stats[key])
                logger.dump_tabular()
                logger.info('')

                logdir = logger.get_dir()
                if rank == 0 and logdir:
                    print("Dumping progress!")
                    if hasattr(env, 'get_state'):
                        with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                            pickle.dump(env.get_state(), f)
                    if eval_env and hasattr(eval_env, 'get_state'):
                        with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                            pickle.dump(eval_env.get_state(), f)

            # Save tf model.
            if rank == 0 and (epoch + 1) % save_freq == 0:
                print("Saving the model!")
                os.makedirs(osp.join(logdir, "model"), exist_ok=True)
                saver.save(U.get_session(), logdir + "/model/ddpg", global_step=epoch)


def test(env, render_eval, reward_scale, param_noise, actor, critic,
         normalize_returns, normalize_observations, critic_l2_reg, actor_lr,
         critic_lr, action_noise, popart, gamma, clip_norm, nb_eval_steps,
         batch_size, memory, tau=0.01, eval_env=None,
         param_noise_adaption_interval=50, **kwargs):
    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high

    if kwargs['skillset']:
        action_shape = (kwargs['my_skill_set'].len + kwargs['my_skill_set'].params,)
    else:
        action_shape = env.action_space.shape

    agent = DDPG(actor, critic, memory, env.observation_space.shape,
                 action_shape, gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)

    var_list_restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="actor") + \
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="obs_rms")
    print(var_list_restore)
    saver = tf.train.Saver(var_list=var_list_restore)

    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        # Restore.
        if kwargs['skillset']:
            # Restore skills.
            my_skill_set = kwargs['my_skill_set']
            my_skill_set.restore_skillset(sess=sess)

        # Restore meta-controller weights.
        restore_dir = osp.join(kwargs["restore_dir"], "model")
        if restore_dir is not None:
            print('Restore path : ', restore_dir)
            model_checkpoint_path = read_checkpoint_local(restore_dir)
            if model_checkpoint_path:
                saver.restore(U.get_session(), model_checkpoint_path)
                print("checkpoint loaded:", model_checkpoint_path)
                tokens = model_checkpoint_path.split("-")[-1]
                # Set global step.
                global_t = int(tokens)
                print(">>> global step set:", global_t)
            else:
                print(">>> no checkpoint file found")

        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []

        # Evaluate.
        eval_episode_rewards = []
        eval_episode_rewards_history = []
        eval_episode_success = []

        for i in range(100):
            print("Evaluating:%d" % (i + 1))
            eval_episode_reward = 0.
            eval_obs = eval_env.reset()
            print("start obs", eval_obs[:6], eval_obs[-3:])
            eval_done = False

            while not eval_done:
                eval_paction, eval_pq = agent.pi(eval_obs, apply_noise=False, compute_Q=True)

                if kwargs['skillset']:
                    # Break the meta action into a primitive id and its parameters.
                    eval_primitives_prob = eval_paction[:my_skill_set.len]
                    eval_primitive_id = np.argmax(eval_primitives_prob)
                    print("skill chosen %d" % eval_primitive_id)

                    eval_r = 0.
                    eval_skill_obs = eval_obs.copy()
                    for _ in range(kwargs['commit_for']):
                        eval_action = my_skill_set.pi(
                            primitive_id=eval_primitive_id,
                            obs=eval_skill_obs.copy(),
                            primitive_params=eval_paction[kwargs['my_skill_set'].len:])
                        # Scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]).
                        eval_skill_new_obs, eval_skill_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                        eval_skill_obs = eval_skill_new_obs
                        eval_r += eval_skill_r
                        if render_eval:
                            eval_env.render()
                            sleep(0.1)
                        if eval_done or my_skill_set.termination(
                                eval_skill_new_obs, eval_primitive_id,
                                primitive_params=eval_paction[my_skill_set.len:]):
                            break

                    eval_new_obs = eval_skill_new_obs
                else:
                    eval_action, q = eval_paction, eval_pq
                    # Scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]).
                    eval_new_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)

                eval_episode_reward += eval_r
                eval_obs = eval_new_obs

            print("ended", eval_info["done"])
            print("episode reward::%f" % eval_episode_reward)
            eval_episode_rewards.append(eval_episode_reward)
            eval_episode_rewards_history.append(eval_episode_reward)
            eval_episode_success.append(eval_info["done"] == "goal reached")
            eval_episode_reward = 0.

        print("episode reward - mean:%.4f, var:%.4f, success:%.4f" %
              (np.mean(eval_episode_rewards), np.var(eval_episode_rewards),
               np.mean(eval_episode_success)))


def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render,
          param_noise, actor, critic, normalize_returns, normalize_observations,
          critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma,
          clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps,
          batch_size, memory, single_train, tau=0.05, eval_env=None,
          param_noise_adaption_interval=50, **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high

    dologging = kwargs.get("dologging", True)
    tf_sum_logging = kwargs.get("tf_sum_logging", False)
    invert_grad = kwargs.get("invert_grad", False)
    actor_reg = kwargs.get("actor_reg", False)

    if dologging:
        logger.info('scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(actor, critic, memory, env.observation_space.shape,
                 env.action_space.shape, single_train, gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale,
                 inverting_grad=invert_grad, actor_reg=actor_reg)
    if dologging:
        logger.info('Using agent with the following configuration:')
        logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver(keep_checkpoint_every_n_hours=2, max_to_keep=5)
        save_freq = kwargs["save_freq"]
    else:
        saver = None

    global_t = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)

    with U.single_threaded_session() as sess:
        # Set summary saver.
        if dologging and tf_sum_logging and rank == 0:
            tf.summary.histogram("actor_grads", agent.actor_grads)
            tf.summary.histogram("critic_grads", agent.critic_grads)
            for var in actor.trainable_vars:
                tf.summary.histogram(var.name, var)
            for var in critic.trainable_vars:
                tf.summary.histogram(var.name, var)
            tf.summary.histogram("actions_out", agent.actor_tf)
            tf.summary.histogram("critic_out", agent.critic_tf)
            tf.summary.histogram("target_Q", agent.target_Q)
            summary_var = tf.summary.merge_all()
            writer_t = tf.summary.FileWriter(osp.join(logger.get_dir(), 'train'), sess.graph)
        else:
            summary_var = tf.no_op()

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        # Restore.
        if kwargs["restore_dir"] is not None:
            restore_dir = osp.join(kwargs["restore_dir"], "model")
            if rank == 0:
                print('Restore path : ', restore_dir)
                checkpoint = tf.train.get_checkpoint_state(restore_dir)
                if checkpoint and checkpoint.model_checkpoint_path:
                    saver.restore(U.get_session(), checkpoint.model_checkpoint_path)
                    print("checkpoint loaded:", checkpoint.model_checkpoint_path)
                    logger.info("checkpoint loaded:" + str(checkpoint.model_checkpoint_path))
                    tokens = checkpoint.model_checkpoint_path.split("-")[-1]
                    # Set global step.
                    global_t = int(tokens)
                    print(">>> global step set:", global_t)

        agent.sync()
        agent.reset()
        obs = env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0

        # Containers for hindsight experience replay.
        if kwargs["her"]:
            states, actions = [], []

        if rank == 0:
            logdir = logger.get_dir()
            os.makedirs(osp.join(logdir, 'vis'), exist_ok=True)
            os.makedirs(osp.join(logdir, 'cam'), exist_ok=True)

        print("Ready to go!")
        for epoch in range(global_t, nb_epochs):
            if rank == 0:
                logdir = logger.get_dir()
                vidpath = osp.join(logdir, 'vis/%d.mp4' % epoch)
                campath = osp.join(logdir, 'cam/%d.mp4' % epoch)
                vid_writer = imageio.get_writer(vidpath, fps=10)
                cam_writer = imageio.get_writer(campath, fps=10)

            # Stat containers.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            eval_episode_rewards = []
            eval_qs = []
            eval_episode_success = []

            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                t0 = time.time()
                rollout_steps = int(nb_rollout_steps / MPI.COMM_WORLD.Get_size())
                for t_rollout in range(rollout_steps):
                    if rank == 0 and single_train:
                        # In single_train mode rank 0 only trains; the other workers collect experience.
                        break

                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    # Scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]).
                    new_obs, r, done, info = env.step(max_action * action)

                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)

                    # Store info for hindsight.
                    states.append(copy_obs(obs))
                    actions.append(action.copy())

                    obs = new_obs
                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        if kwargs["her"]:
                            # Create hindsight experience replay.
                            her_states, her_rewards = env.apply_hindsight(
                                states, actions, copy_obs(new_obs))

                            # Store HER transitions: her_states: n+1, her_rewards: n.
                            for her_i in range(len(her_states) - 2):
                                agent.store_transition(her_states[her_i],
                                                       actions[her_i],
                                                       her_rewards[her_i],
                                                       her_states[her_i + 1],
                                                       False)
                            # Store last transition.
                            agent.store_transition(her_states[-2], actions[-1],
                                                   her_rewards[-1],
                                                   her_states[-1], True)

                            # Refresh the storage containers.
                            del states, actions
                            states, actions = [], []

                        agent.reset()
                        obs = env.reset()

                if single_train:
                    MPI.COMM_WORLD.Barrier()
                    if rank != 0:
                        agent.finish_sending()
                    else:
                        agent.recv_transitions()
                    MPI.COMM_WORLD.Barrier()

                t1 = time.time()
                # Train.
                for t_train in range(nb_train_steps):
                    if not single_train or (single_train and rank == 0):
                        # Adapt param noise, if necessary.
                        if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                            distance = agent.adapt_param_noise()
                            epoch_adaptive_distances.append(distance)

                        # agent.train() samples from memory, updates the running
                        # mean/std normalization ops, and trains both actor and critic.
                        # TODO (from the original comment): with single_train these
                        # normalization stats should presumably be synced across workers too.
                        cl, al, current_summary = agent.train(summary_var)
                        epoch_critic_losses.append(cl)
                        epoch_actor_losses.append(al)

                    MPI.COMM_WORLD.Barrier()
                    if single_train:
                        agent.sync()
                    agent.update_target_net()

                    if dologging and tf_sum_logging and rank == 0:
                        writer_t.add_summary(
                            current_summary,
                            epoch * nb_epoch_cycles * nb_train_steps +
                            cycle * nb_train_steps + t_train)

                # Evaluate.
                t2 = time.time()
                if (eval_env is not None) and rank == 0:
                    eval_episode_reward = 0.
                    eval_obs = eval_env.reset()
                    eval_obs_start = copy_obs(eval_obs)
                    eval_done = False
                    while not eval_done:
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        # Scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]).
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                        if render_eval:
                            eval_env.render(writer=vid_writer)
                            cam_writer.append_data((eval_obs[1] * 255.0).astype(np.uint8))
                        eval_episode_reward += eval_r
                        eval_qs.append(eval_q)

                    eval_episode_rewards.append(eval_episode_reward)
                    eval_episode_rewards_history.append(eval_episode_reward)
                    eval_episode_success.append(eval_info["done"] == "goal reached")

                    def allow_nice_printing(obs):
                        if not isinstance(obs, tuple):
                            return obs
                        return (obs[0], np.mean(obs[1]))

                    if eval_info["done"] == "goal reached":
                        logger.info(
                            "success, training epoch:%d,starting config:" % epoch,
                            allow_nice_printing(eval_obs_start),
                            'final state', allow_nice_printing(eval_obs))

                t3 = time.time()
                do_timing = False
                if rank == 0 and do_timing:
                    print("times:", t1 - t0, t2 - t1, t3 - t2)

            if dologging and rank == 0:
                vid_writer.close()
                cam_writer.close()
                print("Logging!")
                # Log stats.
                epoch_train_duration = time.time() - epoch_start_time
                duration = time.time() - start_time
                stats = agent.get_stats()
                combined_stats = {}
                for key in sorted(stats.keys()):
                    combined_stats[key] = normal_mean(stats[key])

                # Rollout statistics.
                combined_stats['rollout/return'] = normal_mean(epoch_episode_rewards)
                if len(episode_rewards_history) > 0:
                    combined_stats['rollout/return_history'] = normal_mean(np.mean(episode_rewards_history))
                else:
                    combined_stats['rollout/return_history'] = 0.
                combined_stats['rollout/episode_steps'] = normal_mean(epoch_episode_steps)
                combined_stats['rollout/episodes'] = np.sum(epoch_episodes)
                combined_stats['rollout/actions_mean'] = normal_mean(epoch_actions)
                combined_stats['rollout/actions_std'] = normal_std(epoch_actions)
                combined_stats['rollout/Q_mean'] = normal_mean(epoch_qs)

                # Train statistics.
                combined_stats['train/loss_actor'] = normal_mean(epoch_actor_losses)
                combined_stats['train/loss_critic'] = normal_mean(epoch_critic_losses)
                combined_stats['train/param_noise_distance'] = normal_mean(epoch_adaptive_distances)

                # Evaluation statistics.
                if eval_env is not None:
                    combined_stats['eval/return'] = normal_mean(eval_episode_rewards)
                    combined_stats['eval/success'] = normal_mean(eval_episode_success)
                    if len(eval_episode_rewards_history) > 0:
                        combined_stats['eval/return_history'] = normal_mean(np.mean(eval_episode_rewards_history))
                    else:
                        combined_stats['eval/return_history'] = 0.
                    combined_stats['eval/Q'] = normal_mean(eval_qs)
                    combined_stats['eval/episodes'] = normal_mean(len(eval_episode_rewards))

                # Total statistics.
                combined_stats['total/duration'] = normal_mean(duration)
                combined_stats['total/steps_per_second'] = normal_mean(float(t) / float(duration))
                combined_stats['total/episodes'] = normal_mean(episodes)
                combined_stats['total/epochs'] = epoch + 1
                combined_stats['total/steps'] = t

                for key in sorted(combined_stats.keys()):
                    logger.record_tabular(key, combined_stats[key])
                logger.dump_tabular()
                logger.info('')

                logdir = logger.get_dir()
                if rank == 0 and logdir:
                    print("Dumping progress!")
                    if hasattr(env, 'get_state'):
                        with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                            pickle.dump(env.get_state(), f)
                    if eval_env and hasattr(eval_env, 'get_state'):
                        with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                            pickle.dump(eval_env.get_state(), f)

            # Save tf model.
            if rank == 0 and (epoch + 1) % save_freq == 0:
                print("Saving the model!")
                os.makedirs(osp.join(logdir, "model"), exist_ok=True)
                saver.save(U.get_session(), logdir + "/model/ddpg", global_step=epoch)
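

# ---------------------------------------------------------------------------
# NOTE: normal_mean, normal_std and copy_obs are used above but are not defined
# in this file; they are assumed to be provided elsewhere in the package (the
# real normal_mean/normal_std may, for instance, average across MPI workers).
# The fallbacks below are only a minimal, hypothetical sketch so the module can
# run stand-alone; they are skipped whenever the real helpers are present.
# ---------------------------------------------------------------------------
if 'normal_mean' not in globals():
    def normal_mean(x):
        # Plain (non-MPI) mean; returns 0. for empty inputs.
        return float(np.mean(x)) if np.size(x) > 0 else 0.

if 'normal_std' not in globals():
    def normal_std(x):
        # Plain (non-MPI) standard deviation; returns 0. for empty inputs.
        return float(np.std(x)) if np.size(x) > 0 else 0.

if 'copy_obs' not in globals():
    def copy_obs(obs):
        # Copies an observation that may be a plain array or a tuple such as
        # (state_vector, image).
        if isinstance(obs, tuple):
            return tuple(o.copy() if hasattr(o, 'copy') else o for o in obs)
        return obs.copy()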