def get_target_updates(vars, target_vars, tau):
    """Build the ops that sync target-network variables from the online net.

    Returns a tuple ``(init_op, soft_op)``: ``init_op`` hard-copies every
    variable into its target, ``soft_op`` applies the Polyak update
    ``target <- (1 - tau) * target + tau * var``.
    """
    logger.debug('setting up target updates ...')
    assert len(vars) == len(target_vars)
    hard_copies = []
    polyak_copies = []
    for src, dst in zip(vars, target_vars):
        logger.debug(' {} <- {}'.format(dst.name, src.name))
        hard_copies.append(tf.assign(dst, src))
        polyak_copies.append(tf.assign(dst, (1. - tau) * dst + tau * src))
    assert len(hard_copies) == len(vars)
    assert len(polyak_copies) == len(vars)
    return tf.group(*hard_copies), tf.group(*polyak_copies)
def create_restore_var_dict(self):
    """Map checkpoint variable names to this skill's live DDPG variables.

    The graph variables live under the ``<skill_name>/`` scope; the
    checkpoint stores them without that prefix and without the trailing
    ``:0`` device suffix, so both are stripped to form the dict keys.
    Covers actor vars, observation-normalization (obs_rms) vars and
    critic vars.

    Returns:
        dict mapping stripped variable name -> tf variable.
    """
    obs_rms_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                     scope='%s/obs_rms' % self.skill_name)
    # All three variable groups undergo the identical name transformation,
    # so process them in one pass instead of three copy-pasted loops
    # (insertion order preserved: actor, obs_rms, critic).
    var_restore_dict_ddpg = {}
    for var in (self.actor.trainable_vars + obs_rms_vars +
                self.critic.trainable_vars):
        name = var.name.replace("%s/" % self.skill_name, "")
        var_restore_dict_ddpg[name[:-2]] = var  # drop trailing ':0'

    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.debug("restoring following ddpg vars\n" + "-" * 20)
        logger.debug("num of vars to restore:%d" % len(var_restore_dict_ddpg))
        logger.debug(str(var_restore_dict_ddpg))
        logger.debug("-" * 50)
    return var_restore_dict_ddpg
def create_restore_var_dict(self):
    """Build the checkpoint-name -> variable map for restoring this skill.

    obs_rms variables get the skill scope stripped; actor weights get the
    skill scope renamed to 'actor' so they match the checkpoint layout.
    """
    train_vars = self.actor.trainable_vars + tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope='%s/obs_rms' % self.skill_name)

    def _checkpoint_key(full_name):
        ## takes care of obs normalization var
        if "obs_rms" in full_name:
            full_name = full_name.replace("%s/" % self.skill_name, "")
        ## takes care of actor weights
        elif self.skill_name in full_name:
            full_name = full_name.replace(self.skill_name, "actor")
        return full_name[:-2]  # strip the ':0' suffix

    var_restore_dict = {_checkpoint_key(v.name): v for v in train_vars}

    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.debug("restoring following vars\n" + "-" * 20)
        logger.debug("num of vars to restore:%d" % len(train_vars))
        logger.debug(str(var_restore_dict))
        logger.debug("-" * 50)
    return var_restore_dict
def setup_critic_optimizer(self):
    """Define the twin-critic TD loss, its flat gradient, and the MPI Adam
    optimizer over the joint variable list of both critics."""
    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.info('setting up critic optimizer')

    # Both critics regress onto the same normalized, range-clipped target.
    normalized_critic_target_tf = tf.clip_by_value(
        normalize(self.critic_target, self.ret_rms),
        self.return_range[0], self.return_range[1])
    td_sq_0 = tf.square(self.normalized_critic_tf - normalized_critic_target_tf)
    td_sq_1 = tf.square(self.normalized_critic_tf1 - normalized_critic_target_tf)
    self.critic_loss = tf.reduce_mean(td_sq_0) + tf.reduce_mean(td_sq_1)

    # Both critics are optimized jointly; build the combined list once.
    all_critic_vars = self.critic.trainable_vars + self.critic1.trainable_vars

    if self.critic_l2_reg > 0.:
        # Regularize hidden-layer kernels only (skip biases and output layer).
        critic_reg_vars = [
            var for var in all_critic_vars
            if 'kernel' in var.name and 'output' not in var.name
        ]
        for var in critic_reg_vars:
            logger.debug(' regularizing: {}'.format(var.name))
        logger.info(' applying l2 regularization with {}'.format(
            self.critic_l2_reg))
        critic_reg = tc.layers.apply_regularization(
            tc.layers.l2_regularizer(self.critic_l2_reg),
            weights_list=critic_reg_vars)
        self.critic_loss += critic_reg

    critic_shapes = [var.get_shape().as_list() for var in all_critic_vars]
    critic_nb_params = sum(
        reduce(lambda x, y: x * y, shape) for shape in critic_shapes)
    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.info(' critic shapes: {}'.format(critic_shapes))
        logger.info(' critic params: {}'.format(critic_nb_params))

    self.critic_grads = U.flatgrad(self.critic_loss, all_critic_vars,
                                   clip_norm=self.clip_norm)
    self.critic_optimizer = MpiAdam(var_list=all_critic_vars,
                                    beta1=0.9, beta2=0.999, epsilon=1e-08)
def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev):
    """Build ops that copy the actor's variables into the perturbed actor,
    adding Gaussian parameter noise to every perturbable variable and
    copying the remaining variables (e.g. normalization stats) verbatim."""
    assert len(actor.vars) == len(perturbed_actor.vars)
    assert len(actor.perturbable_vars) == len(perturbed_actor.perturbable_vars)

    updates = []
    for src, dst in zip(actor.vars, perturbed_actor.vars):
        if src in actor.perturbable_vars:
            logger.debug(' {} <- {} + noise'.format(dst.name, src.name))
            noise = tf.random_normal(tf.shape(src), mean=0.,
                                     stddev=param_noise_stddev)
            updates.append(tf.assign(dst, src + noise))
        else:
            logger.debug(' {} <- {}'.format(dst.name, src.name))
            updates.append(tf.assign(dst, src))
    assert len(updates) == len(actor.vars)
    return tf.group(*updates)
def setup_param_noise(self, normalized_obs0):
    """Wire up parameter-space exploration: one perturbed actor used for
    rollouts, plus a second perturbed copy used only to measure how far the
    current noise scale pushes the policy (for stddev adaptation)."""
    assert self.param_noise is not None

    def _clone_actor(name):
        # Clone the online actor under a fresh variable scope.
        clone = copy(self.actor)
        clone.name = name
        return clone

    # Actor whose perturbed weights generate exploratory actions.
    param_noise_actor = _clone_actor('param_noise_actor')
    self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
    logger.debug('setting up param noise')
    self.perturb_policy_ops = get_perturbed_actor_updates(
        self.actor, param_noise_actor, self.param_noise_stddev)

    # Separate copy used solely for stddev adaptation.
    adaptive_param_noise_actor = _clone_actor('adaptive_param_noise_actor')
    adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
    self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
        self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
    # RMS distance between unperturbed and perturbed actions.
    self.adaptive_policy_distance = tf.sqrt(
        tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))
def create_restore_var_dict_successor_model(self):
    """Checkpoint-name -> variable map for the successor prediction model.

    Strips the '<skill_name>/' scope prefix and the ':0' suffix so keys
    match the names stored in the checkpoint.
    """
    model_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='%s/suc_pred_model' % self.skill_name)
    prefix = "%s/" % self.skill_name
    var_restore_dict_successor_model = {
        var.name.replace(prefix, "")[:-2]: var for var in model_vars
    }
    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.debug("restoring following pred model vars\n" + "-" * 20)
        logger.debug("num of vars to restore:%d" %
                     len(var_restore_dict_successor_model))
        logger.debug(str(var_restore_dict_successor_model))
        logger.debug("-" * 50)
    return var_restore_dict_successor_model
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Build the environment, exploration noise, and DDPG components, then
    launch training.

    Args:
        env_id: gym environment id to train on.
        noise_type: comma-separated spec, e.g. 'adaptive-param_0.2' or
            'epsnorm_0.2_0.1' (name and numeric fields joined by '_').
        layer_norm: whether actor/critic use layer normalization.
        evaluation: if True (and on MPI rank 0), also build an eval env.
        **kwargs: forwarded to training.train; must contain 'eval_env_id',
            'look_ahead' and 'skillset'.
    """
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Only rank 0 logs; all other workers are silenced.
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    logger.debug("Env info")
    logger.debug(env.__doc__)
    logger.debug("-" * 20)
    gym.logger.setLevel(logging.WARN)

    if evaluation and rank == 0:
        # Fall back to the training env id when no explicit eval env given.
        if kwargs['eval_env_id']:
            eval_env_id = kwargs['eval_env_id']
        else:
            eval_env_id = env_id
        eval_env = gym.make(eval_env_id)
        # del eval_env_id from kwargs
        del kwargs['eval_env_id']
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'epsnorm' in current_noise_type:
            # epsilon-greedy gaussian noise: 'epsnorm_<stddev>_<epsilon>'
            _, stddev, epsilon = current_noise_type.split('_')
            action_noise = EpsilonNormalActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions),
                epsilon=float(epsilon))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    # Each MPI worker gets a distinct seed derived from its rank.
    seed = seed + 1000000 * rank
    tf.reset_default_graph()

    # importing the current skill configs
    if kwargs['look_ahead'] and kwargs['skillset']:
        skillset_file = __import__("HER.skills.%s" % kwargs['skillset'],
                                   fromlist=[''])
        my_skill_set = SkillSet(skillset_file.skillset)
    else:
        my_skill_set = None

    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        logger.info('rank {}: seed={}, logdir={}'.format(
            rank, seed, logger.get_dir()))
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, my_skill_set=my_skill_set, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        # start_time is only bound on rank 0 (set above).
        logger.info('total runtime: {}s'.format(time.time() - start_time))
boolean_flag(parser, 'look-ahead', default=True) parser.add_argument('--commit-for', type=int, default=10) parser.add_argument('--exploration-final-eps', type=float, default=0.001) parser.add_argument('--num-samples', type=int, default=5) parser.add_argument('--skillset', type=str, default='set13') args = parser.parse_args() # we don't directly specify timesteps for this script, so make sure that if we do specify them # they agree with the other parameters if args.num_timesteps is not None: assert (args.num_timesteps == args.nb_epochs * args.nb_epoch_cycles * args.nb_rollout_steps) dict_args = vars(args) del dict_args['num_timesteps'] return dict_args if __name__ == '__main__': args = parse_args() if MPI.COMM_WORLD.Get_rank() == 0: logger.configure(dir=args["log_dir"]) logger.debug(str(args)) # Run actual script. try: run(**args) except KeyboardInterrupt: print("Exiting!")
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Build env, skillset, parameterized actor/critic, and noise, then train.

    Variant of ``run`` for the hierarchical/parameterized-action setting:
    the action is a discrete skill selection concatenated with the
    continuous parameters of every skill.

    Args:
        noise_type: comma-separated spec such as 'epsnorm_0.2_0.1'.
        **kwargs: forwarded to training.train; must contain 'eval_env_id',
            'select_action' and 'skillset'.
    """
    # Configure loggers.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    else:
        logger.set_level(logger.DEBUG)

    # Create envs.
    env = gym.make(env_id)
    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.debug("Env info")
        logger.debug(env.__doc__)
        logger.debug("-" * 20)
    gym.logger.setLevel(logging.WARN)

    if evaluation and rank == 0:
        # Fall back to the training env id when no explicit eval env given.
        if kwargs['eval_env_id']:
            eval_env_id = kwargs['eval_env_id']
        else:
            eval_env_id = env_id
        eval_env = gym.make(eval_env_id)
        # del eval_env_id from kwargs
        del kwargs['eval_env_id']
    else:
        eval_env = None

    ###
    tf.reset_default_graph()
    ## NOTE: do tf things after this line

    if kwargs['select_action']:
        assert kwargs['skillset'], 'Skillset should be given for selecting action'

    # importing the current skill configs
    if kwargs['skillset']:
        # import HER.skills.set2 as skillset_file
        skillset_file = __import__("HER.skills.%s" % kwargs['skillset'],
                                   fromlist=[''])
        my_skill_set = SkillSet(skillset_file.skillset)
        # Action = one-hot-ish skill logits + all skills' continuous params.
        nb_actions = my_skill_set.num_params + my_skill_set.len
    else:
        nb_actions = env.action_space.shape[-1]

    ###
    # Parse noise_type
    action_noise = None
    param_noise = None
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'epsnorm' in current_noise_type:
            _, stddev, epsilon = current_noise_type.split('_')
            if kwargs['skillset']:
                # Noise applies to the continuous skill parameters only;
                # the discrete skill-selection block is left untouched.
                action_noise = EpsilonNormalParameterizedActionNoise(
                    mu=np.zeros(my_skill_set.num_params),
                    sigma=float(stddev) * np.ones(my_skill_set.num_params),
                    epsilon=float(epsilon),
                    discrete_actions_dim=my_skill_set.len)
            else:
                action_noise = EpsilonNormalActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions),
                    epsilon=float(epsilon))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    ###
    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=(nb_actions, ),
                    observation_shape=env.observation_space.shape)
    # tf components
    critic = Critic(layer_norm=layer_norm)
    if kwargs['skillset'] is None:
        # No skillset: the env itself exposes a discrete+continuous split.
        actor = Actor(discrete_action_size=env.env.discrete_action_size,
                      cts_action_size=nb_actions - env.env.discrete_action_size,
                      layer_norm=layer_norm)
        my_skill_set = None
    else:
        actor = Actor(discrete_action_size=my_skill_set.len,
                      cts_action_size=my_skill_set.num_params,
                      layer_norm=layer_norm)

    ###
    # Seed everything to make things reproducible.
    # Each MPI worker gets a distinct seed derived from its rank.
    seed = seed + 1000000 * rank
    logger.debug('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                      logger.get_dir()))
    set_global_seeds(seed)  # tf, numpy, random
    env.seed(seed)  # numpy with a more complicated seed

    ###
    # Disable logging for rank != 0 to avoid a ton of prints.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, my_skill_set=my_skill_set, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        # start_time is only bound on rank 0 (set above).
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render,
          param_noise, actor, critic, normalize_returns,
          normalize_observations, critic_l2_reg, actor_lr, critic_lr,
          action_noise, popart, gamma, clip_norm, nb_train_steps,
          nb_rollout_steps, nb_eval_episodes, batch_size, memory, tau=0.05,
          eval_env=None, param_noise_adaption_interval=50, **kwargs):
    """Main DDPG(+HER, +skills) training loop.

    Runs nb_epochs epochs of nb_epoch_cycles cycles; each cycle performs
    environment rollouts (optionally executing temporally-extended skills
    and/or a look-ahead planner), then nb_train_steps gradient steps.
    Rank 0 additionally evaluates, logs tabular stats and checkpoints.

    Fixes vs. previous revision:
      * the non-skillset evaluation branch referenced an undefined name
        ``eval_pq`` (NameError) — it now simply uses ``eval_paction``;
      * ``my_skill_set`` is now always bound (it is read unconditionally in
        the rollout loop but was previously only assigned when
        ``kwargs['skillset']`` was truthy).
    """
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high

    # Optional behavior switches, defaulted when absent from kwargs.
    dologging = kwargs.get("dologging", True)
    tf_sum_logging = kwargs.get("tf_sum_logging", False)
    invert_grad = kwargs.get("invert_grad", False)
    actor_reg = kwargs.get("actor_reg", False)

    if dologging:
        logger.debug(
            'scaling actions by {} before executing in env'.format(max_action))

    # Look-ahead planner with linearly annealed exploration probability.
    if kwargs['look_ahead']:
        look_ahead = True
        look_ahead_planner = Planning_with_memories(
            skillset=kwargs['my_skill_set'], env=env,
            num_samples=kwargs['num_samples'])
        exploration = LinearSchedule(
            schedule_timesteps=int(nb_epochs * nb_epoch_cycles),
            initial_p=1.0, final_p=kwargs['exploration_final_eps'])
    else:
        look_ahead = False

    # With a skillset the action is skill logits + all skills' parameters.
    if kwargs['skillset']:
        action_shape = (kwargs['my_skill_set'].len +
                        kwargs['my_skill_set'].num_params, )
    else:
        action_shape = env.action_space.shape

    # FIX: bind unconditionally; the rollout loop below reads it even when
    # no skillset is configured (it is None in that case).
    my_skill_set = kwargs['my_skill_set']

    agent = DDPG(actor, critic, memory, env.observation_space.shape,
                 action_shape, gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale,
                 inverting_grad=invert_grad, actor_reg=actor_reg)
    if dologging and MPI.COMM_WORLD.Get_rank() == 0:
        logger.debug('Using agent with the following configuration:')
        logger.debug(str(agent.__dict__.items()))

    # should have saver for all thread to restore. But dump only using 1 saver
    saver = tf.train.Saver(keep_checkpoint_every_n_hours=2, max_to_keep=20,
                           save_relative_paths=True)
    save_freq = kwargs["save_freq"]

    global_t = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)

    ## get the session with the current graph => identical graph is used for each session
    with U.single_threaded_session() as sess:
        # Set summary saver (rank 0 only).
        if dologging and tf_sum_logging and rank == 0:
            tf.summary.histogram("actor_grads", agent.actor_grads)
            tf.summary.histogram("critic_grads", agent.critic_grads)
            actor_trainable_vars = actor.trainable_vars
            for var in actor_trainable_vars:
                tf.summary.histogram(var.name, var)
            critic_trainable_vars = critic.trainable_vars
            for var in critic_trainable_vars:
                tf.summary.histogram(var.name, var)
            tf.summary.histogram("actions_out", agent.actor_tf)
            tf.summary.histogram("critic_out", agent.critic_tf)
            tf.summary.histogram("target_Q", agent.target_Q)
            summary_var = tf.summary.merge_all()
            writer_t = tf.summary.FileWriter(
                osp.join(logger.get_dir(), 'train'), sess.graph)
        else:
            summary_var = tf.no_op()

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        ## restore pretrained skills
        if kwargs['skillset']:
            my_skill_set.restore_skillset(sess=sess)
        ## restore current controller
        if kwargs["restore_dir"] is not None:
            restore_dir = osp.join(kwargs["restore_dir"], "model")
            if (restore_dir is not None) and rank == 0:
                print('Restore path : ', restore_dir)
                model_checkpoint_path = read_checkpoint_local(restore_dir)
                if model_checkpoint_path:
                    saver.restore(U.get_session(), model_checkpoint_path)
                    logger.info("checkpoint loaded:" +
                                str(model_checkpoint_path))
                    # Resume epoch counting from the checkpoint's global step.
                    tokens = model_checkpoint_path.split("-")[-1]
                    global_t = int(tokens)
                    print(">>> global step set:", global_t)

        agent.reset()
        obs = env.reset()

        # maintained across epochs
        episodes = 0
        t = 0
        start_time = time.time()

        # Stat containers, created once and cleared per epoch via a[:] = [].
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_actions = []
        epoch_actor_losses = []
        epoch_critic_losses = []
        if param_noise is not None:
            epoch_adaptive_distances = []
        eval_episode_rewards = []
        eval_episode_success = []

        # per-episode accumulators
        done = False
        episode_reward = 0.
        episode_step = 0

        ## containers for hierarchical hindsight
        if kwargs["her"]:
            logger.debug("-" * 50 + '\nWill create HER\n' + "-" * 50)
            # per episode
            states, pactions, sub_states = [], [], []

        print("Ready to go!")
        for epoch in range(global_t, nb_epochs):
            # clear stat containers
            epoch_episodes = 0.
            epoch_start_time = time.time()
            epoch_episode_rewards[:] = []
            epoch_episode_steps[:] = []
            epoch_actions[:] = []
            epoch_actor_losses[:] = []
            epoch_critic_losses[:] = []
            if param_noise is not None:
                epoch_adaptive_distances[:] = []
            eval_episode_rewards[:] = []
            eval_episode_success[:] = []

            for cycle in range(nb_epoch_cycles):
                # Perform rollouts; steps are split across MPI workers.
                for t_rollout in range(
                        int(nb_rollout_steps / MPI.COMM_WORLD.Get_size())):
                    # Predict next action: with probability given by the
                    # annealed schedule, use the look-ahead planner instead
                    # of the policy.
                    if kwargs['look_ahead'] and (np.random.rand(
                    ) < exploration.value(epoch * nb_epoch_cycles + cycle)):
                        paction, planner_info = \
                            look_ahead_planner.create_plan(obs)
                    else:
                        paction, _ = agent.pi(obs, apply_noise=True,
                                              compute_Q=True)

                    if my_skill_set:
                        ## break actions into primitives and their params
                        primitives_prob = paction[:kwargs['my_skill_set'].len]
                        primitive_id = np.argmax(primitives_prob)
                        r = 0.
                        skill_obs = obs.copy()
                        if kwargs['her']:
                            curr_sub_states = [skill_obs.copy()]
                        # Commit to the chosen skill for up to commit_for
                        # env steps (or until done / skill termination).
                        for _ in range(kwargs['commit_for']):
                            action = my_skill_set.pi(
                                primitive_id=primitive_id,
                                obs=skill_obs.copy(),
                                primitive_params=paction[my_skill_set.len:])
                            # Execute next action.
                            if rank == 0 and render:
                                sleep(0.1)
                                env.render()
                            assert max_action.shape == action.shape
                            new_obs, skill_r, done, info = env.step(
                                max_action * action
                            )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                            r += skill_r
                            if kwargs['her']:
                                curr_sub_states.append(new_obs.copy())
                            skill_obs = new_obs
                            if done or my_skill_set.termination(
                                    new_obs, primitive_id,
                                    primitive_params=paction[
                                        my_skill_set.len:]):
                                break
                        # assuming the skill is trained from different
                        # reward signal: keep only the last step's reward
                        r = skill_r
                    else:
                        action = paction
                        # Execute next action.
                        if rank == 0 and render:
                            env.render()
                        assert max_action.shape == action.shape
                        new_obs, r, done, info = env.step(
                            max_action * action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        assert action.shape == env.action_space.shape

                    t += 1
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(paction)
                    agent.store_transition(obs, paction, r, new_obs, done)
                    # storing info for hindsight
                    if kwargs['her']:
                        states.append(obs.copy())
                        pactions.append(paction.copy())
                        sub_states.append(curr_sub_states)
                    obs = new_obs

                    if done:
                        # Episode done: update stats and reset.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        epoch_episodes += 1
                        episodes += 1

                        episode_reward = 0.
                        episode_step = 0
                        agent.reset()
                        obs = env.reset()

                        if kwargs["her"]:
                            # create hindsight experience replay
                            if kwargs['skillset']:
                                her_states, her_rewards = \
                                    env.apply_hierarchical_hindsight(
                                        states, pactions, new_obs.copy(),
                                        sub_states)
                            else:
                                her_states, her_rewards = env.apply_hindsight(
                                    states, pactions, new_obs.copy())
                            ## store her transitions: her_states: n+1, her_rewards: n
                            for her_i in range(len(her_states) - 2):
                                agent.store_transition(her_states[her_i],
                                                       pactions[her_i],
                                                       her_rewards[her_i],
                                                       her_states[her_i + 1],
                                                       False)
                            # store the terminal transition
                            agent.store_transition(her_states[-2],
                                                   pactions[-1],
                                                   her_rewards[-1],
                                                   her_states[-1], True)
                            ## refresh the storage containers
                            states[:], pactions[:] = [], []
                            if kwargs['skillset']:
                                sub_states[:] = []

                # Train.
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if (memory.nb_entries >= batch_size) and (
                            t % param_noise_adaption_interval == 0) and (
                                param_noise is not None):
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al, current_summary = agent.train(summary_var)
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                    if dologging and tf_sum_logging and rank == 0:
                        writer_t.add_summary(
                            current_summary,
                            epoch * nb_epoch_cycles * nb_train_steps +
                            cycle * nb_train_steps + t_train)

            # Evaluate after training is done (rank 0 only).
            if (eval_env is not None) and rank == 0:
                for _ in range(nb_eval_episodes):
                    eval_episode_reward = 0.
                    eval_obs = eval_env.reset()
                    eval_obs_start = eval_obs.copy()
                    eval_done = False
                    while (not eval_done):
                        eval_paction, _ = agent.pi(eval_obs,
                                                   apply_noise=False,
                                                   compute_Q=False)

                        if kwargs['skillset']:
                            ## break actions into primitives and their params
                            eval_primitives_prob = eval_paction[:kwargs[
                                'my_skill_set'].len]
                            eval_primitive_id = np.argmax(eval_primitives_prob)

                            eval_r = 0.
                            eval_skill_obs = eval_obs.copy()
                            for _ in range(kwargs['commit_for']):
                                eval_action = my_skill_set.pi(
                                    primitive_id=eval_primitive_id,
                                    obs=eval_skill_obs.copy(),
                                    primitive_params=eval_paction[
                                        my_skill_set.len:])
                                eval_new_obs, eval_skill_r, eval_done, \
                                    eval_info = eval_env.step(
                                        max_action * eval_action
                                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                                if render_eval:
                                    eval_env.render()
                                eval_r += eval_skill_r
                                # check for skill termination or episode termination
                                eval_terminate_skill = \
                                    my_skill_set.termination(
                                        eval_new_obs, eval_primitive_id,
                                        primitive_params=eval_paction[
                                            my_skill_set.len:])
                                if eval_done or eval_terminate_skill:
                                    break
                                eval_skill_obs = eval_new_obs
                            # hack assuming the skills are trained from diff reward signal
                            eval_r = eval_skill_r
                        else:
                            # FIX: previously `eval_action, _ = eval_paction,
                            # eval_pq` — eval_pq was never defined (agent.pi
                            # was called with compute_Q=False), raising
                            # NameError on this path.
                            eval_action = eval_paction
                            eval_new_obs, eval_r, eval_done, eval_info = \
                                eval_env.step(max_action * eval_action)

                        eval_episode_reward += eval_r
                        eval_obs = eval_new_obs

                    eval_episode_rewards.append(eval_episode_reward)
                    eval_episode_rewards_history.append(eval_episode_reward)
                    eval_episode_success.append(
                        eval_info["done"] == "goal reached")
                    if (eval_info["done"] == "goal reached"):
                        logger.info(
                            "success, training epoch:%d,starting config:" %
                            epoch, eval_obs_start, 'final state', eval_obs)

            if dologging and rank == 0:
                print("Logging!")
                # Log stats.
                duration = time.time() - start_time
                stats = agent.get_stats()
                combined_stats = {}
                for key in sorted(stats.keys()):
                    combined_stats[key] = normal_mean(stats[key])

                # Rollout statistics.
                combined_stats['rollout/return'] = normal_mean(
                    epoch_episode_rewards)
                if len(episode_rewards_history) > 0:
                    combined_stats['rollout/return_history'] = normal_mean(
                        np.mean(episode_rewards_history))
                else:
                    combined_stats['rollout/return_history'] = 0.
                combined_stats['rollout/episode_steps'] = normal_mean(
                    epoch_episode_steps)
                combined_stats['rollout/episodes'] = np.sum(epoch_episodes)
                combined_stats['rollout/actions_mean'] = normal_mean(
                    epoch_actions)
                combined_stats['rollout/actions_std'] = normal_std(
                    epoch_actions)

                # Train statistics.
                combined_stats['train/loss_actor'] = normal_mean(
                    epoch_actor_losses)
                combined_stats['train/loss_critic'] = normal_mean(
                    epoch_critic_losses)
                if param_noise is not None:
                    combined_stats['train/param_noise_distance'] = \
                        normal_mean(epoch_adaptive_distances)
                if kwargs['look_ahead']:
                    combined_stats['train/exploration'] = exploration.value(
                        epoch * nb_epoch_cycles + cycle)

                # Evaluation statistics.
                if eval_env is not None:
                    combined_stats['eval/return'] = normal_mean(
                        eval_episode_rewards)
                    combined_stats['eval/success'] = normal_mean(
                        eval_episode_success)
                    if len(eval_episode_rewards_history) > 0:
                        combined_stats['eval/return_history'] = normal_mean(
                            np.mean(eval_episode_rewards_history))
                    else:
                        combined_stats['eval/return_history'] = 0.
                    combined_stats['eval/episodes'] = normal_mean(
                        len(eval_episode_rewards))

                # Total statistics.
                combined_stats['total/duration'] = normal_mean(duration)
                combined_stats['total/rollout_per_second'] = normal_mean(
                    float(t) / float(duration))
                combined_stats['total/episodes'] = normal_mean(episodes)
                combined_stats['total/epochs'] = epoch + 1
                combined_stats['total/steps'] = t

                for key in sorted(combined_stats.keys()):
                    logger.record_tabular(key, combined_stats[key])
                logger.dump_tabular()
                logger.info('')

            logdir = logger.get_dir()

            ## save tf model
            if rank == 0 and (epoch + 1) % save_freq == 0:
                print("Saving the model!")
                os.makedirs(osp.join(logdir, "model"), exist_ok=True)
                saver.save(U.get_session(), logdir + "/model/ddpg",
                           global_step=epoch)
def __init__(self, actor, critic, additional_critic, memory,
             observation_shape, action_shape, param_noise=None,
             action_noise=None, gamma=0.99, tau=0.001,
             normalize_returns=False, enable_popart=False,
             normalize_observations=True, batch_size=128,
             observation_range=(-5., 5.), action_range=(-1., 1.),
             return_range=(-np.inf, np.inf), adaptive_param_noise=True,
             adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0.,
             actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.,
             actor_reg=True, select_action=False, skillset=None):
    """Parameterized DDPG agent with twin critics.

    Builds the full TF graph: placeholders, running-stat normalizers,
    online and target actor/critic networks, the clipped double-Q target
    (elementwise min of the two target critics), and the optimizer /
    stats / target-update machinery.

    Args:
        actor, critic, additional_critic: network builder objects; the two
            critics form a twin-critic (min) pair.
        memory: replay buffer.
        observation_shape, action_shape: tuples describing input tensors.
        param_noise / action_noise: optional exploration noise objects.
        gamma, tau: discount factor and Polyak coefficient.
        select_action: if True, the agent additionally builds a skill
            selection matrix and a temperature placeholder, and the actor
            output is masked through choose_actions.
        skillset: required when select_action is True; provides len,
            num_params and per-skill parameter offsets.
    """
    logger.debug("Parameterized DDPG params")
    logger.debug(str(locals()))
    logger.debug("-" * 20)

    # Inputs.
    self.obs0 = tf.placeholder(tf.float32,
                               shape=(None, ) + observation_shape,
                               name='obs0')
    self.obs1 = tf.placeholder(tf.float32,
                               shape=(None, ) + observation_shape,
                               name='obs1')
    self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1),
                                     name='terminals1')
    self.rewards = tf.placeholder(tf.float32, shape=(None, 1),
                                  name='rewards')
    self.actions = tf.placeholder(tf.float32,
                                  shape=(None, ) + action_shape,
                                  name='actions')
    self.critic_target = tf.placeholder(tf.float32, shape=(None, 1),
                                        name='critic_target')
    self.param_noise_stddev = tf.placeholder(tf.float32, shape=(),
                                             name='param_noise_stddev')
    if select_action:
        # Softmax temperature for the discrete skill-selection head.
        self.temperature = tf.placeholder(tf.float32, shape=(),
                                          name='temperature')

    # Parameters.
    self.gamma = gamma
    self.tau = tau
    self.memory = memory
    self.normalize_observations = normalize_observations
    self.normalize_returns = normalize_returns
    self.action_noise = action_noise
    self.param_noise = param_noise
    self.action_range = action_range
    self.return_range = return_range
    self.observation_range = observation_range
    self.critic = critic
    self.critic1 = additional_critic
    self.actor = actor
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr
    self.clip_norm = clip_norm
    self.enable_popart = enable_popart
    self.reward_scale = reward_scale
    self.batch_size = batch_size
    self.stats_sample = None
    self.critic_l2_reg = critic_l2_reg
    self.select_action = select_action
    self.actor_reg = actor_reg

    # Observation normalization.
    if self.normalize_observations:
        with tf.variable_scope('obs_rms'):
            self.obs_rms = RunningMeanStd(shape=observation_shape)
    else:
        self.obs_rms = None
    normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                       self.observation_range[0],
                                       self.observation_range[1])
    normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                       self.observation_range[0],
                                       self.observation_range[1])

    # Return normalization.
    if self.normalize_returns:
        with tf.variable_scope('ret_rms'):
            self.ret_rms = RunningMeanStd()
    else:
        self.ret_rms = None

    # action selection constant
    if self.select_action:
        self.skillset = skillset
        # W_select[i, j] is 1 where output column j belongs to skill i:
        # its own selection logit plus its own parameter slice.
        W_select = np.zeros(
            (skillset.len, skillset.num_params + skillset.len))
        W_select[:skillset.len, :skillset.len] = 1.
        for i in range(skillset.len):
            starting_idx = skillset.params_start_idx[i] + skillset.len
            ending_idx = starting_idx + skillset.skillset[i].num_params
            W_select[i, starting_idx:ending_idx] = 1.
        print("Selection matrix:%r" % W_select)
        self.W_select = tf.constant(W_select, dtype=tf.float32,
                                    name="selection_mat")

    # Create target networks.
    target_actor = copy(actor)
    target_actor.name = 'target_actor'
    self.target_actor = target_actor
    target_critic = copy(critic)
    target_critic.name = 'target_critic'
    self.target_critic = target_critic
    target_critic1 = copy(additional_critic)
    target_critic1.name = 'target_critic1'
    self.target_critic1 = target_critic1

    # Create networks and core TF parts that are shared across setup parts.
    target_actor_prediction_next_state_tf = target_actor(normalized_obs1)

    # critic
    self.normalized_critic_tf = critic(normalized_obs0, self.actions)
    self.critic_tf = denormalize(
        tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                         self.return_range[1]), self.ret_rms)
    # additional_critic
    self.normalized_critic_tf1 = additional_critic(normalized_obs0,
                                                   self.actions)
    self.critic_tf1 = denormalize(
        tf.clip_by_value(self.normalized_critic_tf1, self.return_range[0],
                         self.return_range[1]), self.ret_rms)

    # Next-state Q from each target critic; with select_action the target
    # actor's output is first masked to the chosen skill's columns.
    if self.select_action:
        Q_obs1_0 = denormalize(
            target_critic(
                normalized_obs1,
                choose_actions(target_actor_prediction_next_state_tf,
                               skillset, self.W_select)), self.ret_rms)
        Q_obs1_1 = denormalize(
            target_critic1(
                normalized_obs1,
                choose_actions(target_actor_prediction_next_state_tf,
                               skillset, self.W_select)), self.ret_rms)
    else:
        Q_obs1_0 = denormalize(
            target_critic(normalized_obs1,
                          target_actor_prediction_next_state_tf),
            self.ret_rms)
        Q_obs1_1 = denormalize(
            target_critic1(normalized_obs1,
                           target_actor_prediction_next_state_tf),
            self.ret_rms)

    # Clipped double-Q: take the elementwise minimum of the two targets.
    Q_obs1 = tf.minimum(Q_obs1_0, Q_obs1_1)
    self.target_Q = self.rewards + (1. -
                                    self.terminals1) * gamma * Q_obs1
    # clip the target Q value to [-1/(1-gamma), 0]
    self.target_Q = tf.clip_by_value(self.target_Q, -1 / (1 - gamma), 0)

    if self.select_action:
        self.actor_with_all_params_tf = actor(obs=normalized_obs0,
                                              temperature=self.temperature)
        # create np and then convert to tf.constant
        # _, selection_mask = choose_actions(self.actor_with_all_params_tf, skillset, self.W_select, True)
        # actor_tf_clone_with_chosen_action = grad_manipulation_op.py_func(grad_manipulation_op.my_identity_func, [self.actor_with_all_params_tf, selection_mask], self.actor_with_all_params_tf.dtype, name="MyIdentity", grad=grad_manipulation_op._custom_identity_grad)
        # in backward pass discrete action for selection will be used as obtained using forward run
        actor_tf_clone_with_chosen_action = choose_actions(
            self.actor_with_all_params_tf, skillset, self.W_select, False)
        self.actor_tf = tf.reshape(actor_tf_clone_with_chosen_action,
                                   tf.shape(self.actor_with_all_params_tf))
    else:
        self.actor_tf = actor(normalized_obs0)

    # Critic evaluated at the actor's own action (reuses critic weights).
    self.normalized_critic_with_actor_tf = critic(normalized_obs0,
                                                  self.actor_tf,
                                                  reuse=True)
    self.critic_with_actor_tf = denormalize(
        tf.clip_by_value(self.normalized_critic_with_actor_tf,
                         self.return_range[0], self.return_range[1]),
        self.ret_rms)

    # Set up parts.
    if self.param_noise is not None:
        self.setup_param_noise(normalized_obs0)
    self.setup_actor_optimizer()
    self.setup_critic_optimizer()
    if self.normalize_returns and self.enable_popart:
        self.setup_popart()
    self.setup_stats()
    self.setup_target_network_updates()
def __init__(self, actor, critic, additional_critic, memory,
             observation_shape, action_shape, param_noise=None,
             action_noise=None, gamma=0.99, tau=0.001,
             normalize_returns=False, enable_popart=False,
             normalize_observations=True, batch_size=128,
             observation_range=(-5., 5.), action_range=(-1., 1.),
             return_range=(-np.inf, np.inf), adaptive_param_noise=True,
             adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0.,
             actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.,
             inverting_grad=False, actor_reg=True):
    """Build the CDQ agent's TensorFlow graph (clipped-double-Q DDPG variant).

    Constructs placeholders, optional observation/return normalizers, target
    copies of the actor and of BOTH critics, the min-of-two-critics TD target,
    and then delegates optimizer/statistics/target-update setup to the
    ``setup_*`` methods defined elsewhere on this class.

    Args:
        actor: callable policy network; ``actor(obs)`` returns an action tensor.
        critic: first Q network; ``critic(obs, act, reuse=...)`` returns
            normalized Q values.
        additional_critic: second Q network used for the clipped-double-Q
            minimum (stored as ``self.critic1``).
        memory: replay buffer (used later by training methods, not here).
        observation_shape / action_shape: tuples appended to the batch
            dimension when creating placeholders.
        param_noise / action_noise: exploration-noise objects, may be None.
        gamma: discount factor; also fixes the target-Q clipping range below.
        tau: soft target-update coefficient (consumed by setup methods).
        normalize_returns / enable_popart / normalize_observations: switches
            for the RunningMeanStd normalizers and pop-art return scaling.
        observation_range / action_range / return_range: clipping intervals.
        adaptive_param_noise, adaptive_param_noise_policy_threshold: accepted
            but not referenced in this constructor body.
            # NOTE(review): presumably consumed via locals()/elsewhere — confirm.
        critic_l2_reg, actor_lr, critic_lr, clip_norm, reward_scale: optimizer
            hyper-parameters stored for the setup methods.
        inverting_grad: if True, wrap the actor output in a custom identity op
            whose backward pass inverts gradients at the action bounds.
        actor_reg: actor-regularization flag stored for later setup.
    """
    logger.debug("CDQ params")
    logger.debug(str(locals()))
    logger.debug("-" * 20)

    # Inputs: batch-shaped placeholders for both ends of a transition.
    self.obs0 = tf.placeholder(tf.float32,
                               shape=(None, ) + observation_shape,
                               name='obs0')
    self.obs1 = tf.placeholder(tf.float32,
                               shape=(None, ) + observation_shape,
                               name='obs1')
    self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1),
                                     name='terminals1')
    self.rewards = tf.placeholder(tf.float32, shape=(None, 1),
                                  name='rewards')
    self.actions = tf.placeholder(tf.float32,
                                  shape=(None, ) + action_shape,
                                  name='actions')
    # critic_target is created here but not used in this constructor body;
    # presumably fed by the critic optimizer setup — confirm in setup_critic_optimizer.
    self.critic_target = tf.placeholder(tf.float32, shape=(None, 1),
                                        name='critic_target')
    self.param_noise_stddev = tf.placeholder(tf.float32, shape=(),
                                             name='param_noise_stddev')

    # Parameters: stash every hyper-parameter on self for the setup methods.
    self.gamma = gamma
    self.tau = tau
    self.memory = memory
    self.normalize_observations = normalize_observations
    self.normalize_returns = normalize_returns
    self.action_noise = action_noise
    self.param_noise = param_noise
    self.action_range = action_range
    self.return_range = return_range
    self.observation_range = observation_range
    self.critic = critic
    # the second critic of the clipped-double-Q pair
    self.critic1 = additional_critic
    self.actor = actor
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr
    self.clip_norm = clip_norm
    self.enable_popart = enable_popart
    self.reward_scale = reward_scale
    self.batch_size = batch_size
    self.stats_sample = None
    self.critic_l2_reg = critic_l2_reg
    self.inverting_grad = inverting_grad
    self.actor_reg = actor_reg

    # Observation normalization: running mean/std shared by obs0 and obs1,
    # with the normalized result clipped into observation_range.
    if self.normalize_observations:
        with tf.variable_scope('obs_rms'):
            self.obs_rms = RunningMeanStd(shape=observation_shape)
    else:
        self.obs_rms = None
    normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                       self.observation_range[0],
                                       self.observation_range[1])
    normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                       self.observation_range[0],
                                       self.observation_range[1])

    # Return normalization (for pop-art style critic output scaling).
    if self.normalize_returns:
        with tf.variable_scope('ret_rms'):
            self.ret_rms = RunningMeanStd()
    else:
        self.ret_rms = None

    # Create target networks: structural copies whose renamed scopes give
    # them their own variables; soft-updated via setup_target_network_updates.
    target_actor = copy(actor)
    target_actor.name = 'target_actor'
    self.target_actor = target_actor
    # creating target critic networks (one per critic of the pair)
    target_critic = copy(critic)
    target_critic.name = 'target_critic'
    self.target_critic = target_critic
    target_critic1 = copy(additional_critic)
    target_critic1.name = 'target_critic1'
    self.target_critic1 = target_critic1

    # Create networks and core TF parts that are shared across setup parts.
    # Target-policy action at the next state, reused by both target critics.
    target_actor_prediction_next_state_tf = target_actor(normalized_obs1)

    # for critic: Q(s0, a) on replay actions, denormalized into return units.
    self.normalized_critic_tf = critic(normalized_obs0, self.actions)
    self.critic_tf = denormalize(
        tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                         self.return_range[1]), self.ret_rms)
    Q_obs1_0 = denormalize(
        target_critic(normalized_obs1,
                      target_actor_prediction_next_state_tf), self.ret_rms)

    # for critic1: same wiring through the second critic.
    self.normalized_critic_tf1 = additional_critic(normalized_obs0,
                                                   self.actions)
    self.critic_tf1 = denormalize(
        tf.clip_by_value(self.normalized_critic_tf1, self.return_range[0],
                         self.return_range[1]), self.ret_rms)
    Q_obs1_1 = denormalize(
        target_critic1(normalized_obs1,
                       target_actor_prediction_next_state_tf), self.ret_rms)

    # Clipped double-Q: take the pessimistic (element-wise minimum) estimate.
    Q_obs1 = tf.minimum(Q_obs1_0, Q_obs1_1)

    # same target is used for both the critic.
    self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1
    # clip the target Q value
    # NOTE(review): the [-1/(1-gamma), 0] range assumes per-step rewards in
    # [-1, 0] (goal-reaching cost formulation) — confirm against the envs used.
    self.target_Q = tf.clip_by_value(self.target_Q, -1 / (1 - gamma), 0)

    # Actor head: on-policy action at s0, optionally wrapped so the backward
    # pass inverts gradients at the action bounds (-1, 1).
    self.actor_tf = actor(normalized_obs0)
    if inverting_grad:
        actor_tf_clone_with_invert_grad = my_op.py_func(
            my_op.my_identity_func, [self.actor_tf, -1., 1.],
            self.actor_tf.dtype,
            name="MyIdentity",
            grad=my_op._custom_identity_grad)
        # reshape restores the static shape lost through py_func
        self.actor_tf = tf.reshape(actor_tf_clone_with_invert_grad,
                                   tf.shape(self.actor_tf))
    # Q(s0, pi(s0)) reuses the first critic's variables (reuse=True).
    self.normalized_critic_with_actor_tf = critic(normalized_obs0,
                                                  self.actor_tf,
                                                  reuse=True)
    self.critic_with_actor_tf = denormalize(
        tf.clip_by_value(self.normalized_critic_with_actor_tf,
                         self.return_range[0], self.return_range[1]),
        self.ret_rms)

    # Set up parts.
    if self.param_noise is not None:
        self.setup_param_noise(normalized_obs0)
    self.setup_actor_optimizer()
    self.setup_critic_optimizer()
    if self.normalize_returns and self.enable_popart:
        self.setup_popart()
    self.setup_stats()
    self.setup_target_network_updates()
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render,
          param_noise, actor, critic, additional_critic, normalize_returns,
          normalize_observations, critic_l2_reg, actor_lr, critic_lr,
          action_noise, popart, gamma, clip_norm, nb_train_steps,
          nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.05,
          eval_env=None, param_noise_adaption_interval=50,
          nb_eval_episodes=20, **kwargs):
    """Run the MPI-parallel CDQ training loop.

    Builds a CDQ agent, then for each epoch runs ``nb_epoch_cycles`` of
    (rollout collection -> optional HER relabelling -> ``nb_train_steps``
    gradient steps), periodically evaluates on ``eval_env`` (rank 0 only),
    logs statistics, and checkpoints the TF session every
    ``kwargs["save_freq"]`` epochs.

    Args:
        env: training environment; its action space must be symmetric
            (asserted below) so actions can be scaled by ``max_action``.
        nb_epochs / nb_epoch_cycles / nb_train_steps / nb_rollout_steps:
            loop sizes; rollout steps are divided across MPI workers.
        nb_eval_steps: accepted but not referenced in this body.
        eval_env: optional evaluation environment (episodes run to ``done``).
        tau, gamma, ...: forwarded to the CDQ constructor.
        **kwargs: recognized keys — "dologging", "tf_sum_logging",
            "invert_grad", "actor_reg" (all optional), plus required
            "restore_dir", "her", and (on saving ranks) "save_freq".
    """
    rank = MPI.COMM_WORLD.Get_rank()
    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    # Optional behavior flags pulled out of kwargs with defaults.
    if "dologging" in kwargs:
        dologging = kwargs["dologging"]
    else:
        dologging = True
    if "tf_sum_logging" in kwargs:
        tf_sum_logging = kwargs["tf_sum_logging"]
    else:
        tf_sum_logging = False
    if "invert_grad" in kwargs:
        invert_grad = kwargs["invert_grad"]
    else:
        invert_grad = False
    if "actor_reg" in kwargs:
        actor_reg = kwargs["actor_reg"]
    else:
        actor_reg = False
    if dologging:
        logger.info(
            'scaling actions by {} before executing in env'.format(
                max_action))
    # Construct the agent graph (see CDQ.__init__ for details).
    agent = CDQ(actor,
                critic,
                additional_critic,
                memory,
                env.observation_space.shape,
                env.action_space.shape,
                gamma=gamma,
                tau=tau,
                normalize_returns=normalize_returns,
                normalize_observations=normalize_observations,
                batch_size=batch_size,
                action_noise=action_noise,
                param_noise=param_noise,
                critic_l2_reg=critic_l2_reg,
                actor_lr=actor_lr,
                critic_lr=critic_lr,
                enable_popart=popart,
                clip_norm=clip_norm,
                reward_scale=reward_scale,
                inverting_grad=invert_grad,
                actor_reg=actor_reg)
    if dologging: logger.debug('Using agent with the following configuration:')
    if dologging: logger.debug(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    # NOTE(review): `rank != -1` is always True, so every rank builds a Saver
    # and the `saver = None` branch is dead — confirm whether `rank == 0`
    # was intended.
    if rank != -1:
        saver = tf.train.Saver(keep_checkpoint_every_n_hours=2,
                               max_to_keep=5,
                               save_relative_paths=True)
        save_freq = kwargs["save_freq"]
    else:
        saver = None

    # step = 0
    global_t = 0  # resume point (epoch index), overwritten on restore below
    episode = 0   # NOTE(review): never read afterwards
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Set summary saver: TensorBoard histograms only on rank 0 when asked.
        if dologging and tf_sum_logging and rank == 0:
            tf.summary.histogram("actor_grads", agent.actor_grads)
            tf.summary.histogram("critic_grads", agent.critic_grads)
            actor_trainable_vars = actor.trainable_vars
            for var in actor_trainable_vars:
                tf.summary.histogram(var.name, var)
            critic_trainable_vars = critic.trainable_vars
            for var in critic_trainable_vars:
                tf.summary.histogram(var.name, var)
            tf.summary.histogram("actions_out", agent.actor_tf)
            tf.summary.histogram("critic_out", agent.critic_tf)
            tf.summary.histogram("target_Q", agent.target_Q)
            summary_var = tf.summary.merge_all()
            writer_t = tf.summary.FileWriter(
                osp.join(logger.get_dir(), 'train'), sess.graph)
        else:
            # no-op stands in for the merged summary so agent.train's
            # signature stays the same either way
            summary_var = tf.no_op()

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()  # any later graph mutation becomes an error
        #set_trace()

        ## restore: load the latest checkpoint and resume the epoch counter
        ## from the step suffix of its filename.
        if kwargs["restore_dir"] is not None:
            restore_dir = osp.join(kwargs["restore_dir"], "model")
            if (restore_dir is not None):
                print('Restore path : ', restore_dir)
                # checkpoint = tf.train.get_checkpoint_state(restore_dir)
                # if checkpoint and checkpoint.model_checkpoint_path:
                model_checkpoint_path = read_checkpoint_local(restore_dir)
                if model_checkpoint_path:
                    saver.restore(U.get_session(), model_checkpoint_path)
                    print("checkpoint loaded:", model_checkpoint_path)
                    logger.info("checkpoint loaded:" +
                                str(model_checkpoint_path))
                    # checkpoint paths end in "-<global_step>"
                    tokens = model_checkpoint_path.split("-")[-1]
                    # set global step
                    global_t = int(tokens)
                    print(">>> global step set:", global_t)

        agent.reset()
        obs = env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0
        epoch = 0
        start_time = time.time()
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        ## containers for hindsight
        # NOTE(review): states/actions only exist when kwargs["her"] is
        # truthy, yet they are appended to unconditionally in the rollout
        # loop below — NameError if her is False. Confirm her is always set.
        if kwargs["her"]:
            # logger.info("-"*50 +'\nWill create HER\n' + "-"*50)
            states, actions = [], []
        print("Ready to go!")
        for epoch in range(global_t, nb_epochs):
            # stat containers, reset each epoch
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            eval_episode_rewards = []
            eval_qs = []
            eval_episode_success = []
            for cycle in range(nb_epoch_cycles):
                # print("cycle:%d"%cycle)
                # Perform rollouts. Each worker does its share of the steps.
                for t_rollout in range(
                        int(nb_rollout_steps / MPI.COMM_WORLD.Get_size())):
                    # print(rank, t_rollout)
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    #if((t+1)%100) == 0:
                    #    print(max_action*action, new_obs, r)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                        sleep(0.1)
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    ## storing info for hindsight
                    states.append(obs.copy())
                    actions.append(action.copy())
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        if kwargs["her"]:
                            # logger.info("-"*50 +'\nCreating HER\n' + "-"*50)
                            ## create hindsight experience replay: relabel the
                            ## trajectory against the final achieved state.
                            her_states, her_rewards = env.env.apply_hindsight(
                                states, actions, new_obs.copy())

                            ## store her transitions: her_states: n+1, her_rewards: n
                            for her_i in range(len(her_states) - 2):
                                agent.store_transition(her_states[her_i],
                                                       actions[her_i],
                                                       her_rewards[her_i],
                                                       her_states[her_i + 1],
                                                       False)
                            #store last transition (terminal=True)
                            agent.store_transition(her_states[-2], actions[-1],
                                                   her_rewards[-1],
                                                   her_states[-1], True)

                            ## refresh the storage containers
                            del states, actions
                            states, actions = [], []

                        agent.reset()
                        obs = env.reset()
                        #print(obs)

                # print(rank, "Training!")
                # Train.
                for t_train in range(nb_train_steps):
                    # print(rank, t_train)
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al, current_summary = agent.train(summary_var)
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()  # soft target update (tau)
                    if dologging and tf_sum_logging and rank == 0:
                        # global step index across epochs/cycles/train-steps
                        writer_t.add_summary(
                            current_summary,
                            epoch * nb_epoch_cycles * nb_train_steps +
                            cycle * nb_train_steps + t_train)

                # print("Evaluating!")
                # Evaluate. Deterministic policy (no noise), rank 0 only.
                if (eval_env is not None) and rank == 0:
                    for _ in range(nb_eval_episodes):
                        eval_episode_reward = 0.
                        eval_obs = eval_env.reset()
                        eval_obs_start = eval_obs.copy()
                        eval_done = False
                        while (not eval_done):
                            eval_action, eval_q = agent.pi(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                            eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                                max_action * eval_action
                            )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                            if render_eval:
                                sleep(0.1)
                                print("Render!")
                                eval_env.render()
                                print("rendered!")
                            eval_episode_reward += eval_r
                            eval_qs.append(eval_q)
                        eval_episode_rewards.append(eval_episode_reward)
                        eval_episode_rewards_history.append(
                            eval_episode_reward)
                        # env reports success via info["done"] string
                        eval_episode_success.append(
                            eval_info["done"] == "goal reached")
                        if (eval_info["done"] == "goal reached"):
                            logger.info(
                                "success, training epoch:%d,starting config:"
                                % epoch, eval_obs_start, 'final state',
                                eval_obs)

            if dologging and rank == 0:
                print("Logging!")
                # Log stats. Means reduced per-key via normal_mean/normal_std.
                epoch_train_duration = time.time() - epoch_start_time  # NOTE(review): unused
                duration = time.time() - start_time
                stats = agent.get_stats()
                combined_stats = {}
                for key in sorted(stats.keys()):
                    combined_stats[key] = normal_mean(stats[key])

                # Rollout statistics.
                combined_stats['rollout/return'] = normal_mean(
                    epoch_episode_rewards)
                if len(episode_rewards_history) > 0:
                    combined_stats['rollout/return_history'] = normal_mean(
                        np.mean(episode_rewards_history))
                else:
                    combined_stats['rollout/return_history'] = 0.
                combined_stats['rollout/episode_steps'] = normal_mean(
                    epoch_episode_steps)
                combined_stats['rollout/episodes'] = np.sum(epoch_episodes)
                combined_stats['rollout/actions_mean'] = normal_mean(
                    epoch_actions)
                combined_stats['rollout/actions_std'] = normal_std(
                    epoch_actions)
                combined_stats['rollout/Q_mean'] = normal_mean(epoch_qs)

                # Train statistics.
                combined_stats['train/loss_actor'] = normal_mean(
                    epoch_actor_losses)
                combined_stats['train/loss_critic'] = normal_mean(
                    epoch_critic_losses)
                combined_stats['train/param_noise_distance'] = normal_mean(
                    epoch_adaptive_distances)

                # Evaluation statistics.
                if eval_env is not None:
                    combined_stats['eval/return'] = normal_mean(
                        eval_episode_rewards)
                    combined_stats['eval/success'] = normal_mean(
                        eval_episode_success)
                    if len(eval_episode_rewards_history) > 0:
                        combined_stats['eval/return_history'] = normal_mean(
                            np.mean(eval_episode_rewards_history))
                    else:
                        combined_stats['eval/return_history'] = 0.
                    combined_stats['eval/Q'] = normal_mean(eval_qs)
                    combined_stats['eval/episodes'] = normal_mean(
                        len(eval_episode_rewards))

                # Total statistics.
                combined_stats['total/duration'] = normal_mean(duration)
                combined_stats['total/steps_per_second'] = normal_mean(
                    float(t) / float(duration))
                combined_stats['total/episodes'] = normal_mean(episodes)
                combined_stats['total/epochs'] = epoch + 1
                combined_stats['total/steps'] = t

                for key in sorted(combined_stats.keys()):
                    logger.record_tabular(key, combined_stats[key])
                logger.dump_tabular()
                logger.info('')
                logdir = logger.get_dir()
                # Persist env state (if the env supports it) for resumability.
                if rank == 0 and logdir:
                    print("Dumping progress!")
                    if hasattr(env, 'get_state'):
                        with open(os.path.join(logdir, 'env_state.pkl'),
                                  'wb') as f:
                            pickle.dump(env.get_state(), f)
                    if eval_env and hasattr(eval_env, 'get_state'):
                        with open(os.path.join(logdir, 'eval_env_state.pkl'),
                                  'wb') as f:
                            pickle.dump(eval_env.get_state(), f)

            ## save tf model
            if rank == 0 and (epoch + 1) % save_freq == 0:
                print("Saving the model!")
                os.makedirs(osp.join(logdir, "model"), exist_ok=True)
                saver.save(U.get_session(),
                           logdir + "/model/cdq",
                           global_step=epoch)