def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor,
          critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr,
          action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps,
          batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    # Scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]).
                    new_obs, r, done, info = env.step(max_action * action)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        # Scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]).
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()

            # Log stats.
            # XXX shouldn't call np.mean on variable-length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
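# Hypothetical usage sketch (not part of the original source): one way the train()
# above could be invoked, assuming OpenAI-baselines-style companion modules
# (baselines.ddpg.models, baselines.ddpg.memory, baselines.ddpg.noise) are importable.
# The environment name, hyperparameter values, and the run_example() wrapper are
# illustrative assumptions, not the repository's actual launcher.
import gym
import numpy as np
from baselines.ddpg.models import Actor, Critic
from baselines.ddpg.memory import Memory
from baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise


def run_example():
    env = gym.make('Pendulum-v0')
    nb_actions = env.action_space.shape[-1]
    # Replay buffer sized to the environment's observation/action shapes.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    actor = Actor(nb_actions, layer_norm=True)
    critic = Critic(layer_norm=True)
    # Ornstein-Uhlenbeck exploration noise on the normalized action in [-1, 1].
    action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                sigma=0.2 * np.ones(nb_actions))
    train(env=env, nb_epochs=500, nb_epoch_cycles=20, render_eval=False,
          reward_scale=1.0, render=False, param_noise=None, actor=actor,
          critic=critic, normalize_returns=False, normalize_observations=True,
          critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3,
          action_noise=action_noise, popart=False, gamma=0.99, clip_norm=None,
          nb_train_steps=50, nb_rollout_steps=100, nb_eval_steps=100,
          batch_size=64, memory=memory)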
class FCMADRL:
    def __init__(self):
        self.observation_space = SA_OBS_SPACE
        self.action_space = SA_ACTION_SPACE
        # self.agent = agent
        self.agent = DDPG(env, is_batch_norm, CA_OBS_SPACE, CA_ACTION_SPACE, CA_ACTION_BOUND)
        self.dqn_solver = DQNSolver(SA_OBS_SPACE, SA_ACTION_SPACE)
        logging.basicConfig(filename="logs/log.log", format='%(asctime)s %(message)s', filemode='w+')
        self.logger = logging.getLogger()
        self.logger.setLevel(logging.DEBUG)

    def use_existing_dqn(self, dqn_model):
        self.dqn_solver.model = dqn_model

    def get_ddpg(self):
        return self.agent

    def get_dqn(self):
        return self.dqn_solver

    def get_dqn_model(self, dqn_solver):
        return dqn_solver.model

    def ca_step(self, action):
        """ca_step() is just for testing purposes."""
        return (np.random.choice(SA_ACTION_SPACE, CA_OBS_SPACE), np.random.choice(10),
                np.random.choice([True, False]), {})

    def sa_state(self, x, obs, i):
        """Merge the two states received by an individual agent (one from the central
        agent and one from the environment) into one vector."""
        one = x
        two = obs[i]
        three = np.array([i])
        f = np.append(one, two)
        f = np.append(f, three)
        return f

    def fcmadrl(self):
        # Randomly initialize critic, actor, target critic, target actor networks and the replay buffer.
        exploration_noise = OUNoise(CA_ACTION_SPACE)
        counter = 0
        reward_per_episode = 0
        total_reward = 0
        num_states = CA_OBS_SPACE
        num_actions = CA_ACTION_SPACE

        self.logger.debug("Number of States:" + str(num_states))
        self.logger.debug("Number of Actions:" + str(num_actions))
        self.logger.debug("Number of Steps per episode:" + str(steps))

        # saving reward:
        reward_st = np.array([0])

        score_logger = ScoreLogger(ENV_NAME)
        # run = 0

        for i in xrange(episodes):
            print "==== Starting episode no:", i, "====", "\n"
            # observation = env.reset()
            observation = ca_reset()
            reward_per_episode = 0
            # run += 1
            obs = env.reset()
            # step = 0
            for t in xrange(steps):
                # rendering environment (optional)
                # env.render()
                print "Step: ", t
                x_arr = []
                observation_arr = []
                action_arr = []
                action_n = []
                state_arr = []
                next_state_arr = []
                action_n_arr = []

                for z in range(env.n):
                    self.take_action(action_arr, action_n, action_n_arr, exploration_noise,
                                     num_states, obs, observation, observation_arr, state_arr,
                                     x_arr, z)

                next_obs, reward_n, done_n, info_n = env.step(action_n)
                reward = reward_n[0]
                done = all(done_n)
                print "Reward_n: ", reward_n

                self.update_next_state(action_arr, next_obs, next_state_arr)
                self.memory_store(action_arr, action_n_arr, done_n, next_state_arr,
                                  observation_arr, reward_n, state_arr, x_arr)
                obs = next_obs

                # train critic and actor network
                if counter > 64:
                    self.agent.train()
                reward_per_episode += reward
                counter += 1

                # check if episode ends:
                if done or (t == steps - 1):
                    print 'EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode
                    print "Printing reward to file"
                    exploration_noise.reset()  # reinitializing random noise for action exploration
                    reward_st = np.append(reward_st, reward_per_episode)
                    np.savetxt('rewards/episode_reward.txt', reward_st, newline="\n")
                    print "Run: " + str(i) + ", exploration: " + str(self.dqn_solver.exploration_rate) + ", score: " + str(reward_per_episode / t)
                    score_logger.add_score(reward_per_episode / t, i)
                    print '\n\n'
                    break

                self.dqn_solver.experience_replay()

            if (i % CHECKPOINT == 0):
                self.dqn_solver.save_dqn_model(i)
            total_reward += reward_per_episode

        print "Average reward per episode {}".format(total_reward / episodes)
        return total_reward

    def update_next_state(self, action_arr, next_obs, next_state_arr):
        for z in range(env.n):
            ns = self.sa_state(action_arr[z], next_obs, z)
            ns = np.reshape(ns, [1, self.observation_space])
            next_state_arr.append(ns)

    def take_action(self, action_arr, action_n, action_n_arr, exploration_noise, num_states, obs,
                    observation, observation_arr, state_arr, x_arr, z):
        action = self.get_message(action_arr, exploration_noise, num_states, observation, x_arr)
        state = self.sa_state(action, obs, z)
        state = np.reshape(state, [1, self.observation_space])
        state_arr.append(state)
        act = self.get_final_action(action_n, action_n_arr, state)
        self.logger.debug("SA_Action: " + str(act))
        # print "CA State: ", x
        # print "CA Action: ", action
        # print "SA State: ", state
        # print "SA Action: ", act
        observation[z] = act
        observation_arr.append(np.array(list(observation)))

    def memory_store(self, action_arr, action_n_arr, done_n, next_state_arr, observation_arr,
                     reward_n, state_arr, x_arr):
        for z in range(env.n):
            # add s_t, s_t+1, action, reward to experience memory
            # print x_arr[z], observation_arr[z], action_arr[z], reward_n[z], done_n[z]
            self.agent.add_experience(x_arr[z], observation_arr[z], action_arr[z], reward_n[z],
                                      done_n[z])
            self.dqn_solver.remember(state_arr[z], action_n_arr[z], reward_n[z], next_state_arr[z],
                                     done_n[z])

    def get_final_action(self, action_n, action_n_arr, state):
        act = self.dqn_solver.act(state)
        a = np.zeros(SA_ACTION_SPACE)
        a[act] = 1.0
        action_n.append(a)
        action_n_arr.append(act)
        return act

    def get_message(self, action_arr, exploration_noise, num_states, observation, x_arr):
        x = observation
        # x_arr.append(x)
        x_arr.append(np.array(list(x)))
        action = self.agent.evaluate_actor(np.reshape(x, [1, num_states]))
        noise = exploration_noise.noise()
        action = action[0] + noise  # Select action according to current policy and exploration noise
        action_arr.append(action)
        self.logger.debug("Action at Step: " + str(action))
        # print "Action at step", t, " :", action, "\n"
        return action
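# Hypothetical usage sketch (not part of the original source): how the FCMADRL
# controller above might be driven. It assumes the module-level names the class
# relies on (env, SA_OBS_SPACE, SA_ACTION_SPACE, CA_OBS_SPACE, CA_ACTION_SPACE,
# CA_ACTION_BOUND, is_batch_norm, steps, episodes, CHECKPOINT, ENV_NAME, ca_reset,
# OUNoise, DDPG, DQNSolver, ScoreLogger) are defined elsewhere in the package.
if __name__ == '__main__':
    controller = FCMADRL()

    # Optionally reuse a previously trained per-agent DQN instead of starting fresh;
    # `pretrained_model` is a hypothetical Keras-style model handle.
    # controller.use_existing_dqn(pretrained_model)

    # Run the joint central-agent (DDPG) / per-agent (DQN) training loop.
    total_reward = controller.fcmadrl()
    print "Cumulative reward over all episodes:", total_reward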
    policy=policy,
    es=es,
    qf=qf,
    batch_size=64,
    max_path_length=env.horizon,
    epoch_length=1000,
    min_pool_size=10000,
    n_epochs=args.num_epochs,
    discount=0.99,
    scale_reward=args.reward_scale,
    qf_learning_rate=1e-3,
    policy_learning_rate=1e-4,
    plot=False)

run_experiment_lite(
    algo.train(),
    log_dir=None if args.use_ec2 else args.data_dir,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    exp_prefix="DDPG_" + args.env,
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
    mode="ec2" if args.use_ec2 else "local",
    plot=False,
    # dry=True,
    terminate_machine=args.dont_terminate_machine,
    added_project_directories=[
        osp.abspath(osp.join(osp.dirname(__file__), '.'))
    ]
)
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor,
          critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr,
          action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps,
          batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    saver = tf.train.Saver()

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if render:
                        env.render()
                    assert max_action.shape == action.shape
                    # Scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]).
                    new_obs, r, done, info = env.step(max_action * action)
                    t += 1
                    if render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        # Scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]).
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            # XXX shouldn't call np.mean on variable-length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
    qf=qf,
    batch_size=64,
    max_path_length=env.horizon,
    epoch_length=1000,
    min_pool_size=10000,
    n_epochs=args.num_epochs,
    discount=0.99,
    scale_reward=args.reward_scale,
    qf_learning_rate=1e-3,
    policy_learning_rate=1e-4,
    plot=False
)

run_experiment_lite(
    algo.train(),
    log_dir=None if args.use_ec2 else args.data_dir,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    exp_prefix="DDPG_" + args.env,
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
    mode="ec2" if args.use_ec2 else "local",
    plot=False,
    # dry=True,
    terminate_machine=args.dont_terminate_machine,
    added_project_directories=[osp.abspath(osp.join(osp.dirname(__file__), '.'))]
)
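# Hypothetical setup sketch (not part of the original source): both launcher
# fragments above begin mid-call, so this shows one way the objects they reference
# (args, env, policy, es, qf, and the algo whose keyword arguments are listed)
# could be constructed from rllab's standard DDPG components. Import paths,
# default values, and the argument parser are assumptions based on rllab's own
# DDPG example, not recovered from the truncated source.
import argparse
import os.path as osp  # referenced as `osp` in the fragments above

from rllab.algos.ddpg import DDPG
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.misc.instrument import run_experiment_lite  # called in the fragments above
from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction

parser = argparse.ArgumentParser()
parser.add_argument('--env', type=str, default='Pendulum-v0')
parser.add_argument('--num_epochs', type=int, default=200)
parser.add_argument('--reward_scale', type=float, default=1.0)
parser.add_argument('--data_dir', type=str, default='data')
parser.add_argument('--use_ec2', action='store_true')
parser.add_argument('--dont_terminate_machine', action='store_true')
args = parser.parse_args()

# Environment, deterministic actor, exploration strategy, and Q-function.
env = normalize(GymEnv(args.env))
policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
es = OUStrategy(env_spec=env.spec)
qf = ContinuousMLPQFunction(env_spec=env.spec)

algo = DDPG(
    env=env,
    policy=policy,
    es=es,
    qf=qf,
    # ... remaining hyperparameters as in the fragments above ...
)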