Example #1
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
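The train() loop above expects the caller to build the environment, the actor and critic networks, the replay memory, and the exploration noise before handing them in, and relies on the original module's imports (MPI, tf, logger, U, and so on) already being in place. The driver below is a minimal sketch of that wiring, assuming the older baselines.ddpg module layout (Actor, Critic, Memory, OrnsteinUhlenbeckActionNoise); the module paths, constructor arguments, and hyperparameter values are assumptions for illustration, not part of this example.

import gym
import numpy as np
from baselines.ddpg.models import Actor, Critic
from baselines.ddpg.memory import Memory
from baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise

# Hypothetical wiring for train(); names and defaults follow the old
# baselines.ddpg layout and may differ in other versions.
env = gym.make('Pendulum-v0')
nb_actions = env.action_space.shape[-1]

memory = Memory(limit=int(1e6),
                action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
actor = Actor(nb_actions, layer_norm=True)
critic = Critic(layer_norm=True)
action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                            sigma=0.2 * np.ones(nb_actions))

train(env=env, nb_epochs=500, nb_epoch_cycles=20, render_eval=False,
      reward_scale=1.0, render=False, param_noise=None, actor=actor,
      critic=critic, normalize_returns=False, normalize_observations=True,
      critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3,
      action_noise=action_noise, popart=False, gamma=0.99, clip_norm=None,
      nb_train_steps=50, nb_rollout_steps=100, nb_eval_steps=100,
      batch_size=64, memory=memory)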
Example #2
class FCMADRL:
    def __init__(self):
        self.observation_space = SA_OBS_SPACE
        self.action_space = SA_ACTION_SPACE
        # self.agent = agent
        self.agent = DDPG(env, is_batch_norm, CA_OBS_SPACE, CA_ACTION_SPACE,
                          CA_ACTION_BOUND)
        self.dqn_solver = DQNSolver(SA_OBS_SPACE, SA_ACTION_SPACE)

        logging.basicConfig(filename="logs/log.log",
                            format='%(asctime)s %(message)s',
                            filemode='w+')
        self.logger = logging.getLogger()
        self.logger.setLevel(logging.DEBUG)

    def use_existing_dqn(self, dqn_model):
        self.dqn_solver.model = dqn_model

    def get_ddpg(self):
        return self.agent

    def get_dqn(self):
        return self.dqn_solver

    def get_dqn_model(self, dqn_solver):
        return dqn_solver.model

    """ 
    ca_step() is just for testing purposes 
    """

    def ca_step(self, action):
        return np.random.choice(
            SA_ACTION_SPACE,
            CA_OBS_SPACE), np.random.choice(10), np.random.choice(
                [True, False]), {}

    """ 
    sa_state(): To merge the two states received by the individual agents 
    (one from central agent and one from the environment) into one vector 
    """

    def sa_state(self, x, obs, i):
        one = x
        two = obs[i]
        three = np.array([i])
        f = np.append(one, two)
        f = np.append(f, three)
        return f

    def fcmadrl(self):
        # Randomly initialize critic,actor,target critic, target actor network and replay buffer
        exploration_noise = OUNoise(CA_ACTION_SPACE)
        counter = 0
        reward_per_episode = 0
        total_reward = 0
        num_states = CA_OBS_SPACE
        num_actions = CA_ACTION_SPACE

        self.logger.debug("Number of States:" + str(num_states))
        self.logger.debug("Number of Actions:" + str(num_actions))
        self.logger.debug("Number of Steps per episode:" + str(steps))
        # saving reward:
        reward_st = np.array([0])

        score_logger = ScoreLogger(ENV_NAME)

        # run = 0
        for i in range(episodes):
            print("==== Starting episode no:", i, "====", "\n")
            # observation = env.reset()
            observation = ca_reset()
            reward_per_episode = 0

            # run += 1
            obs = env.reset()
            # step = 0

            for t in range(steps):
                # rendering environment (optional)
                #env.render()
                print "Step: ", t

                x_arr = []
                observation_arr = []
                action_arr = []
                action_n = []
                state_arr = []
                next_state_arr = []
                action_n_arr = []

                for z in range(env.n):
                    self.take_action(action_arr, action_n, action_n_arr,
                                     exploration_noise, num_states, obs,
                                     observation, observation_arr, state_arr,
                                     x_arr, z)

                next_obs, reward_n, done_n, info_n = env.step(action_n)

                reward = reward_n[0]
                done = all(done_n)
                print "Reward_n: ", reward_n

                self.update_next_state(action_arr, next_obs, next_state_arr)
                self.memory_store(action_arr, action_n_arr, done_n,
                                  next_state_arr, observation_arr, reward_n,
                                  state_arr, x_arr)
                obs = next_obs
                # train critic and actor network
                if counter > 64:
                    self.agent.train()
                reward_per_episode += reward
                counter += 1
                # check if episode ends:
                if done or (t == steps - 1):
                    print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode)
                    print("Printing reward to file")
                    exploration_noise.reset(
                    )  # reinitializing random noise for action exploration
                    reward_st = np.append(reward_st, reward_per_episode)
                    np.savetxt('rewards/episode_reward.txt',
                               reward_st,
                               newline="\n")

                    print "Run: " + str(i) + ", exploration: " + str(
                        self.dqn_solver.exploration_rate) + ", score: " + str(
                            reward_per_episode / t)
                    score_logger.add_score(reward_per_episode / t, i)
                    print '\n\n'

                    break
                self.dqn_solver.experience_replay()
            if (i % CHECKPOINT == 0):
                self.dqn_solver.save_dqn_model(i)

            total_reward += reward_per_episode
        print("Average reward per episode {}".format(total_reward / episodes))
        return total_reward

    def update_next_state(self, action_arr, next_obs, next_state_arr):
        for z in range(env.n):
            ns = self.sa_state(action_arr[z], next_obs, z)
            ns = np.reshape(ns, [1, self.observation_space])
            next_state_arr.append(ns)

    def take_action(self, action_arr, action_n, action_n_arr,
                    exploration_noise, num_states, obs, observation,
                    observation_arr, state_arr, x_arr, z):
        action = self.get_message(action_arr, exploration_noise, num_states,
                                  observation, x_arr)
        state = self.sa_state(action, obs, z)
        state = np.reshape(state, [1, self.observation_space])
        state_arr.append(state)
        act = self.get_final_action(action_n, action_n_arr, state)
        self.logger.debug("SA_Action: " + str(act))
        # print "CA State: ", x
        # print "CA Action: ", action
        # print "SA State: ", state
        # print "SA Action: ", act
        observation[z] = act
        observation_arr.append(np.array(list(observation)))

    def memory_store(self, action_arr, action_n_arr, done_n, next_state_arr,
                     observation_arr, reward_n, state_arr, x_arr):
        for z in range(env.n):
            # add s_t,s_t+1,action,reward to experience memory
            # print x_arr[z], observation_arr[z], action_arr[z], reward_n[z], done_n[z]
            self.agent.add_experience(x_arr[z], observation_arr[z],
                                      action_arr[z], reward_n[z], done_n[z])
            self.dqn_solver.remember(state_arr[z], action_n_arr[z],
                                     reward_n[z], next_state_arr[z], done_n[z])

    def get_final_action(self, action_n, action_n_arr, state):
        act = self.dqn_solver.act(state)
        a = np.zeros(SA_ACTION_SPACE)
        a[act] = 1.0
        action_n.append(a)
        action_n_arr.append(act)
        return act

    def get_message(self, action_arr, exploration_noise, num_states,
                    observation, x_arr):
        x = observation
        # x_arr.append(x)
        x_arr.append(np.array(list(x)))
        action = self.agent.evaluate_actor(np.reshape(x, [1, num_states]))
        noise = exploration_noise.noise()
        action = action[
            0] + noise  # Select action according to current policy and exploration noise
        action_arr.append(action)
        self.logger.debug("Action at Step: " + str(action))
        # print "Action at step", t ," :",action,"\n"
        return action
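FCMADRL reads most of its configuration from module-level globals (env, steps, episodes, CHECKPOINT, ENV_NAME, is_batch_norm, and the SA_*/CA_* space constants), so those must be defined before the class is instantiated. A minimal, hypothetical entry point under that assumption:

# Hypothetical entry point; assumes the module-level globals that FCMADRL
# reads (env, steps, episodes, CHECKPOINT, ENV_NAME, SA_*/CA_* constants)
# are already defined in this module.
if __name__ == "__main__":
    madrl = FCMADRL()               # builds the central DDPG agent and the per-agent DQN solver
    total_reward = madrl.fcmadrl()  # runs the training loop over all episodes
    print("Final total reward:", total_reward)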
Example #3
algo = DDPG(env=env,
            policy=policy,
            es=es,
            qf=qf,
            batch_size=64,
            max_path_length=env.horizon,
            epoch_length=1000,
            min_pool_size=10000,
            n_epochs=args.num_epochs,
            discount=0.99,
            scale_reward=args.reward_scale,
            qf_learning_rate=1e-3,
            policy_learning_rate=1e-4,
            plot=False)

run_experiment_lite(
    algo.train(),
    log_dir=None if args.use_ec2 else args.data_dir,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    exp_prefix="DDPG_" + args.env,
    seed=1,
    mode="ec2" if args.use_ec2 else "local",
    plot=False,
    # dry=True,
    terminate_machine=args.dont_terminate_machine,
    added_project_directories=[
        osp.abspath(osp.join(osp.dirname(__file__), '.'))
    ])
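The call above leaves env, policy, es, and qf to be constructed earlier in the script. In rllab these are commonly a deterministic MLP policy, an Ornstein-Uhlenbeck exploration strategy, and a continuous MLP Q-function; the sketch below shows one plausible construction, where the module paths and hidden sizes are assumptions that may differ between rllab versions.

# Plausible setup for the DDPG launcher above; class/module names follow
# common rllab usage and are assumptions, not taken from this example.
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction

env = normalize(GymEnv(args.env))
policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
es = OUStrategy(env_spec=env.spec)
qf = ContinuousMLPQFunction(env_spec=env.spec)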
Example #4
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
          normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
          popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
          tau=0.01, eval_env=None, param_noise_adaption_interval=50):

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    saver = tf.train.Saver()

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)