Example #1
    def setup(self,
              obs_shape,
              nb_actions,
              action_spec,
              noise_type,
              gamma=1.,
              tau=0.01,
              layer_norm=True):
        super(DDPGAgent, self).setup(obs_shape, nb_actions, action_spec,
                                     noise_type, gamma, tau, layer_norm)

        self.action_spec_internal = action_spec
        self.obs_dim = obs_shape
        action_noise = None
        param_noise = None

        # Parse noise_type
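        # noise_type is a comma-separated string of entries such as
        # 'none', 'adaptive-param_0.2', 'normal_0.1' or 'ou_0.2'.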
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

        # Configure components.
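        # A small replay buffer (500 transitions) and 128-unit actor/critic networks.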
        self.memory = Memory(limit=int(500),
                             action_shape=(nb_actions, ),
                             observation_shape=obs_shape)
        self.critic = Critic(layer_norm=layer_norm, hidden_size=128)
        self.actor = Actor(nb_actions, layer_norm=layer_norm, hidden_size=128)

        tf.reset_default_graph()

        # max_action = env.action_space.high
        self.ddpg = DDPG(actor=self.actor,
                         critic=self.critic,
                         memory=self.memory,
                         observation_shape=obs_shape,
                         action_shape=(nb_actions, ),
                         gamma=gamma,
                         tau=tau,
                         action_noise=action_noise,
                         param_noise=param_noise)
Example #2
def test(env, actor, critic, memory, normalize_observations, gamma,
         reward_scale, nb_episodes, episode_length, checkpoint_dir):

    # Initialize DDPG agent (target network and replay buffer)
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 normalize_observations=normalize_observations,
                 reward_scale=reward_scale)

    # The actor's output layer is a tanh, so actions lie in [-1, 1];
    # multiply by max_action to rescale them to the environment's range.
    max_action = env.action_space.high

    # Start testing loop
    with U.single_threaded_session() as sess:
        agent.initialize(sess)

        # setup saver
        saver = tf.train.Saver(max_to_keep=10, keep_checkpoint_every_n_hours=2)

        # Restore all variables from the latest checkpoint in checkpoint_dir.
        print("restoring variables")
        saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir))
        step_times = []
        for eval_episode in range(nb_episodes):
            print("Evaluating episode {}...".format(eval_episode))
            obs = env.reset()
            for t in range(episode_length):

                # Select action a_t without noise
                a_t, _ = agent.pi(obs,
                                  apply_param_noise=False,
                                  apply_action_noise=False,
                                  compute_Q=False)
                assert a_t.shape == env.action_space.shape
                assert (a_t >= 0).all()

                # Execute action a_t and observe reward r_t and next state s_{t+1}
                start_step_time = time.time()
                obs, r_t, eval_done, info = env.step(max_action * a_t)
                end_step_time = time.time()
                step_time = end_step_time - start_step_time
                step_times.append(step_time)

                if eval_done:
                    print("  Episode done!")
                    obs = env.reset()
                    break
        print("Average step time: ", np.mean(step_times))
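
A minimal sketch of how this test() helper might be invoked. The Actor, Critic and Memory constructions below assume the stock OpenAI baselines classes (the script above may use a fork with a slightly different pi() signature), and every hyper-parameter and path is a placeholder.

import gym
from baselines.ddpg.models import Actor, Critic
from baselines.ddpg.memory import Memory

env = gym.make("Pendulum-v0")
nb_actions = env.action_space.shape[-1]
memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
critic = Critic(layer_norm=True)   # assumed stock baselines constructors
actor = Actor(nb_actions, layer_norm=True)

test(env, actor, critic, memory,
     normalize_observations=True, gamma=0.99, reward_scale=1.0,
     nb_episodes=5, episode_length=200, checkpoint_dir="./checkpoints")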
Example #3
def train(env, name, callback):
    model = deepq.models.mlp([100, 20, 20])

    act = DDPG.train(env,
                     q_func=model,
                     lr=1e-3,
                     max_timesteps=100000,
                     buffer_size=50000,
                     exploration_fraction=0.1,
                     exploration_final_eps=0.02,
                     print_freq=10,
                     callback=callback)
    print("Saving model to " + name + ".pkl")
    act.save(name + ".pkl")
Example #4
def main():
    args = parse_args()
    logger.configure()
    gamma = 0.99
    tau = 0.01
    normalize_returns = False
    normalize_observations = True
    batch_size = 64
    action_noise = None
    stddev = 0.2
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                         desired_action_stddev=float(stddev))
    critic_l2_reg = 1e-2
    actor_lr = 1e-4
    critic_lr = 1e-3
    popart = False
    clip_norm = None
    reward_scale = 1.

    env = prosthetics_env.Wrapper(osim_env.ProstheticsEnv(visualize=False),
                                  frameskip=4,
                                  reward_shaping=True,
                                  reward_shaping_x=1,
                                  feature_embellishment=True,
                                  relative_x_pos=True,
                                  relative_z_pos=True)

    top_model_dir = 'top-models/'

    # create tf sessions and graphs
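    # One graph/session pair per model file so each restored agent keeps its own variables.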
    sess_list = []
    graph_list = []
    for i in range(len(args.model_files)):
        graph_list.append(tf.Graph())
        sess_list.append(tf.Session(graph=graph_list[i]))
    ddpg_agents = []
    for i in range(len(args.model_files)):
        model_name = args.model_files[i]
        sess = sess_list[i]
        graph = graph_list[i]
        l_size = args.layer_sizes[i]
        with sess.as_default():
        #with U.make_session(num_cpu=1, graph=g) as sess:
            with graph.as_default():
                #tf.global_variables_initializer()

                # restore agents from model files and store in ddpg_agents
                print("Restoring from..." + model_name)

                # Configure components.
                memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                                observation_shape=env.observation_space.shape)
                critic = Critic(layer_norm=True, activation='relu', layer_sizes=[l_size, l_size])
                actor = Actor(env.action_space.shape[-1], layer_norm=True,
                              activation='relu', layer_sizes=[l_size, l_size])
                agent = DDPG(actor, critic, memory, env.observation_space.shape,
                             env.action_space.shape, gamma=gamma, tau=tau,
                             normalize_returns=normalize_returns,
                             normalize_observations=normalize_observations,
                             batch_size=batch_size, action_noise=action_noise,
                             param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                             actor_lr=actor_lr, critic_lr=critic_lr,
                             enable_popart=popart, clip_norm=clip_norm,
                             reward_scale=reward_scale)

                # restore adam state and param noise
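                # The .pkl below stores the m/v/t state (first/second moment estimates and
                # step counter) of the agent's Adam optimizers, MpiAdam in stock baselines.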
                restore_model_path = top_model_dir + model_name
                saver = tf.train.Saver(max_to_keep=500)

                # restore network weights
                saver.restore(sess, restore_model_path)

                adam_optimizer_store = pickle.load(open(restore_model_path
                                                        + ".pkl", "rb"))
                agent.actor_optimizer.m = adam_optimizer_store['actor_optimizer']['m']
                agent.actor_optimizer.v = adam_optimizer_store['actor_optimizer']['v']
                agent.actor_optimizer.t = adam_optimizer_store['actor_optimizer']['t']
                agent.critic_optimizer.m = adam_optimizer_store['critic_optimizer']['m']
                agent.critic_optimizer.v = adam_optimizer_store['critic_optimizer']['v']
                agent.critic_optimizer.t = adam_optimizer_store['critic_optimizer']['t']
                if 'param_noise' in adam_optimizer_store:
                    agent.param_noise = adam_optimizer_store['param_noise']

                # initialize and prepare the agent session.
                agent.initialize(sess)
                #sess.graph.finalize()
                agent.reset()

                ddpg_agents.append(agent)

    agent = BlendedAgent(ddpg_agents, sess_list, graph_list)

    if args.evaluation:
        # setup eval env
        eval_env = prosthetics_env.EvaluationWrapper(osim_env.ProstheticsEnv(visualize=False),
                                                     frameskip=4,
                                                     reward_shaping=True,
                                                     reward_shaping_x=1,
                                                     feature_embellishment=True,
                                                     relative_x_pos=True,
                                                     relative_z_pos=True)
        eval_env.change_model(model='3D', prosthetic=True, difficulty=0, seed=0)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))

        nb_eval_steps = 1000
        # reward, mean_q, final_steps = evaluate_one_episode(eval_env, ddpg_agents, sess_list, graph_list,
        #                                                    nb_eval_steps=nb_eval_steps,
        #                                                    render=False)
        reward, mean_q, final_steps = evaluate_one_episode(eval_env, agent, nb_eval_steps, render=False)
        print("Reward: " + str(reward))
        print("Mean Q: " + str(mean_q))
        print("Final num steps: " + str(final_steps))

    # Submit to crowdai competition. What a hack. :)
    # if crowdai_client is not None and crowdai_token is not None and eval_env is not None:
    crowdai_submit_count = 0
    if args.crowdai_submit:
        remote_base = "http://grader.crowdai.org:1729"
        crowdai_client = Client(remote_base)
        eval_obs_dict = crowdai_client.env_create(args.crowdai_token, env_id="ProstheticsEnv")
        eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
            eval_obs_dict,
            reward_shaping=True,
            reward_shaping_x=1.,
            feature_embellishment=True,
            relative_x_pos=True,
            relative_z_pos=True)
        while True:
            action, _ = agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False)
            submit_action = prosthetics_env.openai_to_crowdai_submit_action(action)
            clipped_submit_action = np.clip(submit_action, 0., 1.)
            actions_equal = clipped_submit_action == submit_action
            if not np.all(actions_equal):
                logger.debug("crowdai_submit_count:", crowdai_submit_count)
                logger.debug("  openai-action:", action)
                logger.debug("  submit-action:", submit_action)
            crowdai_submit_count += 1
            [eval_obs_dict, reward, done, info] = crowdai_client.env_step(clipped_submit_action.tolist(), True)
            # [eval_obs_dict, reward, done, info] = crowdai_client.env_step(agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False), True)
            eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                eval_obs_dict,
                reward_shaping=True,
                reward_shaping_x=1.,
                feature_embellishment=True,
                relative_x_pos=True,
                relative_z_pos=True)
            if done:
                logger.debug("done: crowdai_submit_count:", crowdai_submit_count)
                eval_obs_dict = crowdai_client.env_reset()
                if not eval_obs_dict:
                    break
                logger.debug("done: eval_obs_dict exists after reset")
                eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                    eval_obs_dict,
                    reward_shaping=True,
                    reward_shaping_x=1.,
                    feature_embellishment=True,
                    relative_x_pos=True,
                    relative_z_pos=True)
        crowdai_client.submit()

    for i in range(len(sess_list)):
        sess_list[i].close()
Example #5
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          callback=None,
          pretrained='none'):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Copy an env for evaluation
    env_eval = copy.deepcopy(env.env)

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
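    # Sliding windows over the last 100 episode returns, used for the */return_history stats.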
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        # load pretrained agent if possible
        if pretrained == 'none':
            logger.info('Training from scratch...')
        else:
            logger.info('Loading pretrained model from {}'.format(pretrained))
            #assert os.path.exists(pretrained)
            saver.restore(sess, pretrained)

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        total_time = 0
        start_time = time.time()

        total_time_record = []
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        #epochxposdict = []
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1
                    total_time += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        total_time_record.append(total_time)
                        #epochxposdict.append(info['pos'][0])
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
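                    # adapt_param_noise() perturbs a copy of the actor and rescales the
                    # noise stddev so the induced action change tracks desired_action_stddev.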
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    # eval for one episode
                    eval_episode_reward = 0.0
                    eval_done = False
                    eval_obs = eval_env.reset()
                    while not eval_done:
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action)
                        eval_episode_reward += eval_r
                        eval_qs.append(eval_q)
                    eval_episode_rewards.append(eval_episode_reward)
                    eval_episode_rewards_history.append(eval_episode_reward)
                """
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.
                """

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = mpi_mean(
                np.mean(episode_rewards_history))
            combined_stats['rollout/episode_steps'] = mpi_mean(
                epoch_episode_steps)
            combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)

            # Train statistics.
            combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = mpi_mean(
                epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = mpi_mean(
                    np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q'] = mpi_mean(eval_qs)
                combined_stats['eval/episodes'] = mpi_mean(
                    len(eval_episode_rewards))

            # Total statistics.
            combined_stats['total/duration'] = mpi_mean(duration)
            combined_stats['total/steps_per_second'] = mpi_mean(
                float(t) / float(duration))
            combined_stats['total/episodes'] = mpi_mean(episodes)
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

            # Call the callback
            if callback is not None:
                if callback(locals(),
                            globals()):  # callback returns a boolean value
                    break
        # Evaluate the policy on env to record trajs
        eval_rewards, eval_steps, trajs_obs, trajs_actions = evaluate(
            env_eval, agent=agent)
        if callback is not None:
            callback.final_call(locals(), globals())
Example #6
memory = Memory(limit=int(1e6),  # limit assumed; the opening of this call was truncated
                action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
critic = Critic(layer_norm=layer_norm)
actor = Actor(nb_actions, layer_norm=layer_norm)

tf.reset_default_graph()

agent = DDPG(actor,
             critic,
             memory,
             env.observation_space.shape,
             env.action_space.shape,
             gamma=gamma,
             tau=tau,
             normalize_returns=normalize_returns,
             normalize_observations=normalize_observations,
             batch_size=batch_size,
             action_noise=action_noise,
             param_noise=param_noise,
             critic_l2_reg=critic_l2_reg,
             actor_lr=actor_lr,
             critic_lr=critic_lr,
             enable_popart=popart,
             clip_norm=clip_norm,
             reward_scale=reward_scale)

max_iteration = 1
step_number = []
success = []
reason = {1: 0, 2: 0, 3: 0}

with U.single_threaded_session() as sess:
Example #7
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          **kwargs):

    # print("kwargs:",kwargs)

    rank = MPI.COMM_WORLD.Get_rank()
    print("rank:", rank)
    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)

    with U.single_threaded_session() as sess:
        # Prepare everything.
        # --------------- AMEND: For saving and restoring the model. added by xlv ------------------
        if kwargs.get('restore') and kwargs.get('restore_path') is not None:
            logger.info("Restoring from saved model")
            restore_path = kwargs['restore_path']
            saver = tf.train.import_meta_graph(restore_path +
                                               "trained_model.meta")
            saver.restore(sess, tf.train.latest_checkpoint(restore_path))
        else:
            logger.info("Starting from scratch!")
            sess.run(tf.global_variables_initializer())
        # ----------------------------------------------------------------------------------------
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = eval_obs = env.reset()

        # if eval_env is not None:
        #     eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []

        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0

        # every 30 epochs plot statistics and save it.
        nb_epochs_unit = 30
        ddpg_rewards = []
        eval_ddpg_rewards = []

        ddpg_suc_percents = []
        eval_suc_percents = []

        # ---- AMEND: added by xlv to calculate success percent -----
        suc_num = 0
        episode_num = 0
        # -----------------------------------------------------------
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    # new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    new_obs, r, done, suc, info = env.step(max_action * action)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        # --- AMEND: added by xlv to calculate success percent ---
                        episode_num += 1
                        if suc:
                            suc_num += 1
                        # -------------------------------------------------------
                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                # eval_episode_rewards = []
                # eval_qs = []
                # if eval_env is not None:
                #     eval_episode_reward = 0.
                #     for t_rollout in range(nb_eval_steps):
                #         eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                #         eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                #         if render_eval:
                #             eval_env.render()
                #         eval_episode_reward += eval_r
                #
                #         eval_qs.append(eval_q)
                #         if eval_done:
                #             eval_obs = eval_env.reset()
                #             eval_episode_rewards.append(eval_episode_reward)
                #             eval_episode_rewards_history.append(eval_episode_reward)
                #             eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)

            # Evaluation statistics.
            # if eval_env is not None:
            #     combined_stats['eval/return'] = eval_episode_rewards
            #     combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            #     combined_stats['eval/Q'] = eval_qs
            #     combined_stats['eval/episodes'] = len(eval_episode_rewards)
            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }
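            # allreduce sums each scalar across workers; dividing by mpi_size averages over MPI ranks.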

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

            # ------------------------------ plot statistics every nb_epochs_unit -----------------------------------
            ddpg_rewards.append(np.mean(episode_rewards_history))
            if (epoch + 1) % nb_epochs_unit == 0:
                ddpg_suc_percents.append(suc_num / episode_num)
                # ---------- Evaluate for 5 iters -----------------------
                nb_eval_epochs = 5
                nb_eval_epoch_cycles = 5
                eval_episode_num = 0
                eval_suc_num = 0

                eval_episode_reward = 0
                eval_episode_step = 0

                eval_epoch_episode_rewards = []
                eval_epoch_episode_steps = []
                for i_epoch in range(nb_eval_epochs):
                    logger.log(
                        "********** Start Evaluation. Iteration %i ************"
                        % i_epoch)
                    for i_cycle in range(nb_eval_epoch_cycles):
                        for t_rollout in range(nb_rollout_steps):
                            eval_action, eval_q = agent.pi(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                            assert eval_action.shape == env.action_space.shape
                            eval_obs, eval_r, eval_done, eval_suc, eval_info = env.step(
                                max_action * eval_action
                            )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])

                            eval_episode_reward += eval_r
                            eval_episode_step += 1
                            if eval_done:
                                eval_obs = env.reset()
                                eval_epoch_episode_rewards.append(
                                    eval_episode_reward)
                                eval_episode_rewards_history.append(
                                    eval_episode_reward)
                                eval_epoch_episode_steps.append(
                                    eval_episode_step)
                                eval_episode_reward = 0
                                eval_episode_step = 0

                                eval_episode_num += 1
                                if eval_suc:
                                    eval_suc_num += 1
                    logger.record_tabular(
                        "Eval_EpRewMean",
                        np.mean(eval_episode_rewards_history))
                    logger.record_tabular("Eval_EpNumUntilNow",
                                          eval_episode_num)
                    logger.record_tabular("Eval_EpNumSuc", eval_suc_num)
                    logger.record_tabular("Eval_EpSucPercent",
                                          eval_suc_num / eval_episode_num)
                    logger.dump_tabular()
                    eval_ddpg_rewards.append(
                        np.mean(eval_episode_rewards_history))
                eval_suc_percents.append(eval_suc_num / eval_episode_num)
                # ----------------------------------------------------------------------------------------------
                # --------------------- plotting and saving -------------------------
                if saver is not None:
                    logger.info("saving the trained model")
                    start_time_save = time.time()
                    if epoch + 1 == nb_epochs:
                        saver.save(sess,
                                   kwargs['MODEL_DIR'] + "/trained_model")
                    else:
                        saver.save(
                            sess, kwargs['MODEL_DIR'] + "/iter_" + str(
                                (epoch + 1) // nb_epochs_unit))

                plot_performance(range(len(ddpg_rewards)),
                                 ddpg_rewards,
                                 ylabel=r'avg reward per DDPG learning step',
                                 xlabel='ddpg iteration',
                                 figfile=os.path.join(kwargs['FIGURE_DIR'],
                                                      'ddpg_reward'),
                                 title='TRAIN')
                plot_performance(
                    range(len(ddpg_suc_percents)),
                    ddpg_suc_percents,
                    ylabel=
                    r'overall success percentage per algorithm step under DDPG',
                    xlabel='algorithm iteration',
                    figfile=os.path.join(kwargs['FIGURE_DIR'],
                                         'success_percent'),
                    title="TRAIN")

                plot_performance(range(len(eval_ddpg_rewards)),
                                 eval_ddpg_rewards,
                                 ylabel=r'avg reward per DDPG eval step',
                                 xlabel='ddpg iteration',
                                 figfile=os.path.join(kwargs['FIGURE_DIR'],
                                                      'eval_ddpg_reward'),
                                 title='EVAL')
                plot_performance(
                    range(len(eval_suc_percents)),
                    eval_suc_percents,
                    ylabel=
                    r'overall eval success percentage per algorithm step under DDPG',
                    xlabel='algorithm iteration',
                    figfile=os.path.join(kwargs['FIGURE_DIR'],
                                         'eval_success_percent'),
                    title="EVAL")

                # save data which is accumulated UNTIL iter i
                with open(
                        kwargs['RESULT_DIR'] + '/ddpg_reward_' + 'iter_' + str(
                            (epoch + 1) // nb_epochs_unit) + '.pickle',
                        'wb') as f2:
                    pickle.dump(ddpg_rewards, f2)
                with open(
                        kwargs['RESULT_DIR'] + '/success_percent_' + 'iter_' +
                        str((epoch + 1) // nb_epochs_unit) + '.pickle',
                        'wb') as fs:
                    pickle.dump(ddpg_suc_percents, fs)

                # save evaluation data accumulated until iter i
                with open(
                        kwargs['RESULT_DIR'] + '/eval_ddpg_reward_' + 'iter_' +
                        str((epoch + 1) // nb_epochs_unit) + '.pickle',
                        'wb') as f_er:
                    pickle.dump(eval_ddpg_rewards, f_er)
                with open(
                        kwargs['RESULT_DIR'] + '/eval_success_percent_' +
                        'iter_' + str(
                            (epoch + 1) // nb_epochs_unit) + '.pickle',
                        'wb') as f_es:
                    pickle.dump(eval_suc_percents, f_es)
Example #8
def main():
    with U.single_threaded_session() as sess:
        batch_size = 64
        current_noise_type = 'adaptive-param_0.2'
        _, stddev = current_noise_type.split('_')
        param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        param_noise_adaption_interval = 2
        env = gym.make("Pendulum-v0")

        nb_actions = env.action_space.shape[-1]
        layer_norm = True

        # Configure components.
        memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                        observation_shape=env.observation_space.shape)
        critic = Critic(layer_norm=layer_norm)
        actor = Actor(nb_actions, layer_norm=layer_norm)

        # Seed everything to make things reproducible.
        seed = int(1000000 * np.random.rand())
        logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
        tf.set_random_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        env.seed(seed)

        max_action = env.action_space.high
        logger.info('scaling actions by {} before executing in env'.format(max_action))
        agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                     batch_size=batch_size, param_noise=param_noise)
        logger.info('Using agent with the following configuration:')
        logger.info(str(agent.__dict__.items()))

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()
        agent.reset()
        obs = env.reset()
        for t in itertools.count():
            episode_rewards = []
            done = False
            while not done:
                env.render()

                # Take action and update exploration to the newest value
                action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                new_obs, rew, done, _ = env.step(max_action * action)

                # Book-keeping.
                agent.store_transition(obs, action, rew, new_obs, done)
                obs = new_obs

                episode_rewards.append(rew)
                if done:
                    agent.reset()
                    obs = env.reset()

            nb_train_steps = 100
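            # After each episode, run nb_train_steps gradient updates on replay minibatches.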
            epoch_adaptive_distances = []
            epoch_critic_losses = []
            epoch_actor_losses = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            if t % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", round(np.mean(episode_rewards), 1))
                logger.record_tabular('train/loss_actor', round(np.mean(epoch_actor_losses)))
                logger.record_tabular('train/loss_critic', round(np.mean(epoch_critic_losses)))
                logger.record_tabular('train/param_noise_distance', round(np.mean(epoch_adaptive_distances)))
                logger.dump_tabular()
Example #9
class DDPGAgent(BaseAgent):
    """A Deep Deterministic Policy Gradient implementation of an SC2 agent."""
    def __init__(self):
        super(DDPGAgent, self).__init__()
        return

    def setup(self,
              obs_shape,
              nb_actions,
              action_spec,
              noise_type,
              gamma=1.,
              tau=0.01,
              layer_norm=True):
        super(DDPGAgent, self).setup(obs_shape, nb_actions, action_spec,
                                     noise_type, gamma, tau, layer_norm)

        self.action_spec_internal = action_spec
        self.obs_dim = obs_shape
        action_noise = None
        param_noise = None

        # Parse noise_type
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

        # Configure components.
        self.memory = Memory(limit=int(500),
                             action_shape=(nb_actions, ),
                             observation_shape=obs_shape)
        self.critic = Critic(layer_norm=layer_norm, hidden_size=128)
        self.actor = Actor(nb_actions, layer_norm=layer_norm, hidden_size=128)

        tf.reset_default_graph()

        # max_action = env.action_space.high
        self.ddpg = DDPG(actor=self.actor,
                         critic=self.critic,
                         memory=self.memory,
                         observation_shape=obs_shape,
                         action_shape=(nb_actions, ),
                         gamma=gamma,
                         tau=tau,
                         action_noise=action_noise,
                         param_noise=param_noise)

    def step(self, obs):
        super(DDPGAgent, self).step(obs)
        acts, q = self.ddpg.pi(obs, apply_noise=True, compute_Q=True)
        # Map actions from [-1, 1] into [0, 1] with the sign flipped: a -> (1 - a) / 2
        actions_z = (2 - (acts + 1)) / 2
        return actions_z, q

    def reset(self):
        super(DDPGAgent, self).reset()
        self.ddpg.reset()

    def initialize(self, sess):
        super(DDPGAgent, self).initialize(sess)
        self.ddpg.initialize(sess)

    def store_transition(self, obs, action, r, new_obs, done):
        super(DDPGAgent, self).store_transition(obs, action, r, new_obs, done)
        self.ddpg.store_transition(obs, action, r, new_obs, done)

    def train(self):
        super(DDPGAgent, self).train()
        return self.ddpg.train()

    def adapt_param_noise(self):
        super(DDPGAgent, self).adapt_param_noise()
        return self.ddpg.adapt_param_noise()

    def backprop(self):
        super(DDPGAgent, self).backprop()
        self.ddpg.update_target_net()

    def get_memory_size(self):
        super(DDPGAgent, self).get_memory_size()
        return self.memory.nb_entries

    @property
    def action_spec(self):
        return self.action_spec_internal

    @property
    def obs_shape(self):
        return self.obs_dim
Example #10
    def run(self):
        """Override Process.run()"""
        # Create environment
        env = create_environment(
            action_repeat=self.action_repeat,
            full=self.full,
            exclude_centering_frame=self.exclude_centering_frame,
            visualize=self.visualize,
            fail_reward=self.fail_reward,
            integrator_accuracy=self.integrator_accuracy)
        nb_actions = env.action_space.shape[-1]

        # keep track of the number of completed trajectories
        num_traj = 0

        env.seed(os.getpid())
        set_global_seeds(os.getpid())

        # Create OU Noise
        action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                    sigma=0.2,
                                                    theta=0.1)

        # Allocate ReplayBuffer
        memory = Memory(limit=int(1e6),
                        action_shape=env.action_space.shape,
                        observation_shape=env.observation_space.shape)

        # Create DDPG agent
        agent = DDPG(self.actor,
                     self.critic,
                     memory,
                     env.observation_space.shape,
                     env.action_space.shape,
                     gamma=self.gamma,
                     tau=self.tau,
                     normalize_returns=self.normalize_returns,
                     normalize_observations=self.normalize_observations,
                     batch_size=self.batch_size,
                     action_noise=action_noise,
                     param_noise=self.param_noise,
                     critic_l2_reg=self.critic_l2_reg,
                     enable_popart=self.popart,
                     clip_norm=self.clip_norm,
                     reward_scale=self.reward_scale)

        # Build the sampling logic fn
        sampling_fn = make_sampling_fn(agent, env, self.episode_length,
                                       self.action_repeat, self.max_action,
                                       self.nb_episodes,
                                       self.action_noise_prob)

        # Start TF session
        with U.single_threaded_session() as sess:
            agent.initialize(sess)
            set_parameters = U.SetFromFlat(self.actor.trainable_vars)
            # Start sampling-worker loop.
            while True:
                # self.event.wait()  # Wait for a new message
                # self.event.clear()  # Upon message receipt, mark as read
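                # Messages are tuples: ('sample', actor_weights) requests rollouts with the
                # given actor parameters, ('exit', _) shuts the worker down.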
                message, actor_ws = self.inputQ.get()  # Pop message
                if message == 'sample':
                    # Set weights
                    set_parameters(actor_ws)
                    # Do sampling
                    transitions = sampling_fn()
                    self.outputQ.put((self.process_index, transitions))

                    # update number of trajectories
                    num_traj += self.nb_episodes

                    # restore environment if needed
                    if num_traj >= self.max_env_traj:
                        env.restore()
                        num_traj = 0

                elif message == 'exit':
                    print('[Worker {}] Exiting...'.format(os.getpid()))
                    env.close()
                    break
Example #11
import numpy as np
import tensorflow as tf
from mpi4py import MPI
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from osim.env import ProstheticsEnv
import sys
# Additional imports needed by the code below (assumed: osim-rl HTTP client and OpenAI baselines DDPG).
from osim.http.client import Client
from baselines.ddpg.ddpg import DDPG
from baselines.ddpg.models import Actor, Critic
import baselines.common.tf_util as U

# Settings
remote_base = "http://grader.crowdai.org:1729"
crowdai_token = "8592db9b224e4293d437776321861a32"

client = Client(remote_base)
critic = Critic(layer_norm=layer_norm)
actor = Actor(nb_actions, layer_norm=layer_norm)
memory = []  # NOTE: baselines' DDPG normally expects a Memory instance here, not a plain list
agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape)
# Create environment
observation = client.env_create(crowdai_token)


# IMPLEMENTATION OF YOUR CONTROLLER
# my_controller = ... (for example the one trained in keras_rl)
def my_controller():
    with U.make_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        saver = tf.train.Saver()  # build save/restore ops before the graph is finalized
        sess.graph.finalize()

        agent.reset()
        filename = "/home/vaisakhs_shaj/Desktop/MODEL/MODEL/tfSteps" + str(80000) + ".model"
        saver.restore(sess, filename)
Example #12
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
    normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
    popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
    tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)
            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s'%x)
            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
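The train() function above leaves the construction of the actor, critic and replay memory to the caller. A minimal sketch of such a caller, assuming the standard baselines DDPG modules (the import paths are an assumption), a Gym environment with symmetric continuous actions, and that train() lives in a module whose own imports (logger, U, MPI, and so on) are already in place; the hyperparameters loosely echo defaults that appear elsewhere in this collection:

import gym
from baselines.ddpg.models import Actor, Critic
from baselines.ddpg.memory import Memory

env = gym.make('Pendulum-v0')  # placeholder env with symmetric continuous actions
nb_actions = env.action_space.shape[-1]
memory = Memory(limit=int(1e6),
                action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
critic = Critic(layer_norm=True)
actor = Actor(nb_actions, layer_norm=True)

train(env, nb_epochs=100, nb_epoch_cycles=20, render_eval=False, reward_scale=1.,
      render=False, param_noise=None, actor=actor, critic=critic,
      normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2,
      actor_lr=1e-4, critic_lr=1e-3, action_noise=None, popart=False, gamma=0.99,
      clip_norm=None, nb_train_steps=50, nb_rollout_steps=100, nb_eval_steps=100,
      batch_size=64, memory=memory)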
Example #13
0
    def __init__(
            self,
            *,
            policy,
            ob_space,
            ac_space,
            nbatch_act,
            nbatch_train,
            nsteps,
            ent_coef,
            vf_coef,
            max_grad_norm,
            # ddpg related params
            layer_norm=False,
            tau=0.001,
            normalize_returns=False,
            normalize_observations=True,
            batch_size=128,
            critic_l2_reg=0.,
            actor_lr=1e-4,
            critic_lr=1e-3,
            popart=False,
            clip_norm=10.,
            reward_scale=1.):
        sess = tf.get_default_session()

        act_model = policy(sess,
                           ob_space,
                           ac_space,
                           nbatch_act,
                           1,
                           reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nbatch_train,
                             nsteps,
                             reuse=True)

        # init DDPG
        critic = Critic(layer_norm=layer_norm)
        actor = Actor(ac_space.shape[-1], layer_norm=layer_norm)
        memory = Memory(limit=int(1e6),
                        action_shape=ac_space.shape,
                        observation_shape=ob_space.shape)
        ddpg_agent = DDPG(actor,
                          critic,
                          memory,
                          ob_space.shape,
                          ac_space.shape,
                          gamma=0.99,
                          tau=tau,
                          normalize_returns=normalize_returns,
                          normalize_observations=normalize_observations,
                          batch_size=batch_size,
                          action_noise=None,
                          param_noise=None,
                          critic_l2_reg=critic_l2_reg,
                          actor_lr=actor_lr,
                          critic_lr=critic_lr,
                          enable_popart=popart,
                          clip_norm=clip_norm,
                          reward_scale=reward_scale)

        ddpg_agent.initialize(sess)
        ddpg_agent.reset()

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        CLIPRANGE = tf.placeholder(tf.float32, [])
        # `use_annealing` and `use_ddpg` are assumed to be module-level flags in the original code.
        if use_annealing:
            DDPG_AC = tf.placeholder(tf.float32, (None, ) + ac_space.shape)
            DDPG_W = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        if use_annealing:
            pi_mean = train_model.pi
            ac_loss = tf.reduce_mean(tf.square(pi_mean - DDPG_AC))
        # loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        # ----------------- DDPG -----------------
        if use_ddpg:
            loss = pg_loss - entropy * ent_coef
            if use_annealing:
                loss = pg_loss - entropy * ent_coef + ac_loss * DDPG_W
        else:
            loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        # ----------------- DDPG -----------------
        with tf.variable_scope('model'):
            params = tf.trainable_variables()
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        _train = trainer.apply_gradients(grads)

        def train(lr,
                  cliprange,
                  obs,
                  returns,
                  masks,
                  actions,
                  values,
                  neglogpacs,
                  states=None,
                  ddpg_acs=None,
                  ddpg_w=0.):
            advs = returns - values
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            if not use_annealing:
                td_map = {
                    train_model.X: obs,
                    A: actions,
                    ADV: advs,
                    R: returns,
                    LR: lr,
                    CLIPRANGE: cliprange,
                    OLDNEGLOGPAC: neglogpacs,
                    OLDVPRED: values
                }
            else:
                td_map = {
                    train_model.X: obs,
                    A: actions,
                    ADV: advs,
                    R: returns,
                    LR: lr,
                    CLIPRANGE: cliprange,
                    OLDNEGLOGPAC: neglogpacs,
                    OLDVPRED: values,
                    DDPG_AC: ddpg_acs,
                    DDPG_W: ddpg_w
                }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            if not use_annealing:
                return sess.run(
                    [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
                    td_map)[:-1]
            else:
                return sess.run([
                    pg_loss, vf_loss, entropy, approxkl, clipfrac, ac_loss,
                    _train
                ], td_map)[:-1]

        if not use_annealing:
            self.loss_names = [
                'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
                'clipfrac'
            ]
        else:
            self.loss_names = [
                'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
                'clipfrac', 'ac_loss'
            ]

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.agent = ddpg_agent
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)  #pylint: disable=E1101
Example #14
0
import tensorflow as tf
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from osim.env import ProstheticsEnv
import sys

# Assumed imports for the names used below (standard baselines DDPG modules);
# the original snippet omitted them.
from baselines.ddpg.ddpg import DDPG
from baselines.ddpg.models import Actor, Critic
from baselines.ddpg.memory import Memory

env = ProstheticsEnv(visualize=False)
env.change_model(model='3D', difficulty=2, prosthetic=True)
layer_norm = True
nb_actions = 19
memory = Memory(limit=int(1.5e6),
                action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
critic = Critic(layer_norm=layer_norm)
actor = Actor(nb_actions, layer_norm=layer_norm)
agent = DDPG(actor,
             critic,
             memory,
             env.observation_space.shape,
             env.action_space.shape,
             gamma=0.99)
saver = tf.train.Saver()
# IMPLEMENTATION OF YOUR CONTROLLER
# my_controller = ... (for example the one trained in keras_rl)

sess = tf.InteractiveSession()
agent.initialize(sess)
sess.graph.finalize()
agent.reset()
filename = "/home/vaisakhs_shaj/Desktop/MODEL/tfSteps" + str(10000) + ".model"
saver.restore(sess, filename)
observation = env.reset()
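The script above stops right after resetting the environment. A minimal greedy rollout one might append, mirroring the deterministic evaluation loops used elsewhere in this collection:

# Hypothetical continuation: run one deterministic episode with the restored policy.
total_reward = 0.0
done = False
while not done:
    action = agent.pi(observation, apply_noise=False, compute_Q=False)[0]
    observation, reward, done, info = env.step(action)
    total_reward += reward
print('episode reward:', total_reward)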

Example #15
0
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          saved_model_basename,
          restore_model_name,
          crowdai_client,
          crowdai_token,
          reward_shaping,
          feature_embellishment,
          relative_x_pos,
          relative_z_pos,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    saved_model_dir = 'saved-models/'
    if saved_model_basename is None:
        saved_model_basename = ''.join(
            random.choices(string.ascii_lowercase + string.digits, k=8))
    saved_model_path = saved_model_dir + saved_model_basename
    if restore_model_name:
        restore_model_path = restore_model_name
        if not pathlib.Path(restore_model_path + '.index').is_file():
            restore_model_path = saved_model_dir + restore_model_name
    max_to_keep = 500
    eval_reward_threshold_to_keep = 300
    saver = tf.train.Saver(max_to_keep=max_to_keep)
    adam_optimizer_store = dict()
    adam_optimizer_store['actor_optimizer'] = dict()
    adam_optimizer_store['critic_optimizer'] = dict()

    #eval_episode_rewards_history = deque(maxlen=100)
    #episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        try:
            if restore_model_name:
                logger.info("Restoring from model at", restore_model_path)
                #saver.restore(sess, tf.train.latest_checkpoint(model_path))
                saver.restore(sess, restore_model_path)
            else:
                logger.info("Creating new model")
                sess.run(tf.global_variables_initializer(
                ))  # this should happen here and not in the agent right?
        except InvalidArgumentError as exc:
            if "Assign requires shapes of both tensors to match." in str(exc):
                print("Unable to restore model from {:s}.".format(
                    restore_model_path))
                print(
                    "Chances are you're trying to restore a model with reward embellishment into an environment without reward embellishment (or vice versa). Unfortunately this isn't supported (yet)."
                )
                print(exc.message)
                sys.exit()
            else:
                raise exc

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()

        # restore adam optimizer
        try:
            if restore_model_name:
                logger.info("Restoring pkl file with adam state",
                            restore_model_path)
                #saver.restore(sess, tf.train.latest_checkpoint(model_path))
                adam_optimizer_store = pickle.load(
                    open(restore_model_path + ".pkl", "rb"))
                agent.actor_optimizer.m = adam_optimizer_store[
                    'actor_optimizer']['m']
                agent.actor_optimizer.v = adam_optimizer_store[
                    'actor_optimizer']['v']
                agent.actor_optimizer.t = adam_optimizer_store[
                    'actor_optimizer']['t']
                agent.critic_optimizer.m = adam_optimizer_store[
                    'critic_optimizer']['m']
                agent.critic_optimizer.v = adam_optimizer_store[
                    'critic_optimizer']['v']
                agent.critic_optimizer.t = adam_optimizer_store[
                    'critic_optimizer']['t']
                if 'param_noise' in adam_optimizer_store:
                    agent.param_noise = adam_optimizer_store['param_noise']
        except:
            print("Unable to restore adam state from {:s}.".format(
                restore_model_path))

        obs = env.reset()
        done = False
        episode_reward = 0.
        #episode_step = 0
        #episodes = 0
        #t = 0

        #epoch_episode_steps = []
        #epoch_episode_eval_rewards = []
        #epoch_episode_eval_steps = []
        #epoch_start_time = time.time()
        #epoch_actions = []
        #epoch_episodes = 0
        for epoch in range(nb_epochs):
            start_time = time.time()
            epoch_episode_rewards = []
            epoch_qs = []
            eval_episode_rewards = []
            eval_qs = []
            eval_steps = []
            epoch_actor_losses = []
            epoch_critic_losses = []
            worth_keeping = False
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    #new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    new_obs, r, done, info = env.step(action)
                    #t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    #episode_step += 1

                    # Book-keeping.
                    #epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        #episode_rewards_history.append(episode_reward)
                        #epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        #episode_step = 0
                        #epoch_episodes += 1
                        #episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                #epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        #epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Submit to crowdai competition. What a hack. :)
                #if crowdai_client is not None and crowdai_token is not None and eval_env is not None:
                crowdai_submit_count = 0
                if crowdai_client is not None and crowdai_token is not None:
                    eval_obs_dict = crowdai_client.env_create(
                        crowdai_token, env_id="ProstheticsEnv")
                    eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                        eval_obs_dict,
                        reward_shaping=reward_shaping,
                        reward_shaping_x=1.,
                        feature_embellishment=feature_embellishment,
                        relative_x_pos=relative_x_pos,
                        relative_z_pos=relative_z_pos)
                    while True:
                        action, _ = agent.pi(eval_obs_projection,
                                             apply_noise=False,
                                             compute_Q=False)
                        submit_action = prosthetics_env.openai_to_crowdai_submit_action(
                            action)
                        clipped_submit_action = np.clip(submit_action, 0., 1.)
                        actions_equal = clipped_submit_action == submit_action
                        if not np.all(actions_equal):
                            logger.debug("crowdai_submit_count:",
                                         crowdai_submit_count)
                            logger.debug("  openai-action:", action)
                            logger.debug("  submit-action:", submit_action)
                        crowdai_submit_count += 1
                        [eval_obs_dict, reward, done,
                         info] = crowdai_client.env_step(
                             clipped_submit_action.tolist(), True)
                        #[eval_obs_dict, reward, done, info] = crowdai_client.env_step(agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False), True)
                        eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                            eval_obs_dict,
                            reward_shaping=reward_shaping,
                            reward_shaping_x=1.,
                            feature_embellishment=feature_embellishment,
                            relative_x_pos=relative_x_pos,
                            relative_z_pos=relative_z_pos)
                        if done:
                            logger.debug("done: crowdai_submit_count:",
                                         crowdai_submit_count)
                            eval_obs_dict = crowdai_client.env_reset()
                            if not eval_obs_dict:
                                break
                            logger.debug(
                                "done: eval_obs_dict exists after reset")
                            eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                                eval_obs_dict,
                                reward_shaping=reward_shaping,
                                reward_shaping_x=1.,
                                feature_embellishment=feature_embellishment,
                                relative_x_pos=relative_x_pos,
                                relative_z_pos=relative_z_pos)
                    crowdai_client.submit()
                    return  # kids, don't try any of these (expedient hacks) at home!

            if eval_env:
                eval_episode_reward_mean, eval_q_mean, eval_step_mean = evaluate_n_episodes(
                    3, eval_env, agent, nb_eval_steps, render_eval)
                if eval_episode_reward_mean >= eval_reward_threshold_to_keep:
                    worth_keeping = True

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            if nb_epochs and nb_epoch_cycles and nb_train_steps > 0:
                #stats = agent.get_stats()
                #combined_stats = stats.copy()
                combined_stats = {}
                combined_stats['train/epoch_episode_reward_mean'] = np.mean(
                    epoch_episode_rewards)
                #combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
                #combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
                #combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
                combined_stats['train/epoch_Q_mean'] = np.mean(epoch_qs)
                combined_stats['train/epoch_loss_actor'] = np.mean(
                    epoch_actor_losses)
                combined_stats['train/epoch_loss_critic'] = np.mean(
                    epoch_critic_losses)
                #combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
                combined_stats['train/epoch_duration'] = duration
                #combined_stats['epoch/steps_per_second'] = float(t) / float(duration)
                #combined_stats['total/episodes'] = episodes
                #combined_stats['rollout/episodes'] = epoch_episodes
                #combined_stats['rollout/actions_std'] = np.std(epoch_actions)
                #combined_stats['memory/rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
            else:
                combined_stats = {}
            # Evaluation statistics.
            if eval_env:
                combined_stats[
                    'eval/epoch_episode_reward_mean'] = eval_episode_reward_mean  # np.mean(eval_episode_rewards)
                #combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                #combined_stats['eval/epoch_episode_reward_std'] = np.std(eval_episode_rewards)
                combined_stats[
                    'eval/epoch_Q_mean'] = eval_q_mean  # np.mean(eval_qs)
                #combined_stats['eval/episodes'] = len(eval_episode_rewards)
                combined_stats[
                    'eval/steps_mean'] = eval_step_mean  # np.mean(eval_steps)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            #combined_stats['total/epochs'] = epoch + 1
            #combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.info('')
            logger.info('Epoch', epoch)
            logger.dump_tabular()
            logdir = logger.get_dir()

            if worth_keeping and rank == 0 and nb_epochs and nb_epoch_cycles and nb_train_steps and nb_rollout_steps:
                logger.info(
                    'Saving model to',
                    saved_model_dir + saved_model_basename + '-' + str(epoch))
                saver.save(sess,
                           saved_model_path,
                           global_step=epoch,
                           write_meta_graph=False)
                adam_optimizer_store['actor_optimizer'][
                    'm'] = agent.actor_optimizer.m
                adam_optimizer_store['actor_optimizer'][
                    'v'] = agent.actor_optimizer.v
                adam_optimizer_store['actor_optimizer'][
                    't'] = agent.actor_optimizer.t

                adam_optimizer_store['critic_optimizer'][
                    'm'] = agent.critic_optimizer.m
                adam_optimizer_store['critic_optimizer'][
                    'v'] = agent.critic_optimizer.v
                adam_optimizer_store['critic_optimizer'][
                    't'] = agent.critic_optimizer.t

                adam_optimizer_store['param_noise'] = agent.param_noise

                pickle.dump(
                    adam_optimizer_store,
                    open((saved_model_path + "-" + str(epoch) + ".pkl"), "wb"))
                old_epoch = epoch - max_to_keep
                if old_epoch >= 0:
                    try:
                        os.remove(saved_model_path + "-" + str(old_epoch) +
                                  ".pkl")
                    except OSError:
                        pass

            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
Example #16
0
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          teacher,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()
    t = datetime.now().strftime('%H-%M')
    PATH = 'results/ddpg'.format(t)

    #assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        agent.restore_model(PATH)
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)

                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            eval_action)
                        eval_env.background = get_q_background(
                            eval_env, agent.q, eval_action)

                        # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = mpi_mean(
                np.mean(episode_rewards_history))
            combined_stats['rollout/episode_steps'] = mpi_mean(
                epoch_episode_steps)
            combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)

            # Train statistics.
            combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = mpi_mean(
                epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = mpi_mean(
                    np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q'] = mpi_mean(eval_qs)
                combined_stats['eval/episodes'] = mpi_mean(
                    len(eval_episode_rewards))

            # Total statistics.
            combined_stats['total/duration'] = mpi_mean(duration)
            combined_stats['total/steps_per_second'] = mpi_mean(
                float(t) / float(duration))
            combined_stats['total/episodes'] = mpi_mean(episodes)
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t
            agent.save_model(PATH, epoch)
            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
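The logging in the train() function above goes through mpi_mean, mpi_std and mpi_sum helpers that are not shown in the snippet. A minimal sketch of what such helpers might look like (an assumption, not the original implementation), averaging or summing local values across MPI workers:

import numpy as np
from mpi4py import MPI

def mpi_mean(value):
    # Average a local value (a scalar, or the mean of a local list/array) across all workers.
    local = float(np.mean(value))
    total = MPI.COMM_WORLD.allreduce(local, op=MPI.SUM)
    return total / MPI.COMM_WORLD.Get_size()

def mpi_sum(value):
    # Sum a local value across all workers.
    local = float(np.sum(value))
    return MPI.COMM_WORLD.allreduce(local, op=MPI.SUM)

def mpi_std(value):
    # Rough standard deviation, combining MPI-averaged first and second moments.
    mean = mpi_mean(value)
    sq_mean = mpi_mean(np.square(value))
    return float(np.sqrt(max(sq_mean - mean ** 2, 0.0)))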
Example #17
0
def train_return(env,
                 param_noise,
                 actor,
                 critic,
                 memory,
                 nb_epochs=250,
                 nb_epoch_cycles=20,
                 reward_scale=1.,
                 render=False,
                 normalize_returns=False,
                 normalize_observations=True,
                 critic_l2_reg=1e-2,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 action_noise=None,
                 popart=False,
                 gamma=0.99,
                 clip_norm=None,
                 nb_train_steps=50,
                 nb_rollout_steps=2048,
                 batch_size=64,
                 tau=0.01,
                 param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)

    # Set up logging stuff only for a single worker.

    episode_rewards_history = deque(maxlen=100)
    # NOTE: the original author commented out the session context below, so this snippet
    # assumes a TensorFlow session named `sess` already exists in the enclosing scope.
    #with U.single_threaded_session() as sess:
    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()

    agent.reset()
    obs = env.reset()
    episode_reward = 0.
    episode_step = 0
    episodes = 0
    t = 0

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        print('epoch number:', epoch)
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                assert action.shape == env.action_space.shape

                # Execute next action.
                if rank == 0 and render:
                    env.render()
                assert max_action.shape == action.shape
                new_obs, r, done, info = env.step(
                    max_action * action
                )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                t += 1
                if rank == 0 and render:
                    env.render()
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs

                if done:
                    # Episode done.
                    epoch_episode_rewards.append(episode_reward)
                    episode_rewards_history.append(episode_reward)
                    epoch_episode_steps.append(episode_step)
                    episode_reward = 0.
                    episode_step = 0
                    epoch_episodes += 1
                    episodes += 1

                    agent.reset()
                    obs = env.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()
    return agent
Example #18
0
def train(env,
          nb_epochs,
          nb_episodes,
          nb_epoch_cycles,
          episode_length,
          nb_train_steps,
          eval_freq,
          save_freq,
          nb_eval_episodes,
          actor,
          critic,
          memory,
          gamma,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          action_noise,
          param_noise,
          popart,
          clip_norm,
          batch_size,
          reward_scale,
          action_repeat,
          full,
          exclude_centering_frame,
          visualize,
          fail_reward,
          num_processes,
          num_processes_to_wait,
          num_testing_processes,
          learning_session,
          min_buffer_length,
          integrator_accuracy=5e-5,
          max_env_traj=100,
          tau=0.01):
    """
    Parameters
    ----------
    nb_epochs : the number of epochs to train.

    nb_episodes : the number of episodes for each epoch.

    episode_length : the maximum number of steps for each episode.

    gamma : discount factor.

    tau : soft update coefficient.

    clip_norm : clip on the norm of the gradient.
    """

    assert action_repeat > 0
    assert nb_episodes >= num_processes

    # Get params from learning session
    checkpoint_dir = learning_session.checkpoint_dir
    log_dir = learning_session.log_dir
    training_step = learning_session.last_training_step

    # Initialize DDPG agent (target network and replay buffer)
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=None,
                 critic_l2_reg=critic_l2_reg,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale,
                 training_step=training_step)

    # We need max_action because the NN output layer is a tanh.
    # So we must scale it back.
    max_action = env.action_space.high

    # Build Workers
    events = [Event() for _ in range(num_processes)]
    inputQs = [Queue() for _ in range(num_processes)]
    outputQ = Queue()
    # Split work among workers
    nb_episodes_per_worker = nb_episodes // num_processes

    workers = [
        SamplingWorker(i, actor, critic, episode_length,
                       nb_episodes_per_worker, action_repeat, max_action,
                       gamma, tau, normalize_returns, batch_size,
                       normalize_observations, param_noise, critic_l2_reg,
                       popart, clip_norm, reward_scale, events[i], inputQs[i],
                       outputQ, full, exclude_centering_frame,
                       integrator_accuracy, max_env_traj, visualize,
                       fail_reward) for i in range(num_processes)
    ]

    # Run the Workers
    for w in workers:
        w.start()

    # Create Round Robin tester
    tester = RoundRobinTester(
        num_testing_processes, actor, critic, episode_length, nb_eval_episodes,
        action_repeat, max_action, gamma, tau, normalize_returns, batch_size,
        normalize_observations, critic_l2_reg, popart, clip_norm, reward_scale,
        full, exclude_centering_frame, integrator_accuracy, max_env_traj,
        visualize, fail_reward)

    # Start training loop
    with U.single_threaded_session() as sess:
        agent.initialize(sess)

        writer = tf.summary.FileWriter(log_dir)
        writer.add_graph(sess.graph)

        # Initialize writer and statistics
        stats = EvaluationStatistics(tf_session=sess, tf_writer=writer)

        # setup saver
        saver = tf.train.Saver(max_to_keep=10, keep_checkpoint_every_n_hours=2)

        get_parameters = U.GetFlat(actor.trainable_vars)

        global_step = 0
        obs = env.reset()
        agent.reset()

        # Processes waiting for a new sampling task
        waiting_indices = [i for i in range(num_processes)]
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # If we have sampling workers waiting, dispatch a sampling job
                if waiting_indices:
                    actor_ws = get_parameters()
                    # Run parallel sampling
                    for i in waiting_indices:
                        inputQs[i].put(('sample', actor_ws))
                        events[i].set()  # Notify worker: sample baby, sample!
                    waiting_indices.clear()

                # Collect results when ready
                for i in range(num_processes_to_wait):
                    process_index, transitions = outputQ.get()
                    waiting_indices.append(process_index)
                    print('Collecting transition samples from Worker {}/{}'.
                          format(i + 1, num_processes_to_wait))
                    for t in transitions:
                        agent.store_transition(*t)

                # try to collect other samples if available
                for i in range(num_processes):
                    try:
                        process_index, transitions = outputQ.get_nowait()
                        if process_index not in waiting_indices:
                            waiting_indices.append(process_index)
                        print('Collecting transition samples from Worker {}'.
                              format(process_index))
                        for t in transitions:
                            agent.store_transition(*t)
                    except queue.Empty:
                        # No sampling ready, keep on training.
                        pass

                # Training phase
                if agent.memory.nb_entries > min_buffer_length:
                    for _ in range(nb_train_steps):
                        critic_loss, actor_loss = agent.train()
                        agent.update_target_net()

                        # Plot statistics
                        stats.add_critic_loss(critic_loss, global_step)
                        stats.add_actor_loss(actor_loss, global_step)
                        global_step += 1

                    # Evaluation phase
                    if cycle % eval_freq == 0:
                        print("Cycle number: ",
                              cycle + epoch * nb_epoch_cycles)
                        print("Sending testing job...")
                        actor_ws = get_parameters()

                        # Send a testing job
                        tester.test(actor_ws, global_step)

                        # Print stats (if any)
                        tester.log_stats(stats, logger)

                    if cycle % save_freq == 0:
                        # Save weights
                        save_path = saver.save(sess,
                                               checkpoint_dir,
                                               global_step=global_step)
                        print("Model saved in path: %s" % save_path)
                        # Dump learning session
                        learning_session.dump(agent.training_step)
                        print("Learning session dumped to: %s" %
                              str(learning_session.session_path))
                else:
                    print("Not enough entry in memory buffer")

        # Stop workers
        for i in range(num_processes):
            inputQs[i].put(('exit', None))
            events[i].set()  # Notify worker: exit!
        tester.close()  # Stop testing workers
        env.close()
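The train() function above coordinates its SamplingWorker processes through per-worker Events and input Queues plus one shared output Queue, sending ('sample', weights) and ('exit', None) commands. A small self-contained sketch of that dispatch pattern (illustrative only; it does not reproduce the original SamplingWorker):

import multiprocessing as mp

def worker_loop(index, event, input_q, output_q):
    # Each worker blocks on its Event, reads a command from its own queue,
    # and posts results on the shared output queue.
    while True:
        event.wait()
        event.clear()
        command, payload = input_q.get()
        if command == 'exit':
            break
        # 'sample' command: pretend to roll out and return transitions.
        transitions = [(index, payload)]
        output_q.put((index, transitions))

if __name__ == '__main__':
    num_workers = 2
    events = [mp.Event() for _ in range(num_workers)]
    input_qs = [mp.Queue() for _ in range(num_workers)]
    output_q = mp.Queue()
    workers = [mp.Process(target=worker_loop, args=(i, events[i], input_qs[i], output_q))
               for i in range(num_workers)]
    for w in workers:
        w.start()
    # Dispatch one sampling job per worker.
    for i in range(num_workers):
        input_qs[i].put(('sample', 'actor-weights'))
        events[i].set()
    # Collect the results.
    for _ in range(num_workers):
        index, transitions = output_q.get()
        print('got', len(transitions), 'transitions from worker', index)
    # Shut everything down.
    for i in range(num_workers):
        input_qs[i].put(('exit', None))
        events[i].set()
    for w in workers:
        w.join()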
Example #19
0
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
    normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
    popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
    tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()
    #print(np.abs(env.action_space.low))
    #print(np.abs(env.action_space.high))
    #assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    print(env.action_space)
    print(env.observation_space)
    #logger.info('scaling actions by {} before executing in env'.format(max_action))
    # `load_memory` is assumed to be a module-level flag in the original script.
    if load_memory:
        memory = pickle.load(open("/home/vaisakhs_shaj/Desktop/BIG-DATA/memory1000000.pickle", "rb"))


    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
 
    

    '''
     # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None
    '''
    saver = tf.train.Saver()
    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=10)
    with U.make_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        # `restore` is assumed to be a module-level flag in the original script.
        if restore:
            filename = "/home/vaisakhs_shaj/Desktop/MODEL/tfSteps" + str(120000) + ".model"
            saver.restore(sess, filename)
        obs = env.reset()

        if eval_env is not None:
            eval_obs = eval_env.reset()

        # Run a single deterministic episode and print its return and length.
        total_reward = 0.
        steps = 0
        while True:
            action = agent.pi(obs, apply_noise=False, compute_Q=False)[0]
            obs, r, done, info = env.step(action)
            total_reward += r
            steps += 1
            print(r)
            if done:
                print(total_reward)
                obs = env.reset()
                total_reward = 0.
                print(steps)
                break
Example #20
0
def evaluate(env,
             nb_episodes,
             reward_scale,
             render,
             param_noise,
             action_noise,
             actor,
             critic,
             memory,
             critic_l2_reg,
             normalize_returns=False,
             normalize_observations=True,
             weight_file=None):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    with U.single_threaded_session() as sess:
        agent.initialize(sess)
        if weight_file:
            saver = tf.train.Saver(actor.trainable_vars +
                                   critic.trainable_vars)
            saver.restore(sess, weight_file)
            agent.actor_optimizer.sync()
            agent.critic_optimizer.sync()
        # sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        total_reward = 0.0
        max_steps = 2000
        for ep in range(nb_episodes):
            i = 0
            done = False
            episode_reward = 0.0
            while not done and i < max_steps:
                action, q, all_actions, sample = agent.pi(obs,
                                                          apply_noise=False,
                                                          compute_Q=True)
                assert action.shape == env.action_space.shape

                assert max_action.shape == action.shape
                obs, r, done, info = env.step(max_action * action)
                episode_reward += r
                # env.render()
                # print('Action:{}, reward:{}'.format(action, r))
                # time.sleep(0.1)
                i += 1
            total_reward += episode_reward
            logger.info("Episode:{}, reward:{}, steps:{}".format(
                ep, episode_reward, i))
            if done:
                obs = env.reset()

        logger.info("Average reward:{}, total reward:{}, episodes:{}".format(
            (total_reward / nb_episodes), total_reward, nb_episodes))
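A minimal usage sketch for evaluate, reusing the Actor, Critic, and Memory constructors that appear elsewhere in these examples; the environment id, buffer size, and checkpoint path are placeholders.

import gym

# Actor, Critic, Memory and evaluate are assumed to be importable from the
# module these examples are taken from.
env = gym.make("Pendulum-v0")  # placeholder environment with symmetric actions
nb_actions = env.action_space.shape[0]

memory = Memory(limit=int(1e6),
                action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
critic = Critic(layer_norm=True)
actor = Actor(nb_actions, layer_norm=True)

evaluate(env,
         nb_episodes=10,
         reward_scale=1.0,
         render=False,
         param_noise=None,
         action_noise=None,
         actor=actor,
         critic=critic,
         memory=memory,
         critic_l2_reg=0.0,
         weight_file="path/to/checkpoint")  # placeholder checkpoint path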
Example #21
0
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    """
    Runs the training of the Deep Deterministic Policy Gradient (DDPG) model

    DDPG: https://arxiv.org/pdf/1509.02971.pdf

    :param env: (Gym Environment) the environment
    :param nb_epochs: (int) the number of training epochs
    :param nb_epoch_cycles: (int) the number cycles within each epoch
    :param render_eval: (bool) enable rendering of the evaluation environment
    :param reward_scale: (float) the value the reward should be scaled by
    :param render: (bool) enable rendering of the environment
    :param param_noise: (AdaptiveParamNoiseSpec) the parameter noise type (can be None)
    :param actor: (TensorFlow Tensor) the actor model
    :param critic: (TensorFlow Tensor) the critic model
    :param normalize_returns: (bool) should the critic output be normalized
    :param normalize_observations: (bool) should the observation be normalized
    :param critic_l2_reg: (float) l2 regularizer coefficient
    :param actor_lr: (float) the actor learning rate
    :param critic_lr: (float) the critic learning rate
    :param action_noise: (ActionNoise) the action noise type (can be None)
    :param popart: (bool) enable pop-art normalization of the critic output
        (https://arxiv.org/pdf/1602.07714.pdf)
    :param gamma: (float) the discount rate
    :param clip_norm: (float) clip the gradients (disabled if None)
    :param nb_train_steps: (int) the number of training steps
    :param nb_rollout_steps: (int) the number of rollout steps
    :param nb_eval_steps: (int) the number of evaluation steps
    :param batch_size: (int) the size of the batch for learning the policy
    :param memory: (Memory) the replay buffer
    :param tau: (float) the soft update coefficient (keep old values, between 0 and 1)
    :param eval_env: (Gym Environment) the evaluation environment (can be None)
    :param param_noise_adaption_interval: (int) adapt the parameter noise every N training steps
    """
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 enable_popart=popart,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        tf.train.Saver()  # note: this saver is created but never used in this example

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with tf_util.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        step = 0

        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for _ in range(nb_epoch_cycles):
                # Perform rollouts.
                for _ in range(nb_rollout_steps):
                    # Predict next action.
                    action, q_value = agent.policy(obs,
                                                   apply_noise=True,
                                                   compute_q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    new_obs, reward, done, _ = env.step(max_action * action)
                    step += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += reward
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q_value)
                    agent.store_transition(obs, action, reward, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    critic_loss, actor_loss = agent.train()
                    epoch_critic_losses.append(critic_loss)
                    epoch_actor_losses.append(actor_loss)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for _ in range(nb_eval_steps):
                        eval_action, eval_q = agent.policy(eval_obs,
                                                           apply_noise=False,
                                                           compute_q=True)
                        # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        eval_obs, eval_r, eval_done, _ = eval_env.step(
                            max_action * eval_action)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(step) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(scalar):
                """
                check and return the input if it is a scalar, otherwise raise ValueError

                :param scalar: (Any) the object to check
                :return: (Number) the scalar if x is a scalar
                """
                if isinstance(scalar, np.ndarray):
                    assert scalar.size == 1
                    return scalar[0]
                elif np.isscalar(scalar):
                    return scalar
                else:
                    raise ValueError('expected scalar, got %s' % scalar)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = step

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as file_handler:
                        pickle.dump(env.get_state(), file_handler)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as file_handler:
                        pickle.dump(eval_env.get_state(), file_handler)
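The logging block above sums each worker's scalar statistics with MPI.COMM_WORLD.allreduce and divides by the world size to obtain cross-worker means. A minimal standalone sketch of that pattern (stat values are made up; run under mpirun):

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()

# Each worker contributes its own local statistics.
local_stats = {'rollout/return': 10.0 * (rank + 1), 'train/loss_critic': 0.5}
keys = sorted(local_stats.keys())
local_values = np.array([local_stats[k] for k in keys])

# allreduce with the default SUM op hands every worker the element-wise sum;
# dividing by the world size turns those sums into means.
summed = comm.allreduce(local_values)
averaged = {k: v / size for k, v in zip(keys, summed)}

if rank == 0:
    print(averaged)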
Example #22
0
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          perform=False,
          expert=None,
          save_networks=False,
          supervise=False,
          pre_epoch=60,
          actor_only=False,
          critic_only=False,
          both_ours_sup=False,
          gail=False,
          pofd=False):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale,
                 expert=expert,
                 save_networks=save_networks,
                 supervise=supervise,
                 actor_only=actor_only,
                 critic_only=critic_only,
                 both_ours_sup=both_ours_sup,
                 gail=gail,
                 pofd=pofd)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        network_saving_dir = os.path.join('./saved_networks',
                                          env.env.spec.id) + '/'
        if not os.path.exists(network_saving_dir):
            os.makedirs(network_saving_dir)
        agent.initialize(sess, saver, network_saving_dir, 10000, 30000)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        if expert is None:
            pretrain = False
        else:
            pretrain = True
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        small_buffer = []
        big_buffer = []
        for epoch in range(nb_epochs):
            if epoch >= pre_epoch and pretrain:
                pretrain = False
                logger.info('Stopped pretraining at epoch {}'.format(epoch))
            for cycle in range(nb_epoch_cycles):
                if not perform:
                    # Perform rollouts.
                    for t_rollout in range(nb_rollout_steps):
                        # Predict next action.
                        action, q = agent.pi(obs,
                                             apply_noise=True,
                                             compute_Q=True)
                        assert action.shape == env.action_space.shape

                        # Execute next action.
                        if rank == 0 and render:
                            env.render()
                        assert max_action.shape == action.shape
                        new_obs, r, done, info = env.step(
                            max_action * action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])

                        t += 1
                        if rank == 0 and render:
                            env.render()
                        episode_reward += r
                        episode_step += 1

                        # Book-keeping.
                        epoch_actions.append(action)
                        epoch_qs.append(q)
                        agent.store_transition(obs, action, r, new_obs, done)
                        obs = new_obs

                        if done:
                            # Episode done.
                            epoch_episode_rewards.append(episode_reward)
                            episode_rewards_history.append(episode_reward)
                            epoch_episode_steps.append(episode_step)
                            episode_reward = 0.
                            episode_step = 0
                            epoch_episodes += 1
                            episodes += 1

                            agent.reset()
                            obs = env.reset()

                    # Train.
                    epoch_actor_losses = []
                    epoch_critic_losses = []
                    epoch_adaptive_distances = []
                    for t_train in range(nb_train_steps):
                        # Adapt param noise, if necessary.
                        if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                            distance = agent.adapt_param_noise()
                            epoch_adaptive_distances.append(distance)

                        cl, al = agent.train(pretrain)
                        epoch_critic_losses.append(cl)
                        epoch_actor_losses.append(al)
                        agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        old_eval_obs = eval_obs
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])

                        if perform:
                            small_buffer.append([
                                old_eval_obs, eval_action, eval_r, eval_obs,
                                eval_done
                            ])

                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

                            if perform and len(small_buffer) > 0:
                                big_buffer.append(small_buffer)
                                small_buffer = []
                                if len(big_buffer) > 0 and len(big_buffer) % 1000 == 0:
                                    expert_dir = os.path.join(
                                        './expert', env.env.spec.id) + '/'
                                    if not os.path.exists(expert_dir):
                                        os.makedirs(expert_dir)
                                    pwritefile = open(
                                        os.path.join(expert_dir, 'expert.pkl'),
                                        'wb')
                                    pickle.dump(big_buffer, pwritefile, -1)
                                    pwritefile.close()
                                    logger.info('Expert data saved!')
                                    return

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            combined_stats = {}
            if not perform:
                stats = agent.get_stats()
                for key in sorted(stats.keys()):
                    combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            if not perform:
                combined_stats['rollout/return'] = mpi_mean(
                    epoch_episode_rewards)
                combined_stats['rollout/return_history'] = mpi_mean(
                    np.mean(episode_rewards_history))
                combined_stats['rollout/episode_steps'] = mpi_mean(
                    epoch_episode_steps)
                combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
                combined_stats['rollout/actions_mean'] = mpi_mean(
                    epoch_actions)
                combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
                combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)

                # Train statistics.
                combined_stats['train/loss_actor'] = mpi_mean(
                    epoch_actor_losses)
                combined_stats['train/loss_critic'] = mpi_mean(
                    epoch_critic_losses)
                combined_stats['train/param_noise_distance'] = mpi_mean(
                    epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = mpi_mean(
                    np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q'] = mpi_mean(eval_qs)
                combined_stats['eval/episodes'] = mpi_mean(
                    len(eval_episode_rewards))
            if not perform:
                # Total statistics.
                combined_stats['total/duration'] = mpi_mean(duration)
                combined_stats['total/steps_per_second'] = mpi_mean(
                    float(t) / float(duration))
                combined_stats['total/episodes'] = mpi_mean(episodes)
                combined_stats['total/epochs'] = epoch + 1
                combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
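When perform is set, the loop above pickles the collected trajectories as a list of episodes, each a list of [obs, action, reward, next_obs, done] entries, to ./expert/<env_id>/expert.pkl. A minimal sketch of reading that file back and flattening it into transitions (the layout mirrors the example; the function name is hypothetical):

import os
import pickle


def load_expert_episodes(env_id, expert_root='./expert'):
    # The example writes big_buffer (a list of episodes) to ./expert/<env_id>/expert.pkl.
    path = os.path.join(expert_root, env_id, 'expert.pkl')
    with open(path, 'rb') as f:
        episodes = pickle.load(f)
    # Flatten into individual (obs, action, reward, next_obs, done) tuples.
    transitions = [tuple(step) for episode in episodes for step in episode]
    return episodes, transitions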
    cm_pos = [state_desc["misc"]["mass_center_pos"][i] - pelvis[i] for i in range(2)]
    res = res + cm_pos + state_desc["misc"]["mass_center_vel"] + state_desc["misc"]["mass_center_acc"]

    return res
# Settings
remote_base = "http://grader.crowdai.org:1729"
crowdai_token = "01342e360022c2def5c2cc04c5843381"

Client = Client(remote_base)  # note: this rebinds the class name Client to the client instance

layer_norm = True
nb_actions = 19
# ProstheticsEnv: 19-dimensional muscle actions, 158-dimensional observations.
memory = Memory(limit=int(1.5e6), action_shape=(19,), observation_shape=(158,))
critic = Critic(layer_norm=layer_norm)
actor = Actor(nb_actions, layer_norm=layer_norm)
agent = DDPG(actor, critic, memory, (158,), (19,),
    gamma=0.99)
saver = tf.train.Saver()
# IMPLEMENTATION OF YOUR CONTROLLER
# my_controller = ... (for example the one trained in keras_rl)

sess = tf.InteractiveSession()
agent.initialize(sess)
sess.graph.finalize()
agent.reset()
filename = "/home/vaisakhs_shaj/Desktop/MODEL/tfSteps" + str(30000) + ".model"
saver.restore(sess, filename)
# Create environment
observation = Client.env_create(env_id="ProstheticsEnv", token=crowdai_token)

#print([n.name for n in tf.get_default_graph().as_graph_def().node])
def train(env,
          nb_epochs,
          nb_episodes,
          episode_length,
          nb_train_steps,
          eval_freq,
          nb_eval_episodes,
          actor,
          critic,
          memory,
          gamma,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          clip_norm,
          batch_size,
          reward_scale,
          tau=0.01):
    """
    Parameters
    ----------
    nb_epochs : the number of epochs to train.

    nb_episodes : the number of episodes for each epoch.

    episode_length : the maximum number of steps for each episode.

    gamma : discount factor.

    tau : soft update coefficient.

    clip_norm : clip on the norm of the gradient.
    """
    # Initialize DDPG agent (target network and replay buffer)
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=None,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)

    # We need max_action because the NN output layer is a tanh.
    # So we must scale it back.
    max_action = env.action_space.high

    with U.single_threaded_session() as sess:
        agent.initialize(sess)

        # Setup summary writer
        writer = _setup_tf_summary()
        writer.add_graph(sess.graph)

        stats = EvaluationStatistics(tf_session=sess, tf_writer=writer)
        sess.graph.finalize()

        global_step = 0
        obs = env.reset()
        agent.reset()
        for epoch in range(nb_epochs):
            for episode in range(nb_episodes):
                obs = env.reset()
                # Generate a trajectory
                for t in range(episode_length):
                    # Select action a_t according to current policy and
                    # exploration noise
                    a_t, _ = agent.pi(obs, apply_noise=True, compute_Q=False)
                    assert a_t.shape == env.action_space.shape

                    # Execute action a_t and observe reward r_t and next state s_{t+1}
                    new_obs, r_t, done, info = env.step(max_action * a_t)

                    # Store transition in the replay buffer
                    agent.store_transition(obs, a_t, r_t, new_obs, done)
                    obs = new_obs

                    if done:
                        agent.reset()
                        obs = env.reset()
                        break  # End episode

                # Training phase
                for t_train in range(nb_train_steps):
                    critic_loss, actor_loss = agent.train()
                    agent.update_target_net()

                    # Plot statistics
                    stats.add_critic_loss(critic_loss, global_step)
                    stats.add_actor_loss(actor_loss, global_step)
                    global_step += 1

                # Evaluation phase
                if episode % eval_freq == 0:
                    # Generate evaluation trajectories
                    for eval_episode in range(nb_eval_episodes):
                        obs = env.reset()
                        for t in range(episode_length):
                            env.render()

                            # Select action a_t according to current policy and
                            # exploration noise
                            a_t, _ = agent.pi(obs,
                                              apply_noise=False,
                                              compute_Q=False)
                            assert a_t.shape == env.action_space.shape

                            # Execute action a_t and observe reward r_t and next state s_{t+1}
                            obs, r_t, eval_done, info = env.step(max_action *
                                                                 a_t)
                            stats.add_reward(r_t)

                            if eval_done:
                                obs = env.reset()
                                break

                    # Plot average reward
                    stats.plot_reward(global_step)
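The docstring above describes tau as the soft update coefficient. A minimal numpy illustration of the Polyak averaging that agent.update_target_net() applies to every parameter (array names are hypothetical):

import numpy as np


def soft_update(target_param, online_param, tau=0.01):
    # target <- tau * online + (1 - tau) * target, applied element-wise.
    return tau * online_param + (1.0 - tau) * target_param


# With tau=0.01 the target weights move only 1% of the way toward the online weights.
target = np.zeros(3)
online = np.ones(3)
target = soft_update(target, online, tau=0.01)  # -> array([0.01, 0.01, 0.01])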
Example #25
0
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          overwrite_memory,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          logdir,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          eval_jump,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          agentName=None,
          resume=0,
          max_to_keep=100):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver(max_to_keep=max_to_keep)
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)

    logF = open(os.path.join(logdir, 'log.txt'), 'a')
    logStats = open(os.path.join(logdir, 'log_stats.txt'), 'a')
    logReward = open(os.path.join(logdir, 'logReward.txt'), 'a')

    with U.single_threaded_session() as sess:
        # Prepare everything.
        if (resume == 0):
            agent.initialize(sess, max_to_keep=max_to_keep)
        else:
            #restore = "{}-{}".format(agentName,resume)
            agent.initialize(sess,
                             path=os.path.abspath(logdir),
                             restore=agentName,
                             itr=resume,
                             overwrite=overwrite_memory,
                             max_to_keep=max_to_keep)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(resume, resume + nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        print("Epoch " + str(epoch) + " episodes " +
                              str(episodes) + " steps " + str(episode_step) +
                              " reward " + str(episode_reward))
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None and epoch % eval_jump == 0:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            print("Eval reward " + str(eval_episode_reward))
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = mpi_mean(
                np.mean(episode_rewards_history))
            combined_stats['rollout/episode_steps'] = mpi_mean(
                epoch_episode_steps)
            combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)

            # Train statistics.
            combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = mpi_mean(
                epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None and epoch % eval_jump == 0:
                combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = mpi_mean(
                    np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q'] = mpi_mean(eval_qs)
                combined_stats['eval/episodes'] = mpi_mean(
                    len(eval_episode_rewards))

            # Total statistics.
            combined_stats['total/duration'] = mpi_mean(duration)
            combined_stats['total/steps_per_second'] = mpi_mean(
                float(t) / float(duration))
            combined_stats['total/episodes'] = mpi_mean(episodes)
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            #    logdir = logger.get_dir()
            if rank == 0:
                logReward.write(
                    str(epoch) + "," + str(combined_stats["rollout/return"]) +
                    "\n")
                logReward.flush()
                logF.write(str(combined_stats["rollout/return"]) + "\n")
                json.dump(combined_stats, logStats)
                logF.flush()
                logStats.flush()

                #    if not os.path.exists(os.path.abspath(logdir)):
                #        os.makedirs(os.path.abspath(logdir), exist_ok=True)
                #    print("logdir = ", logdir)
                #    with open(os.path.join(logdir, "{}_{}".format(agentName, agent.itr.eval())), 'wb') as f:
                #        pickle.dump(agent, f)

                agent.save(path=logdir,
                           name=agentName,
                           overwrite=overwrite_memory)
                logger.info("agent {} saved".format(agent.itr.eval()))
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
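Example #25 above appends each epoch's combined_stats to log_stats.txt with json.dump, which writes the JSON objects back-to-back with no separator and can fail on numpy scalar types. A minimal sketch of a newline-delimited variant (the helper name is hypothetical):

import json
import numpy as np


def dump_stats_line(stats, file_handler):
    # Convert numpy scalars/0-d arrays to plain Python numbers so json can
    # serialize them, assuming each stat is a single scalar value.
    clean = {}
    for key, value in stats.items():
        if isinstance(value, (np.generic, np.ndarray)):
            value = np.asarray(value).item()
        clean[key] = value
    # One JSON object per line (newline-delimited JSON) keeps the log parseable.
    file_handler.write(json.dumps(clean) + "\n")
    file_handler.flush()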
Example #26
0
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          save_path=None,
          restore_path=None,
          hindsight_mode=None):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()

        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                transitions = []
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    transitions.append((obs, action, r, new_obs, done))
                    #agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # store regular transitions into replay memory
                for (obs, action, r, new_obs, done) in transitions:
                    agent.store_transition(obs, action, r, new_obs, done)

                if hindsight_mode in ['final', 'future']:
                    for (obs, action, r, new_obs,
                         done) in replay_final(transitions, env.env):
                        agent.store_transition(obs, action, r, new_obs, done)

                if hindsight_mode in ['future']:
                    for (obs, action, r, new_obs,
                         done) in replay_future(transitions, env.env):
                        agent.store_transition(obs, action, r, new_obs, done)

                # store hindsight transitions.
                '''for i in range(3):
                    # sample a random point in the trajectory
                    idx = np.random.randint(0, len(transitions))
                    obs, action, r, new_obs, done = transitions[idx]
                    # create a goal from that point
                    goal = env.env.obs_to_goal(new_obs)
                    for (obs, action, r, new_obs, done) in replay_with_goal(transitions[:idx+1], goal, env.env):
                        agent.store_transition(obs, action, r, new_obs, done)
                obs, action, r, new_obs, done = transitions[-1]

                # store a "final" transition.
                goal = env.env.obs_to_goal(new_obs)
                for (obs, action, r, new_obs, done) in replay_with_goal(transitions, goal, env.env):
                    agent.store_transition(obs, action, r, new_obs, done)'''

                # Train.

                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            combined_stats['reward'] = mpi_mean(epoch_episode_rewards)
            # combined_stats['rollout/return_history'] = mpi_mean(np.mean(episode_rewards_history))
            combined_stats['episode_steps'] = mpi_mean(epoch_episode_steps)
            combined_stats['episodes'] = mpi_sum(epoch_episodes)
            # combined_stats['actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['actions_std'] = mpi_std(epoch_actions)
            combined_stats['Q_mean'] = mpi_mean(epoch_qs)

            # Train statistics.
            combined_stats['policy_loss'] = mpi_mean(epoch_actor_losses)
            combined_stats['value_loss'] = mpi_mean(epoch_critic_losses)
            combined_stats['param_noise_distance'] = mpi_mean(
                epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/reward'] = mpi_mean(eval_episode_rewards)
                # combined_stats['eval/return_history'] = mpi_mean(np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q_mean'] = mpi_mean(eval_qs)
                # combined_stats['eval/episodes'] = mpi_mean(len(eval_episode_rewards))

            # Total statistics.
            # combined_stats['total/duration'] = mpi_mean(duration)
            combined_stats['total/steps_per_second'] = mpi_mean(
                float(t) / float(duration))
            # combined_stats['total/episodes'] = mpi_mean(episodes)
            # combined_stats['total/epochs'] = epoch + 1
            # combined_stats['total/steps'] = t
            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
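Example #26 above stores extra transitions produced by replay_final and replay_future, which are not shown; they appear to relabel the rollout with achieved goals in the style of hindsight experience replay. A minimal sketch of the 'final' strategy, assuming the wrapped env exposes obs_to_goal (referenced in the commented-out block above) and a goal-conditioned reward_fn (hypothetical):

def replay_final(transitions, goal_env):
    # Hindsight relabelling, 'final' strategy: pretend the goal was whatever
    # state the trajectory actually ended in, and recompute rewards under it.
    if not transitions:
        return []
    _, _, _, last_obs, _ = transitions[-1]
    goal = goal_env.obs_to_goal(last_obs)      # assumed helper on the wrapped env
    relabelled = []
    for obs, action, _, new_obs, done in transitions:
        r = goal_env.reward_fn(new_obs, goal)  # assumed goal-conditioned reward
        relabelled.append((obs, action, r, new_obs, done))
    return relabelled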
Example #27
0
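Example #27 below rescales the actor's [-1, 1] outputs to the environment's action bounds with a scale_range helper that is not shown. A minimal sketch of such a linear rescaling (hypothetical implementation):

import numpy as np


def scale_range(x, old_low, old_high, new_low, new_high):
    # Linearly map x from [old_low, old_high] to [new_low, new_high], element-wise.
    x = np.asarray(x, dtype=np.float64)
    return new_low + (x - old_low) * (new_high - new_low) / (old_high - old_low)

# e.g. scale_range(action, -1, 1, env.action_space.low, env.action_space.high)
# maps the tanh midpoint 0.0 to the middle of each action dimension's range.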
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          restore=True):
    rank = MPI.COMM_WORLD.Get_rank()

    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    # max_action = env.action_space.high
    # logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape, (env.action_space.shape[0], ),
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 observation_range=(env.observation_space.low[0],
                                    env.observation_space.high[0]),
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up saving stuff only for a single worker.
    savingModelPath = "/home/joel/Documents/saved_models_OpenAI_gym/"
    if rank == 0:
        saver = tf.train.Saver(keep_checkpoint_every_n_hours=1)
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.

        # from https://github.com/openai/baselines/issues/162#issuecomment-397356482 and
        # https://www.tensorflow.org/api_docs/python/tf/train/import_meta_graph

        if restore:
            # restoring doesn't actually work
            logger.info("Restoring from saved model")
            saver = tf.train.import_meta_graph(savingModelPath +
                                               "ddpg_test_model.meta")
            saver.restore(sess, tf.train.latest_checkpoint(savingModelPath))
        else:
            logger.info("Starting from scratch!")
            sess.run(tf.global_variables_initializer()
                     )  # this should happen here and not in the agent right?

        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0
        t_rollout = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            start_time_epoch = time.time()
            for cycle in range(nb_epoch_cycles):
                start_time_cycle = time.time()
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # while(not done):
                    start_time_rollout = time.time()
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    logging.debug("q-value of selected action: {}".format(q))

                    # np.set_printoptions(precision=3)
                    logging.debug(
                        "selected (unscaled) action: " +
                        str(action))  # e.g. [ 0.04  -0.662 -0.538  0.324]
                    # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    target = scale_range(action, -1, 1, env.action_space.low,
                                         env.action_space.high)
                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert target.shape == env.action_space.shape
                    new_obs, r, done, info = env.step(target)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done or t_rollout >= nb_rollout_steps - 1:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()
                    # t_rollout += 1
                    logger.info(
                        'runtime rollout-step {0}.{1}.{2}: {3}s'.format(
                            epoch, cycle, t_rollout,
                            time.time() - start_time_rollout))
                # for rollout_steps

                # Train.
                logging.info("Training the Agent")
                start_time_train = time.time()
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):  # 50 iterations
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()  # e.g. 0.7446093559265137
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    logging.debug(
                        "critic loss: {}".format(cl))  # e.g. 25.988863
                    logging.debug(
                        "actor loss: {}".format(al))  # e.g. -0.008966461

                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        # This variant never defines max_action (it is commented out above),
                        # so reuse the same scale_range helper as the rollout loop to map the
                        # policy's [-1, 1] output into the eval env's action range.
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            scale_range(eval_action, -1, 1,
                                        eval_env.action_space.low,
                                        eval_env.action_space.high))
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.
                logger.info('runtime training actor & critic: {}s'.format(
                    time.time() - start_time_train))

                # Saving the trained model
                if saver is not None:
                    logger.info("saving the trained model")
                    start_time_save = time.time()
                    saver.save(sess, savingModelPath + "ddpg_test_model")
                    logger.info('runtime saving: {}s'.format(time.time() -
                                                             start_time_save))

                done = False

                logger.info('runtime epoch-cycle {0}: {1}s'.format(
                    cycle,
                    time.time() - start_time_cycle))
            # for epoch_cycles

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
                logging.info("\t{0} : {1}".format(key, combined_stats[key]))
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

            # Saving the trained model
            if saver is not None:
                logger.info("saving the trained model")
                start_time_save = time.time()
                saver.save(sess,
                           savingModelPath + "ddpg_model_epochSave",
                           global_step=epoch)
                logger.info('runtime saving: {}s'.format(time.time() -
                                                         start_time_save))

            logger.info('runtime epoch {0}: {1}s'.format(
                epoch,
                time.time() - start_time_epoch))
    def run(self):
        """Override Process.run()"""
        # Create environment
        env = create_environment(
            action_repeat=self.action_repeat,
            full=self.full,
            exclude_centering_frame=self.exclude_centering_frame,
            visualize=self.visualize,
            fail_reward=self.fail_reward,
            integrator_accuracy=self.integrator_accuracy)
        nb_actions = env.action_space.shape[-1]

        env.seed(os.getpid())
        set_global_seeds(os.getpid())

        num_traj = 0

        # Allocate ReplayBuffer
        memory = Memory(limit=int(1e6),
                        action_shape=env.action_space.shape,
                        observation_shape=env.observation_space.shape)

        # Create DDPG agent
        agent = DDPG(self.actor,
                     self.critic,
                     memory,
                     env.observation_space.shape,
                     env.action_space.shape,
                     gamma=self.gamma,
                     tau=self.tau,
                     normalize_returns=self.normalize_returns,
                     normalize_observations=self.normalize_observations,
                     batch_size=self.batch_size,
                     action_noise=None,
                     param_noise=None,
                     critic_l2_reg=self.critic_l2_reg,
                     enable_popart=self.popart,
                     clip_norm=self.clip_norm,
                     reward_scale=self.reward_scale)

        # Build the testing logic fn
        testing_fn = make_testing_fn(agent, env, self.episode_length,
                                     self.action_repeat, self.max_action,
                                     self.nb_episodes)

        # Start TF session
        with U.single_threaded_session() as sess:
            agent.initialize(sess)
            set_parameters = U.SetFromFlat(self.actor.trainable_vars)

            # Start sampling-worker loop.
            while True:
                # Pop the next message sent by the parent process.
                message, actor_ws, global_step = self.inputQ.get()
                if message == 'test':
                    # Set weights
                    set_parameters(actor_ws)
                    # Do testing
                    rewards, step_times, distances, episode_lengths = testing_fn()
                    self.outputQ.put((rewards, step_times, distances,
                                      episode_lengths, global_step))

                    # update number of trajectories
                    num_traj += self.nb_episodes

                    # restore environment if needed
                    if num_traj >= self.max_env_traj:
                        env.restore()
                        num_traj = 0

                elif message == 'exit':
                    print('[Worker {}] Exiting...'.format(os.getpid()))
                    env.close()
                    break
Example #29
0
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          load_network_id,
          latest,
          plot_info,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        if load_network_id:
            agent.load_actor_critic(id=load_network_id)
        if latest:
            agent.load_actor_critic(latest=True)

        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_distances2target = []
        epoch_episode_relative_alt = []
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        logger.info('EPISODE OVER!')
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        epoch_episode_distances2target.append(
                            info['dist2target'])
                        epoch_episode_relative_alt.append(info['relative_alt'])
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        if episodes % 10 == 0:
                            agent.save_actor_critic(id=episodes)

                        if episodes % 2 == 0 and plot_info:
                            plot_information(epoch_episode_distances2target,
                                             epoch_episode_rewards,
                                             epoch_episode_relative_alt)
                            plt.pause(0.1)

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            # Update learning rates
            if epoch % 5 == 0 and epoch > 0:
                agent.update_lr(agent.actor_lr * 0.65, agent.critic_lr * 0.65)

    logger.info('Finished training')
Example #30
0
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          save_model,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up model saving only for a single worker (rank 0).
    if rank == 0:
        saver = tf.train.Saver()
        if not os.path.exists(os.path.join(logger.get_dir(), 'model')):
            os.makedirs(os.path.join(logger.get_dir(), 'model'))
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.make_session(num_cpu=4) as sess:  # instead of U.single_threaded_session()
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
    def __init__(self,
                 env,
                 agent_index,
                 sess,
                 action_range=(-1., 1.),
                 reward_scale=0.1,
                 critic_l2_reg=1e-2,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 popart=False,
                 gamma=0.975,
                 clip_norm=10,
                 batch_size=64,
                 memory_size=1e6,
                 tau=0.01,
                 normalize_returns=False,
                 normalize_observations=False,
                 noise_type="adaptive-param_0.1",
                 layer_norm=True,
                 nb_layers=2,
                 nb_neurons=64,
                 activation='tanh',
                 **network_kwargs):
        super(DDPGAgent, self).__init__(agent_index)
        # self.sess = sess
        self.nb_actions = env.action_space[agent_index].n
        print('agent action_space ' + str(env.action_space[agent_index].n))
        self.state_size = env.observation_space[agent_index].shape
        self.action_range = action_range

        with tf.variable_scope('ddpg_' + str(agent_index)):
            critic = Critic(name='critic_' + str(agent_index),
                            layer_norm=layer_norm,
                            nb_layers=nb_layers,
                            nb_neurons=nb_neurons)
            actor = Actor(self.nb_actions,
                          name='actor_' + str(agent_index),
                          layer_norm=layer_norm,
                          nb_neurons=nb_neurons,
                          activation=activation)
            memory = Memory(limit=int(memory_size),
                            action_shape=(self.nb_actions, ),
                            observation_shape=self.state_size)
            action_noise = None
            param_noise = None
            if noise_type is not None:
                for current_noise_type in noise_type.split(','):
                    current_noise_type = current_noise_type.strip()
                    if current_noise_type == 'none':
                        pass
                    elif 'adaptive-param' in current_noise_type:
                        _, stddev = current_noise_type.split('_')
                        param_noise = AdaptiveParamNoiseSpec(
                            initial_stddev=float(stddev),
                            desired_action_stddev=float(stddev))
                    elif 'normal' in current_noise_type:
                        _, stddev = current_noise_type.split('_')
                        action_noise = NormalActionNoise(
                            mu=np.zeros(self.nb_actions),
                            sigma=float(stddev) * np.ones(self.nb_actions))
                    elif 'ou' in current_noise_type:
                        _, stddev = current_noise_type.split('_')
                        action_noise = OrnsteinUhlenbeckActionNoise(
                            mu=np.zeros(self.nb_actions),
                            sigma=float(stddev) * np.ones(self.nb_actions),
                            dt=env.world.dt,
                            theta=0.1)
                    else:
                        raise RuntimeError('unknown noise type "{}"'.format(
                            current_noise_type))

            self.agent = DDPG(actor,
                              critic,
                              memory,
                              self.state_size, (self.nb_actions, ),
                              action_range=self.action_range,
                              gamma=gamma,
                              tau=tau,
                              normalize_returns=normalize_returns,
                              normalize_observations=normalize_observations,
                              batch_size=batch_size,
                              action_noise=action_noise,
                              param_noise=param_noise,
                              critic_l2_reg=critic_l2_reg,
                              actor_lr=actor_lr,
                              critic_lr=critic_lr,
                              enable_popart=popart,
                              clip_norm=clip_norm,
                              reward_scale=reward_scale)

            logger.info('Using agent with the following configuration:')
            logger.info(str(self.agent.__dict__.items()))

        self.agent.initialize(sess)
        self.agent.reset()