Example #1
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
    normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
    popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
    tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0

        # saver.restore(tf.get_default_session(), '/home/projectvenom/Documents/AIPilot/AIPilot-ProjectVenom-master/model_SMode_exp4/Exp4_1/Exp4_SMode_best')
        
        # Rolling windows over the last 2000 episodes for distance, success rate and return.
        roll_distance = deque(maxlen=2000)
        roll_success_rate = deque(maxlen=2000)
        roll_return = deque(maxlen=2000)
        max_success_rate = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # episode_distance.append(info['distance'])

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:

                        # deque(maxlen=2000) discards the oldest entry automatically,
                        # so no manual popleft bookkeeping is needed here.
                        roll_distance.append(info['distance'])
                        roll_success_rate.append(1 if info['distance'] <= 1 else 0)
                        roll_return.append(episode_reward)

                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()
                
                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                  
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()
                
                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.
            
            # Only the rank-0 worker constructs a Saver, so guard the checkpointing.
            if saver is not None:
                saver.save(tf.get_default_session(), '/home/projectvenom/Documents/AIPilot/AIPilot-ProjectVenom-master/model/Exp5_mv')

            if np.mean(roll_success_rate) > max_success_rate:
                max_success_rate = np.mean(roll_success_rate)
                if saver is not None:
                    saver.save(tf.get_default_session(), '/home/projectvenom/Documents/AIPilot/AIPilot-ProjectVenom-master/model/Exp5_mv_best')

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()

            # Rolling-window diagnostics over the last 2000 episodes (temporary, for testing).
            combined_stats['AI_success_rate'] = np.mean(roll_success_rate)
            combined_stats['AI_distance'] = np.mean(roll_distance)
            combined_stats['AI_return'] = np.mean(roll_return)

            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)
            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s'%x)
            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
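
Each of the train variants in this section ends an epoch with the same MPI idiom: every worker flattens its stats dict into a vector of scalars, sums the vectors across the communicator, and divides by the world size to log worker-averaged values. A minimal, self-contained sketch of that idiom (assuming mpi4py; the sorted-key step is an addition that guarantees the same ordering on every rank):

# Sketch of the cross-worker stat averaging used above (assumes mpi4py).
import numpy as np
from mpi4py import MPI


def average_stats_across_workers(stats):
    """Average a {name: scalar} dict over all MPI workers."""
    comm = MPI.COMM_WORLD
    keys = sorted(stats.keys())                       # same order on every rank
    local = np.array([float(stats[k]) for k in keys])
    summed = comm.allreduce(local)                    # element-wise sum (default op is SUM)
    return {k: v / comm.Get_size() for k, v in zip(keys, summed)}


# usage: mpirun -np 4 python this_sketch.py
if __name__ == '__main__':
    rank = MPI.COMM_WORLD.Get_rank()
    print(rank, average_stats_across_workers({'rollout/return': 1.0 + rank}))
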
Example #2
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          saved_model_basename,
          restore_model_name,
          crowdai_client,
          crowdai_token,
          reward_shaping,
          feature_embellishment,
          relative_x_pos,
          relative_z_pos,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    saved_model_dir = 'saved-models/'
    if saved_model_basename is None:
        saved_model_basename = ''.join(
            random.choices(string.ascii_lowercase + string.digits, k=8))
    saved_model_path = saved_model_dir + saved_model_basename
    if restore_model_name:
        restore_model_path = restore_model_name
        if not pathlib.Path(restore_model_path + '.index').is_file():
            restore_model_path = saved_model_dir + restore_model_name
    max_to_keep = 500
    eval_reward_threshold_to_keep = 300
    saver = tf.train.Saver(max_to_keep=max_to_keep)
    adam_optimizer_store = dict()
    adam_optimizer_store['actor_optimizer'] = dict()
    adam_optimizer_store['critic_optimizer'] = dict()

    #eval_episode_rewards_history = deque(maxlen=100)
    #episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        try:
            if restore_model_name:
                logger.info("Restoring from model at", restore_model_path)
                #saver.restore(sess, tf.train.latest_checkpoint(model_path))
                saver.restore(sess, restore_model_path)
            else:
                logger.info("Creating new model")
                sess.run(tf.global_variables_initializer(
                ))  # this should happen here and not in the agent right?
        except InvalidArgumentError as exc:
            if "Assign requires shapes of both tensors to match." in str(exc):
                print("Unable to restore model from {:s}.".format(
                    restore_model_path))
                print(
                    "Chances are you're trying to restore a model with reward embellishment into an environment without reward embellishment (or vice versa). Unfortunately this isn't supported (yet)."
                )
                print(exc.message)
                sys.exit()
            else:
                raise exc

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()

        # restore adam optimizer
        try:
            if restore_model_name:
                logger.info("Restoring pkl file with adam state",
                            restore_model_path)
                #saver.restore(sess, tf.train.latest_checkpoint(model_path))
                adam_optimizer_store = pickle.load(
                    open(restore_model_path + ".pkl", "rb"))
                agent.actor_optimizer.m = adam_optimizer_store[
                    'actor_optimizer']['m']
                agent.actor_optimizer.v = adam_optimizer_store[
                    'actor_optimizer']['v']
                agent.actor_optimizer.t = adam_optimizer_store[
                    'actor_optimizer']['t']
                agent.critic_optimizer.m = adam_optimizer_store[
                    'critic_optimizer']['m']
                agent.critic_optimizer.v = adam_optimizer_store[
                    'critic_optimizer']['v']
                agent.critic_optimizer.t = adam_optimizer_store[
                    'critic_optimizer']['t']
                if 'param_noise' in adam_optimizer_store:
                    agent.param_noise = adam_optimizer_store['param_noise']
        except Exception:
            print("Unable to restore adam state from {:s}.".format(
                restore_model_path))

        obs = env.reset()
        done = False
        episode_reward = 0.
        #episode_step = 0
        #episodes = 0
        #t = 0

        #epoch_episode_steps = []
        #epoch_episode_eval_rewards = []
        #epoch_episode_eval_steps = []
        #epoch_start_time = time.time()
        #epoch_actions = []
        #epoch_episodes = 0
        for epoch in range(nb_epochs):
            start_time = time.time()
            epoch_episode_rewards = []
            epoch_qs = []
            eval_episode_rewards = []
            eval_qs = []
            eval_steps = []
            epoch_actor_losses = []
            epoch_critic_losses = []
            worth_keeping = False
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    #new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    new_obs, r, done, info = env.step(action)
                    #t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    #episode_step += 1

                    # Book-keeping.
                    #epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        #episode_rewards_history.append(episode_reward)
                        #epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        #episode_step = 0
                        #epoch_episodes += 1
                        #episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                #epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        #epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Submit to crowdai competition. What a hack. :)
                #if crowdai_client is not None and crowdai_token is not None and eval_env is not None:
                crowdai_submit_count = 0
                if crowdai_client is not None and crowdai_token is not None:
                    eval_obs_dict = crowdai_client.env_create(
                        crowdai_token, env_id="ProstheticsEnv")
                    eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                        eval_obs_dict,
                        reward_shaping=reward_shaping,
                        reward_shaping_x=1.,
                        feature_embellishment=feature_embellishment,
                        relative_x_pos=relative_x_pos,
                        relative_z_pos=relative_z_pos)
                    while True:
                        action, _ = agent.pi(eval_obs_projection,
                                             apply_noise=False,
                                             compute_Q=False)
                        submit_action = prosthetics_env.openai_to_crowdai_submit_action(
                            action)
                        clipped_submit_action = np.clip(submit_action, 0., 1.)
                        actions_equal = clipped_submit_action == submit_action
                        if not np.all(actions_equal):
                            logger.debug("crowdai_submit_count:",
                                         crowdai_submit_count)
                            logger.debug("  openai-action:", action)
                            logger.debug("  submit-action:", submit_action)
                        crowdai_submit_count += 1
                        [eval_obs_dict, reward, done,
                         info] = crowdai_client.env_step(
                             clipped_submit_action.tolist(), True)
                        #[eval_obs_dict, reward, done, info] = crowdai_client.env_step(agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False), True)
                        eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                            eval_obs_dict,
                            reward_shaping=reward_shaping,
                            reward_shaping_x=1.,
                            feature_embellishment=feature_embellishment,
                            relative_x_pos=relative_x_pos,
                            relative_z_pos=relative_z_pos)
                        if done:
                            logger.debug("done: crowdai_submit_count:",
                                         crowdai_submit_count)
                            eval_obs_dict = crowdai_client.env_reset()
                            if not eval_obs_dict:
                                break
                            logger.debug(
                                "done: eval_obs_dict exists after reset")
                            eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                                eval_obs_dict,
                                reward_shaping=reward_shaping,
                                reward_shaping_x=1.,
                                feature_embellishment=feature_embellishment,
                                relative_x_pos=relative_x_pos,
                                relative_z_pos=relative_z_pos)
                    crowdai_client.submit()
                    return  # kids, don't try any of these (expedient hacks) at home!

            if eval_env:
                eval_episode_reward_mean, eval_q_mean, eval_step_mean = evaluate_n_episodes(
                    3, eval_env, agent, nb_eval_steps, render_eval)
                if eval_episode_reward_mean >= eval_reward_threshold_to_keep:
                    worth_keeping = True

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            if nb_epochs and nb_epoch_cycles and nb_train_steps > 0:
                #stats = agent.get_stats()
                #combined_stats = stats.copy()
                combined_stats = {}
                combined_stats['train/epoch_episode_reward_mean'] = np.mean(
                    epoch_episode_rewards)
                #combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
                #combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
                #combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
                combined_stats['train/epoch_Q_mean'] = np.mean(epoch_qs)
                combined_stats['train/epoch_loss_actor'] = np.mean(
                    epoch_actor_losses)
                combined_stats['train/epoch_loss_critic'] = np.mean(
                    epoch_critic_losses)
                #combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
                combined_stats['train/epoch_duration'] = duration
                #combined_stats['epoch/steps_per_second'] = float(t) / float(duration)
                #combined_stats['total/episodes'] = episodes
                #combined_stats['rollout/episodes'] = epoch_episodes
                #combined_stats['rollout/actions_std'] = np.std(epoch_actions)
                #combined_stats['memory/rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
            else:
                combined_stats = {}
            # Evaluation statistics.
            if eval_env:
                combined_stats[
                    'eval/epoch_episode_reward_mean'] = eval_episode_reward_mean  # np.mean(eval_episode_rewards)
                #combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                #combined_stats['eval/epoch_episode_reward_std'] = np.std(eval_episode_rewards)
                combined_stats[
                    'eval/epoch_Q_mean'] = eval_q_mean  # np.mean(eval_qs)
                #combined_stats['eval/episodes'] = len(eval_episode_rewards)
                combined_stats[
                    'eval/steps_mean'] = eval_step_mean  # np.mean(eval_steps)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            #combined_stats['total/epochs'] = epoch + 1
            #combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.info('')
            logger.info('Epoch', epoch)
            logger.dump_tabular()
            logdir = logger.get_dir()

            if worth_keeping and rank == 0 and nb_epochs and nb_epoch_cycles and nb_train_steps and nb_rollout_steps:
                logger.info(
                    'Saving model to',
                    saved_model_dir + saved_model_basename + '-' + str(epoch))
                saver.save(sess,
                           saved_model_path,
                           global_step=epoch,
                           write_meta_graph=False)
                adam_optimizer_store['actor_optimizer'][
                    'm'] = agent.actor_optimizer.m
                adam_optimizer_store['actor_optimizer'][
                    'v'] = agent.actor_optimizer.v
                adam_optimizer_store['actor_optimizer'][
                    't'] = agent.actor_optimizer.t

                adam_optimizer_store['critic_optimizer'][
                    'm'] = agent.critic_optimizer.m
                adam_optimizer_store['critic_optimizer'][
                    'v'] = agent.critic_optimizer.v
                adam_optimizer_store['critic_optimizer'][
                    't'] = agent.critic_optimizer.t

                adam_optimizer_store['param_noise'] = agent.param_noise

                pickle.dump(
                    adam_optimizer_store,
                    open((saved_model_path + "-" + str(epoch) + ".pkl"), "wb"))
                old_epoch = epoch - max_to_keep
                if old_epoch >= 0:
                    try:
                        os.remove(saved_model_path + "-" + str(old_epoch) +
                                  ".pkl")
                    except OSError:
                        pass

            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
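
Example #2 pickles the MpiAdam moment buffers (m, v, t) next to the TF checkpoint because tf.train.Saver only captures TF variables, not the optimizers' numpy state. A minimal sketch of that save/restore round-trip, assuming only that the optimizer objects expose numpy attributes m and v plus an integer step counter t, as baselines' MpiAdam does:

# Sketch: persist optimizer moment buffers alongside a TF checkpoint.
# Assumed optimizer interface: numpy arrays .m and .v plus an int .t (as in MpiAdam).
import pickle


def save_optimizer_state(path_prefix, actor_optimizer, critic_optimizer, param_noise=None):
    state = {
        'actor_optimizer': {'m': actor_optimizer.m, 'v': actor_optimizer.v, 't': actor_optimizer.t},
        'critic_optimizer': {'m': critic_optimizer.m, 'v': critic_optimizer.v, 't': critic_optimizer.t},
    }
    if param_noise is not None:
        state['param_noise'] = param_noise
    with open(path_prefix + '.pkl', 'wb') as f:
        pickle.dump(state, f)


def restore_optimizer_state(path_prefix, actor_optimizer, critic_optimizer):
    with open(path_prefix + '.pkl', 'rb') as f:
        state = pickle.load(f)
    for opt, key in ((actor_optimizer, 'actor_optimizer'), (critic_optimizer, 'critic_optimizer')):
        opt.m, opt.v, opt.t = state[key]['m'], state[key]['v'], state[key]['t']
    return state.get('param_noise')  # None if no parameter noise was stored
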
Example #3
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
    normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
    popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
    tau=0.01, eval_env=None, param_noise_adaption_interval=50, tensorboard_directory=None):
    rank = MPI.COMM_WORLD.Get_rank()

    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = 1
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    if not os.path.exists(tensorboard_directory):
        os.makedirs(tensorboard_directory)
    else:
        for file in os.listdir(tensorboard_directory):
            file_path = os.path.join(tensorboard_directory, file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(e)

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        episode_summaries, individual_summaries, batch_summaries, \
            episode_reward_pl, qfunc_loss_pl, actions_pl, prices_pl, individual_reward_pl, \
            individual_pnl_pl, individual_tc_pl, individual_estimated_q_pl = build_summaries(env.action_space.shape[0])
        sess.graph.finalize()
        writer = tf.summary.FileWriter(tensorboard_directory, sess.graph)

        agent.reset()
        obs_state = env.reset()
        obs = obs_state.features
        if eval_env is not None:
            eval_obs = eval_env.reset().features
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0
        train_steps = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    # assert max_action.shape == action.shape
                    new_obs_state, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    new_obs = new_obs_state.features
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    summary = sess.run(individual_summaries, feed_dict={
                                        actions_pl: action,
                                        prices_pl: obs_state.price,
                                        individual_reward_pl: r,
                                        individual_pnl_pl: info['pnl'],
                                        individual_tc_pl: info['tc'],
                                        individual_estimated_q_pl: q[0, 0]
                                    })
                    writer.add_summary(summary, t)

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs
                    obs_state = new_obs_state

                    if done:
                        # Episode done.
                        summary = sess.run(episode_summaries, feed_dict={
                                            episode_reward_pl: episode_reward
                                        })
                        writer.add_summary(summary, episodes)

                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs_state = env.reset()
                        obs = obs_state.features

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    train_steps += 1
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    summary = sess.run(batch_summaries, feed_dict={
                                        qfunc_loss_pl: cl
                                    })
                    writer.add_summary(summary, train_steps)
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        eval_obs = eval_obs.features
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset().features
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            print("EPOCH_EPISODE_REWARDS:", len(epoch_episode_rewards))
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)
            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s'%x)
            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
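
Example #3 depends on a build_summaries helper that is not shown; the three feed_dicts above imply it returns merged episode-, step- and batch-level summary ops together with their placeholders. A hypothetical TF1-style sketch consistent with that call signature (only the return order is taken from the snippet; tag names and tensor shapes are assumptions):

# Hypothetical build_summaries for Example #3 (TF1 API). Tag names and shapes are guesses.
import tensorflow as tf


def build_summaries(action_dim):
    episode_reward_pl = tf.placeholder(tf.float32, shape=(), name='episode_reward')
    qfunc_loss_pl = tf.placeholder(tf.float32, shape=(), name='qfunc_loss')
    actions_pl = tf.placeholder(tf.float32, shape=(action_dim,), name='actions')
    prices_pl = tf.placeholder(tf.float32, shape=(action_dim,), name='prices')
    individual_reward_pl = tf.placeholder(tf.float32, shape=(), name='step_reward')
    individual_pnl_pl = tf.placeholder(tf.float32, shape=(), name='step_pnl')
    individual_tc_pl = tf.placeholder(tf.float32, shape=(), name='step_tc')
    individual_estimated_q_pl = tf.placeholder(tf.float32, shape=(), name='step_estimated_q')

    episode_summaries = tf.summary.merge([
        tf.summary.scalar('episode/reward', episode_reward_pl)])
    individual_summaries = tf.summary.merge([
        tf.summary.histogram('step/actions', actions_pl),
        tf.summary.histogram('step/prices', prices_pl),
        tf.summary.scalar('step/reward', individual_reward_pl),
        tf.summary.scalar('step/pnl', individual_pnl_pl),
        tf.summary.scalar('step/tc', individual_tc_pl),
        tf.summary.scalar('step/estimated_q', individual_estimated_q_pl)])
    batch_summaries = tf.summary.merge([
        tf.summary.scalar('train/critic_loss', qfunc_loss_pl)])

    return (episode_summaries, individual_summaries, batch_summaries,
            episode_reward_pl, qfunc_loss_pl, actions_pl, prices_pl,
            individual_reward_pl, individual_pnl_pl, individual_tc_pl,
            individual_estimated_q_pl)
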
Example #4
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
    normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
    popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
    tau=0.01, eval_env=None, param_noise_adaption_interval=50, restore=True):
    rank = MPI.COMM_WORLD.Get_rank()

    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    # max_action = env.action_space.high
    # logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, (env.action_space.shape[0] -2,),
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, observation_range=(env.observation_space.low[0], env.observation_space.high[0]),
        action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up saving stuff only for a single worker.
    home = expanduser("~")
    savingModelPath = home + "/Documents/saved_models_OpenAI_gym/"
    if rank == 0:
        saver = tf.train.Saver(keep_checkpoint_every_n_hours=1)
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.

        # from https://github.com/openai/baselines/issues/162#issuecomment-397356482 and
        # https://www.tensorflow.org/api_docs/python/tf/train/import_meta_graph
        
        if restore == True:
            # restoring doesn't actually work
            logger.info("Restoring from saved model")
            saver = tf.train.import_meta_graph(savingModelPath + "ddpg_test_model.meta")
            saver.restore(sess, tf.train.latest_checkpoint(savingModelPath))
        else:
            logger.info("Starting from scratch!")
            sess.run(tf.global_variables_initializer()) # this should happen here and not in the agent right?


        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            start_time_epoch = time.time()
            for cycle in range(nb_epoch_cycles):
                start_time_cycle = time.time()
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    if(t_rollout == nb_rollout_steps - 2):
                        print("break here")
                    start_time_rollout = time.time()
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    # e.g. action = array([ 0.02667301,  0.9654905 , -0.5694418 , -0.40275186], dtype=float32)

                    np.set_printoptions(precision=3)
                    print("selected (unscaled) action: " + str(action)) # e.g. [ 0.04  -0.662 -0.538  0.324]
                    # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    target = np.insert(action, 3, [0.0, 0.0])
                    target = scale_range(target, -1, 1, env.action_space.low, env.action_space.high)
                    # e.g. target = array([0.17346749, 0.24137263, 0.10763955, 0.83703685, 1.8033525 , 1.8763105 ], dtype=float32)
                    # we keep the roll & pitch angle fixed
                    target[3] = 0.0
                    target[4] = np.pi/2
                    
                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert target.shape == env.action_space.shape
                    new_obs, r, done, info = env.step(target)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()
                    
                    logger.info('runtime rollout-step {0}.{1}.{2}: {3}s'.format(epoch, cycle, t_rollout, time.time() - start_time_rollout))
                # for rollout_steps

                # Train.
                print("Training the Agent")
                start_time_train = time.time()
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps): # 50 iterations
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise() # e.g. 0.7446093559265137
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl) # e.g. 25.988863
                    epoch_actor_losses.append(al) # e.g. -0.008966461
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        # max_action is not defined in this variant (the symmetric-action scaling
                        # above is commented out), so mirror the rollout path: pad the policy
                        # action, rescale it into the env's bounds and pin roll & pitch.
                        eval_target = np.insert(eval_action, 3, [0.0, 0.0])
                        eval_target = scale_range(eval_target, -1, 1, eval_env.action_space.low, eval_env.action_space.high)
                        eval_target[3] = 0.0
                        eval_target[4] = np.pi / 2
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(eval_target)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.
                logger.info('runtime training actor & critic: {}s'.format(time.time() - start_time_train))

                # Saving the trained model
                if(saver is not None):
                    logger.info("saving the trained model")
                    start_time_save = time.time()
                    saver.save(sess, savingModelPath + "ddpg_test_model")
                    logger.info('runtime saving: {}s'.format(time.time() - start_time_save))

                logger.info('runtime epoch-cycle {0}: {1}s'.format(cycle, time.time() - start_time_cycle))
            # for epoch_cycles

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)
            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s'%x)
            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
            
            # Saving the trained model
            if(saver is not None):
                logger.info("saving the trained model")
                start_time_save = time.time()
                saver.save(sess, savingModelPath + "ddpg_model_epochSave", global_step=epoch)
                logger.info('runtime saving: {}s'.format(time.time() - start_time_save))

            logger.info('runtime epoch {0}: {1}s'.format(epoch, time.time() - start_time_epoch))
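
Example #4 pads the 4-dimensional policy output to the arm's 6-dimensional action space and rescales it with a scale_range helper that is not shown. Its usage, scale_range(x, -1, 1, low, high), suggests a plain element-wise linear map between intervals; a minimal sketch under that assumption:

# Sketch of the scale_range helper assumed by Example #4: element-wise linear
# rescaling of x from [old_low, old_high] into [new_low, new_high].
import numpy as np


def scale_range(x, old_low, old_high, new_low, new_high):
    x = np.asarray(x, dtype=np.float64)
    new_low = np.asarray(new_low, dtype=np.float64)
    new_high = np.asarray(new_high, dtype=np.float64)
    fraction = (x - old_low) / (old_high - old_low)
    return new_low + fraction * (new_high - new_low)


# e.g. scale_range([-1.0, 0.0, 1.0], -1, 1, 0.0, 2.0) -> array([0., 1., 2.])
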
Example #5
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          **kwargs):

    # print("kwargs:",kwargs)

    rank = MPI.COMM_WORLD.Get_rank()
    print("rank:", rank)
    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)

    with U.single_threaded_session() as sess:
        # Prepare everything.
        # --------------- AMEND: For saving and restoring the model. added by xlv ------------------
        if kwargs.get('restore') and kwargs.get('restore_path') is not None:
            restore_path = kwargs['restore_path']
            logger.info("Restoring from saved model")
            saver = tf.train.import_meta_graph(restore_path +
                                               "trained_model.meta")
            saver.restore(sess, tf.train.latest_checkpoint(restore_path))
        else:
            logger.info("Starting from scratch!")
            sess.run(tf.global_variables_initializer())
        # ----------------------------------------------------------------------------------------
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = eval_obs = env.reset()

        # if eval_env is not None:
        #     eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []

        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0

        # every 30 epochs plot statistics and save it.
        nb_epochs_unit = 30
        ddpg_rewards = []
        eval_ddpg_rewards = []

        ddpg_suc_percents = []
        eval_suc_percents = []

        # ---- AMEND: added by xlv to calculate success percent -----
        suc_num = 0
        episode_num = 0
        # -----------------------------------------------------------
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    # new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    new_obs, r, done, suc, info = env.step(max_action * action)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        # --- AMEND: added by xlv to calculate success percent ---
                        episode_num += 1
                        if suc:
                            suc_num += 1
                        # -------------------------------------------------------
                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                # eval_episode_rewards = []
                # eval_qs = []
                # if eval_env is not None:
                #     eval_episode_reward = 0.
                #     for t_rollout in range(nb_eval_steps):
                #         eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                #         eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                #         if render_eval:
                #             eval_env.render()
                #         eval_episode_reward += eval_r
                #
                #         eval_qs.append(eval_q)
                #         if eval_done:
                #             eval_obs = eval_env.reset()
                #             eval_episode_rewards.append(eval_episode_reward)
                #             eval_episode_rewards_history.append(eval_episode_reward)
                #             eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)

            # Evaluation statistics.
            # if eval_env is not None:
            #     combined_stats['eval/return'] = eval_episode_rewards
            #     combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            #     combined_stats['eval/Q'] = eval_qs
            #     combined_stats['eval/episodes'] = len(eval_episode_rewards)
            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

            # ------------------------------ plot statistics every nb_epochs_unit -----------------------------------
            ddpg_rewards.append(np.mean(episode_rewards_history))
            if (epoch + 1) % nb_epochs_unit == 0:
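                # suc_num / episode_num accumulate from the start of training, so this is
                # the overall success rate rather than a per-interval one.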
                ddpg_suc_percents.append(suc_num / episode_num)
                # ---------- Evaluate for 5 iters -----------------------
                nb_eval_epochs = 5
                nb_eval_epoch_cycles = 5
                eval_episode_num = 0
                eval_suc_num = 0

                eval_episode_reward = 0
                eval_episode_step = 0

                eval_epoch_episode_rewards = []
                eval_epoch_episode_steps = []
                # evaluation rolls out on the training env, so start it from a fresh reset
                eval_obs = env.reset()
                for i_epoch in range(nb_eval_epochs):
                    logger.log(
                        "********** Start Evaluation. Iteration %i ************"
                        % i_epoch)
                    for i_cycle in range(nb_eval_epoch_cycles):
                        for t_rollout in range(nb_rollout_steps):
                            eval_action, eval_q = agent.pi(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                            assert eval_action.shape == env.action_space.shape
                            eval_obs, eval_r, eval_done, eval_suc, eval_info = env.step(
                                max_action * eval_action
                            )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])

                            eval_episode_reward += eval_r
                            eval_episode_step += 1
                            if eval_done:
                                eval_obs = env.reset()
                                eval_epoch_episode_rewards.append(
                                    eval_episode_reward)
                                eval_episode_rewards_history.append(
                                    eval_episode_reward)
                                eval_epoch_episode_steps.append(
                                    eval_episode_step)
                                eval_episode_reward = 0
                                eval_episode_step = 0

                                eval_episode_num += 1
                                if eval_suc:
                                    eval_suc_num += 1
                    logger.record_tabular(
                        "Eval_EpRewMean",
                        np.mean(eval_episode_rewards_history))
                    logger.record_tabular("Eval_EpNumUntilNow",
                                          eval_episode_num)
                    logger.record_tabular("Eval_EpNumSuc", eval_suc_num)
                    logger.record_tabular("Eval_EpSucPercent",
                                          eval_suc_num / eval_episode_num)
                    logger.dump_tabular()
                    eval_ddpg_rewards.append(
                        np.mean(eval_episode_rewards_history))
                eval_suc_percents.append(eval_suc_num / eval_episode_num)
                # re-sync the training rollout with the env state after evaluation perturbed it
                agent.reset()
                obs = env.reset()
                # ----------------------------------------------------------------------------------------------
                # --------------------- plotting and saving -------------------------
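                # NOTE: `kwargs` (with MODEL_DIR / FIGURE_DIR / RESULT_DIR keys) is assumed
                # to be available from the enclosing scope; it is not defined in this snippet.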
                if saver is not None:
                    logger.info("saving the trained model")
                    start_time_save = time.time()
                    if epoch + 1 == nb_epochs:
                        saver.save(sess,
                                   kwargs['MODEL_DIR'] + "/trained_model")
                    else:
                        saver.save(
                            sess, kwargs['MODEL_DIR'] + "/iter_" + str(
                                (epoch + 1) // nb_epochs_unit))

                plot_performance(range(len(ddpg_rewards)),
                                 ddpg_rewards,
                                 ylabel=r'avg reward per DDPG learning step',
                                 xlabel='ddpg iteration',
                                 figfile=os.path.join(kwargs['FIGURE_DIR'],
                                                      'ddpg_reward'),
                                 title='TRAIN')
                plot_performance(
                    range(len(ddpg_suc_percents)),
                    ddpg_suc_percents,
                    ylabel=
                    r'overall success percentage per algorithm step under DDPG',
                    xlabel='algorithm iteration',
                    figfile=os.path.join(kwargs['FIGURE_DIR'],
                                         'success_percent'),
                    title="TRAIN")

                plot_performance(range(len(eval_ddpg_rewards)),
                                 eval_ddpg_rewards,
                                 ylabel=r'avg reward per DDPG eval step',
                                 xlabel='ddpg iteration',
                                 figfile=os.path.join(kwargs['FIGURE_DIR'],
                                                      'eval_ddpg_reward'),
                                 title='EVAL')
                plot_performance(
                    range(len(eval_suc_percents)),
                    eval_suc_percents,
                    ylabel=
                    r'overall eval success percentage per algorithm step under DDPG',
                    xlabel='algorithm iteration',
                    figfile=os.path.join(kwargs['FIGURE_DIR'],
                                         'eval_success_percent'),
                    title="EVAL")

                # save data which is accumulated UNTIL iter i
                with open(
                        kwargs['RESULT_DIR'] + '/ddpg_reward_' + 'iter_' + str(
                            (epoch + 1) // nb_epochs_unit) + '.pickle',
                        'wb') as f2:
                    pickle.dump(ddpg_rewards, f2)
                with open(
                        kwargs['RESULT_DIR'] + '/success_percent_' + 'iter_' +
                        str((epoch + 1) // nb_epochs_unit) + '.pickle',
                        'wb') as fs:
                    pickle.dump(ddpg_suc_percents, fs)

                # save evaluation data accumulated until iter i
                with open(
                        kwargs['RESULT_DIR'] + '/eval_ddpg_reward_' + 'iter_' +
                        str((epoch + 1) // nb_epochs_unit) + '.pickle',
                        'wb') as f_er:
                    pickle.dump(eval_ddpg_rewards, f_er)
                with open(
                        kwargs['RESULT_DIR'] + '/eval_success_percent_' +
                        'iter_' + str(
                            (epoch + 1) // nb_epochs_unit) + '.pickle',
                        'wb') as f_es:
                    pickle.dump(eval_suc_percents, f_es)
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          aux_apply,
          aux_tasks,
          tc_lambda,
          prop_lambda,
          caus_lambda,
          repeat_lambda,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    # Setup aux tasks' lambdas
    aux_lambdas = {
        'tc': tc_lambda,
        'prop': prop_lambda,
        'caus': caus_lambda,
        'repeat': repeat_lambda
    }
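    # The lambdas weight the auxiliary losses added to the DDPG objective
    # (presumably temporal coherence, proportionality, causality and repeatability).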

    # Create agent
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale,
                 aux_tasks=aux_tasks,
                 aux_lambdas=aux_lambdas)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.make_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            ep_rollout_times = []
            ep_train_times = []
            for cycle in range(nb_epoch_cycles):
                rollout_startt = time.time()
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    #print("action mean:{} -- Q: {}".format(np.mean(action), q))

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        agent.reset()
                        obs = env.reset()

                # for the first 5 cycles just gather data
                if epoch == 0 and cycle < 5:
                    continue
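                # (warm-up: the replay buffer is filled with experience before any
                # gradient updates are performed)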

                train_startt = time.time()
                ep_rollout_times.append(train_startt - rollout_startt)

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_aux_losses = {}
                epoch_aux_losses['grads/actor_grads'] = []
                epoch_aux_losses['grads/critic_grads'] = []
                epoch_aux_losses['grads/aux_grads'] = []
                for name in aux_tasks:
                    epoch_aux_losses['aux/' + name] = []
                epoch_adaptive_distances = []

                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al, auxl = agent.train()

                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    for name, value in auxl.items():
                        if 'grads' in name:
                            epoch_aux_losses['grads/' + name].append(
                                np.abs(value))
                        else:
                            epoch_aux_losses['aux/' + name].append(
                                np.abs(value))

                    agent.update_target_net()

                ep_train_times.append(time.time() - train_startt)

                if eval_env is not None:
                    # Evaluate.
                    eval_episode_rewards = []
                    eval_qs = []
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            print('rollout avg time (s): {}'.format(np.mean(ep_rollout_times)))
            print('train avg time (s): {}'.format(np.mean(ep_train_times)))
            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Auxiliary statistics.
            if aux_tasks is not None:
                for name, values in epoch_aux_losses.items():
                    combined_stats[name] = np.mean(values)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          use_vision=False):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    if use_vision:
        obs_shape = (100, 100, 3)  # TODO: make adjustable
    else:
        obs_shape = env.observation_space["observation"].shape
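    # NOTE: this DDPG variant is assumed to take separate state/observation shapes and
    # goal/goal-observation shapes in addition to the usual arguments.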
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space["observation"].shape,
                 obs_shape,
                 env.action_space.shape,
                 env.observation_space["desired_goal"].shape,
                 env.observation_space["desired_goal"].shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()

        obs_dict = env.reset()
        obs = obs_dict
        goal = obs_dict["desired_goal"]
        goalobs = obs_dict["desired_goal"]

        if eval_env is not None:
            eval_obs = eval_env.reset()

        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            print("starting epoch", epoch, "of", nb_epochs)
            for cycle in range(nb_epoch_cycles):
                print("   starting rollout", cycle, "of", nb_epoch_cycles)
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    print("      starting step", t_rollout, "of",
                          nb_rollout_steps)
                    # Predict next action.
                    if use_vision:
                        obs_train = obs["pixels"]
                        obs_train = cv2.resize(obs_train, (100, 100))
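                        # frames are downsampled to match the (100, 100, 3) obs_shape assumed above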
                    else:  #TODO: Make this take correct shape
                        obs_train = obs["observation"]
                    action, q = agent.pi(obs_train,
                                         goalobs,
                                         state=obs["observation"],
                                         apply_noise=True,
                                         compute_Q=True)

                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()

                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    if use_vision:
                        new_obs_train = new_obs["pixels"]
                        new_obs_train = cv2.resize(new_obs_train, (100, 100))
                    else:  #TODO: Make this take correct shape
                        new_obs_train = new_obs["observation"]

                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs_train, action, r, new_obs_train,
                                           done, obs["observation"],
                                           new_obs["observation"], goal,
                                           goalobs)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs_dict = env.reset()
                        obs = obs_dict
                        goal = obs_dict["desired_goal"]
                        goalobs = obs_dict["desired_goal"]

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    print("   >>> starting train step", t_train, "of",
                          nb_train_steps)
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_obs = eval_env.reset()
                    goalobs = eval_obs["desired_goal"]
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        print("   >>> starting eval step", t_rollout, "of",
                              nb_eval_steps)
                        if use_vision:
                            eval_obs_train = eval_obs["pixels"]
                            eval_obs_train = cv2.resize(
                                eval_obs_train, (100, 100))
                        else:  # TODO: Make this take correct shape
                            eval_obs_train = eval_obs["observation"]
                        eval_action, eval_q = agent.pi(eval_obs_train,
                                                       goalobs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            goalobs = eval_obs["desired_goal"]
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            # stats = agent.get_stats() # TODO: add bback in
            combined_stats = {}
            # for key in sorted(stats.keys()): # TODO: add back in
            #     combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = mpi_mean(
                np.mean(episode_rewards_history))
            combined_stats['rollout/episode_steps'] = mpi_mean(
                epoch_episode_steps)
            combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)

            # Train statistics.
            combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = mpi_mean(
                epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = mpi_mean(
                    np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q'] = mpi_mean(eval_qs)
                combined_stats['eval/episodes'] = mpi_mean(
                    len(eval_episode_rewards))

            # Total statistics.
            combined_stats['total/duration'] = mpi_mean(duration)
            combined_stats['total/steps_per_second'] = mpi_mean(
                float(t) / float(duration))
            combined_stats['total/episodes'] = mpi_mean(episodes)
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
def train_return(env,
                 param_noise,
                 actor,
                 critic,
                 memory,
                 nb_epochs=250,
                 nb_epoch_cycles=20,
                 reward_scale=1.,
                 render=False,
                 normalize_returns=False,
                 normalize_observations=True,
                 critic_l2_reg=1e-2,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 action_noise=None,
                 popart=False,
                 gamma=0.99,
                 clip_norm=None,
                 nb_train_steps=50,
                 nb_rollout_steps=2048,
                 batch_size=64,
                 tau=0.01,
                 param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)

    # Set up logging stuff only for a single worker.

    episode_rewards_history = deque(maxlen=100)
    #with U.single_threaded_session() as sess:
    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()
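    # NOTE: `sess` is assumed to be an existing TensorFlow session provided by the
    # caller; the session context manager above is intentionally commented out.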

    agent.reset()
    obs = env.reset()
    episode_reward = 0.
    episode_step = 0
    episodes = 0
    t = 0

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        print('epoch number:', epoch)
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                assert action.shape == env.action_space.shape

                # Execute next action.
                if rank == 0 and render:
                    env.render()
                assert max_action.shape == action.shape
                new_obs, r, done, info = env.step(
                    max_action * action
                )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                t += 1
                if rank == 0 and render:
                    env.render()
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs

                if done:
                    # Episode done.
                    epoch_episode_rewards.append(episode_reward)
                    episode_rewards_history.append(episode_reward)
                    epoch_episode_steps.append(episode_step)
                    episode_reward = 0.
                    episode_step = 0
                    epoch_episodes += 1
                    episodes += 1

                    agent.reset()
                    obs = env.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()
    return agent
class DDPGAgent(Policy):
    def __init__(self,
                 env,
                 agent_index,
                 sess,
                 action_range=(-1., 1.),
                 reward_scale=0.1,
                 critic_l2_reg=1e-2,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 popart=False,
                 gamma=0.975,
                 clip_norm=10,
                 batch_size=64,
                 memory_size=1e6,
                 tau=0.01,
                 normalize_returns=False,
                 normalize_observations=False,
                 noise_type="adaptive-param_0.1",
                 layer_norm=True,
                 nb_layers=2,
                 nb_neurons=64,
                 activation='tanh',
                 **network_kwargs):
        super(DDPGAgent, self).__init__(agent_index)
        # self.sess = sess
        self.nb_actions = env.action_space[agent_index].n
        print('agent action_space ' + str(env.action_space[agent_index].n))
        self.state_size = env.observation_space[agent_index].shape
        self.action_range = action_range

        with tf.variable_scope('ddpg_' + str(agent_index)):
            critic = Critic(name='critic_' + str(agent_index),
                            layer_norm=layer_norm,
                            nb_layers=nb_layers,
                            nb_neurons=nb_neurons)
            actor = Actor(self.nb_actions,
                          name='actor_' + str(agent_index),
                          layer_norm=layer_norm,
                          nb_neurons=nb_neurons,
                          activation=activation)
            memory = Memory(limit=int(memory_size),
                            action_shape=(self.nb_actions, ),
                            observation_shape=self.state_size)
            action_noise = None
            param_noise = None
            if noise_type is not None:
                for current_noise_type in noise_type.split(','):
                    current_noise_type = current_noise_type.strip()
                    if current_noise_type == 'none':
                        pass
                    elif 'adaptive-param' in current_noise_type:
                        _, stddev = current_noise_type.split('_')
                        param_noise = AdaptiveParamNoiseSpec(
                            initial_stddev=float(stddev),
                            desired_action_stddev=float(stddev))
                    elif 'normal' in current_noise_type:
                        _, stddev = current_noise_type.split('_')
                        action_noise = NormalActionNoise(
                            mu=np.zeros(self.nb_actions),
                            sigma=float(stddev) * np.ones(self.nb_actions))
                    elif 'ou' in current_noise_type:
                        _, stddev = current_noise_type.split('_')
                        action_noise = OrnsteinUhlenbeckActionNoise(
                            mu=np.zeros(self.nb_actions),
                            sigma=float(stddev) * np.ones(self.nb_actions),
                            dt=env.world.dt,
                            theta=0.1)
                    else:
                        raise RuntimeError('unknown noise type "{}"'.format(
                            current_noise_type))

            self.agent = DDPG(actor,
                              critic,
                              memory,
                              self.state_size, (self.nb_actions, ),
                              action_range=self.action_range,
                              gamma=gamma,
                              tau=tau,
                              normalize_returns=normalize_returns,
                              normalize_observations=normalize_observations,
                              batch_size=batch_size,
                              action_noise=action_noise,
                              param_noise=param_noise,
                              critic_l2_reg=critic_l2_reg,
                              actor_lr=actor_lr,
                              critic_lr=critic_lr,
                              enable_popart=popart,
                              clip_norm=clip_norm,
                              reward_scale=reward_scale)

            logger.info('Using agent with the following configuration:')
            logger.info(str(self.agent.__dict__.items()))

        self.agent.initialize(sess)
        self.agent.reset()

    def action(self, obs, apply_noise=False, compute_Q=False):
        if compute_Q:
            return self.agent.pi(obs,
                                 apply_noise=apply_noise,
                                 compute_Q=compute_Q)
        else:
            return self.agent.pi(obs,
                                 apply_noise=apply_noise,
                                 compute_Q=compute_Q)[0]

    def reset(self):
        return self.agent.reset()
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
    normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
    popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
    tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()
    #print(np.abs(env.action_space.low))
    #print(np.abs(env.action_space.high))
    #assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high

    logger.info('scaling actions by {} before executing in env'.format(max_action))
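    # NOTE: `load_memory`, `restore`, `nproc` and `make_env` are assumed to be
    # module-level flags/helpers defined outside this snippet.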
    if load_memory:
        memory=pickle.load(open("/home/vaisakhs_shaj/Desktop/BIG-DATA/memory1000000.pickle","rb"))
        '''
        samps = memoryPrev.sample(batch_size=memoryPrev.nb_entries)
        print(len(samps['obs0'][1]))
        for i in range(memoryPrev.nb_entries):
            memory.append(samps['obs0'][i], samps['actions'][i], samps['rewards'][i], samps['obs1'][i],  samps['terminals1'][i])
        '''
        print("=============memory loaded================")

    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))
    envs = [make_env(seed) for seed in range(nproc)]
    envs = SubprocVecEnv(envs)
    
    '''
     # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None
    '''
    saver=tf.train.Saver()
    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=10)

    with U.make_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        if restore:
            filename="/home/vaisakhs_shaj/Desktop/MODEL/tfSteps"+str(15000)+".model"
            saver.restore(sess,filename)
            print("loaded!!!!!!!!!!!!!")
            #p=[v.name for v in tf.all_variables()]
            #print(p)
        
        obs = envs.reset()

        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_reward3 = 0.
        episode_step = 0
        episode_step3 = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = deque(maxlen=10)
        epoch_episode_steps3 = deque(maxlen=10)
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        learning_starts = 10000
        for epoch in range(nb_epochs):
            print("cycle-memory")
            print(max_action)
            for cycle in range(nb_epoch_cycles):
                print(cycle,"-",memory.nb_entries,end=" ")
                sys.stdout.flush()
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action = np.stack([agent.pi(obs[i], apply_noise=True, compute_Q=False)[0] for i in range(nproc)])
                    q = np.stack([agent.pi(obs[i], apply_noise=True, compute_Q=True)[1] for i in range(nproc)])
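                    # NOTE: the policy is queried twice per step (once for the action,
                    # once for Q); with action noise the two calls sample noise
                    # independently, so the logged Q may not match the executed action.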
                    # action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    #assert action.shape == env.action_space.shape
                    #print(i)
                    # Execute next action in parallel.
                    if rank == 0 and render:
                        env.render()
                    #assert max_action.shape == action.shape
                    new_obs, r, done, info = envs.step(action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    #print(r)
                    #print(r[1])
                    sys.stdout.flush()
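                    # only the return of parallel worker 1 is tracked for episode logging
                    # (see the done[i] bookkeeping below)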
                    episode_reward += r[1]
                    #episode_reward3 += r[2]
                    episode_step += 1
                    #episode_step3 += 1
                    '''
                    if episode_step==300:
                        e=episode_step
                        re=episode_reward
                    if episode_step>300:
                        episode_step=e
                        episode_reward=re
                    '''
                    #print(episode_step)

                    book_keeping_obs=obs
                    obs = new_obs
                    #print(envs[1])
                    #print(episode_reward)
                    # Book-keeping in parallel.
                    epoch_actions.append(np.mean(action))
                    epoch_qs.append(np.mean(q))
                    for i in range(nproc):
                        agent.store_transition(book_keeping_obs[i], action[i], r[i], new_obs[i], done[i])
                        #print(done)
                        if done[i]:
                            # Episode done.
                            #print("====done====",episode_reward)
                            if i==1:
                                
                                epoch_episode_rewards.append(episode_reward)
                                #rint(epoch_episode_rewards)
                                #episode_rewards_history.append(episode_reward)
                                epoch_episode_steps.append(episode_step)
                                episode_reward = 0.
                                #episode_reward3 = 0
                                episode_step = 0
                                epoch_episodes += 1
                                episodes += 1
                            '''
                            if i==2:
                                
                                #epoch_episode_rewards.append(episode_reward3)
                                #rint(epoch_episode_rewards)
                                episode_rewards_history.append(episode_reward3)
                                epoch_episode_steps3.append(episode_step3)
                                episode_reward3 = 0
                                episode_step3 = 0
                            '''    

                            agent.reset()
                            temp = envs.reset()
                            obs[i] = temp[i]

                    # NOTE (from the original note): TensorFlow variables only have values
                    # inside a session; once the session is closed they are lost, so
                    # saver.save and saver.restore have to be called inside the session.

                    # Train.
                    epoch_actor_losses = []
                    epoch_critic_losses = []
                    epoch_adaptive_distances = []
                    for t_train in range(nb_train_steps):
                        # Adapt param noise, if necessary.
                        if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                            distance = agent.adapt_param_noise()
                            epoch_adaptive_distances.append(distance)

                        cl, al = agent.train()
                        epoch_critic_losses.append(cl)
                        epoch_actor_losses.append(al)
                        agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.
                #print(episode_rewards_history) 
            if (t)%20000 == 0:
                fname="/home/vaisakhs_shaj/Desktop/BIG-DATA/memoryStill"+str(memory.nb_entries)+".pickle"
                pickle.dump(memory,open(fname,"wb"),protocol=-1)
            if t % 5000 == 0:
                print("=======saving interim model==========")
                filename="/home/vaisakhs_shaj/Desktop/MODEL/tfSteps"+str(t)+".model"
                saver.save(sess,filename)
            mpi_size = MPI.COMM_WORLD.Get_size()
            
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps2'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/episode_steps3'] = np.mean(epoch_episode_steps3)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            
            if eval_env is not None:
                combined_stats['eval/return'] = np.mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = np.mean(eval_qs)
                combined_stats['eval/episodes'] = len(eval_episode_rewards)
               

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s'%x)
            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            print(logdir)
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
def train(env_id, env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
    normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
    popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
    tau=0.01, eval_env=None, param_noise_adaption_interval=50, my_render=True, eg_explore=True, reward_param_scaling=1.0,
    reward_param_thr=70, reward_param_type='const'):

    print('Start training for env: '+env_id)
    #change to your dir of choice for saving
    save_path = os.getcwd()
    print('Save data at '+save_path+'. Change to your desired path.')

    dump_name = 'sav_ddpg_'+env_id+'.reward_'+reward_param_type+'_'+str(reward_param_scaling)+'.pkl'
    append_num = 0
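    # if a dump with this name already exists, append an increasing index so earlier runs are kept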
    while os.path.exists(os.path.join(save_path,dump_name)):
        dump_name = 'sav_ddpg_'+env_id+'.reward_'+reward_param_type+'_'+str(reward_param_scaling)+'.'+str(append_num)+'.pkl'
        append_num+=1

    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_com_sav = []

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            # collect data for saving plot
            save_data = {'act': [],
                         'obs': [],
                         'qpos':[],
                         'rew':[], # reward for this episode
                         'freq_com':[], # communication frequency
                         'act_ts': [],
                         'obs_ts': [],
                         'qpos_ts': [],
                         'rew_ts': [],  # reward for this episode
                         'freq_com_ts': [],  # communication frequency
                         'comm_r_factor':reward_param_scaling,
                         'eplen_ts':[] # len of test episodes
                         }

            # decay the exploration
            e_greed = 0.5 - 0.1 * np.log10( (t%10000) + 1)
            explore_switch = (t < 20000 and eg_explore and e_greed > 0)
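            # epsilon starts at 0.5 and decays logarithmically within each 10k-step window;
            # epsilon-greedy exploration of the communicate/hold switch is disabled after 20k total steps.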
            print('total steps: '+str(t)+', eps greedy rate: '+str(e_greed)+', explore is '+str(explore_switch))

            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.

                # init u_old, don't forget to change test also
                u_old = 1.0 * env.action_space.sample() / max_action
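                # u_old keeps the previously applied action in normalized [-1, 1] units;
                # it is replayed whenever the agent chooses not to communicate a new command.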

                num_no_com = 0
                for t_rollout in range(nb_rollout_steps):

                    # Predict next action.
                    # edit this to be param version
                    a_raw, q = agent.pi(np.concatenate([obs,u_old],axis=0), apply_noise=True, compute_Q=True)
                    a0 = a_raw[0]
                    a1 = a_raw[1]
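                    # the actor output is augmented: a_raw[0] vs a_raw[1] acts as a
                    # communicate/hold switch, while a_raw[2:] carries the actual motor command.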

                    # eps greedy, flip the coin
                    # make eps decay first 10k updates
                    dice_greed = np.random.uniform()
                    if explore_switch and dice_greed < e_greed:
                        com = ( np.random.uniform() > 0.5 )
                    else:
                        com = (a0 > a1)

                    # action according to com switch
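                    # r_com rewards skipping communication: when com is False the previous command
                    # u_old is reused and a bonus (shaped by reward_param_type and scaled by
                    # reward_param_scaling) is later added to the environment reward.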
                    if com:
                        r_com = 0.0
                        action = np.copy(a_raw[2:]) #motor cmd
                    else:
                        if reward_param_type=='const':
                            r_com = 1. # const reward
                        elif reward_param_type=='linear':
                            r_com = (1.0 / (nb_rollout_steps - reward_param_thr)) * (nb_rollout_steps - num_no_com) # linear interp reward
                        elif reward_param_type=='inv':
                            r_com = 1.0 / (1.0 + (np.maximum(num_no_com - reward_param_thr, 0)))  # inv decay reward
                        else:
                            raise ValueError('no such reward type: ' + reward_param_type)

                        r_com = reward_param_scaling * r_com
                        action = np.copy(u_old)
                        num_no_com += 1

                    assert action.shape == env.action_space.shape

                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        pass
                        # env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(a_raw)
                    epoch_qs.append(q)
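                    # note: the observation fed to DDPG is the env observation concatenated with the
                    # last applied action, so the stored transition below is
                    # ([obs, u_old], a_raw, r + r_com, [new_obs, action]).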

                    agent.store_transition(np.concatenate([obs,u_old],axis=0), a_raw, r+r_com, np.concatenate([np.squeeze(new_obs), action],axis=0) , done)
                    obs = np.squeeze(new_obs)

                    save_data['act'].append(np.array(action))
                    save_data['obs'].append(np.array(obs))
                    if hasattr(env.unwrapped, 'data'):
                        save_data['qpos'].append(np.array(env.unwrapped.data.qpos))

                    u_old = np.copy(action)

                    if done:
                        # Episode done.
                        epoch_com_sav.append(np.asarray(1.0*num_no_com/episode_step))

                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()


                print('communication savings: ' + str(num_no_com)) # check com number
                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()

            # log stuff
            save_data['rew'].append(np.mean(epoch_episode_rewards))
            save_data['freq_com'].append(np.mean(epoch_com_sav))

            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = np.mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = np.mean(eval_qs)
                combined_stats['eval/episodes'] = len(eval_episode_rewards)
            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s'%x)
            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

        ###===============================================
        # test the fully-trained agent
        env = env.unwrapped

        print('*Final testing*')
        n_test = 1
        n_ts_rollout = 500
        # obs = env.env.reset()
        for i_test in range(n_test):
            if i_test%50==0:
                print('test iteration: '+str(i_test))
            obs = env.reset()
            # take some actions
            # start with small during test time
            u_old = 0 * env.action_space.sample() / max_action

            num_no_com = 0

            ts_step = 0
            ts_reward = 0
            for i_test_rollout in range(n_ts_rollout):
                # Predict next action.
                # edit this to be param version
                a_raw, q = agent.pi(np.concatenate([obs,u_old],axis=0), apply_noise=False, compute_Q=True)
                a0 = a_raw[0]
                a1 = a_raw[1]

                com = (a0 > a1)
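                # at test time the communicate/hold decision is taken greedily from the actor
                # output (no action noise and no epsilon-greedy coin flip).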

                # action according to com switch
                if com:
                    action = np.copy(a_raw[2:])
                else:
                    action = np.copy(u_old)
                    num_no_com += 1

                assert action.shape == env.action_space.shape

                new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                # print('Done: '+str(done))
                ts_reward += r
                ts_step += 1


                # record trajectory
                # save_data['rew'].append(np.array(r)) # need to change here, what's a good performance measure?
                save_data['act_ts'].append(max_action * action) # record the actual u
                save_data['obs_ts'].append(np.array(obs))


                u_old = np.copy(action)
                obs = np.copy(new_obs) # update obs

            # # store episode rew as performance measure
            # save_data['eplen_ts'].append(np.array(i_test_rollout+1))
            # save_data['rew_ts'].append(np.array(ts_reward))
            # save_data['freq_com_ts'].append(np.array(1.0*num_no_com/(i_test_rollout+1)))

            agent.reset() # doesn't matter if not stochastic

        # plot the trajectory
        ### states
        xs = np.asarray(save_data['obs_ts'])
        ths = np.arctan2(xs[:, 1], xs[:, 0])
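        # note: this assumes a Pendulum-style observation [cos(th), sin(th), th_dot], so
        # arctan2(sin, cos) recovers the pole angle and xs[:, 2] is the angular velocity.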

        ### control
        us = np.asarray(save_data['act_ts'])

        id_seg = 0

        horz_plt = 500
        plt.figure(figsize=[15, 20])
        plt.subplot(211)
        plt.plot(ths[id_seg * horz_plt:(id_seg + 1) * horz_plt], label='th')
        plt.plot(xs[:, 2][id_seg * horz_plt:(id_seg + 1) * horz_plt], color='g', label='th_dot')
        plt.legend()
        plt.title('state plot')

        plt.subplot(212)
        plt.plot(us[id_seg * horz_plt:(id_seg + 1) * horz_plt], color='r')
        plt.title('control plot')

        plt.show()
Example #12
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          perform=False,
          expert=None,
          save_networks=False):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale,
                 expert=expert,
                 save_networks=save_networks)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        network_saving_dir = os.path.join('./saved_networks',
                                          env.env.spec.id) + '/'
        if not os.path.exists(network_saving_dir):
            os.makedirs(network_saving_dir)
        agent.initialize(sess, saver, network_saving_dir, 10000, 30000)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        small_buffer = []
        big_buffer = []
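        # when perform=True, rollouts and training are skipped; evaluation transitions are collected
        # into small_buffer (per episode) and big_buffer (all episodes) and dumped as expert data
        # once 1000 episodes have been gathered.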
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                if not perform:
                    # Perform rollouts.
                    for t_rollout in range(nb_rollout_steps):
                        # Predict next action.
                        action, q = agent.pi(obs,
                                             apply_noise=True,
                                             compute_Q=True)
                        assert action.shape == env.action_space.shape

                        # Execute next action.
                        if rank == 0 and render:
                            env.render()
                        assert max_action.shape == action.shape
                        new_obs, r, done, info = env.step(
                            max_action * action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        t += 1
                        if rank == 0 and render:
                            env.render()
                        episode_reward += r
                        episode_step += 1

                        # Book-keeping.
                        epoch_actions.append(action)
                        epoch_qs.append(q)
                        agent.store_transition(obs, action, r, new_obs, done)
                        obs = new_obs

                        if done:
                            # Episode done.
                            epoch_episode_rewards.append(episode_reward)
                            episode_rewards_history.append(episode_reward)
                            epoch_episode_steps.append(episode_step)
                            episode_reward = 0.
                            episode_step = 0
                            epoch_episodes += 1
                            episodes += 1

                            agent.reset()
                            obs = env.reset()

                    # Train.
                    epoch_actor_losses = []
                    epoch_critic_losses = []
                    epoch_adaptive_distances = []
                    for t_train in range(nb_train_steps):
                        # Adapt param noise, if necessary.
                        if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                            distance = agent.adapt_param_noise()
                            epoch_adaptive_distances.append(distance)

                        cl, al = agent.train()
                        epoch_critic_losses.append(cl)
                        epoch_actor_losses.append(al)
                        agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        old_eval_obs = eval_obs
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])

                        if perform:
                            small_buffer.append([
                                old_eval_obs, eval_action, eval_r, eval_obs,
                                eval_done
                            ])

                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

                            if perform and len(small_buffer) > 0:
                                big_buffer.append(small_buffer)
                                small_buffer = []
                                if len(big_buffer
                                       ) > 0 and len(big_buffer) % 1000 == 0:
                                    expert_dir = os.path.join(
                                        './expert', env.env.spec.id) + '/'
                                    if not os.path.exists(expert_dir):
                                        os.makedirs(expert_dir)
                                    pwritefile = open(
                                        os.path.join(expert_dir, 'expert.pkl'),
                                        'wb')
                                    pickle.dump(big_buffer, pwritefile, -1)
                                    pwritefile.close()
                                    logger.info('Expert data saved!')
                                    return

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            combined_stats = {}
            if not perform:
                stats = agent.get_stats()
                for key in sorted(stats.keys()):
                    combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            if not perform:
                combined_stats['rollout/return'] = mpi_mean(
                    epoch_episode_rewards)
                combined_stats['rollout/return_history'] = mpi_mean(
                    np.mean(episode_rewards_history))
                combined_stats['rollout/episode_steps'] = mpi_mean(
                    epoch_episode_steps)
                combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
                combined_stats['rollout/actions_mean'] = mpi_mean(
                    epoch_actions)
                combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
                combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)

                # Train statistics.
                combined_stats['train/loss_actor'] = mpi_mean(
                    epoch_actor_losses)
                combined_stats['train/loss_critic'] = mpi_mean(
                    epoch_critic_losses)
                combined_stats['train/param_noise_distance'] = mpi_mean(
                    epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = mpi_mean(
                    np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q'] = mpi_mean(eval_qs)
                combined_stats['eval/episodes'] = mpi_mean(
                    len(eval_episode_rewards))
            if not perform:
                # Total statistics.
                combined_stats['total/duration'] = mpi_mean(duration)
                combined_stats['total/steps_per_second'] = mpi_mean(
                    float(t) / float(duration))
                combined_stats['total/episodes'] = mpi_mean(episodes)
                combined_stats['total/epochs'] = epoch + 1
                combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
Example #13
class DDPGEnvLearner(EnvLearner):
    def __init__(self, env_in):
        EnvLearner.__init__(self, env_in)
        # from baselines.ddpg.models import Actor, Critic
        # Parse noise_type
        action_noise = None
        param_noise = None
        noise_type = 'adaptive-param_0.2'
        layer_norm = True
        nb_actions = self.state_dim
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

        # Configure components.

        self.buff_len = 10
        self.buffer = deque(self.buff_init * self.buff_len,
                            maxlen=self.buff_len)
        obs_space = (self.buff_init[0].size * self.buff_len, )
        self.memory = Memory(limit=int(1e6),
                             action_shape=env_in.observation_space.shape,
                             observation_shape=obs_space)
        self.critic = models.Critic(layer_norm=layer_norm)
        self.actor = models.Actor(nb_actions, layer_norm=layer_norm)

        self.agent = DDPG(self.actor,
                          self.critic,
                          self.memory,
                          obs_space,
                          env_in.observation_space.shape,
                          gamma=0.99,
                          tau=0.01,
                          normalize_returns=False,
                          normalize_observations=True,
                          batch_size=64,
                          action_noise=action_noise,
                          param_noise=param_noise,
                          critic_l2_reg=1e-2,
                          actor_lr=1e-5,
                          critic_lr=1e-5,
                          enable_popart=False,
                          clip_norm=None,
                          reward_scale=1.)
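        # note: this DDPG agent serves as a learned environment model rather than a controller:
        # its "action" has the shape of the environment observation, and train() rewards it for
        # predicting the next state (negative distance to the true successor state).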

    def initialize(self, session, load=False):
        self.sess = session
        if not load:
            self.sess.run(tf.global_variables_initializer())
        self.agent.initialize(self.sess)

    def train(self,
              train,
              total_steps,
              valid=None,
              log_interval=10,
              early_stopping=-1,
              saver=None,
              save_str=None):
        G, yS, yR, yD, X, S, A = self.__prep_data__(train, batch_size=0)
        X = X[0]
        S = S[0]
        self.agent.reset()
        # max_action = self.env.action_space.high
        batch_size = 64
        t = 0
        episode_reward = 0
        episode_step = 0
        episodes = 0
        epoch_episodes = 0
        epoch_episode_rewards = []
        nb_epoch_cycles = 10
        nb_rollout_steps = 100
        nb_epochs = int(len(train) / (nb_epoch_cycles * nb_rollout_steps))
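        # nb_epochs is sized so the rollout loops make roughly one pass over the offline data
        # (the index i advances by one per rollout step).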

        nb_train_steps = total_steps
        param_noise_adaption_interval = 50
        i = 0

        for epoch in range(nb_epochs):
            start_time = time.time()
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.

                    # (obs_in, action_in, _, new_obs_in, done, episode_step) = train[i]

                    # obs = np.array([np.concatenate([obs_in/self.state_mul_const,
                    #                                 action_in/self.act_mul_const])]).flatten()
                    obs = X[i]
                    done = train[i][4]
                    action, q = self.agent.pi(obs,
                                              apply_noise=True,
                                              compute_Q=True)
                    r = -np.linalg.norm(S[i] / self.state_mul_const -
                                        action) / action.shape[0]

                    # if not done and i < len(train):
                    #     new_obs = np.array([np.concatenate([new_obs_in / self.state_mul_const,
                    #                                     train[i][1] / self.act_mul_const])]).flatten()
                    # else:
                    #     new_obs = np.array([np.concatenate([new_obs_in / self.state_mul_const,
                    #                                     np.zeros_like(action_in)])]).flatten()
                    if i < len(train):
                        new_obs = X[i + 1]
                    else:
                        new_obs = np.zeros_like(X[i])
                    t += 1
                    i += 1
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    self.agent.store_transition(obs, action, r, new_obs, done)

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_reward = 0.
                        epoch_episodes += 1
                        episodes += 1

                        self.agent.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if self.memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = self.agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = self.agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    self.agent.update_target_net()
            print('Epoch ' + str(epoch) + '/' + str(nb_epochs) +
                  ' with avg rew of: ' +
                  str(sum(epoch_episode_rewards) /
                      len(epoch_episode_rewards)) + ' in ' +
                  str(time.time() - start_time) + 's')
            if epoch % log_interval == 0 and epoch > 0:
                if saver is not None and save_str is not None:
                    save_path = saver.save(self.sess,
                                           'models/' + str(save_str) + '.ckpt')
                    print("Model saved in path: %s" % save_path)
        if saver is not None and save_str is not None:
            save_path = saver.save(self.sess,
                                   'models/' + str(save_str) + '.ckpt')
            print("Model saved in path: %s" % save_path)

    def step(self, obs_in, action_in, episode_step, save=True, buff=None):
        import copy
        obs = obs_in / self.state_mul_const
        action = action_in / self.act_mul_const
        if save:
            if episode_step == 0:
                self.buffer = deque(self.buff_init * self.buff_len,
                                    maxlen=self.buff_len)
            self.buffer.append(
                np.array([np.concatenate([obs, action])]).flatten())
        else:
            if buff is None:
                buff = copy.copy(self.buffer)
            if episode_step == 0:
                buff = deque(self.buff_init * self.buff_len,
                             maxlen=self.buff_len)
            buff.append(np.array([np.concatenate([obs, action])]).flatten())

        if buff is not None:
            x = np.array([np.concatenate(buff).flatten()])[0]
        else:
            x = np.array([np.concatenate(self.buffer).flatten()])[0]
        new_obs, _ = self.agent.pi(x, apply_noise=True, compute_Q=True)
        return new_obs
Example #14
def train(env_id, env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
    normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
    popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
    tau=0.01, eval_env=None, param_noise_adaption_interval=50, my_render=True, eg_explore=True, reward_param_scaling=1.0,
    reward_param_thr=70, reward_param_type='const'):
    
    #save data
    ####################################
    full_path = txt_path + '_etc_RL.txt'
    file = open(full_path,'w')
    print('Start training for env: '+env_id)
    #change to your dir of choice for saving
    save_path = os.getcwd() 
    print('Save data at '+save_path+'. Change to your desired path.')

    
    dump_name = 'sav_ddpg_'+env_id+'.reward_'+reward_param_type+'_'+str(reward_param_scaling)+'.pkl'
    append_num = 0
    while os.path.exists(os.path.join(save_path,dump_name)):
        dump_name = 'sav_ddpg_'+env_id+'.reward_'+reward_param_type+'_'+str(reward_param_scaling)+'.'+str(append_num)+'.pkl'
        append_num+=1

    rank = MPI.COMM_WORLD.Get_rank()
    print('second rank is ',rank)
    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver(max_to_keep = 1)
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0
        
        
        epoch = 0
        start_time = time.time()

        epoch_com_sav = []

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        count_fwj = 0 # used to count how many step() is played
        for epoch in range(nb_epochs):
            print(nb_epochs)
            # collect data for saving plot
            save_data = {'act': [],
                         'obs': [],
                         'qpos':[],
                         'rew':[], # reward for this episode
                         'freq_com':[], # communication frequency
                         'act_ts': [],
                         'obs_ts': [],
                         'qpos_ts': [],
                         'rew_ts': [],  # reward for this episode
                         'freq_com_ts': [],  # communication frequency
                         'comm_r_factor':reward_param_scaling,
                         'eplen_ts':[] # len of test episodes
                         }

            # decay the exploration
            e_greed = 0.5 - 0.1 * np.log10( (t%10000) + 1)
            explore_switch = (t < 20000 and eg_explore and e_greed > 0)
            print('total steps: '+str(t)+', eps greedy rate: '+str(e_greed)+', explore is '+str(explore_switch))
            
            for cycle in range(nb_epoch_cycles):
                
                # Perform rollouts.

                # init u_old, don't forget to change test also
                u_old = 1.0 * env.action_space.sample() / max_action

                num_no_com = 0
                for t_rollout in range(nb_rollout_steps):

                    count_fwj+=1
                    print('env steps played: ' + str(count_fwj))
                    # Predict next action.
                    # edit this to be param version
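                    # pad the observation to length 6 by appending obs[2]-obs[0] and obs[3]-obs[1]
                    # (presumably relative-position features) before concatenating with u_old.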
                    if len(obs) != 6:
                        obs.append(obs[2]-obs[0])
                        obs.append(obs[3]-obs[1])
                        a_raw, q = agent.pi(np.concatenate([obs,u_old],axis=0), apply_noise=False, compute_Q=True)

                    else: 
                        #a_1 = np.zeros()
                        a_raw, q = agent.pi(np.concatenate([obs,u_old],axis=0), apply_noise=False, compute_Q=True)
                        print('value of q is',q)
                        if count_fwj % 50000 == 0:
                            saver.save(sess,'./home/test_3_ar.ckpt',global_step =1)
                    a0 = a_raw[0]
                    a1 = a_raw[1]

                    # eps greedy, flip the coin
                    # make eps decay first 10k updates
                    dice_greed = np.random.uniform()
                    if explore_switch and dice_greed < e_greed:
                        com = ( np.random.uniform() > 0.5 )
                    else:
                        com = (a0 > a1)

                    # action according to com switch
                    if com:
                        r_com = 0.0
                        action = np.copy(a_raw[2:]) #No communication
                        num_no_com += 1             #No communication
                    else:
                        if reward_param_type=='const':
                            r_com = 1. # const reward
                        elif reward_param_type=='linear':
                            r_com = (1.0 / (nb_rollout_steps - reward_param_thr)) * (nb_rollout_steps - num_no_com) # linear interp reward
                        elif reward_param_type=='inv':
                            r_com = 1.0 / (1.0 + (np.maximum(num_no_com - reward_param_thr, 0)))  # inv decay reward
                        else:
                            raise ValueError('no such reward type: ' + reward_param_type)

                        r_com = reward_param_scaling * r_com
                        #action = np.copy(u_old)
                        action = np.copy(a_raw[2:])
                        num_no_com += 1

                    assert action.shape == env.action_space.shape

                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(action) 

                    print(done)

                    file.write(str(new_obs)+',q_value_is,'+str(q)+',step_reward,'+str(r)+',action used,' + str(max_action*action)+'\n')


                    t += 1
                    if rank == 0 and render:
                        pass

                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(a_raw)
                    epoch_qs.append(q)

                    agent.store_transition(np.concatenate([obs,u_old],axis=0), a_raw, r+r_com, np.concatenate([np.squeeze(new_obs), action],axis=0) , done)
                    obs = np.squeeze(new_obs)

                    save_data['act'].append(np.array(action))
                    save_data['obs'].append(np.array(obs))
                    if hasattr(env.unwrapped, 'data'):
                        save_data['qpos'].append(np.array(env.unwrapped.data.qpos))

                    u_old = np.copy(action)

                    if done:


                        # Episode done.
                        epoch_com_sav.append(np.asarray(1.0*num_no_com/episode_step))

                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        print('one game finished, count is =================================',env.count)
                        file.write('count is,'+str(env.count))
                        file.write('done is,' + str(done))
                        file.write('long term reward is,' + str(env.long_term_reward))
                        file.write('#'*12+'one game finished\n')

                        agent.reset()
                        obs = env.reset()

                #end of loop nb_rollout
                print('communication savings: ' + str(num_no_com)) # check com number
                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()

            # log stuff
            save_data['rew'].append(np.mean(epoch_episode_rewards))
            save_data['freq_com'].append(np.mean(epoch_com_sav))

            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = np.mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = np.mean(eval_qs)
                combined_stats['eval/episodes'] = len(eval_episode_rewards)
            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s'%x)
            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

        ###===============================================
        # test the fully-trained agent
        env = env.unwrapped
Example #15
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          overwrite_memory,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          logdir,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          agentName=None,
          resume=0):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)

    logF = open(os.path.join(logdir, 'log.txt'), 'a')
    logStats = open(os.path.join(logdir, 'log_stats.txt'), 'a')

    with U.single_threaded_session() as sess:
        # Prepare everything.
        if (resume == 0):
            agent.initialize(sess)
        else:
            #restore = "{}-{}".format(agentName,resume)
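            # this variant's agent.initialize() appears to accept restore arguments so training
            # can resume from the checkpoint saved under logdir; overwrite_memory presumably
            # controls whether the saved replay memory is overwritten on resume.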
            agent.initialize(sess,
                             path=os.path.abspath(logdir),
                             restore=agentName,
                             itr=resume,
                             overwrite=overwrite_memory)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        print("Epoch " + str(epoch) + " episodes " +
                              str(episodes) + " steps " + str(episode_step) +
                              " reward " + str(episode_reward))
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            print("Eval reward " + str(eval_episode_reward))
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = mpi_mean(
                np.mean(episode_rewards_history))
            combined_stats['rollout/episode_steps'] = mpi_mean(
                epoch_episode_steps)
            combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)

            # Train statistics.
            combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = mpi_mean(
                epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = mpi_mean(
                    np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q'] = mpi_mean(eval_qs)
                combined_stats['eval/episodes'] = mpi_mean(
                    len(eval_episode_rewards))

            # Total statistics.
            combined_stats['total/duration'] = mpi_mean(duration)
            combined_stats['total/steps_per_second'] = mpi_mean(
                float(t) / float(duration))
            combined_stats['total/episodes'] = mpi_mean(episodes)
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            #    logdir = logger.get_dir()
            if rank == 0:
                logF.write(str(combined_stats["rollout/return"]) + "\n")
                json.dump(combined_stats, logStats)
                logF.flush()
                logStats.flush()
                #    agent.save(path = os.path.abspath(logdir), name = agentName, overwrite = overwrite_memory)
                agent.save(path=logdir,
                           name=agentName,
                           overwrite=overwrite_memory)
                logger.info("agent {} saved".format(agent.itr.eval()))
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
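The stats block above reduces each quantity across MPI workers with mpi_mean, mpi_std and mpi_sum helpers that the snippet does not define. A minimal sketch of what such helpers compute, assuming mpi4py (the helpers in the original codebase may differ in naming and edge-case handling):

import numpy as np
from mpi4py import MPI

def _flatten(values):
    # Accept a scalar, list, or array; always return a non-empty 1-D float array.
    xs = np.asarray(values, dtype='float64').ravel()
    return xs if xs.size > 0 else np.array([0.0])

def mpi_sum(values):
    # Sum of the local values, summed again across all MPI ranks.
    return MPI.COMM_WORLD.allreduce(float(_flatten(values).sum()), op=MPI.SUM)

def mpi_mean(values):
    # Mean over the concatenation of all ranks' samples.
    xs = _flatten(values)
    total = MPI.COMM_WORLD.allreduce(float(xs.sum()), op=MPI.SUM)
    count = MPI.COMM_WORLD.allreduce(int(xs.size), op=MPI.SUM)
    return total / count

def mpi_std(values):
    # Pooled standard deviation via E[x^2] - E[x]^2 over all ranks' samples.
    xs = _flatten(values)
    count = MPI.COMM_WORLD.allreduce(int(xs.size), op=MPI.SUM)
    s = MPI.COMM_WORLD.allreduce(float(xs.sum()), op=MPI.SUM)
    sq = MPI.COMM_WORLD.allreduce(float((xs ** 2).sum()), op=MPI.SUM)
    mean = s / count
    return float(np.sqrt(max(sq / count - mean ** 2, 0.0)))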
Example #16
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          callback=None,
          pretrained='none'):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Copy an env for evaluation
    env_eval = copy.deepcopy(env.env)

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        # load pretrained agent if possible
        if pretrained == 'none':
            logger.info('Training from scratch...')
        else:
            logger.info('Loading pretrained model from {}'.format(pretrained))
            #assert os.path.exists(pretrained)
            saver.restore(sess, pretrained)

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        total_time = 0
        start_time = time.time()

        total_time_record = []
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        #epochxposdict = []
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1
                    total_time += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        total_time_record.append(total_time)
                        #epochxposdict.append(info['pos'][0])
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    # eval for one episode
                    eval_episode_reward = 0.0
                    eval_done = False
                    eval_obs = eval_env.reset()
                    while not eval_done:
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action)
                        eval_episode_reward += eval_r
                        eval_qs.append(eval_q)
                    eval_episode_rewards.append(eval_episode_reward)
                    eval_episode_rewards_history.append(eval_episode_reward)
                """
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.
                """

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = mpi_mean(
                np.mean(episode_rewards_history))
            combined_stats['rollout/episode_steps'] = mpi_mean(
                epoch_episode_steps)
            combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)

            # Train statistics.
            combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = mpi_mean(
                epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = mpi_mean(
                    np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q'] = mpi_mean(eval_qs)
                combined_stats['eval/episodes'] = mpi_mean(
                    len(eval_episode_rewards))

            # Total statistics.
            combined_stats['total/duration'] = mpi_mean(duration)
            combined_stats['total/steps_per_second'] = mpi_mean(
                float(t) / float(duration))
            combined_stats['total/episodes'] = mpi_mean(episodes)
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

            # Call the callback
            if callback is not None:
                if callback(locals(),
                            globals()):  # callback returns a boolean value
                    break
        # Evaluate the policy on env to record trajs
        eval_rewards, eval_steps, trajs_obs, trajs_actions = evaluate(
            env_eval, agent=agent)
        if callback is not None:
            callback.final_call(locals(), globals())
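The example above threads an optional callback through the training loop: it is invoked as callback(locals(), globals()) once per epoch, a truthy return value breaks out of the epoch loop, and callback.final_call(locals(), globals()) runs once after the closing evaluation. A minimal object compatible with that contract might look like the following sketch; the early-stopping rule and attribute names are illustrative, not part of the original code.

class StopOnReturnCallback:
    """Illustrative callback: stop once the rolling rollout return is high enough."""

    def __init__(self, target_return=200.0):
        self.target_return = target_return
        self.history = []

    def __call__(self, local_vars, global_vars):
        # train() passes locals()/globals(); read the per-epoch stats from locals.
        stats = local_vars.get('combined_stats', {})
        ret = stats.get('rollout/return_history', float('-inf'))
        self.history.append(ret)
        return ret >= self.target_return  # True -> break out of the epoch loop

    def final_call(self, local_vars, global_vars):
        # Called once after the final evaluation pass.
        print('training finished after {} epochs'.format(len(self.history)))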
Example #17
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
    normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
    popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
    tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)
            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s'%x)
            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
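All of these variants expect the caller to construct the replay memory, the actor/critic networks and the exploration noise before handing them to train(). A rough wiring sketch, assuming the pre-refactor baselines.ddpg modules; the environment id, hyperparameters and constructor arguments below are illustrative and may differ in your checkout:

import gym
import numpy as np
from baselines.ddpg.memory import Memory
from baselines.ddpg.models import Actor, Critic
from baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise

env = gym.make('Pendulum-v0')
nb_actions = env.action_space.shape[-1]

memory = Memory(limit=int(1e6),
                action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
actor = Actor(nb_actions, layer_norm=True)
critic = Critic(layer_norm=True)
action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                            sigma=0.2 * np.ones(nb_actions))

train(env=env, nb_epochs=100, nb_epoch_cycles=20, render_eval=False,
      reward_scale=1.0, render=False, param_noise=None, actor=actor,
      critic=critic, normalize_returns=False, normalize_observations=True,
      critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3,
      action_noise=action_noise, popart=False, gamma=0.99, clip_norm=None,
      nb_train_steps=50, nb_rollout_steps=100, nb_eval_steps=100,
      batch_size=64, memory=memory)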
Example #18
def train(env,
          num_timesteps,
          nb_trials,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          test_interval,
          batch_size,
          memory,
          output,
          load_file,
          save=False,
          tau=0.01,
          evaluation=False,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    observation_range = [env.observation_space.low, env.observation_space.high]
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale,
                 observation_range=observation_range)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    trial_return_history = deque(maxlen=100)
    eval_trial_return_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        #dir_path = os.path.dirname(os.path.realpath(__file__))
        #tf.summary.FileWriter(dir_path, sess.graph)

        trial = 0
        ts = 0

        if load_file != '':
            saver.restore(sess, load_file)

        start_time = time.time()

        trial_returns = []
        trial_steps = []
        actions = []
        qs = []
        train_actor_losses = []
        train_critic_losses = []
        train_adaptive_distances = []

        while True:
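            # When test_interval >= 0, every (test_interval + 1)-th trial is run
            # as an evaluation ("test") trial; the remaining trials collect
            # experience and train. A negative test_interval disables evaluation.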
            test = (test_interval >= 0
                    and trial % (test_interval + 1) == test_interval)

            if not test:
                # Perform rollout.
                env.set_test(test=False)
                obs = env.reset()
                agent.reset()
                done = 0
                trial_return = 0.
                trial_step = 0
                while done == 0:
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    ts += 1
                    if rank == 0 and render:
                        env.render()
                    trial_return += r
                    trial_step += 1

                    # Book-keeping.
                    actions.append(action)
                    qs.append(q)
                    agent.store_transition(
                        obs, action, r, new_obs,
                        done == 2)  # terminal indicator is 2
                    obs = new_obs

                    # Train.
                    if memory.nb_entries >= batch_size:
                        for t_train in range(nb_train_steps):
                            # Adapt param noise, if necessary.
                            if trial % param_noise_adaption_interval == 0:
                                distance = agent.adapt_param_noise()
                                train_adaptive_distances.append(distance)

                            cl, al = agent.train()
                            train_critic_losses.append(cl)
                            train_actor_losses.append(al)
                            agent.update_target_net()

                # Episode done.
                trial_steps.append(trial_step)
                trial_returns.append(trial_return)
                trial_return_history.append(trial_return)

            else:
                # Evaluate.
                eval_trial_return = 0.
                eval_trial_steps = 0
                if evaluation is not None:
                    env.set_test(test=True)
                    eval_obs = env.reset()
                    agent.reset()
                    eval_done = 0
                    while eval_done == 0:
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            env.render()
                        eval_trial_return += eval_r
                        eval_trial_steps += 1
                    # Episode done.
                    eval_trial_return_history.append(eval_trial_return)

                # Log stats.
                duration = time.time() - start_time
                combined_stats = {}
                if memory.nb_entries > 0:
                    # Print only if learning was happening
                    stats = agent.get_stats()
                    for key in sorted(stats.keys()):
                        combined_stats[key] = mpi_mean(stats[key])

                    # Rollout statistics.
                    combined_stats['rollout/Q_mean'] = mpi_mean(qs)
                    combined_stats['rollout/actions_mean'] = mpi_mean(actions)
                    combined_stats['rollout/actions_std'] = mpi_std(actions)
                    combined_stats['rollout/trial_steps'] = mpi_mean(
                        trial_steps)
                    combined_stats['rollout/return'] = mpi_mean(trial_returns)
                    combined_stats['rollout/return_history'] = mpi_mean(
                        trial_return_history)

                    # Train statistics.
                    combined_stats['train/loss_actor'] = mpi_mean(
                        train_actor_losses)
                    combined_stats['train/loss_critic'] = mpi_mean(
                        train_critic_losses)
                    combined_stats['train/param_noise_distance'] = mpi_mean(
                        train_adaptive_distances)

                # Evaluation statistics.
                if evaluation is not None:
                    combined_stats['eval/Q'] = mpi_mean(eval_q)
                    combined_stats['eval/return'] = eval_trial_return
                    combined_stats['eval/return_history'] = mpi_mean(
                        eval_trial_return_history)
                    combined_stats['eval/steps'] = eval_trial_steps

                # Total statistics.
                combined_stats['total/duration'] = mpi_mean(duration)
                combined_stats['total/steps_per_second'] = mpi_mean(
                    float(ts) / float(duration))
                combined_stats['total/trials'] = trial
                combined_stats['total/steps'] = ts

                for key in sorted(combined_stats.keys()):
                    logger.record_tabular(key, combined_stats[key])
                logger.dump_tabular()
                logger.info('')
                logdir = logger.get_dir()
                if rank == 0 and logdir:
                    if hasattr(env, 'get_state'):
                        with open(os.path.join(logdir, 'env_state.pkl'),
                                  'wb') as f:
                            pickle.dump(env.get_state(), f)
                    if evaluation and hasattr(env, 'get_state'):
                        with open(os.path.join(logdir, 'eval_env_state.pkl'),
                                  'wb') as f:
                            pickle.dump(env.get_state(), f)

                # Reset statistics.
                trial_returns = []
                trial_steps = []
                actions = []
                qs = []
                train_actor_losses = []
                train_critic_losses = []
                train_adaptive_distances = []
                # End of evaluation and statistics logging

            # Check if this is the last trial
            trial += 1
            if nb_trials and trial >= nb_trials:
                break
            if num_timesteps and ts >= num_timesteps:
                break

        # Saving policy and value function
        if save and saver and output != '':
            saver.save(sess, './%s' % output)
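Several of these examples pickle env.get_state() (and the eval env's state) into the log directory at the end of each epoch, but leave restoring that snapshot to the caller. A minimal restore sketch, assuming the environment also exposes a matching set_state() method (an assumption; the snippets above only call get_state()):

import os
import pickle

def restore_env_state(env, logdir, filename='env_state.pkl'):
    # Load the snapshot written by the training loops above and hand it back
    # to the environment. set_state() is hypothetical and must exist on env.
    path = os.path.join(logdir, filename)
    if os.path.exists(path) and hasattr(env, 'set_state'):
        with open(path, 'rb') as f:
            env.set_state(pickle.load(f))
        return True
    return False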