Example #1
# NOTE: the snippet omits its imports. The following set is assumed, based on the
# OpenAI baselines DDPG layout and scikit-learn:
#   import time
#   import copy as cp
#   import numpy as np
#   from baselines import logger
#   import baselines.common.tf_util as U
#   from baselines.ddpg.memory import Memory
#   from baselines.ddpg.noise import (AdaptiveParamNoiseSpec, NormalActionNoise,
#                                     OrnsteinUhlenbeckActionNoise)
#   from sklearn.gaussian_process import GaussianProcessRegressor
#   from sklearn.gaussian_process.kernels import ConstantKernel, RBF
#   from sklearn.linear_model import LinearRegression
#   from sklearn.neural_network import MLPRegressor
#   # DDPG, Actor and Critic come from the project's own (modified) DDPG implementation.
def learn(
        network,
        env,
        data_path='',
        model_path='./model/',
        model_name='ddpg_none_fuzzy_150',
        file_name='test',
        model_based=False,
        memory_extend=False,
        model_type='linear',
        restore=False,
        dyna_learning=False,
        seed=None,
        nb_epochs=5,  # 5 epochs x 150 cycles x 400 rollout steps = 300k env steps with defaults
        nb_sample_cycle=5,
        nb_epoch_cycles=150,
        nb_rollout_steps=400,
        nb_model_learning=10,
        nb_sample_steps=50,
        nb_samples_extend=5,
        reward_scale=1.0,
        noise_type='normal_0.2',  # options: 'adaptive-param_0.2', 'ou_0.2', 'normal_0.2'
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker,
        batch_size=32,  # per MPI worker
        tau=0.01,
        param_noise_adaption_interval=50,
        **network_kwargs):

    algorithm_name = 'ddpg'  # assumed: used in the np.save()/agent.store() calls below but never defined in the original snippet
    nb_actions = env.action_space.shape[0]
    memory = Memory(limit=int(1e5),
                    action_shape=env.action_space.shape[0],
                    observation_shape=env.observation_space.shape)

    # Model-based additions (active only when model_based=True): a second replay buffer
    # for model-generated ("fake") transitions, plus learned dynamics and reward models
    # mapping (observation, action) to the next observation and the reward.
    if model_based:
        """ store fake_data"""
        fake_memory = Memory(limit=int(1e5),
                             action_shape=env.action_space.shape[0],
                             observation_shape=env.observation_space.shape)
        """ select model or not """
        if model_type == 'gp':
            kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
            dynamic_model = GaussianProcessRegressor(kernel=kernel)
            reward_model = GaussianProcessRegressor(kernel=kernel)
        elif model_type == 'linear':
            dynamic_model = LinearRegression()
            reward_model = LinearRegression()
        elif model_type == 'mlp':
            # All keyword arguments in the original were scikit-learn defaults; they are
            # trimmed to the few that matter here (the call is equivalent to MLPRegressor()).
            dynamic_model = MLPRegressor(hidden_layer_sizes=(100, ),
                                         activation='relu',
                                         solver='adam',
                                         max_iter=200)
            reward_model = MLPRegressor(hidden_layer_sizes=(100, ),
                                        activation='relu',
                                        solver='adam',
                                        max_iter=200)
        else:
            raise ValueError(
                "model_type must be 'gp', 'linear' or 'mlp' to fit the dynamics "
                "and reward models (got {!r})".format(model_type))

    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)
    """ set noise """
    action_noise = None
    param_noise = None

    # Note: the stddev parsed from the last noise spec below is reused later for
    # agent.step() and for the per-cycle noise decay.
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))
    """action scale"""
    max_action = env.action_high_bound
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    """ agent ddpg """
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape[0],
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)

    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    sess = U.get_session()

    if restore:
        agent.restore(sess, model_path, model_name)
    else:
        agent.initialize(sess)
        sess.graph.finalize()
    agent.reset()

    episodes = 0
    epochs_rewards = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_times = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_steps = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_states = []
    for epoch in range(nb_epochs):
        logger.info(
            "======================== Epoch {} started ========================="
            .format(epoch))
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_times = []
        epoch_actions = []
        epoch_episode_states = []
        epoch_qs = []
        epoch_episodes = 0
        for cycle in range(nb_epoch_cycles):
            start_time = time.time()
            obs, state, done = env.reset()
            obs_reset = cp.deepcopy(obs)
            episode_reward = 0.
            episode_step = 0
            episode_states = []
            logger.info(
                "================== Episode (cycle) {} started ==================="
                .format(cycle))
            for t_rollout in range(nb_rollout_steps):
                logger.info(
                    "================== Rollout step {} ==================="
                    .format(t_rollout))
                """ Predict next action """
                action, q, _, _ = agent.step(obs,
                                             stddev,
                                             apply_noise=True,
                                             compute_Q=True)

                new_obs, next_state, r, done, safe_or_not, final_action = env.step(
                    max_action * action, t_rollout)

                if safe_or_not is False:
                    break

                episode_reward += r
                episode_step += 1
                episode_states.append([
                    cp.deepcopy(state),
                    cp.deepcopy(final_action),
                    np.array(cp.deepcopy(r)),
                    cp.deepcopy(next_state)
                ])

                epoch_actions.append(action)
                epoch_qs.append(q)

                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs
                state = next_state

                if done:
                    break
                """ extend the memory """
                if model_based and cycle > (nb_model_learning +
                                            1) and memory_extend:
                    # model input layout (hard-coded): 12-dim observation + 6-dim action = 18 features
                    pred_x = np.zeros((1, 18), dtype=np.float32)
                    for j in range(nb_samples_extend):
                        m_action, _, _, _ = agent.step(obs,
                                                       stddev,
                                                       apply_noise=True,
                                                       compute_Q=False)
                        pred_x[:, :12] = obs
                        pred_x[:, 12:] = m_action
                        m_new_obs = dynamic_model.predict(pred_x)[0]
                        """ get real reward """
                        # state = env.inverse_state(m_new_obs)
                        # m_reward = env.get_reward(state, m_action)
                        m_reward = reward_model.predict(pred_x)[0]
                        agent.store_transition(obs, m_action, m_reward,
                                               m_new_obs, done)
            """ generate new data and fit model"""
            if model_based and cycle > nb_model_learning:
                logger.info(
                    "==============================  Model Fit !!! ==============================="
                )
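                # Fit the dynamics and reward models by supervised regression on the real
                # replay buffer: inputs are (observation, action) pairs, targets are the
                # next observation and the received reward.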
                input_x = np.concatenate(
                    (memory.observations0.data[:memory.nb_entries],
                     memory.actions.data[:memory.nb_entries]),
                    axis=1)
                input_y_obs = memory.observations1.data[:memory.nb_entries]
                input_y_reward = memory.rewards.data[:memory.nb_entries]
                dynamic_model.fit(input_x, input_y_obs)
                reward_model.fit(input_x, input_y_reward)

                if dyna_learning:
                    logger.info(
                        "=========================  Collect data !!! ================================="
                    )
                    # same hard-coded layout: 12-dim observation + 6-dim action = 18 features
                    pred_obs = np.zeros((1, 18), dtype=np.float32)
                    for sample_index in range(nb_sample_cycle):
                        fake_obs = obs_reset
                        for t_episode in range(nb_sample_steps):
                            fake_action, _, _, _ = agent.step(fake_obs,
                                                              stddev,
                                                              apply_noise=True,
                                                              compute_Q=False)
                            pred_obs[:, :12] = fake_obs
                            pred_obs[:, 12:] = fake_action
                            next_fake_obs = dynamic_model.predict(pred_obs)[0]
                            fake_reward = reward_model.predict(pred_obs)[0]
                            # next_fake_obs = dynamic_model.predict(np.concatenate((fake_obs, fake_action)))[0]
                            # fake_reward = reward_model.predict(np.concatenate((fake_obs, fake_action)))[0]
                            fake_terminals = False
                            # store the transition before advancing fake_obs, otherwise
                            # obs0 and obs1 end up identical in the fake replay buffer
                            fake_memory.append(fake_obs, fake_action,
                                               fake_reward, next_fake_obs,
                                               fake_terminals)
                            fake_obs = next_fake_obs
            """ noise decay """
            stddev = float(stddev) * 0.95

            duration = time.time() - start_time
            epoch_episode_rewards.append(episode_reward)
            epoch_episode_steps.append(episode_step)
            epoch_episode_times.append(cp.deepcopy(duration))
            epoch_episode_states.append(cp.deepcopy(episode_states))

            epochs_rewards[epoch, cycle] = episode_reward
            epochs_steps[epoch, cycle] = episode_step
            epochs_times[epoch, cycle] = cp.deepcopy(duration)

            logger.info(
                "============================= The Episode_Rewards:: {} ============================"
                .format(epoch_episode_rewards))
            logger.info(
                "============================= The Episode_Times:: {} ============================"
                .format(epoch_episode_times))

            epoch_episodes += 1
            episodes += 1
            """ Training process """
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                logger.info("")
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()
            """ planning training """
            if model_based and cycle > (nb_model_learning +
                                        1) and dyna_learning:
                for t_train in range(nb_train_steps):
                    # setting for adapt param noise, if necessary.
                    if fake_memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)
                    batch = fake_memory.sample(batch_size=batch_size)
                    fake_cl, fake_al = agent.train_fake_data(batch)
                    epoch_critic_losses.append(fake_cl)
                    epoch_actor_losses.append(fake_al)
                    agent.update_target_net()

        epochs_states.append(cp.deepcopy(epoch_episode_states))

        # # save data
        np.save(
            data_path + 'train_reward_' + algorithm_name + '_' + noise_type +
            file_name, epochs_rewards)
        np.save(
            data_path + 'train_step_' + algorithm_name + '_' + noise_type +
            file_name, epochs_steps)
        np.save(
            data_path + 'train_states_' + algorithm_name + '_' + noise_type +
            file_name, epochs_states)
        np.save(
            data_path + 'train_times_' + algorithm_name + '_' + noise_type +
            file_name, epochs_times)

    # # agent save
    agent.store(model_path + 'train_model_' + algorithm_name + '_' +
                noise_type + file_name)
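
# Usage sketch (not part of the original example): a minimal, hypothetical call to the
# learn() above. `CustomAssemblyEnv` is a placeholder for whatever custom environment the
# snippet targets; it must expose action_space.shape, observation_space.shape,
# action_high_bound, reset() -> (obs, state, done) and
# step(action, t) -> (obs, state, reward, done, safe_or_not, final_action).
# All names and paths below are assumptions.
env = CustomAssemblyEnv()
learn(network='mlp',
      env=env,
      data_path='./data/',
      model_path='./model/',
      model_based=True,
      model_type='linear',      # or 'gp' / 'mlp'
      dyna_learning=True,
      memory_extend=True,
      noise_type='normal_0.2')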
Example #2
# NOTE: this snippet also omits its imports; in addition to those assumed for
# Example #1 it appears to need:
#   from collections import deque
#   from mpi4py import MPI
def learn(
        network,
        env,
        path='',
        model_path='./model/',
        model_name='ddpg_none_fuzzy',
        restore=False,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # derived from total_timesteps when it is given; otherwise 1
        nb_epoch_cycles=20,
        nb_rollout_steps=50,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type=None,  #'adaptive-param_0.2'
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker,
        nb_eval_steps=100,
        batch_size=32,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=50,
        **network_kwargs):

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles *
                                             nb_rollout_steps)
    else:
        nb_epochs = 1

    rank = MPI.COMM_WORLD.Get_rank()
    nb_actions = env.action_space.n
    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.

    memory = Memory(limit=int(1e5),
                    action_shape=env.action_space.n,
                    observation_shape=env.observation_space.shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None

    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    max_action = env.max_action
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.n,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)

    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()

    if restore:
        agent.restore(sess, model_path, model_name)
    else:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

    if eval_env is not None:
        eval_obs = eval_env.reset()  # needed: the evaluation loop below reads eval_obs
    # nenvs = obs.shape[0]
    # episode_reward = np.zeros(nenvs, dtype=np.float32) #vector
    # episode_step = np.zeros(nenvs, dtype=int) # vector

    agent.reset()
    episodes = 0
    t = 0
    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_episode_states = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            # if nenvs > 1:
            #     # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
            #     # of the environments, so resetting here instead
            #     agent.reset()
            obs, done = env.reset()
            episode_reward = 0.
            episode_step = 0
            episode_states = []
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q, _, _ = agent.step(obs,
                                             apply_noise=True,
                                             compute_Q=True)

                # Execute next action.
                if rank == 0 and render:
                    env.render()

                # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch
                new_obs, r, done, info = env.step(max_action * action,
                                                  t_rollout)

                # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])

                t += 1
                if rank == 0 and render:
                    env.render()

                episode_reward += r
                episode_step += 1
                episode_states.append(cp.deepcopy(obs))

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(obs, action, r, new_obs, done)

                #the batched data will be unrolled in memory.py's append.
                obs = new_obs

                # for d in range(len(done)):
                #     if done[d]:
                #         # Episode done.
                #         epoch_episode_rewards.append(episode_reward[d])
                #         episode_rewards_history.append(episode_reward[d])
                #         epoch_episode_steps.append(episode_step[d])
                #         episode_reward[d] = 0.
                #         episode_step[d] = 0
                #         epoch_episodes += 1
                #         episodes += 1
                #         if nenvs == 1:
                #             agent.reset()
                if done:
                    # Episode done.
                    epoch_episode_rewards.append(episode_reward)
                    episode_rewards_history.append(episode_reward)
                    epoch_episode_steps.append(episode_step)
                    epoch_episode_states.append(cp.deepcopy(episode_states))
                    # episode_reward = 0.
                    # episode_step = 0.
                    epoch_episodes += 1
                    episodes += 1
                    # if nenvs == 1:
                    #     agent.reset()
                    break

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        max_action * eval_action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        mpi_size = MPI.COMM_WORLD.Get_size()

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(
            epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(
                eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array(
                [np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()

        # save data
        np.save(path + 'train_reward_none_fuzzy', epoch_episode_rewards)
        np.save(path + 'train_step_none_fuzzy', epoch_episode_steps)
        np.save(path + 'train_states_none_fuzzy', epoch_episode_states)

        # agent save
        agent.store(model_path + model_name)
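
# Usage sketch (not part of the original example): a minimal, hypothetical call to this
# variant. `CustomAssemblyEnv` is again a placeholder; here the environment must expose
# action_space.n, observation_space.shape, max_action, reset() -> (obs, done) and
# step(action, t) -> (obs, reward, done, info). When launched under MPI (mpi4py), each
# worker runs its own copy and the tabular stats are averaged across workers.
env = CustomAssemblyEnv()
learn(network='mlp',
      env=env,
      path='./data/',
      model_path='./model/',
      total_timesteps=100000,  # nb_epochs = 100000 // (20 * 50) = 100
      noise_type='adaptive-param_0.2')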