Example #1
def create_variational_GP_model(weights, kernel_fn):
    """
    Instantiate the GP model corresponding to the weight matrix used in the policy network

    Usage:
        ```python
        agent = DQN(...)
        init_state = env.reset() # reset
        agent.predict(init_state) # burn the format of the input matrix to get the weight matrices!!
        gp_model = create_model(weights=agent.main_model.get_weights(), kernel_fn=RBFKernelFn)
        ```
    """
    input_shape = flatten_weight(weights).shape[0]
    print("GP Model: Input Shape is {}".format(input_shape))
    dtype = np.float32
    num_inducing_points = 40
    loss = lambda y, rv_y: rv_y.variational_loss(y)

    model = tf.keras.Sequential([
        tf.keras.layers.InputLayer(input_shape=[input_shape], dtype=dtype),
        tf.keras.layers.Dense(1, kernel_initializer='ones', use_bias=False),
        tfp.layers.VariationalGaussianProcess(
            num_inducing_points=num_inducing_points,
            kernel_provider=kernel_fn(dtype=dtype),
            event_shape=[1],
            # inducing_index_points_initializer=tf.constant_initializer(
            # 	np.linspace(*x_range, num=num_inducing_points,
            # 				dtype=x.dtype)[..., np.newaxis]),
            unconstrained_observation_noise_variance_initializer=(
                tf.constant_initializer(np.array(0.54).astype(dtype))),
        ),
    ])
    model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
                  loss=loss)
    return model
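
The `kernel_fn` passed in above is expected to be a kernel-provider class in the style of the TFP `VariationalGaussianProcess` examples: a `tf.keras.layers.Layer` that owns the trainable kernel parameters and exposes the kernel through a `kernel` property. The snippet does not include `RBFKernelFn`, so the following is only a minimal sketch of what it might look like; the initializer constants are illustrative, and the `psd_kernels` module path can differ between TFP versions.

```python
import tensorflow as tf
import tensorflow_probability as tfp


class RBFKernelFn(tf.keras.layers.Layer):
    """Sketch of a kernel provider for tfp.layers.VariationalGaussianProcess.

    Holds trainable amplitude/length-scale variables and returns an RBF
    (ExponentiatedQuadratic) kernel via the `kernel` property, which is what
    the layer's `kernel_provider` argument expects.
    """

    def __init__(self, **kwargs):
        super(RBFKernelFn, self).__init__(**kwargs)
        dtype = kwargs.get('dtype', None)
        self._amplitude = self.add_weight(
            initializer=tf.constant_initializer(0.0), dtype=dtype, name='amplitude')
        self._length_scale = self.add_weight(
            initializer=tf.constant_initializer(0.0), dtype=dtype, name='length_scale')

    def call(self, x):
        # Never actually used; the layer only exists so Keras tracks the variables.
        return x

    @property
    def kernel(self):
        return tfp.math.psd_kernels.ExponentiatedQuadratic(
            amplitude=tf.nn.softplus(0.1 * self._amplitude),
            length_scale=tf.nn.softplus(5.0 * self._length_scale))
```

With such a provider, the model returned by `create_variational_GP_model` can be trained with `model.fit(...)` under the variational loss defined above.
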
Example #2
def create_TF_GP_model(weights, kernel_fn):
    """
    TF implementation of GPR is utterly useless
    they require us to set the index points in advance, we cannot apply the model to unknown data.
    how stupid TF team is....
    """
    input_shape = flatten_weight(weights).shape[0]
    print("GP Model: Input Shape is {}".format(input_shape))
    dtype = np.float64
    loss = lambda y, rv_y: rv_y.variational_loss(y)

    model = tf.keras.Sequential([
        tf.keras.layers.InputLayer(input_shape=[input_shape], dtype=dtype),
        # NOTE: this does not work as intended -- tfp.distributions.GaussianProcess
        # is a distribution, not a Keras layer, and it needs `index_points` fixed
        # at construction time (see the docstring above).
        tfp.distributions.GaussianProcess(kernel=kernel_fn),
    ])
    model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
                  loss=loss)
    return model
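
For context on the limitation the docstring refers to: `tfp.distributions.GaussianProcess` defines a distribution over function values at a fixed set of `index_points`, so those points must be known when the object is constructed. A minimal, self-contained illustration (the inputs here are made up):

```python
import numpy as np
import tensorflow_probability as tfp

tfd = tfp.distributions
psd_kernels = tfp.math.psd_kernels

# Hypothetical inputs; the GP prior is defined only over these index points.
x_train = np.random.uniform(-1.0, 1.0, size=(50, 1)).astype(np.float64)

gp = tfd.GaussianProcess(
    kernel=psd_kernels.ExponentiatedQuadratic(),
    index_points=x_train,                 # fixed at construction time
    observation_noise_variance=0.05)

# sample()/log_prob() only cover x_train; evaluating at new inputs requires
# building a different object (e.g. GaussianProcessRegressionModel) with
# those new points.
y_prior_samples = gp.sample(3)
```
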
            # inner update loop (the enclosing per-episode loop over `i` is not shown in this snippet)
            for t_train in range(episode_len):  # in MuJoCo, this is 1,000 iterations per episode
                states, actions, rewards, next_states, dones = replay_buffer.sample(agent.params.batch_size)
                loss = agent.update(states, actions, rewards, next_states, dones)
                soft_target_model_update_eager(agent.target_actor, agent.actor, tau=agent.params.soft_update_tau)
                soft_target_model_update_eager(agent.target_critic, agent.critic, tau=agent.params.soft_update_tau)

            tf.contrib.summary.scalar("reward", total_reward, step=i)
            tf.contrib.summary.scalar("exec time", time.time() - start, step=i)
            if i >= agent.params.reward_buffer_ep:
                tf.contrib.summary.scalar("Moving Ave Reward", np.mean(reward_buffer), step=i)

            # store the episode reward
            reward_buffer.append(total_reward)
            time_buffer.append(time.time() - start)

            if global_timestep.numpy() > agent.params.learning_start and i % agent.params.reward_buffer_ep == 0:
                log.logging(global_timestep.numpy(), i, np.sum(time_buffer), reward_buffer, np.mean(loss), 0, [0])

            # === Store the trained policy and Scores ===
            scores.append(total_reward)  # keep `scores` aligned with the `policies` buffer
            weights_vec = flatten_weight(agent.actor.get_weights())
            policies.append(weights_vec)

            # check the stopping condition
            if global_timestep.numpy() > agent.params.num_frames:
                print("=== Training is Done ===")
                env.close()
                break

np.save("policy_weight", np.array(policies))
np.save("scores", np.array(scores))