params.log_dir = "../logs/logs/{}".format(params.env_name)
params.model_dir = "../logs/models/{}".format(params.env_name)

env = gym.make(params.env_name)

# set seed
env.seed(params.seed)
tf.compat.v1.random.set_random_seed(params.seed)

agent = DDPG(Actor, Critic, env.action_space.shape[0], params)
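# DDPG agent built from the Actor/Critic network classes defined elsewhere in this
# project; the action dimension is taken from the environment's action space.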
replay_buffer = ReplayBuffer(params.memory_size)
reward_buffer = deque(maxlen=params.reward_buffer_ep)
summary_writer = tf.contrib.summary.create_file_writer(params.log_dir)
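# Assumed roles of the buffers above: replay_buffer stores transitions for off-policy
# updates, reward_buffer keeps a sliding window of recent episode returns for logging,
# and summary_writer streams TensorBoard summaries to params.log_dir.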

init_state = env.reset()  # reset
agent.predict(init_state)  # dummy forward pass so the eager-mode networks build their weight matrices
gp_model, update = create_bayes_net()
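# create_bayes_net() is a project-specific helper defined elsewhere; it appears to
# return a Bayesian/GP model together with its update routine.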
optimiser = tf.compat.v1.train.AdamOptimizer()
num_sample = 100  # number of samples to draw

get_ready(agent.params)

global_timestep = tf.compat.v1.train.get_or_create_global_step()
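# Eager-mode global step counter; tf.contrib.summary indexes logged scalars by it,
# and it is presumably advanced during training (e.g. inside agent.update).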
time_buffer = deque(maxlen=agent.params.reward_buffer_ep)
log = logger(agent.params)

with summary_writer.as_default():
    # keep all training code inside this context so summaries are recorded to log_dir
    with tf.contrib.summary.always_record_summaries():
        policies, scores = list(), list()
        for i in itertools.count():
            state = env.reset()
            total_reward = 0
            self_rewards = 0
            start = time.time()
            agent.random_process.reset_states()
            done = False
            episode_len = 0
            traj = list()  # assumed per-episode container for the visited states (appended to below)
            while not done:
                traj.append(state)
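                # Warm-up phase: act uniformly at random until learning_start
                # environment steps have been collected, then switch to the policy.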
                if global_timestep.numpy() < agent.params.learning_start:
                    action = env.action_space.sample()
                else:
                    action = agent.predict(state)
                next_state, reward, done, info = env.step(
                    np.clip(action, -1.0, 1.0))
                replay_buffer.add(state, action, reward, next_state, done)
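                # Store the transition (s, a, r, s', done) for later minibatch updates.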
                """
                === Update the models
                """
                if global_timestep.numpy() > agent.params.learning_start:
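                    # Sample a random minibatch from the replay buffer and take one
                    # DDPG update step on the critic and actor.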
                    states, actions, rewards, next_states, dones = replay_buffer.sample(
                        agent.params.batch_size)
                    loss = agent.update(states, actions, rewards, next_states,
                                        dones)
                    soft_target_model_update_eager(
                        agent.target_actor,
                        agent.actor,
                        tau=agent.params.soft_update_tau)
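                    # Soft (Polyak) target update, as the helper name suggests:
                    # w_target <- tau * w_online + (1 - tau) * w_target, so the
                    # target actor tracks the online actor slowly and keeps the
                    # TD targets stable.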