Example #1
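Loads a pre-filled replay buffer and the weights of a converged actor, then runs N + 5000 training steps on uniformly sampled batches of 128 transitions to converge the critic, saving its weights every 5000 steps. N is a module-level constant not shown in the snippet; the set_trace() calls are pdb breakpoints.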
def main(cnf):
    env, agent = create_world(cnf)
    cnf = cnf.main
    agent._replay_buffer.load_data('./per_exp/eval_grads/buffer_data/')
    agent._policy.actor.load_weights(
        './per_exp/eval_grads/model/converged_actor')
    set_trace()

    # N is a module-level constant (not shown in this snippet); patch the buffer
    # bookkeeping so that sampling sees the loaded transitions.
    buff = agent._replay_buffer
    buff.size = N
    buff.ptr = N
    buff.max_size = N
    buff.tree.n_entries = N

    agent = agent._policy
    for t in range(N + 5000):
        state, action, reward, next_state, done, *_ = buff.sample_uniformly(
            128)
        error = agent._train_step(state, action, reward, next_state, done,
                                  False, None)
        if t == 1000:
            set_trace()  # pdb breakpoint left in for inspection
        #print(f'td error {tf.norm(error)} iteration {t}')
        if not t % 5000:
            print(f'{t} of {N + 5000}')
            agent.critic.save_weights('converged_critic')
Example #2
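Same buffer setup, but here the full policy is trained with agent.train() for 100,000 steps to produce the converged actor; its weights are saved and the returned error is logged to wandb as actor_loss every 5000 steps.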
def main(cnf):
    env, agent = create_world(cnf)
    cnf = cnf.main
    agent._replay_buffer.load_data('./per_exp/eval_grads/buffer_data/')

    buff = agent._replay_buffer
    buff.size = N
    buff.ptr = N
    buff.max_size = N
    buff.tree.n_entries = N

    #agent.load_model('./per_exp/eval_grads/model/TD3_Vrep_save_transitions')
    #max_action= tf.constant([5.585, 5.585, 3.071, 3.071, 1.919, 0.698, 0.698, 1.], dtype=tf.float32)
    #new_actor = Actor(22, 8, max_action, [300, 300], 9e-6)
    new_actor = agent._policy.actor  # unused below; leftover from the commented-out setup above
    agent = agent._policy

    for t in range(100000):
        # Note: this manually sampled batch is not used below; agent.train() is
        # handed the buffer and batch size directly.
        state, action, reward, next_state, done, *_ = buff.sample_uniformly(
            128)
        error, *_ = agent.train(buff, 128, t, True, None)
        #print(f'td error {tf.norm(error)} iteration {t}')
        if not t % 5000:
            print(f'{t} of 100000')
            agent.actor.save_weights('converged_actor')
            wandb.log({'actor_loss': error})
Example #3
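Gradient-similarity experiment: for each of SAMPLES repetitions, a fresh "true" critic is trained on the loaded buffer, the actor gradient over the whole buffer is computed, and for every stored transition the TD error and the similarity between the full-buffer gradient and the single-transition gradient are recorded, then saved to sim_avg.npy and td_avg.npy.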
def main(cnf):
    env, agent = create_world(cnf)
    cnf_old = cnf
    cnf = cnf.main
    agent._replay_buffer.load_data('./per_exp/eval_grads/buffer_data/')
    buff = agent._replay_buffer
    buff.size = N
    buff.ptr = N
    buff.max_size = N
    buff.tree.n_entries = N
    agent._policy.actor.load_weights(
        './per_exp/eval_grads/model/converged_actor')
    trained_actor = agent._policy.actor
    print('Successfully loaded')
    print('Compute true gradient of true critic')
    sim_avg = []
    td_avg = []
    for k in range(SAMPLES):
        # True gradient of true critic
        true_critic = create_agent(cnf_old, env)
        train_the_critic(true_critic, buff, N_TRAIN_TRUE_CRITIC)
        true_critic_sample = true_critic._policy.critic
        state, *_ = buff.get_buffer()
        with tf.GradientTape() as tape:
            action = trained_actor(state)
            q_value, _ = true_critic_sample(tf.concat([state, action], axis=1))
            actor_loss = -tf.reduce_mean(q_value)
        gradients_true = tape.gradient(actor_loss,
                                       trained_actor.trainable_variables)
        gradients_true = [tf.reshape(x, [-1]) for x in gradients_true]
        state, action, reward, next_state, done, *_ = buff.get_buffer()
        print(f' state shape {state.shape}')
        for idx, s in enumerate(state):
            print(f'{idx} of {state.shape[0]}')
            s = tf.reshape(s, [1, s.shape[0]])
            ns = tf.reshape(next_state[idx], [1, next_state[idx].shape[0]])
            d = tf.reshape(done[idx], [1, done[idx].shape[0]])
            r = tf.reshape(reward[idx], [1, reward[idx].shape[0]])
            # Alias: per-sample TD errors and gradients are computed from the
            # same trained critic.
            approx_critic = true_critic
            td_errors = approx_critic._policy._compute_td_error(
                s, trained_actor(s), r, ns, d)
            with tf.GradientTape() as tape:
                action = trained_actor(s)
                q_value, _ = approx_critic._policy.critic(
                    tf.concat([s, action], axis=1))
                actor_loss = -tf.reduce_mean(q_value)
            gradients_sample = tape.gradient(actor_loss,
                                             trained_actor.trainable_variables)
            gradients_sample = [tf.reshape(x, [-1]) for x in gradients_sample]
            sims = [
                -simil_metric(x, y)
                for x, y in zip(gradients_true, gradients_sample)
            ]
            sims = tf.reduce_mean(sims)
            sim_avg.append(sims)
            td_avg.append(td_errors)
    np.save('sim_avg.npy', sim_avg)
    np.save('td_avg.npy', td_avg)
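The simil_metric helper used above (and again in Example #6) is not defined in these snippets. A minimal sketch of one plausible definition, assuming it simply wraps TensorFlow's cosine-similarity loss, which returns the negative cosine similarity and would explain the -simil_metric(x, y) in the calling code:

import tensorflow as tf


def simil_metric(x, y):
    # Hypothetical helper: tf.keras.losses.cosine_similarity returns the negative
    # cosine similarity of two flattened gradient vectors, so -simil_metric(x, y)
    # in the examples recovers the (positive) cosine similarity.
    return tf.keras.losses.cosine_similarity(x, y, axis=-1)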
Example #4
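Main training loop of the agent: the reward-function index is toggled every switch_time steps and applied at episode resets, transitions are added together with the intrinsic reward, the agent is trained at the end of each episode, and the model is periodically evaluated and saved.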
def main(cnf):
    env, agent = create_world(cnf)
    cnf = cnf.main
    # create objects
    logger = Logger(cnf.log, cnf.minilog, env.time_limit)
    # Load previously trained model.
    if cnf.load_model:
        agent.load_model(f'./experiments/models/{agent._load_string}')
    # Training loop
    state, done = env.reset(), False
    switch = 0
    reward_fn = tf.Variable(0)
    for t in range(int(cnf.max_timesteps)):
        state_old = state
        if not t % cnf.switch_time:
            switch = (switch + 1) % 2
        action = agent.select_action(state,
                                     noise_bool=True,
                                     reward_fn=reward_fn)
        next_state, reward, done, _ = env.step(action, reward_fn)
        success_cd = done if env.success else 0
        # get intrinsic reward from agent.transitbuffer computation
        intr_rew = agent.replay_add(state, action, reward, next_state, done,
                                    success_cd)
        maybe_verbose_output(t, agent, env, action, cnf, state, intr_rew)
        logger.inc(t, reward)
        if not cnf.flat_agent and not cnf.minilog:
            logger.most_important_plot(agent, state, action, reward,
                                       next_state, success_cd)
        if cnf.save_attention:
            attention_grad_meta(state, agent, t, flat=cnf.flat_agent)
        state = next_state
        if done:
            reward_fn.assign(0 if switch else 1)
            # Train at the end of the episode for the appropriate number of steps;
            # this makes collecting norms, stds and losses easier.
            if t > cnf.start_timesteps:
                agent.train(t, logger.episode_timesteps)
            print(
                f"Total T: {t+1} Episode Num: {logger.episode_num+1} Episode T: {logger.episode_timesteps} Reward: {logger.episode_reward}"
            )
            logger.log(t, intr_rew)
            agent.reset()
            logger.reset()
            state, done = env.reset(), False
        # Evaluate episode
        if (t + 1) % cnf.eval_freq == 0:
            avg_ep_rew, avg_intr_rew, success_rate, rate_correct_solves, untouchable_steps = agent.evaluation(
                env)
            state, done = env.reset(), False
            agent.reset()
            logger.reset(post_eval=True)
            logger.log_eval(t, avg_ep_rew, avg_intr_rew, success_rate,
                            rate_correct_solves, untouchable_steps)
            if cnf.save_model:
                agent.save_model(f'./experiments/models/{agent._load_string}')
Example #5
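Trains the meta agent for 1000 steps with batches of 10 drawn from the loaded meta replay buffer, counting how often transitions with reward != -1 are sampled, recording the sampled buffer indices (m1) with the step at which they were drawn (m2) and the TD error of every stored transition after each step (errors), and saving all three arrays.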
def main(cnf):
    env, agent = create_world(cnf)
    agent.load_model('here')
    cnf = cnf.main
    agent.meta_replay_buffer.load_data('./per_exp/buffer_data/')

    buff = agent.meta_replay_buffer
    buff.size = N
    print(buff.alpha)
    # Placeholder for reward re-weighting experiments: every reward in the first
    # N buffer entries is appended exactly once, so the rewards stay unchanged.
    new_rew = []
    for re in buff.reward[:N]:
        if re == -1.:
            new_rew.append(re)
        else:
            for _ in range(1):  # range(1): each non -1 reward is kept once
                new_rew.append(re)
    buff.reward = np.asarray(new_rew)

    buff.max_size = N
    buff.tree.n_entries = N

    idx = np.where(buff.reward == -1.)[0]

    m1 = []
    m2 = []

    counter = 0

    # TD errors of every stored transition after each of the 1000 training steps.
    errors = np.zeros([1000, N])
    for t in range(1000):
        #print(f'train {t} of 1000')
        agent._meta_agent.train(buff, 10, t, False, None)
        #print(buff.batch_idxs)
        state, action, reward, next_state, done = buff.get_buffer() 
        error = agent._meta_agent._compute_td_error(state, action, reward, next_state, done)
        errors[t] = error[:, 0]

        for i in range(buff.batch_idxs.shape[0]):
            if buff.reward[buff.batch_idxs[i]] != -1.:
                counter += 1
                print(counter)
            print(f' counter {counter}')
            m1.append(buff.batch_idxs[i])
            m2.append(t)
    np.save('m1.npy', m1)
    np.save('m2.npy', m2)
    np.save(f'errors_small_{repetitions}.npy', errors)
    env.close()
Example #6
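Batch-size variant of the gradient-similarity experiment: for batch sizes from 2 to 2048 it averages, over SAMPLES freshly trained critics and 10 sampled mini-batches each, the similarity between the full-buffer actor gradient and the mini-batch gradient, and saves the per-batch-size results.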
def main(cnf):
    env, agent = create_world(cnf)
    cnf_old = cnf
    cnf = cnf.main
    agent._replay_buffer.load_data('./per_exp/eval_grads/buffer_data/')
    accum = Accumulator()

    buff = agent._replay_buffer
    buff.size = N
    buff.ptr = N
    buff.max_size = N
    buff.tree.n_entries = N
    agent._policy.actor.load_weights('./per_exp/eval_grads/model/converged_actor')

    trained_actor = agent._policy.actor
    print('Compute true gradient of true critic')
    batch_range = np.array([2, 128, 256, 512, 1024, 2048])
    ret = []
    print('Update Buffer')
    for batch_size in batch_range:
        simil_list = []
        print(f'Batch {batch_size}')
        for i in range(SAMPLES):
            # True gradient of true critic
            true_critic = create_agent(cnf_old, env)
            train_the_critic(true_critic, buff, N_TRAIN_TRUE_CRITIC)
            true_critic_sample = true_critic._policy.critic
            approx_critic = true_critic
            update_buffer(buff, approx_critic)
            state, *_ = buff.get_buffer()
            with tf.GradientTape() as tape:
                action = trained_actor(state)
                q_value, _ = true_critic_sample(tf.concat([state, action], axis=1))
                actor_loss = -tf.reduce_mean(q_value)
            gradients_true = tape.gradient(actor_loss, trained_actor.trainable_variables)
            gradients_true = [tf.reshape(x, [-1]) for x in gradients_true]
            simil_avg = []
            for _ in range(10):
                state, *_ = buff.sample(batch_size)
                with tf.GradientTape() as tape:
                    action = trained_actor(state)
                    q_value, _ = approx_critic._policy.critic(tf.concat([state, action], axis=1))
                    actor_loss = -tf.reduce_mean(q_value)
                gradients_sample = tape.gradient(actor_loss, trained_actor.trainable_variables)
                gradients_sample = [tf.reshape(x, [-1]) for x in gradients_sample]
                sims = [-simil_metric(x, y) for x, y in zip(gradients_true, gradients_sample)]
                sims = tf.reduce_mean(sims)
                simil_avg.append(sims.numpy())
            simil_list.append(np.mean(simil_avg))
        ret.append(simil_list)
    np.save(f'simil_list_{cnf_old.agent.sub_per}_{cnf_old.buffer.alpha}.npy', ret)
Example #7
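Training loop for the flat agent: noisy action selection, replay insertion, training every train_every steps once start_timesteps has passed, and periodic evaluation with optional model saving.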
def main(cnf):
    env, agent = create_world(cnf)
    cnf = cnf.main
    # create objects
    logger = Logger(cnf.log, cnf.time_limit)
    # Load previously trained model.
    if cnf.load_model:
        agent.load_model(f'./experiments/models/{agent._file_name}')
    # Training loop
    state, done = env.reset(), False
    for t in range(int(cnf.max_timesteps)):
        action = agent.select_noisy_action(state)
        maybe_verbose_output(t, agent, env, action, cnf, state)
        next_state, reward, done, _ = env.step(action)
        intr_rew = agent.replay_add(state, action, reward, next_state, done)
        if t > cnf.start_timesteps and not t % cnf.train_every:
            agent.train(t)
        state = next_state
        logger.inc(t, reward)

        if done:
            print(
                f"Total T: {t+1} Episode Num: {logger.episode_num+1} Episode T: {logger.episode_timesteps} Reward: {logger.episode_reward}"
            )
            # Reset environment
            agent.reset()
            hard_reset = logger.log(t, intr_rew)
            logger.reset()
            state, done = env.reset(), False
        # Evaluate episode
        if (t + 1) % cnf.eval_freq == 0:
            avg_ep_rew, avg_intr_rew, success_rate = agent.evaluation(env)
            state, done = env.reset(), False
            agent.reset()
            logger.reset(post_eval=True)
            logger.log_eval(t, avg_ep_rew, avg_intr_rew, success_rate)
            if cnf.save_model:
                agent.save_model(f'./experiments/models/{agent._file_name}')
Example #8
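Standalone script that creates the CoppeliaSim (V-REP) world and, for ten episodes, drives action dimension 4 to its maximum for the first 150 steps and to its minimum afterwards while zeroing all other dimensions, printing state entry 7 at every step.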
import numpy as np
import tensorflow as tf
from environments.coppeliagym import CoppeliaEnv
from utils.utils import setup, create_world
import collections

args = ['--ee_j_pos', '--vrep', '--render']
args = setup(args)
env, agent = create_world(args)

for eps in range(10):
    state = env.reset()
    env.render()
    for t in range(args.time_limit):
        if t < 150:
            action = env.action_space.high.copy()
        else:
            action = env.action_space.low.copy()
        # Zero every action dimension except index 4, so only that joint is driven;
        # .copy() above avoids np.putmask mutating the action-space bounds in place.
        np.putmask(action, [1, 1, 1, 1, 0, 1, 1, 1], 0)
        next_state, *_ = env.step(action)
        print(next_state[7])