Example #1
                loss = loss_function(prediction, q_target)
                gradients = tape.gradient(loss,
                                          agent.model.trainable_variables)
                optimizer.apply_gradients(
                    zip(gradients, agent.model.trainable_variables))

        new_weights = agent.model.get_weights()

        # set new weights
        agent.set_weights(new_weights)
        manager.set_agent(new_weights)
        # get new weights
        agent = manager.get_agent()
        # update aggregator
        time_steps = manager.test(test_steps)
        manager.update_aggregator(loss=loss, time_steps=time_steps)
        # print progress
        print(
            f"epoch ::: {e}  loss ::: {loss}   avg env steps ::: {np.mean(time_steps)}"
        )

        # you can also alter your manager's parameters
        if e % 5 == 0:
            epsilon = epsilon * .9
            manager.set_epsilon(epsilon=epsilon)
            print(f"New epsilon: {epsilon}")

        # if e % saving_after == 0:
        #     #you can save models
        #     manager.save_model(saving_path, e)
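
Example #1 assumes that prediction and q_target were computed earlier in the truncated loop. Purely as a sketch (not taken from the example), a DQN-style Bellman target and a matching loss_function could be set up as below; the model(state_new) call returning per-action Q-values is an assumption about the interface:

import tensorflow as tf

# a standard Keras MSE loss can play the role of loss_function above
loss_function = tf.keras.losses.MeanSquaredError()

def compute_q_target(model, reward, state_new, not_done, gamma=0.99):
    # Bellman target r + gamma * max_a Q(s', a); the bootstrap term is
    # masked out on terminal transitions via not_done.
    q_new = tf.reduce_max(model(state_new), axis=-1)
    return reward + gamma * not_done * q_new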
Example #2
        manager.set_agent(agent.get_weights())

        print('TEST')

        # Update aggregator
        steps, current_rewards = manager.test(
            max_steps=1000,
            test_episodes=10,
            render=False,
            evaluation_measure="time_and_reward",
        )

        #if (e+1) % 5 == 0:
        manager.test(max_steps=1000, test_episodes=1, render=True)
        manager.update_aggregator(loss=losses,
                                  reward=current_rewards,
                                  time=steps)

        # Collect all rewards
        rewards.extend(current_rewards)
        # Average reward over last 100 episodes
        avg_reward = sum(rewards[-100:]) / min(len(rewards), 100)

        # Print progress
        print(
            f"epoch ::: {e}  loss ::: {np.mean(losses)}   avg_current_reward ::: {np.mean(current_rewards)}   avg_reward ::: {avg_reward}   avg_timesteps ::: {np.mean(steps)}"
        )

        if avg_reward > env.spec.reward_threshold:
            print(f'\n\nEnvironment solved after {e+1} episodes!')
            # Save model
Example #3
                # positive critic loss for gradient descent with MSE
                critic_loss = tf.reduce_mean((mc - agent.v(state))**2)

            critic_gradients = tape.gradient(
                critic_loss, agent.model.critic.trainable_variables)
            optimizer.apply_gradients(
                zip(critic_gradients, agent.model.critic.trainable_variables))

            total_loss += actor_loss + critic_loss

            # Update the agent
            manager.set_agent(agent.get_weights())
            agent = manager.get_agent()

        reward = manager.test(test_steps, evaluation_measure="reward")
        manager.update_aggregator(loss=total_loss, reward=reward)
        # print progress
        print(
            f"epoch ::: {e}  loss ::: {total_loss}   avg reward ::: {np.mean(reward)}"
        )

        if e % saving_after == 0:
            # you can save models
            manager.save_model(saving_path, e)

    # and load models
    manager.load_model(saving_path)
    print("done")
    print("testing optimized agent")
    manager.test(test_steps, test_episodes=10, render=True)
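
The actor_loss added to total_loss in Example #3 is computed above the excerpt and is not shown. A common choice with Monte-Carlo returns is a policy-gradient loss weighted by the advantage mc - V(s); a minimal, self-contained sketch, where the log-probability argument is an assumption about the agent's output:

import tensorflow as tf

def actor_loss_mc(log_prob, mc_return, value_estimate):
    # Policy-gradient loss with a Monte-Carlo advantage; the critic only acts
    # as a baseline here, so no gradient flows through the value estimate.
    advantage = tf.stop_gradient(mc_return - value_estimate)
    return -tf.reduce_mean(log_prob * advantage)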
Example #4
        data = manager.get_data()
        manager.store_in_buffer(data)

        # sample data to optimize on from buffer
        experience_dict = manager.sample(sample_size)

        print('optimizing...')
        for states, actions, rewards, states_new, not_dones in zip(
                *[experience_dict[k] for k in optim_keys]):
            train_step(agent.model, states, actions, rewards, states_new,
                       not_dones, learning_rate, gamma)

        # set new weights, get optimized agent
        manager.set_agent(agent.model.get_weights())

        # update aggregator
        time_steps, reward_agg = manager.test(
            test_steps, evaluation_measure='time_and_reward')
        manager.update_aggregator(time_steps=time_steps, rewards=reward_agg)

        if e % saving_after == 0:
            show_q(agent.model, e, saving_path, env_kwargs['action_dict'])

        print(
            f"epoch ::: {e}    avg env steps ::: {np.mean(time_steps)}    avg reward ::: {np.mean(reward_agg)}"
        )

    print('done')
    print('testing optimized agent')
    manager.test(test_steps, render=True, evaluation_measure='time_and_reward')
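
The train_step helper that Example #4 calls is defined elsewhere and not shown. A sketch of a DQN-style update with the same signature might look as follows; it assumes model(states) returns Q-values of shape (batch, num_actions):

import tensorflow as tf

def train_step(model, states, actions, rewards, states_new, not_dones,
               learning_rate, gamma):
    # One gradient step on the squared Bellman error.
    targets = rewards + gamma * not_dones * tf.reduce_max(model(states_new),
                                                          axis=-1)
    with tf.GradientTape() as tape:
        q_values = model(states)
        taken = tf.one_hot(tf.cast(actions, tf.int32),
                           depth=tf.shape(q_values)[-1])
        q_taken = tf.reduce_sum(q_values * taken, axis=-1)
        loss = tf.reduce_mean(tf.square(targets - q_taken))
    gradients = tape.gradient(loss, model.trainable_variables)
    # plain SGD keeps the sketch stateless; a persistent optimizer would
    # normally be created once outside this function and reused
    tf.keras.optimizers.SGD(learning_rate).apply_gradients(
        zip(gradients, model.trainable_variables))
    return loss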
Example #5
def train_td3(args, model, action_dimension=None):

    print(args)

    tf.keras.backend.set_floatx('float32')

    ray.init(log_to_driver=False)

    # hyper parameters
    buffer_size = args.buffer_size  # 10e6 in their repo, not possible with our RAM
    epochs = args.epochs
    saving_path = os.getcwd() + "/" + args.saving_dir
    saving_after = 5
    sample_size = args.sample_size
    optim_batch_size = args.batch_size
    gamma = args.gamma
    test_steps = 100  # 1000 in their repo
    policy_delay = 2
    rho = .046
    policy_noise = args.policy_noise
    policy_noise_clip = .5
    msg_dim = args.msg_dim  # 32 in their repo
    learning_rate = args.learning_rate

    save_args(args, saving_path)

    env_test_instance = gym.make('BipedalWalker-v3')
    if action_dimension is None:
        action_dimension = copy(env_test_instance.action_space.shape[0])
    model_kwargs = {
        # action dimension for modular actions
        'action_dimension': action_dimension,
        'min_action': copy(env_test_instance.action_space.low)[0],
        'max_action': copy(env_test_instance.action_space.high)[0],
        'msg_dimension': msg_dim,
        'fix_sigma': True,
        'hidden_units': args.hidden_units
    }
    del env_test_instance

    manager = SampleManager(model,
                            'BipedalWalker-v3',
                            num_parallel=(os.cpu_count() - 1),
                            total_steps=150,
                            action_sampling_type="continuous_normal_diagonal",
                            is_tf=True,
                            model_kwargs=model_kwargs)

    optim_keys = [
        'state',
        'action',
        'reward',
        'state_new',
        'not_done',
    ]

    manager.initialize_buffer(buffer_size, optim_keys)

    manager.initialize_aggregator(path=saving_path,
                                  saving_after=saving_after,
                                  aggregator_keys=["loss", "reward"])

    agent = manager.get_agent()

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    # fill buffer
    print("Filling buffer before training..")
    while len(manager.buffer.buffer[
            manager.buffer.keys[0]]) < manager.buffer.size:
        # Gives you state action reward trajectories
        data = manager.get_data()
        manager.store_in_buffer(data)

    # track time while training
    timer = time.time()
    last_t = timer

    target_agent = manager.get_agent()
    for e in range(epochs):
        # off policy
        sample_dict = manager.sample(sample_size, from_buffer=True)
        print(f"collected data for: {sample_dict.keys()}")

        # cast values to float32 and create data dict
        sample_dict['state'] = tf.cast(sample_dict['state'], tf.float32)
        sample_dict['action'] = tf.cast(sample_dict['action'], tf.float32)
        sample_dict['reward'] = tf.cast(sample_dict['reward'], tf.float32)
        sample_dict['state_new'] = tf.cast(sample_dict['state_new'],
                                           tf.float32)
        sample_dict['not_done'] = tf.cast(sample_dict['not_done'], tf.float32)
        data_dict = dict_to_dict_of_datasets(sample_dict,
                                             batch_size=optim_batch_size)

        total_loss = 0
        for state, action, reward, state_new, not_done in \
                zip(data_dict['state'],
                    data_dict['action'],
                    data_dict['reward'],
                    data_dict['state_new'],
                    data_dict['not_done']):

            action_new = target_agent.act(state_new)
            # add noise to action_new
            action_new = action_new + tf.clip_by_value(
                tf.random.normal(action_new.shape, 0., policy_noise),
                -policy_noise_clip, policy_noise_clip)
            # clip action_new to action space
            action_new = tf.clip_by_value(
                action_new, manager.env_instance.action_space.low,
                manager.env_instance.action_space.high)

            # calculate target with double-Q-learning
            state_action_new = tf.concat([state_new, action_new], axis=-1)
            q_values0 = target_agent.model.critic0(state_action_new)
            q_values1 = target_agent.model.critic1(state_action_new)
            q_values = tf.concat([q_values0, q_values1], axis=-1)
            q_targets = tf.squeeze(tf.reduce_min(q_values, axis=-1))
            critic_target = reward + gamma * not_done * q_targets

            state_action = tf.concat([state, action], axis=-1)

            # update critic 0
            with tf.GradientTape() as tape:
                q_output = agent.model.critic0(state_action)
                loss = tf.keras.losses.MSE(tf.squeeze(critic_target),
                                           tf.squeeze(q_output))

            total_loss += loss
            gradients = tape.gradient(loss,
                                      agent.model.critic0.trainable_variables)
            optimizer.apply_gradients(
                zip(gradients, agent.model.critic0.trainable_variables))

            # update critic 1
            with tf.GradientTape() as tape:
                q_output = agent.model.critic1(state_action)
                loss = tf.keras.losses.MSE(tf.squeeze(critic_target),
                                           tf.squeeze(q_output))

            total_loss += loss
            gradients = tape.gradient(loss,
                                      agent.model.critic1.trainable_variables)
            optimizer.apply_gradients(
                zip(gradients, agent.model.critic1.trainable_variables))

            # update actor with delayed policy update
            if e % policy_delay == 0:
                with tf.GradientTape() as tape:
                    actor_output = agent.model.actor(state)
                    action = reparam_action(actor_output,
                                            agent.model.action_dimension,
                                            agent.model.min_action,
                                            agent.model.max_action)
                    state_action = tf.concat([state, action], axis=-1)
                    q_val = agent.model.critic0(state_action)
                    actor_loss = -tf.reduce_mean(q_val)

                total_loss += actor_loss
                actor_gradients = tape.gradient(
                    actor_loss, agent.model.actor.trainable_variables)
                optimizer.apply_gradients(
                    zip(actor_gradients,
                        agent.model.actor.trainable_variables))

            # Update agent
            manager.set_agent(agent.get_weights())
            agent = manager.get_agent()

            if e % policy_delay == 0:
                # Polyak averaging, applied per weight tensor
                new_weights = [
                    rho * target_w + (1. - rho) * agent_w
                    for target_w, agent_w in zip(target_agent.get_weights(),
                                                 agent.get_weights())
                ]
                target_agent.set_weights(new_weights)

        reward = manager.test(test_steps, evaluation_measure="reward")
        manager.update_aggregator(loss=total_loss, reward=reward)
        print(
            f"epoch ::: {e}  loss ::: {total_loss}   avg reward ::: {np.mean(reward)}"
        )

        if e % saving_after == 0:
            manager.save_model(saving_path, e)

        # needed time and remaining time estimation
        current_t = time.time()
        time_needed = (current_t - last_t) / 60.
        time_remaining = (current_t - timer) / 60. / (e + 1) * (epochs - (e + 1))
        print(
            'Finished epoch %d of %d. Needed %.1f min for this epoch. Estimated time remaining: %.1f min'
            % (e + 1, epochs, time_needed, time_remaining))
        last_t = current_t

    manager.load_model(saving_path)
    print("done")
    print("testing optimized agent")
    manager.test(test_steps, test_episodes=10, render=True)

    ray.shutdown()
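
Example #5 relies on a dict_to_dict_of_datasets helper that is not shown in the excerpt. A plausible sketch of such a utility, which batches every entry of the sample dictionary as its own tf.data.Dataset so the loop above can zip over aligned mini-batches, is:

import tensorflow as tf

def dict_to_dict_of_datasets(sample_dict, batch_size=64):
    # Wrap every key of the sample dictionary in its own batched dataset.
    return {
        key: tf.data.Dataset.from_tensor_slices(value).batch(batch_size)
        for key, value in sample_dict.items()
    }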
Example #6
        # iterating through dataset
        old_table = agent.get_weights()
        for s, a, r, n in zip(sample_dict['state'], sample_dict['action'],
                              sample_dict['reward'], sample_dict['state_new']):

            s_x, s_y = s  # unpacking state
            n_x, n_y = n  # unpacking new state

            # Apply Q-learning formula
            old_table[a, s_x, s_y] += alpha * (
                r + gamma * np.max(old_table[:, n_x, n_y])
                - old_table[a, s_x, s_y])

        # set new weights
        manager.set_agent(old_table)

        # get new weights
        agent = manager.get_agent()

        time_steps = manager.test(test_steps)

        # update aggregator
        manager.update_aggregator(time_steps=time_steps)

        print(f"epoch ::: {e}  avg env steps ::: {np.mean(time_steps)}")

    print("Done!")

    print("Testing optimized agent...")
    manager.test(test_steps, test_episodes=3, render=True)
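
The table update in Example #6 is the standard tabular Q-learning rule (the code above indexes the table as (action, x, y)); written out:

Q(s,a) \leftarrow Q(s,a) + \alpha \left[ r + \gamma \max_{a'} Q(s',a') - Q(s,a) \right]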
Example #7
        # TODO: optimize agent

        dummy_losses = [
            np.mean(np.random.normal(size=(64, 100)), axis=0)
            for _ in range(1000)
        ]

        new_weights = agent.model.get_weights()

        # set new weights
        manager.set_agent(new_weights)
        # get new weights
        agent = manager.get_agent()
        # update aggregator
        time_steps = manager.test(test_steps)
        manager.update_aggregator(loss=dummy_losses, time_steps=time_steps)
        # print progress
        print(
            f"epoch ::: {e}  loss ::: {np.mean([np.mean(l) for l in dummy_losses])}   avg env steps ::: {np.mean(time_steps)}"
        )

        # you can also alter your manager's parameters
        manager.set_epsilon(epsilon=0.99)

        if e % saving_after == 0:
            # you can save models
            manager.save_model(saving_path, e)

    # and load models
    manager.load_model(saving_path)
    print("done")
                manager.set_agent(new_weights)

            # get new agent
            agent = manager.get_agent()

            # update aggregator
            time_steps, reward = manager.test(
                test_steps,
                render=(e % 10 == 0),
                evaluation_measure="time_and_reward")
            #print("time_steps: ", len(time_steps))
            #print("reward: ", len(reward))
            #print("actor_loss:", len(actor_losses))
            #print("critic_loss: ", len(critic_losses))
            manager.update_aggregator(actor_loss=actor_losses,
                                      time_steps=time_steps,
                                      reward=reward)
            print(
                f"epoch ::: {e}  actor_loss ::: {np.mean([np.mean(l) for l in actor_losses])}  avg env steps ::: {np.mean(time_steps)}   avg reward ::: {np.mean(reward)}"
            )
            if e % saving_after == 0:
                manager.save_model(saving_path, e)
            #manager.agg.save_graphic()
            print("---")

    manager.load_model(saving_path)
    print("done")
    print("testing optimized agent")
    manager.test(test_steps,
                 test_episodes=10,
                 render=True,
Example #9
            # TODO: optimize agent
            old_q[h, w, a] += error
            agent.set_weights(old_q)

        # Set new weights
        manager.set_agent(agent.get_weights())

        # Update aggregator
        steps = manager.test(
            max_steps=100,
            test_episodes=10,
            render=True,
            evaluation_measure="time",
        )
        manager.update_aggregator(error=error_aggregator, steps=steps)
        # Print progress
        print(
            f"epoch ::: {e}  error ::: {np.mean(error_aggregator)}   avg_timesteps ::: {np.mean(steps)}"
        )
        agent.model.print_optimal(action_dict)

    print("testing optimized agent")
    manager.test(
        max_steps=100,
        test_episodes=10,
        render=True,
        do_print=True,
        evaluation_measure="time",
    )
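
How the error term in Example #9 is computed falls above the excerpt. Under standard tabular Q-learning it would be the TD error for the visited cell; a sketch assuming the (height, width, action) table layout used above:

import numpy as np

def td_error(q_table, h, w, a, reward, h_new, w_new, gamma):
    # TD error for a grid cell indexed as (height, width, action).
    return reward + gamma * np.max(q_table[h_new, w_new, :]) - q_table[h, w, a]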
Example #10
        for s, a, r, ns, nd in dataset:

            # ensure that the datasets have at least 10 elements
            # otherwise we run into problems with the MSE loss
            if len(s) >= 10:
                loss = train_q_network(agent, s, a, r, ns, nd, optimizer)
                losses.append(loss)

        print(f'average loss: {np.mean(losses)}')

        # update the weights of the manager
        manager.set_agent(agent.get_weights())

        print('# ================= validation ================== #')
        render = e % RENDER_EPISODES == 0
        time_steps, rewards = manager.test(MAX_TEST_STEPS, TEST_EPISODES, evaluation_measure='time_and_reward', render=render, do_print=False)
        manager.update_aggregator(loss=losses, time_steps=time_steps, reward=rewards)
        print(f'average reward:     {np.mean(rewards)}')
        print(f'average time steps: {np.mean(time_steps)}')

        if e % SAVE_EPISODES == 0:
            print('# ================= save model ================== #')
            agent.model.deep_q_net.save(os.path.join(saving_path_model, f'epoch_{e}'))

    print('# ============== TRAINING FINISHED ============== #')
    print('# ============== SAVE FINAL MODELS ============== #')
    agent.model.deep_q_net.save(os.path.join(saving_path_model, 'final'))

    print('# ================= FINAL TEST ================== #')
    manager.test(MAX_TEST_STEPS, 10, render=True, do_print=True, evaluation_measure='time_and_reward')
                loss_actor = -action_probs * advantage - entropy_coeff * entropy
                gradients_actor = tape.gradient(
                    loss_actor, agent.model.trainable_variables)
                optimizer_actor.apply_gradients(
                    zip(gradients_actor, agent.model.trainable_variables))

        # set new weights
        manager.set_agent(agent.get_weights())
        # get new agent
        agent = manager.get_agent()
        # update aggregator
        if epoch % 5 != 0:
            time_steps = manager.test(test_steps, test_episodes=10)
        else:
            time_steps = manager.test(test_steps,
                                      test_episodes=10,
                                      render=True)
        manager.update_aggregator(loss_critic=np.mean(loss_critic),
                                  loss_actor=np.mean(loss_actor),
                                  time_steps=time_steps)
        # print progress
        print(
            f"epoch ::: {epoch}  critic loss ::: {np.mean(loss_critic)} actor loss ::: {np.mean(loss_actor)} avg env steps ::: {np.mean(time_steps)}"
        )

    # and load models
    #manager.load_model(saving_path)
    print("done")
    print("testing optimized agent")
    manager.test(test_steps, test_episodes=10, render=True, do_print=True)
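
The final fragment regularizes the actor with an entropy bonus (the entropy_coeff * entropy term). For a categorical policy the entropy is H(pi) = -sum_a pi(a) log pi(a); a small sketch of how such a term could be computed from logits (the fragment does not show where entropy actually comes from, so the logits input is an assumption):

import tensorflow as tf

def categorical_entropy(logits):
    # Entropy of a categorical policy, computed from unnormalized logits
    # for numerical stability.
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    return -tf.reduce_sum(tf.exp(log_probs) * log_probs, axis=-1)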