Example #1
    def __init__(self, args, device, pop_size, elite_prop, load_pop_dir):
        if pop_size < 1:
            raise ValueError(
                "Population size has to be one or greater, otherwise this doesn't make sense"
            )
        self.pop_size = pop_size
        self.population = []  # a list of lists/generators of model parameters
        self.selected = []  # a buffer for the selected individuals
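        # Number of elite individuals to carry over each generation (at least 1).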
        self.to_select = max(1, int(self.pop_size * elite_prop))
        self.fitnesses = []
        self.reached = []
        self.instinct_average_list = []
        self.args = args

        # Mutation noise parameters (initial std, per-generation decay, floor), as the names suggest.
        self.sigma = 0.01
        self.sigma_decay = 0.999
        self.min_sigma = 0.001

        # If resuming a saved GA run, collect the files that hold the saved population
        if args.load_ga:
            saved_files = get_population_files(load_pop_dir)

        ref_env_name = register_set_goal(0)

        reference_envs = make_vec_envs(ref_env_name,
                                       np.random.randint(2**32),
                                       1,
                                       args.gamma,
                                       None,
                                       device,
                                       allow_early_resets=True,
                                       normalize=args.norm_vectors)
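        # NOTE: this single reference env seems to exist only so init_ppo can read the
        # observation/action spaces when constructing fresh policy models below.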

        for n in range(pop_size + self.to_select):
            if args.load_ga:
                file_idx = n % len(saved_files)
                start_model, start_lr = torch.load(saved_files[file_idx])
                print("Load individual from {}".format(saved_files[file_idx]))
            else:
                start_model = init_ppo(reference_envs, log(args.init_sigma))
                start_lr = args.lr

            ind = Individual(start_model, device, rank=n, learn_rate=start_lr)

            if n < self.pop_size:
                self.population.append(ind)
                self.fitnesses.append(0)
                self.reached.append(0)
                self.instinct_average_list.append(0)
            else:
                self.selected.append(ind)
            print("Built {} individuals out of {}".format(
                n, (pop_size + self.to_select)))
Example #2
                'rm_dist_to_nogo': args.rm_dist_to_nogo,
                'nogo_large': args.large_nogos}
    )

    envs = make_vec_envs(
        ENV_NAME, args.seed, 1, args.gamma, None, torch.device("cpu"), False
    )
    print("start the train function")

    ###### Load the saved model and the learning rate ######
    load_m = torch.load(
        "/Users/djgr/code/instincts/modular_rl/trained_models/pulled_from_server/second_phase_instinct/2_deterministic_goals/small_zones_NOdistance2zones_PPO/dist2nogo_individual_CTRL_731.pt"
    )
    init_model = load_m[0]
    learning_rate = load_m[1]
    #args.lr = learning_rate
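    # NOTE: the loaded model and learning rate are not actually used below; training
    # starts from a fresh init_ppo model with args.lr (see the commented-out lines).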
    init_sigma = args.init_sigma

    from math import log
    fitness = train_maml_like_ppo_(
        #init_model,
        init_ppo(envs, log(init_sigma)),
        args,
        args.lr,
        num_episodes=40,
        num_updates=200,
        run_idx=0,
    )
    print(fitness)
Example #3
        ob_rms = utils.get_vec_normalize(input_envs)
        if ob_rms is not None:
            ob_rms = ob_rms.ob_rms

        fits, info = evaluate(actor_critic, ob_rms, input_envs, NUM_PROC, device)
        fitnesses.append(fits)

    return fitnesses[-1], 0, 0


if __name__ == "__main__":
    args = get_args()
    env_name = register_set_goal(0)

    envs = make_vec_envs(
        env_name, args.seed, 1, args.gamma, None, torch.device("cpu"), False
    )
    print("start the train function")
    init_sigma = args.init_sigma
    init_model = init_ppo(envs, log(init_sigma))
    #init_model = torch.load("saved_model.pt")
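    # Either start from the freshly initialized model above, or uncomment the
    # torch.load line to warm-start from a previously saved model.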

    fitness = inner_loop_ppo(
        init_model,
        args,
        args.lr,
        num_steps=40000,
        num_updates=150,
    )

    print(fitness)
Example #4
def inner_loop_ppo(
    weights,
    args,
    learning_rate,
    num_steps,
    num_updates,
    run_idx,
    input_envs,
):

    torch.set_num_threads(1)
    device = torch.device("cpu")
    #print(input_envs.venv.spec._kwargs['config']['goal_locations'])
    #env_name = register_set_goal(run_idx)

    #envs = make_vec_envs(env_name, np.random.randint(2**32), NUM_PROC,
    #                     args.gamma, None, device, allow_early_resets=True, normalize=args.norm_vectors)
    actor_critic = init_ppo(input_envs, log(args.init_sigma))
    actor_critic.to(device)

    # apply the weights to the model
    apply_from_list(weights, actor_critic)


    agent = algo.PPO(
        actor_critic,
        args.clip_param,
        args.ppo_epoch,
        args.num_mini_batch,
        args.value_loss_coef,
        args.entropy_coef,
        lr=learning_rate,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(num_steps, NUM_PROC,
                              input_envs.observation_space.shape, input_envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = input_envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    fitnesses = []
    violation_cost = 0

    for j in range(num_updates):

        episode_step_counter = 0
        for step in range(num_steps):
            # Sample actions
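            # act() also returns a (final_action, _) tuple; final_action is what is
            # actually stepped in the env (presumably the instinct-modulated action).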
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states, (final_action, _) = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            # Observe reward and next obs
            obs, reward, done, infos = input_envs.step(final_action)
            episode_step_counter += 1

            # Count the cost
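            # (accumulate the safety-violation cost reported by the env and subtract
            # it from the reward, so PPO trains on reward minus cost)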
            total_reward = reward
            for info in infos:
                violation_cost += info['cost']
                total_reward -= info['cost']

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, total_reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # Grab observation-normalization statistics (if the vec env is wrapped in
        # VecNormalize) so evaluation uses the same input scaling as training.
        ob_rms = utils.get_vec_normalize(input_envs)
        if ob_rms is not None:
            ob_rms = ob_rms.ob_rms

        fits, info = evaluate(actor_critic, ob_rms, input_envs, NUM_PROC, device)
        fitnesses.append(fits)

    return fitnesses[-1], 0, 0
Example #5
    env_name = register_set_goal(0)
    # env_name = "Safexp-PointButton0-v0"

    envs = make_vec_envs(env_name, args.seed, 1, args.gamma, None,
                         torch.device("cpu"), False)
    print("start the train function")
    # parameters = torch.load(
    #    "/Users/djrg/code/instincts/modular_rl_safety_gym/trained_models/pulled_from_server/es_testing/x_spread_2_goal/9736443fff_0/saved_weights_gen_460.dat")
    # parameters = torch.load(
    #    "/Users/djrg/code/instincts/modular_rl_safety_gym/trained_models/pulled_from_server/es_testing/ce46f3e92f_0/saved_weights_gen_227.dat"
    # )
    # args.lr = 0.001 #parameters[-1][0]
    ##print(f"learning rate {args.lr}")
    # print(args.init_sigma)
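    # Hard-coded overrides of the command-line values for this particular run: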
    args.init_sigma = 0.6
    args.lr = 0.001
    blueprint_model = init_ppo(envs, log(args.init_sigma))
    parameters = get_model_weights(blueprint_model)
    parameters.append(np.array([args.lr]))
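    # The learning rate is appended as the last entry of the flat parameter list so
    # it is carried along with the network weights into inner_loop_ppo.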

    # plot_weight_histogram(parameters)

    fitness = inner_loop_ppo(parameters,
                             args,
                             args.lr,
                             num_steps=4000,
                             num_updates=100,
                             run_idx=CURRENT_GOAL,
                             inst_on=False,
                             visualize=False)