Example #1
def __init__(self, env, feature_str, network_path, device, **kwargs):
    self.env = env
    self.feature = feature_factory(feature_str)
    self.device = device
    self.network = None
    if network_path:
        self.load_network(network_path)
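The parameters above mirror the "pi_greedy" configuration assembled in Example #6 (feature_str, network_path, device), which suggests this is the constructor of PytorchFittedPolicy. A minimal usage sketch under that assumption; the feature identifier and paths are placeholders, with "policy.pt" being the file written by ftq.save_policy() in Examples #2 and #6:

# Hypothetical sketch: concrete values are illustrative only.
from pathlib import Path

pi = PytorchFittedPolicy(
    env=e,                                  # an environment created via envs_factory.generate_envs, as in the other examples
    feature_str="identity",                 # placeholder; must match the training-time feature
    network_path=Path("workspace") / "policy.pt",  # checkpoint written by ftq.save_policy()
    device="cpu",
)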
Example #2
def main(load_memory, generate_envs, feature_str, gamma, ftq_params, ftq_net_params, device, normalize_reward,
         workspace, seed,
         lambda_=0., **args):
    envs, params = envs_factory.generate_envs(**generate_envs)
    e = envs[0]
    set_seed(seed, e)

    feature = feature_factory(feature_str)

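    # Build a fresh fitted-Q learner; the policy network input size comes from the feature of a reset state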
    ftq = PytorchFittedQ(
        device=device,
        policy_network=NetFTQ(n_in=len(feature(e.reset(), e)), n_out=e.action_space.n, **ftq_net_params),
        action_str=None if not hasattr(e, "action_str") else e.action_str,
        test_policy=None,
        gamma=gamma,
        **ftq_params
    )

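    # Load the pre-collected replay memory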
    rm = Memory()
    rm.load_memory(**load_memory)

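    # Convert the stored samples into FTQ transitions and fit on the full batch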
    transitions_ftq, _ = urpy.datas_to_transitions(rm.memory, e, feature, lambda_, normalize_reward)
    logger.info("[learning ftq with full batch] #samples={} ".format(len(transitions_ftq)))
    ftq.reset(True)
    ftq.workspace = workspace
    makedirs(ftq.workspace)
    ftq.fit(transitions_ftq)
    ftq.save_policy()
Example #3
def __init__(self,
             env,
             feature_str,
             network_path,
             betas_for_discretisation,
             device,
             hull_options,
             clamp_Qc=False,
             **kwargs):
    self.env = env
    self.feature = feature_factory(feature_str)
    self.betas_for_discretisation = betas_for_discretisation
    self.device = device
    self.network = None
    self.hull_options = hull_options
    self.clamp_Qc = clamp_Qc
    if network_path:
        self.load_network(network_path)
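These parameters match the pi_config built in Example #7 for PytorchBudgetedFittedPolicy, which suggests this is that class's constructor. A minimal usage sketch under that assumption; every concrete value below is a placeholder (the 31-point budget grid simply echoes Example #8):

# Hypothetical sketch: concrete values are illustrative only.
import numpy as np
from pathlib import Path

pi = PytorchBudgetedFittedPolicy(
    env=e,                                            # an environment created via envs_factory.generate_envs
    feature_str="identity",                           # placeholder; must match the training-time feature
    network_path=Path("workspace") / "policy.pt",     # checkpoint written by bftq.save_policy()
    betas_for_discretisation=np.linspace(0, 1, 31),   # budget grid used for the convex hulls
    device="cpu",
    hull_options={},                                  # placeholder; normally taken from the "general" config
    clamp_Qc=False,
)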
Example #4
def main(load_memory, generate_envs, feature_str, gamma, gamma_c, bftq_params,
         bftq_net_params, workspace, seed, device, normalize_reward, general,
         **args):
    logger = logging.getLogger(__name__)

    envs, params = envs_factory.generate_envs(**generate_envs)
    e = envs[0]
    e.reset()
    set_seed(seed, e)
    feature = feature_factory(feature_str)

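    # Build the budgeted fitted-Q learner (gamma discounts rewards, gamma_c discounts the constraint-cost signal)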
    bftq = PytorchBudgetedFittedQ(
        device=device,
        workspace=workspace,
        actions_str=get_actions_str(e),
        policy_network=NetBFTQ(size_state=len(feature(e.reset(), e)),
                               n_actions=e.action_space.n,
                               **bftq_net_params),
        gamma=gamma,
        gamma_c=gamma_c,
        split_batches=general["gpu"]["split_batches"],
        cpu_processes=general["cpu"]["processes"],
        env=e,
        **bftq_params)

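    # Prepare the workspace and load the pre-collected replay memory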
    makedirs(workspace)
    rm = Memory()
    rm.load_memory(**load_memory)

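    # Discard the plain FTQ transitions and fit BFTQ on the full batch of budgeted transitions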
    _, transitions_bftq = urpy.datas_to_transitions(rm.memory, e, feature, 0,
                                                    normalize_reward)
    logger.info("[learning bftq with full batch] #samples={} ".format(
        len(transitions_bftq)))

    bftq.reset(True)
    _ = bftq.fit(transitions_bftq)

    bftq.save_policy()
Example #5
def main(generate_envs, feature_str, betas_for_exploration, gamma, gamma_c,
         bftq_params, bftq_net_params, N_trajs, workspace, seed, device,
         normalize_reward, trajs_by_ftq_batch, epsilon_decay, general, **args):
    # Prepare BFTQ
    envs, params = envs_factory.generate_envs(**generate_envs)
    e = envs[0]
    set_seed(seed, e)
    rm = Memory()
    feature = feature_factory(feature_str)

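    # Each batch retrains BFTQ from scratch on the replay memory accumulated so far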
    def build_fresh_bftq():
        bftq = PytorchBudgetedFittedQ(
            device=device,
            workspace=workspace / "batch=0",
            actions_str=get_actions_str(e),
            policy_network=NetBFTQ(size_state=len(feature(e.reset(), e)),
                                   n_actions=e.action_space.n,
                                   **bftq_net_params),
            gamma=gamma,
            gamma_c=gamma_c,
            cpu_processes=general["cpu"]["processes"],
            env=e,
            split_batches=general["gpu"]["split_batches"],
            hull_options=general["hull_options"],
            **bftq_params)
        return bftq

    # Prepare learning
    i_traj = 0
    decays = math_utils.epsilon_decay(**epsilon_decay,
                                      N=N_trajs,
                                      savepath=workspace)
    betas_for_exploration = np.array(eval(betas_for_exploration))
    memory_by_batch = [get_current_memory()]
    batch_sizes = near_split(N_trajs, size_bins=trajs_by_ftq_batch)
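    # Exploration policy: epsilon-greedy over a random budgeted policy; pi_greedy is swapped for the learned BFTQ policy after each batch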
    pi_epsilon_greedy_config = {
        "__class__": repr(EpsilonGreedyPolicy),
        "pi_greedy": {
            "__class__": repr(RandomBudgetedPolicy)
        },
        "pi_random": {
            "__class__": repr(RandomBudgetedPolicy)
        },
        "epsilon": decays[0],
        "hull_options": general["hull_options"],
        "clamp_Qc": bftq_params["clamp_Qc"]
    }

    # Main loop
    trajs = []
    for batch, batch_size in enumerate(batch_sizes):
        # Prepare workers
        cpu_processes = min(
            general["cpu"]["processes_when_linked_with_gpu"] or os.cpu_count(),
            batch_size)
        workers_n_trajectories = near_split(batch_size, cpu_processes)
        workers_start = np.cumsum(workers_n_trajectories)
        workers_traj_indexes = [
            np.arange(*times) for times in zip(
                np.insert(workers_start[:-1], 0, 0), workers_start)
        ]
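        # Give each worker its own slice of trajectory indexes, exploration betas, seed and epsilon schedule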
        if betas_for_exploration.size:
            workers_betas = [
                betas_for_exploration.take(indexes, mode='wrap')
                for indexes in workers_traj_indexes
            ]
        else:
            workers_betas = [
                np.random.random(indexes.size)
                for indexes in workers_traj_indexes
            ]
        workers_seeds = np.random.randint(0, 10000, cpu_processes).tolist()
        workers_epsilons = [
            decays[i_traj + indexes] for indexes in workers_traj_indexes
        ]
        workers_params = list(
            zip_with_singletons(generate_envs, pi_epsilon_greedy_config,
                                workers_seeds, gamma, gamma_c,
                                workers_n_trajectories, workers_betas,
                                workers_epsilons, None, general["dictConfig"]))

        # Collect trajectories
        logger.info(
            "Collecting trajectories with {} workers...".format(cpu_processes))
        if cpu_processes == 1:
            results = []
            for params in workers_params:
                results.append(execute_policy_from_config(*params))
        else:
            with Pool(processes=cpu_processes) as pool:
                results = pool.starmap(execute_policy_from_config,
                                       workers_params)
        i_traj += sum([len(trajectories) for trajectories, _ in results])

        # Fill memory
        for trajectories, _ in results:
            for trajectory in trajectories:
                for sample in trajectory:
                    rm.push(*sample)

        transitions_ftq, transition_bftq = datas_to_transitions(
            rm.memory, e, feature, 0, normalize_reward)

        # Fit model
        logger.info(
            "[BATCH={}]---------------------------------------".format(batch))
        logger.info(
            "[BATCH={}][learning bftq pi greedy] #samples={} #traj={}".format(
                batch, len(transition_bftq), i_traj))
        logger.info(
            "[BATCH={}]---------------------------------------".format(batch))
        bftq = build_fresh_bftq()
        bftq.reset(True)
        bftq.workspace = workspace / "batch={}".format(batch)
        makedirs(bftq.workspace)
        if isinstance(e, EnvGridWorld):
            for trajectories, _ in results:
                for traj in trajectories:
                    trajs.append(traj)

            w = World(e)
            w.draw_frame()
            w.draw_lattice()
            w.draw_cases()
            w.draw_source_trajectories(trajs)
            w.save((bftq.workspace / "bftq_on_2dworld_sources").as_posix())
        q = bftq.fit(transition_bftq)

        # Save policy
        network_path = bftq.save_policy()
        os.system("cp {}/policy.pt {}/policy.pt".format(
            bftq.workspace, workspace))

        # Save memory
        save_memory(bftq, memory_by_batch, by_batch=False)

        # Update greedy policy
        pi_epsilon_greedy_config["pi_greedy"] = {
            "__class__": repr(PytorchBudgetedFittedPolicy),
            "feature_str": feature_str,
            "network_path": network_path,
            "betas_for_discretisation": bftq.betas_for_discretisation,
            "device": bftq.device,
            "hull_options": general["hull_options"],
            "clamp_Qc": bftq_params["clamp_Qc"]
        }

        if isinstance(e, EnvGridWorld):

            def pi(state, beta):
                import torch
                from ncarrara.budgeted_rl.bftq.pytorch_budgeted_fittedq import convex_hull, \
                    optimal_pia_pib
                with torch.no_grad():
                    hull = convex_hull(s=torch.tensor([state],
                                                      device=device,
                                                      dtype=torch.float32),
                                       Q=q,
                                       action_mask=np.zeros(e.action_space.n),
                                       id="run_" + str(state),
                                       disp=False,
                                       betas=bftq.betas_for_discretisation,
                                       device=device,
                                       hull_options=general["hull_options"],
                                       clamp_Qc=bftq_params["clamp_Qc"])
                    opt, _ = optimal_pia_pib(beta=beta,
                                             hull=hull,
                                             statistic={})
                return opt

            def qr(state, a, beta):
                import torch
                s = torch.tensor([[state]], device=device)
                b = torch.tensor([[[beta]]], device=device)
                sb = torch.cat((s, b), dim=2)
                return q(sb).squeeze()[a]

            def qc(state, a, beta):
                import torch
                s = torch.tensor([[state]], device=device)
                b = torch.tensor([[[beta]]], device=device)
                sb = torch.cat((s, b), dim=2)
                return q(sb).squeeze()[e.action_space.n + a]

            w = World(e, bftq.betas_for_discretisation)
            w.draw_frame()
            w.draw_lattice()
            w.draw_cases()
            w.draw_policy_bftq(pi, qr, qc, bftq.betas_for_discretisation)
            w.save((bftq.workspace / "bftq_on_2dworld").as_posix())

    save_memory(bftq, memory_by_batch, by_batch=True)
Example #6
def main(generate_envs,
         feature_str,
         gamma,
         gamma_c,
         ftq_params,
         ftq_net_params,
         device,
         epsilon_decay,
         N_trajs,
         trajs_by_ftq_batch,
         normalize_reward,
         workspace,
         seed,
         save_memory,
         general,
         lambda_=0,
         **args):
    envs, params = envs_factory.generate_envs(**generate_envs)
    e = envs[0]
    set_seed(seed, e)
    rm = Memory()
    feature = feature_factory(feature_str)

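    # Each batch retrains FTQ from scratch on the replay memory accumulated so far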
    def build_fresh_ftq():
        ftq = PytorchFittedQ(
            device=device,
            policy_network=NetFTQ(n_in=len(feature(e.reset(), e)),
                                  n_out=e.action_space.n,
                                  **ftq_net_params),
            action_str=None if not hasattr(e, "action_str") else e.action_str,
            test_policy=None,
            gamma=gamma,
            **ftq_params)
        return ftq

    # Prepare learning
    i_traj = 0
    decays = math_utils.epsilon_decay(**epsilon_decay,
                                      N=N_trajs,
                                      savepath=workspace)
    batch_sizes = near_split(N_trajs, size_bins=trajs_by_ftq_batch)
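    # Exploration policy: epsilon-greedy over a random policy; pi_greedy is swapped for the learned FTQ policy after each batch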
    pi_epsilon_greedy_config = {
        "__class__": repr(EpsilonGreedyPolicy),
        "pi_greedy": {
            "__class__": repr(RandomPolicy)
        },
        "pi_random": {
            "__class__": repr(RandomPolicy)
        },
        "epsilon": decays[0]
    }

    # Main loop
    trajs = []
    for batch, batch_size in enumerate(batch_sizes):
        # Prepare workers
        cpu_processes = min(
            general["cpu"]["processes_when_linked_with_gpu"] or os.cpu_count(),
            batch_size)
        workers_n_trajectories = near_split(batch_size, cpu_processes)
        workers_start = np.cumsum(workers_n_trajectories)
        workers_traj_indexes = [
            np.arange(*times) for times in zip(
                np.insert(workers_start[:-1], 0, 0), workers_start)
        ]
        workers_seeds = np.random.randint(0, 10000, cpu_processes).tolist()
        workers_epsilons = [
            decays[i_traj + indexes] for indexes in workers_traj_indexes
        ]
        workers_params = list(
            zip_with_singletons(generate_envs, pi_epsilon_greedy_config,
                                workers_seeds, gamma, gamma_c,
                                workers_n_trajectories, None, workers_epsilons,
                                None, general["dictConfig"]))

        # Collect trajectories
        logger.info(
            "Collecting trajectories with {} workers...".format(cpu_processes))
        if cpu_processes == 1:
            results = [execute_policy_from_config(*workers_params[0])]
        else:
            with Pool(processes=cpu_processes) as pool:
                results = pool.starmap(execute_policy_from_config,
                                       workers_params)
        i_traj += sum([len(trajectories) for trajectories, _ in results])

        # Fill memory
        for trajectories, _ in results:
            for trajectory in trajectories:
                for sample in trajectory:
                    rm.push(*sample)
        transitions_ftq, _ = datas_to_transitions(rm.memory, e, feature,
                                                  lambda_, normalize_reward)

        # Fit model
        logger.info(
            "[BATCH={}]---------------------------------------".format(batch))
        logger.info(
            "[BATCH={}][learning ftq pi greedy] #samples={} #traj={}".format(
                batch, len(transitions_ftq), i_traj))
        logger.info(
            "[BATCH={}]---------------------------------------".format(batch))
        ftq = build_fresh_ftq()
        ftq.reset(True)
        ftq.workspace = workspace / "batch={}".format(batch)
        makedirs(ftq.workspace)

        if isinstance(e, EnvGridWorld):

            for trajectories, _ in results:
                for traj in trajectories:
                    trajs.append(traj)

            w = World(e)
            w.draw_frame()
            w.draw_lattice()
            w.draw_cases()
            w.draw_source_trajectories(trajs)
            w.save((ftq.workspace / "bftq_on_2dworld_sources").as_posix())

        ftq.fit(transitions_ftq)

        # Save policy
        network_path = ftq.save_policy()
        os.system("cp {}/policy.pt {}/final_policy.pt".format(
            ftq.workspace, workspace))

        # Update greedy policy
        pi_epsilon_greedy_config["pi_greedy"] = {
            "__class__": repr(PytorchFittedPolicy),
            "feature_str": feature_str,
            "network_path": network_path,
            "device": ftq.device
        }
    if save_memory is not None:
        rm.save(workspace / save_memory["path"], save_memory["as_json"])
Example #7
def main(policy_path, generate_envs, feature_str, device, workspace, bftq_params, seed, general,
         betas_test, N_trajs, gamma, gamma_c, bftq_net_params, **args):
    if not os.path.isabs(policy_path):
        policy_path = workspace / policy_path

    env = envs_factory.generate_envs(**generate_envs)[0][0]
    feature = feature_factory(feature_str)

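    # This BFTQ instance is never fitted here; it is only used to plot Qr/Qc along the rendered trajectories (draw_Qr_and_Qc below)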
    bftq = PytorchBudgetedFittedQ(
        device=device,
        workspace=workspace,
        actions_str=get_actions_str(env),
        policy_network=NetBFTQ(size_state=len(feature(env.reset(), env)), n_actions=env.action_space.n,
                               **bftq_net_params),
        gamma=gamma,
        gamma_c=gamma_c,
        cpu_processes=general["cpu"]["processes"],
        env=env,
        hull_options=general["hull_options"],
        **bftq_params)
    bftq.reset(True)

    pi_config = {
        "__class__": repr(PytorchBudgetedFittedPolicy),
        "feature_str": feature_str,
        "network_path": policy_path,
        "betas_for_discretisation": eval(bftq_params["betas_for_discretisation"]),
        "device": device,
        "hull_options": general["hull_options"],
        "clamp_Qc": bftq_params["clamp_Qc"],
        "env": env
    }
    pi = policy_factory(pi_config)

    # Iterate over betas
    for beta in eval(betas_test):
        logger.info("Rendering with beta={}".format(beta))
        set_seed(seed, env)
        for traj in range(N_trajs):
            done = False
            pi.reset()
            info_env = {}
            info_pi = {"beta": beta}
            t = 0

            # Make a workspace for trajectories
            traj_workspace = workspace / "trajs" / "beta={}".format(beta) / "traj={}".format(traj)
            makedirs(traj_workspace)
            bftq.workspace = traj_workspace
            monitor = MonitorV2(env, traj_workspace, add_subdirectory=False)
            obs = monitor.reset()

            # Run trajectory
            while not done:
                action_mask = get_action_mask(env)
                info_pi = merge_two_dicts(info_pi, info_env)
                bftq.draw_Qr_and_Qc(obs, pi.network, "render_t={}".format(t), show=False)
                a, _, info_pi = pi.execute(obs, action_mask, info_pi)
                render(env, workspace, t, a)
                obs, _, done, info_env = monitor.step(a)
                t += 1
            monitor.close()
Example #8
def main(empty_previous_test=False):
    # betas = np.linspace(0, 1, 5)

    set_seed(C.seed)

    if empty_previous_test:
        empty_directory(C.path_ftq_results)

    envs, params = generate_envs(**C["generate_envs"])
    e = envs[0]
    e.reset()
    feature = feature_factory(C["feature_str"])

    size_state = len(feature(e.reset(), e))
    logger.info("neural net input size : {}".format(size_state))
    N = C["create_data"]["N_trajs"]
    traj_max_size = np.inf
    decays = epsilon_decay(**C["create_data"]["epsilon_decay"], N=N)
    net = BudgetedNetwork(size_state=size_state,
                          layers=C["bftq_net_params"]["intra_layers"] +
                          [2 * e.action_space.n],
                          device=C.device,
                          **C["bftq_net_params"])

    betas_for_discretisation = eval(C["betas_for_discretisation"])
    betas = np.linspace(0, 1, 31)

    print(C["bdqn_params"])
    dqn = PytorchBudgetedDQN(policy_net=net,
                             workspace=C.path_bdqn,
                             device=C.device,
                             gamma=C["gamma"],
                             gamma_c=C["gamma_c"],
                             beta_for_discretisation=betas_for_discretisation,
                             **C["bdqn_params"])
    dqn.reset()
    e.seed(C.seed)
    rrr = []
    rrr_greedy = []
    nb_samples = 0
    rm = Memory()
    result = np.zeros((N, 4))
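    # Collect N trajectories, mixing random budgeted exploration with greedy budgeted-DQN actions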
    for n in range(N):
        if len(betas) > 0:
            betas_to_explore = betas
        else:
            betas_to_explore = [np.random.random()]
        for beta in betas_to_explore:
            logger.info("beta {}".format(beta))
            if N // 10 > 0 and n % (N // 10) == 0:
                logger.debug("DQN step {}/{}".format(n, N))
            s = e.reset()
            done = False
            result_traj = np.zeros(4)
            it = 0
            trajectory = []
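            # Roll out one trajectory under the current epsilon schedule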
            while not done:
                if np.random.random() < decays[n]:
                    if hasattr(e, "action_space_executable"):
                        raise NotImplementedError("TODO")
                    else:
                        action_repartition = np.random.random(e.action_space.n)
                        action_repartition /= np.sum(action_repartition)
                        budget_repartition = generate_random_point_on_simplex_not_uniform(
                            coeff=action_repartition,
                            bias=beta,
                            min_x=0,
                            max_x=1)
                        a = np.random.choice(a=range(e.action_space.n),
                                             p=action_repartition)
                        beta_ = budget_repartition[a]
                        logger.info(
                            "Random action: {} with a random budget: {:.2f} (from {})".format(
                                a, beta_,
                                ["{:.2f}".format(bud) for bud in budget_repartition]))
                else:
                    if hasattr(e, "action_space_executable"):
                        raise NotImplementedError("TODO")
                    else:
                        a, beta_ = dqn.pi(feature(s, e), beta,
                                          np.zeros(e.action_space.n))
                        logger.info(
                            "Greedy action: {} with corresponding budget {}".format(a, beta_))

                s_, r_, done, info = e.step(a)
                c_ = info["c_"]
                sample = (s, a if type(a) is str else int(a), r_, s_, done, info)
                trajectory.append(sample)

                result_traj += np.array(
                    [r_, c_, r_ * (C["gamma"]**it), c_ * (C["gamma_c"]**it)])
                t_dqn = (feature(s, e), a, r_, feature(s_, e), c_, beta, done, info)
                dqn.update(*t_dqn)
                s = s_
                beta = beta_
                nb_samples += 1
                it += 1
                # "it" counts steps within the current trajectory
                if it % 100 == 0 and it > 500:
                    logger.warning("Trajectory length overflowing: {}".format(it))
                if traj_max_size is not None and it >= traj_max_size:
                    logger.warning("Max size trajectory reached")
                    break
            logger.info("result_traj : {}".format(result_traj))

            for sample in trajectory:
                rm.push(*sample)

    logger.info("[execute_policy] saving results at : {}".format(
        C.path_dqn_results))
    np.savetxt(C.path_bdqn_results + "/greedy_lambda_=0.result", result)
    if N > 100:
        nb_traj_packet = 100
        a = np.reshape(rrr, (int(N / nb_traj_packet), -1))
        a = np.mean(a, 1)
        x = np.asarray(range(len(a))) * nb_traj_packet
        plt.plot(x, a)
        a = np.reshape(rrr_greedy, (int(N / nb_traj_packet), -1))
        a = np.mean(a, 1)
        plt.plot(x, a)
        plt.title("dqn results")
        plt.show()
        plt.savefig(C.workspace / "dqn_create_data")
        plt.close()
    rm.save(C.workspace, "/" + C["create_data"]["filename_data"],
            C["create_data"]["as_json"])