def __init__(self, env, feature_str, network_path, device, **kwargs):
    self.env = env
    self.feature = feature_factory(feature_str)
    self.device = device
    self.network = None
    if network_path:
        self.load_network(network_path)
def main(load_memory, generate_envs, feature_str, gamma, ftq_params, ftq_net_params, device,
         normalize_reward, workspace, seed, lambda_=0., **args):
    # Build the environment and the feature extractor
    envs, params = envs_factory.generate_envs(**generate_envs)
    e = envs[0]
    set_seed(seed, e)
    feature = feature_factory(feature_str)

    ftq = PytorchFittedQ(
        device=device,
        policy_network=NetFTQ(n_in=len(feature(e.reset(), e)),
                              n_out=e.action_space.n,
                              **ftq_net_params),
        action_str=None if not hasattr(e, "action_str") else e.action_str,
        test_policy=None,
        gamma=gamma,
        **ftq_params
    )

    # Load the replay memory and convert it to FTQ transitions
    rm = Memory()
    rm.load_memory(**load_memory)
    transitions_ftq, _ = urpy.datas_to_transitions(rm.memory, e, feature, lambda_, normalize_reward)
    logger.info("[learning ftq with full batch] #samples={}".format(len(transitions_ftq)))

    # Fit FTQ on the full batch and save the resulting policy
    ftq.reset(True)
    ftq.workspace = workspace
    makedirs(ftq.workspace)
    ftq.fit(transitions_ftq)
    ftq.save_policy()
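# Note on lambda_: it is forwarded to urpy.datas_to_transitions when building the FTQ
# transitions, while the budgeted (BFTQ) scripts in this listing pass 0. A plausible
# reading -- hedged, since datas_to_transitions is not shown here -- is a Lagrangian
# scalarisation of the constrained objective, folding the constraint cost c into the
# reward. Minimal sketch of that assumed scalarisation:
def scalarized_reward(r, c, lambda_):
    """Assumed FTQ training reward trading off reward r against constraint cost c."""
    return r - lambda_ * c

# e.g. scalarized_reward(r=1.0, c=0.5, lambda_=2.0) == 0.0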
def __init__(self, env, feature_str, network_path, betas_for_discretisation, device,
             hull_options, clamp_Qc=False, **kwargs):
    self.env = env
    self.feature = feature_factory(feature_str)
    self.betas_for_discretisation = betas_for_discretisation
    self.device = device
    self.network = None
    self.hull_options = hull_options
    self.clamp_Qc = clamp_Qc
    if network_path:
        self.load_network(network_path)
def main(load_memory, generate_envs, feature_str, gamma, gamma_c, bftq_params, bftq_net_params,
         workspace, seed, device, normalize_reward, general, **args):
    logger = logging.getLogger(__name__)

    # Build the environment and the feature extractor
    envs, params = envs_factory.generate_envs(**generate_envs)
    e = envs[0]
    e.reset()
    set_seed(seed, e)
    feature = feature_factory(feature_str)

    bftq = PytorchBudgetedFittedQ(
        device=device,
        workspace=workspace,
        actions_str=get_actions_str(e),
        policy_network=NetBFTQ(size_state=len(feature(e.reset(), e)),
                               n_actions=e.action_space.n,
                               **bftq_net_params),
        gamma=gamma,
        gamma_c=gamma_c,
        split_batches=general["gpu"]["split_batches"],
        cpu_processes=general["cpu"]["processes"],
        env=e,
        **bftq_params)
    makedirs(workspace)

    # Load the replay memory and convert it to budgeted (BFTQ) transitions
    rm = Memory()
    rm.load_memory(**load_memory)
    _, transitions_bftq = urpy.datas_to_transitions(rm.memory, e, feature, 0, normalize_reward)
    logger.info("[learning bftq with full batch] #samples={}".format(len(transitions_bftq)))

    # Fit BFTQ on the full batch and save the resulting policy
    bftq.reset(True)
    _ = bftq.fit(transitions_bftq)
    bftq.save_policy()
def main(generate_envs, feature_str, betas_for_exploration, gamma, gamma_c, bftq_params,
         bftq_net_params, N_trajs, workspace, seed, device, normalize_reward,
         trajs_by_ftq_batch, epsilon_decay, general, **args):
    # Prepare BFTQ
    envs, params = envs_factory.generate_envs(**generate_envs)
    e = envs[0]
    set_seed(seed, e)
    rm = Memory()
    feature = feature_factory(feature_str)

    def build_fresh_bftq():
        bftq = PytorchBudgetedFittedQ(
            device=device,
            workspace=workspace / "batch=0",
            actions_str=get_actions_str(e),
            policy_network=NetBFTQ(size_state=len(feature(e.reset(), e)),
                                   n_actions=e.action_space.n,
                                   **bftq_net_params),
            gamma=gamma,
            gamma_c=gamma_c,
            cpu_processes=general["cpu"]["processes"],
            env=e,
            split_batches=general["gpu"]["split_batches"],
            hull_options=general["hull_options"],
            **bftq_params)
        return bftq

    # Prepare learning
    i_traj = 0
    decays = math_utils.epsilon_decay(**epsilon_decay, N=N_trajs, savepath=workspace)
    betas_for_exploration = np.array(eval(betas_for_exploration))
    memory_by_batch = [get_current_memory()]
    batch_sizes = near_split(N_trajs, size_bins=trajs_by_ftq_batch)
    pi_epsilon_greedy_config = {
        "__class__": repr(EpsilonGreedyPolicy),
        "pi_greedy": {"__class__": repr(RandomBudgetedPolicy)},
        "pi_random": {"__class__": repr(RandomBudgetedPolicy)},
        "epsilon": decays[0],
        "hull_options": general["hull_options"],
        "clamp_Qc": bftq_params["clamp_Qc"]
    }

    # Main loop
    trajs = []
    for batch, batch_size in enumerate(batch_sizes):
        # Prepare workers
        cpu_processes = min(general["cpu"]["processes_when_linked_with_gpu"] or os.cpu_count(),
                            batch_size)
        workers_n_trajectories = near_split(batch_size, cpu_processes)
        workers_start = np.cumsum(workers_n_trajectories)
        workers_traj_indexes = [np.arange(*bounds) for bounds in
                                zip(np.insert(workers_start[:-1], 0, 0), workers_start)]
        if betas_for_exploration.size:
            workers_betas = [betas_for_exploration.take(indexes, mode='wrap')
                             for indexes in workers_traj_indexes]
        else:
            workers_betas = [np.random.random(indexes.size)
                             for indexes in workers_traj_indexes]
        workers_seeds = np.random.randint(0, 10000, cpu_processes).tolist()
        workers_epsilons = [decays[i_traj + indexes] for indexes in workers_traj_indexes]
        workers_params = list(zip_with_singletons(
            generate_envs, pi_epsilon_greedy_config, workers_seeds, gamma, gamma_c,
            workers_n_trajectories, workers_betas, workers_epsilons, None,
            general["dictConfig"]))

        # Collect trajectories
        logger.info("Collecting trajectories with {} workers...".format(cpu_processes))
        if cpu_processes == 1:
            results = []
            for params in workers_params:
                results.append(execute_policy_from_config(*params))
        else:
            with Pool(processes=cpu_processes) as pool:
                results = pool.starmap(execute_policy_from_config, workers_params)
        i_traj += sum([len(trajectories) for trajectories, _ in results])

        # Fill memory
        for trajectories, _ in results:
            for trajectory in trajectories:
                for sample in trajectory:
                    rm.push(*sample)
        transitions_ftq, transition_bftq = datas_to_transitions(
            rm.memory, e, feature, 0, normalize_reward)

        # Fit model
        logger.info("[BATCH={}]---------------------------------------".format(batch))
        logger.info("[BATCH={}][learning bftq pi greedy] #samples={} #traj={}".format(
            batch, len(transition_bftq), i_traj))
        logger.info("[BATCH={}]---------------------------------------".format(batch))
        bftq = build_fresh_bftq()
        bftq.reset(True)
        bftq.workspace = workspace / "batch={}".format(batch)
        makedirs(bftq.workspace)

        if isinstance(e, EnvGridWorld):
            for trajectories, _ in results:
                for traj in trajectories:
                    trajs.append(traj)
            w = World(e)
            w.draw_frame()
            w.draw_lattice()
            w.draw_cases()
            w.draw_source_trajectories(trajs)
            w.save((bftq.workspace / "bftq_on_2dworld_sources").as_posix())

        q = bftq.fit(transition_bftq)

        # Save policy
        network_path = bftq.save_policy()
        os.system("cp {}/policy.pt {}/policy.pt".format(bftq.workspace, workspace))

        # Save memory
        save_memory(bftq, memory_by_batch, by_batch=False)

        # Update greedy policy
        pi_epsilon_greedy_config["pi_greedy"] = {
            "__class__": repr(PytorchBudgetedFittedPolicy),
            "feature_str": feature_str,
            "network_path": network_path,
            "betas_for_discretisation": bftq.betas_for_discretisation,
            "device": bftq.device,
            "hull_options": general["hull_options"],
            "clamp_Qc": bftq_params["clamp_Qc"]
        }

        if isinstance(e, EnvGridWorld):

            def pi(state, beta):
                import torch
                from ncarrara.budgeted_rl.bftq.pytorch_budgeted_fittedq import convex_hull, \
                    optimal_pia_pib
                with torch.no_grad():
                    hull = convex_hull(s=torch.tensor([state], device=device, dtype=torch.float32),
                                       Q=q,
                                       action_mask=np.zeros(e.action_space.n),
                                       id="run_" + str(state),
                                       disp=False,
                                       betas=bftq.betas_for_discretisation,
                                       device=device,
                                       hull_options=general["hull_options"],
                                       clamp_Qc=bftq_params["clamp_Qc"])
                    opt, _ = optimal_pia_pib(beta=beta, hull=hull, statistic={})
                return opt

            def qr(state, a, beta):
                import torch
                s = torch.tensor([[state]], device=device)
                b = torch.tensor([[[beta]]], device=device)
                sb = torch.cat((s, b), dim=2)
                return q(sb).squeeze()[a]

            def qc(state, a, beta):
                import torch
                s = torch.tensor([[state]], device=device)
                b = torch.tensor([[[beta]]], device=device)
                sb = torch.cat((s, b), dim=2)
                return q(sb).squeeze()[e.action_space.n + a]

            w = World(e, bftq.betas_for_discretisation)
            w.draw_frame()
            w.draw_lattice()
            w.draw_cases()
            w.draw_policy_bftq(pi, qr, qc, bftq.betas_for_discretisation)
            w.save((bftq.workspace / "bftq_on_2dworld").as_posix())

    save_memory(bftq, memory_by_batch, by_batch=True)
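# Sketch of the worker-partition arithmetic used above. near_split is a project utility
# assumed to split an integer into near-equal integer bins; the stand-in below only
# covers the num_bins form. The point being illustrated: workers_traj_indexes is a
# disjoint partition of range(batch_size), which is what the per-trajectory epsilon
# indexing decays[i_traj + indexes] relies on.
import numpy as np

def near_split_sketch(x, num_bins):
    quotient, remainder = divmod(x, num_bins)
    return [quotient + 1] * remainder + [quotient] * (num_bins - remainder)

batch_size, cpu_processes = 10, 3
workers_n_trajectories = near_split_sketch(batch_size, cpu_processes)   # [4, 3, 3]
workers_start = np.cumsum(workers_n_trajectories)                       # [4, 7, 10]
workers_traj_indexes = [np.arange(*bounds) for bounds in
                        zip(np.insert(workers_start[:-1], 0, 0), workers_start)]
assert np.concatenate(workers_traj_indexes).tolist() == list(range(batch_size))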
def main(generate_envs, feature_str, gamma, gamma_c, ftq_params, ftq_net_params, device,
         epsilon_decay, N_trajs, trajs_by_ftq_batch, normalize_reward, workspace, seed,
         save_memory, general, lambda_=0, **args):
    envs, params = envs_factory.generate_envs(**generate_envs)
    e = envs[0]
    set_seed(seed, e)
    rm = Memory()
    feature = feature_factory(feature_str)

    def build_fresh_ftq():
        ftq = PytorchFittedQ(
            device=device,
            policy_network=NetFTQ(n_in=len(feature(e.reset(), e)),
                                  n_out=e.action_space.n,
                                  **ftq_net_params),
            action_str=None if not hasattr(e, "action_str") else e.action_str,
            test_policy=None,
            gamma=gamma,
            **ftq_params)
        return ftq

    # Prepare learning
    i_traj = 0
    decays = math_utils.epsilon_decay(**epsilon_decay, N=N_trajs, savepath=workspace)
    batch_sizes = near_split(N_trajs, size_bins=trajs_by_ftq_batch)
    pi_epsilon_greedy_config = {
        "__class__": repr(EpsilonGreedyPolicy),
        "pi_greedy": {"__class__": repr(RandomPolicy)},
        "pi_random": {"__class__": repr(RandomPolicy)},
        "epsilon": decays[0]
    }

    # Main loop
    trajs = []
    for batch, batch_size in enumerate(batch_sizes):
        # Prepare workers
        cpu_processes = min(general["cpu"]["processes_when_linked_with_gpu"] or os.cpu_count(),
                            batch_size)
        workers_n_trajectories = near_split(batch_size, cpu_processes)
        workers_start = np.cumsum(workers_n_trajectories)
        workers_traj_indexes = [np.arange(*bounds) for bounds in
                                zip(np.insert(workers_start[:-1], 0, 0), workers_start)]
        workers_seeds = np.random.randint(0, 10000, cpu_processes).tolist()
        workers_epsilons = [decays[i_traj + indexes] for indexes in workers_traj_indexes]
        workers_params = list(zip_with_singletons(
            generate_envs, pi_epsilon_greedy_config, workers_seeds, gamma, gamma_c,
            workers_n_trajectories, None, workers_epsilons, None, general["dictConfig"]))

        # Collect trajectories
        logger.info("Collecting trajectories with {} workers...".format(cpu_processes))
        if cpu_processes == 1:
            results = [execute_policy_from_config(*workers_params[0])]
        else:
            with Pool(processes=cpu_processes) as pool:
                results = pool.starmap(execute_policy_from_config, workers_params)
        i_traj += sum([len(trajectories) for trajectories, _ in results])

        # Fill memory
        for trajectories, _ in results:
            for trajectory in trajectories:
                for sample in trajectory:
                    rm.push(*sample)
        transitions_ftq, _ = datas_to_transitions(rm.memory, e, feature, lambda_, normalize_reward)

        # Fit model
        logger.info("[BATCH={}]---------------------------------------".format(batch))
        logger.info("[BATCH={}][learning ftq pi greedy] #samples={} #traj={}".format(
            batch, len(transitions_ftq), i_traj))
        logger.info("[BATCH={}]---------------------------------------".format(batch))
        ftq = build_fresh_ftq()
        ftq.reset(True)
        ftq.workspace = workspace / "batch={}".format(batch)
        makedirs(ftq.workspace)

        if isinstance(e, EnvGridWorld):
            for trajectories, _ in results:
                for traj in trajectories:
                    trajs.append(traj)
            w = World(e)
            w.draw_frame()
            w.draw_lattice()
            w.draw_cases()
            w.draw_source_trajectories(trajs)
            w.save((ftq.workspace / "bftq_on_2dworld_sources").as_posix())

        ftq.fit(transitions_ftq)

        # Save policy
        network_path = ftq.save_policy()
        os.system("cp {}/policy.pt {}/final_policy.pt".format(ftq.workspace, workspace))

        # Update greedy policy
        pi_epsilon_greedy_config["pi_greedy"] = {
            "__class__": repr(PytorchFittedPolicy),
            "feature_str": feature_str,
            "network_path": network_path,
            "device": ftq.device
        }

    if save_memory is not None:
        rm.save(workspace / save_memory["path"], save_memory["as_json"])
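# zip_with_singletons is a project utility used above to build one argument tuple per
# worker. Assumed behaviour (hedged, the real helper is not shown in this listing): list
# arguments are zipped element-wise, any other argument is repeated for every worker.
# Minimal stand-in under that assumption:
def zip_with_singletons_sketch(*args):
    lists = [arg for arg in args if isinstance(arg, list)]
    n = len(lists[0]) if lists else 1
    return zip(*[arg if isinstance(arg, list) else [arg] * n for arg in args])

# e.g. list(zip_with_singletons_sketch({"env": "cfg"}, [1, 2], 0.9))
#      == [({"env": "cfg"}, 1, 0.9), ({"env": "cfg"}, 2, 0.9)]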
def main(policy_path, generate_envs, feature_str, device, workspace, bftq_params, seed, general,
         betas_test, N_trajs, gamma, gamma_c, bftq_net_params, **args):
    if not os.path.isabs(policy_path):
        policy_path = workspace / policy_path

    env = envs_factory.generate_envs(**generate_envs)[0][0]
    feature = feature_factory(feature_str)

    bftq = PytorchBudgetedFittedQ(
        device=device,
        workspace=workspace,
        actions_str=get_actions_str(env),
        policy_network=NetBFTQ(size_state=len(feature(env.reset(), env)),
                               n_actions=env.action_space.n,
                               **bftq_net_params),
        gamma=gamma,
        gamma_c=gamma_c,
        cpu_processes=general["cpu"]["processes"],
        env=env,
        hull_options=general["hull_options"],
        **bftq_params)
    bftq.reset(True)

    pi_config = {
        "__class__": repr(PytorchBudgetedFittedPolicy),
        "feature_str": feature_str,
        "network_path": policy_path,
        "betas_for_discretisation": eval(bftq_params["betas_for_discretisation"]),
        "device": device,
        "hull_options": general["hull_options"],
        "clamp_Qc": bftq_params["clamp_Qc"],
        "env": env
    }
    pi = policy_factory(pi_config)

    # Iterate over betas
    for beta in eval(betas_test):
        logger.info("Rendering with beta={}".format(beta))
        set_seed(seed, env)
        for traj in range(N_trajs):
            done = False
            pi.reset()
            info_env = {}
            info_pi = {"beta": beta}
            t = 0

            # Make a workspace for trajectories
            traj_workspace = workspace / "trajs" / "beta={}".format(beta) / "traj={}".format(traj)
            makedirs(traj_workspace)
            bftq.workspace = traj_workspace
            monitor = MonitorV2(env, traj_workspace, add_subdirectory=False)
            obs = monitor.reset()

            # Run trajectory
            while not done:
                action_mask = get_action_mask(env)
                info_pi = merge_two_dicts(info_pi, info_env)
                bftq.draw_Qr_and_Qc(obs, pi.network, "render_t={}".format(t), show=False)
                a, _, info_pi = pi.execute(obs, action_mask, info_pi)
                render(env, workspace, t, a)
                obs, _, done, info_env = monitor.step(a)
                t += 1
            monitor.close()
def main(empty_previous_test=False):
    # betas = np.linspace(0, 1, 5)
    set_seed(C.seed)
    if empty_previous_test:
        empty_directory(C.path_ftq_results)

    # Build environment, feature extractor and the budgeted DQN
    envs, params = generate_envs(**C["generate_envs"])
    e = envs[0]
    e.reset()
    feature = feature_factory(C["feature_str"])
    size_state = len(feature(e.reset(), e))
    logger.info("neural net input size : {}".format(size_state))
    N = C["create_data"]["N_trajs"]
    traj_max_size = np.inf
    decays = epsilon_decay(**C["create_data"]["epsilon_decay"], N=N)
    net = BudgetedNetwork(size_state=size_state,
                          layers=C["bftq_net_params"]["intra_layers"] + [2 * e.action_space.n],
                          device=C.device,
                          **C["bftq_net_params"])
    betas_for_discretisation = eval(C["betas_for_discretisation"])
    betas = np.linspace(0, 1, 31)
    print(C["bdqn_params"])
    dqn = PytorchBudgetedDQN(policy_net=net,
                             workspace=C.path_bdqn,
                             device=C.device,
                             gamma=C["gamma"],
                             gamma_c=C["gamma_c"],
                             beta_for_discretisation=betas_for_discretisation,
                             **C["bdqn_params"])
    dqn.reset()
    e.seed(C.seed)

    rrr = []
    rrr_greedy = []
    nb_samples = 0
    rm = Memory()
    result = np.zeros((N, 4))

    # Collect trajectories with an epsilon-greedy budgeted policy
    for n in range(N):
        if len(betas) > 0:
            betas_to_explore = betas
        else:
            betas_to_explore = [np.random.random()]
        for beta in betas_to_explore:
            logger.info("beta {}".format(beta))
            if N // 10 > 0 and n % (N // 10) == 0:
                logger.debug("DQN step {}/{}".format(n, N))
            s = e.reset()
            done = False
            result_traj = np.zeros(4)
            it = 0
            trajectory = []
            while not done:
                if np.random.random() < decays[n]:
                    # Random action with a random budget drawn on a simplex biased by beta
                    if hasattr(e, "action_space_executable"):
                        raise NotImplementedError("TODO")
                    else:
                        action_repartition = np.random.random(e.action_space.n)
                        action_repartition /= np.sum(action_repartition)
                        budget_repartion = generate_random_point_on_simplex_not_uniform(
                            coeff=action_repartition, bias=beta, min_x=0, max_x=1)
                        a = np.random.choice(a=range(e.action_space.n), p=action_repartition)
                        beta_ = budget_repartion[a]
                        logger.info("Random action : {} with a random budget : {:.2f} (from {})".format(
                            a, beta_, ["{:.2f}".format(bud) for bud in budget_repartion]))
                else:
                    # Greedy budgeted action from the current BDQN
                    if hasattr(e, "action_space_executable"):
                        raise NotImplementedError("TODO")
                    else:
                        a, beta_ = dqn.pi(feature(s, e), beta, np.zeros(e.action_space.n))
                        logger.info("Greedy action : {} with corresponding budget {}".format(a, beta))
                s_, r_, done, info = e.step(a)
                c_ = info["c_"]
                sample = (s, a if type(a) is str else int(a), r_, s_, done, info)
                trajectory.append(sample)
                result_traj += np.array([r_, c_, r_ * (C["gamma"] ** it), c_ * (C["gamma_c"] ** it)])
                t_dqn = (feature(s, e), a, r_, feature(s_, e), c_, beta, done, info)
                dqn.update(*t_dqn)
                s = s_
                beta = beta_
                nb_samples += 1
                it += 1
                if it % 100 == 0 and it > 500:
                    logger.warning("Trajectory length overflowing : {}".format(it))
                if traj_max_size is not None and it >= traj_max_size:
                    logger.warning("Max size trajectory reached")
                    break
            logger.info("result_traj : {}".format(result_traj))
            for sample in trajectory:
                rm.push(*sample)

    # Save results and the collected memory
    logger.info("[execute_policy] saving results at : {}".format(C.path_bdqn_results))
    np.savetxt(C.path_bdqn_results + "/greedy_lambda_=0.result", result)
    if N > 100:
        nb_traj_packet = 100
        a = np.reshape(rrr, (int(N / nb_traj_packet), -1))
        a = np.mean(a, 1)
        x = np.asarray(range(len(a))) * nb_traj_packet
        plt.plot(x, a)
        a = np.reshape(rrr_greedy, (int(N / nb_traj_packet), -1))
        a = np.mean(a, 1)
        plt.plot(x, a)
        plt.title("dqn results")
        plt.show()
        plt.savefig(C.workspace / "dqn_create_data")
        plt.close()
    rm.save(C.workspace, "/" + C["create_data"]["filename_data"], C["create_data"]["as_json"])
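# The exploration step above samples a random action distribution and per-action budgets
# biased by the current budget beta. The contract assumed here (hedged, since
# generate_random_point_on_simplex_not_uniform is a project helper not shown in this
# listing) is that the action-weighted budgets average back to beta, so exploration
# respects the budget in expectation. One simple way to satisfy that contract:
import numpy as np

def sample_budgets_sketch(action_repartition, beta, rng=np.random):
    """Return per-action budgets in [0, 1] whose action_repartition-weighted mean is beta."""
    rho = np.asarray(action_repartition, dtype=float)   # assumed to sum to 1
    raw = rng.uniform(-1.0, 1.0, rho.size)
    d = raw - np.dot(rho, raw)                          # direction with rho-weighted mean 0
    caps = np.full(rho.size, np.inf)
    caps[d > 0] = (1.0 - beta) / d[d > 0]               # keep beta + t * d <= 1
    caps[d < 0] = -beta / d[d < 0]                      # keep beta + t * d >= 0
    t_max = caps.min()
    t = rng.uniform(0.0, t_max) if np.isfinite(t_max) else 0.0
    return beta + t * d

# e.g. rho = np.array([0.2, 0.3, 0.5]); budgets = sample_budgets_sketch(rho, 0.4)
#      np.isclose(np.dot(rho, budgets), 0.4)  # True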