def load(self, config):
    # Accept either a path to a JSON config file or an already-built dict.
    if isinstance(config, str):
        self.logger.info("reading config file at {}".format(config))
        with open(config, 'r') as infile:
            import json
            self.dict = json.load(infile)
    elif isinstance(config, dict):
        self.dict = config
    else:
        raise TypeError("Wrong type for configuration, must be a path or a dict")
    self.id = self.dict["general"]["id"]
    self.workspace = Path(self.dict["general"]["workspace"])
    print(self.dict["general"])
    import logging.config as logging_config  # renamed to avoid shadowing the `config` argument
    logging_config.dictConfig(self.dict["general"]["dictConfig"])
    self.seed = self.dict["general"]["seed"]
    if self.seed is not None:
        from ncarrara.utils.math_utils import set_seed
        set_seed(self.seed)
    import numpy as np
    np.set_printoptions(precision=2, suppress=True)
    self.writer = None
    self.is_tensorboardX = self["general"]["is_tensorboardX"]
    return self
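# Example (illustrative sketch, not part of the original code): a minimal configuration
# accepted by load() above. Only the "general" keys that load() actually reads are shown;
# the values and the name of the Configuration-like object are assumptions.
example_config = {
    "general": {
        "id": "demo_run",
        "workspace": "/tmp/demo_workspace",   # hypothetical path
        "dictConfig": {"version": 1},         # minimal payload for logging.config.dictConfig
        "seed": 0,
        "is_tensorboardX": False,
    }
}
# configuration.load(example_config)          # or: configuration.load("path/to/config.json")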
def execute_policy_from_config(generate_envs, policy_config, seed=None, gamma_r=1.0, gamma_c=1.0,
                               n_trajectories=10, beta=1., epsilon_schedule=None, save_path=None,
                               logging_config=None):
    """
    Generate an environment and a policy from configurations, and collect trajectories.

    :param generate_envs: environment config
    :param policy_config: policy config
    :param seed: to seed the environment before execution
    :param gamma_r: see execute_policy()
    :param gamma_c: see execute_policy()
    :param n_trajectories: see execute_policy()
    :param beta: see execute_policy()
    :param epsilon_schedule: see execute_policy()
    :param save_path: see execute_policy()
    :param logging_config: the logging config of the process
    :return: the collected trajectories
    """
    if logging_config:
        import logging.config as config
        config.dictConfig(logging_config)
    envs, params = envs_factory.generate_envs(**generate_envs)
    env = envs[0]
    set_seed(seed, env)
    policy_config["env"] = env
    pi = policy_factory(policy_config)
    return execute_policy(env, pi, gamma_r, gamma_c, n_trajectories, beta, epsilon_schedule, save_path)
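# Illustrative helper (not part of the original code): how the (trajectories, stats) pairs
# returned by execute_policy_from_config are consumed by the batch loops further down —
# trajectories are flattened into transition samples and the per-trajectory statistics are
# stacked. The function name is an assumption for the sketch.
def aggregate_worker_results(results):
    import numpy as np
    n_traj = sum(len(trajectories) for trajectories, _ in results)
    samples = [sample
               for trajectories, _ in results
               for trajectory in trajectories
               for sample in trajectory]
    stats = np.concatenate([stat for _, stat in results], axis=0)
    return n_traj, samples, stats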
def main(source_envs, feature_dqn_info, net_params, dqn_params, N, seed, device, workspace,
         decay, start_decay, traj_max_size, gamma, writer=None):
    envs, params = generate_envs(**source_envs)
    for ienv, env in enumerate(envs):
        logger.info("generating samples for env {}".format(ienv))
        set_seed(seed=seed, env=env)
        feature_dqn = build_feature_dqn(feature_dqn_info)
        _, _, memory, dqn = run_dqn(
            env,
            id="generate_sources_env_{}".format(ienv),
            workspace=workspace / "dqn_workspace",
            seed=seed,
            feature_dqn=feature_dqn,
            device=device,
            net_params=net_params,
            dqn_params=dqn_params,
            N=N,
            decay=decay,
            start_decay=start_decay,
            traj_max_size=traj_max_size,
            gamma=gamma,
            writer=writer)
        memory.save(workspace / "samples" / "{}.json".format(ienv), as_json=False)
        dqn.save(workspace / "dqn" / "{}.pt".format(ienv))
    with open(workspace / 'params.json', 'w') as file:
        dump = json.dumps(params, indent=4)
        print(dump)
        file.write(dump)
    return env.action_space.n
def main(load_memory, generate_envs, feature_str, gamma, ftq_params, ftq_net_params, device,
         normalize_reward, workspace, seed, lambda_=0., **args):
    envs, params = envs_factory.generate_envs(**generate_envs)
    e = envs[0]
    set_seed(seed, e)
    feature = feature_factory(feature_str)
    ftq = PytorchFittedQ(
        device=device,
        policy_network=NetFTQ(n_in=len(feature(e.reset(), e)),
                              n_out=e.action_space.n,
                              **ftq_net_params),
        action_str=None if not hasattr(e, "action_str") else e.action_str,
        test_policy=None,
        gamma=gamma,
        **ftq_params
    )
    rm = Memory()
    rm.load_memory(**load_memory)
    transitions_ftq, _ = urpy.datas_to_transitions(rm.memory, e, feature, lambda_, normalize_reward)
    logger.info("[learning ftq with full batch] #samples={}".format(len(transitions_ftq)))
    ftq.reset(True)
    ftq.workspace = workspace
    makedirs(ftq.workspace)
    ftq.fit(transitions_ftq)
    ftq.save_policy()
def main(load_memory, generate_envs, feature_str, gamma, gamma_c, bftq_params, bftq_net_params,
         workspace, seed, device, normalize_reward, general, **args):
    logger = logging.getLogger(__name__)
    envs, params = envs_factory.generate_envs(**generate_envs)
    e = envs[0]
    e.reset()
    set_seed(seed, e)
    feature = feature_factory(feature_str)
    bftq = PytorchBudgetedFittedQ(
        device=device,
        workspace=workspace,
        actions_str=get_actions_str(e),
        policy_network=NetBFTQ(size_state=len(feature(e.reset(), e)),
                               n_actions=e.action_space.n,
                               **bftq_net_params),
        gamma=gamma,
        gamma_c=gamma_c,
        split_batches=general["gpu"]["split_batches"],
        cpu_processes=general["cpu"]["processes"],
        env=e,
        **bftq_params)
    makedirs(workspace)
    rm = Memory()
    rm.load_memory(**load_memory)
    _, transitions_bftq = urpy.datas_to_transitions(rm.memory, e, feature, 0, normalize_reward)
    logger.info("[learning bftq with full batch] #samples={}".format(len(transitions_bftq)))
    bftq.reset(True)
    _ = bftq.fit(transitions_bftq)
    bftq.save_policy()
def main(generate_envs, feature_str, betas_for_exploration, gamma, gamma_c, bftq_params,
         bftq_net_params, N_trajs, workspace, seed, device, normalize_reward,
         trajs_by_ftq_batch, epsilon_decay, general, **args):
    # Prepare BFTQ
    envs, params = envs_factory.generate_envs(**generate_envs)
    e = envs[0]
    set_seed(seed, e)
    rm = Memory()
    feature = feature_factory(feature_str)

    def build_fresh_bftq():
        bftq = PytorchBudgetedFittedQ(
            device=device,
            workspace=workspace / "batch=0",
            actions_str=get_actions_str(e),
            policy_network=NetBFTQ(size_state=len(feature(e.reset(), e)),
                                   n_actions=e.action_space.n,
                                   **bftq_net_params),
            gamma=gamma,
            gamma_c=gamma_c,
            cpu_processes=general["cpu"]["processes"],
            env=e,
            split_batches=general["gpu"]["split_batches"],
            hull_options=general["hull_options"],
            **bftq_params)
        return bftq

    # Prepare learning
    i_traj = 0
    decays = math_utils.epsilon_decay(**epsilon_decay, N=N_trajs, savepath=workspace)
    betas_for_exploration = np.array(eval(betas_for_exploration))
    memory_by_batch = [get_current_memory()]
    batch_sizes = near_split(N_trajs, size_bins=trajs_by_ftq_batch)
    pi_epsilon_greedy_config = {
        "__class__": repr(EpsilonGreedyPolicy),
        "pi_greedy": {"__class__": repr(RandomBudgetedPolicy)},
        "pi_random": {"__class__": repr(RandomBudgetedPolicy)},
        "epsilon": decays[0],
        "hull_options": general["hull_options"],
        "clamp_Qc": bftq_params["clamp_Qc"]
    }

    # Main loop
    trajs = []
    for batch, batch_size in enumerate(batch_sizes):
        # Prepare workers
        cpu_processes = min(general["cpu"]["processes_when_linked_with_gpu"] or os.cpu_count(),
                            batch_size)
        workers_n_trajectories = near_split(batch_size, cpu_processes)
        workers_start = np.cumsum(workers_n_trajectories)
        workers_traj_indexes = [np.arange(*bounds) for bounds in
                                zip(np.insert(workers_start[:-1], 0, 0), workers_start)]
        if betas_for_exploration.size:
            workers_betas = [betas_for_exploration.take(indexes, mode='wrap')
                             for indexes in workers_traj_indexes]
        else:
            workers_betas = [np.random.random(indexes.size) for indexes in workers_traj_indexes]
        workers_seeds = np.random.randint(0, 10000, cpu_processes).tolist()
        workers_epsilons = [decays[i_traj + indexes] for indexes in workers_traj_indexes]
        workers_params = list(zip_with_singletons(
            generate_envs, pi_epsilon_greedy_config, workers_seeds, gamma, gamma_c,
            workers_n_trajectories, workers_betas, workers_epsilons, None, general["dictConfig"]))

        # Collect trajectories
        logger.info("Collecting trajectories with {} workers...".format(cpu_processes))
        if cpu_processes == 1:
            results = []
            for worker_params in workers_params:
                results.append(execute_policy_from_config(*worker_params))
        else:
            with Pool(processes=cpu_processes) as pool:
                results = pool.starmap(execute_policy_from_config, workers_params)
        i_traj += sum(len(trajectories) for trajectories, _ in results)

        # Fill memory
        for trajectories, _ in results:
            for trajectory in trajectories:
                for sample in trajectory:
                    rm.push(*sample)
        transitions_ftq, transition_bftq = datas_to_transitions(rm.memory, e, feature, 0, normalize_reward)

        # Fit model
        logger.info("[BATCH={}]---------------------------------------".format(batch))
        logger.info("[BATCH={}][learning bftq pi greedy] #samples={} #traj={}"
                    .format(batch, len(transition_bftq), i_traj))
        logger.info("[BATCH={}]---------------------------------------".format(batch))
        bftq = build_fresh_bftq()
        bftq.reset(True)
        bftq.workspace = workspace / "batch={}".format(batch)
        makedirs(bftq.workspace)

        if isinstance(e, EnvGridWorld):
            for trajectories, _ in results:
                for traj in trajectories:
                    trajs.append(traj)
            w = World(e)
            w.draw_frame()
            w.draw_lattice()
            w.draw_cases()
            w.draw_source_trajectories(trajs)
            w.save((bftq.workspace / "bftq_on_2dworld_sources").as_posix())

        q = bftq.fit(transition_bftq)

        # Save policy
        network_path = bftq.save_policy()
        os.system("cp {}/policy.pt {}/policy.pt".format(bftq.workspace, workspace))

        # Save memory
        save_memory(bftq, memory_by_batch, by_batch=False)

        # Update greedy policy
        pi_epsilon_greedy_config["pi_greedy"] = {
            "__class__": repr(PytorchBudgetedFittedPolicy),
            "feature_str": feature_str,
            "network_path": network_path,
            "betas_for_discretisation": bftq.betas_for_discretisation,
            "device": bftq.device,
            "hull_options": general["hull_options"],
            "clamp_Qc": bftq_params["clamp_Qc"]
        }

        if isinstance(e, EnvGridWorld):
            def pi(state, beta):
                import torch
                from ncarrara.budgeted_rl.bftq.pytorch_budgeted_fittedq import convex_hull, \
                    optimal_pia_pib
                with torch.no_grad():
                    hull = convex_hull(s=torch.tensor([state], device=device, dtype=torch.float32),
                                       Q=q,
                                       action_mask=np.zeros(e.action_space.n),
                                       id="run_" + str(state),
                                       disp=False,
                                       betas=bftq.betas_for_discretisation,
                                       device=device,
                                       hull_options=general["hull_options"],
                                       clamp_Qc=bftq_params["clamp_Qc"])
                    opt, _ = optimal_pia_pib(beta=beta, hull=hull, statistic={})
                return opt

            def qr(state, a, beta):
                import torch
                s = torch.tensor([[state]], device=device)
                b = torch.tensor([[[beta]]], device=device)
                sb = torch.cat((s, b), dim=2)
                return q(sb).squeeze()[a]

            def qc(state, a, beta):
                import torch
                s = torch.tensor([[state]], device=device)
                b = torch.tensor([[[beta]]], device=device)
                sb = torch.cat((s, b), dim=2)
                return q(sb).squeeze()[e.action_space.n + a]

            w = World(e, bftq.betas_for_discretisation)
            w.draw_frame()
            w.draw_lattice()
            w.draw_cases()
            w.draw_policy_bftq(pi, qr, qc, bftq.betas_for_discretisation)
            w.save((bftq.workspace / "bftq_on_2dworld").as_posix())

    save_memory(bftq, memory_by_batch, by_batch=True)
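# Worked example (illustrative, not part of the original code): how the batch loop above
# splits one batch of trajectories across worker processes. With batch_size=10 and
# 3 workers, near_split is expected to return near-equal bin sizes such as [4, 3, 3]
# (assumed here); cumulative sums then give each worker its own slice of global trajectory
# indexes, which is what the per-worker epsilon schedules and exploration betas index into.
import numpy as np

workers_n_trajectories = [4, 3, 3]                     # assumed output of near_split(10, 3)
workers_start = np.cumsum(workers_n_trajectories)      # [4, 7, 10]
workers_traj_indexes = [np.arange(*bounds) for bounds in
                        zip(np.insert(workers_start[:-1], 0, 0), workers_start)]
# -> [array([0, 1, 2, 3]), array([4, 5, 6]), array([7, 8, 9])]

decays = np.linspace(1.0, 0.1, 10)                     # stand-in for math_utils.epsilon_decay
i_traj = 0
workers_epsilons = [decays[i_traj + indexes] for indexes in workers_traj_indexes]
# each worker gets the slice of the global epsilon schedule matching its trajectory indexes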
def main(betas_test, policy_path, generate_envs, feature_str, device, workspace, gamma, gamma_c,
         bftq_params, seed, N_trajs, path_results, general, **args):
    if not os.path.isabs(policy_path):
        policy_path = workspace / policy_path
    pi_config = {
        "__class__": repr(PytorchBudgetedFittedPolicy),
        "feature_str": feature_str,
        "network_path": policy_path,
        "betas_for_discretisation": eval(bftq_params["betas_for_discretisation"]),
        "device": device,
        "hull_options": general["hull_options"],
        "clamp_Qc": bftq_params["clamp_Qc"]
    }
    mock_env = envs_factory.generate_envs(**generate_envs)[0][0]
    makedirs(workspace / "trajs")
    makedirs(path_results)
    set_seed(seed)
    try:
        for beta in eval(betas_test):
            # Prepare workers
            cpu_processes = min(general["cpu"]["processes_when_linked_with_gpu"] or os.cpu_count(),
                                N_trajs)
            workers_n_trajectories = near_split(N_trajs, cpu_processes)
            workers_seeds = np.random.randint(0, 10000, cpu_processes).tolist()
            workers_params = list(zip_with_singletons(
                generate_envs, pi_config, workers_seeds, gamma, gamma_c, workers_n_trajectories,
                beta, None, "{}/beta={}.results".format(path_results, beta), general["dictConfig"]))
            logger.info("Collecting trajectories with {} workers...".format(cpu_processes))
            with Pool(cpu_processes) as pool:
                results = pool.starmap(execute_policy_from_config, workers_params)
            rez = np.concatenate([result for _, result in results], axis=0)
            trajs = []
            for t, _ in results:
                trajs += t
            print("BFTQ({:.2f}) : {}".format(beta, format_results(rez)))

            if isinstance(mock_env, EnvGridWorld):
                from ncarrara.utils_rl.environments.gridworld.world import World
                w = World(mock_env)
                w.draw_frame()
                w.draw_lattice()
                w.draw_cases()
                w.draw_test_trajectories(trajs)
                pp = (workspace / "trajs" / "trajs_beta").as_posix()
                w.save(pp + "={:.2f}".format(beta))

        if isinstance(mock_env, EnvGridWorld):
            os.system("convert -delay 10 -loop 0 " + workspace.as_posix() + "/trajs/" + "*.png "
                      + workspace.as_posix() + "/out.gif")
    except FileNotFoundError as e:
        logger.warning("Could not load policy: {}".format(e))
def main(generate_envs, feature_str, gamma, gamma_c, ftq_params, ftq_net_params, device,
         epsilon_decay, N_trajs, trajs_by_ftq_batch, normalize_reward, workspace, seed,
         save_memory, general, lambda_=0, **args):
    envs, params = envs_factory.generate_envs(**generate_envs)
    e = envs[0]
    set_seed(seed, e)
    rm = Memory()
    feature = feature_factory(feature_str)

    def build_fresh_ftq():
        ftq = PytorchFittedQ(
            device=device,
            policy_network=NetFTQ(n_in=len(feature(e.reset(), e)),
                                  n_out=e.action_space.n,
                                  **ftq_net_params),
            action_str=None if not hasattr(e, "action_str") else e.action_str,
            test_policy=None,
            gamma=gamma,
            **ftq_params)
        return ftq

    # Prepare learning
    i_traj = 0
    decays = math_utils.epsilon_decay(**epsilon_decay, N=N_trajs, savepath=workspace)
    batch_sizes = near_split(N_trajs, size_bins=trajs_by_ftq_batch)
    pi_epsilon_greedy_config = {
        "__class__": repr(EpsilonGreedyPolicy),
        "pi_greedy": {"__class__": repr(RandomPolicy)},
        "pi_random": {"__class__": repr(RandomPolicy)},
        "epsilon": decays[0]
    }

    # Main loop
    trajs = []
    for batch, batch_size in enumerate(batch_sizes):
        # Prepare workers
        cpu_processes = min(general["cpu"]["processes_when_linked_with_gpu"] or os.cpu_count(),
                            batch_size)
        workers_n_trajectories = near_split(batch_size, cpu_processes)
        workers_start = np.cumsum(workers_n_trajectories)
        workers_traj_indexes = [np.arange(*bounds) for bounds in
                                zip(np.insert(workers_start[:-1], 0, 0), workers_start)]
        workers_seeds = np.random.randint(0, 10000, cpu_processes).tolist()
        workers_epsilons = [decays[i_traj + indexes] for indexes in workers_traj_indexes]
        workers_params = list(zip_with_singletons(
            generate_envs, pi_epsilon_greedy_config, workers_seeds, gamma, gamma_c,
            workers_n_trajectories, None, workers_epsilons, None, general["dictConfig"]))

        # Collect trajectories
        logger.info("Collecting trajectories with {} workers...".format(cpu_processes))
        if cpu_processes == 1:
            results = [execute_policy_from_config(*workers_params[0])]
        else:
            with Pool(processes=cpu_processes) as pool:
                results = pool.starmap(execute_policy_from_config, workers_params)
        i_traj += sum(len(trajectories) for trajectories, _ in results)

        # Fill memory
        for trajectories, _ in results:
            for trajectory in trajectories:
                for sample in trajectory:
                    rm.push(*sample)
        transitions_ftq, _ = datas_to_transitions(rm.memory, e, feature, lambda_, normalize_reward)

        # Fit model
        logger.info("[BATCH={}]---------------------------------------".format(batch))
        logger.info("[BATCH={}][learning ftq pi greedy] #samples={} #traj={}"
                    .format(batch, len(transitions_ftq), i_traj))
        logger.info("[BATCH={}]---------------------------------------".format(batch))
        ftq = build_fresh_ftq()
        ftq.reset(True)
        ftq.workspace = workspace / "batch={}".format(batch)
        makedirs(ftq.workspace)

        if isinstance(e, EnvGridWorld):
            for trajectories, _ in results:
                for traj in trajectories:
                    trajs.append(traj)
            w = World(e)
            w.draw_frame()
            w.draw_lattice()
            w.draw_cases()
            w.draw_source_trajectories(trajs)
            w.save((ftq.workspace / "bftq_on_2dworld_sources").as_posix())

        ftq.fit(transitions_ftq)

        # Save policy
        network_path = ftq.save_policy()
        os.system("cp {}/policy.pt {}/final_policy.pt".format(ftq.workspace, workspace))

        # Update greedy policy
        pi_epsilon_greedy_config["pi_greedy"] = {
            "__class__": repr(PytorchFittedPolicy),
            "feature_str": feature_str,
            "network_path": network_path,
            "device": ftq.device
        }

    if save_memory is not None:
        rm.save(workspace / save_memory["path"], save_memory["as_json"])
def run_dqn(env, id, workspace, device, net_params, dqn_params, decay, N, seed, feature_dqn,
            start_decay, gamma=None, transfer_module=None, evaluate_greedy_policy=True,
            traj_max_size=None, writer=None):
    size_state = len(feature_dqn(env.reset()))
    net = NetDQN(n_in=size_state, n_out=env.action_space.n, **net_params)
    dqn = TDQN(
        id=id,
        policy_network=net,
        device=device,
        transfer_module=transfer_module,
        workspace=workspace,
        writer=writer,
        feature=feature_dqn,
        info={"env": env},
        **dqn_params)
    dqn.reset()
    set_seed(seed=seed, env=env)
    rrr = []         # discounted return of each (epsilon-greedy) training episode
    rrr_greedy = []  # discounted return of the greedy evaluation episode run after each training episode
    epsilons = epsilon_decay(start=start_decay, decay=decay, N=N, savepath=workspace)
    nb_samples = 0
    memory = Memory()
    for n in range(N):
        s = env.reset()
        done = False
        rr = 0
        it = 0
        while not done:
            # Epsilon-greedy action selection, restricted to executable actions if the env exposes them.
            if random.random() < epsilons[n]:
                if hasattr(env, "action_space_executable"):
                    a = np.random.choice(env.action_space_executable())
                else:
                    a = env.action_space.sample()
            else:
                if hasattr(env, "action_space_executable"):
                    executable = env.action_space_executable()
                    action_mask = np.ones(env.action_space.n)
                    for ex in executable:
                        action_mask[ex] = 0.
                    a = dqn.pi(s, action_mask)
                else:
                    a = dqn.pi(s, np.zeros(env.action_space.n))
            s_, r_, done, info = env.step(a)
            done = done or (traj_max_size is not None and it >= traj_max_size - 1)
            rr += r_ * (gamma ** it)
            t_dqn = (s, a, r_, s_, done, info)
            memory.push(s, a, r_, s_, done, info)
            dqn.update(*t_dqn)
            s = s_
            nb_samples += 1
            it += 1
        # if writer is not None:
        #     writer.add_scalar('{} return/episode', rr, n)
        rrr.append(rr)

        if evaluate_greedy_policy:
            s = env.reset()
            done = False
            rr_greedy = 0
            it = 0
            while not done:
                if hasattr(env, "action_space_executable"):
                    executable = env.action_space_executable()
                    action_mask = np.ones(env.action_space.n)
                    for ex in executable:
                        action_mask[ex] = 0.
                    a = dqn.pi(s, action_mask)
                else:
                    a = dqn.pi(s, np.zeros(env.action_space.n))
                s_, r_, done, info = env.step(a)
                done = done or (traj_max_size is not None and it >= traj_max_size - 1)
                rr_greedy += r_ * (gamma ** it)
                s = s_
                it += 1
            rrr_greedy.append(rr_greedy)
            if writer is not None:
                writer.add_scalar('{}_return_greedy/episode'.format(id), rr_greedy, n)

    # Plot the evolution of any diagnostic statistics the DQN exposes.
    import matplotlib.pyplot as plt
    for param_stat in ["weights_over_time", "biais_over_time", "ae_errors_over_time",
                       "p_over_time", "best_fit_over_time", "error_bootstrap_source",
                       "error_bootstrap_partial"]:
        if hasattr(dqn, param_stat):
            var = getattr(dqn, param_stat)
            plt.plot(range(len(var)), var)
            plt.title(param_stat)
            plt.savefig(workspace / param_stat)
            plt.close()
    return rrr, rrr_greedy, memory, dqn
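# Illustrative sketch (not part of the original code): plotting the learning curves returned
# by run_dqn above. rrr holds the discounted return of each epsilon-greedy training episode and
# rrr_greedy the return of the greedy evaluation episode run after it; the helper name below is
# an assumption for the sketch.
def plot_dqn_returns(rrr, rrr_greedy, savepath):
    import matplotlib.pyplot as plt
    plt.plot(range(len(rrr)), rrr, label="epsilon-greedy return")
    plt.plot(range(len(rrr_greedy)), rrr_greedy, label="greedy return")
    plt.xlabel("episode")
    plt.ylabel("discounted return")
    plt.legend()
    plt.savefig(savepath)
    plt.close()

# Typical use after training:
#     rrr, rrr_greedy, memory, dqn = run_dqn(env, id="demo", workspace=workspace, ...)
#     plot_dqn_returns(rrr, rrr_greedy, workspace / "returns.png")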
def main(policy_path, generate_envs, feature_str, device, workspace, bftq_params, seed, general,
         betas_test, N_trajs, gamma, gamma_c, bftq_net_params, **args):
    if not os.path.isabs(policy_path):
        policy_path = workspace / policy_path
    env = envs_factory.generate_envs(**generate_envs)[0][0]
    feature = feature_factory(feature_str)
    bftq = PytorchBudgetedFittedQ(
        device=device,
        workspace=workspace,
        actions_str=get_actions_str(env),
        policy_network=NetBFTQ(size_state=len(feature(env.reset(), env)),
                               n_actions=env.action_space.n,
                               **bftq_net_params),
        gamma=gamma,
        gamma_c=gamma_c,
        cpu_processes=general["cpu"]["processes"],
        env=env,
        hull_options=general["hull_options"],
        **bftq_params)
    bftq.reset(True)
    pi_config = {
        "__class__": repr(PytorchBudgetedFittedPolicy),
        "feature_str": feature_str,
        "network_path": policy_path,
        "betas_for_discretisation": eval(bftq_params["betas_for_discretisation"]),
        "device": device,
        "hull_options": general["hull_options"],
        "clamp_Qc": bftq_params["clamp_Qc"],
        "env": env
    }
    pi = policy_factory(pi_config)

    # Iterate over betas
    for beta in eval(betas_test):
        logger.info("Rendering with beta={}".format(beta))
        set_seed(seed, env)
        for traj in range(N_trajs):
            done = False
            pi.reset()
            info_env = {}
            info_pi = {"beta": beta}
            t = 0

            # Make a workspace for trajectories
            traj_workspace = workspace / "trajs" / "beta={}".format(beta) / "traj={}".format(traj)
            makedirs(traj_workspace)
            bftq.workspace = traj_workspace
            monitor = MonitorV2(env, traj_workspace, add_subdirectory=False)
            obs = monitor.reset()

            # Run trajectory
            while not done:
                action_mask = get_action_mask(env)
                info_pi = merge_two_dicts(info_pi, info_env)
                bftq.draw_Qr_and_Qc(obs, pi.network, "render_t={}".format(t), show=False)
                a, _, info_pi = pi.execute(obs, action_mask, info_pi)
                render(env, workspace, t, a)
                obs, _, done, info_env = monitor.step(a)
                t += 1
            monitor.close()