def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config, env='BomberMan-v0')
    init_w = trainer.get_policy('policy_01').get_weights()
    trainer.restore(
        'C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-25_08-56-43eo23nmho\\checkpoint_002360\\checkpoint-2360'
    )
    trainer.workers.foreach_worker(
        lambda w: w.get_policy('policy_01').set_weights(init_w))
    trainer.restore('.\\kill-policy-0\\checkpoint')
    # trainer.import_model() requires an import file path
    # (trainer.import_model(import_file=...)); left disabled here.
    iter = 0
    #def update_phase(ev):
    #    ev.foreach_env(lambda e: e.set_phase(phase))
    while True:
        iter += 1
        result = trainer.train()
        if iter % 200 == 0:
            if not os.path.exists(f'./model-{iter}'):
                #trainer.get_policy('policy_01').export_model(f'./model-{iter}')
                trainer.export_policy_model(f'./model-{iter}/main', 'policy_01')
                trainer.export_policy_model(f'./model-{iter}/collect', 'policy_collect')
                trainer.export_policy_model(f'./model-{iter}/destroy', 'policy_destroy')
                trainer.export_policy_model(f'./model-{iter}/kill', 'policy_kill')
            else:
                print("model already saved")
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config, env='BomberMan-v0')
    trainer.restore(
        'C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-26_20-15-082mjvde9i\\checkpoint_008980\\checkpoint-8980'
    )
    iter = 0
    while True:
        iter += 1
        result = trainer.train()
        if iter % 200 == 0:
            if not os.path.exists(f'./model-{iter}'):
                trainer.get_policy('policy_01').export_model(f'./model-{iter}')
            else:
                print("model already saved")
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config, env='BomberMan-v0')
    #trainer.restore('C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-16_09-20-44984tj3ip\\checkpoint_002770\\checkpoint-2770')
    iter = 0
    #def update_phase(ev):
    #    ev.foreach_env(lambda e: e.set_phase(phase))
    while True:
        iter += 1
        result = trainer.train()
        if iter % 200 == 1:
            if not os.path.exists(f'./model-{iter}'):
                trainer.get_policy('policy_01').export_model(f'./model-{iter}')
            else:
                print("model already saved")
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config)
    if checkpoint_dir:
        trainer.load_checkpoint(checkpoint_dir)
    chk_freq = 10
    if useModelFromLowLevelTrain:
        config_low["num_workers"] = 0
        config_low["num_envs_per_worker"] = 1
        config_low["num_gpus"] = 1
        agentLow = PPOTrainer(config_low)
        agentLow.restore(
            "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".format(
                experiment_name, experiment_id, checkpoint_num, checkpoint_num))
        lowWeight = agentLow.get_policy().get_weights()
        highWeight = trainer.get_policy("low_level_policy").get_weights()
        lowState = agentLow.get_policy().get_state()
        importedOptState = OrderedDict([
            (k.replace("default_policy", "low_level_policy"), v)
            for k, v in lowState["_optimizer_variables"].items()
        ])
        importedPolicy = {
            hw: lowWeight[lw]
            for hw, lw in zip(highWeight.keys(), lowWeight.keys())
        }
        importedPolicy["_optimizer_variables"] = importedOptState
        trainer.get_policy("low_level_policy").set_state(importedPolicy)
        chk_freq = 1  # Only needed once at the start to save the imported model
    while True:
        result = trainer.train()
        tune.report(**result)
        if trainer._iteration % chk_freq == 0:
            with tune.checkpoint_dir(step=trainer._iteration) as checkpoint_dir:
                trainer.save(checkpoint_dir)
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config, env='BomberMan-v0')
    #trainer.restore('C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-10_14-16-50n_4knahb\\checkpoint_002700\\checkpoint-2700')
    iter = 0

    def update_phase(ev):
        ev.foreach_env(lambda e: e.set_phase(phase))

    phase = 2
    trainer.workers.foreach_worker(update_phase)
    while True:
        iter += 1
        result = trainer.train()
        if iter % 250 == 0:
            if not os.path.exists(f'./model-{iter}'):
                trainer.get_policy('policy_01').export_model(f'./model-{iter}')
            else:
                print("model already saved")
        '''
        if phase == 1 and result["policy_reward_mean"]["policy_01"] > 2:
            print(f'Phase 2 now.')
            phase = 2
            trainer.workers.foreach_worker(update_phase)
        '''
        if phase == 1 and result["policy_reward_mean"]["policy_01"] > 3:
            print(f'Phase 2 now.')
            phase = 2
            trainer.workers.foreach_worker(update_phase)
            #trainer.config['gamma'] = 0.995
        if phase == 0 and result["policy_reward_mean"]["policy_01"] > 3.5:
            print(f'Phase 1 now.')
            phase = 1
            trainer.workers.foreach_worker(update_phase)
def main():
    ray.init()
    # Hyperparameters of PPO are not well tuned. Most of them refer to
    # https://github.com/xtma/pytorch_car_caring/blob/master/train.py
    trainer = PPOTrainer(
        env=MyEnv,
        config={
            "use_pytorch": True,
            "model": {
                "custom_model": "mymodel",
                "custom_options": {
                    'encoder_path': args.encoder_path,
                    'train_encoder': args.train_encoder
                },
                "custom_action_dist": "mydist",
            },
            "env_config": {
                'game': 'CarRacing'
            },
            "num_workers": args.num_workers,
            "num_envs_per_worker": args.num_envs_per_worker,
            "num_gpus": args.num_gpus,
            "use_gae": args.use_gae,
            "batch_mode": args.batch_mode,
            "vf_loss_coeff": args.vf_loss_coeff,
            "vf_clip_param": args.vf_clip_param,
            "lr": args.lr,
            "kl_coeff": args.kl_coeff,
            "num_sgd_iter": args.num_sgd_iter,
            "grad_clip": args.grad_clip,
            "clip_param": args.clip_param,
            "rollout_fragment_length": args.rollout_fragment_length,
            "train_batch_size": args.train_batch_size,
            "sgd_minibatch_size": args.sgd_minibatch_size
        })
    for i in range(args.train_epochs):
        trainer.train()
        print("%d Train Done" % i, "Save Freq: %d" % args.model_save_freq)
        if (i + 1) % args.model_save_freq == 0:
            print("%d Episodes Done" % i)
            weights = trainer.get_policy().get_weights()
            torch.save(weights, args.model_save_path + "%d-mode.pt" % (i + 1))
            trainer.save(args.trainer_save_path)
    print("Done All!")
    trainer.stop()
fullpath1 = checkpoint_path + checkpoint1
checkpoint2 = "checkpoint_000005/checkpoint-5"
fullpath2 = checkpoint_path + checkpoint2
sum1a = 0
sum1b = 0
sum2a = 0
sum2b = 0
if os.path.exists(fullpath1):
    print('path FOUND!')
    print("Restoring from checkpoint path", fullpath1)
    trainer.restore(fullpath1)
    temp = trainer.get_policy().model._curiosity_feature_net
    sum1a = sum(v.sum() for k, v in trainer.get_policy().get_weights().items())
    sum1b = sum(
        v.eval(trainer.get_policy()._sess).sum()
        for v in trainer.get_policy().model._curiosity_feature_net.variables())
else:
    print("That path does not exist!")
if os.path.exists(fullpath2):
    print('path FOUND!')
    print("Restoring from checkpoint path", fullpath2)
    trainer2.restore(fullpath2)
    sum2a = sum(v.sum() for k, v in trainer2.get_policy().get_weights().items())
    sum2b = sum(
        v.eval(trainer2.get_policy()._sess).sum()
        for v in trainer2.get_policy().model._curiosity_feature_net.variables())
else:
    print("That path does not exist!")
# (210, 160, 3)
prep.transform(env.reset()).shape
# (84, 84, 3)
# __preprocessing_observations_end__

# __query_action_dist_start__
# Get a reference to the policy
import numpy as np
from ray.rllib.agents.ppo import PPOTrainer

trainer = PPOTrainer(env="CartPole-v0", config={
    "framework": "tf2",
    "num_workers": 0
})
policy = trainer.get_policy()
# <ray.rllib.policy.eager_tf_policy.PPOTFPolicy_eager object at 0x7fd020165470>

# Run a forward pass to get model output logits. Note that complex observations
# must be preprocessed as in the above code block.
logits, _ = policy.model({"obs": np.array([[0.1, 0.2, 0.3, 0.4]])})
# (<tf.Tensor: id=1274, shape=(1, 2), dtype=float32, numpy=...>, [])

# Compute action distribution given logits
policy.dist_class
# <class_object 'ray.rllib.models.tf.tf_action_dist.Categorical'>
dist = policy.dist_class(logits, policy.model)
# <ray.rllib.models.tf.tf_action_dist.Categorical object at 0x7fd02301d710>

# Query the distribution for samples, sample logps
dist.sample()
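# A minimal follow-on sketch (an addition, not part of the snippet above): the
# same distribution object can score arbitrary actions. logp() and entropy()
# are methods of RLlib's Categorical action distribution; the action batch
# below is illustrative.
import tensorflow as tf

actions = tf.constant([0])      # hypothetical batch with a single action
log_probs = dist.logp(actions)  # log-likelihood of each action under the policy
entropy = dist.entropy()        # per-sample entropy of the distribution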
agentLow = PPOTrainer(config_low)
experiment_name = "HWalk_Low_Mimic"
experiment_id = "PPO_HumanoidBulletEnvLow-v0_699c9_00000_0_2021-04-18_22-14-39"
checkpoint_num = "1930"
agentLow.restore(
    "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".format(
        experiment_name, experiment_id, checkpoint_num, checkpoint_num))
# agent.export_policy_model("out/model", "default_policy")
# agent.import_model("out/model")
# agent.get_policy("default_policy").import_model_from_h5

agentHigh = PPOTrainer(config_hier)
lowWeight = agentLow.get_policy().get_weights()
highWeight = agentHigh.get_policy("low_level_policy").get_weights()
importedPolicy = {
    hw: lowWeight[lw]
    for hw, lw in zip(highWeight.keys(), lowWeight.keys())
}
s1 = agentLow.get_policy().get_state()
s11 = OrderedDict([(k.replace("default_policy", "low_level_policy"), v)
                   for k, v in s1['_optimizer_variables'].items()])
importedPolicy['_optimizer_variables'] = s11
agentHigh.get_policy("low_level_policy").set_state(importedPolicy)

obs = single_env.low_level_obs_space.sample()
print(agentLow.compute_action(obs))
print(agentHigh.compute_action(obs, policy_id='low_level_policy'))
print("=============================================================")
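# A hedged sanity check (an addition, not part of the original script): after
# set_state, the high-level trainer's "low_level_policy" should hold the same
# tensors as the restored low-level policy, so one np.allclose pass over the
# zipped keys confirms the transfer.
import numpy as np

transferred = agentHigh.get_policy("low_level_policy").get_weights()
for hw, lw in zip(transferred.keys(), lowWeight.keys()):
    assert np.allclose(transferred[hw], lowWeight[lw]), f"weight mismatch at {hw}"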
del config['num_workers']
del config['num_gpus']

ray.init(num_cpus=8, num_gpus=1)
PPOagent = PPOTrainer(env=env_name, config=config)
PPOagent.restore(checkpoint_path)

reward_sum = 0
frame_list = []
i = 0
env.reset()
for agent in env.agent_iter():
    observation, reward, done, info = env.last()
    reward_sum += reward
    if done:
        action = None
    else:
        action, _, _ = PPOagent.get_policy("policy_0").compute_single_action(observation)
    env.step(action)
    i += 1
    if i % (len(env.possible_agents) + 1) == 0:
        frame_list.append(PIL.Image.fromarray(env.render(mode='rgb_array')))
env.close()

print(reward_sum)
frame_list[0].save("out.gif", save_all=True,
                   append_images=frame_list[1:], duration=3, loop=0)
obs_space = TrainerConfig.OBS_SPACE_CONNECT3
print("The observation space is: ")
print(obs_space)
print("The action space is: ")
act_space = TrainerConfig.ACT_SPACE_CONNECT3
print(act_space)

trainer_obj = PPOTrainer(config=TrainerConfig.PPO_TRAINER_CONNECT3, )

restored_weights = []
weights = np.load(weights_file, allow_pickle=True)
weights_name = ["p" + str(i + 1) for i in range(weights_to_keep)]
for name in weights_name:
    restored_weights.append(weights[()][name])
    trainer_obj.callbacks.add_weights(restored_weights[-1])

for i, weights in enumerate(restored_weights):
    trainer_obj.get_policy("player1").set_weights(weights)
    model_to_evaluate = trainer_obj.get_policy("player1").model
    updated_weights = trainer_obj.get_policy("player1").get_weights()
    print("there are " + str(len(weights)) + " weights")
    indx = 0
    equal_weights = []
    for w1, w2 in zip(weights, updated_weights):
        if np.array_equal(w1, w2):
            equal_weights.append(indx)
        indx += 1
    print(equal_weights)
    elo_diff, model_score, minimax_score, draw = model_vs_minimax_connect3(
ppo_config[
    'num_workers'] = 4  # noptepochs (int) Number of epochs when optimizing the surrogate
ppo_config[
    'clip_param'] = 0.2  # cliprange (float or callable) Clipping parameter, it can be a function
ppo_config[
    'vf_clip_param'] = 1  # cliprange_vf = None? -- (float or callable) Clipping parameter for the value function,
# it can be a function. This is a parameter specific to the OpenAI implementation. If None is passed (default), then
# cliprange (that is used for the policy) will be used. IMPORTANT: this clipping depends on the reward scaling. To
# deactivate value function clipping (and recover the original PPO implementation), you have to pass a negative value
# (e.g. -1).
ppo_config['env_config'] = env_config
ppo_config['train_batch_size'] = 4000
ppo_config['explore'] = False

PPO_agent = PPOTrainer(config=ppo_config, env=SSA_Tasker_Env)
PPO_agent.restore(ppo_checkpoint)
PPO_agent.get_policy().config['explore'] = False

logdir = '/home/ash/ray_results/ssa_experiences/agent_visible_greedy_spoiled/' + str(
    env_config['rso_count']) + 'RSOs_jones_flatten_10000episodes/'

marwil_config = MARWIL_CONFIG.copy()
marwil_config['evaluation_num_workers'] = 1
marwil_config['env_config'] = env_config
marwil_config['evaluation_interval'] = 1
marwil_config['evaluation_config'] = {'input': 'sampler'}
marwil_config['beta'] = 1  # 0
marwil_config['input'] = logdir
marwil_config['env_config'] = env_config
marwil_config['explore'] = False

MARWIL_agent = MARWILTrainer(config=marwil_config, env=SSA_Tasker_Env)
def load_agent():
    # Initialize training environment
    ray.init()

    def environment_creater(params=None):
        agent = SimpleAvoidAgent(noise=0.05)
        return TronRayEnvironment(board_size=13, num_players=4)

    env = environment_creater()
    tune.register_env("tron_multi_player", environment_creater)
    ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

    # Configure Deep Q Learning with reasonable values
    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 4
    #config['num_gpus'] = 1
    #config["timesteps_per_iteration"] = 1024
    #config['target_network_update_freq'] = 256
    #config['buffer_size'] = 100_000
    #config['schedule_max_timesteps'] = 200_000
    #config['exploration_fraction'] = 0.02
    #config['compress_observations'] = False
    #config['n_step'] = 2
    #config['seed'] = SEED

    # Configure for PPO
    #config["sample_batch_size"] = 100
    #config["train_batch_size"] = 200
    #config["sgd_minibatch_size"] = 60

    # Configure A3C with reasonable values

    # We will use a simple convolution network with 3 layers as our feature extractor
    config['model']['vf_share_layers'] = True
    config['model']['conv_filters'] = [(512, 5, 1), (256, 3, 2), (128, 3, 2)]
    config['model']['fcnet_hiddens'] = [256]
    config['model']['custom_preprocessor'] = 'tron_prep'

    # All of the models will use the same network as before
    agent_config = {
        "model": {
            "vf_share_layers": True,
            "conv_filters": [(512, 5, 1), (256, 3, 2), (128, 3, 2)],
            "fcnet_hiddens": [256],
            "custom_preprocessor": 'tron_prep'
        }
    }

    def policy_mapping_function(x):
        if x == '0':
            return "trainer"
        return "opponent"

    config['multiagent'] = {
        "policy_mapping_fn": policy_mapping_function,
        "policies": {
            "trainer": (None, env.observation_space, env.action_space, agent_config),
            "opponent": (None, env.observation_space, env.action_space, agent_config)
        },
        "policies_to_train": ["trainer"]
    }

    # Begin training or evaluation
    #trainer = DDPGTrainer(config, "tron_single_player")
    #trainer = A3CTrainer(config, "tron_single_player")
    #trainer = MARWILTrainer(config, "tron_single_player")
    trainer = PPOTrainer(config, "tron_multi_player")
    trainer.restore("./sp_checkpoint_1802/checkpoint-1802")

    return trainer.get_policy("trainer")
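# Hedged usage sketch (an addition): the Policy returned by load_agent() can
# be queried directly with compute_single_action, mirroring the call used in
# other snippets in this collection; `obs` is assumed to be a preprocessed
# observation from the same Tron environment.
policy = load_agent()
action, _, _ = policy.compute_single_action(obs, state=[])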
def execute(self):
    timesteps = 0
    best_period_value = None

    if self.pr.agent.name() == "A2C":
        trainer = A2CTrainer(config=self.rllib_config,
                             logger_creator=rllib_logger_creator)
    elif self.pr.agent.name() == "PPO":
        trainer = PPOTrainer(config=self.rllib_config,
                             logger_creator=rllib_logger_creator)
        # import pdb; pdb.set_trace()
    else:
        raise ValueError('There is no rllib trainer with name ' +
                         self.pr.agent.name())

    tf_writer = SummaryWriter(
        self.pr.save_logs_to) if self.pr.save_logs_to else None

    reward_metric = Metric(short_name='rews',
                           long_name='trajectory reward',
                           formatting_string='{:5.1f}',
                           higher_is_better=True)
    time_step_metric = Metric(short_name='steps',
                              long_name='total number of steps',
                              formatting_string='{:5.1f}',
                              higher_is_better=True)
    metrics = [reward_metric, time_step_metric]

    if self.pr.train:
        start_time = time.time()
        policy_save_tag = 0
        while timesteps < self.pr.total_steps:
            result = trainer.train()
            timesteps = result["timesteps_total"]
            reward_metric.log(result['evaluation']['episode_reward_mean'])
            time_step_metric.log(result['evaluation']['episode_len_mean'])
            # import pdb; pdb.set_trace()

            # # Get a metric list from each environment.
            # if hasattr(trainer, "evaluation_workers"):
            #     metric_lists = sum(trainer.evaluation_workers.foreach_worker(
            #         lambda w: w.foreach_env(lambda e: e.metrics)), [])
            # else:
            #     metric_lists = sum(trainer.workers.foreach_worker(
            #         lambda w: w.foreach_env(lambda e: e.metrics)), [])
            # metrics = metric_lists[0]

            # # Aggregate metrics from all other environments.
            # for metric_list in metric_lists[1:]:
            #     for i, metric in enumerate(metric_list):
            #         metrics[i]._values.extend(metric._values)

            save_logs_to = self.pr.save_logs_to
            model_save_paths_dict = self.pr.model_save_paths_dict

            # Consider whether to save a model.
            saved = False
            if model_save_paths_dict is not None and metrics[0].currently_optimal:
                # trainer.get_policy().model.save(model_save_paths_dict)
                policy_save_tag += 1
                trainer.get_policy().model.save_model_in_progress(
                    model_save_paths_dict, policy_save_tag)
                saved = True

            # Write the metrics for this reporting period.
            total_seconds = time.time() - start_time
            logger.write_and_condense_metrics(total_seconds, 'iters', timesteps,
                                              saved, metrics, tf_writer)

            # Clear the metrics, both those maintained by the training workers
            # and by the evaluation ones.
            condense_fn = lambda environment: [
                m.condense_values() for m in environment.metrics
            ]
            trainer.workers.foreach_worker(
                lambda w: w.foreach_env(condense_fn))
            if hasattr(trainer, "evaluation_workers"):
                trainer.evaluation_workers.foreach_worker(
                    lambda w: w.foreach_env(condense_fn))
    else:
        start_time = time.time()
        env = trainer.workers.local_worker().env
        metrics = env.metrics
        worker = trainer.workers.local_worker()
        steps = steps_since_report = 0

        while True:
            batch = worker.sample()
            current_steps = len(batch["obs"])
            steps += current_steps
            steps_since_report += current_steps

            if steps_since_report >= self.pr.reporting_interval:
                total_seconds = time.time() - start_time

                # Write the metrics for this reporting period.
                logger.write_and_condense_metrics(total_seconds, 'iters', steps,
                                                  False, metrics, tf_writer)
                steps_since_report = 0
                if steps >= self.pr.total_steps:
                    break

        env.close()

    # Get a summary metric for the entire stage, based on the environment's
    # first metric.
    summary_metric = logger.summarize_stage(metrics[0])

    # Temporary workaround for https://github.com/ray-project/ray/issues/8205
    ray.shutdown()
    _register_all()

    return summary_metric
#     results_path = os.path.split(checkpoint_path)[0]
# else:
#     results_path = args.results_path
# evaluator.evaluate(trainer, results_path)

###########################################################
# Visualize salient map
if args.visualize_salient_obj:
    HIGH_RES_OUTPUT = True
    if HIGH_RES_OUTPUT:
        out = cv2.VideoWriter('salient_obj_video.mp4',
                              cv2.VideoWriter_fourcc(*"MJPG"), 30, (320, 320))
    else:
        out = cv2.VideoWriter('salient_obj_video.mp4',
                              cv2.VideoWriter_fourcc(*"MJPG"), 30,
                              eval(config["env_config"]["resized_input_shape"]))
    model = tf.keras.models.clone_model(
        trainer.get_policy().model.base_model)  # type: tf.keras.Model
    env = launch_and_wrap_env(config["env_config"])
    obs_wrappers, _, _ = get_wrappers(env)
    env.reset()
    obs = env.reset()
    done = False
    while not done:
        salient_map_mean, action_dist_params = nvidia_salient_map(model, obs)
        if HIGH_RES_OUTPUT:
            render = env.render_obs()
            displayed_obs = obs_wrappers[0].observation(render)  # Clipping wrapper, shouldn't be hardcoded
            displayed_obs = cv2.resize(
                displayed_obs, (displayed_obs.shape[0], displayed_obs.shape[0]),
                interpolation=cv2.INTER_AREA)
            displayed_obs = (displayed_obs / 255.).astype(np.float32)
        else:
            displayed_obs = obs
if restore_ckpt:
    best_ckpt = restore_training(trainer_obj, ckpt_dir, custom_metrics_file)
else:
    best_ckpt = 0
    print("Starting training from scratch")

for epoch in tqdm(range(best_ckpt + 1, epochs)):
    print("Epoch " + str(epoch))
    results = trainer_obj.train()
    p1_score = results["custom_metrics"]["player1_score"]
    minimax_score = results["custom_metrics"]["player2_score"]
    score_difference = results["custom_metrics"]["score_difference"]
    actual_depth = trainer_obj.get_policy("minimax").depth

    if epoch % ckpt_step == 0 and epoch != 0:
        custom_metrics = results["custom_metrics"]
        save_checkpoint(trainer_obj, ckpt_dir, custom_metrics_file,
                        custom_metrics, ckpt_to_keep)

    if p1_score >= minimax_score:
        print("Player 1 was able to beat MiniMax algorithm with depth " +
              str(actual_depth))
        new_depth = actual_depth + 1
        print("Increasing Minimax depth to " + str(new_depth))
        trainer_obj.get_policy("minimax").depth = new_depth
        trainer_obj.save(Config.IMPORTANT_CKPT_PATH)
        if new_depth > max_depth:
def run_saved(args):
    if args.OSM[0] == 1 and args.OSM[1] == 0:
        setting = "RLvsOSM"
    elif args.OSM[0] == 1 and args.OSM[1] == 1:
        setting = "OSMvsOSM"
    else:
        setting = "RL{0}".format(len(args.alphas) - sum(args.honest))
    if args.save_path == 'none':
        checkpointnum = 0
    else:
        checkpointnum = args.save_path.split('-')[-1]
    env_name = "{setting}_{spirit}_{blocks}_{alpha:04d}_{spy}_{checkpointnum}".format(
        spirit=int(args.team_spirit * 100),
        blocks=int(args.blocks),
        alpha=int(args.alphas[0] * 10000),
        spy=args.spy[1],
        setting=setting,
        checkpointnum=checkpointnum)
    ray.init(local_mode=True,
             memory=700 * 1024 * 1024,
             object_store_memory=100 * 1024 * 1024,
             driver_object_store_memory=100 * 1024 * 102)
    print("Testing {0}".format(setting), env_name)

    def select_policy(agent_id):
        return agent_id

    ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)
    register_env(env_name, lambda config: ParametricBitcoin(config))

    if args.extended:
        action_n = 6
    else:
        action_n = 4

    # define the state space, one for parties that have access to spy info and
    # one without
    spy_state_space = constants.make_spy_space(len(args.alphas), args.blocks)
    blind_state_space = constants.make_blind_space(len(args.alphas), args.blocks)
    policies = dict()
    osm_space = spaces.Box(
        low=np.zeros(4),
        high=np.array([args.blocks + 4, args.blocks + 4, args.blocks + 4, 3.]))
    if sum(args.OSM) > 0:
        osm = OSM_strategy(osm_space, spaces.Discrete(4), {
            'alpha': args.alphas[0],
            'gamma': args.gammas[0],
            'blocks': args.blocks
        })
    blind_dim = 0
    for space in blind_state_space:
        blind_dim += get_preprocessor(space)(space).size
    spy_dim = 0
    for space in spy_state_space:
        spy_dim += get_preprocessor(space)(space).size
    spy_state_space_wrapped = spaces.Dict({
        "action_mask": spaces.Box(0, 1, shape=(action_n, )),
        "avail_actions": spaces.Box(-10, 10, shape=(action_n, action_n)),
        "bitcoin": spaces.Box(0, np.inf, shape=(spy_dim, ))
    })
    blind_state_space_wrapped = spaces.Dict({
        "action_mask": spaces.Box(0, 1, shape=(action_n, )),
        "avail_actions": spaces.Box(-10, 10, shape=(action_n, action_n)),
        "bitcoin": spaces.Box(0, np.inf, shape=(blind_dim, ))
    })
    preps = [None for i in range(len(args.alphas))]
    for i in range(len(args.alphas)):
        if args.spy[i] == 1:
            policies[str(i)] = (None, spy_state_space_wrapped,
                                spaces.Discrete(action_n), {
                                    "model": {
                                        "use_lstm": args.use_lstm,
                                        "custom_model": "pa_model",
                                        "custom_options": {
                                            "parties": len(args.alphas),
                                            "spy": True,
                                            "blocks": args.blocks,
                                            "extended": args.extended
                                        }
                                    }
                                })
            preps[i] = get_preprocessor(spy_state_space_wrapped)(
                spy_state_space_wrapped)
        elif args.OSM[i] == 1:
            policies[str(i)] = (OSM_strategy, osm_space, spaces.Discrete(4), {
                'alpha': args.alphas[0],
                'gamma': args.gammas[0],
                'blocks': args.blocks
            })
        elif args.honest[i] == 1:
            policies[str(i)] = (Honest, osm_space, spaces.Discrete(6), {
                'alpha': args.alphas[0],
                'gamma': args.gammas[0],
                'blocks': args.blocks,
                'fiftyone': args.fiftyone[i],
                'extended': args.extended
            })
        else:
            policies[str(i)] = (None, blind_state_space_wrapped,
                                spaces.Discrete(action_n), {
                                    "model": {
                                        "use_lstm": args.use_lstm,
                                        "custom_model": "pa_model",
                                        "custom_options": {
                                            "parties": len(args.alphas),
                                            "spy": False,
                                            "blocks": args.blocks,
                                            "extended": args.extended
                                        }
                                    }
                                })
            preps[i] = get_preprocessor(blind_state_space_wrapped)(
                blind_state_space_wrapped)
    env_config = {
        'max_hidden_block': args.blocks,
        'alphas': args.alphas,
        'gammas': args.gammas,
        'ep_length': args.ep_length,
        'print': args.debug,
        'spy': args.spy,
        'team_spirit': args.team_spirit,
        'OSM': args.OSM,
        'extended': args.extended,
        'honest': args.honest,
    }
    policies_to_train = [
        str(i) for i in range(len(args.alphas))
        if args.OSM[i] != 1 and args.honest[i] != 1
    ]
    env = ParametricBitcoin(env_config=env_config)

    if len(policies_to_train) != 0:
        if args.trainer == 'PPO':
            trainer = PPOTrainer(env=BitcoinEnv,
                                 config={
                                     "num_workers": 0,
                                     "multiagent": {
                                         "policies_to_train": policies_to_train,
                                         "policies": policies,
                                         "policy_mapping_fn": select_policy,
                                     },
                                     "env_config": env_config
                                 })
        else:
            trainer = DQNTrainer(env=env_name,
                                 config={
                                     "eager": True,
                                     "multiagent": {
                                         "policies_to_train": policies_to_train,
                                         "policies": policies,
                                         "policy_mapping_fn": select_policy,
                                     },
                                     "env_config": env_config
                                 })
        model = trainer.get_policy().model
        print(model.base_model.summary())
        print("Restoring model")
        trainer.restore(args.save_path)

    loaded_policies = dict()
    for k in range(len(args.alphas)):
        if args.OSM[k] == 1:
            loaded_policies[str(k)] = osm
        elif args.honest[k] == 1:
            honest = Honest(
                osm_space, spaces.Discrete(6), {
                    'alpha': args.alphas[0],
                    'gamma': args.gammas[0],
                    'blocks': args.blocks,
                    'fiftyone': args.fiftyone[k],
                    'extended': args.extended
                })
            loaded_policies[str(k)] = honest
            preps[k] = None
        else:
            loaded_policies[str(k)] = trainer.get_policy(str(k))

    trials = 100000
    reslist = []
    for j in range(3):
        blocks = np.zeros(len(args.alphas) + 1)
        event_blocks = np.zeros(len(args.alphas) + 1)
        action_dist = {
            str(i): np.zeros(action_n)
            for i in range(len(args.alphas))
        }
        res = dict()
        for i in range(trials):
            obs = env.reset()
            isDone = False
            RNNstates = {str(i): [] for i in range(len(args.alphas))}
            while not isDone:
                action_dict = dict()
                for k in range(len(policies)):
                    prep = preps[k]
                    if not prep:
                        action_dict[str(k)], _, _ = loaded_policies[
                            str(k)].compute_single_action(obs=obs[str(k)],
                                                          state=[])
                    else:
                        action_dict[str(k)], _, _ = loaded_policies[
                            str(k)].compute_single_action(
                                obs=prep.transform(obs[str(k)]), state=[])
                    action_dist[str(k)][action_dict[str(k)]] += 1
                obs, _, done, _ = env.step(action_dict)
                isDone = done['__all__']
            if i == 0 and j == 0:
                with open(
                        os.path.join('/afs/ece/usr/charlieh/eval_results',
                                     env_name + '_trace.txt'), 'w+') as f:
                    f.write(env.wrapped._debug_string)
            blocks += env.wrapped._accepted_blocks
            event_blocks += env.wrapped._total_blocks
            total_event_blocks = np.sum(event_blocks)
            if i % 100 == 0:
                print("Relative rewards", blocks / np.sum(blocks))
                print("Relative received", event_blocks / total_event_blocks)
                for i in range(len(args.alphas)):
                    print("Action dist", str(i),
                          action_dist[str(i)] / np.sum(action_dist[str(i)]))
        res['blocks'] = blocks
        res['action dist'] = action_dist
        res['blocks norm'] = blocks / np.sum(blocks)
        res['actions norm'] = {
            str(i): action_dist[str(i)] / np.sum(action_dist[str(i)])
            for i in range(len(args.alphas))
        }
        reslist.append(res)
    np.save(os.path.join('/afs/ece/usr/charlieh/eval_results', env_name),
            reslist, allow_pickle=True)
# Set up env
ray.init(**config["ray_init_config"])
register_env('Duckietown', launch_and_wrap_env)

###########################################################
# Restore agent
trainer = PPOTrainer(config=config["rllib_config"])
trainer.restore(checkpoint_path)
print_config(trainer.config)

###########################################################
# Visualize
HIGH_RES_OUTPUT = True

model = tf.keras.models.clone_model(
    trainer.get_policy().model.base_model)  # type: tf.keras.Model

cap = cv2.VideoCapture('./docs/Real.mp4')
# fourcc = cv2.VideoWriter_fourcc(*'FMP4')
fps = cap.get(cv2.CAP_PROP_FPS)
if HIGH_RES_OUTPUT:
    out = cv2.VideoWriter('salient_obj_video.mp4',
                          cv2.VideoWriter_fourcc(*"MJPG"), fps, (320, 320))
else:
    out = cv2.VideoWriter('salient_obj_video.mp4',
                          cv2.VideoWriter_fourcc(*"MJPG"), fps,
                          eval(config["env_config"]["resized_input_shape"]))

dummy_env = wrap_env(config["env_config"])
obs_wrappers, _, _ = get_wrappers(dummy_env)
    # restoring checkpoints requires ray
    ray.init()
    # best_ckpt = restore_training(trainer_obj, ckpt_dir, custom_metrics_file)
    with open(Config.MINIMAX_DEPTH_PATH) as json_file:
        data = json.load(json_file)
    minimax_depth = 3  #data["minimax_depth"]
    # restore weights from a previous run
    restored_weights = []
    weights = np.load(weights_file, allow_pickle=True)
    weights_name = ["p" + str(i + 1) for i in range(weights_to_keep)]
    for name in weights_name:
        restored_weights.append(weights[()][name])
        trainer_obj.callbacks.add_weights(restored_weights[-1])
    # give player 1 the best weights
    trainer_obj.get_policy("player1").set_weights(restored_weights[-1])
    ray.shutdown()
else:
    best_ckpt = 0
    minimax_depth = 1
    print("Starting training from scratch")

# import moved here, otherwise I get version compatibility issues by using
# the log_creator
import tensorflow as tf

number_of_stochastic_moves = 5
logdir = str(trainer_obj._logdir)
#update_percentage = update_times * 0.01
epoch_update = 0
for epoch in range(num_epoch):
    print("Training iteration: {}".format(epoch), end='\t')
    res = trainer.train()
    win_percentage = (res["policy_reward_mean"]["trainer"] -
                      res["episode_len_mean"]) / 11 - 10 / 11 + 1
    print("Win percentage: ", win_percentage, end='\t')
    print("Average reward: ", res["policy_reward_mean"]["trainer"])
    update_percentage = update_times * 0.01
    if win_percentage > 0.72 + update_percentage or win_percentage > 0.82:
        # and res["policy_reward_mean"]["trainer"] > 18 + update_times:
        if epoch_update == 0:
            epoch_update = epoch
        if epoch >= epoch_update + 5:
            update_times += 1
            epoch_update = epoch
            print("UPDATING OPPONENTS")
            trainer_weights = trainer.get_policy("trainer").get_weights()
            trainer.get_policy("opponent").set_weights(trainer_weights)
            reward = env.test(trainer)
    if epoch % save_epochs == 0:
        trainer.save()
    #print(res)
    #print("Average reward: ", res["policy_reward_mean"]["trainer"])
    if epoch % 1 == 0:
        reward = env.test(trainer)
# Serving and training loop
env = trainer.env_creator({})
# obs_state = {}
# obs_state["obs"] = obs[list(obs.keys())[0]]
player1 = Connect4Config.PLAYER1
player1_id = Connect4Config.PLAYER1_ID
player2 = Connect4Config.PLAYER2
player2_id = Connect4Config.PLAYER2_ID
actual_player = player1
actual_player_id = player1_id
obs = env.reset(player1_id)
obs = {"obs": obs[actual_player]}
action_dict = {}
while True:
    # action, state, info_trainer = trainer.get_policy(actual_player).compute_single_action(obs)
    # compute_action(obs[actual_player], policy_id=actual_player, explore=False)  #, full_fetch=True)
    action_logits, _ = trainer.get_policy(actual_player).model.forward(
        obs, None, None)
    action = np.argmax(action_logits[0])
    action_dict = {actual_player: action}
    print("Player " + str(actual_player_id + 1) + " picked column: " +
          str(action + 1))
    obs, reward, done, info = env.step(action_dict)
    print(env)
    if done["__all__"]:
        print("Player " + str(actual_player_id + 1) + " WON!!!!!!")
        obs = env.reset()
        break
    if actual_player == player1:
        actual_player = player2
        actual_player_id = player2_id
    else:
        actual_player = player1
import os

import ray
import ray.tune as tune
from ray.tune import sample_from
from fast_image_env import FastImageEnv
from fast_model import TorchFastModel, TorchCustomFastModel
from ray.rllib.models import ModelCatalog
from ray.rllib.agents.ppo import PPOTrainer

if __name__ == "__main__":
    ray.shutdown()
    ray.init()
    config = {
        "env": FastImageEnv,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": 1,
        "num_workers": 4,
        "framework": "torch",
    }
    trainer = PPOTrainer(config=config)
    print(trainer.get_policy().model)
    results = tune.run("PPO", config=config, verbose=3)
    print(results)
    ray.shutdown()
TOTAL_STEPS = int(args.total_steps)
launch_script = "./launchClient_quiet.sh"

register_env(ENV_NAME, create_env)

# update config with evaluation resources and switch exploration off
config = get_config(checkpoint_file)
config["num_workers"] = args.num_workers
config["num_gpus"] = args.num_gpus
config["explore"] = False

# Load agent
ray.init()
trainer = PPOTrainer(config)
trainer.restore(checkpoint_file)
policy = trainer.get_policy()

# Start Malmo instances
GAME_INSTANCE_PORTS = [COMMAND_PORT + i for i in range(NUM_WORKERS)]
instances = launch_minecraft(GAME_INSTANCE_PORTS, launch_script=launch_script)

# Connect to the Java instances
env = create_env(config)

# Custom evaluation loop
print(f"running evaluations for {EPISODES} episodes")
for ep in range(EPISODES):
    state = env.reset()
    done = False
    ep_length = 0
}, "observation_filter": "NoFilter", "clip_actions": False, "framework": "torch" }, env="MinerEnv-v0") id = 2050 checkpoint_dir = "/home/lucius/ray_results/gold_miner_2/PPO_MinerEnv-v0_0_2020-09-13_00-54-26q3mjnpej" checkpoint = "{}/checkpoint_{}/checkpoint-{}".format(checkpoint_dir, id, id) ppo_agent.restore(checkpoint) for i in range(8): mem_size = 0 weights = ppo_agent.get_policy(f"policy_{i}").get_weights() for key in weights: parameters = 1 for value in weights[key].shape: parameters *= value mem_size += parameters weights[key] = torch.tensor(weights[key]) print(mem_size) torch.save( weights, f"/home/lucius/working/projects/gold_miner/resources/TrainedModels/model_{i}.pt" ) # model = FourthModel(constants.OBS_SPACE, constants.ACT_SPACE, 6, {}, "model", constants.NUM_FEATURES)
lstm_weights = np.load(best_weights_npy, allow_pickle=True)
number_of_evaluation_games = Config.NUMBER_OF_EVALUATION_GAMES  # 100
number_of_games_to_test = Config.NUMBER_OF_GAMES_TO_TEST  # [1, 2, 3, 4, 5]
depth_list = Config.DEPTH_LIST  # [1, 4, 6]
number_of_stochastic_moves = 6
sequence_len = lstm_timesteps
npy_weights_file = os.path.join(data_dir, "weights.npy")
weights = np.load(npy_weights_file, allow_pickle=True)[()]
play = True

trainer_obj = PPOTrainer(config=TrainerConfig.PPO_TRAINER_CONNECT3, )
model = trainer_obj.get_policy("player1").model

# =============================================================================
# TEST THE MODEL
# =============================================================================
import tensorflow as tf

lstm_model = LSTM_model(batch_size, (lstm_timesteps, features_len),
                        output_len, lstm_hidden, False)
# generate a fake input to define the model structure and then load the weights
# [batch, timestep, features]
# random_input = np.random.rand(1, lstm_timesteps, features_len)
random_input = np.random.rand(1, lstm_timesteps, features_len)
random_input = random_input.astype('float32')
lstm_model(random_input)
lstm_model.set_weights(lstm_weights[()])
}

ppo_trainer_config = {
    "env": "ParametricScopone",
    "multiagent": {
        "policies_to_train": ["ppo_policy_nico"],
        "policies": policies,
        "policy_mapping_fn": lambda agent_id: "ppo_policy_albi"
        if agent_id in ("player_1", "player_3") else "ppo_policy_nico",
    },
    "observation_filter": "NoFilter",
    "callbacks": PlayerScoreCallbacks
}

trainer = PPOTrainer(config=ppo_trainer_config)
if restore_checkpoint:
    trainer.restore(checkpoint_path)

trainer.get_policy("ppo_policy_albi").model.base_model.summary()
trainer.get_policy("ppo_policy_nico").model.base_model.summary()

for i in range(10000):
    res = trainer.train()
    print("Iteration {}. policy_reward_mean: {}".format(
        i, res['policy_reward_mean']))
    if i % checkpoint_every == 0:
        trainer.save()

print('Training finished, check the results in ~/ray_results/<dir>/')