def train_agent(cmdl):
    """Train an agent step by step with periodic reporting and evaluation.

    Args:
        cmdl: Configuration object. This function reads ``agent_type``,
            ``training_steps``, ``report_frequency``, ``eval_start`` and
            ``eval_frequency`` from it and forwards it unchanged to the
            environment factory, the agents and ``evaluate_agent``.
    """
    global_time = time.perf_counter()

    env = utils.env_factory(cmdl, "training")
    eval_env = utils.env_factory(cmdl, "evaluation")

    name = cmdl.agent_type
    env_space = (env.action_space, env.observation_space)
    agent = get_agent(name)(env_space, cmdl)

    # NOTE(review): the evaluation agent is built from the *training* env's
    # spaces rather than eval_env's -- presumably they are identical; confirm.
    eval_env_space = (env.action_space, env.observation_space)
    eval_agent = get_agent("evaluation")(eval_env_space, cmdl)

    agent.display_setup(env, cmdl)

    ep_cnt = 1
    # Bound up front so the final report also works when training_steps == 0.
    step_cnt = 0
    fps_time = time.perf_counter()

    s, r, done = env.reset(), 0, False

    # Steps are counted from 1, so the value left in step_cnt after the loop
    # equals the number of steps taken.
    for step_cnt in range(1, cmdl.training_steps + 1):
        a = agent.evaluate_policy(s)
        _s, _a = s.clone(), a
        s, r, done, _ = env.step(a)
        agent.improve_policy(_s, _a, r, s, done)

        agent.gather_stats(r, done)

        # Do some reporting
        if step_cnt % cmdl.report_frequency == 0:
            agent.display_stats(fps_time)
            agent.display_model_stats()
            fps_time = time.perf_counter()
            gc.collect()

        # Start doing an evaluation
        eval_ready = step_cnt >= cmdl.eval_start
        if eval_ready and (step_cnt % cmdl.eval_frequency == 0):
            eval_time = time.perf_counter()
            evaluate_agent(step_cnt, eval_env, eval_agent, agent.policy, cmdl)
            gc.collect()
            # Push the reference time forward by the evaluation duration so
            # the time spent evaluating is excluded from the next stats call.
            fps_time = fps_time + (time.perf_counter() - eval_time)

        if done:
            ep_cnt += 1
            s, r, done = env.reset(), 0, False

    agent.display_final_report(ep_cnt, step_cnt, global_time)
def main(args):
    """Run a validation session for an agent and record what happens.

    Builds the environment and agent from ``args``, attaches the state,
    action, path and lap-time handlers to a validation engine, and runs it.

    Args:
        args: Parsed command-line options. Reads ``environment_name``,
            ``agent``, ``value_estimator``, ``weights``, ``max_steps`` and
            ``break_if_collision``.
    """
    env_kwargs = argument_parser.prepare_env_kwargs(args)
    env = gym.make(args.environment_name, **env_kwargs)
    initial_state = env.reset()

    agent = agents.get_agent(
        args.agent,
        env_name=args.environment_name,
        network_architecture=args.value_estimator,
        init_state=initial_state,
        num_of_actions=env.action_space.n,
    )

    if args.agent in ['dqn', 'a2c']:
        agent.load_weights(args.weights)
        # Output goes next to the weights file: everything before the
        # last '/' of the weights path.
        out_path = pathlib2.Path(args.weights.rpartition('/')[0])
        agent.eval()
    else:
        out_path = pathlib2.Path('out/{}'.format(args.agent))
        agent.set_action_space(env_kwargs['action_space'])

    evaluator = create_validation_engine(agent, env)

    path_plotter = path_plot.Plotter(args.environment_name, out_path)
    state_recorder = StateRecorder(args.environment_name, out_path)
    action_recorder = ActionRecorder(
        args.environment_name, out_path, env_kwargs['action_space'])

    for handler in (state_recorder, action_recorder, path_plotter):
        evaluator.attach(handler)
    evaluator.attach(LapTimeMeasure(out_path, args.environment_name))
    # evaluator.attach(ProgressBar(persist=False))

    steps = StepGenerator(
        env,
        agent,
        max_steps=args.max_steps,
        break_if_collision=args.break_if_collision,
    )
    evaluator.run(data=steps, max_epochs=1000)
def main(args):
    """Train an agent on several environments at once.

    The environment names are extracted from ``args.environment_name`` with
    a regular expression; one environment is created per name, and the
    Gazebo/ROS ports stored on ``args`` are incremented after each one.

    Args:
        args: Parsed command-line options. Reads ``environment_name``,
            ``port_gazebo``, ``port_ros``, ``agent``, ``pretrained``,
            ``max_steps`` and ``epochs_count``; ``port_gazebo`` and
            ``port_ros`` are modified in place.
    """
    session_id = get_new_session_id()
    logger = Logger(session_id)

    envs = []
    for env_name in re.findall(r'[\(|,]([^\(|,|\)]+)', args.environment_name):
        env_kwargs = argparse.prepare_env_kwargs(args, gazebo_multienv=True)
        envs.append(gym.make(env_name, **env_kwargs))
        # Advance both ports so the next environment gets its own pair.
        args.port_gazebo = str(int(args.port_gazebo) + 1)
        args.port_ros = str(int(args.port_ros) + 1)

    # Every environment is reset; the first one's state seeds the agent.
    initial_states = [env.reset() for env in envs]
    state = initial_states[0]

    agent_kwargs = prepare_agent_kwargs(
        args, state, logger, envs[0].action_space.n)
    agent = agents.get_agent(args.agent, **agent_kwargs)

    if args.pretrained:
        print('load pretrained weights: ', args.pretrained)
        agent.load_weights(args.pretrained)
    agent.train()

    saver = NetSaver(args, session_id)
    trainer = create_reinforce_engine(agent, envs, args)
    # trainer.attach(ProgressBar(persist=False))
    # Key error 'percentage' after a few k of epochs !?
    for handler in (saver, logger):
        trainer.attach(handler)

    steps = MultienvStepGenerator(envs, agent, max_steps=args.max_steps)
    trainer.run(data=steps, max_epochs=args.epochs_count)
def train_seq(init_model, get_optim, multitask, args):
    """Build the lifelong-learning agent and train it sequentially.

    The agent class is selected by ``args.lifelong.mode``; optional base
    wrappers come from ``args.lifelong.wrappers`` (empty list when absent).
    """
    # Imported locally, as in the original code.
    from agents import get_agent

    base_wrappers = getattr(args.lifelong, "wrappers", [])
    make_agent = get_agent(args.lifelong.mode, base_wrappers=base_wrappers)
    make_agent(init_model, get_optim, multitask, args).train_sequentially()
def train_agent(cmdl):
    """Train an agent episode by episode until the step budget is used up.

    Args:
        cmdl: Configuration object. Reads ``env_class``, ``env_name``,
            ``agent`` (including ``agent.name``), ``training.step_no`` and
            ``report_freq``.
    """
    total_steps = 0
    episodes = 0
    preprocess = Preprocessor(cmdl.env_class).transform

    env = utils.get_new_env(cmdl.env_name)
    agent = get_agent(cmdl.agent.name)(env.action_space, cmdl.agent)
    display_setup(env, cmdl)

    started_at = time.time()
    # The budget is only checked between episodes, so the last episode may
    # run past ``training.step_no``.
    while total_steps < cmdl.training.step_no:
        episodes += 1
        observation, reward, done = env.reset(), 0, False
        state = preprocess(observation)

        while not done:
            action = agent.evaluate_policy(state)
            observation, reward, done, _ = env.step(action)
            next_state = preprocess(observation)
            agent.improve_policy(state, action, reward, next_state, done)
            state = next_state

            total_steps += 1
            agent.gather_stats(reward, done)

        if episodes % cmdl.report_freq == 0:
            agent.display_stats(started_at)
            agent.display_model_stats()

    elapsed = time.time() - started_at
    display_stats(episodes, total_steps, elapsed)
def main(_):
    """Start this process as either a worker or a parameter server.

    The role is taken from ``config.job_name``. A worker builds the
    environment, model factory, agent and trainer, then trains or tests
    depending on ``config.is_train``. Any other job name starts a "ps"
    server and then sleeps forever.
    """
    spec = cluster_spec(config.num_workers, 1)
    cluster = tf.train.ClusterSpec(spec).as_cluster_def()

    for signum in (signal.SIGHUP, signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, shutdown)

    if config.job_name != "worker":
        server = tf.train.Server(
            cluster,
            job_name="ps",
            task_index=config.task,
            config=tf.ConfigProto(device_filters=["/job:ps"]),
        )
        # Keep the process alive; this loop never exits on its own.
        while True:
            time.sleep(1000)

    session_config = tf.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=2,
    )
    server = tf.train.Server(
        cluster,
        job_name="worker",
        task_index=config.task,
        config=session_config,
    )

    env = create_env(config.env_id, client_id=str(config.task))

    def model_fn():
        # Deferred so the agent decides when the model is actually built.
        return get_model(config)(env.observation_space.shape,
                                 env.action_space.n)

    agent = get_agent(config)(model_fn, env, config)
    trainer = Trainer(agent, env, server, config.task, config.log_dir)

    if config.is_train:
        trainer.train()
    else:
        trainer.test()
def main(args):
    """Train an agent on a single gym environment.

    Args:
        args: Parsed command-line options. Reads ``environment_name``,
            ``agent``, ``pretrained``, ``max_steps`` and ``epochs_count``.
    """
    session_id = get_new_session_id()
    logger = Logger(session_id)

    env_kwargs = argparse.prepare_env_kwargs(args)
    env = gym.make(args.environment_name, **env_kwargs)
    initial_state = env.reset()

    agent_kwargs = prepare_agent_kwargs(
        args, initial_state, logger, env.action_space.n)
    agent = agents.get_agent(args.agent, **agent_kwargs)

    if args.pretrained:
        print('load pretrained weights: ', args.pretrained)
        agent.load_weights(args.pretrained)
    agent.train()

    saver = NetSaver(args, session_id)
    trainer = create_reinforce_engine(agent, env, args)
    # trainer.attach(ProgressBar(persist=False))
    # Key error 'percentage' after a few k of epochs !?
    for handler in (saver, logger):
        trainer.attach(handler)

    steps = StepGenerator(env, agent, max_steps=args.max_steps)
    trainer.run(data=steps, max_epochs=args.epochs_count)
def predict_for_malmo(shared_objects, cfg):
    """Serve batched action predictions to clients, forever.

    Requests are read from ``shared_objects["predict_queue"]`` as
    ``(id, state, done, token)`` tuples, batched, passed to the agent's
    ``batch_predict`` and answered through
    ``shared_objects["send_back_queues"][id]`` as ``(action, token)``.
    This function does not return.
    """
    predict_queue = shared_objects["predict_queue"]
    send_back_queues = shared_objects["send_back_queues"]
    N = len(send_back_queues)  # not used below

    # -- Initialize agent and wrap it in a Binary18BatchAgentWrapper :)
    agent_class = get_agent(cfg.agent.type)
    agent = agent_class(cfg.agent.name, ENV_ACTIONS, cfg, shared_objects)

    banner = "{:s}<{:s}> learns from queue. |role={:d}".format(
        cfg.agent.name, cfg.agent.type, cfg.agent.role
    )
    print_info(banner)

    use_cuda = cfg.general.use_cuda

    dtype = torch.LongTensor(0)  # not used below
    if use_cuda:
        dtype = dtype.cuda()

    while True:
        # -- 1. Gather requests; stop only once the queue has been idle for
        #       0.1s AND more than one request is pending.
        tasks = []
        while True:
            try:
                tasks.append(predict_queue.get(True, 0.1))
            except Queue.Empty:
                if len(tasks) > 1:
                    break

        # -- 4.
        # 4.a. Create batch from transitions
        # !! This is incomplete
        ids = torch.LongTensor([task[0] for task in tasks])
        state = torch.cat(
            [torch.LongTensor(task[1]) for task in tasks], 0).float()
        done = torch.cat([torch.LongTensor(task[2]) for task in tasks], 0)
        tokens = torch.LongTensor([task[3] for task in tasks])

        if use_cuda:
            ids = ids.cuda()
            state = state.cuda()
            done = done.cuda()

        actions = agent.batch_predict(ids, state, done)

        replies = zip(ids, actions.tolist(), tokens)
        for client_id, action, token in replies:
            send_back_queues[client_id].send((action, token))

        tasks.clear()
def main(op_check):
    """Create the configured agent and run an operation check or training.

    Args:
        op_check: When truthy, ``agent.operation_check()`` is called;
            otherwise ``agent.train()``.
    """
    agent = get_agent(S.agent.name)

    # Load pretrained weights when a snapshot directory is configured.
    snapshot_dir = S.load_weights_dir
    if snapshot_dir:
        agent.load_snapshot(snapshot_dir)

    entry_point = agent.operation_check if op_check else agent.train
    entry_point()
def train_agent(cmdl):
    """Train an agent with step-based reporting and periodic evaluation.

    Args:
        cmdl: Configuration object. Reads ``env_name``, ``env_class``,
            ``agent`` (including ``agent.name``), ``training.step_no``,
            ``report_freq`` and ``eval_freq``.
    """
    total_steps = 0
    episodes = 0
    started_at = time.time()

    env = utils.get_new_env(cmdl.env_name, cmdl)
    eval_env = EvaluationMonitor(gym.make(cmdl.env_name), cmdl)

    agent_factory = get_agent(cmdl.agent.name)
    agent = agent_factory(env.action_space, cmdl.agent)
    eval_agent = get_agent(cmdl.agent.name)(
        eval_env.action_space, cmdl.agent, False)

    preprocess = Preprocessor(cmdl.env_class).transform

    agent.display_setup(env, cmdl)

    # The budget is only checked between episodes, so the last episode may
    # run past ``training.step_no``.
    while total_steps < cmdl.training.step_no:
        episodes += 1
        observation, reward, done = env.reset(), 0, False
        state = preprocess(observation)

        while not done:
            action = agent.evaluate_policy(state)
            observation, reward, done, _ = env.step(action)
            next_state = preprocess(observation)
            agent.improve_policy(state, action, reward, next_state, done)
            state = next_state

            total_steps += 1
            agent.gather_stats(reward, done)

            report_due = total_steps % cmdl.report_freq == 0
            if report_due:
                agent.display_stats(started_at)
                agent.display_model_stats()
                gc.collect()

            eval_due = total_steps % cmdl.eval_freq == 0
            if eval_due:
                evaluate_agent(
                    total_steps, eval_env, eval_agent, agent.policy, cmdl)

    finished_at = time.time()
    agent.display_final_report(episodes, total_steps,
                               finished_at - started_at)
def search_metadata(url='', key='', org='', account='', metadata='', **kwargs):
    """Print every agent whose flattened details contain ``metadata``.

    Walks organizations, then the accounts of each organization, then the
    agents of each account. An agent matches when ``metadata`` appears among
    the keys or the values of its flattened details.

    Args:
        url: Passed through to every lookup call.
        key: Passed through to every lookup call.
        org: Organization filter handed to ``orgs.get_orgs``.
        account: Account filter handed to ``orgs.get_orgs``.
        metadata: Key or value to search for.
        **kwargs: Accepted for call compatibility; not used.
    """
    org_list = orgs.get_orgs(url=url, org=org, account=account, key=key)
    for o in org_list:
        org_name = o['name']
        account_list = accounts.get_accounts(url=url, org=org_name, key=key)
        for acc in account_list:
            account_name = acc['name']
            # Agents are listed and inspected per account, under the
            # organization being iterated, so each agent is only queried
            # (and reported) with the org/account it was listed under.
            agent_list = agents.get_agents(url=url, org=org_name,
                                           account=account_name, key=key)
            for summary in agent_list:
                agent = summary['name']
                try:
                    search_hash = flatten(
                        agents.get_agent(url=url, org=org_name,
                                         account=account_name, key=key,
                                         agent_name=agent))
                    found = (metadata in search_hash
                             or metadata in search_hash.values())
                except Exception:
                    # Best effort: an agent that cannot be fetched or
                    # inspected is skipped, but KeyboardInterrupt and
                    # SystemExit are no longer swallowed.
                    continue
                if found:
                    click.echo('Organization: ' + org_name + ' Account: ' +
                               account_name + ' Agent: ' + agent)
def action():
    """Store the client's trajectories and hand back a fresh model, if any.

    Returns:
        dict: ``{"model": ..., "runtime_parameters": ...}`` when the agent
        has a fresh model, otherwise an empty dict.
    """
    ckeck_training_deamon()
    agent = agents.get_agent(request.client_id)

    if agent.replay_memory_lock.acquire():
        try:
            agent.replay_memory.add_trajectories_base64(request.trajectories)
        finally:
            # Always release, even when adding the trajectories fails;
            # otherwise later requests would block on the lock.
            agent.replay_memory_lock.release()

    fresh_model = agent.get_fresh_model()
    if fresh_model is None:
        return {}

    agent.update_runtime_parameters()
    return {
        "model": fresh_model,
        "runtime_parameters": agent.runtime_parameters.__dict__
    }
def search_metadata(url='', key='', org='', account='', metadata='', **kwargs):
    """Print every agent whose flattened details contain ``metadata``.

    Walks organizations, then the accounts of each organization, then the
    agents of each account. An agent matches when ``metadata`` appears among
    the keys or the values of its flattened details.

    Args:
        url: Passed through to every lookup call.
        key: Passed through to every lookup call.
        org: Organization filter handed to ``orgs.get_orgs``.
        account: Account filter handed to ``orgs.get_orgs``.
        metadata: Key or value to search for.
        **kwargs: Accepted for call compatibility; not used.
    """
    org_list = orgs.get_orgs(url=url, org=org, account=account, key=key)
    for o in org_list:
        org_name = o['name']
        account_list = accounts.get_accounts(url=url, org=org_name, key=key)
        for acc in account_list:
            account_name = acc['name']
            # Agents are listed and inspected per account, under the
            # organization being iterated, so each agent is only queried
            # (and reported) with the org/account it was listed under.
            agent_list = agents.get_agents(url=url, org=org_name,
                                           account=account_name, key=key)
            for summary in agent_list:
                agent = summary['name']
                try:
                    search_hash = flatten(
                        agents.get_agent(url=url, org=org_name,
                                         account=account_name, key=key,
                                         agent_name=agent))
                    found = (metadata in search_hash
                             or metadata in search_hash.values())
                except Exception:
                    # Best effort: an agent that cannot be fetched or
                    # inspected is skipped, but KeyboardInterrupt and
                    # SystemExit are no longer swallowed.
                    continue
                if found:
                    click.echo('Organization: %s Account: %s Agent: %s'
                               % (org_name, account_name, agent))
def train_agent(shared_objects, cfg):
    """Run the agent in the PigChase environment for a fixed step budget.

    The budget is ``cfg.training.episodes_no * cfg.training.max_step_no``
    steps. Every ``cfg.general.report_freq`` episodes the rewards collected
    since the previous report are summed, divided by ``report_freq`` and
    printed.

    Args:
        shared_objects: Not used by this function.
        cfg: Configuration object (see the attributes read below).
    """
    env = PigChaseEnvironment(
        parse_clients_args(cfg.envs.minecraft.ports),
        PigChaseTopDownStateBuilder(),
        role=cfg.agent.role,
        randomize_positions=cfg.envs.minecraft.randomize_positions,
    )
    agent = get_agent(cfg.agent.type)(cfg.agent.name, ENV_ACTIONS)

    started_msg = "[ %s ] type=%s, role=%d. Agent started." % (
        cfg.agent.name, cfg.agent.type, cfg.agent.role)
    print(clr(started_msg, 'cyan'))

    obs = env.reset()
    reward = 0
    is_terminal = False

    recent_rewards = []
    episodes = 0
    report_every = cfg.general.report_freq

    started_at = time.time()

    print("No of epochs: %d. Max no of steps/epoch: %d" %
          (cfg.training.episodes_no, cfg.training.max_step_no))

    training_steps = cfg.training.episodes_no * cfg.training.max_step_no

    for _ in range(training_steps):
        # check if env needs reset
        if env.done:
            obs = env.reset()
            episodes += 1

            if episodes % report_every == 0:
                print("[DQN] Ep: %d | Rw: %d" %
                      (episodes, sum(recent_rewards) / report_every))
                recent_rewards.clear()

        # select an action
        action = agent.act(obs, reward, is_terminal, is_training=True)

        # take a step
        obs, reward, is_terminal = env.do(action)
        recent_rewards.append(reward)

    elapsed_time = time.time() - started_at
    print("Finished in %.2f seconds at %.2ffps." %
          (elapsed_time, training_steps / elapsed_time))
def run_once(args):
    """Start a simulator, build the agent and run the driving benchmark.

    Args:
        args: ``(cfg, run_id, path)`` tuple. ``run_id`` is not used here.

    Side effects:
        Creates ``<path>/<cfg.simulator.save_folder>`` if missing, changes
        the process working directory to it, and starts (and finally kills)
        the simulator process.
    """
    cfg, run_id, path = args

    sim_path = path + "/" + cfg.simulator.save_folder
    if not os.path.exists(sim_path):
        os.makedirs(sim_path)

    simulator = Simulator(cfg, sim_path, log)
    simulator.start()

    try:
        # -- Set seed
        cfg.general.seed = utils.set_seed(cfg.general.seed)

        # -- Load simulator
        # TODO 2 start server with config
        # TODO 2 Save simulator config in path ( see line 41 with save_config(

        # -- Resume agent and metrics if checkpoints are available
        # Test the configured checkpoint itself: the joined path is never
        # empty, so testing it would always take the resume branch.
        if cfg.checkpoint:
            resume_path = path + "/" + cfg.checkpoint
            log.info("Resuming training ...")
            cfg.agent.resume = resume_path

        logging.info('listening to server %s:%s',
                     cfg.simulator.host, cfg.simulator.port)

        # -- Get agent
        agent = get_agent(cfg.agent)
        agent.set_simulator(cfg)

        os.chdir(sim_path)

        benchmark_agent = DemoBenchmark(cfg.simulator.town)

        # -- Init finished
        # save_config(os.path.join(cfg.general.common.save_path, "ran_cfg"), cfg)

        # Now actually run the driving_benchmark
        run_driving_benchmark(agent, benchmark_agent, cfg.simulator.town,
                              cfg.simulator.carla_log_name,
                              cfg.simulator.continue_experiment,
                              cfg.simulator.host, cfg.simulator.port)
    finally:
        # Do not leave the simulator process running when setup or the
        # benchmark fails.
        simulator.kill_process()
def run_once(args):
    """Train an agent for the remaining epochs, then evaluate it.

    Args:
        args: ``(cfg, run_id, path)`` tuple. ``run_id`` is not used here.

    Side effects:
        Saves the run configuration and pickles the agent's
        ``loss_values_train`` / ``loss_values_test`` under ``path``.
    """
    cfg, run_id, path = args

    # -- Set seed
    cfg.general.seed = utils.set_seed(cfg.general.seed)

    # -- Get data loaders
    loader = get_data_loader(cfg.data_loader)
    train_data = loader.get_train_loader()
    test_data = loader.get_test_loader()

    # -- Resume agent and metrics if checkpoints are available
    # TODO Resume
    wants_resume = cfg.checkpoint != ""
    if wants_resume:
        resume_path = path + "/" + cfg.checkpoint
        log.info("Resuming training ...")
        cfg.agent.resume = resume_path

    # -- Get agent
    agent = get_agent(cfg.agent)

    # -- Should have some kind of reporting agent
    # TODO Implement reporting agent

    # -- Init finished
    save_config(os.path.join(cfg.general.common.save_path, "ran_cfg"), cfg)

    eval_every = cfg.train.eval_freq
    # Only the epochs the agent has not trained yet are run.
    remaining_epochs = cfg.train.no_epochs - agent.get_train_epoch()

    epoch = 0
    while epoch < remaining_epochs:
        log.info("Train epoch: {}".format(epoch))
        agent.train(train_data)
        if epoch % eval_every == 0:
            agent.test(test_data)
        print("Finished an epoch :D")
        epoch += 1

    for attr_name in ("loss_values_train", "loss_values_test"):
        with open(path + "/" + attr_name, "wb") as f:
            pickle.dump(getattr(agent, attr_name), f)

    agent.eval_agent()
def run_once(args):
    """Build the agent (optionally from a checkpoint) and evaluate it.

    Args:
        args: ``(cfg, run_id, path)`` tuple. ``run_id`` is not used here.
    """
    cfg, run_id, path = args

    # -- Set seed
    cfg.general.seed = utils.set_seed(cfg.general.seed)

    # -- Resume agent and metrics if checkpoints are available
    # TODO Resume
    # Test the configured checkpoint itself: the joined path is never empty,
    # so testing it would always take the resume branch.
    if cfg.checkpoint:
        resume_path = path + "/" + cfg.checkpoint
        log.info("Resuming training ...")
        cfg.agent.resume = resume_path

    # -- Get agent
    agent = get_agent(cfg.agent)

    # -- Should have some kind of reporting agent
    # TODO Implement reporting agent

    # -- Init finished
    save_config(os.path.join(cfg.general.common.save_path, "ran_cfg"), cfg)

    agent.eval_agent()
def main():
    """Read the configuration, build the shared model and agent, and train.

    Prints a start message, runs ``train_agent_simulated`` and finally
    reports the total wall-clock time.
    """
    print_info("Starting...")

    # -- Read configuration
    config = read_config()
    general = config.general

    # -- Configure Torch (a seed of 0 means "do not seed")
    if general.seed != 0:
        torch.manual_seed(general.seed)
        if general.use_cuda:
            torch.cuda.manual_seed_all(general.seed)

    # Configure model
    shared_model = get_model(config.model.name)
    if general.use_cuda:
        shared_model.cuda()
    shared_model.share_memory()

    # Get agent
    agent_class = get_agent(config.agent.type)
    agent = agent_class(config.agent.name, ENV_ACTIONS, shared_model, config)

    # Shared objects, including the shared statistics
    shared_objects = {
        "agent": agent,
        "stats_leRMS": AtomicStatistics(),
    }

    started_at = time.time()

    # Train Agent
    train_agent_simulated(shared_objects, config)

    total_time = time.time() - started_at
    print_info("Everything done in {:.2f}!".format(total_time))
def run_once(args):
    """Build the agent and run ``eval_network`` on it when requested.

    Nothing is evaluated when ``cfg.eval_model`` is ``False`` or when
    ``cfg.image_number`` is ``-1``.

    Args:
        args: ``(cfg, run_id, path)`` tuple. ``run_id`` is not used here.
    """
    cfg, run_id, path = args

    # -- Set seed
    cfg.general.seed = utils.set_seed(cfg.general.seed)

    # -- Resume agent and metrics if checkpoints are available
    # TODO Resume
    # Test the configured checkpoint itself: the joined path is never empty,
    # so testing it would always take the resume branch.
    if cfg.checkpoint:
        resume_path = path + "/" + cfg.checkpoint
        log.info("Network_activation ...")
        cfg.agent.resume = resume_path

    # -- Get agent
    agent = get_agent(cfg.agent)

    if cfg.eval_model is False:
        log.info("Not in eval mode")
        return

    if cfg.image_number != -1:
        eval_network(agent, cfg)
def main():
    """Run an agent over shuffled copies of the dataset and save the scores.

    For each of ``--shuffle_times`` runs, a new agent is created, the
    dataset is shuffled, and cumulative regret and running precision are
    recorded per time step. The per-step averages over all runs are
    plotted, and the raw per-run values are written as text, under
    ``data/scores/``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--agent_name', '-a', dest='agent_name', action='store',
        required=True,
        help='Name of agent to be used to retieve config and agent object.')
    parser.add_argument(
        '--shuffle_times', '-s', dest='shuffle_times', type=int,
        action='store', required=True,
        help='Times to shuffle the dataset.')
    parser.add_argument(
        '--reward_func', '-r', dest='reward_func', default='default',
        action='store', help='Reward function.')
    parser.add_argument(
        '--output_name', '-o', dest='output_name', default='',
        action='store', help='prefix of output score files.')
    args = parser.parse_args()

    config = get_config(args.agent_name)
    dataset = WarfarinDataSet(config)

    regrets = np.zeros((args.shuffle_times, dataset.size()))
    precision = np.zeros((args.shuffle_times, dataset.size()))
    reward_func = get_reward_func(args.reward_func)

    for run_idx in range(args.shuffle_times):
        agent = get_agent(args.agent_name, config, dataset)
        dataset.shuffle()

        regret = 0
        corrects = 0
        for ts, data in tqdm(enumerate(dataset)):
            features, label = data['features'], data['label']

            action, context = agent.act(features)
            reward = reward_func(label, action)
            agent.feedback(reward, context)

            # Calculate eval metrics: regret accumulates the negated reward.
            regret -= reward
            regrets[run_idx][ts] = regret
            if is_correct_action(label, action):
                corrects += 1
            precision[run_idx][ts] = corrects / (ts + 1)

        print('{} final regret: {} final average precision: {}'.format(
            run_idx, regret, precision[run_idx][-1]))

    # Fall back to the agent name when no output prefix was given.
    output_name = args.output_name or args.agent_name

    def serialize(matrix):
        # Rows are joined with ';', the values inside a row with ','.
        return ';'.join(','.join(str(v) for v in row) for row in matrix)

    avg_regrets = np.average(regrets, axis=0)
    avg_precision = np.average(precision, axis=0)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(range(dataset.size()), avg_regrets, 'b')
    fig.savefig("data/scores/{}-regret.png".format(output_name))
    print(np.std(regrets, axis=0))
    with open("data/scores/{}-regret-values.txt".format(output_name),
              mode='w') as f:
        f.write(serialize(regrets))

    fig2 = plt.figure()
    ax2 = fig2.add_subplot(1, 1, 1)
    ax2.plot(range(dataset.size()), avg_precision, 'b')
    fig2.savefig("data/scores/{}-precision.png".format(output_name))
    with open("data/scores/{}-precision-values.txt".format(output_name),
              mode='w') as f:
        f.write(serialize(precision))
def run(full_args: Namespace) -> None:
    """Train an agent: build envs, model and algorithm, then update in a loop.

    Reads the ``main``, ``agent``, ``model`` and ``env_cfg`` sections of
    ``full_args``. Training status and saved training data found in the
    model directory are loaded when available, so an interrupted run
    continues from its last saved point.

    Args:
        full_args: Full run configuration. ``full_args.main.seed`` is
            replaced by ``run_id + 1`` when it is 0.

    Raises:
        SystemExit: When ``max_eprews`` is non-zero and the current episode
            return exceeds it.
    """
    # import torch.multiprocessing as mp
    # mp.set_start_method('spawn')

    args = full_args.main
    agent_args = full_args.agent
    model_args = full_args.model
    env_args = full_args.env_cfg
    extra_logs = getattr(full_args, "extra_logs", None)

    if args.seed == 0:
        args.seed = full_args.run_id + 1
    max_eprews = args.max_eprews

    post_process_args(agent_args)
    post_process_args(model_args)

    model_dir = getattr(args, "model_dir", full_args.out_dir)
    print(model_dir)

    # ==========================================================================
    # @ torc_rl repo original

    # Define logger, CSV writer and Tensorboard writer
    logger = utils.get_logger(model_dir)
    csv_file, csv_writer = utils.get_csv_writer(model_dir)
    tb_writer = None
    if args.tb:
        from tensorboardX import SummaryWriter
        tb_writer = SummaryWriter(model_dir)

    # Log command and all script arguments
    logger.info("{}\n".format(" ".join(sys.argv)))
    logger.info("{}\n".format(args))

    # ==========================================================================
    # Set seed for all randomness sources
    utils.seed(args.seed)

    # ==========================================================================
    # Generate environments
    envs = []

    # Get environment wrapper
    wrapper_method = getattr(full_args.env_cfg, "wrapper", None)
    if wrapper_method is None:
        def idem(x):
            return x

        env_wrapper = idem
    else:
        env_wrappers = [getattr(environment, w_p) for w_p in wrapper_method]

        def env_wrapp(w_env):
            # Wrappers are applied in reverse order of the configured list,
            # so the first configured wrapper ends up outermost.
            for wrapper in env_wrappers[::-1]:
                w_env = wrapper(w_env)
            return w_env

        env_wrapper = env_wrapp

    actual_procs = getattr(args, "actual_procs", None)
    master_make_envs = getattr(full_args.env_cfg, "master_make_envs", False)

    if actual_procs:
        # Split envs in chunks
        no_envs = args.procs
        envs, chunk_size = get_envs(full_args, env_wrapper, no_envs,
                                    master_make=master_make_envs)
        first_env = envs[0][0]
        print(
            f"NO of envs / proc: {chunk_size}; No of processes {len(envs[1:])} + Master"
        )
    else:
        for i in range(args.procs):
            env = env_wrapper(gym.make(args.env))
            env.max_steps = full_args.env_cfg.max_episode_steps
            env.no_stacked_frames = full_args.env_cfg.no_stacked_frames
            env.seed(args.seed + 10000 * i)
            envs.append(env)
        first_env = envs[0]

    # Generate evaluation envs
    eval_envs = []
    if full_args.env_cfg.no_eval_envs > 0:
        no_envs = full_args.env_cfg.no_eval_envs
        eval_envs, chunk_size = get_envs(full_args, env_wrapper, no_envs,
                                         master_make=master_make_envs)

    # Define obss preprocessor
    max_image_value = full_args.env_cfg.max_image_value
    normalize_img = full_args.env_cfg.normalize
    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        args.env,
        first_env.observation_space,
        model_dir,
        max_image_value=max_image_value,
        normalize=normalize_img)

    # ==========================================================================
    # Load training status
    try:
        status = utils.load_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}

    saver = utils.SaveData(model_dir,
                           save_best=args.save_best,
                           save_all=args.save_all)
    model, agent_data, other_data = None, dict(), None
    try:
        # Continue from last point
        model, agent_data, other_data = saver.load_training_data(best=False)
        logger.info("Training data exists & loaded successfully\n")
    except OSError:
        logger.info("Could not load training data\n")

    # ==========================================================================
    # Load Model
    if model is None:
        model = get_model(model_args,
                          obs_space,
                          first_env.action_space,
                          use_memory=model_args.use_memory,
                          no_stacked_frames=env_args.no_stacked_frames)
        logger.info(f"Model [{model_args.name}] successfully created\n")

    # Print Model info
    logger.info("{}\n".format(model))

    if torch.cuda.is_available():
        model.cuda()
    logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

    # ==========================================================================
    # Load Agent
    algo = get_agent(full_args.agent,
                     envs,
                     model,
                     agent_data,
                     preprocess_obss=preprocess_obss,
                     reshape_reward=None,
                     eval_envs=eval_envs)

    has_evaluator = (hasattr(algo, "evaluate")
                     and full_args.env_cfg.no_eval_envs > 0)

    # ==========================================================================
    # Train model
    crt_eprew = 0
    # other_data stays None when no training data could be loaded, so it
    # must be checked before the membership test.
    if other_data and "eprew" in other_data:
        crt_eprew = other_data["eprew"]

    num_frames = status["num_frames"]
    total_start_time = time.time()
    update = status["update"]
    update_start_time = time.time()

    while num_frames < args.frames:
        # Update model parameters
        logs = algo.update_parameters()

        num_frames += logs["num_frames"]
        update += 1

        if has_evaluator:
            if update % args.eval_interval == 0:
                algo.evaluate()

        prev_start_time = update_start_time
        update_start_time = time.time()

        # Print logs
        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_start_time - prev_start_time)
            duration = int(time.time() - total_start_time)

            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += [
                "num_frames_" + key for key in num_frames_per_episode.keys()
            ]
            data += num_frames_per_episode.values()
            header += ["entropy", "value", "policy_loss", "value_loss"]
            data += [
                logs["entropy"], logs["value"], logs["policy_loss"],
                logs["value_loss"]
            ]
            header += ["grad_norm"]
            data += [logs["grad_norm"]]

            # add log fields that are not in the standard log format
            # (for example value_int)
            extra_fields = extra_log_fields(header, list(logs.keys()))
            header.extend(extra_fields)
            data += [logs[field] for field in extra_fields]

            # print to stdout the standard log fields + fields required in
            # config
            keys_format, printable_data = print_keys(header, data, extra_logs)
            logger.info(keys_format.format(*printable_data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            # The CSV header is written only while the stored status still
            # reports zero frames, i.e. before the first logged update.
            if status["num_frames"] == 0:
                csv_writer.writerow(header)
            csv_writer.writerow(data)
            csv_file.flush()

            if args.tb:
                for field, value in zip(header, data):
                    tb_writer.add_scalar(field, value, num_frames)

            status = {"num_frames": num_frames, "update": update}

            # The first synthesized reshaped-return statistic is used as the
            # current episode return.
            crt_eprew = list(rreturn_per_episode.values())[0]

        # -- Save vocabulary and model
        if args.save_interval > 0 and update % args.save_interval == 0:
            # preprocess_obss.vocab.save()
            saver.save_training_data(model, algo.get_save_data(), crt_eprew)
            logger.info("Model successfully saved")
            utils.save_status(status, model_dir)

        # A max_eprews of 0 disables this early stop.
        if max_eprews != 0 and crt_eprew > max_eprews:
            # NOTE(review): the message hard-codes 0.93 although the actual
            # threshold is max_eprews.
            print("Reached max return 0.93")
            sys.exit()
def run(mazeType, ai, trials, width, height, showplan):
    """Benchmark planning agents on mazes and print mean statistics.

    For every trial, each selected agent plans a corner-to-corner path,
    from (1, 1) to (width - 2, height - 2), on each selected maze. Planning
    time, plan length and explored-node count are recorded, then printed as
    means per maze, per agent, and per agent/maze combination.

    Args:
        mazeType: Maze name, or ``ALL`` for every maze in ``ALL_MAZES``.
        ai: Agent name, or ``ALL`` for every agent in ``ALL_AGENTS``.
        trials: Number of trials to run.
        width: Maze width.
        height: Maze height.
        showplan: When truthy, print each maze and each plan.
    """
    results = []

    for _ in range(trials):
        # Agents are created first, then mazes; both are rebuilt each trial.
        agent_names = ALL_AGENTS if ai == ALL else [ai]
        agents = [(get_agent(name), name) for name in agent_names]

        maze_names = ALL_MAZES if mazeType == ALL else [mazeType]
        mazes = [(get_maze(name, width, height), name) for name in maze_names]

        trial_result = {}
        for maze, maze_name in mazes:
            if showplan:
                print_maze(maze, width, height)
            trial_result[maze_name] = {}

            for agent, agent_name in agents:
                # A new problem per agent/maze pair, corner to corner.
                problem = Problem((1, 1), (width - 2, height - 2), maze,
                                  width, height)
                record = {}
                trial_result[maze_name][agent_name] = record

                started = time.time()
                plan = agent.getPlan(problem)
                record['time'] = time.time() - started
                record['length'] = len(plan)
                record['nodes'] = problem.nodes_explored

                if showplan:
                    print(plan)

        results.append(trial_result)

    # Aggregate the per-trial records for reporting.
    by_maze = {}
    by_ai = {}
    time_table = {}
    length_table = {}
    nodes_table = {}

    def empty_stats():
        return {"times": [], "lengths": [], "nodes": []}

    for trial_result in results:
        for maze_name, per_agent in trial_result.items():
            maze_stats = by_maze.setdefault(maze_name, empty_stats())
            maze_times = time_table.setdefault(maze_name, {})
            maze_lengths = length_table.setdefault(maze_name, {})
            maze_nodes = nodes_table.setdefault(maze_name, {})

            for agent_name, record in per_agent.items():
                agent_stats = by_ai.setdefault(agent_name, empty_stats())

                for stats in (maze_stats, agent_stats):
                    stats['times'].append(record['time'])
                    stats['lengths'].append(record['length'])
                    stats['nodes'].append(record['nodes'])

                maze_times.setdefault(agent_name, []).append(record['time'])
                maze_lengths.setdefault(agent_name, []).append(
                    record['length'])
                maze_nodes.setdefault(agent_name, []).append(record['nodes'])

    maze_list = list(by_maze)

    def print_summary(title, stats_by_name):
        # One row per name: mean time, mean length, mean nodes.
        print(title)
        print("\t\t\tTime\tLength\tNodes")
        for name, stats in stats_by_name.items():
            columns = [
                '{maze: <16}'.format(maze=name),
                str(mean(stats['times'])),
                str(mean(stats['lengths'])),
                str(mean(stats['nodes'])),
            ]
            print('\t'.join(columns))

    def print_grid(title, table):
        # One row per agent, one column per maze, each cell a mean.
        print(title)
        header_cells = ['{item: <16}'.format(item=m) for m in maze_list]
        print('{a:<16}'.format(a='') + '\t'.join(header_cells))
        for agent_name in by_ai:
            cells = [
                '{a:<16}'.format(a=mean(table[m][agent_name]))
                for m in maze_list
            ]
            print('{agent: <16}'.format(agent=agent_name) + '\t'.join(cells))

    print_summary('\nMean Results by Maze:\n', by_maze)
    print_summary('\nMean Results by Agent:\n', by_ai)
    print_grid('\nMean Times by Agent Maze combinations:\n', time_table)
    print_grid('\nMean Length by Agent Maze combinations:\n', length_table)
    print_grid('\nMean Nodes Explored by Agent Maze combinations:\n',
               nodes_table)
def train_from_malmo(shared_objects, cfg):
    """Train an agent on batches of games collected from a shared queue.

    Each training "episode" waits until the queue holds at least
    ``cfg.general.batch_size`` games, raises the ``reset`` flag, drains the
    queue, pads shorter games to the longest one, replays the batch step by
    step through ``agent.act`` and finally bumps the ``session`` counter and
    clears the ``reset`` flag. The per-frame and per-episode reward
    histories are written to ``results/rewards.torch`` after every episode.

    Args:
        shared_objects: Mapping providing ``"queue"``, ``"session"`` and
            ``"reset"``; also handed to the agent constructor.
        cfg: Configuration object (see the attributes read below).
    """
    batch_size = cfg.general.batch_size
    queue = shared_objects["queue"]
    session = shared_objects["session"]
    reset = shared_objects["reset"]

    # -- Initialize agent and wrap it in a Binary18BatchAgentWrapper :)
    Agent = get_agent(cfg.agent.type)
    agent = Agent(cfg.agent.name, ENV_ACTIONS, cfg, shared_objects)

    print_info("{:s}<{:s}> learns from queue. |role={:d}".format(
        cfg.agent.name, cfg.agent.type, cfg.agent.role))

    # Not used below; kept from the original code.
    dtype = torch.LongTensor(0)
    if cfg.general.use_cuda:
        dtype = dtype.cuda()

    episodes_no = cfg.training.episodes_no

    best_r_ep = None
    best_r_frame = None

    frame_rewards = []
    episode_rewards = []

    for episode in range(1, episodes_no + 1):
        # 1. Checks the queue. If less than 32.. check again, else goto 2.
        # 2. Inform others to drop future experiences and wait for new params.
        # 3. Collect transitions
        # 4. Train agent and update parameters
        # 5. Drop any leftovers from the queue
        # 6. Inform others that they should take the new params. Go to 1.

        # -- 1.
        while queue.qsize() < batch_size:
            time.sleep(.1)

        # -- 2.
        reset.value = 1

        # -- 3.
        transitions = []
        while len(transitions) < batch_size:
            try:
                t = queue.get()
                transitions.append(t)
            except Queue.Empty:
                print("futere")
                break

        while not queue.empty():
            try:
                t = queue.get()
                transitions.append(t)
            except Queue.Empty:
                print("futere")
                break

        # -- 4.
        # 4.a. Create batch from transitions
        # !! This is incomplete
        print(transitions)

        # Build zero-length tensors with the same trailing shape as a real
        # transition; they are the padding entries for games that ended
        # early.
        (s, r, d, a) = transitions[0][0]
        (s, r, d, a) = torch.LongTensor(s), torch.FloatTensor(
            r), torch.LongTensor(d), torch.LongTensor(a)
        _s = s.new().resize_(torch.Size([0]) + s.size()[1:])
        _a = a.new().resize_(torch.Size([0]) + a.size()[1:])
        _r = r.new().resize_(torch.Size([0]) + r.size()[1:])
        _d = d.new().resize_(torch.Size([0]) + d.size()[1:])

        # -- Apply padding on short games
        n = len(transitions)
        max_len = max([len(game) for game in transitions])
        avg_len = np.mean([len(game) for game in transitions])

        fake = [(_s, _r, _d, _a)]
        transitions = [t + fake * (max_len - len(t)) for t in transitions]
        # Transpose: one list of per-game transitions for every time step.
        transitions = list(map(list, zip(*transitions)))

        all_r = .0
        total_r = .0

        back_start = time.time()
        rewards_no = 0

        for step, all_t in enumerate(transitions):
            states = torch.cat(
                list(map(lambda t: torch.LongTensor(t[0]), all_t)), 0)
            rewards = torch.cat(
                list(map(lambda t: torch.FloatTensor(t[1]), all_t)), 0)
            done = torch.cat(
                list(map(lambda t: torch.LongTensor(t[2]), all_t)), 0)
            actions = torch.cat(
                list(map(lambda t: torch.LongTensor(t[3]), all_t)), 0)

            _alive_no = states.size(0)
            print("Alive: {:d}, but {:d} are dead!".format(
                _alive_no, done.nonzero().nelement()))

            assert actions.size(0) == _alive_no
            assert rewards.size(0) == _alive_no
            assert done.size(0) == _alive_no

            if cfg.general.use_cuda:
                states = states.cuda()
                rewards = rewards.cuda()
                done = done.cuda()
                actions = actions.cuda()

            agent.act(states, rewards, done, True, actions=actions)

            all_r += rewards.sum()
            rewards_no += _alive_no
            total_r += rewards.sum()

        back_end = time.time()

        agent.reset()

        # -- 5.
        session.value = session.value + 1
        while not queue.empty():
            try:
                queue.get_nowait()
            except Queue.Empty:
                break

        print_info("Go again!")
        reset.value = 0

        all_r /= rewards_no  # mean reward per frame
        total_r /= n  # mean reward per game

        do_save = False

        if best_r_frame is None or best_r_frame < all_r:
            do_save = True
            best_r_frame = all_r
            r_str = clr("{:.6f}".format(best_r_frame), "white", "on_magenta")
            # The model is saved once below, through agent.model_utils. The
            # stray ``save_model(self, ...)`` call that used to be here
            # referenced ``self``, which does not exist in this function.
        else:
            r_str = clr("{:.6f}".format(all_r), "magenta")

        if best_r_ep is None or best_r_ep < total_r:
            do_save = True
            best_r_ep = total_r
            r2_str = clr("{:.6f}".format(best_r_ep), "white", "on_magenta")
        else:
            r2_str = clr("{:.6f}".format(total_r), "magenta")

        print_info("Episode: " + clr("{:d}".format(episode), "blue") +
                   clr(" | ", "yellow") + "Rewards per episode: " + r2_str +
                   clr(" | ", "yellow") + "Rewards per frame: " + r_str +
                   clr(" | ", "yellow") + "Batch size: " +
                   clr("{:d}".format(n), "blue") + clr(" | ", "yellow") +
                   "Avg length: " + clr("{:.2f}".format(avg_len), "blue") +
                   clr(" | ", "yellow") + "Back time: " +
                   clr("{:.2f}".format(back_end - back_start), "blue"))

        if do_save:
            agent.model_utils.save_model(all_r, total_r, episode,
                                         save_only_min=False)

        frame_rewards.append(all_r)
        episode_rewards.append(total_r)

        print("-----------------")
        print("Last ten:")
        print("Last ten step rewards: ", frame_rewards[-10:])
        print("Last ten epis rewards: ", episode_rewards[-10:])
        print("-----------------")

        # Row 0 holds the per-frame rewards, row 1 the per-episode rewards
        # (previously the per-frame rewards were stacked twice).
        torch.save(
            torch.stack([
                torch.FloatTensor(frame_rewards),
                torch.FloatTensor(episode_rewards)
            ]), "results/rewards.torch")
def run(full_args: Namespace, return_models: bool = False):
    """Build the environments, model and agent, then run the training loop.

    Args:
        full_args: Experiment configuration. This function reads the
            ``main``, ``agent``, ``model`` and ``env_cfg`` sub-namespaces as
            well as ``run_id`` and ``out_dir``; ``extra_logs`` and
            ``main_r_key`` are optional attributes.
        return_models: When true, stop right after the agent is constructed
            and return it instead of training.

    Returns:
        ``(algo, model, envs, saver)`` when ``return_models`` is true,
        otherwise ``None`` once ``args.frames`` frames have been consumed.

    Raises:
        SystemExit: when the mean of the last ``max_eprews_window`` tracked
            rewards exceeds ``args.max_eprews`` (early stop).
    """
    if sys.argv[0].startswith("train"):
        import os
        full_args.out_dir = os.path.dirname(sys.argv[1])

    args = full_args.main
    agent_args = full_args.agent
    model_args = full_args.model
    extra_logs = getattr(full_args, "extra_logs", None)
    main_r_key = getattr(full_args, "main_r_key", None)

    # A seed of 0 is replaced by a value derived from the run id.
    if args.seed == 0:
        args.seed = full_args.run_id + 1
    max_eprews = args.max_eprews
    max_eprews_window = getattr(args, "max_eprews_window", 1)

    post_process_args(agent_args)
    post_process_args(model_args)

    model_dir = getattr(args, "model_dir", full_args.out_dir)
    print(model_dir)

    # ==========================================================================
    # @ torc_rl repo original
    # Define logger, CSV writer and Tensorboard writer
    logger = utils.get_logger(model_dir)
    csv_file, csv_writer = utils.get_csv_writer(model_dir)
    tb_writer = None
    if args.tb:
        from tensorboardX import SummaryWriter
        tb_writer = SummaryWriter(model_dir)

    # Log command and all script arguments
    logger.info("{}\n".format(" ".join(sys.argv)))
    logger.info("{}\n".format(args))

    # ==========================================================================
    # Set seed for all randomness sources
    utils.seed(args.seed)

    # ==========================================================================
    # Generate environments
    envs = []

    # Get env wrappers - must be a list of elements
    wrapper_method = getattr(full_args.env_cfg, "wrapper", None)
    if wrapper_method is None:
        def idem(x):
            return x

        env_wrapper = idem
    else:
        env_wrappers = [getattr(gym_wrappers, w_p) for w_p in wrapper_method]

        def env_wrapp(w_env):
            # Wrappers are applied in reverse of the configured order.
            for wrapper in env_wrappers[::-1]:
                w_env = wrapper(w_env)
            return w_env

        env_wrapper = env_wrapp

    actual_procs = getattr(args, "actual_procs", None)
    no_actions = getattr(full_args.env_cfg, "no_actions", 6)

    if actual_procs:
        # Split envs in chunks
        no_envs = args.procs
        envs, chunk_size = get_envs(full_args, env_wrapper, no_envs,
                                    n_actions=no_actions)
        first_env = envs[0][0]
        print(
            f"NO of envs / proc: {chunk_size}; No of processes {len(envs[1:])} + Master"
        )
    else:
        for i in range(args.procs):
            env = env_wrapper(gym.make(args.env))
            env.max_steps = full_args.env_cfg.max_episode_steps
            env.seed(args.seed + 10000 * i)
            envs.append(env)
        first_env = envs[0]

    # Generate evaluation envs
    eval_envs = []
    eval_episodes = getattr(full_args.env_cfg, "eval_episodes", 0)
    if full_args.env_cfg.no_eval_envs > 0:
        no_envs = full_args.env_cfg.no_eval_envs
        eval_envs, chunk_size = get_envs(full_args, env_wrapper, no_envs,
                                         n_actions=no_actions)

    # Define obss preprocessor
    max_image_value = full_args.env_cfg.max_image_value
    normalize_img = full_args.env_cfg.normalize
    permute = getattr(full_args.env_cfg, "permute", False)
    obss_preprocessor = getattr(full_args.env_cfg, "obss_preprocessor", None)
    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        args.env, first_env.observation_space, model_dir,
        max_image_value=max_image_value, normalize=normalize_img,
        permute=permute, type=obss_preprocessor)

    first_obs = first_env.reset()
    if "state" in first_obs:
        full_state_size = first_obs["state"].shape
        # Add full size shape
        add_to_cfg(full_args, MAIN_CFG_ARGS, "full_state_size", full_state_size)

    if "position" in first_obs:
        position_size = first_obs["position"].shape
        # Add full size shape
        add_to_cfg(full_args, MAIN_CFG_ARGS, "position_size", position_size)

        # Add the width and height of environment for position estimation
        model_args.width = first_env.unwrapped.width
        model_args.height = first_env.unwrapped.height

    # ==========================================================================
    # Load training status
    try:
        status = utils.load_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}

    saver = utils.SaveData(model_dir, save_best=args.save_best,
                           save_all=args.save_all)
    model, agent_data, other_data = None, dict(), None
    try:
        # Continue from last point
        model, agent_data, other_data = saver.load_training_data(best=False)
        logger.info("Training data exists & loaded successfully\n")
    except OSError:
        logger.info("Could not load training data\n")

    # ==========================================================================
    # Load Model
    if model is None:
        model = get_model(model_args, obs_space, first_env.action_space,
                          use_memory=model_args.mem)
        logger.info(f"Model [{model_args.name}] successfully created\n")

    # Print Model info
    logger.info("{}\n".format(model))

    if torch.cuda.is_available():
        model.cuda()
    logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

    # ==========================================================================
    # Load Agent
    algo = get_agent(full_args.agent, envs, model, agent_data,
                     preprocess_obss=preprocess_obss, reshape_reward=None,
                     eval_envs=eval_envs, eval_episodes=eval_episodes)

    has_evaluator = hasattr(algo, "evaluate") and full_args.env_cfg.no_eval_envs > 0

    if return_models:
        return algo, model, envs, saver

    # ==========================================================================
    # Train model
    prev_rewards = []
    crt_eprew = 0
    # other_data stays None when no training data could be loaded, so it has
    # to be checked before the membership test.
    if other_data is not None and "eprew" in other_data:
        crt_eprew = other_data["eprew"]

    num_frames = status["num_frames"]
    total_start_time = time.time()
    update = status["update"]
    update_start_time = time.time()

    while num_frames < args.frames:
        # Update model parameters
        logs = algo.update_parameters()
        num_frames += logs["num_frames"]
        update += 1

        if update % args.eval_interval == 0 and has_evaluator:
            eval_logs = algo.evaluate(eval_key=main_r_key)
            logs.update(eval_logs)

        prev_start_time = update_start_time
        update_start_time = time.time()

        # Print logs
        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_start_time - prev_start_time)
            duration = int(time.time() - total_start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += [
                "num_frames_" + key for key in num_frames_per_episode.keys()
            ]
            data += num_frames_per_episode.values()
            header += ["entropy", "value", "policy_loss", "value_loss"]
            data += [
                logs["entropy"], logs["value"], logs["policy_loss"],
                logs["value_loss"]
            ]
            header += ["grad_norm"]
            data += [logs["grad_norm"]]

            # add log fields that are not in the standard log format
            # (for example value_int)
            extra_fields = extra_log_fields(header, list(logs.keys()))
            header.extend(extra_fields)
            data += [logs[field] for field in extra_fields]

            # print to stdout the standard log fields + fields required in config
            keys_format, printable_data = print_keys(header, data, extra_logs)
            logger.info(keys_format.format(*printable_data))

            # The raw return_* columns go to CSV / Tensorboard only; they are
            # appended after the stdout line has been produced.
            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            # The stored status still reports zero frames only until the first
            # logged update, so the CSV header row is written once.
            if status["num_frames"] == 0:
                csv_writer.writerow(header)
            csv_writer.writerow(data)
            csv_file.flush()

            if args.tb:
                for field, value in zip(header, data):
                    tb_writer.add_scalar(field, value, num_frames)

            status = {"num_frames": num_frames, "update": update}

            if main_r_key is None:
                crt_eprew = list(rreturn_per_episode.values())[0]
                prev_rewards.append(crt_eprew)
            else:
                crt_eprew = logs[main_r_key]
                prev_rewards.append(logs[main_r_key])

        # -- Save vocabulary and model
        if args.save_interval > 0 and update % args.save_interval == 0:
            preprocess_obss.vocab.save()
            saver.save_training_data(model, algo.get_save_data(), crt_eprew)
            logger.info("Model successfully saved")
            utils.save_status(status, model_dir)

        # Early stop. The length test comes first so the mean is never taken
        # over an empty list.
        if (len(prev_rewards) > max_eprews_window
                and np.mean(prev_rewards[-max_eprews_window:]) > max_eprews):
            print(
                f"Reached mean return {max_eprews} for a window of {max_eprews_window} steps"
            )
            sys.exit()
def pull(self, endpoint, flag_id, flag):
    """Check a previously stored flag by querying the team's service.

    Two GET requests are made against ``http://<endpoint>:8000``: the billing
    page of the stored account and the validate page of the stored
    transaction. The result is derived from both status codes, the ``valid``
    cookie and the billing signature returned by the service.

    Args:
        endpoint: Host (name or address) of the service under test.
        flag_id: Serialized state; decoded with ``self.loader`` into a mapping
            that provides ``account``, ``tid``, ``sid`` and ``tsign``.
        flag: Unused by this method.

    Returns:
        On a transport failure, the tuple ``(Result.DOWN, flag_id)`` or
        ``(Result.MUMBLE, flag_id)`` (the latter for ``requests.HTTPError``),
        where ``flag_id`` is the decoded mapping. Otherwise a bare ``Result``
        member: ``UP``, ``CORRUPT`` or ``MUMBLE``.
        NOTE(review): the tuple/bare asymmetry is kept from the original
        code - confirm which shape the caller expects.
    """
    headers = {"User-Agent": get_agent()}
    flag_id = self.loader(flag_id)
    team_host = "http://%s:8000" % endpoint
    billing_cell = "/billing/%s/" % flag_id["account"]["username"]
    validate_cell = "/validate/%s/" % flag_id["tid"]

    def fetch(session, url):
        """GET ``url``; return ``(response, None)`` or ``(None, Result)``."""
        try:
            response = session.get(url, timeout=self.conn_timeout,
                                   headers=headers)
        except requests.HTTPError as ex:
            failure = Result.MUMBLE
            message = str(ex)
        except Exception as ex:
            # Connection errors, timeouts and anything unexpected all mean
            # the service could not be reached.
            failure = Result.DOWN
            message = str(ex)
        else:
            return response, None
        self.logger.error(self.validate_step_err % message)
        self.logger.debug(message, exc_info=True)
        return None, failure

    with requests.Session() as s:
        s.cookies.set("sessionid", flag_id["sid"])
        s.cookies.set("transaction_id", flag_id["tid"])
        check, failure = fetch(s, team_host + billing_cell)
        if failure is not None:
            return (failure, flag_id)

        # The signature cookie is only added for the validate request.
        s.cookies.set("transaction_sign", flag_id["tsign"])
        validate, failure = fetch(s, team_host + validate_cell)
        if failure is not None:
            return (failure, flag_id)

        if check.status_code != 200 or validate.status_code != 200:
            return Result.MUMBLE

        # A missing cookie or any value other than "True" is a protocol
        # violation; the original fell through here and returned None.
        if s.cookies.get("valid") != "True":
            return Result.MUMBLE

        received_sign = check.text.replace('\n', '').replace('\r', '')
        if flag_id["account"]["billing"]["sign"] == received_sign:
            return Result.UP
        return Result.CORRUPT
def train_on_simulator(shared_objects, cfg):
    """Train the configured agent against the alien agent on the simulator.

    The two agents act in turns on a batched ``ArificialMalmo`` environment.
    Every ``cfg.general.report_freq`` episodes the accumulated rewards are
    reported and the model is saved whenever the per-game mean reward
    improves on the best value seen so far.

    Args:
        shared_objects: Mapping shared between processes; this function reads
            ``"stats_leRMS"`` and passes the whole mapping to the agent.
        cfg: Experiment configuration (``general``, ``envs``, ``agent``,
            ``alien``, ``evaluation`` and ``training`` sections are read).
    """
    batch_size = cfg.general.batch_size
    stats = shared_objects["stats_leRMS"]

    # -- Initialize simulated environment
    env = ArificialMalmo(cfg.envs.simulated)
    print_info(
        "Environment initialized (batch_size:={:d}).".format(batch_size))

    # -- Initialize agent and wrap it in a Binary18BatchAgentWrapper :)
    Agent = get_agent(cfg.agent.type)
    agent = Agent(cfg.agent.name, ENV_ACTIONS, cfg, shared_objects)
    agent_runner = Binary18BatchAgentWrapper(agent, cfg.agent.name, cfg)
    print_info(
        "{:s}<{:s}> agent is up and waiting to learn. |role={:d}".format(
            cfg.agent.name, cfg.agent.type, cfg.agent.role))

    # -- Initialize alien
    alien = VillagePeopleEnvChallengeAgent(PigChaseChallengeAgent_V,
                                           cfg.alien.name,
                                           env._board_one_hot, cfg)
    print_info("Alien is up.")

    # -- Start training
    agents = [alien, agent_runner]
    agent_idx = 1
    env_agents_data = [env.agent0, env.agent1]

    # Empty tensor used only as a type/device template for type_as().
    dtype = torch.LongTensor(0)
    if cfg.general.use_cuda:
        dtype = dtype.cuda()

    def restartGame():
        """Reset the environment and both agents; return the fresh signals."""
        obs = env.reset()
        reward = torch.zeros(batch_size).type_as(dtype)
        done = torch.zeros(batch_size).type_as(dtype)
        for player in agents:
            player.reset()
        return obs, reward, done

    obs, reward, done = restartGame()
    ep_cnt = 0
    crt_agent = 0

    viz_rewards = torch.LongTensor(batch_size).type_as(dtype)
    viz_steps = torch.LongTensor(batch_size).type_as(dtype)

    # Batch of agents used for evaluation during training.
    eval_agents_count = batch_size
    if cfg.evaluation.during_training.truncate:
        eval_agents_count = int(batch_size * cfg.agent.exploration[0][1])
        viz_rewards = torch.LongTensor(eval_agents_count).type_as(dtype)

    viz_rewards.fill_(0)
    viz_steps.fill_(0)

    start_time = time.time()
    episode_time = AverageMeter()
    report_freq = cfg.general.report_freq

    print_info("No of epochs: {:d}. Max no of steps/epoch: {:d}".format(
        cfg.training.episodes_no, cfg.training.max_step_no))

    training_steps = cfg.training.episodes_no * cfg.training.max_step_no * 2

    start_episode_time = time.time()
    start_report_time = time.time()
    max_freq_r = -100
    max_freq_r_ep = -1

    for step in range(1, training_steps + 1):
        # check if env needs reset
        if env.done.all():
            episode_time.update(time.time() - start_episode_time)
            start_episode_time = time.time()
            obs, reward, done = restartGame()
            ep_cnt += 1
            stats.inc_episodes(batch_size)
            crt_agent = 0

            if ep_cnt % report_freq == 0:
                batch_mean_reward = torch.sum(viz_rewards) / report_freq
                game_mean_reward = batch_mean_reward / eval_agents_count
                last_report_time = time.time() - start_report_time
                start_report_time = time.time()

                # viz_steps always has batch_size entries while viz_rewards
                # only covers the first eval_agents_count games, so the step
                # counter is restricted to the same prefix before dividing.
                r_step = torch.mean(viz_rewards.float() /
                                    viz_steps[:eval_agents_count].float())

                if game_mean_reward > max_freq_r:
                    max_freq_r = game_mean_reward
                    max_freq_r_ep = ep_cnt
                    agent.model_utils.save_model(r_step, game_mean_reward,
                                                 ep_cnt, save_only_min=False)

                print_info("Ep: %d | batch_avg_R: %.4f | game_avg_R: %.4f "
                           "| R_step: %.4f | (Max_R: %.4f at ep %d)" %
                           (ep_cnt, batch_mean_reward, game_mean_reward,
                            r_step, max_freq_r, max_freq_r_ep))
                print_info(
                    "Ep: %d | (Ep_avg_time: %.4f) | (Last_report: %.4f)" %
                    (ep_cnt, episode_time.avg, last_report_time))

                viz_rewards.fill_(0)
                viz_steps.fill_(0)

        # select an action
        agent_act = agents[crt_agent].act(
            obs, reward, done, (1 - env_agents_data[crt_agent].got_done))
        stats.inc_frames((1 - env.done.long()).sum())

        # take a step
        obs, reward, done = env.do(agent_act)
        crt_agent = (crt_agent + 1) % 2

        # Statistics are accumulated only when the turn passes to the
        # learning agent (index agent_idx).
        if crt_agent == agent_idx:
            viz_steps.add_(1 - env.done.long())
            viz_rewards.add_(reward[:eval_agents_count])

    elapsed_time = time.time() - start_time
    print("Finished in %.2f seconds at %.2ffps." %
          (elapsed_time, training_steps / elapsed_time))
def get_post_form_headers(self, data=""):
    """Return the HTTP headers used to POST ``data`` as a URL-encoded form.

    The ``User-Agent`` value is whatever ``get_agent()`` returns and
    ``Content-Length`` is ``len(data)`` rendered as a string.
    """
    user_agent = get_agent()
    body_length = str(len(data))
    return {
        "User-Agent": user_agent,
        "Content-Type": "application/x-www-form-urlencoded",
        "Content-Length": body_length,
    }
def collect_from_malmo(id_, shared_objects, cfg):
    """Play PigChase episodes on Malmo and push their transitions to a queue.

    A challenger agent is started in a child process, then this worker plays
    role 1 in an endless loop. Every step is recorded as a tuple of numpy
    arrays ``(state, reward, done, action)``; when an episode ends, the list
    of recorded steps is put on ``shared_objects["queue"]``.

    Actions come either from a local agent built with ``get_agent`` or, when
    ``shared_objects`` contains ``"predict_queue"``, from whatever answers on
    ``shared_objects["answer_pipe"][id_]``.

    Args:
        id_: Index of this worker; used in log messages and to pick its
            answer pipe.
        shared_objects: Mapping with ``"reset"``, ``"session"`` and
            ``"queue"``, plus optionally ``"predict_queue"`` and
            ``"answer_pipe"``.
        cfg: Experiment configuration. ``cfg.general.use_cuda`` is forced to
            ``True`` by this function.

    Returns:
        ``-1`` if the very first environment reset yields ``None`` 100 times
        in a row. Otherwise the function does not return.
    """
    clients = cfg.envs.minecraft.ports
    my_id = id_
    reset = shared_objects["reset"]
    session_id = shared_objects["session"]
    queue = shared_objects["queue"]

    if "predict_queue" in shared_objects:
        use_predict_queue = True
        predict_queue = shared_objects["predict_queue"]
        answer_queue = shared_objects["answer_pipe"][my_id]
    else:
        use_predict_queue = False

    # ----------------------- Run Challenge agent ------------------------------
    challenger_stopped = mp.Value("i", 0)
    shared_obj_ = {"stopped": challenger_stopped}
    p = mp.Process(target=run_challenge_agent,
                   args=(id_, clients, shared_obj_))
    p.start()
    sleep(5)

    # ----------------------- Run VillageP Agent -------------------------------
    # --- Start agent
    agent_role = 1
    cfg.general.use_cuda = True

    if not use_predict_queue:
        agent_actor = get_agent(cfg.agent.type)(cfg.agent.name, ENV_ACTIONS,
                                                cfg, shared_objects)
        # SET Not max predictor
        agent_actor.predict_max = False

    def select_action(state_, reward_, done_):
        """Return the action for one step as a tensor."""
        if use_predict_queue:
            # NOTE(review): the meaning of the trailing constant 23 is not
            # visible here - confirm against the consumer of predict_queue.
            predict_queue.put(
                (my_id, state_.cpu().numpy(), done_.cpu().numpy(), 23))
            (act, _) = answer_queue.recv()
            return torch.LongTensor([act])
        return agent_actor.act(state_.cuda(), reward_.cuda(), done_.cuda(),
                               False)

    state_builder = PigChaseVillagePeopleBuilder18Binary(agent_role)
    print("A3C: ", clients)
    env = PigChaseEnvironment(clients, state_builder, role=1,
                              randomize_positions=True)

    agent_done = False
    reward = 0
    episode = 0
    step = 0

    obs = env.reset()
    received_none = 0
    while obs is None:
        # this can happen if the episode ended with the first
        # action of the other agent
        received_none += 1
        if received_none == 10:
            print("[[{}]] Panic !!! > Received {} None in a row".format(
                id_, received_none))
        if received_none == 100:
            print("[[{}]] Panic! Challenger stopped."
                  " Received {} None in a row".format(id_, received_none))
            return -1
        # Retry the reset; without this the loop could only count up to the
        # give-up threshold.
        obs = env.reset()

    print("[[{}]] Born an playing!".format(id_))

    ep_states = []
    crt_session_id = session_id.value

    while True:
        step += 1

        # check if env needs reset
        if env.done or agent_done:
            if challenger_stopped.value < 0:
                print("[[{}]] Child process ended!!".format(id_))

            if reset.value == 1:
                # --- Master is training network
                # ---- Restart ----------------------
                # TODO restart MInecraft process
                while reset.value == 1:
                    sleep(0.1)
                ep_states.clear()

            # Steps recorded under a previous session are dropped.
            if session_id.value != crt_session_id:
                ep_states.clear()
                crt_session_id = session_id.value

            if len(ep_states) > 0:
                # --- Will be restarted
                # Record the terminal observation before handing the
                # episode over.
                state_ = torch.LongTensor(obs).unsqueeze(0)
                done_ = torch.LongTensor([int(agent_done)])
                reward_ = torch.FloatTensor([reward])
                act = select_action(state_, reward_, done_)
                ep_states.append((state_.cpu().numpy(),
                                  reward_.cpu().numpy(),
                                  done_.cpu().numpy(),
                                  act.cpu().numpy()))
                queue.put(ep_states)
            ep_states = []

            obs = env.reset()
            received_none = 0
            while obs is None:
                # this can happen if the episode ended with the first
                # action of the other agent
                received_none += 1
                if received_none == 10:
                    print(
                        "[[{}]] Panic !!! > Received {} None in a row".format(
                            id_, received_none))
                if received_none == 10000:
                    print("[[{}]] Panic! Challenger stopped."
                          " Received {} None in a row".format(
                              id_, received_none))
                    sleep(5)
                obs = env.reset()
            episode += 1

        state_ = torch.LongTensor(obs).unsqueeze(0)
        reward_ = torch.FloatTensor([reward])
        done_ = torch.LongTensor([int(agent_done)])

        # agent_done still holds the previous episode's flag right after a
        # reset; the first step of the new episode is recorded with zeroed
        # reward and done.
        if agent_done:
            reward_[0] = 0
            done_[0] = 0
        act = select_action(state_, reward_, done_)

        ep_states.append((state_.cpu().numpy(), reward_.cpu().numpy(),
                          done_.cpu().numpy(), act.cpu().numpy()))

        obs, reward, agent_done = env.do(act[0])
rewards = [] dt_string = datetime.now().strftime("%d%m%Y%H%M%S") dir_name = "runs/eval_env_{}_agent_{}_memory_{}_{}".format( args.env_name, args.agent_type, args.memory_type, dt_string) logger = SummaryWriter(log_dir=dir_name) with open(os.path.join(dir_name, 'command_line_args.txt'), 'w') as f: json.dump(args.__dict__, f, indent=2) # Prepare envirornment env = environments.get_env(args.env_name) # Prepare memory module memory_module = memory.get_module(args.memory_type, args) # Prepare agent agent = agents.get_agent(args.agent_type, env, memory_module, dir_name, device, args) # Load saved model agent.load_model(args.model_path) # Iterate through episodes for episode in range(args.num_episodes): # Run episode and get reward rewards.append(run_episode(env, agent)) # Get average reward and log results, and add to to all_logging_dict reward_np = np.array(rewards) i = reward_np.shape[ 0] if reward_np.shape[0] < args.avg_episodes else args.avg_episodes avg_reward = reward_np[-i:].mean() logging_dict = { "avg_rewards": avg_reward,
if isinstance(config.model.load, str): checkpoint = torch.load(config.model.load) iteration = checkpoint['iteration'] reward = checkpoint['reward'] print("LOADING MODEL: {} ---> MAX R: {}".format(config.model.load, reward)) shared_model.load_state_dict(checkpoint['state_dict']) if config.general.use_cuda: shared_model.cuda() shared_objects = { "model": shared_model, "stats_leRMS": AtomicStatistics() } agent_actor = get_agent(config.agent.type)(config.agent.name, ENV_ACTIONS, config, shared_objects) agent_role = 1 ag1 = Binary18BatchAgentWrapper(agent_actor, config.agent.name, config, is_training=False) # ag1 = VillagePeopleEnvChallengeAgent(PigChaseChallengeAgent_V, "Agent_2", # env._board_one_hot, config) # ag1 = MalmoAgentWrapper(PigChaseChallengeAgent, "Agent_1", config) # ag1 = VillagePeopleEnvRandomAgent("Agent_2", config) agents = [ag0, ag1] env_agents = [env.agent0, env.agent1] start = time.time()
def train():
    """Train the RL agent on a text-game environment, optionally resuming.

    The configuration comes from ``utils.get_rl_args()``. When
    ``config.resume_folder`` is set and the folder exists, the frame counter
    is restored from the last data row of ``loss.csv`` and the checkpoint is
    loaded if ``model.pt`` is present. Training alternates between learning
    steps on batches from the replay buffer and ``config.update_freq``
    environment steps; networks, optimizer and replay buffer are saved and a
    summary line is logged after every pass over the wrapped dataset.
    """
    # initialize environments and set up logging folders
    config = utils.get_rl_args()
    rom = config.rom_path.format(config.env_id)
    env = utils.make_env(rom, 0, max_episode_steps=config.env_step_limit)

    frame_idx = 0
    resume_flag = False
    if bool(config.resume_folder):
        folder_path = os.path.join(config.log_dir, config.env_id,
                                   config.resume_folder)
        if os.path.exists(folder_path):
            print('## Resume training from ', folder_path)
            last_frame_idx = 0
            with open(os.path.join(folder_path, 'loss.csv'), 'r') as f:
                for line in f:
                    # Lines starting with '#' are not data rows.
                    if line[0] == '#':
                        continue
                    last_frame_idx = int(line.split(',')[0])
            print('## ## last_frame_idx', last_frame_idx)
            frame_idx = last_frame_idx
            if os.path.exists(os.path.join(folder_path, 'model.pt')):
                resume_flag = True
        else:
            print('## Initialize training from ', folder_path)
            try:
                os.makedirs(folder_path)
            except OSError:
                print('Creating {} folder failed.'.format(folder_path))
    else:
        folder_path = utils.setup_experiment_folder(config)

    model = agents.get_agent(config=config, env=env, log_dir=folder_path)
    monitor = utils.ExperimentMonitor(config, folder_path)

    if resume_flag:
        print('## load checkpoint from ', folder_path)
        model.load_checkpoint(folder_path)
        monitor.add_separator()

    dataset = utils.wrap_experience_replay(
        model.replay_buffer, config,
        size_limit=config.experiment_monitor_freq * config.batch_size)

    episode_logger = {'reward': 0, 'init_time': 0, 'num': 0}
    greedy = (config.exploit_type == 'greedy')

    # some logging functions
    def logging(s, print_=True, log_=True):
        """Print ``s`` and/or append it to ``log.txt`` in the run folder."""
        if print_:
            print(s)
        if log_:
            with open(os.path.join(folder_path, 'log.txt'), 'a+') as f_log:
                f_log.write(s + '\n')
                f_log.flush()

    # The dump_* helpers only write for one episode out of every
    # config.training_dump_freq.
    def dump_trajectory_action(action_text, actions, action_id, frame_idx):
        """Log the chosen action for episodes selected for dumping."""
        if (episode_logger['num'] % config.training_dump_freq ==
                config.training_dump_freq - 1):
            frame_num = frame_idx - episode_logger['init_time']
            logging('[Episode {} step {}] Act: {}=({})\n'.format(
                episode_logger['num'], frame_num, action_text,
                actions[action_id]))
        return

    def dump_trajectory_state(obs_text, frame_idx):
        """Log the '|'-separated observation for episodes selected for dumping."""
        if (episode_logger['num'] % config.training_dump_freq ==
                config.training_dump_freq - 1):
            st = obs_text.split('|')
            logging('[Episode {} step {}] Obs: \nl={}\ni={}\no={}\n'.format(
                episode_logger['num'],
                frame_idx - episode_logger['init_time'],
                clean(st[0]), clean(st[1]), clean(st[2])))
        return

    def dump_rewards(reward, frame_idx):
        """Log step and cumulative reward for episodes selected for dumping."""
        if (episode_logger['num'] % config.training_dump_freq ==
                config.training_dump_freq - 1):
            logging('[Episode {} step {}] Reward:{}, CumR:{}'.format(
                episode_logger['num'],
                frame_idx - episode_logger['init_time'],
                reward,
                episode_logger['reward'],
            ))
        return

    # history observation
    obs_history = utils.ObservationHistory(config.history_window)

    # interact with the environment
    def actor_step(obs_ids, action_tuple, frame_idx):
        """Take one environment step and store the transition.

        Returns the encoded observation and encoded action set to use on the
        next call; after a terminal step these describe the first state of
        the freshly reset episode.
        """
        # compute current action
        template_ids, obj1_pos, obj2_pos, actions = action_tuple
        epsilon = config.epsilon_by_frame(frame_idx)
        action_id, action_text, prob = model.get_action(obs_ids, action_tuple,
                                                        epsilon, greedy=greedy)
        dump_trajectory_action(action_text, actions, action_id, frame_idx)

        # interact with the environment
        next_obs_text, reward, done, next_info = env.step(action_text,
                                                          parallel=True)
        episode_logger['reward'] += reward

        # history part
        past_obs = ""
        if config.use_history:
            active_entity = obs_history.extract_entity(next_info['valid_act'])
            past_obs = obs_history.retrieve_obs(active_entity)
            obs_history.update_history(active_entity, next_obs_text)

        dump_rewards(reward, frame_idx)
        dump_trajectory_state(next_obs_text, frame_idx + 1)

        next_obs_ids = model.encode_observation(past_obs + next_obs_text)
        next_action_tuple = model.encode_action(next_info['valid_act'],
                                                next_info['objs'],
                                                next_obs_ids)
        next_template_ids = next_action_tuple[0]
        next_obj1_pos = next_action_tuple[1]
        next_obj2_pos = next_action_tuple[2]

        # update experience replay
        model.update_experience_replay(s=obs_ids,
                                       aset=(template_ids, obj1_pos, obj2_pos),
                                       a=action_id,
                                       r=reward,
                                       done=done,
                                       ns=next_obs_ids,
                                       na=(next_template_ids, next_obj1_pos,
                                           next_obj2_pos))

        # tracking behavior trajectories
        monitor.add_ard(frame_idx, actions[action_id], reward, done, prob)

        if done or env.env.emulator_halted():
            score = next_info['score']
            model.reset_hx()
            next_obs_text, next_info = env.reset(parallel=True)

            past_obs = ""
            if config.use_history:
                obs_history.reset()
                # Seed the history from the episode that has just started,
                # not from the enclosing scope's first-ever reset.
                active_entity = obs_history.extract_entity(
                    next_info['valid_act'])
                past_obs = obs_history.retrieve_obs(active_entity)
                obs_history.update_history(active_entity, next_obs_text)

            next_obs_ids = model.encode_observation(past_obs + next_obs_text)
            next_action_tuple = model.encode_action(next_info['valid_act'],
                                                    next_info['objs'],
                                                    next_obs_ids)

            monitor.add_episode_reward(episode_logger['reward'], score,
                                       frame_idx)
            episode_logger['reward'] = 0
            episode_logger['init_time'] = frame_idx
            episode_logger['num'] += 1
            dump_trajectory_state(next_obs_text, frame_idx)

        return next_obs_ids, next_action_tuple

    logging(str(config))

    obs_text, info = env.reset(parallel=True)
    past_obs = ""
    if config.use_history:
        active_entity = obs_history.extract_entity(info['valid_act'])
        past_obs = obs_history.retrieve_obs(active_entity)
        obs_history.update_history(active_entity, obs_text)
    dump_trajectory_state(obs_text, frame_idx)

    obs_ids = model.encode_observation(past_obs + obs_text)
    action_tuple = model.encode_action(info['valid_act'], info['objs'],
                                       obs_ids)

    start = timer()
    model.reset_time_log()
    act_time = 0

    # pre-fill exp replay for |learn_start| steps
    if frame_idx < config.learn_start:
        for time_step in tqdm(range(config.learn_start),
                              desc='non-train step'):
            obs_ids, action_tuple = actor_step(obs_ids, action_tuple,
                                               frame_idx=frame_idx)
            frame_idx += 1

    loop_length = config.experiment_monitor_freq * config.update_freq
    loop_start = frame_idx // loop_length
    loop_max = int(config.max_steps / loop_length) + 1

    for loop_idx in range(loop_start, loop_max):
        time_start = loop_idx * loop_length
        time_end = time_start + loop_length

        for batch_vars in tqdm(dataset,
                               desc='training step {}-{}'.format(
                                   time_start, time_end)):
            # one step update
            td_loss, aux_loss = model.learn_step(batch_vars)
            norm = model.get_trainable_parameter_norm()
            monitor.add_loss(frame_idx, td_loss, aux_loss, norm)

            # interact with environment and write data
            act_ep_time = int(round(time.time() * 1000))
            for _ in range(config.update_freq):
                obs_ids, action_tuple = actor_step(obs_ids, action_tuple,
                                                   frame_idx)
                frame_idx += 1
            act_time += int(round(time.time() * 1000)) - act_ep_time

        model.save_networks()
        model.save_optimizer()
        model.save_replay()

        e_r, score = monitor.get_episode_reward_record()
        action_record = monitor.get_action_record()
        td_avg, td_max, td_min = monitor.get_td_record()
        norm_avg, norm_max, norm_min = monitor.get_norm_record()
        exp_avg, exp_max, exp_min = monitor.get_exploration_record()

        logging(
            'step {}, time {}, episode {}, R (avg/max/min) '
            '{:.1f}/{:.1f}/{:.1f}::{:.1f}/{:.1f}/{:.1f}, '
            'epx (p/n) {:.0f}/{:.0f} \n'
            'tpl (max/avg/num) {:.2f}/{:.2f}/{}, '
            'obj (max/avg/num) {:.2f}/{:.2f}/{}, '
            'td (avg/max) {:.3f}/{:.3f}, norm (avg) {:.5f}, '
            'eps {:.3f}/{:.3f}:{:.3f}:{:.3f}'.format(
                frame_idx, timedelta(seconds=int(timer() - start)),
                episode_logger['num'],
                e_r[0], e_r[1], e_r[2],
                score[0], score[1], score[2],
                len(model.replay_buffer.priority_buffer),
                len(model.replay_buffer.buffer),
                action_record['template'][0],
                action_record['template'][1],
                action_record['template'][2],
                action_record['obj'][0],
                action_record['obj'][1],
                action_record['obj'][2],
                td_avg, td_max, norm_avg,
                config.epsilon_by_frame(frame_idx),
                exp_avg, exp_max, exp_min))

        model.print_time_log()
        model.reset_time_log()
        print('- act time:{}'.format(timedelta(milliseconds=act_time)))
        act_time = 0

    model.save_checkpoint()
    env.close()
def main():
    """Run one Actor-Critic training experiment from the command line.

    Steps: parse the CLI arguments, load the JSON configuration, create the
    experiment folder ``<output_dir>/<agent name>`` (deleting an existing one
    only when ``--replace`` is given), set up file logging, start a Weights &
    Biases run, train the agent and write ``experiment_information.json``.

    Raises:
        FileExistsError: if the experiment folder already exists and
            ``--replace`` was not passed.
    """
    parser = argparse.ArgumentParser(
        description="Train an Actor-Critic agent that plays a specific environment.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    required_named = parser.add_argument_group('REQUIRED named arguments')
    required_named.add_argument("--config_file", type=str, required=True,
                                help="Configuration file for the experiment.")
    parser.add_argument("--output_dir", type=str, default=EXPERIMENTS_DIR,
                        help="Where to save the experiment files")
    parser.add_argument("--debug", action="store_true", default=False,
                        help="Activate to run Tensorflow in eager mode.")
    parser.add_argument("--replace", action="store_true", default=False,
                        help="Activate to replace old experiment with the same name in the output folder.")
    args = parser.parse_args()

    # On debug mode all functions are executed normally (eager mode)
    if args.debug:
        tf.config.run_functions_eagerly(True)

    # Get git version
    repo = Repo(search_parent_directories=True)
    sha = repo.head.object.hexsha

    # Use provided configurations file
    config_file = Path(args.config_file)
    config = ConfigManager.from_json_file(config_file)

    # Create experiment folder and handle old results
    output_dir = Path(args.output_dir)
    agent_folder = Path(output_dir, config.agent_config.name)
    deleted_old = False
    if agent_folder.exists():
        if args.replace:
            shutil.rmtree(agent_folder)
            deleted_old = True
        else:
            # The message names the real flag (--replace) and separates its
            # sentences with a space.
            raise FileExistsError(f"The experiment {agent_folder} already exists. "
                                  f"Change output folder, experiment name or use --replace "
                                  f"to overwrite.")
    agent_folder.mkdir(parents=True)

    # Save experiments configurations and start experiment log
    prepare_file_logger(logger, logging.INFO, Path(agent_folder, "experiment.log"))
    logger.info(f"Running experiment {config.agent_config.name}")
    if deleted_old:
        logger.info(f"Deleted old experiment in {agent_folder}")
    config.log_configurations(logger)
    experiment_config_file = Path(agent_folder, "configurations.json")
    logger.info(f"Saving experiment configurations to {experiment_config_file}")
    config.to_json_file(experiment_config_file)

    # NOTE(review): the wandb directory is a fixed "experiments/<name>" path
    # and does not follow --output_dir - confirm this is intended.
    wandbrun = wandb.init(project=f"AC-{config.agent_config.env}",
                          name=config.agent_config.name,
                          group=config.agent_config.agent_type,
                          notes=config.agent_config.desc,
                          config=config.as_single_dict(),
                          reinit=True,
                          dir=f"experiments/{config.agent_config.name}")

    # Create agent
    agent = get_agent(config.agent_config.agent_type)(agent_path=agent_folder,
                                                      config=config)

    start_time = time.time()
    test_reward = agent.train_policy(training_config=config.training_config)
    train_time = time.time() - start_time

    experiment_info = {"mean_test_reward": float(test_reward),
                       "name": config.agent_config.name,
                       "description": config.agent_config.desc,
                       "git_hash": sha,
                       "train_time": train_time}
    with open(Path(agent_folder, "experiment_information.json"), "w") as outfile:
        json.dump(experiment_info, outfile, indent=4)

    wandbrun.finish()