def main(args):
    # init env
    train_env_args, test_env_args = args['train_env_args'], args['test_env_args']
    train_env = init_env(**train_env_args)
    test_env = init_env(**test_env_args)

    # init net and agent
    device_online = torch.device(args['device_online'])
    device_train = torch.device(args['device_train'])

    nn_online = init_actor_critic(args['actor_critic_nn_type'], args['actor_critic_nn_args'])
    nn_train = init_actor_critic(args['actor_critic_nn_type'], args['actor_critic_nn_args'])
    nn_online.to(device_online)
    nn_train.to(device_train)

    policy = args['policy']
    policy_args = args['policy_args']
    agent_online = AgentInference(nn_online, device_online, policy, policy_args)

    train_agent_args = args['train_agent_args']
    agent_type = train_agent_args['agent_type']
    if agent_type == 'A2C':
        agent_class = A2C
    elif agent_type == 'PPO':
        agent_class = PPO
    elif agent_type == 'V-MPO':
        agent_class = VMPO
    else:
        raise ValueError(f'only A2C, PPO and V-MPO agents supported, provided {agent_type}')

    optimization_params = train_agent_args['optimization_params']
    additional_params = train_agent_args['additional_params']
    agent_train = agent_class(
        nn_train, device_train, policy, policy_args,
        normalize_adv=train_agent_args['normalize_advantage'],
        returns_estimator=train_agent_args['returns_estimator'],
        **optimization_params, **additional_params
    )

    # init and run trainer
    trainer_args = args['trainer_args']
    trainer = OnPolicyTrainer(
        agent_online, agent_train, train_env,
        **trainer_args,
        test_env=test_env,
        log_dir=args['log_dir']
    )

    if args['load_checkpoint'] is not None:
        trainer.load(args['load_checkpoint'])

    training_args = args['training_args']
    trainer.train(**training_args)

    train_env.close()
    test_env.close()
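A minimal sketch of the `args` dictionary that could drive the on-policy `main` above. The key names come directly from the function body; every concrete value (environment name, network sizes, optimizer settings, epoch counts) is an illustrative assumption, not a configuration from the original project.

# Hypothetical example config; all values are placeholder assumptions.
args = {
    'train_env_args': {'env_name': 'CartPole-v1', 'env_num': 8},
    'test_env_args': {'env_name': 'CartPole-v1', 'env_num': 4},
    'device_online': 'cpu',
    'device_train': 'cuda',
    'actor_critic_nn_type': 'MLP',
    'actor_critic_nn_args': {'hidden_size': 64},
    'policy': 'Categorical',
    'policy_args': {},
    'train_agent_args': {
        'agent_type': 'PPO',
        'normalize_advantage': True,
        'returns_estimator': 'gae',
        'optimization_params': {'learning_rate': 3e-4, 'clip_grad': 0.5},
        'additional_params': {'ppo_epsilon': 0.2},
    },
    'trainer_args': {},
    'log_dir': 'logs/ppo_cartpole/',
    'load_checkpoint': None,
    'training_args': {'n_epoch': 10},
}

main(args)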
def main(args):
    # init env
    test_env_args = args['test_env_args']
    test_env = init_env(**test_env_args)

    # init net and agent
    device_train = torch.device(args['device_train'])
    nn_train = init_actor_critic(args['actor_critic_nn_type'], args['actor_critic_nn_args'])
    nn_train.to(device_train)
    agent = BehaviorCloning(
        nn_train, device_train, args['policy'],
        lr=args['learning_rate'], clip_grad=args['clip_grad'],
        normalize_adv=None, returns_estimator=None, gamma=None, entropy=None
    )

    # init demo buffer
    demo_data = BCDataSet(args['demo_file'])
    demo_buffer = DataLoader(demo_data, args['batch_size'], shuffle=True)

    # init trainer and train
    trainer = BehaviorCloningTrainer(agent, test_env, demo_buffer, args['log_dir'])
    if args['load_checkpoint'] is not None:
        trainer.load(args['load_checkpoint'])
    trainer.train(args['n_epoch'], args['n_tests_per_epoch'])

    test_env.close()
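A hedged usage sketch: one way the behavior-cloning `main` above might be invoked from the command line, parsing a YAML file into the `args` dict it expects. The flag name and config layout are assumptions, not part of the original script.

# Hypothetical entry point; the --config flag and YAML layout are assumptions.
if __name__ == '__main__':
    import argparse
    import yaml

    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, required=True)
    cli_args = parser.parse_args()

    # the YAML file is expected to deserialize into the args dict used above
    with open(cli_args.config) as f:
        main(yaml.safe_load(f))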
def play_from_folder(
        folder, config_path, checkpoint_path,
        deterministic, silent, pause_first,
        n_episodes, save_gif, reward_threshold, save_demo,
):
    if save_gif:
        raise ValueError('gif saving is not yet implemented...')

    with open(folder + config_path) as f:
        config = yaml.safe_load(f)

    test_env_args = config['test_env_args']
    test_env_args['env_num'] = 1
    test_env = init_env(**test_env_args)

    device = torch.device('cpu')
    nn_online = init_actor_critic(config['actor_critic_nn_type'], config['actor_critic_nn_args'])
    nn_online.to(device)

    policy = config['policy']
    policy_args = config['policy_args']
    agent = AgentInference(nn_online, device, policy, policy_args)
    agent.load(folder + checkpoint_path, map_location='cpu')
    agent.eval()

    play_n_episodes(
        test_env, agent, deterministic, n_episodes,
        silent, reward_threshold, save_demo, pause_first
    )
    test_env.close()
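A sketch of a possible command-line wrapper for `play_from_folder`, assuming the surrounding project exposes it as a script. Every flag name, shorthand, and default below is an illustrative assumption.

# Hypothetical CLI wrapper; flag names and defaults are assumptions.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--folder', '-f', required=True)
    parser.add_argument('--config_path', '-c', default='config.yaml')
    parser.add_argument('--checkpoint_path', '-p', required=True)
    parser.add_argument('--deterministic', '-d', action='store_true')
    parser.add_argument('--silent', '-s', action='store_true')
    parser.add_argument('--pause_first', action='store_true')
    parser.add_argument('--n_episodes', '-n', type=int, default=5)
    parser.add_argument('--save_gif', action='store_true')
    parser.add_argument('--reward_threshold', '-r', type=float, default=None)
    parser.add_argument('--save_demo', default=None)

    # keyword names match the function parameters, so unpacking works directly
    play_from_folder(**vars(parser.parse_args()))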
def main():
    create_log_dir(log_dir, __file__)
    test_env = init_env(**env_args, env_num=test_env_num)

    demo_data = BCDataSet(demo_file)
    demo_buffer = DataLoader(demo_data, batch_size=batch_size, shuffle=True)

    agent = make_agent_online()
    trainer = BehaviorCloningTrainer(agent, demo_buffer, test_env=test_env, log_dir=log_dir)
    trainer.train(**train_args)
    test_env.close()
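The `main` above reads module-level names (`log_dir`, `env_args`, `test_env_num`, `demo_file`, `batch_size`, `train_args`, `make_agent_online`). A hedged sketch of how those globals might be defined follows; all concrete values and the body of `make_agent_online` are placeholder assumptions.

# Hypothetical module-level config for the globals read by main() above;
# every value here is an illustrative assumption.
log_dir = 'logs/bc_run/'
env_args = {'env_name': 'CartPole-v1'}
test_env_num = 4
demo_file = 'demo/demo.pickle'
batch_size = 256
train_args = {'n_epoch': 10, 'n_tests_per_epoch': 100}


def make_agent_online():
    # assumed helper: builds a net and wraps it into a BC-trainable agent,
    # mirroring the BehaviorCloning signature used elsewhere in these scripts
    nn_online = init_actor_critic('MLP', {'hidden_size': 64})
    return BehaviorCloning(nn_online, torch.device('cpu'), 'Categorical', {})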
def main(args):
    # init env
    test_env_args = args['test_env_args']
    test_env = init_env(**test_env_args)

    # init net and agent
    device_train = torch.device(args['device_train'])
    nn_train = init_actor_critic(args['actor_critic_nn_type'], args['actor_critic_nn_args'])
    nn_train.to(device_train)

    policy = args['policy']
    policy_args = args['policy_args']
    train_agent_args = args['train_agent_args']
    optimization_params = train_agent_args['optimization_params']
    additional_params = train_agent_args['additional_params']
    agent = BehaviorCloning(
        nn_train, device_train, policy, policy_args,
        **optimization_params, **additional_params
    )

    # init demo buffer
    demo_data = BCDataSet(args['demo_file'])
    demo_buffer = DataLoader(demo_data, args['batch_size'], shuffle=True)

    # init trainer and train
    trainer = BehaviorCloningTrainer(agent, demo_buffer, test_env=test_env, log_dir=args['log_dir'])
    if args['load_checkpoint'] is not None:
        trainer.load(args['load_checkpoint'])

    training_args = args['training_args']
    trainer.train(**training_args)

    test_env.close()
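For completeness, a sketch of the config shape this behavior-cloning `main` consumes. The key names mirror the function body; all values are placeholder assumptions.

# Hypothetical args for the BC main() above; values are placeholders.
args = {
    'test_env_args': {'env_name': 'CartPole-v1', 'env_num': 4},
    'device_train': 'cpu',
    'actor_critic_nn_type': 'MLP',
    'actor_critic_nn_args': {'hidden_size': 64},
    'policy': 'Categorical',
    'policy_args': {},
    'train_agent_args': {
        'optimization_params': {'learning_rate': 3e-4, 'clip_grad': 0.5},
        'additional_params': {},
    },
    'demo_file': 'demo/demo.pickle',
    'batch_size': 256,
    'log_dir': 'logs/bc/',
    'load_checkpoint': None,
    'training_args': {'n_epoch': 10, 'n_tests_per_epoch': 100},
}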
config["algo"], config["network"], config["share_policy"], config["seed"], ) config["meta"] = args.meta config["br"] = args.br if config["trainer"] == "meta": config["output_dir"] += "-{}-{}".format(config["meta"], config["br"]) config["output_dir"] += "-{}".format(now.strftime("%Y%m%d%H:%M:%S")) config["save_path"] = os.path.join(config["output_dir"], config["save_path"]) env, n_agents, env_info = init_env(config) print("output folder is: ", config["output_dir"]) print("env info: ", env.observation_space, env.action_space, env.n_agents) with tf.device(device): trainer = TRAINERs[config["trainer"]](env, env.n_agents, config) trainer.train() # create time logging later_time = datetime.now() difference = later_time - now print(later_time.strftime("%Y%m%d%H:%M:%S")) with open(config["output_dir"] + "/" + 'time.txt', 'a') as a_writer: a_writer.write('Total time in seconds: {}'.format(