Example #1
def main(args):
    # init env
    train_env_args, test_env_args = args['train_env_args'], args['test_env_args']
    train_env = init_env(**train_env_args)
    test_env = init_env(**test_env_args)

    # init net and agent
    device_online = torch.device(args['device_online'])
    device_train = torch.device(args['device_train'])

    nn_online = init_actor_critic(args['actor_critic_nn_type'], args['actor_critic_nn_args'])
    nn_train = init_actor_critic(args['actor_critic_nn_type'], args['actor_critic_nn_args'])

    nn_online.to(device_online)
    nn_train.to(device_train)

    policy = args['policy']
    policy_args = args['policy_args']

    agent_online = AgentInference(nn_online, device_online, policy, policy_args)

    train_agent_args = args['train_agent_args']
    agent_type = train_agent_args['agent_type']
    if agent_type == 'A2C':
        agent_class = A2C
    elif agent_type == 'PPO':
        agent_class = PPO
    elif agent_type == 'V-MPO':
        agent_class = VMPO
    else:
        raise ValueError(f'only A2C, PPO and V-MPO agents supported, provided {agent_type}')
    optimization_params = train_agent_args['optimization_params']
    additional_params = train_agent_args['additional_params']

    agent_train = agent_class(
        nn_train, device_train,
        policy, policy_args,
        normalize_adv=train_agent_args['normalize_advantage'],
        returns_estimator=train_agent_args['returns_estimator'],
        **optimization_params, **additional_params
    )

    # init and run trainer
    trainer_args = args['trainer_args']
    trainer = OnPolicyTrainer(
        agent_online, agent_train,
        train_env,
        **trainer_args,
        test_env=test_env,
        log_dir=args['log_dir']
    )

    if args['load_checkpoint'] is not None:
        trainer.load(args['load_checkpoint'])

    training_args = args['training_args']
    trainer.train(**training_args)

    train_env.close()
    test_env.close()
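
For orientation, below is a sketch of the args dictionary this entry point reads. Only the top-level key names are taken from the code above; every value, and the contents of the nested dictionaries, are made-up placeholders. In practice the same kind of config would typically be loaded from a YAML file with yaml.safe_load, as the play script in Example #3 does.

# Sketch of a config for Example #1 (key names mirror what main() reads;
# all values and nested keys are illustrative placeholders).
args = {
    'train_env_args': {'env_name': 'CartPole-v1', 'env_num': 8},
    'test_env_args': {'env_name': 'CartPole-v1', 'env_num': 4},
    'device_online': 'cpu',
    'device_train': 'cuda',
    'actor_critic_nn_type': 'MLP',
    'actor_critic_nn_args': {'hidden_size': 64},
    'policy': 'Categorical',
    'policy_args': {},
    'train_agent_args': {
        'agent_type': 'PPO',  # one of 'A2C', 'PPO', 'V-MPO'
        'normalize_advantage': True,
        'returns_estimator': 'gae',
        'optimization_params': {'learning_rate': 3e-4},  # forwarded to the agent constructor
        'additional_params': {},
    },
    'trainer_args': {},
    'log_dir': 'logs/ppo_cartpole/',
    'load_checkpoint': None,  # or a path to a saved trainer checkpoint
    'training_args': {'n_epoch': 10},
}

main(args)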
Example #2
def main(args):
    test_env_args = args['test_env_args']
    test_env = init_env(**test_env_args)

    # init agent
    device_train = torch.device(args['device_train'])
    nn_train = init_actor_critic(args['actor_critic_nn_type'],
                                 args['actor_critic_nn_args'])
    nn_train.to(device_train)
    policy = args['policy']
    policy_args = args['policy_args']

    agent = BehaviorCloning(nn_train, device_train, policy, policy_args,
                            lr=args['learning_rate'],
                            clip_grad=args['clip_grad'])

    # init demo buffer
    demo_data = BCDataSet(args['demo_file'])
    demo_buffer = DataLoader(demo_data, args['batch_size'], shuffle=True)

    # init trainer and train
    trainer = BehaviorCloningTrainer(agent, test_env, demo_buffer,
                                     args['log_dir'])
    if args['load_checkpoint'] is not None:
        trainer.load(args['load_checkpoint'])
    trainer.train(args['n_epoch'], args['n_tests_per_epoch'])
    test_env.close()
Example #3
def play_from_folder(
    folder,
    config_path,
    checkpoint_path,
    deterministic,
    silent,
    pause_first,
    n_episodes,
    save_gif,
    reward_threshold,
    save_demo,
):
    if save_gif:
        raise ValueError('gif saving is not yet implemented...')

    with open(folder + config_path) as f:
        config = yaml.safe_load(f)

    test_env_args = config['test_env_args']
    test_env_args['env_num'] = 1
    test_env = init_env(**test_env_args)

    device = torch.device('cpu')
    nn_online = init_actor_critic(config['actor_critic_nn_type'],
                                  config['actor_critic_nn_args'])
    nn_online.to(device)
    policy = config['policy']
    policy_args = config['policy_args']
    agent = AgentInference(nn_online, device, policy, policy_args)
    agent.load(folder + checkpoint_path, map_location='cpu')
    agent.eval()
    play_n_episodes(test_env, agent, deterministic, n_episodes, silent,
                    reward_threshold, save_demo, pause_first)
    test_env.close()
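
A possible invocation of the play script above; all paths are hypothetical, and the actual layout of the run folder depends on how the trainer saved the config and checkpoint. Note that folder is joined with config_path and checkpoint_path by plain string concatenation, so it has to end with a '/'.

# Hypothetical call: replay a trained policy from a run folder.
play_from_folder(
    folder='logs/ppo_cartpole/',          # must end with '/' (plain concatenation)
    config_path='config.yaml',
    checkpoint_path='checkpoints/last.pth',
    deterministic=True,
    silent=False,
    pause_first=False,
    n_episodes=5,
    save_gif=False,                       # gif saving currently raises ValueError
    reward_threshold=None,
    save_demo=None,
)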
Example #4
def main():
    create_log_dir(log_dir, __file__)

    test_env = init_env(**env_args, env_num=test_env_num)
    demo_data = BCDataSet(demo_file)
    demo_buffer = DataLoader(demo_data, batch_size=batch_size, shuffle=True)
    agent = make_agent_online()

    trainer = BehaviorCloningTrainer(agent,
                                     demo_buffer,
                                     test_env=test_env,
                                     log_dir=log_dir)
    trainer.train(**train_args)

    test_env.close()
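
Unlike the other entry points, this main() takes no arguments and relies on module-level settings (log_dir, env_args, test_env_num, demo_file, batch_size, train_args) and a make_agent_online() helper. Below is a minimal sketch of what those definitions might look like; all values are placeholders, and the body of make_agent_online is an assumption modeled on Example #5.

# Hypothetical module-level settings for Example #4 (illustrative values only).
log_dir = 'logs/bc_run/'
env_args = {'env_name': 'CartPole-v1'}
test_env_num = 4
demo_file = 'demo/expert_episodes.pkl'
batch_size = 256
train_args = {'n_epoch': 10, 'n_tests_per_epoch': 100}


def make_agent_online():
    # Assumption: the trainer expects a BehaviorCloning agent, as in Example #5;
    # the network type, policy name and device are placeholders.
    device = torch.device('cpu')
    nn_online = init_actor_critic('MLP', {'hidden_size': 64})
    nn_online.to(device)
    return BehaviorCloning(nn_online, device, 'Categorical', {})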
Example #5
def main(args):
    # init env
    test_env_args = args['test_env_args']
    test_env = init_env(**test_env_args)

    # init net and agent
    device_train = torch.device(args['device_train'])
    nn_train = init_actor_critic(args['actor_critic_nn_type'],
                                 args['actor_critic_nn_args'])
    nn_train.to(device_train)
    policy = args['policy']
    policy_args = args['policy_args']

    train_agent_args = args['train_agent_args']
    optimization_params = train_agent_args['optimization_params']
    additional_params = train_agent_args['additional_params']

    agent = BehaviorCloning(nn_train, device_train, policy, policy_args,
                            **optimization_params, **additional_params)

    # init demo buffer
    demo_data = BCDataSet(args['demo_file'])
    demo_buffer = DataLoader(demo_data, args['batch_size'], shuffle=True)

    # init trainer and train
    trainer = BehaviorCloningTrainer(agent,
                                     demo_buffer,
                                     test_env=test_env,
                                     log_dir=args['log_dir'])
    if args['load_checkpoint'] is not None:
        trainer.load(args['load_checkpoint'])

    training_args = args['training_args']
    trainer.train(**training_args)

    test_env.close()
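
As with Example #1, here is a sketch of the args dictionary this behavior-cloning entry point expects. The key names are the ones read above; the values are placeholders, and the lr and clip_grad names inside optimization_params are taken from the BehaviorCloning call in Example #2.

# Sketch of a config for Example #5 (placeholder values only).
args = {
    'test_env_args': {'env_name': 'CartPole-v1', 'env_num': 4},
    'device_train': 'cpu',
    'actor_critic_nn_type': 'MLP',
    'actor_critic_nn_args': {'hidden_size': 64},
    'policy': 'Categorical',
    'policy_args': {},
    'train_agent_args': {
        'optimization_params': {'lr': 1e-3, 'clip_grad': 0.5},
        'additional_params': {},
    },
    'demo_file': 'demo/expert_episodes.pkl',
    'batch_size': 256,
    'log_dir': 'logs/bc_run/',
    'load_checkpoint': None,
    'training_args': {'n_epoch': 10, 'n_tests_per_epoch': 100},
}

main(args)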
Example #6
        config["algo"],
        config["network"],
        config["share_policy"],
        config["seed"],
    )

    config["meta"] = args.meta
    config["br"] = args.br

    if config["trainer"] == "meta":
        config["output_dir"] += "-{}-{}".format(config["meta"], config["br"])
    config["output_dir"] += "-{}".format(now.strftime("%Y%m%d%H:%M:%S"))
    config["save_path"] = os.path.join(config["output_dir"],
                                       config["save_path"])

    env, n_agents, env_info = init_env(config)

    print("output folder is: ", config["output_dir"])
    print("env info: ", env.observation_space, env.action_space, env.n_agents)
    with tf.device(device):
        trainer = TRAINERs[config["trainer"]](env, env.n_agents, config)
        trainer.train()

    # log elapsed training time

    later_time = datetime.now()
    difference = later_time - now
    print(later_time.strftime("%Y%m%d%H:%M:%S"))

    with open(config["output_dir"] + "/" + 'time.txt', 'a') as a_writer:
        a_writer.write('Total time in seconds: {}'.format(
            difference.total_seconds()))