Example #1
def train_agent(cmdl):
    global_time = time.perf_counter()

    env = utils.env_factory(cmdl, "training")
    eval_env = utils.env_factory(cmdl, "evaluation")

    name = cmdl.agent_type
    env_space = (env.action_space, env.observation_space)
    agent = get_agent(name)(env_space, cmdl)
    eval_env_space = (eval_env.action_space, eval_env.observation_space)
    eval_agent = get_agent("evaluation")(eval_env_space, cmdl)

    agent.display_setup(env, cmdl)

    ep_cnt = 1
    fps_time = time.perf_counter()

    s, r, done = env.reset(), 0, False

    for step_cnt in range(cmdl.training_steps):
        a = agent.evaluate_policy(s)
        _s, _a = s.clone(), a
        s, r, done, _ = env.step(a)
        agent.improve_policy(_s, _a, r, s, done)

        step_cnt += 1
        agent.gather_stats(r, done)

        # Do some reporting
        if step_cnt != 0 and step_cnt % cmdl.report_frequency == 0:
            agent.display_stats(fps_time)
            agent.display_model_stats()
            fps_time = time.perf_counter()
            gc.collect()

        # Start doing an evaluation
        eval_ready = step_cnt >= cmdl.eval_start
        if eval_ready and (step_cnt % cmdl.eval_frequency == 0):
            eval_time = time.perf_counter()
            evaluate_agent(step_cnt, eval_env, eval_agent, agent.policy, cmdl)
            gc.collect()
            fps_time = fps_time + (time.perf_counter() - eval_time)

        if done:
            ep_cnt += 1
            s, r, done = env.reset(), 0, False

    agent.display_final_report(ep_cnt, step_cnt, global_time)
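All of the listings on this page resolve an agent class through a get_agent factory before constructing it. As a point of reference only, a minimal registry-based factory could look like the sketch below; the names AGENTS, register_agent and RandomAgent are hypothetical and not taken from any of the projects shown here.

AGENTS = {}


def register_agent(name):
    """Class decorator that records an agent class under a string key."""
    def decorator(cls):
        AGENTS[name] = cls
        return cls
    return decorator


def get_agent(name):
    """Return the agent class registered under `name`."""
    try:
        return AGENTS[name]
    except KeyError:
        raise ValueError("Unknown agent type: {}".format(name))


@register_agent("random")
class RandomAgent:
    """Toy agent that only illustrates the call pattern get_agent(name)(env_space, cmdl)."""

    def __init__(self, env_space, cmdl):
        self.action_space, self.observation_space = env_space
        self.cmdl = cmdl

    def evaluate_policy(self, s):
        return self.action_space.sample()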
Example #2
def main(args):
    env_kwargs = argument_parser.prepare_env_kwargs(args)
    env = gym.make(args.environment_name, **env_kwargs)
    state = env.reset()
    agent = agents.get_agent(args.agent,
                             env_name=args.environment_name,
                             network_architecture=args.value_estimator,
                             init_state=state,
                             num_of_actions=env.action_space.n)
    if args.agent in ['dqn', 'a2c']:
        agent.load_weights(args.weights)
        out_path = pathlib2.Path('/'.join(args.weights.split('/')[:-1]))
        agent.eval()
    else:
        out_path = pathlib2.Path('out/{}'.format(args.agent))
        agent.set_action_space(env_kwargs['action_space'])

    evaluator = create_validation_engine(agent, env)

    path_plotter = path_plot.Plotter(args.environment_name, out_path)
    state_recorder = StateRecorder(args.environment_name, out_path)
    action_recorder = ActionRecorder(args.environment_name, out_path,
                                     env_kwargs['action_space'])
    evaluator.attach(state_recorder)
    evaluator.attach(action_recorder)
    evaluator.attach(path_plotter)
    evaluator.attach(LapTimeMeasure(out_path, args.environment_name))
    # evaluator.attach(ProgressBar(persist=False))

    engine_state = evaluator.run(data=StepGenerator(
        env,
        agent,
        max_steps=args.max_steps,
        break_if_collision=args.break_if_collision),
                                 max_epochs=1000)
Example #3
def main(args):
    session_id = get_new_session_id()
    logger = Logger(session_id)

    env_names = re.findall(r'[\(|,]([^\(|,|\)]+)', args.environment_name)
    envs = []
    for env_name in env_names:
        env = gym.make(
            env_name, **argparse.prepare_env_kwargs(args,
                                                    gazebo_multienv=True))
        args.port_gazebo = str(int(args.port_gazebo) + 1)
        args.port_ros = str(int(args.port_ros) + 1)
        envs.append(env)
    state = [env.reset() for env in envs][0]
    agent = agents.get_agent(
        args.agent,
        **prepare_agent_kwargs(args, state, logger, envs[0].action_space.n))
    if args.pretrained:
        print('load pretrained weights: ', args.pretrained)
        agent.load_weights(args.pretrained)
    agent.train()
    saver = NetSaver(args, session_id)

    trainer = create_reinforce_engine(agent, envs, args)

    # trainer.attach(ProgressBar(persist=False)) # Key error 'percentage' after a few k of epochs !?
    trainer.attach(saver)
    trainer.attach(logger)

    engine_state = trainer.run(data=MultienvStepGenerator(
        envs, agent, max_steps=args.max_steps),
                               max_epochs=args.epochs_count)
Example #4
def train_seq(init_model, get_optim, multitask, args):
    """ Train sequentially """
    from agents import get_agent
    wrappers = getattr(args.lifelong, "wrappers", list())
    agent_class = get_agent(args.lifelong.mode, base_wrappers=wrappers)
    agent = agent_class(init_model, get_optim, multitask, args)
    agent.train_sequentially()
Example #5
def train_agent(cmdl):
    step_cnt = 0
    ep_cnt = 0
    preprocess = Preprocessor(cmdl.env_class).transform

    env = utils.get_new_env(cmdl.env_name)
    agent = get_agent(cmdl.agent.name)(env.action_space, cmdl.agent)
    display_setup(env, cmdl)

    start_time = time.time()
    while step_cnt < cmdl.training.step_no:

        ep_cnt += 1
        o, r, done = env.reset(), 0, False
        s = preprocess(o)

        while not done:
            a = agent.evaluate_policy(s)
            o, r, done, _ = env.step(a)
            _s, _a = s, a
            s = preprocess(o)
            agent.improve_policy(_s, _a, r, s, done)

            step_cnt += 1
            agent.gather_stats(r, done)

        if ep_cnt % cmdl.report_freq == 0:
            agent.display_stats(start_time)
            agent.display_model_stats()

    end_time = time.time()
    display_stats(ep_cnt, step_cnt, end_time - start_time)
    """
Example #6
def main(_):
    spec = cluster_spec(config.num_workers, 1)
    cluster = tf.train.ClusterSpec(spec).as_cluster_def()

    signal.signal(signal.SIGHUP, shutdown)
    signal.signal(signal.SIGINT, shutdown)
    signal.signal(signal.SIGTERM, shutdown)

    if config.job_name == "worker":
        server = tf.train.Server(cluster,
                                 job_name="worker",
                                 task_index=config.task,
                                 config=tf.ConfigProto(
                                     intra_op_parallelism_threads=1,
                                     inter_op_parallelism_threads=2))

        env = create_env(config.env_id, client_id=str(config.task))
        model_fn = lambda: get_model(config)(env.observation_space.shape, env.
                                             action_space.n)
        agent = get_agent(config)(model_fn, env, config)
        trainer = Trainer(agent, env, server, config.task, config.log_dir)

        if config.is_train:
            trainer.train()
        else:
            trainer.test()
    else:
        server = tf.train.Server(
            cluster,
            job_name="ps",
            task_index=config.task,
            config=tf.ConfigProto(device_filters=["/job:ps"]))
        while True:
            time.sleep(1000)
Example #7
def main(args):
    session_id = get_new_session_id()
    logger = Logger(session_id)

    env = gym.make(args.environment_name, **argparse.prepare_env_kwargs(args))
    state = env.reset()
    agent = agents.get_agent(
        args.agent,
        **prepare_agent_kwargs(args, state, logger, env.action_space.n))
    if args.pretrained:
        print('load pretrained weights: ', args.pretrained)
        agent.load_weights(args.pretrained)
    agent.train()
    saver = NetSaver(args, session_id)

    trainer = create_reinforce_engine(agent, env, args)

    # trainer.attach(ProgressBar(persist=False)) # Key error 'percentage' after a few k of epochs !?
    trainer.attach(saver)
    trainer.attach(logger)

    engine_state = trainer.run(data=StepGenerator(env,
                                                  agent,
                                                  max_steps=args.max_steps),
                               max_epochs=args.epochs_count)
Example #8
def predict_for_malmo(shared_objects, cfg):
    predict_queue = shared_objects["predict_queue"]
    send_back_queues = shared_objects["send_back_queues"]

    N = len(send_back_queues)

    # -- Initialize agent and wrap it in a Binary18BatchAgentWrapper :)
    Agent = get_agent(cfg.agent.type)
    agent = Agent(cfg.agent.name, ENV_ACTIONS, cfg, shared_objects)

    print_info("{:s}<{:s}> learns from queue. |role={:d}".format(
        cfg.agent.name, cfg.agent.type, cfg.agent.role
    ))

    dtype = torch.LongTensor(0)
    if cfg.general.use_cuda:
        dtype = dtype.cuda()

    while True:

        # -- 1.

        tasks = []
        while True:
            try:
                t = predict_queue.get(True, 0.1)
                tasks.append(t)
            except Queue.Empty:
                if len(tasks) > 1:
                    break

        # print("Let's predict for {:d} losers.".format(len(tasks)))

        # -- 4.
        # 4.a. Create batch from transitions
        # !! This is incomplete

        ids = torch.LongTensor([_id for (_id, _, _, _) in tasks])
        state = torch.cat([torch.LongTensor(s) for (_, s, _, _) in tasks],
                          0).float()
        done = torch.cat([torch.LongTensor(d) for (_, _, d, _) in tasks], 0)
        tokens = torch.LongTensor([token for (_, _, _, token) in tasks])

        if cfg.general.use_cuda:
            ids = ids.cuda()
            state = state.cuda()
            done = done.cuda()

        actions = agent.batch_predict(ids, state, done)
        # print(actions.unsqueeze(0))
        for _id, action, token in zip(ids, actions.tolist(), tokens):
            send_back_queues[_id].send((action, token))
            # print("Sent to {:d} action {:d}".format(_id, action))
        # print("Done! Waiting again...")
        tasks.clear()
Example #9
def main(op_check):
    agent = get_agent(S.agent.name)

    # Load Pretrained
    if S.load_weights_dir:
        agent.load_snapshot(S.load_weights_dir)

    if op_check:
        agent.operation_check()
    else:
        agent.train()
Example #10
def train_agent(cmdl):
    step_cnt = 0
    ep_cnt = 0
    start_time = time.time()

    env = utils.get_new_env(cmdl.env_name, cmdl)
    eval_env = EvaluationMonitor(gym.make(cmdl.env_name), cmdl)

    name = cmdl.agent.name
    agent = get_agent(name)(env.action_space, cmdl.agent)
    eval_agent = get_agent(name)(eval_env.action_space, cmdl.agent, False)

    preprocess = Preprocessor(cmdl.env_class).transform
    agent.display_setup(env, cmdl)

    while step_cnt < cmdl.training.step_no:

        ep_cnt += 1
        o, r, done = env.reset(), 0, False
        s = preprocess(o)

        while not done:
            a = agent.evaluate_policy(s)
            o, r, done, _ = env.step(a)
            _s, _a = s, a
            s = preprocess(o)
            agent.improve_policy(_s, _a, r, s, done)

            step_cnt += 1
            agent.gather_stats(r, done)

        if step_cnt % cmdl.report_freq == 0:
            agent.display_stats(start_time)
            agent.display_model_stats()
            gc.collect()

        if step_cnt % cmdl.eval_freq == 0:
            evaluate_agent(step_cnt, eval_env, eval_agent, agent.policy, cmdl)

    end_time = time.time()
    agent.display_final_report(ep_cnt, step_cnt, end_time - start_time)
Example #11
File: utils.py  Project: 2dotstwice/dlcli
def search_metadata(url='', key='', org='', account='', metadata='', **kwargs):
    agent_names = []
    org_list = orgs.get_orgs(url=url, org=org, account=account, key=key)
    for o in org_list:
        account_list = accounts.get_accounts(url=url, org=o['name'], key=key)
        for acc in account_list:
            agent_list = agents.get_agents(url=url, org=org, account=acc['name'], key=key)
            for summary in agent_list:
                agent_names.append(summary['name'])
            for agent in agent_names:
                try:
                    search_hash = flatten(agents.get_agent(url=url, org=org, account=acc['name'], key=key, agent_name=agent))
                    if metadata in search_hash.keys() or metadata in search_hash.values():
                        click.echo('Organization: ' + o['name'] + ' Account: ' + acc['name'] + ' Agent: ' + agent)
                except:
                    continue
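Both metadata-search examples (this one and Example #13 below) depend on a flatten helper that is not shown: it collapses the nested agent record returned by agents.get_agent into a single-level dict so the metadata string can be matched against keys() and values(). A minimal sketch under that assumption:

def flatten(d, parent_key='', sep='.'):
    # Hypothetical helper: recursively collapse nested dicts into a flat
    # {'a.b.c': value} mapping so membership tests see every field.
    items = {}
    for k, v in d.items():
        key = parent_key + sep + k if parent_key else k
        if isinstance(v, dict):
            items.update(flatten(v, key, sep=sep))
        else:
            items[key] = v
    return items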
Example #12
    def action():
        ckeck_training_deamon()

        agent = agents.get_agent(request.client_id)
        if agent.replay_memory_lock.acquire():
            agent.replay_memory.add_trajectories_base64(request.trajectories)
            agent.replay_memory_lock.release()

        fresh_model = agent.get_fresh_model()
        if fresh_model is not None:
            agent.update_runtime_parameters()
            return {
                "model": fresh_model,
                "runtime_parameters": agent.runtime_parameters.__dict__
            }
        else:
            return {}
Example #13
def search_metadata(url='', key='', org='', account='', metadata='', **kwargs):
    agent_names = []
    org_list = orgs.get_orgs(url=url, org=org, account=account, key=key)
    for o in org_list:
        account_list = accounts.get_accounts(url=url, org=o['name'], key=key)
        for acc in account_list:
            agent_list = agents.get_agents(url=url, org=org, account=acc['name'], key=key)
            for summary in agent_list:
                agent_names.append(summary['name'])
            for agent in agent_names:
                try:
                    search_hash = flatten(agents.get_agent(url=url, org=org, account=acc['name'],
                                                           key=key, agent_name=agent))
                    if metadata in search_hash.keys() or metadata in search_hash.values():
                        click.echo('Organization: %s Account: %s Agent: %s' % (o['name'], acc['name'], agent))
                except:
                    continue
Example #14
def train_agent(shared_objects, cfg):

    env = PigChaseEnvironment(
        parse_clients_args(cfg.envs.minecraft.ports),
        PigChaseTopDownStateBuilder(),
        role=cfg.agent.role,
        randomize_positions=cfg.envs.minecraft.randomize_positions)
    agent = get_agent(cfg.agent.type)(cfg.agent.name, ENV_ACTIONS)

    print(
        clr(
            "[ %s ] type=%s, role=%d. Agent started." %
            (cfg.agent.name, cfg.agent.type, cfg.agent.role), 'cyan'))

    obs = env.reset()
    reward = 0
    is_terminal = False
    viz_rewards = []
    ep_cnt = 0

    start_time = time.time()

    print("No of epochs: %d. Max no of steps/epoch: %d" %
          (cfg.training.episodes_no, cfg.training.max_step_no))

    training_steps = cfg.training.episodes_no * cfg.training.max_step_no
    for step in range(1, training_steps + 1):
        # check if env needs reset
        if env.done:
            obs = env.reset()
            ep_cnt += 1
            if ep_cnt % cfg.general.report_freq == 0:
                print("[DQN] Ep: %d | Rw: %d" %
                      (ep_cnt, sum(viz_rewards) / cfg.general.report_freq))
                viz_rewards.clear()

        # select an action
        action = agent.act(obs, reward, is_terminal, is_training=True)

        # take a step
        obs, reward, is_terminal = env.do(action)
        viz_rewards.append(reward)

    elapsed_time = time.time() - start_time
    print("Finished in %.2f seconds at %.2ffps." %
          (elapsed_time, training_steps / elapsed_time))
Example #15
def run_once(args):

    cfg, run_id, path = args

    sim_path = path + "/" + cfg.simulator.save_folder
    if not os.path.exists(sim_path):
        os.makedirs(sim_path)

    simulator = Simulator(cfg, sim_path, log)
    simulator.start()

    # -- Set seed
    cfg.general.seed = utils.set_seed(cfg.general.seed)

    # -- Load simulator
    # TODO 2 start server with config
    # TODO 2 Save simulator config in path ( see line 41 with save_config(

    # -- Resume agent and metrics if checkpoints are available
    resume_path = path + "/" + cfg.checkpoint
    if resume_path:
        log.info("Resuming training ...")
        cfg.agent.resume = resume_path
    logging.info('listening to server %s:%s', cfg.simulator.host,
                 cfg.simulator.port)

    # -- Get agent
    agent = get_agent(cfg.agent)
    agent.set_simulator(cfg)

    os.chdir(sim_path)

    benchmark_agent = DemoBenchmark(cfg.simulator.town)

    # -- Init finished
    #save_config(os.path.join(cfg.general.common.save_path, "ran_cfg"), cfg)

    # Now actually run the driving_benchmark
    #import pdb; pdb.set_trace()
    run_driving_benchmark(agent, benchmark_agent, cfg.simulator.town,
                          cfg.simulator.carla_log_name,
                          cfg.simulator.continue_experiment,
                          cfg.simulator.host, cfg.simulator.port)

    simulator.kill_process()
Example #16
def run_once(args):
    cfg, run_id, path = args

    # -- Set seed
    cfg.general.seed = utils.set_seed(cfg.general.seed)

    # -- Get data loaders
    data_loader = get_data_loader(cfg.data_loader)

    train_data = data_loader.get_train_loader()
    test_data = data_loader.get_test_loader()

    # -- Resume agent and metrics if checkpoints are available
    # TODO Resume
    if cfg.checkpoint != "":
        resume_path = path + "/" + cfg.checkpoint
        log.info("Resuming training ...")
        cfg.agent.resume = resume_path

    # -- Get agent
    agent = get_agent(cfg.agent)

    # -- Should have some kind of reporting agent
    # TODO Implement reporting agent

    # -- Init finished
    save_config(os.path.join(cfg.general.common.save_path, "ran_cfg"), cfg)

    eval_freq = cfg.train.eval_freq
    no_epochs = cfg.train.no_epochs - agent.get_train_epoch()

    for epoch in range(no_epochs):
        log.info("Train epoch: {}".format(epoch))
        agent.train(train_data)
        if epoch % eval_freq == 0:
            agent.test(test_data)
        print("Finished an epoch :D")

    with open(path + "/loss_values_train", "wb") as f:
        pickle.dump(agent.loss_values_train, f)

    with open(path + "/loss_values_test", "wb") as f:
        pickle.dump(agent.loss_values_test, f)

    agent.eval_agent()
Example #17
def run_once(args):
    cfg, run_id, path = args

    # -- Set seed
    cfg.general.seed = utils.set_seed(cfg.general.seed)

    # -- Resume agent and metrics if checkpoints are available
    # TODO Resume
    resume_path = path + "/" + cfg.checkpoint
    if resume_path:
        log.info("Resuming training ...")
        cfg.agent.resume = resume_path

    # -- Get agent
    agent = get_agent(cfg.agent)

    # -- Should have some kind of reporting agent
    # TODO Implement reporting agent

    # -- Init finished
    save_config(os.path.join(cfg.general.common.save_path, "ran_cfg"), cfg)

    agent.eval_agent()
Example #18
def main():

    print_info("Starting...")

    # -- Read configuration
    config = read_config()

    # -- Configure Torch
    if config.general.seed != 0:
        torch.manual_seed(config.general.seed)
        if config.general.use_cuda:
            torch.cuda.manual_seed_all(config.general.seed)

    # Configure model
    shared_model = get_model(config.model.name)
    if config.general.use_cuda:
        shared_model.cuda()
    shared_model.share_memory()

    # Get agent
    agent = get_agent(config.agent.type)(config.agent.name, ENV_ACTIONS,
                                         shared_model, config)

    # Shared statistics
    shared_stats = AtomicStatistics()

    # Shared objects
    shared_objects = {"agent": agent, "stats_leRMS": shared_stats}

    start_time = time.time()

    #Train Agent
    train_agent_simulated(shared_objects, config)

    total_time = time.time() - start_time
    print_info("Everything done in {:.2f}!".format(total_time))
Example #19
def run_once(args):
    cfg, run_id, path = args

    # -- Set seed
    cfg.general.seed = utils.set_seed(cfg.general.seed)

    # -- Resume agent and metrics if checkpoints are available
    # TODO Resume
    resume_path = path + "/" + cfg.checkpoint
    if resume_path:
        log.info("Network_activation ...")
        cfg.agent.resume = resume_path

    # -- Get agent
    agent = get_agent(cfg.agent)

    if cfg.eval_model is False:
        log.info("Not in eval mode")
        return

    if cfg.image_number != -1:
        eval_network(agent, cfg)
    else:
        pass
Example #20
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--agent_name',
        '-a',
        dest='agent_name',
        action='store',
        required=True,
        help='Name of agent to be used to retrieve config and agent object.')
    parser.add_argument('--shuffle_times',
                        '-s',
                        dest='shuffle_times',
                        type=int,
                        action='store',
                        required=True,
                        help='Times to shuffle the dataset.')
    parser.add_argument('--reward_func',
                        '-r',
                        dest='reward_func',
                        default='default',
                        action='store',
                        help='Reward function.')
    parser.add_argument('--output_name',
                        '-o',
                        dest='output_name',
                        default='',
                        action='store',
                        help='prefix of output score files.')

    args = parser.parse_args()
    config = get_config(args.agent_name)
    dataset = WarfarinDataSet(config)
    regrets = np.zeros((args.shuffle_times, dataset.size()))
    precision = np.zeros((args.shuffle_times, dataset.size()))
    reward_func = get_reward_func(args.reward_func)

    for i in range(args.shuffle_times):
        agent = get_agent(args.agent_name, config, dataset)
        dataset.shuffle()
        regret = 0
        corrects = 0
        for ts, data in tqdm(enumerate(dataset)):
            features = data['features']
            label = data['label']
            action, context = agent.act(features)
            reward = reward_func(label, action)
            agent.feedback(reward, context)

            # Calculate eval metrics
            regret -= reward
            regrets[i][ts] = regret

            if is_correct_action(label, action):
                corrects += 1
            precision[i][ts] = corrects / (ts + 1)
        print('{} final regret: {} final average precision: {}'.format(
            i, regret, precision[i][-1]))

    if args.output_name:
        output_name = args.output_name
    else:
        output_name = args.agent_name
    avg_regrets = np.average(regrets, axis=0)
    avg_precision = np.average(precision, axis=0)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(range(dataset.size()), avg_regrets, 'b')
    fig.savefig("data/scores/{}-regret.png".format(output_name))
    print(np.std(regrets, axis=0))
    with open("data/scores/{}-regret-values.txt".format(output_name),
              mode='w') as f:
        f.write(';'.join(map(lambda x: ','.join(map(str, x)), regrets)))

    fig2 = plt.figure()
    ax2 = fig2.add_subplot(1, 1, 1)
    ax2.plot(range(dataset.size()), avg_precision, 'b')
    fig2.savefig("data/scores/{}-precision.png".format(output_name))
    with open("data/scores/{}-precision-values.txt".format(output_name),
              mode='w') as f:
        f.write(';'.join(map(lambda x: ','.join(map(str, x)), precision)))
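Example #20 assumes two helpers that are not shown, get_reward_func and is_correct_action. A minimal sketch, assuming a 0/-1 reward for hitting or missing the labelled dose bucket (an assumption, not this project's actual code):

def is_correct_action(label, action):
    # Hypothetical: an action counts as correct when it matches the label.
    return action == label


def default_reward(label, action):
    # Hypothetical 0/-1 scheme: with `regret -= reward` in the loop above,
    # cumulative regret simply counts the number of mistakes.
    return 0 if action == label else -1


def get_reward_func(name):
    # Only the 'default' scheme is sketched here.
    return {'default': default_reward}[name]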
Example #21
def run(full_args: Namespace) -> None:
    # import torch.multiprocessing as mp
    # mp.set_start_method('spawn')

    args = full_args.main
    agent_args = full_args.agent
    model_args = full_args.model
    env_args = full_args.env_cfg
    extra_logs = getattr(full_args, "extra_logs", None)

    if args.seed == 0:
        args.seed = full_args.run_id + 1
    max_eprews = args.max_eprews

    post_process_args(agent_args)
    post_process_args(model_args)

    model_dir = getattr(args, "model_dir", full_args.out_dir)
    print(model_dir)

    # ==============================================================================================
    # @ torc_rl repo original

    # Define logger, CSV writer and Tensorboard writer

    logger = utils.get_logger(model_dir)
    csv_file, csv_writer = utils.get_csv_writer(model_dir)
    tb_writer = None
    if args.tb:
        from tensorboardX import SummaryWriter
        tb_writer = SummaryWriter(model_dir)

    # Log command and all script arguments

    logger.info("{}\n".format(" ".join(sys.argv)))
    logger.info("{}\n".format(args))

    # ==============================================================================================
    # Set seed for all randomness sources
    utils.seed(args.seed)

    # ==============================================================================================
    # Generate environments

    envs = []

    # Get environment wrapper
    wrapper_method = getattr(full_args.env_cfg, "wrapper", None)
    if wrapper_method is None:

        def idem(x):
            return x

        env_wrapper = idem
    else:
        env_wrappers = [getattr(environment, w_p) for w_p in wrapper_method]

        def env_wrapp(w_env):
            for wrapper in env_wrappers[::-1]:
                w_env = wrapper(w_env)
            return w_env

        env_wrapper = env_wrapp

    actual_procs = getattr(args, "actual_procs", None)
    master_make_envs = getattr(full_args.env_cfg, "master_make_envs", False)

    if actual_procs:
        # Split envs in chunks
        no_envs = args.procs
        envs, chunk_size = get_envs(full_args,
                                    env_wrapper,
                                    no_envs,
                                    master_make=master_make_envs)
        first_env = envs[0][0]
        print(
            f"NO of envs / proc: {chunk_size}; No of processes {len(envs[1:])} + Master"
        )
    else:
        for i in range(args.procs):
            env = env_wrapper(gym.make(args.env))
            env.max_steps = full_args.env_cfg.max_episode_steps
            env.no_stacked_frames = full_args.env_cfg.no_stacked_frames

            env.seed(args.seed + 10000 * i)
            envs.append(env)
        first_env = envs[0]

    # Generate evaluation envs
    eval_envs = []
    if full_args.env_cfg.no_eval_envs > 0:
        no_envs = full_args.env_cfg.no_eval_envs
        eval_envs, chunk_size = get_envs(full_args,
                                         env_wrapper,
                                         no_envs,
                                         master_make=master_make_envs)

    # Define obss preprocessor
    max_image_value = full_args.env_cfg.max_image_value
    normalize_img = full_args.env_cfg.normalize
    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        args.env,
        first_env.observation_space,
        model_dir,
        max_image_value=max_image_value,
        normalize=normalize_img)

    # ==============================================================================================
    # Load training status
    try:
        status = utils.load_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}

    saver = utils.SaveData(model_dir,
                           save_best=args.save_best,
                           save_all=args.save_all)
    model, agent_data, other_data = None, dict(), None
    try:
        # Continue from last point
        model, agent_data, other_data = saver.load_training_data(best=False)
        logger.info("Training data exists & loaded successfully\n")
    except OSError:
        logger.info("Could not load training data\n")

    # ==============================================================================================
    # Load Model

    if model is None:
        model = get_model(model_args,
                          obs_space,
                          first_env.action_space,
                          use_memory=model_args.use_memory,
                          no_stacked_frames=env_args.no_stacked_frames)
        logger.info(f"Model [{model_args.name}] successfully created\n")

        # Print Model info
        logger.info("{}\n".format(model))

    if torch.cuda.is_available():
        model.cuda()
    logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

    # ==============================================================================================
    # Load Agent

    algo = get_agent(full_args.agent,
                     envs,
                     model,
                     agent_data,
                     preprocess_obss=preprocess_obss,
                     reshape_reward=None,
                     eval_envs=eval_envs)

    has_evaluator = hasattr(algo,
                            "evaluate") and full_args.env_cfg.no_eval_envs > 0

    # ==============================================================================================
    # Train model

    crt_eprew = 0
    if "eprew" in other_data:
        crt_eprew = other_data["eprew"]
    num_frames = status["num_frames"]
    total_start_time = time.time()
    update = status["update"]
    update_start_time = time.time()

    while num_frames < args.frames:
        # Update model parameters

        logs = algo.update_parameters()

        num_frames += logs["num_frames"]
        update += 1

        if has_evaluator:
            if update % args.eval_interval == 0:
                algo.evaluate()

        prev_start_time = update_start_time
        update_start_time = time.time()

        # Print logs
        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_start_time - prev_start_time)
            duration = int(time.time() - total_start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += [
                "num_frames_" + key for key in num_frames_per_episode.keys()
            ]
            data += num_frames_per_episode.values()
            header += ["entropy", "value", "policy_loss", "value_loss"]
            data += [
                logs["entropy"], logs["value"], logs["policy_loss"],
                logs["value_loss"]
            ]
            header += ["grad_norm"]
            data += [logs["grad_norm"]]

            # add log fields that are not in the standard log format (for example value_int)
            extra_fields = extra_log_fields(header, list(logs.keys()))
            header.extend(extra_fields)
            data += [logs[field] for field in extra_fields]

            # print to stdout the standard log fields + fields required in config
            keys_format, printable_data = print_keys(header, data, extra_logs)
            logger.info(keys_format.format(*printable_data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            if status["num_frames"] == 0:
                csv_writer.writerow(header)
            csv_writer.writerow(data)
            csv_file.flush()

            if args.tb:
                for field, value in zip(header, data):
                    tb_writer.add_scalar(field, value, num_frames)

            status = {"num_frames": num_frames, "update": update}

            crt_eprew = list(rreturn_per_episode.values())[0]

        # -- Save vocabulary and model

        if args.save_interval > 0 and update % args.save_interval == 0:
            # preprocess_obss.vocab.save()

            saver.save_training_data(model, algo.get_save_data(), crt_eprew)

            logger.info("Model successfully saved")

            utils.save_status(status, model_dir)

        if crt_eprew > max_eprews != 0:
            print("Reached max return 0.93")
            exit()
Example #22
def run(mazeType, ai, trials, width, height, showplan):
    # setup some data structure for reporting results
    results = []

    # for each trial
    for trial in range(trials):
        result = {}

        # create a list of agents, will only be one if `all` is not selected
        agents = []
        if ai == ALL:
            for agent in ALL_AGENTS:
                agents.append((get_agent(agent), agent))
        else:
            agents.append((get_agent(ai), ai))

        # create a list of mazes, will only be one if `all` is not selected
        mazes = []
        if mazeType == ALL:
            for maze in ALL_MAZES:
                mazes.append((get_maze(maze, width, height), maze))
        else:
            mazes.append((get_maze(mazeType, width, height), mazeType))

        # for each maze
        for mazeTuple in mazes:
            maze = mazeTuple[0]
            if showplan:
                print_maze(maze, width, height)
            mazeName = mazeTuple[1]

            result[mazeName] = {}

            # for each agent
            for agentTuple in agents:
                agentName = agentTuple[1]
                agent = agentTuple[0]

                # create a problem object for this maze, corner to corner

                problem = Problem((1, 1), (width - 2, height - 2), maze, width,
                                  height)

                result[mazeName][agentName] = {}

                time_zero = time.time()
                # get a plan from this agent
                plan = agent.getPlan(problem)

                # record results of this agent-maze pair into the reporting data structure
                result[mazeName][agentName]['time'] = time.time() - time_zero
                result[mazeName][agentName]['length'] = len(plan)
                result[mazeName][agentName]['nodes'] = problem.nodes_explored

                if showplan:
                    print(plan)

        results.append(result)

    # print tabulated results
    by_maze = {}
    by_ai = {}
    time_table = {}
    length_table = {}
    nodes_table = {}

    for result in results:
        for mazeName in result:
            if mazeName not in by_maze:
                by_maze[mazeName] = {"times": [], "lengths": [], "nodes": []}
            if mazeName not in time_table:
                time_table[mazeName] = {}
            if mazeName not in length_table:
                length_table[mazeName] = {}
            if mazeName not in nodes_table:
                nodes_table[mazeName] = {}

            for agentName in result[mazeName]:
                if agentName not in by_ai:
                    by_ai[agentName] = {
                        "times": [],
                        "lengths": [],
                        "nodes": []
                    }
                if agentName not in time_table[mazeName]:
                    time_table[mazeName][agentName] = []
                if agentName not in length_table[mazeName]:
                    length_table[mazeName][agentName] = []
                if agentName not in nodes_table[mazeName]:
                    nodes_table[mazeName][agentName] = []

                by_maze[mazeName]['times'].append(
                    result[mazeName][agentName]['time'])
                by_maze[mazeName]['lengths'].append(
                    result[mazeName][agentName]['length'])
                by_maze[mazeName]['nodes'].append(
                    result[mazeName][agentName]['nodes'])

                by_ai[agentName]['times'].append(
                    result[mazeName][agentName]['time'])
                by_ai[agentName]['lengths'].append(
                    result[mazeName][agentName]['length'])
                by_ai[agentName]['nodes'].append(
                    result[mazeName][agentName]['nodes'])

                time_table[mazeName][agentName].append(
                    result[mazeName][agentName]['time'])
                length_table[mazeName][agentName].append(
                    result[mazeName][agentName]['length'])
                nodes_table[mazeName][agentName].append(
                    result[mazeName][agentName]['nodes'])

    maze_list = [mazeName for mazeName in by_maze]

    print('\nMean Results by Maze:\n')
    print("\t\t\tTime\tLength\tNodes")
    for mazeName in by_maze:
        print('{maze: <16}'.format(maze=mazeName) + '\t' +
              str(mean(by_maze[mazeName]['times'])) + '\t' +
              str(mean(by_maze[mazeName]['lengths'])) + '\t' +
              str(mean(by_maze[mazeName]['nodes'])))

    print('\nMean Results by Agent:\n')
    print("\t\t\tTime\tLength\tNodes")
    for agentName in by_ai:
        print('{agent: <16}'.format(agent=agentName) + '\t' +
              str(mean(by_ai[agentName]['times'])) + '\t' +
              str(mean(by_ai[agentName]['lengths'])) + '\t' +
              str(mean(by_ai[agentName]['nodes'])))

    print('\nMean Times by Agent Maze combinations:\n')
    print('{a:<16}'.format(a='') +
          '\t'.join(map(lambda x: '{item: <16}'.format(item=x), maze_list)))
    for agentName in by_ai:
        print('{agent: <16}'.format(agent=agentName) + '\t'.join(
            map(lambda x: '{a:<16}'.format(a=mean(time_table[x][agentName])),
                maze_list)))

    print('\nMean Length by Agent Maze combinations:\n')
    print('{a:<16}'.format(a='') +
          '\t'.join(map(lambda x: '{item: <16}'.format(item=x), maze_list)))
    for agentName in by_ai:
        print('{agent: <16}'.format(agent=agentName) + '\t'.join(
            map(lambda x: '{a:<16}'.format(a=mean(length_table[x][agentName])),
                maze_list)))

    print('\nMean Nodes Explored by Agent Maze combinations:\n')
    print('{a:<16}'.format(a='') +
          '\t'.join(map(lambda x: '{item: <16}'.format(item=x), maze_list)))
    for agentName in by_ai:
        print('{agent: <16}'.format(agent=agentName) + '\t'.join(
            map(lambda x: '{a:<16}'.format(a=mean(nodes_table[x][agentName])),
                maze_list)))
Example #23
def train_from_malmo(shared_objects, cfg):
    batch_size = cfg.general.batch_size
    queue = shared_objects["queue"]
    session = shared_objects["session"]
    reset = shared_objects["reset"]

    # -- Initialize agent and wrap it in a Binary18BatchAgentWrapper :)
    Agent = get_agent(cfg.agent.type)
    agent = Agent(cfg.agent.name, ENV_ACTIONS, cfg, shared_objects)

    print_info("{:s}<{:s}> learns from queue. |role={:d}".format(
        cfg.agent.name, cfg.agent.type, cfg.agent.role))

    dtype = torch.LongTensor(0)
    if cfg.general.use_cuda:
        dtype = dtype.cuda()

    episodes_no = cfg.training.episodes_no
    best_r_ep = None
    best_r_frame = None

    frame_rewards = []
    episode_rewards = []

    for episode in range(1, episodes_no + 1):
        # 1. Checks the queue. If less than 32.. check again, else goto 2.
        # 2. Inform others to drop future experiences and wait for new params.
        # 3. Collect transitions
        # 4. Train agent and update parameters
        # 5. Drop any shit from queue
        # 6. Inform others that they should take the new params. Go to 1.

        # -- 1.
        while queue.qsize() < batch_size:
            time.sleep(.1)

        # -- 2.
        reset.value = 1

        # -- 3.

        transitions = []
        while len(transitions) < batch_size:
            try:
                t = queue.get()
                transitions.append(t)
            except Queue.Empty:
                print("futere")
                break
        while not queue.empty():
            try:
                t = queue.get()
                transitions.append(t)
            except Queue.Empty:
                print("futere")
                break

        # -- 4.
        # 4.a. Create batch from transitions
        # !! This is incomplete

        print(transitions)
        (s, r, d, a) = transitions[0][0]
        (s, r, d, a) = torch.LongTensor(s), torch.FloatTensor(
            r), torch.LongTensor(d), torch.LongTensor(a)
        _s = s.new().resize_(torch.Size([0]) + s.size()[1:])
        _a = a.new().resize_(torch.Size([0]) + a.size()[1:])
        _r = r.new().resize_(torch.Size([0]) + r.size()[1:])
        _d = d.new().resize_(torch.Size([0]) + d.size()[1:])

        # -- Apply padding on short games
        n = len(transitions)
        max_len = max([len(game) for game in transitions])
        avg_len = np.mean([len(game) for game in transitions])

        fake = [(_s, _r, _d, _a)]
        transitions = [t + fake * (max_len - len(t)) for t in transitions]
        transitions = list(map(list, zip(*transitions)))

        all_r = .0
        total_r = .0

        a = time.time()
        rewards_no = 0
        for step, all_t in enumerate(transitions):
            states = torch.cat(
                list(map(lambda t: torch.LongTensor(t[0]), all_t)), 0)
            rewards = torch.cat(
                list(map(lambda t: torch.FloatTensor(t[1]), all_t)), 0)
            done = torch.cat(
                list(map(lambda t: torch.LongTensor(t[2]), all_t)), 0)
            actions = torch.cat(
                list(map(lambda t: torch.LongTensor(t[3]), all_t)), 0)

            _alive_no = states.size(0)
            print("Alive: {:d}, but {:d} are dead!".format(
                _alive_no,
                done.nonzero().nelement()))

            assert actions.size(0) == _alive_no
            assert rewards.size(0) == _alive_no
            assert done.size(0) == _alive_no

            if cfg.general.use_cuda:
                states = states.cuda()
                rewards = rewards.cuda()
                done = done.cuda()
                actions = actions.cuda()

            # print("---------Step {} ==========".format(step))
            # print("Some transition:")
            # one_hot = states
            # print("rewards: :", rewards)
            # print("done: ", done)
            # print("action: ", actions)
            # print(
            #     one_hot[0, 4] + one_hot[0, 5] * 2 + one_hot[0, 6] * 3 +
            #     one_hot[
            #         0, 7] * 4
            #     + one_hot[0, 8] * 7
            #     + one_hot[0, 13] * 11)

            agent.act(states, rewards, done, True, actions=actions)
            all_r += rewards.sum()
            rewards_no += _alive_no
            total_r += rewards.sum()

        b = time.time()

        agent.reset()

        # -- 5.

        session.value = session.value + 1

        while not queue.empty():
            try:
                queue.get_nowait()
            except Queue.Empty:
                break

        print_info("Go again!")
        reset.value = 0

        all_r /= rewards_no
        total_r /= n

        do_save = False

        if best_r_frame is None or best_r_frame < all_r:
            do_save = True
            best_r_frame = all_r
            r_str = clr("{:.6f}".format(best_r_frame), "white", "on_magenta")
            # save something
            # save_model(self, best_r_frame, episode, save_only_min=False)
            # agent.save_model()
        else:
            r_str = clr("{:.6f}".format(all_r), "magenta")

        if best_r_ep is None or best_r_ep < total_r:
            do_save = True
            best_r_ep = total_r
            r2_str = clr("{:.6f}".format(best_r_ep), "white", "on_magenta")
            # save something
            # agent.save_model()

        else:
            r2_str = clr("{:.6f}".format(total_r), "magenta")

        print_info("Episode: " + clr("{:d}".format(episode), "blue") +
                   clr(" | ", "yellow") + "Rewards per episode: " + r2_str +
                   clr(" | ", "yellow") + "Rewards per frame: " + r_str +
                   clr(" | ", "yellow") + "Batch size: " +
                   clr("{:d}".format(n), "blue") + clr(" | ", "yellow") +
                   "Avg length: " + clr("{:.2f}".format(avg_len), "blue") +
                   clr(" | ", "yellow") + "Back time: " +
                   clr("{:.2f}".format(b - a), "blue"))

        if do_save:
            agent.model_utils.save_model(all_r,
                                         total_r,
                                         episode,
                                         save_only_min=False)

        frame_rewards.append(all_r)
        episode_rewards.append(total_r)

        print("-----------------")
        print("Last ten:")
        print("Last ten step rewards: ", frame_rewards[-10:])
        print("Last ten epis rewards: ", episode_rewards[-10:])
        print("-----------------")

        torch.save(
            torch.stack([
                torch.FloatTensor(frame_rewards),
                torch.FloatTensor(episode_rewards)
            ]), "results/rewards.torch")
Example #24
def run(full_args: Namespace, return_models: bool = False):
    if sys.argv[0].startswith("train"):
        import os
        full_args.out_dir = os.path.dirname(sys.argv[1])

    args = full_args.main
    agent_args = full_args.agent
    model_args = full_args.model
    extra_logs = getattr(full_args, "extra_logs", None)
    main_r_key = getattr(full_args, "main_r_key", None)

    if args.seed == 0:
        args.seed = full_args.run_id + 1
    max_eprews = args.max_eprews
    max_eprews_window = getattr(args, "max_eprews_window", 1)

    post_process_args(agent_args)
    post_process_args(model_args)

    model_dir = getattr(args, "model_dir", full_args.out_dir)
    print(model_dir)

    # ==============================================================================================
    # @ torc_rl repo original

    # Define logger, CSV writer and Tensorboard writer

    logger = utils.get_logger(model_dir)
    csv_file, csv_writer = utils.get_csv_writer(model_dir)
    tb_writer = None
    if args.tb:
        from tensorboardX import SummaryWriter
        tb_writer = SummaryWriter(model_dir)

    # Log command and all script arguments

    logger.info("{}\n".format(" ".join(sys.argv)))
    logger.info("{}\n".format(args))

    # ==============================================================================================
    # Set seed for all randomness sources
    utils.seed(args.seed)

    # ==============================================================================================
    # Generate environments

    envs = []

    # Get env wrappers - must be a list of elements
    wrapper_method = getattr(full_args.env_cfg, "wrapper", None)
    if wrapper_method is None:

        def idem(x):
            return x

        env_wrapper = idem
    else:
        env_wrappers = [getattr(gym_wrappers, w_p) for w_p in wrapper_method]

        def env_wrapp(w_env):
            for wrapper in env_wrappers[::-1]:
                w_env = wrapper(w_env)
            return w_env

        env_wrapper = env_wrapp

    actual_procs = getattr(args, "actual_procs", None)
    no_actions = getattr(full_args.env_cfg, "no_actions", 6)

    if actual_procs:
        # Split envs in chunks
        no_envs = args.procs
        envs, chunk_size = get_envs(full_args,
                                    env_wrapper,
                                    no_envs,
                                    n_actions=no_actions)
        first_env = envs[0][0]
        print(
            f"NO of envs / proc: {chunk_size}; No of processes {len(envs[1:])} + Master"
        )
    else:
        for i in range(args.procs):
            env = env_wrapper(gym.make(args.env))
            env.max_steps = full_args.env_cfg.max_episode_steps

            env.seed(args.seed + 10000 * i)
            envs.append(env)
        first_env = envs[0]

    # Generate evaluation envs
    eval_envs = []
    eval_episodes = getattr(full_args.env_cfg, "eval_episodes", 0)
    if full_args.env_cfg.no_eval_envs > 0:
        no_envs = full_args.env_cfg.no_eval_envs
        eval_envs, chunk_size = get_envs(full_args,
                                         env_wrapper,
                                         no_envs,
                                         n_actions=no_actions)

    # Define obss preprocessor
    max_image_value = full_args.env_cfg.max_image_value
    normalize_img = full_args.env_cfg.normalize
    permute = getattr(full_args.env_cfg, "permute", False)
    obss_preprocessor = getattr(full_args.env_cfg, "obss_preprocessor", None)
    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        args.env,
        first_env.observation_space,
        model_dir,
        max_image_value=max_image_value,
        normalize=normalize_img,
        permute=permute,
        type=obss_preprocessor)

    first_obs = first_env.reset()
    if "state" in first_obs:
        full_state_size = first_obs["state"].shape

        # Add full size shape
        add_to_cfg(full_args, MAIN_CFG_ARGS, "full_state_size",
                   full_state_size)

    if "position" in first_obs:
        position_size = first_obs["position"].shape

        # Add full size shape
        add_to_cfg(full_args, MAIN_CFG_ARGS, "position_size", position_size)

    # Add the width and height of environment for position estimation
    model_args.width = first_env.unwrapped.width
    model_args.height = first_env.unwrapped.height

    # ==============================================================================================
    # Load training status
    try:
        status = utils.load_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}

    saver = utils.SaveData(model_dir,
                           save_best=args.save_best,
                           save_all=args.save_all)
    model, agent_data, other_data = None, dict(), None
    try:
        # Continue from last point
        model, agent_data, other_data = saver.load_training_data(best=False)
        logger.info("Training data exists & loaded successfully\n")
    except OSError:
        logger.info("Could not load training data\n")

    # ==============================================================================================
    # Load Model

    if model is None:
        model = get_model(model_args,
                          obs_space,
                          first_env.action_space,
                          use_memory=model_args.mem)
        logger.info(f"Model [{model_args.name}] successfully created\n")

        # Print Model info
        logger.info("{}\n".format(model))

    if torch.cuda.is_available():
        model.cuda()
    logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

    # ==============================================================================================
    # Load Agent

    algo = get_agent(full_args.agent,
                     envs,
                     model,
                     agent_data,
                     preprocess_obss=preprocess_obss,
                     reshape_reward=None,
                     eval_envs=eval_envs,
                     eval_episodes=eval_episodes)

    has_evaluator = hasattr(algo,
                            "evaluate") and full_args.env_cfg.no_eval_envs > 0

    if return_models:
        return algo, model, envs, saver

    # ==============================================================================================
    # Train model

    prev_rewards = []
    crt_eprew = 0
    if "eprew" in other_data:
        crt_eprew = other_data["eprew"]
    num_frames = status["num_frames"]
    total_start_time = time.time()
    update = status["update"]
    update_start_time = time.time()

    while num_frames < args.frames:
        # Update model parameters

        logs = algo.update_parameters()

        num_frames += logs["num_frames"]
        update += 1

        if update % args.eval_interval == 0 and has_evaluator:
            eval_logs = algo.evaluate(eval_key=main_r_key)
            logs.update(eval_logs)

        prev_start_time = update_start_time
        update_start_time = time.time()

        # Print logs
        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_start_time - prev_start_time)
            duration = int(time.time() - total_start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += [
                "num_frames_" + key for key in num_frames_per_episode.keys()
            ]
            data += num_frames_per_episode.values()
            header += ["entropy", "value", "policy_loss", "value_loss"]
            data += [
                logs["entropy"], logs["value"], logs["policy_loss"],
                logs["value_loss"]
            ]
            header += ["grad_norm"]
            data += [logs["grad_norm"]]

            # add log fields that are not in the standard log format (for example value_int)
            extra_fields = extra_log_fields(header, list(logs.keys()))
            header.extend(extra_fields)
            data += [logs[field] for field in extra_fields]

            # print to stdout the standard log fields + fields required in config
            keys_format, printable_data = print_keys(header, data, extra_logs)
            logger.info(keys_format.format(*printable_data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            if status["num_frames"] == 0:
                csv_writer.writerow(header)
            csv_writer.writerow(data)
            csv_file.flush()

            if args.tb:
                for field, value in zip(header, data):
                    tb_writer.add_scalar(field, value, num_frames)

            status = {"num_frames": num_frames, "update": update}

            if main_r_key is None:
                crt_eprew = list(rreturn_per_episode.values())[0]
                prev_rewards.append(crt_eprew)
            else:
                crt_eprew = logs[main_r_key]
                prev_rewards.append(logs[main_r_key])

        # -- Save vocabulary and model

        if args.save_interval > 0 and update % args.save_interval == 0:
            preprocess_obss.vocab.save()

            saver.save_training_data(model, algo.get_save_data(), crt_eprew)

            logger.info("Model successfully saved")

            utils.save_status(status, model_dir)

        check_rew = np.mean(prev_rewards[-max_eprews_window:])
        if len(prev_rewards) > max_eprews_window and check_rew > max_eprews:
            print(
                f"Reached mean return {max_eprews} for a window of {max_eprews_window} steps"
            )
            exit()
Example #25
	def pull(self, endpoint, flag_id, flag):
		headers={"User-Agent":get_agent(),}#"Mozilla/5.0 (Windows NT 6.3; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0"
		flag_id = self.loader(flag_id)

		team_host = "http://%s:8000" % endpoint
		billing_cell = "/billing/%s/" % flag_id["account"]["username"]
		validate_cell = "/validate/%s/" % flag_id["tid"]

		with requests.Session() as s:
			s.cookies.set("sessionid", flag_id["sid"])
			s.cookies.set("transaction_id", flag_id["tid"])
			
			try:

				check = s.get(team_host + billing_cell,
								timeout=self.conn_timeout,
								headers=headers)

			except requests.ConnectionError as ex:
				self.logger.error(self.validate_step_err % unicode(ex))
				self.logger.debug(str(ex), exc_info=True)
				return (Result.DOWN, flag_id)
			except requests.HTTPError as ex:
				self.logger.error(self.validate_step_err % str(ex))
				self.logger.debug(str(ex), exc_info=True)
				return (Result.MUMBLE, flag_id)
			except requests.Timeout as ex:
				self.logger.error(self.validate_step_err % str(ex))
				self.logger.debug(str(ex), exc_info=True)
				return (Result.DOWN, flag_id)						
			except Exception as ex:
				self.logger.error(self.validate_step_err % str(ex))
				self.logger.debug(str(ex), exc_info=True)
				return (Result.DOWN, flag_id)

			s.cookies.set("transaction_sign", flag_id["tsign"])

			try:

				validate = s.get(team_host + validate_cell,
									timeout=self.conn_timeout,
									headers=headers)

			except requests.ConnectionError as ex:
				self.logger.error(self.validate_step_err % unicode(ex))
				self.logger.debug(str(ex), exc_info=True)
				return (Result.DOWN, flag_id)
			except requests.HTTPError as ex:
				self.logger.error(self.validate_step_err % str(ex))
				self.logger.debug(str(ex), exc_info=True)
				return (Result.MUMBLE, flag_id)
			except requests.Timeout as ex:
				self.logger.error(self.validate_step_err % str(ex))
				self.logger.debug(str(ex), exc_info=True)
				return (Result.DOWN, flag_id)						
			except Exception as ex:
				self.logger.error(self.validate_step_err % str(ex))
				self.logger.debug(str(ex), exc_info=True)
				return (Result.DOWN, flag_id)

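			# both requests succeeded: the "valid" cookie and the billing signature decide the final status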
			if (check.status_code == 200) and (validate.status_code == 200):
				flag_stat = s.cookies.get("valid")
				if flag_stat is None:
					return Result.MUMBLE

				elif flag_stat == "True":

					if flag_id["account"]["billing"]["sign"] == check.text.replace('\n','').replace('\r',''):
						return Result.UP
					else:
						return Result.CORRUPT
				else:
					return Result.MUMBLE
			else:
				return Result.MUMBLE
Example #26
def train_on_simulator(shared_objects, cfg):
    batch_size = cfg.general.batch_size
    stats = shared_objects["stats_leRMS"]

    # -- Initialize simulated environment
    env = ArificialMalmo(cfg.envs.simulated)
    print_info(
        "Environment initialized (batch_size:={:d}).".format(batch_size))

    # -- Initialize agent and wrap it in a Binary18BatchAgentWrapper :)
    Agent = get_agent(cfg.agent.type)
    agent = Agent(cfg.agent.name, ENV_ACTIONS, cfg, shared_objects)
    agent_runner = Binary18BatchAgentWrapper(agent, cfg.agent.name, cfg)

    print_info(
        "{:s}<{:s}> agent is up and waiting to learn. |role={:d}".format(
            cfg.agent.name, cfg.agent.type, cfg.agent.role))

    # -- Initialize alien
    alien = VillagePeopleEnvChallengeAgent(PigChaseChallengeAgent_V,
                                           cfg.alien.name, env._board_one_hot,
                                           cfg)
    print_info("Alien is up.")

    # -- Start training
    agents = [alien, agent_runner]
    agent_idx = 1

    env_agents_data = [env.agent0, env.agent1]

    dtype = torch.LongTensor(0)
    if cfg.general.use_cuda:
        dtype = dtype.cuda()

    def restartGame():
        obs = env.reset()
        reward = torch.zeros(batch_size).type_as(dtype)
        done = torch.zeros(batch_size).type_as(dtype)

        for agent in agents:
            agent.reset()
        return obs, reward, done

    obs, reward, done = restartGame()
    ep_cnt = 0
    crt_agent = 0

    viz_rewards = torch.LongTensor(batch_size).type_as(dtype)
    viz_steps = torch.LongTensor(batch_size).type_as(dtype)

    # Batch of agents used for evaluation during training.
    eval_agents_count = batch_size
    if cfg.evaluation.during_training.truncate:
        eval_agents_count = int(batch_size * cfg.agent.exploration[0][1])

    viz_rewards = torch.LongTensor(eval_agents_count).type_as(dtype)

    viz_rewards.fill_(0)
    viz_steps.fill_(0)

    start_time = time.time()
    episode_time = AverageMeter()
    report_freq = cfg.general.report_freq

    print_info("No of epochs: {:d}. Max no of steps/epoch: {:d}".format(
        cfg.training.episodes_no, cfg.training.max_step_no))

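    # the two agents alternate, so each environment step consumes two iterations of the loop below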
    training_steps = cfg.training.episodes_no * cfg.training.max_step_no * 2

    start_episode_time = time.time()
    start_report_time = time.time()

    max_freq_r = -100
    max_freq_r_ep = -1

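    # alternate between the alien (index 0) and the learning agent (index 1) on every step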
    for step in range(1, training_steps + 1):
        # check if env needs reset
        if env.done.all():
            episode_time.update(time.time() - start_episode_time)
            start_episode_time = time.time()

            obs, reward, done = restartGame()
            ep_cnt += 1
            stats.inc_episodes(batch_size)
            crt_agent = 0

            if ep_cnt % report_freq == 0:
                batch_mean_reward = torch.sum(viz_rewards) / report_freq
                game_mean_reward = batch_mean_reward / eval_agents_count
                last_report_time = time.time() - start_report_time
                start_report_time = time.time()
                r_step = torch.mean(viz_rewards.float() / viz_steps.float())

                if game_mean_reward > max_freq_r:
                    max_freq_r = game_mean_reward
                    max_freq_r_ep = ep_cnt
                    agent.model_utils.save_model(r_step,
                                                 game_mean_reward,
                                                 ep_cnt,
                                                 save_only_min=False)

                print_info("Ep: %d | batch_avg_R: %.4f | game_avg_R: %.4f "
                           "| R_step: %.4f | (Max_R: %.4f at ep %d)" %
                           (ep_cnt, batch_mean_reward, game_mean_reward,
                            r_step, max_freq_r, max_freq_r_ep))
                print_info(
                    "Ep: %d | (Ep_avg_time: %.4f) | (Last_report: %.4f)" %
                    (ep_cnt, episode_time.avg, last_report_time))
                viz_rewards.fill_(0)
                viz_steps.fill_(0)

        # select an action
        agent_act = agents[crt_agent].act(
            obs, reward, done, (1 - env_agents_data[crt_agent].got_done))
        stats.inc_frames((1 - env.done.long()).sum())

        # take a step
        obs, reward, done = env.do(agent_act)
        crt_agent = (crt_agent + 1) % 2

        if crt_agent == agent_idx:
            viz_steps.add_(1 - env.done.long())
            viz_rewards.add_(reward[:eval_agents_count])

    elapsed_time = time.time() - start_time
    print("Finished in %.2f seconds at %.2ffps." %
          (elapsed_time, training_steps / elapsed_time))
	def get_post_form_headers(self, data=""):
		headers={"User-Agent":get_agent(), #"Mozilla/5.0 (Windows NT 6.3; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0"
				"Content-Type":"application/x-www-form-urlencoded",
				"Content-Length": str(len(data))}
		return headers
Example #28
def collect_from_malmo(id_, shared_objects, cfg):
    clients = cfg.envs.minecraft.ports
    my_id = id_
    reset = shared_objects["reset"]
    session_id = shared_objects["session"]
    queue = shared_objects["queue"]

    if "predict_queue" in shared_objects:
        use_predict_queue = True
        predict_queue = shared_objects["predict_queue"]
        answer_queue = shared_objects["answer_pipe"][my_id]
    else:
        use_predict_queue = False

    # ----------------------- Run Challenge agent ------------------------------
    challenger_stopped = mp.Value("i", 0)
    shared_obj_ = {"stopped": challenger_stopped}
    p = mp.Process(target=run_challenge_agent,
                   args=(id_, clients, shared_obj_))
    p.start()
    sleep(5)

    # ----------------------- Run VillageP Agent -------------------------------
    # --- Start agent

    agent_role = 1
    cfg.general.use_cuda = True

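    # without a shared predict queue, run a local copy of the actor network for action selection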
    if not use_predict_queue:
        agent_actor = get_agent(cfg.agent.type)(cfg.agent.name, ENV_ACTIONS,
                                                cfg, shared_objects)
        # SET Not max predictor
        agent_actor.predict_max = False

    # agent = PigChaseVillagePopleAgent(ENV_AGENT_NAMES[agent_role], ENV_ACTIONS,
    #                                   agent_actor,
    #                                   use_cuda=cfg.general.use_cuda)

    state_builder = PigChaseVillagePeopleBuilder18Binary(agent_role)
    print("A3C: ", clients)
    env = PigChaseEnvironment(clients,
                              state_builder,
                              role=1,
                              randomize_positions=True)

    agent_done = False
    reward = 0
    episode = 0
    step = 0
    obs = env.reset()
    received_none = 0

    while obs is None:
        # this can happen if the episode ended with the first
        # action of the other agent
        # print('Warning: received obs == None.')
        received_none += 1
        if received_none == 10:
            print("[[{}]] Panic !!! > Received {} None in a row".format(
                id_, received_none))

        if received_none == 100:
            print("[[{}]] Panic! Challenger stopped."
                  " Received {} None in a row".format(id_, received_none))
            return -1

    print("[[{}]] Born an playing!".format(id_))

    ep_states = []
    crt_session_id = session_id.value
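    # main collection loop: record (state, reward, done, action) tuples and push complete episodes to the shared queue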
    while True:
        step += 1
        # check if env needs reset
        # print("AGENT123123")

        if env.done or agent_done:
            # print("[[{}]] Done ep {}.".format(id_, episode))

            if challenger_stopped.value < 0:
                print("[[{}]] Child process ended!!".format(id_))
                pass

            if reset.value == 1:
                # --- Master is training network

                # ---- Restart ----------------------
                # TODO restart Minecraft process

                while reset.value == 1:
                    sleep(0.1)
                ep_states.clear()

            if session_id.value != crt_session_id:
                ep_states.clear()
                crt_session_id = session_id.value

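            # append the terminal transition and flush the finished episode to the training queue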
            if len(ep_states) > 0:
                # --- Will be restarted
                state_ = torch.LongTensor(obs).unsqueeze(0)
                done_ = torch.LongTensor([int(agent_done)])
                reward_ = torch.FloatTensor([reward])
                if use_predict_queue:
                    predict_queue.put(
                        (my_id, state_.cpu().numpy(), done_.cpu().numpy(), 23))
                    (act, _) = answer_queue.recv()
                    act = torch.LongTensor([act])
                else:
                    act = agent_actor.act(state_.cuda(), reward_.cuda(),
                                          done_.cuda(), False)
                    # act = agent_actor.act(state_, reward_, done_, False)
                ep_states.append((state_.cpu().numpy(), reward_.cpu().numpy(),
                                  done_.cpu().numpy(), act.cpu().numpy()))

                queue.put(ep_states)
                ep_states = []

            obs = env.reset()
            received_none = 0
            while obs is None:
                # this can happen if the episode ended with the first
                # action of the other agent
                # print('Warning: received obs == None.')
                received_none += 1
                if received_none == 10:
                    print(
                        "[[{}]] Panic !!! > Received {} None in a row".format(
                            id_, received_none))

                if received_none == 10000:
                    print("[[{}]] Panic! Challenger stopped."
                          " Received {} None in a row".format(
                              id_, received_none))
                    sleep(5)
                obs = env.reset()

            episode += 1

        state_ = torch.LongTensor(obs).unsqueeze(0)
        reward_ = torch.FloatTensor([reward])
        done_ = torch.LongTensor([int(agent_done)])

        if not agent_done:
            if use_predict_queue:
                predict_queue.put(
                    (my_id, state_.cpu().numpy(), done_.cpu().numpy(), 23))
                (act, _) = answer_queue.recv()
                act = torch.LongTensor([act])
            else:
                act = agent_actor.act(state_.cuda(), reward_.cuda(),
                                      done_.cuda(), False)
        else:
            reward_[0] = 0
            done_[0] = 0
            if use_predict_queue:
                predict_queue.put(
                    (my_id, state_.cpu().numpy(), done_.cpu().numpy(), 23))
                (act, _) = answer_queue.recv()
                act = torch.LongTensor([act])
            else:
                act = agent_actor.act(state_.cuda(), reward_.cuda(),
                                      done_.cuda(), False)
                # act = agent_actor.act(state_, reward_, done_, False)

        ep_states.append((state_.cpu().numpy(), reward_.cpu().numpy(),
                          done_.cpu().numpy(), act.cpu().numpy()))

        obs, reward, agent_done = env.do(act[0])
Example #29
    rewards = []
    dt_string = datetime.now().strftime("%d%m%Y%H%M%S")
    dir_name = "runs/eval_env_{}_agent_{}_memory_{}_{}".format(
        args.env_name, args.agent_type, args.memory_type, dt_string)
    logger = SummaryWriter(log_dir=dir_name)
    with open(os.path.join(dir_name, 'command_line_args.txt'), 'w') as f:
        json.dump(args.__dict__, f, indent=2)

    # Prepare environment
    env = environments.get_env(args.env_name)

    # Prepare memory module
    memory_module = memory.get_module(args.memory_type, args)

    # Prepare agent
    agent = agents.get_agent(args.agent_type, env, memory_module, dir_name,
                             device, args)

    # Load saved model
    agent.load_model(args.model_path)

    # Iterate through episodes
    for episode in range(args.num_episodes):
        # Run episode and get reward
        rewards.append(run_episode(env, agent))
        # Get average reward and log results, and add to to all_logging_dict
        reward_np = np.array(rewards)
        i = min(reward_np.shape[0], args.avg_episodes)
        avg_reward = reward_np[-i:].mean()
        logging_dict = {
            "avg_rewards": avg_reward,
Example #30
    if isinstance(config.model.load, str):
        checkpoint = torch.load(config.model.load)
        iteration = checkpoint['iteration']
        reward = checkpoint['reward']
        print("LOADING MODEL: {} ---> MAX R: {}".format(config.model.load,
                                                        reward))
        shared_model.load_state_dict(checkpoint['state_dict'])
    if config.general.use_cuda:
        shared_model.cuda()
    shared_objects = {
        "model": shared_model,
        "stats_leRMS": AtomicStatistics()
    }

    agent_actor = get_agent(config.agent.type)(config.agent.name,
                                               ENV_ACTIONS, config,
                                               shared_objects)
    agent_role = 1
    ag1 = Binary18BatchAgentWrapper(agent_actor, config.agent.name, config,
                                    is_training=False)

    # ag1 = VillagePeopleEnvChallengeAgent(PigChaseChallengeAgent_V, "Agent_2",
    #                                      env._board_one_hot, config)
    # ag1 = MalmoAgentWrapper(PigChaseChallengeAgent, "Agent_1", config)
    # ag1 = VillagePeopleEnvRandomAgent("Agent_2", config)

    agents = [ag0, ag1]
    env_agents = [env.agent0, env.agent1]
    start = time.time()

Example #31
def train():
    # initialize environments and set up logging folders
    config = utils.get_rl_args()
    rom = config.rom_path.format(config.env_id)
    env = utils.make_env(rom, 0, max_episode_steps=config.env_step_limit)

    frame_idx = 0
    resume_flag = False
    if bool(config.resume_folder):
        folder_path = os.path.join(config.log_dir, config.env_id,
                                   config.resume_folder)
        if os.path.exists(folder_path):
            print('## Resume training from ', folder_path)

            last_frame_idx = 0
            with open(os.path.join(folder_path, 'loss.csv'), 'r') as f:
                for line in f:
                    if line[0] == '#':
                        continue
                    last_frame_idx = int(line.split(',')[0])
            print('## ## last_frame_idx', last_frame_idx)
            frame_idx = last_frame_idx
            if os.path.exists(os.path.join(folder_path, 'model.pt')):
                resume_flag = True
        else:
            print('## Initialize training from ', folder_path)
            try:
                os.makedirs(folder_path)
            except OSError:
                print('Creating {} folder failed.'.format(folder_path))
    else:
        folder_path = utils.setup_experiment_folder(config)

    model = agents.get_agent(config=config, env=env, log_dir=folder_path)
    monitor = utils.ExperimentMonitor(config, folder_path)

    if resume_flag:
        print('## load checkpoint from ', folder_path)
        model.load_checkpoint(folder_path)
        monitor.add_separator()

    dataset = utils.wrap_experience_replay(
        model.replay_buffer,
        config,
        size_limit=config.experiment_monitor_freq * config.batch_size)

    episode_logger = {'reward': 0, 'init_time': 0, 'num': 0}
    greedy = (config.exploit_type == 'greedy')

    # some logging functions
    def logging(s, print_=True, log_=True):
        if print_:
            print(s)
        if log_:
            with open(os.path.join(folder_path, 'log.txt'), 'a+') as f_log:
                f_log.write(s + '\n')
                f_log.flush()

    def dump_trajectory_action(action_text, actions, action_id, frame_idx):
        if (episode_logger['num'] %
                config.training_dump_freq == config.training_dump_freq - 1):
            frame_num = frame_idx - episode_logger['init_time']
            logging('[Episode {} step {}] Act: {}=({})\n'.format(
                episode_logger['num'], frame_num, action_text,
                actions[action_id]))
        return

    def dump_trajectory_state(obs_text, frame_idx):
        if (episode_logger['num'] %
                config.training_dump_freq == config.training_dump_freq - 1):
            st = obs_text.split('|')
            logging('[Episode {} step {}] Obs: \nl={}\ni={}\no={}\n'.format(
                episode_logger['num'], frame_idx - episode_logger['init_time'],
                clean(st[0]), clean(st[1]), clean(st[2])))
        return

    def dump_rewards(reward, frame_idx):
        if (episode_logger['num'] %
                config.training_dump_freq == config.training_dump_freq - 1):
            logging('[Episode {} step {}] Reward:{}, CumR:{}'.format(
                episode_logger['num'],
                frame_idx - episode_logger['init_time'],
                reward,
                episode_logger['reward'],
            ))
        return

    # history observation
    obs_history = utils.ObservationHistory(config.history_window)

    # interact with the environment
    def actor_step(obs_ids, action_tuple, frame_idx):
        # compute current action
        template_ids, obj1_pos, obj2_pos, actions = action_tuple
        epsilon = config.epsilon_by_frame(frame_idx)

        action_id, action_text, prob = model.get_action(obs_ids,
                                                        action_tuple,
                                                        epsilon,
                                                        greedy=greedy)

        dump_trajectory_action(action_text, actions, action_id, frame_idx)

        # interact with the environment
        next_obs_text, reward, done, next_info = env.step(action_text,
                                                          parallel=True)
        episode_logger['reward'] += reward
        # treat a state with no remaining valid actions as terminal
        done = done or len(next_info['valid_act']) == 0

        # history part
        past_obs = ""
        if config.use_history:
            active_entity = obs_history.extract_entity(next_info['valid_act'])
            past_obs = obs_history.retrieve_obs(active_entity)
            obs_history.update_history(active_entity, next_obs_text)

        dump_rewards(reward, frame_idx)
        dump_trajectory_state(next_obs_text, frame_idx + 1)
        next_obs_ids = model.encode_observation(past_obs + next_obs_text)
        next_action_tuple = model.encode_action(next_info['valid_act'],
                                                next_info['objs'],
                                                next_obs_ids)

        next_template_ids = next_action_tuple[0]
        next_obj1_pos = next_action_tuple[1]
        next_obj2_pos = next_action_tuple[2]

        # update experience replay
        model.update_experience_replay(s=obs_ids,
                                       aset=(template_ids, obj1_pos, obj2_pos),
                                       a=action_id,
                                       r=reward,
                                       done=done,
                                       ns=next_obs_ids,
                                       na=(next_template_ids, next_obj1_pos,
                                           next_obj2_pos))
        # tracking behavior trajectories
        monitor.add_ard(frame_idx, actions[action_id], reward, done, prob)

        if done or env.env.emulator_halted():
            score = next_info['score']
            model.reset_hx()
            next_obs_text, next_info = env.reset(parallel=True)
            past_obs = ""
            if config.use_history:
                obs_history.reset()
                active_entity = obs_history.extract_entity(next_info['valid_act'])
                past_obs = obs_history.retrieve_obs(active_entity)
                obs_history.update_history(active_entity, next_obs_text)

            next_obs_ids = model.encode_observation(past_obs + next_obs_text)
            next_action_tuple = model.encode_action(next_info['valid_act'],
                                                    next_info['objs'],
                                                    next_obs_ids)

            monitor.add_episode_reward(episode_logger['reward'], score,
                                       frame_idx)
            episode_logger['reward'] = 0
            episode_logger['init_time'] = frame_idx
            episode_logger['num'] += 1
            dump_trajectory_state(next_obs_text, frame_idx)

        return next_obs_ids, next_action_tuple

    logging(str(config))

    obs_text, info = env.reset(parallel=True)

    past_obs = ""
    if config.use_history:
        active_entity = obs_history.extract_entity(info['valid_act'])
        past_obs = obs_history.retrieve_obs(active_entity)
        obs_history.update_history(active_entity, obs_text)

    dump_trajectory_state(obs_text, frame_idx)
    obs_ids = model.encode_observation(past_obs + obs_text)
    action_tuple = model.encode_action(info['valid_act'], info['objs'],
                                       obs_ids)

    start = timer()
    model.reset_time_log()
    act_time = 0

    # pre-fill exp replay for |learn_start| steps
    if frame_idx < config.learn_start:
        for time_step in tqdm(range(config.learn_start),
                              desc='non-train step'):
            obs_ids, action_tuple = actor_step(obs_ids,
                                               action_tuple,
                                               frame_idx=frame_idx)
            frame_idx += 1
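    # main training loop: one gradient update per batch, followed by update_freq environment steps; checkpoint after each monitor window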
    loop_length = config.experiment_monitor_freq * config.update_freq
    loop_start = frame_idx // loop_length
    loop_max = int(config.max_steps / loop_length) + 1
    for loop_idx in range(loop_start, loop_max):
        time_start = loop_idx * loop_length
        time_end = time_start + loop_length
        for batch_vars in tqdm(dataset,
                               desc='training step {}-{}'.format(
                                   time_start, time_end)):
            # one step update

            td_loss, aux_loss = model.learn_step(batch_vars)
            norm = model.get_trainable_parameter_norm()
            monitor.add_loss(frame_idx, td_loss, aux_loss, norm)
            # interact with environment and write data
            act_ep_time = int(round(time.time() * 1000))
            for _ in range(config.update_freq):
                obs_ids, action_tuple = actor_step(obs_ids, action_tuple,
                                                   frame_idx)
                frame_idx += 1
            act_time += int(round(time.time() * 1000)) - act_ep_time

        model.save_networks()
        model.save_optimizer()
        model.save_replay()

        e_r, score = monitor.get_episode_reward_record()
        action_record = monitor.get_action_record()
        td_avg, td_max, td_min = monitor.get_td_record()
        norm_avg, norm_max, norm_min = monitor.get_norm_record()
        exp_avg, exp_max, exp_min = monitor.get_exploration_record()
        # aux_avg, aux_max, aux_min = monitor.get_aux_record()
        logging(
            'step {}, time {}, episode {}, R (avg/max/min) '
            '{:.1f}/{:.1f}/{:.1f}::{:.1f}/{:.1f}/{:.1f}, '
            'epx (p/n) {:.0f}/{:.0f} \n'
            'tpl (max/avg/num) {:.2f}/{:.2f}/{}, '
            'obj (max/avg/num) {:.2f}/{:.2f}/{}, '
            'td (avg/max) {:.3f}/{:.3f}, norm (avg) {:.5f}, '
            'eps {:.3f}/{:.3f}:{:.3f}:{:.3f}'.format(
                frame_idx, timedelta(seconds=int(timer() - start)),
                episode_logger['num'], e_r[0], e_r[1], e_r[2], score[0],
                score[1], score[2], len(model.replay_buffer.priority_buffer),
                len(model.replay_buffer.buffer), action_record['template'][0],
                action_record['template'][1], action_record['template'][2],
                action_record['obj'][0], action_record['obj'][1],
                action_record['obj'][2], td_avg, td_max, norm_avg,
                config.epsilon_by_frame(frame_idx), exp_avg, exp_max, exp_min))
        model.print_time_log()
        model.reset_time_log()
        print('- act time:{}'.format(timedelta(milliseconds=act_time)))
        act_time = 0

    model.save_checkpoint()
    env.close()
Example #32
def main():
    parser = argparse.ArgumentParser(description="Train an Actor-Critic agent that plays a specific environment.",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    required_named = parser.add_argument_group('REQUIRED named arguments')
    required_named.add_argument("--config_file", type=str, required=True,
                                help="Configuration file for the experiment.")
    parser.add_argument("--output_dir", type=str, default=EXPERIMENTS_DIR,
                        help="Where to save the experiment files")
    parser.add_argument("--debug", action="store_true", default=False,
                        help="Activate to run Tensorflow in eager mode.")
    parser.add_argument("--replace", action="store_true", default=False,
                        help="Activate to replace old experiment with the same name in the output folder.")
    args = parser.parse_args()

    # On debug mode all functions are executed normally (eager mode)
    if args.debug:
        tf.config.run_functions_eagerly(True)

    # Get git version
    repo = Repo(search_parent_directories=True)
    sha = repo.head.object.hexsha

    # Use provided configurations file
    config_file = Path(args.config_file)
    config = ConfigManager.from_json_file(config_file)

    # Create experiment folder and handle old results
    output_dir = Path(args.output_dir)
    agent_folder = Path(output_dir, config.agent_config.name)
    deleted_old = False
    if agent_folder.exists():
        if args.replace:
            shutil.rmtree(agent_folder)
            deleted_old = True
        else:
            raise FileExistsError(f"The experiment {agent_folder} already exists."
                                  f"Change output folder, experiment name or use -replace "
                                  f"to overwrite.")
    agent_folder.mkdir(parents=True)

    # Save experiments configurations and start experiment log
    prepare_file_logger(logger, logging.INFO, Path(agent_folder, "experiment.log"))
    logger.info(f"Running experiment {config.agent_config.name}")
    if deleted_old:
        logger.info(f"Deleted old experiment in {agent_folder}")
    config.log_configurations(logger)
    experiment_config_file = Path(agent_folder, "configurations.json")
    logger.info(f"Saving experiment configurations to {experiment_config_file}")
    config.to_json_file(experiment_config_file)

    wandbrun = wandb.init(project=f"AC-{config.agent_config.env}",
                          name=config.agent_config.name,
                          group=config.agent_config.agent_type,
                          notes=config.agent_config.desc,
                          config=config.as_single_dict(),
                          reinit=True,
                          dir=f"experiments/{config.agent_config.name}")

    # Create agent
    agent = get_agent(config.agent_config.agent_type)(agent_path=agent_folder, config=config)

    start_time = time.time()
    test_reward = agent.train_policy(training_config=config.training_config)
    train_time = time.time() - start_time

    experiment_info = {"mean_test_reward": float(test_reward),
                       "name": config.agent_config.name,
                       "description": config.agent_config.desc,
                       "git_hash": sha,
                       "train_time": train_time}
    with open(Path(agent_folder, "experiment_information.json"), "w") as outfile:
        json.dump(experiment_info, outfile, indent=4)

    wandbrun.finish()