Example #1
    num_env = cfg["agent"]["actors"]
    envs = make_vec_envs(
        num=num_env,
        size=cfg["size"],
        max_ep_len=cfg["train"]["max_ep_len"],
    )
    buffer = Buffer(
        num_env=num_env,
        maxlen=int(cfg["buffer"]["size"] / num_env),
        obs_shape=(4, ),
        device=cfg["buffer"]["device"],
    )
    model = DQN(cfg["agent"]["rnn_size"]).cuda()
    pred = Predictor(buffer, cfg)
    if cfg["random"]:
        # random baseline: a huge warmup keeps the actor acting randomly
        warmup = 1e8
    else:
        # evaluate a trained agent: restore the DQN weights and the predictor
        cp = torch.load("models/dqn.pt")
        model.load_state_dict(cp)
        model.eval()
        pred.load()
        warmup = 0
    actor = actor_iter(envs, model, pred, warmup, eps=0.01)

    # collect rewards from 128 finished episodes
    reward = []
    while len(reward) < 128:
        full_step = buffer.get_recent(2)
        step, hx, log = actor.send(full_step)
        buffer.append(step)
        if "reward" in log:
            reward.append(log["reward"])

    wandb.log({"final_reward": np.mean(reward)})
Example #2
import os

import torch

from predictor import Predictor

if __name__ == "__main__":
    cfg = load_cfg("default")
    cfg["env"] = "pol"

    num_env = cfg["agent"]["actors"]
    env = make_vec_envs(
        num=1,
        size=3,
        max_ep_len=cfg["train"]["max_ep_len"],
        seed=10,
    )
    model = DQN(cfg["agent"]["rnn_size"], device="cpu")
    pred = Predictor(None, cfg, device="cpu")
    actor = actor_iter(env, model, pred, 0, eps=0)
    buffer = Buffer(num_env=1, maxlen=2, obs_shape=(4, ), device="cpu")

    cp = torch.load("models/dqn.pt", map_location="cpu")
    model.load_state_dict(cp)
    model.eval()
    pred.load()

    for n_iter in range(2000):
        full_step = buffer.get_recent(2, "cpu")
        step, hx, log_a = actor.send(full_step)
        buffer.append(step)
        # env.render()
        os.system("clear")
        # ask the first env worker to render by sending a command over its pipe
        env.remotes[0].send(('render', None))
        env.remotes[0].recv()
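
The render call goes directly to a subprocess worker over a multiprocessing pipe, the SubprocVecEnv-style pattern in which each worker loops on (cmd, data) tuples. A rough sketch of the worker side implied by that command (an assumption about make_vec_envs' internals, shown to explain why a recv() must follow the send()):

def env_worker(remote, env_fn):
    """Hypothetical worker loop: read (cmd, data) tuples from the pipe and reply."""
    env = env_fn()
    while True:
        cmd, data = remote.recv()
        if cmd == "step":
            remote.send(env.step(data))
        elif cmd == "reset":
            remote.send(env.reset())
        elif cmd == "render":
            remote.send(env.render())  # reply so the parent's recv() unblocks
        elif cmd == "close":
            remote.close()
            break

# parent side, roughly what the loop above relies on:
#   remote.send(('render', None))  # ask worker 0 to draw the current frame
#   remote.recv()                  # wait for the acknowledgement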
Example #3
    num_env = cfg["agent"]["actors"]
    fstack = cfg["agent"]["frame_stack"]
    envs = make_vec_envs(cfg["env"], num_env, cfg["seed"], cfg["train"]["max_ep_len"])

    buffer = Buffer(
        num_env=num_env,
        maxlen=int(cfg["buffer"]["size"] / num_env),
        obs_shape=envs.observation_space.shape,
        device=cfg["buffer"]["device"],
    )
    model = DQN(envs.action_space.n, fstack).cuda().train()
    wmse = WMSE(buffer, cfg)
    pred = Predictor(buffer, wmse.encoder, envs.action_space.n, cfg)
    learner = Learner(model, buffer, pred, cfg)
    actor = actor_iter(
        envs, model, pred, cfg["buffer"]["warmup"], eps=cfg["agent"].get("eps")
    )

    start_train = int(cfg["buffer"]["warmup"] / num_env)
    log_every = cfg["train"]["log_every"]
    train_every = cfg["train"]["learner_every"]
    wmse_every = cfg["train"]["w_mse_every"]

    def save():
        torch.save(model.state_dict(), "models/dqn.pt")
        wmse.save()
        pred.save()

    count = trange(int(cfg["train"]["frames"] / 4 / num_env), smoothing=0.05)
    for n_iter in count:
        full_step = buffer.get_recent(fstack + 1)
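
For context on how the schedule variables defined above (start_train, train_every, wmse_every, log_every) and the save() helper are typically consumed, here is a hypothetical loop body; the control flow and the assumption that each train() returns a dict of metrics are illustrative, not the repo's actual code:

# hypothetical loop body, for illustration only
for n_iter in count:
    full_step = buffer.get_recent(fstack + 1)
    step, hx, log = actor.send(full_step)
    buffer.append(step)

    if n_iter >= start_train:
        if n_iter % train_every == 0:
            log.update(learner.train())  # DQN update
        if n_iter % wmse_every == 0:
            log.update(wmse.train())     # W-MSE encoder update
        if n_iter % log_every == 0:
            wandb.log(log)
            save()                       # checkpoint model, encoder, predictor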
Example #4
    fstack = cfg["agent"]["frame_stack"]
    envs = make_vec_envs(cfg["env"],
                         num_env,
                         max_ep_len=cfg["train"]["max_ep_len"])
    num_action = envs.action_space.n

    buffer = Buffer(
        num_env=num_env,
        maxlen=int(cfg["buffer"]["size"] / num_env),
        obs_shape=envs.observation_space.shape,
        device=cfg["buffer"]["device"],
    )
    wmse = WMSE(buffer, cfg)
    idf = IDF(buffer=buffer, num_action=num_action)
    cpc = CPC(buffer=buffer, num_action=num_action)
    actor = actor_iter(envs, None, None, cfg["buffer"]["warmup"], eps=1)

    pretrain = int(cfg["buffer"]["warmup"] / num_env)
    # fill the replay buffer with random-policy transitions (eps=1, no model)
    for n_iter in trange(pretrain):
        step, hx, log = next(actor)
        buffer.append(step)

    # pretrain the W-MSE encoder on the collected experience (batch = 256)
    for i in trange(20000):
        cur_log = wmse.train()
        if i % 200 == 0:
            wandb.log(cur_log)
    torch.save(wmse.encoder.state_dict(), "models/conv_wmse.pt")

    # batch = 256
    for i in trange(20000):
Example #5
    wandb.init(project="lwm", config=cfg)

    num_env = cfg["agent"]["actors"]
    fstack = cfg["agent"]["frame_stack"]
    envs = make_vec_envs(cfg["env"], num_env, cfg["seed"])

    buffer = Buffer(
        num_env=num_env,
        maxlen=int(cfg["buffer"]["size"] / num_env),
        obs_shape=envs.observation_space.shape,
        device=cfg["buffer"]["device"],
    )
    model = DQN(envs.action_space.n, fstack).cuda().train()
    wmse = WMSE(buffer, cfg)
    pred = Predictor(buffer, wmse.encoder, envs.action_space.n, cfg)
    actor = actor_iter(envs, model, pred, 0, eps=0.001)

    wmse.load()
    pred.load()
    cp = torch.load("models/dqn.pt", map_location="cuda")
    model.load_state_dict(cp)
    model.eval()

    while True:
        full_step = buffer.get_recent(fstack + 1)
        step, hx, log = actor.send(full_step)
        buffer.append(step)
        if "reward" in log:
            wandb.log({"final_reward": log["reward"]})
            break

    wandb.save("models/dqn.pt")
Example #6
    envs = make_vec_envs(
        num=num_env,
        size=cfg["size"],
        max_ep_len=cfg["train"]["max_ep_len"],
    )
    buffer = Buffer(
        num_env=num_env,
        maxlen=int(cfg["buffer"]["size"] / num_env),
        obs_shape=(4,),
        device=cfg["buffer"]["device"],
    )
    model = DQN(cfg["agent"]["rnn_size"]).cuda().train()
    pred = Predictor(buffer, cfg)
    learner = Learner(model, buffer, pred, cfg)
    eps = cfg["agent"].get("eps")
    actor = actor_iter(envs, model, pred, cfg["buffer"]["warmup"], eps=eps)

    start_train = int(cfg["buffer"]["warmup"] / num_env)
    log_every = cfg["train"]["log_every"]
    train_every = cfg["train"]["learner_every"]

    count = trange(int(cfg["train"]["frames"] / num_env), smoothing=0.05)
    for n_iter in count:
        full_step = buffer.get_recent(2)
        step, hx, log = actor.send(full_step)
        buffer.append(step)

        # once warmup ends, pretrain the predictor (used for the intrinsic reward when add_ri is set)
        if n_iter == start_train and cfg["add_ri"]:
            for i in trange(1000):
                cur_log = pred.train()
                if i % 100 == 0: