Example #1
def main():
    args = get_args()
    print_args(args)

    log_dir = create_log_dir(args)
    if not args.evaluate:
        writer = SummaryWriter(log_dir)
    SEED = 721
    env = make_env(args)  # e.g. "LaserTag-small2-v0", "SlimeVolleyPixel-v0"

    print(env.observation_space, env.action_space)

    set_global_seeds(args.seed)
    env.seed(args.seed)

    if args.evaluate:
        test(env, args)
        env.close()
        return

    train(env, args, writer)

    writer.export_scalars_to_json(os.path.join(log_dir, "all_scalars.json"))
    writer.close()
    env.close()
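
Each example calls set_global_seeds before interacting with the environment, but the helper itself is not shown on this page. Below is a minimal sketch of what such a helper typically does, assuming the projects seed Python, NumPy and PyTorch; the real implementations may seed more or fewer libraries.

import random

import numpy as np
import torch


def set_global_seeds(seed):
    # Seed every RNG the training loop can touch so runs are repeatable.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)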
Example #2
def main():
    args = get_args()
    args.noisy = True
    args.double = True
    args.dueling = True
    args.prioritized_replay = True
    args.c51 = True
    args.multi_step = 3
    args.load_agents = True
    args.num_agents = 12
    args.read_model = None
    args.evaluate = False
    print_args(args)

    log_dir = create_log_dir(args)
    if not args.evaluate:
        writer = SummaryWriter(log_dir)

    env = PanicEnv(num_agents=args.num_agents,
                   scenario_=Scenario.Two_Exits,
                   load_agents=True,
                   read_agents=False)

    set_global_seeds(args.seed)

    if args.evaluate:
        test(env, args)
        return

    train(env, args, writer)

    writer.export_scalars_to_json(os.path.join(log_dir, "all_scalars.json"))
    writer.close()
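
Example #2 hard-codes several Rainbow-style switches (noisy nets, double and dueling DQN, prioritized replay, C51, 3-step returns) directly on the parsed namespace. A hypothetical get_args exposing the same attributes through argparse could look like the sketch below; the flag names mirror the attributes used above, but the defaults are guesses.

import argparse


def get_args():
    parser = argparse.ArgumentParser(description="Rainbow-style DQN options (illustrative)")
    parser.add_argument("--noisy", action="store_true", help="use noisy linear layers")
    parser.add_argument("--double", action="store_true", help="double Q-learning targets")
    parser.add_argument("--dueling", action="store_true", help="dueling network head")
    parser.add_argument("--prioritized-replay", action="store_true", help="use a PER buffer")
    parser.add_argument("--c51", action="store_true", help="distributional (C51) head")
    parser.add_argument("--multi-step", type=int, default=1, help="n-step return length")
    parser.add_argument("--num-agents", type=int, default=12)
    parser.add_argument("--evaluate", action="store_true")
    parser.add_argument("--seed", type=int, default=721)
    return parser.parse_args()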
Example #3
def main():
    args = get_args()
    print_args(args)
    model_path = f'models/bilateral_dqn/{args.env}'
    os.makedirs(model_path, exist_ok=True)

    log_dir = create_log_dir(args)
    if not args.evaluate:
        writer = SummaryWriter(log_dir)
    SEED = 721
    if args.num_envs == 1 or args.evaluate:
        # e.g. "SlimeVolley-v0", "SlimeVolleyPixel-v0", "Pong-ram-v0"
        env = make_env(args)
    else:
        # see https://github.com/thu-ml/tianshou/blob/master/tianshou/env/venvs.py
        VectorEnv = SubprocVectorEnv  # or DummyVectorEnv for a serial fallback
        env = VectorEnv([lambda: make_env(args) for _ in range(args.num_envs)])
    print(env.observation_space, env.action_space)

    set_global_seeds(args.seed)
    env.seed(args.seed)

    if args.evaluate:
        test(env, args, model_path)
        env.close()
        return

    train(env, args, writer, model_path)

    # writer.export_scalars_to_json(os.path.join(log_dir, "all_scalars.json"))
    writer.close()
    env.close()
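
Example #3 chooses between tianshou's DummyVectorEnv (everything in one process) and SubprocVectorEnv (one worker process per environment). A stripped-down standalone version of that pattern is sketched below, assuming a standard Gym id; the seed/reset signatures differ slightly across tianshou and gym releases.

import gym
from tianshou.env import DummyVectorEnv, SubprocVectorEnv

num_envs = 4
use_subproc = True  # False falls back to the serial DummyVectorEnv

VectorEnv = SubprocVectorEnv if use_subproc else DummyVectorEnv
env = VectorEnv([lambda: gym.make("CartPole-v1") for _ in range(num_envs)])

env.seed(0)        # seeds every worker (older gym/tianshou API)
obs = env.reset()  # one stacked observation per worker
env.close()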
Example #4
File: main.py Project: ai4ce/SNAC
def main():
    args = get_args()
    print_args(args)

    if args.evaluate:
        if args.env == "1DStatic":
            env = Env1DStatic(args)
        elif args.env == "1DDynamic":
            env = Env1DDynamic_Validation(args)
        elif args.env == "2DStatic":
            env = Env2DStatic(args)
        elif args.env == "2DDynamic":
            env = Env2DDynamic_Validation(args)
        elif args.env == "3DStatic":
            env = Env3DStatic(args)
        elif args.env == "3DDynamic":
            env = Env3DDynamic_Validation(args)
    else:
        if args.env == "1DStatic":
            env = Env1DStatic(args)
        elif args.env == "1DDynamic":
            env = Env1DDynamic(args)
        elif args.env == "2DStatic":
            env = Env2DStatic(args)
        elif args.env == "2DDynamic":
            env = Env2DDynamic(args)
        elif args.env == "3DStatic":
            env = Env3DStatic(args)
        elif args.env == "3DDynamic":
            env = Env3DDynamic(args)

        datetime = time.time()
        save_hyperparameters(args, datetime)
        log_dir = create_log_dir(args)
        writer = SummaryWriter(log_dir)

    set_global_seeds(args.seed)
    env.seed(args.seed)

    if args.evaluate:
        validate(env, args)
    else:
        train(env, args, writer, datetime)
        writer.flush()
        writer.close()

    env.close()
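
create_log_dir(args) appears in every example but is defined elsewhere in each project. A plausible sketch, assuming the directory name is built from the environment id plus a timestamp; the real helpers likely encode more hyperparameters in the name.

import os
import time


def create_log_dir(args):
    # Purely illustrative naming, e.g. runs/2DDynamic-20240101-120000
    stamp = time.strftime("%Y%m%d-%H%M%S")
    log_dir = os.path.join("runs", "{}-{}".format(args.env, stamp))
    os.makedirs(log_dir, exist_ok=True)
    return log_dir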
Example #5
def main():
    args = get_args()
    print_args(args)

    log_dir = create_log_dir(args)
    if not args.evaluate:
        writer = SummaryWriter(log_dir)

    env = make_atari(args.env)
    env = wrap_atari_dqn(env, args)

    set_global_seeds(args.seed)
    env.seed(args.seed)

    if args.evaluate:
        test(env, args)
        env.close()
        return

    train(env, args, writer)

    writer.export_scalars_to_json(os.path.join(log_dir, "all_scalars.json"))
    writer.close()
    env.close()
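
Examples #5 and #6 build the environment through make_atari and wrap_atari_dqn, baselines-style helpers from the surrounding projects. A rough stand-in can be assembled from Gym's own wrappers, as sketched below; the wrapper names and arguments assume a classic gym release that still ships AtariPreprocessing and FrameStack, and the preprocessing details may not match the projects' wrappers exactly.

import gym


def make_wrapped_atari(env_id, frame_stack=4):
    # Use a NoFrameskip variant, since AtariPreprocessing applies its own frame skip.
    env = gym.make(env_id)  # e.g. "BreakoutNoFrameskip-v4"
    env = gym.wrappers.AtariPreprocessing(env, frame_skip=4, grayscale_obs=True)
    env = gym.wrappers.FrameStack(env, frame_stack)
    return env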
Example #6
def main():
    args = get_args()
    print_args(args)

    log_dir = create_log_dir(args)
    wandb.init(project=args.wandb_project,
               name=args.wandb_name,
               notes=args.wandb_notes,
               config=args)

    env = make_atari(args.env)
    env = wrap_atari_dqn(env, args)

    set_global_seeds(args.seed)
    env.seed(args.seed)

    if args.evaluate:
        test(env, args)
        env.close()
        return

    train(env, args)

    env.close()
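
Example #6 swaps the TensorBoard SummaryWriter for Weights & Biases: wandb.init(..., config=args) stores the argparse namespace as the run configuration, and metrics inside train would then go through wandb.log. A hypothetical helper mirroring the writer.add_scalar calls used in the other examples:

import wandb


def log_episode(episode_reward, episode_length, frame_idx):
    # Counterpart of writer.add_scalar("data/episode_reward", ...) in the TensorBoard examples.
    wandb.log({"episode_reward": episode_reward,
               "episode_length": episode_length},
              step=frame_idx)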
Example #7
def main():
    Exploiter = 'DQN'
    EvaluatedModel = 'NashDQN'

    args = get_args()
    # args.against_baseline=False
    print_args(args)

    # e.g. "SlimeVolley-v0", "SlimeVolleyPixel-v0", "Pong-ram-v0"
    env = make_env(args)
    print(env.observation_space, env.action_space)

    model_prefix = model_metadata[args.env]

    exploiter = load_exploiter(env, Exploiter, args)
    evaluated_model = load_evaluated_model(env, EvaluatedModel, args)

    model_dir = "models/nash_dqn/{}/{}/".format(args.env, model_prefix)
    exploiter_dir = "models/nash_dqn/{}/{}/exploiter/".format(
        args.env, model_prefix)
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(exploiter_dir, exist_ok=True)

    log_dir = create_log_dir(args)
    if not args.evaluate:
        writer = SummaryWriter(log_dir)

    set_global_seeds(args.seed)
    env.seed(args.seed)

    # Parse all models saved during training in order
    filelist, epi_list = [], []
    for filename in os.listdir(model_dir):
        if filename.endswith("dqn"):
            # keep the episode prefix plus trailing '_' (i.e. remove '_dqn' at the end)
            filelist.append(filename.split('_')[0] + '_')
            epi_list.append(int(filename.split('_')[0]))
    sort_idx = np.argsort(epi_list).tolist()
    # sort filelist according to the numeric ordering of epi_list
    filelist = [x for _, x in sorted(zip(epi_list, filelist))]
    epi_list.sort()  # a plain filelist.sort() would sort the names lexicographically, not numerically
    print(epi_list)

    # Evaluate/exploit all models saved during training in order
    eval_data = {}
    for f, i in zip(filelist, epi_list):
        print('load model: ', i, model_dir, f)
        # if i>17000:
        evaluated_model.load_model(model_dir + f,
                                   eval=True,
                                   map_location='cuda:0')
        exploiter_path = exploiter_dir + f

        r, l = exploit(env,
                       evaluated_model,
                       exploiter,
                       args,
                       exploiter_path=exploiter_path)
        eval_data[str(i)] = [r, l]
    save_dir = 'data/{}/'.format(args.env)
    os.makedirs(save_dir, exist_ok=True)
    if args.fictitious:
        save_dir += '/fictitious_eval_data.npy'
    else:
        save_dir += '/eval_data.npy'
    np.save(save_dir, eval_data)

    if not args.evaluate:
        writer.close()
    env.close()
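
The checkpoint-parsing block above sorts filelist by zipping each name with its episode number and sorting the pairs; a tiny worked example of that idiom:

epi_list = [30000, 1000, 17000]
filelist = ['30000_', '1000_', '17000_']

filelist = [name for _, name in sorted(zip(epi_list, filelist))]
epi_list.sort()

print(epi_list)   # [1000, 17000, 30000]
print(filelist)   # ['1000_', '17000_', '30000_']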
Example #8
def train(env, args, writer):
    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    for para in target_model.parameters():
        para.requires_grad = False

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()
    #target_model.eval()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)

    update_target(current_model, target_model)
    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
        args.buffer_size = replay_buffer.it_capacity
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    print_args(args)
    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    if args.optim == 'adam':
        optimizer = optim.Adam(current_model.parameters(),
                               lr=args.lr,
                               eps=args.adam_eps,
                               betas=(0.9, args.beta2))
    elif args.optim == 'laprop':
        optimizer = laprop.LaProp(current_model.parameters(),
                                  lr=args.lr,
                                  betas=(0.9, args.beta2))

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0.
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    evaluation_interval = args.evaluation_interval
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), epsilon)

        next_state, raw_reward, done, _ = env.step(action)
        if args.clip_rewards:
            reward = np.clip(raw_reward, -1., 1.)
        else:
            reward = raw_reward
        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state,
                               np.float32(done))

        state = next_state
        episode_reward += raw_reward
        episode_length += 1

        # Force-terminate marathon episodes with random actions so the loop can move on.
        if episode_length >= 9950:
            while not done:
                _, _, done, _ = env.step(random.randrange(env.action_space.n))

        if done:
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            if episode_length > 10000:
                print('{:.2f}'.format(episode_reward), end='')
            writer.add_scalar("data/episode_reward", episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            episode_reward, episode_length = 0., 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if (len(replay_buffer) > args.learning_start
                and frame_idx % args.train_freq == 0):
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer,
                                   optimizer, args, beta)
            loss_list.append(loss.item())
            writer.add_scalar("data/loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % evaluation_interval == 0:
            if len(length_list) > 0:
                print_log(frame_idx, prev_frame, prev_time, reward_list,
                          length_list, loss_list, args)
                reward_list.clear()
                length_list.clear()
                loss_list.clear()
                prev_frame = frame_idx
                prev_time = time.time()
                save_model(current_model, args)
            else:
                evaluation_interval += args.evaluation_interval
        if frame_idx % 200000 == 0:
            if args.adam_eps == 1.5e-4:
                save_model(current_model,
                           args,
                           name="{}_{}".format(args.optim, frame_idx))
            else:
                save_model(current_model,
                           args,
                           name="{}{:.2e}_{}".format(args.optim, args.adam_eps,
                                                     frame_idx))

    reward_list.append(episode_reward)
    length_list.append(episode_length)
    print_log(frame_idx, prev_frame, prev_time, reward_list, length_list,
              loss_list, args)
    reward_list.clear()
    length_list.clear()
    loss_list.clear()
    prev_frame = frame_idx
    prev_time = time.time()

    save_model(current_model, args)
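
Example #8 relies on several small helpers defined elsewhere in the project: epsilon_scheduler, beta_scheduler, multi_step_reward and update_target. The sketches below are hedged reconstructions that are consistent with how they are called above; the real implementations may use a different decay shape or annealing schedule.

import math


def epsilon_scheduler(eps_start, eps_final, eps_decay):
    # Exponential decay from eps_start toward eps_final, keyed by the frame index.
    def epsilon_by_frame(frame_idx):
        return eps_final + (eps_start - eps_final) * math.exp(-frame_idx / eps_decay)
    return epsilon_by_frame


def beta_scheduler(beta_start, beta_frames):
    # Linear anneal of the PER importance-sampling exponent up to 1.0.
    def beta_by_frame(frame_idx):
        return min(1.0, beta_start + frame_idx * (1.0 - beta_start) / beta_frames)
    return beta_by_frame


def multi_step_reward(rewards, gamma):
    # Discounted sum of the rewards currently held in the n-step deque.
    ret = 0.
    for idx, reward in enumerate(rewards):
        ret += reward * gamma ** idx
    return ret


def update_target(current_model, target_model):
    # Hard copy of the online network's weights into the target network.
    target_model.load_state_dict(current_model.state_dict())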