Example No. 1
def execute(args, params, device):
    utils.kill_game_processes()
    env = main.make_env(args, params)

    result_name1, writer1, net1, tgt_net1, selector1, epsilon_tracker1, agent1, exp_source1, buffer1, optimizer1 = main.make_components(args, params, device, env, 0)
    result_name2, writer2, net2, tgt_net2, selector2, epsilon_tracker2, agent2, exp_source2, buffer2, optimizer2 = main.make_components(args, params, device, env, 1)

    frame = 0
    frame_idx1 = 0
    frame_idx2 = 0
    eval_states1 = None
    eval_states2 = None

    date_time = datetime.now().strftime("%b%d_%H-%M-%S")
    with common.RewardTracker(writer1, params['stop_reward_player1'], net1, date_time + result_name1 + ".dat", 0, env) as reward_tracker1, \
            common.RewardTracker(writer2, params['stop_reward_player2'], net2, date_time + result_name2 + ".dat", 1, env) as reward_tracker2:

        # fill histories
        main.train(args, params, device, buffer1, epsilon_tracker1, frame_idx1, exp_source1, reward_tracker1, selector1, optimizer1, net1, tgt_net1, writer1, eval_states1)
        main.train(args, params, device, buffer2, epsilon_tracker2, frame_idx2, exp_source2, reward_tracker2, selector2, optimizer2, net2, tgt_net2, writer2, eval_states2)

        while True:
            if frame // args.units % 2 == 0:
                frame_idx1 += 1
                if main.train(args, params, device, buffer1, epsilon_tracker1, frame_idx1, exp_source1, reward_tracker1, selector1, optimizer1, net1, tgt_net1, writer1, eval_states1):
                    break
            else:
                frame_idx2 += 1
                if main.train(args, params, device, buffer2, epsilon_tracker2, frame_idx2, exp_source2, reward_tracker2, selector2, optimizer2, net2, tgt_net2, writer2, eval_states2):
                    break

            frame += 1

            if args.maxFrames > 0 and frame_idx1 > args.maxFrames:
                break
Example No. 2
def play_func(params, net, cuda, exp_queue):
    env = make_env(params)

    writer = SummaryWriter(comment="-" + params['run_name'] +
                           "-05_new_wrappers")
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=cuda)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params['gamma'], steps_count=1)
    exp_source_iter = iter(exp_source)

    frame_idx = 0

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)

            epsilon_tracker.frame(frame_idx)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx,
                                         selector.epsilon):
                    break

    exp_queue.put(None)
Example No. 3
def execute(args, params, device):
    utils.kill_game_processes()
    env = main.make_env(args, params)

    result_name1, writer1, net1, tgt_net1, agent1, exp_source1, buffer1, optimizer1 = main.make_components(args, params, device, env, 0)
    net2 = ptan.agent.TargetNet(net1)
    # use the periodically synced copy for the frozen opponent, otherwise net2.sync() below has no effect
    agent2 = ptan.agent.DQNAgent(lambda x: net2.target_model.qvals(x), ptan.actions.ArgmaxActionSelector(), device=device)

    frame = 0
    frame_idx1 = 0

    date_time = datetime.now().strftime("%b%d_%H-%M-%S")
    with common.RewardTracker(writer1, params['stop_reward_player1'], net1, date_time + result_name1 + ".dat", 0, env) as reward_tracker1:

        # fill history
        main.train(params, buffer1, device, frame_idx1, exp_source1, reward_tracker1, optimizer1, net1, tgt_net1, writer1)

        while True:
            if frame // args.units % 2 == 0:
                state, _, _, _  = env.step((1, -1))
                action, _ = agent2([state])
                state, reward, done, _ = env.step((1, action[0]))
                if done:
                    state = env.reset()
            else:
                frame_idx1 += 1
                if main.train(params, buffer1, device, frame_idx1, exp_source1, reward_tracker1, optimizer1, net1, tgt_net1, writer1):
                    break

            if args.maxFrames > 0 and frame_idx1 > args.maxFrames:
                break

            frame += 1
            if frame % NET_SYNC == 0:
                net2.sync()
Example No. 4
def play_func(params, net, cuda, exp_queue):
    env = gym.make(params.env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    device = torch.device("cuda" if cuda else "cpu")

    writer = SummaryWriter(comment="-" + params.run_name + "-03_parallel")

    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env,
                                                           agent,
                                                           gamma=params.gamma,
                                                           steps_count=1)
    exp_source_iter = iter(exp_source)

    frame_idx = 0

    with common.RewardTracker(writer, params.stop_reward) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)

            epsilon_tracker.frame(frame_idx)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx,
                                         selector.epsilon):
                    break

    exp_queue.put(None)
Example No. 5
def grads_func(proc_name, net, cuda, train_queue):
    envs = [make_env() for _ in range(NUM_ENVS)]

    agent = ptan.agent.PolicyAgent(lambda x: net(x)[0], cuda=cuda, apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

    batch = []
    frame_idx = 0
    writer = SummaryWriter(comment=proc_name)

    with common.RewardTracker(writer, stop_reward=REWARD_BOUND) as tracker:
        with ptan.common.utils.TBMeanTracker(writer, batch_size=100) as tb_tracker:
            for exp in exp_source:
                frame_idx += 1
                new_rewards = exp_source.pop_total_rewards()
                if new_rewards and tracker.reward(new_rewards[0], frame_idx):
                    break

                batch.append(exp)
                if len(batch) < GRAD_BATCH:
                    continue

                states_v, actions_t, vals_ref_v = \
                    common.unpack_batch(batch, net, last_val_gamma=GAMMA**REWARD_STEPS, cuda=cuda)
                batch.clear()

                net.zero_grad()
                logits_v, value_v = net(states_v)
                loss_value_v = F.mse_loss(value_v, vals_ref_v)

                log_prob_v = F.log_softmax(logits_v, dim=1)
                adv_v = vals_ref_v - value_v.detach()
                log_prob_actions_v = adv_v * log_prob_v[range(GRAD_BATCH), actions_t]
                loss_policy_v = -log_prob_actions_v.mean()

                prob_v = F.softmax(logits_v, dim=1)
                entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()

                loss_v = entropy_loss_v + loss_value_v + loss_policy_v
                loss_v.backward()

                tb_tracker.track("advantage", adv_v, frame_idx)
                tb_tracker.track("values", value_v, frame_idx)
                tb_tracker.track("batch_rewards", vals_ref_v, frame_idx)
                tb_tracker.track("loss_entropy", entropy_loss_v, frame_idx)
                tb_tracker.track("loss_policy", loss_policy_v, frame_idx)
                tb_tracker.track("loss_value", loss_value_v, frame_idx)
                tb_tracker.track("loss_total", loss_v, frame_idx)

                # gather gradients
                nn_utils.clip_grad_norm(net.parameters(), CLIP_GRAD)
                grads = [param.grad.data.cpu().numpy() if param.grad is not None else None
                         for param in net.parameters()]
                train_queue.put(grads)

    train_queue.put(None)
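
The gradient lists pushed onto train_queue above are meant to be consumed by a central training process. Below is a minimal sketch of such a consumer, written as an illustration of the pattern rather than this repository's actual code; the shared net, the queue, and the learning rate are assumptions.

import torch
import torch.optim as optim

def central_train_loop(net, train_queue, lr=1e-3):
    # apply gradient lists produced by grads_func() to the shared network
    optimizer = optim.Adam(net.parameters(), lr=lr)
    while True:
        grads = train_queue.get()
        if grads is None:          # a worker reached the reward bound
            break
        optimizer.zero_grad()
        for param, grad in zip(net.parameters(), grads):
            if grad is not None:
                # gradients arrive as numpy arrays; copy them into param.grad
                param.grad = torch.from_numpy(grad).to(param.device)
        optimizer.step()
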
Example No. 6
def execute(args, params, device):
    utils.kill_game_processes()

    env = main.make_env(args, params)

    net1 = dqn_model.RainbowDQN(env.observation_space.shape,
                                env.action_space.n)
    net1.load_state_dict(
        torch.load(args.model1, map_location=lambda storage, loc: storage))

    agent1 = ptan.agent.DQNAgent(lambda x: net1.qvals(x),
                                 ptan.actions.ArgmaxActionSelector(),
                                 device=torch.device("cpu"))

    result_name = "-" + "-rainbow" + "-scenario=" + args.scenario + "-units=" + str(
        args.units)
    writer1 = SummaryWriter(comment=result_name + "-player0")

    env.reset()

    total_reward1 = 0.0
    counter1 = collections.Counter()

    epsilon = 0.02
    frame_idx1 = 0

    with common.RewardTracker(writer1, 100, net1, "x.dat", 0,
                              env) as reward_tracker1:

        while True:
            frame_idx1 += 1
            if np.random.random() < epsilon:
                action = [env.action_space.sample()]
            else:
                state, _, _, _ = env.step((0, -1))
                action, _ = agent1([state], [None])

            counter1[action[0]] += 1
            _, reward, done, _ = env.step((0, action[0]))

            total_reward1 += reward
            if done:
                reward_tracker1.reward(total_reward1, frame_idx1)
                total_reward1 = 0.0

                env.reset()

                net1.load_state_dict(
                    torch.load(args.model1,
                               map_location=lambda storage, loc: storage))

            if args.maxFrames > 0 and frame_idx1 > args.maxFrames:
                break
Example No. 7
def play_func(params, net, cuda, fsa, exp_queue, fsa_nvec=None):
    device = torch.device("cuda" if cuda else "cpu")
    env = make_env(params)

    writer = SummaryWriter(comment="-" + params['run_name'] + "-05_new_wrappers")
    if not fsa:
        selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
        epsilon_tracker = common.EpsilonTracker(selector, params)
        agent = ptan.agent.DQNAgent(net, selector, device=device, fsa=fsa)
    else:
        if 'Index' in net.__class__.__name__:
            selector = ptan.actions.EpsilonGreedyActionSelectorFsa(fsa_nvec, epsilon=params['epsilon_start'])
            epsilon_tracker = common.IndexedEpsilonTracker(selector, params, fsa_nvec)
            agent = ptan.agent.DQNAgent(net, selector, device=device, fsa=fsa, epsilon_tracker=epsilon_tracker)
        else:
            selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
            epsilon_tracker = common.EpsilonTracker(selector, params)
            agent = ptan.agent.DQNAgent(net, selector, device=device, fsa=fsa)
            # epsilon_tracker = common.IndexedEpsilonTrackerNoStates(selector, params, fsa_nvec)

    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1)
    exp_source_iter = iter(exp_source)

    frame_idx = 0

    with common.RewardTracker(writer, params['stop_reward'], params['telemetry'], params['plot']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)

            if not fsa or 'Index' not in net.__class__.__name__:
                epsilon_tracker.frame(frame_idx)

            new_rewards = exp_source.pop_total_rewards()
            new_scores = exp_source.pop_total_scores()
            if new_rewards:
                if not fsa or 'Index' not in net.__class__.__name__:
                    new_score = [] if not new_scores else new_scores[0]
                    if reward_tracker.reward(new_rewards[0], new_score, frame_idx, selector.epsilon, params['plot']):
                        break
                else:
                    new_score = [] if not new_scores else new_scores[0]
                    if reward_tracker.reward(new_rewards[0], new_score, frame_idx, selector.epsilon_dict, params['plot']):
                        break

    exp_queue.put(None)
Example No. 8
def execute(args, params, device):
    utils.kill_game_processes()
    env = main.make_env(args, params)

    result_name, writer, net, tgt_net, agent, exp_source, buffer, optimizer = main.make_components(
        args, params, device, env, 0)

    frame_idx = 0

    date_time = datetime.now().strftime("%b%d_%H-%M-%S")
    with common.RewardTracker(writer, params['stop_reward_player1'], net,
                              date_time + result_name + ".dat", 0,
                              env) as reward_tracker:
        while True:
            frame_idx += 1
            if main.train(params, buffer, device, frame_idx, exp_source,
                          reward_tracker, optimizer, net, tgt_net, writer):
                break

            if args.maxFrames > 0 and frame_idx > args.maxFrames:
                break
Example No. 9
def play_func(params, net, cuda, exp_queue, device_id):
    env_name = params['env_name']
    run_name = params['run_name']
    if 'max_games' not in params:
        max_games = 16000
    else:
        max_games = params['max_games']
    env = gym.make(env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    device = torch.device("cuda:{}".format(device_id) if cuda else "cpu")

    if 'save_iter' not in params:
        save_iter = 500
    else:
        save_iter = params['save_iter']

    writer = SummaryWriter(comment="-" + params['run_name'] + "-03_parallel")

    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1)
    exp_source_iter = iter(exp_source)

    fh = open('models/{}_metadata.csv'.format(run_name), 'w')
    out_csv = csv.writer(fh)

    frame_idx = 0
    game_idx = 1
    model_count = 0
    model_stats = []
    mean_rewards = []
    best_reward = 0
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)

            epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                status, num_games, mean_reward, epsilon_str = reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon)
                mean_rewards.append(mean_reward)
                if status:
                    break
                if game_idx and (game_idx % save_iter == 0):
                    # write to disk
                    np.savetxt('models/{}_reward.txt'.format(run_name), np.array(mean_rewards))
                    if mean_reward > best_reward:
                        print("Saving model...")
                        model_name = 'models/{}_{}.pth'.format(run_name, game_idx)
                        torch.save(net, model_name)
                        new_row = [model_name, num_games, mean_reward, epsilon_str]
                        out_csv.writerow(new_row)
                        best_reward = mean_reward
                if game_idx == max_games:
                    break
                game_idx += 1

    print("Saving final model...")
    model_name = 'models/{}_{}.pth'.format(run_name, game_idx)
    net.to(torch.device('cpu'))
    torch.save(net, model_name)
    net.to(device)
    new_row = [model_name, num_games, mean_reward, epsilon_str]
    out_csv.writerow(new_row)
    np.savetxt('models/{}_reward.txt'.format(run_name), np.array(mean_rewards))
    # plt.figure(figsize=(16, 9))
    # plt.tight_layout()
    # plt.title('Reward vs time, {}'.format(run_name))
    # plt.xlabel('Iteration')
    # plt.ylabel('Reward')
    # ys = np.array(mean_rewards)
    # plt.plot(ys, c='r')
    # plt.savefig('models/{}_reward.png'.format(run_name))
    # plt.close()
    fh.close()

    exp_queue.put(None)
Example No. 10
    # Initialise weights and copy from net to target net
    tf.global_variables_initializer().run()
    sync_nets.run()
    # Action selector
    selector = rl.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
    # Epsilon increment
    epsilon_tracker = common.EpsilonTracker(selector, params)
    # DQN agent
    agent = rl.agent.DQNAgent(state, net_q, selector)
    # Experience source
    exp_source = rl.experience_ptan.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'])
    # Memory buffer
    buffer = rl.experience_ptan.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size'])

    frame_idx = 0
    with common.RewardTracker(writer) as reward_tracker:
        # Initial save
        saver.save(sess, save_dir, global_step=global_step)
        while frame_idx < total_frames:
            frame_idx += 1
            buffer.populate(1)
            epsilon_tracker.frame(frame_idx)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon)

            # Don't train while filling memory
            if len(buffer) < rep_init:
                continue
def main(_config, _run):
    
    logger = _run
    SAVE_NAME = _config['SAVE_NAME']
    LOAD_SAVED_MODEL = _config['LOAD_SAVED_MODEL']
    MODEL_PATH_FINAL = _config['MODEL_PATH_FINAL']
    total_steps = 1000000

    params = common.HYPERPARAMS['gamePlay2']
    params['epsilon_frames'] *= 2
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda")
    args = parser.parse_args()

    env = gym.make(params['env_name'], glob_conf=_config, logger=logger)
    #env = ptan.common.wrappers.wrap_dqn(env)

    writer = SummaryWriter(comment="-" + params['run_name'] + "-rainbow-beta200")
    net = RainbowDQN(env.observation_space.shape, env.action_space.n).to(device)
    
    #net.load_state_dict(torch.load(  ))
    name_load = current_path + "/models" + MODEL_PATH_FINAL
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])
    if _config['LOAD_SAVED_MODEL']:
        # the optimizer must exist before load_ckp can restore its state
        net, optimizer, _ = load_ckp(MODEL_PATH_FINAL, net, optimizer)

    tgt_net = ptan.agent.TargetNet(net)
    agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), ptan.actions.ArgmaxActionSelector(), device=device)
    # change steps_count to change the multi-step prediction horizon
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=REWARD_STEPS)
    buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA)

    today = datetime.datetime.now()
    todays_date_full = str(today.year) + "_" + str(today.month) + "_" + str(today.day) + "_"
    todays_date_full += str(today.hour) + "_" + str(today.minute) + "_" + str(today.second)
    folder_name = todays_date_full +"_"+experiment_name
    results_dir = current_path + "/results/" + folder_name
    results_dir_weights = results_dir + "/weights"
    os.makedirs(results_dir)
    os.makedirs(results_dir_weights)

    frame_idx = 0
    beta = BETA_START
    best_mean_reward = 0.0
    eval_states = None
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while frame_idx < total_steps:
            frame_idx += 1
            buffer.populate(1)
            beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                # start saving the model after actual training begins
                if frame_idx > 100:
                    if best_mean_reward is None or best_mean_reward < reward_tracker.mean_reward:
                        torch.save(net.state_dict(),
                                   SAVE_NAME + "-best.dat")

                        if best_mean_reward is not None:
                            print("Best mean reward updated %.3f -> %.3f, model saved" % \
                                  (best_mean_reward, reward_tracker.mean_reward))
                        if reward_tracker.mean_reward != 0:
                            best_mean_reward = reward_tracker.mean_reward

                if reward_tracker.reward(new_rewards[0], frame_idx):
                    break

            if len(buffer) < params['replay_initial']:
                continue
            if eval_states is None:
                eval_states, _, _ = buffer.sample(STATES_TO_EVALUATE, beta)
                eval_states = [np.array(transition.state, copy=False) for transition in eval_states]
                eval_states = np.array(eval_states, copy=False)

            optimizer.zero_grad()
            batch, batch_indices, batch_weights = buffer.sample(params['batch_size'], beta)
            loss_v, sample_prios_v = calc_loss(batch, batch_weights, net, tgt_net.target_model,
                                               params['gamma'] ** REWARD_STEPS, device=device)

            # if frame_idx % 10000 == 0:
            if frame_idx % 5000 == 0:
                checkpoint = ({
                    'model': net.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'loss': loss_v,
                    'num_step': frame_idx
                })
                torch.save(checkpoint, results_dir_weights + "/rainbow" + str(frame_idx) + "step.dat")

                # Save network parameters as histogram
                for name, param in net.named_parameters():
                    writer.add_histogram(name, param.clone().cpu().data.numpy(), frame_idx)
            loss_v.backward()
            optimizer.step()
            buffer.update_priorities(batch_indices, sample_prios_v.data.cpu().numpy())

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()

            if logger:
                logger.log_scalar("loss", loss_v.item())
                logger.log_scalar("mean_reward", reward_tracker.mean_reward)
                
    """자식 process가 메인 process에 데이터 전달"""
    train_queue = mp.Queue(
        maxsize=PROCESSES_COUNT)  # no new items can be put on a full queue (for on-policy)
    data_proc_list = []

    for _ in range(PROCESSES_COUNT):  # one per child process
        data_proc = mp.Process(target=data_func,
                               args=(net, device, train_queue))
        data_proc.start()  # data_func() runs in the child process
        data_proc_list.append(data_proc)
    """학습"""
    batch = []
    step_idx = 0

    try:
        with common.RewardTracker(writer, stop_reward=REWARD_BOUND) as tracker:
            with ptan.common.utils.TBMeanTracker(writer,
                                                 batch_size=100) as tb_tracker:

                while True:
                    train_entry = train_queue.get()

                    # the queue entry is a total episode reward
                    if isinstance(train_entry, TotalReward):
                        if tracker.reward(train_entry.reward, step_idx):
                            break
                        continue

                    # the queue entry is an experience object from exp_source, not a reward
                    step_idx += 1
                    batch.append(train_entry)
Example No. 13
def main():
    global params_save_file

    game = 'spaceinvaders'
    params_save_file += '-' + game

    params = config.HYPERPARAMS[game]
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda")
    args = parser.parse_args()

    env = gym.make(params['env_name'])
    env = ptan.common.wrappers.wrap_dqn(env, skip=params['skip-frames'])

    print("Parameters:")
    print(params)
    sys.stdout.flush()

    writer = SummaryWriter(comment="-" + params['run_name'] + "-prio-replay")
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    if args.cuda:
        net.cuda()

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=args.cuda)

    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'],
                    steps_count=1)
    buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA)
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0
    beta = BETA_START

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += params['steps']
            buffer.populate(params['steps'])
            epsilon_tracker.frame(frame_idx)
            beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                writer.add_scalar("beta", beta, frame_idx)
                if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon, last_dq_losses):
                    break

            if len(buffer) < params['replay_initial']:
                continue

            optimizer.zero_grad()
            batch, batch_indices, batch_weights = buffer.sample(params['batch_size'] * params['steps'], beta)
            loss_v, sample_prios = calc_loss(batch, batch_weights, net, tgt_net.target_model,
                                                params["gamma"], cuda=args.cuda)
            loss_v.backward()
            optimizer.step()
            buffer.update_priorities(batch_indices, sample_prios)

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()

            if frame_idx % params['save_params_every'] == 0:
                torch.save(net.state_dict(), params_save_file + str(frame_idx))

    torch.save(net.state_dict(), params_save_file + str(frame_idx))
Example No. 14
def main():
    global params_save_file

    game = 'revenge'
    params_save_file += '-' + game
    params = config.HYPERPARAMS[game]
    parser = argparse.ArgumentParser()

    parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda")
    args = parser.parse_args()

    env = gym.make(params['env_name'])
    env = ptan.common.wrappers.wrap_dqn(env, skip=params['skip-frames'])

    print("Parameters:")
    print(params)
    sys.stdout.flush()

    writer = SummaryWriter(comment="-" + params['run_name'] + "-dqfd(PDD DQN)")
    net = dqn_model.DuelingDQN(env.observation_space.shape, env.action_space.n)
    if args.cuda:
        net.cuda()

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=args.cuda)

    demo_data = demo_data_reader.get_demo_data(env, game, num_states=params['demo_size'], skip=params['skip-frames'])
    exp_source = ptan.experience.ExperienceSourceNFirstLast(env, agent, gamma=params['gamma'],
                    steps_count=params['n-steps'], demo_data=demo_data)
    buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA)
    buffer.populate_demo_data()
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'], weight_decay=L2_REG_LAMBDA)

    print("Demo data size: {}".format(buffer.demo_samples))
    sys.stdout.flush()

    frame_idx = 0
    beta = BETA_START

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += params['steps']
            if frame_idx > params['pretrain_steps']:
                buffer.populate(params['steps'])
            else:
                if frame_idx % 500 == 0:
                    writer.add_scalar("beta", beta, frame_idx)
                    reward_tracker.record_training(frame_idx, selector.epsilon, last_dq_losses, last_n_losses,
                        last_e_losses, last_demo_sizes)

            epsilon_tracker.frame(frame_idx)
            beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                writer.add_scalar("beta", beta, frame_idx)
                if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon, last_dq_losses,
                    last_n_losses, last_e_losses, last_demo_sizes):
                    break

            optimizer.zero_grad()
            batch, batch_indices, batch_weights = buffer.sample(params['batch_size'] * params['steps'], beta)
            batch_demo_mask = (np.array(batch_indices) < buffer.demo_samples).astype(np.uint8)

            loss_v, sample_prios = calc_loss(batch, batch_demo_mask, batch_weights, net, tgt_net.target_model,
                                                params["gamma"], params["gamma"] ** params['n-steps'],
                                                cuda=args.cuda)
            loss_v.backward()
            optimizer.step()

            buffer.update_priorities(batch_indices, sample_prios)

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()

            if frame_idx % params['save_params_every'] == 0:
                torch.save(net.state_dict(), params_save_file + str(frame_idx))

    torch.save(net.state_dict(), params_save_file + str(frame_idx))
Example No. 15
def train_model(cuda, phase, premodel, pdays):
    """
    cuda : True / False -- whether to train on GPU
    phase : 1~3 -- which training phase (and config) to use
    premodel : path to the phase-1 model file, e.g. data/phase1_model.data
    pdays : integer number of prediction days (used in phase 3)
    """
    device = torch.device("cuda" if cuda else "cpu")
    phase = int(phase)
    if phase == 1:
        config = sconfig
    elif phase == 2:
        config = mconfig
    elif phase == 3:
        config = pconfig

    run_name = "v" + config.version + "-phase" + str(phase)
    saves_path = os.path.join("saves", run_name)
    os.makedirs(saves_path, exist_ok=True)

    save_name = ""

    writer = SummaryWriter(comment=run_name)

    prices_list, val_prices_list = data.load_prices(config.choices)

    if phase == 1:
        s_env = environ.StocksEnvS(prices_list)
        stock_env = s_env
        val_stock_env = environ.StocksEnvS(val_prices_list)
        save_name = "{}.data".format(run_name)
    elif phase == 2:
        # load the phase 1 network graph
        s_env = environ.StocksEnvS(prices_list)
        prenet = models.SimpleFFDQN(s_env.observation_space.shape[0],
                                    s_env.action_space.n)  #.to(device)
        models.load_model(premodel, prenet)

        # create the phase 2 environment
        stock_env = environ.StocksEnvM(prices_list, prenet)
        val_stock_env = environ.StocksEnvM(val_prices_list, prenet)
        save_name = "{}.data".format(run_name)
    elif phase == 3:
        predict_days = int(pdays)
        stock_env = pdenviron.PredEnv(prices_list=prices_list,
                                      predict_days=predict_days)
        val_stock_env = pdenviron.PredEnv(prices_list=prices_list,
                                          predict_days=predict_days)
        save_name = "{}-{}.data".format(run_name, predict_days)

    net = models.SimpleFFDQN(stock_env.observation_space.shape[0],
                             stock_env.action_space.n).to(device)
    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(config.epsilon_start)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        stock_env, agent, config.gamma, steps_count=config.reward_steps)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source,
                                                    config.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=config.learning_rate)

    # main training loop
    step_idx = 0
    eval_states = None
    best_mean_val = None

    with common.RewardTracker(writer, np.inf,
                              group_rewards=100) as reward_tracker:
        while step_idx < config.end_step:
            step_idx += 1
            buffer.populate(1)
            selector.epsilon = max(
                config.epsilon_stop,
                config.epsilon_start - step_idx / config.epsilon_steps)

            new_rewards = exp_source.pop_rewards_steps()
            if new_rewards:
                reward_tracker.reward(new_rewards[0], step_idx,
                                      selector.epsilon)

            if len(buffer) < config.replay_initial:
                continue

            if eval_states is None:
                print("Initial buffer populated, start training")
                eval_states = buffer.sample(config.states_to_evaluate)
                eval_states = [
                    np.array(transition.state, copy=False)
                    for transition in eval_states
                ]
                eval_states = np.array(eval_states, copy=False)

            if step_idx % config.eval_every_step == 0:
                mean_val = common.calc_values_of_states(eval_states,
                                                        net,
                                                        device=device)
                writer.add_scalar("values_mean", mean_val, step_idx)
                if best_mean_val is None or best_mean_val < mean_val:
                    if best_mean_val is not None:
                        print("%d: Best mean value updated %.3f -> %.3f" %
                              (step_idx, best_mean_val, mean_val))
                    best_mean_val = mean_val
                    #torch.save(net.state_dict(), os.path.join(saves_path, "mean_val-%.3f.data" % mean_val))

            optimizer.zero_grad()
            batch = buffer.sample(config.batch_size)
            loss_v = common.calc_loss(batch,
                                      net,
                                      tgt_net.target_model,
                                      config.gamma**config.reward_steps,
                                      device=device)
            loss_v.backward()
            optimizer.step()

            if step_idx % config.target_net_sync == 0:
                tgt_net.sync()

            if step_idx % config.checkpoint_every_step == 0:
                idx = step_idx // config.checkpoint_every_step
                torch.save(
                    net.state_dict(),
                    os.path.join(saves_path, "checkpoint-%d.data" % idx))

            if step_idx % config.validation_every_step == 0:
                res = validation.validation_run(stock_env, net, device=device)
                for key, val in res.items():
                    writer.add_scalar(key + "_test", val, step_idx)
                res = validation.validation_run(val_stock_env,
                                                net,
                                                device=device)
                for key, val in res.items():
                    writer.add_scalar(key + "_val", val, step_idx)

        models.save_model(os.path.join(saves_path, save_name), net,
                          {"predict_days": predict_days})
    train_queue = mp.Queue(maxsize=PROCESSES_COUNT)
    data_proc_list = []
    for _ in range(PROCESSES_COUNT):
        data_proc = mp.Process(target=data_func,
                               args=(net, device, train_queue))
        data_proc.start()
        data_proc_list.append(data_proc)

    batch_states = []
    batch_actions = []
    batch_vals_ref = []
    step_idx = 0
    batch_size = 0

    try:
        with common.RewardTracker(writer, REWARD_BOUND) as tracker:
            with ptan.common.utils.TBMeanTracker(writer, 100) as tb_tracker:
                while True:
                    train_entry = train_queue.get()
                    if isinstance(train_entry, TotalReward):
                        if tracker.reward(train_entry.reward, step_idx):
                            break
                        continue

                    states_t, actions_t, vals_ref_t = train_entry
                    batch_states.append(states_t)
                    batch_actions.append(actions_t)
                    batch_vals_ref.append(vals_ref_t)
                    step_idx += states_t.size()[0]
                    batch_size += states_t.size()[0]
                    if batch_size < BATCH_SIZE:
Example No. 17
def play_func(params, net, cuda, exp_queue, device_id):
    """
    With multiple envs, the exp_source class will return experiences
    (defined as a tuple of (state_framestack, action, reward, last_state_framestack)), alternating between
    the two environments. Otherwise it returns just experiences from a single env. Even if the games have
    different frame shapes, they will be reduced to 84x84.

    *** The envs are deliberately re-created inside this function: each worker process needs its own
    environment instances, which is what makes the parallelization work ***
    """
    run_name = 'demon_invaders'
    if 'max_games' not in params:
        max_games = 16000
    else:
        max_games = params['max_games']

    envSI = gym.make('SpaceInvadersNoFrameskip-v4')
    envSI = ptan.common.wrappers.wrap_dqn(envSI)

    envDA = gym.make('DemonAttackNoFrameskip-v4')
    envDA = ptan.common.wrappers.wrap_dqn(envDA)

    device = torch.device("cuda:{}".format(device_id) if cuda else "cpu")

    if 'save_iter' not in params:
        save_iter = 500
    else:
        save_iter = params['save_iter']

    writer = SummaryWriter(comment="-" + run_name + "-03_parallel")

    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        [envSI, envDA], agent, gamma=params['gamma'], steps_count=1)
    exp_source_iter = iter(exp_source)

    fh = open('models_multi/{}_metadata.csv'.format(run_name), 'w')
    out_csv = csv.writer(fh)

    frame_idx = 0
    game_idx = 1
    model_count = 0
    model_stats = []
    mean_rewards = []
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)

            epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                status, num_games, mean_reward, epsilon_str = reward_tracker.reward(
                    new_rewards[0], frame_idx, selector.epsilon)
                mean_rewards.append(mean_reward)
                if status:
                    break
                if game_idx and (game_idx % save_iter == 0):
                    # write to disk
                    print("Saving model...")
                    model_name = 'models_multi/{}_{}_{}.pth'.format(
                        run_name, params['secondary'], game_idx)
                    net.to(torch.device('cpu'))
                    torch.save(net, model_name)
                    net.to(device)
                    new_row = [model_name, num_games, mean_reward, epsilon_str]
                    out_csv.writerow(new_row)
                    np.savetxt(
                        'models_multi/{}_{}_reward.txt'.format(
                            run_name, params['secondary']),
                        np.array(mean_rewards))
                if game_idx == max_games:
                    break
                game_idx += 1

    print("Saving final model...")
    model_name = 'models_multi/{}_{}_{}.pth'.format(run_name,
                                                    params['secondary'],
                                                    game_idx)
    net.to(torch.device('cpu'))
    torch.save(net, model_name)
    net.to(device)
    new_row = [model_name, num_games, mean_reward, epsilon_str]
    out_csv.writerow(new_row)
    np.savetxt(
        'models_multi/{}_{}_reward.txt'.format(run_name, params['secondary']),
        np.array(mean_rewards))
    # plt.figure(figsize=(16, 9))
    # plt.tight_layout()
    # plt.title('Reward vs time, {}'.format(run_name))
    # plt.xlabel('Iteration')
    # plt.ylabel('Reward')
    # ys = np.array(mean_rewards)
    # plt.plot(ys, c='r')
    # plt.savefig('models_multi/{}_reward.png'.format(run_name))
    # plt.close()
    fh.close()

    exp_queue.put(None)
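
The docstring in this example says that a single ExperienceSourceFirstLast fed with two environments alternates between them. The stand-alone sketch below illustrates that behaviour; it is my own example, with CartPole as a placeholder environment and assuming a ptan/gym combination compatible with the code above (DQNAgent taking a device argument).

import gym
import ptan
import torch.nn as nn

envs = [gym.make("CartPole-v0"), gym.make("CartPole-v0")]
net = nn.Sequential(nn.Linear(envs[0].observation_space.shape[0], 64),
                    nn.ReLU(),
                    nn.Linear(64, envs[0].action_space.n))
agent = ptan.agent.DQNAgent(net, ptan.actions.ArgmaxActionSelector(), device="cpu")
exp_source = ptan.experience.ExperienceSourceFirstLast(envs, agent, gamma=0.99, steps_count=1)

for idx, exp in enumerate(exp_source):
    # exp is an ExperienceFirstLast(state, action, reward, last_state);
    # consecutive items are drawn from env 0, env 1, env 0, ... alternating between the two envs
    print(idx, exp.action, exp.reward)
    if idx >= 5:
        break
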
Example No. 18
def train_agent(
    run_name,
    data_paths=conf.default_data_paths,
    validation_paths=conf.default_validation_paths,
    model=models.DQNConv1D,
    large=False,
    load_checkpoint=None,
    saves_path=None,
    eps_steps=None,
):
    """
    Main function for training the agents

    :run_name: a string of choice that dictates where to save
    :data_paths: dict specifying what data to train with
    :validation_paths: dict specifying what data to validate with
    :model: what model to use
    :large: whether or not to use the large feature set
    :load_checkpoint: an optional path to a checkpoint to resume from
    :saves_path: optional base directory for checkpoints (defaults to "saves/<run_name>")
    :eps_steps: optional override for the epsilon decay horizon (defaults to conf.EPSILON_STEPS)
    """

    print("=" * 80)
    print("Training starting".rjust(40 + 17 // 2))
    print("=" * 80)

    # Get training data
    stock_data = data.get_data_as_dict(data_paths, large=large)
    val_data = data.get_data_as_dict(validation_paths, large=large)

    # Setup before training can begin
    step_idx = 0
    eval_states = None
    best_mean_val = None
    EPSILON_STEPS = eps_steps if eps_steps is not None else conf.EPSILON_STEPS

    # Use GPU if available, else fall back on CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[Info] Using device: {device}")

    # Set up the path to save the checkpoints to
    if saves_path is None:
        saves_path = os.path.join("saves", run_name)
    else:
        saves_path = os.path.join(saves_path, run_name)

    print(f"[Info] Saving to path: {saves_path}")

    os.makedirs(saves_path, exist_ok=True)

    # Create the gym-environment that the agent will interact with during training
    env = environ.StocksEnv(
        stock_data,
        bars_count=conf.BARS_COUNT,
        reset_on_close=conf.RESET_ON_CLOSE,
        random_ofs_on_reset=conf.RANDOM_OFS_ON_RESET,
        reward_on_close=conf.REWARD_ON_CLOSE,
        large=large,
    )

    env = wrappers.TimeLimit(env, max_episode_steps=1000)

    # Create the gym-environment that the agent will interact with when validating
    env_val = environ.StocksEnv(
        val_data,
        bars_count=conf.BARS_COUNT,
        reset_on_close=conf.RESET_ON_CLOSE,
        random_ofs_on_reset=conf.RANDOM_OFS_ON_RESET,
        reward_on_close=conf.REWARD_ON_CLOSE,
        large=large,
    )

    # Create the model
    net = model(env.observation_space.shape, env.action_space.n).to(device)

    print("Using network:".rjust(40 + 14 // 2))
    print("=" * 80)
    print(net)

    # Initialize agent and epsilon-greedy action-selector from the ptan package
    # The ptan package provides some helper and wrapper functions for ease of
    # use of reinforcement learning
    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(conf.EPSILON_START)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, conf.GAMMA, steps_count=conf.REWARD_STEPS)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source,
                                                    conf.REPLAY_SIZE)
    optimizer = optim.Adam(net.parameters(), lr=conf.LEARNING_RATE)

    # If a checkpoint is supplied to the function –> resume the training from there
    if load_checkpoint is not None:
        state = torch.load(load_checkpoint)
        net.load_state_dict(state["model_state_dict"])
        optimizer.load_state_dict(state["optimizer_state_dict"])
        step_idx = state["step_idx"]
        best_mean_val = state["best_mean_val"]
        print(
            f"State loaded –> step index: {step_idx}, best mean val: {best_mean_val}"
        )

        net.train()

    # Create a reward tracker, i.e. an object that keeps track of the
    # rewards the agent gets during training
    reward_tracker = common.RewardTracker(np.inf, group_rewards=100)

    # The main training loop
    print("Training loop starting".rjust(40 + 22 // 2))
    print("=" * 80)

    # Run the main training loop
    while True:
        step_idx += 1
        buffer.populate(1)

        # Get current epsilon for epsilon-greedy action-selection
        selector.epsilon = max(conf.EPSILON_STOP,
                               conf.EPSILON_START - step_idx / EPSILON_STEPS)

        # Take a step and get rewards
        new_rewards = exp_source.pop_rewards_steps()
        if new_rewards:
            reward_tracker.reward(new_rewards[0], step_idx, selector.epsilon)

        # As long as not enough data is in buffer, go to top again
        if len(buffer) < conf.REPLAY_INITIAL:
            continue

        if eval_states is None:
            print("Initial buffer populated, start training")
            eval_states = buffer.sample(conf.STATES_TO_EVALUATE)
            eval_states = [
                np.array(transition.state, copy=False)
                for transition in eval_states
            ]
            eval_states = np.array(eval_states, copy=False)

        # Evaluate the model every x number of steps
        # and update the currently best performance if better value gotten
        if step_idx % conf.EVAL_EVERY_STEP == 0:
            mean_val = common.calc_values_of_states(eval_states,
                                                    net,
                                                    device=device)
            # If new best value –> save the model, both with meta data for resuming training
            # and as the full object for use in testing
            if best_mean_val is None or best_mean_val < mean_val:
                if best_mean_val is not None:
                    print(
                        f"{step_idx}: Best mean value updated {best_mean_val:.3f} -> {mean_val:.3f}"
                    )
                best_mean_val = mean_val
                # Save checkpoint with meta data
                torch.save(
                    {
                        "model_state_dict": net.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "step_idx": step_idx,
                        "best_mean_val": best_mean_val,
                    },
                    os.path.join(saves_path, f"mean_val-{mean_val:.3f}.data"),
                )
                # Save full object for testing
                torch.save(
                    net,
                    os.path.join(saves_path,
                                 f"mean_val-{mean_val:.3f}-fullmodel.data"),
                )

        # Reset optimizer's gradients before optimization step
        optimizer.zero_grad()
        batch = buffer.sample(conf.BATCH_SIZE)
        # Calculate the loss
        loss_v = common.calc_loss(
            batch,
            net,
            tgt_net.target_model,
            conf.GAMMA**conf.REWARD_STEPS,
            device=device,
        )
        # Calculate the gradient
        loss_v.backward()
        # Do one step of gradient descent
        optimizer.step()

        # Sync up the two networks we're using
        # Two networks in this manner should increase the agent's ability to converge
        if step_idx % conf.TARGET_NET_SYNC == 0:
            tgt_net.sync()

        # Every conf.CHECKPOINT_EVERY_STEP steps, save the model so that
        # training can be resumed if something goes wrong
        if step_idx % conf.CHECKPOINT_EVERY_STEP == 0:
            idx = step_idx // conf.CHECKPOINT_EVERY_STEP
            torch.save(
                {
                    "model_state_dict": net.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "step_idx": step_idx,
                    "best_mean_val": best_mean_val,
                },
                os.path.join(saves_path, f"checkpoint-{idx}.data"),
            )
            torch.save(net, os.path.join(saves_path, f"fullmodel-{idx}.data"))

    print("Training done")
Example No. 19
    agent = ptan.agent.DQNAgent(net, selector, device=device)

exp_source = ptan.experience.ExperienceSourceFirstLast(env,
                                                       agent,
                                                       gamma=params['gamma'],
                                                       steps_count=args.nsteps)

buffer = ptan.experience.ExperienceReplayBuffer(
    exp_source, buffer_size=params['replay_size'])

optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

frame_idx = 0
eval_states = None  # will be populated with held-out states

with common.RewardTracker(writer, params['stop_reward']
                          ) as reward_tracker:  #create a reward tracker object
    while True:
        frame_idx += 1
        # ExperienceReplayBuffer asks the ExperienceSourceFirstLast to iterate by one step to get the next transition
        # ExperienceSourceFirstLast feeds observation to obtain action
        # The agent calculates Q-values through the NN
        # Action selector selects action
        # Action is fed into ExperienceSource to obtain reward and next obs
        # Buffer stores transition in FIFO order
        buffer.populate(1)  # iterates ExperienceReplayBuffer by 1 step.
        # this in turn iterates exp_source [ExperienceSourceFirstLast] by one step
        # one single experience step
        # Experience = namedtuple('Experience', ['state', 'action', 'reward', 'done'])

        # The ExperienceSource class provides full subtrajectories of a given length as a list of (s, a, r, s') objects.
        # ExperienceSourceFirstLast instead returns a single object on every iteration, again a namedtuple with the following fields:
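
To make the comments above concrete, here is a small self-contained sketch of the populate/sample cycle. It is my own illustration (CartPole as a placeholder, ptan's standard replay-buffer API assumed), not part of the example.

import gym
import ptan
import torch.nn as nn

env = gym.make("CartPole-v0")
net = nn.Sequential(nn.Linear(env.observation_space.shape[0], 64),
                    nn.ReLU(),
                    nn.Linear(64, env.action_space.n))
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1.0)  # fully random here
agent = ptan.agent.DQNAgent(net, selector)
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=0.99, steps_count=1)
buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=1000)

buffer.populate(16)        # pulls 16 transitions from exp_source into the FIFO buffer
batch = buffer.sample(4)   # uniform random sample of ExperienceFirstLast namedtuples
for exp in batch:
    # fields: state, action, reward, last_state (None when the episode terminated)
    print(exp.action, exp.reward, exp.last_state is None)
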
Example No. 20
    tgt_net = ptan.agent.TargetNet(net)
    agent = ptan.agent.DQNAgent(lambda x: net.qvals(x),
                                ptan.actions.ArgmaxActionSelector(),
                                device=device)

    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params['gamma'], steps_count=REWARD_STEPS)
    buffer = ptan.experience.PrioritizedReplayBuffer(exp_source,
                                                     params['replay_size'],
                                                     PRIO_REPLAY_ALPHA)
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0
    beta = BETA_START

    with common.RewardTracker(MODEL_NAME, net, writer,
                              params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)
            beta = min(
                1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx):
                    break

            if len(buffer) < params['replay_initial']:
                continue

            optimizer.zero_grad()
    net = AtariA2C(envs[0].observation_space.shape, envs[0].action_space.n)
    if args.cuda:
        net.cuda()
    print(net)

    agent = ptan.agent.ActorCriticAgent(net,
                                        apply_softmax=True,
                                        cuda=args.cuda)
    exp_source = ptan.experience.ExperienceSourceRollouts(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

    optimizer = optim.RMSprop(net.parameters(), lr=LEARNING_RATE, eps=1e-5)

    step_idx = 0

    with common.RewardTracker(writer, stop_reward=18) as tracker:
        with ptan.common.utils.TBMeanTracker(writer,
                                             batch_size=10) as tb_tracker:
            for mb_states, mb_rewards, mb_actions, mb_values in exp_source:
                # handle new rewards
                new_rewards = exp_source.pop_total_rewards()
                if new_rewards:
                    if tracker.reward(np.mean(new_rewards), step_idx):
                        break

                optimizer.zero_grad()
                states_v = Variable(torch.from_numpy(mb_states))
                mb_adv = mb_rewards - mb_values
                adv_v = Variable(torch.from_numpy(mb_adv))
                actions_t = torch.from_numpy(mb_actions)
                vals_ref_v = Variable(torch.from_numpy(mb_rewards))
Example No. 22
    data_proc_list = []
    # Spawn processes to run data_func
    for _ in range(PROCESSES_COUNT):
        data_proc = mp.Process(
            target=data_func,
            args=(net, device,
                  train_queue))  # The processes will run data_func()
        data_proc.start()
        data_proc_list.append(data_proc)

    batch = []
    step_idx = 0

    try:
        with common.RewardTracker(
                writer, stop_reward=REWARD_BOUND
        ) as tracker:  # Run until reward goal reached
            with ptan.common.utils.TBMeanTracker(
                    writer, batch_size=100) as tb_tracker:  # averages tracked values over 100 entries before writing them to TensorBoard
                while True:
                    # Get one transition from the training queue
                    train_entry = train_queue.get()
                    # If the episode is over we will receive the total reward from that episode
                    if isinstance(train_entry, TotalReward):
                        finished, save_checkpoint = tracker.reward(
                            train_entry.reward, step_idx)
                        if save_checkpoint:
                            torch.save(
                                net.state_dict(),
                                './checkpoints/' + args.name + "-best.dat")
                        if finished:
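
For context, the TotalReward entries handled above are put on the queue by the worker processes. The sketch below is a reconstruction of the usual worker (data_func) for this pattern, not this repository's exact code; make_env, GAMMA and REWARD_STEPS are assumed to exist, and PolicyAgent is assumed to accept a device argument.

import collections
import numpy as np
import ptan

TotalReward = collections.namedtuple('TotalReward', field_names='reward')

def data_func(net, device, train_queue):
    # worker process: generate transitions and report finished-episode rewards
    env = make_env()
    agent = ptan.agent.PolicyAgent(lambda x: net(x)[0], device=device, apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

    for exp in exp_source:
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            # an episode just finished: report its total reward to the trainer
            train_queue.put(TotalReward(reward=np.mean(new_rewards)))
        train_queue.put(exp)
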
Example No. 23
        net.cuda()

    tgt_net = ptan.agent.TargetNet(net)
    agent = ptan.agent.DQNAgent(net,
                                ptan.actions.ArgmaxActionSelector(),
                                cuda=args.cuda)

    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params['gamma'], steps_count=1)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params['replay_size'])
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx):
                    break

            if len(buffer) < params['replay_initial']:
                continue

            optimizer.zero_grad()
            batch = buffer.sample(params['batch_size'])
            loss_v = common.calc_loss_dqn(batch,
Example No. 24
    writer = SummaryWriter(comment="-simple-" + args.run)
    net = models.SimpleFFDQN(env.observation_space.shape[0], env.action_space.n).to(device)
    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(EPSILON_START)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, GAMMA, steps_count=REWARD_STEPS)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source, REPLAY_SIZE)
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    # main training loop
    step_idx = 0
    eval_states = None
    best_mean_val = None

    with common.RewardTracker(writer, np.inf, group_rewards=100) as reward_tracker:
        while True:
            step_idx += 1
            buffer.populate(1)
            selector.epsilon = max(EPSILON_STOP, EPSILON_START - step_idx / EPSILON_STEPS)

            new_rewards = exp_source.pop_rewards_steps()
            if new_rewards:
                reward_tracker.reward(new_rewards[0], step_idx, selector.epsilon)

            if len(buffer) < REPLAY_INITIAL:
                continue

            if eval_states is None:
                print("Initial buffer populated, start training")
                eval_states = buffer.sample(STATES_TO_EVALUATE)
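
The eval_states sampled above are usually turned into a monitoring metric by averaging the best Q-value the network assigns to that fixed state set (the role played by calc_values_of_states in the double-DQN example further down). A hedged sketch, assuming eval_states has already been converted into an array of raw observations; the helper name and chunk size are illustrative only.

import numpy as np
import torch


@torch.no_grad()
def mean_best_q(eval_states, net, device="cpu", chunk=64):
    vals = []
    for ofs in range(0, len(eval_states), chunk):
        states_v = torch.as_tensor(np.asarray(eval_states[ofs:ofs + chunk]),
                                   dtype=torch.float32, device=device)
        # Best action value per held-out state
        vals.append(net(states_v).max(dim=1)[0].cpu().numpy())
    return float(np.concatenate(vals).mean())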
Exemplo n.º 25
0
def main():

    env = KukaGymEnv(renders=True, isDiscrete=False, maxSteps=10000000)
    save_path = os.path.join("saves", "ddpg-")
    os.makedirs(save_path, exist_ok=True)

    device = torch.device("cuda")
    act_net = model.DDPGActor(env.observation_space.shape[0],
                              env.action_space.shape[0]).to(device)
    crt_net = model.D4PGCritic(env.observation_space.shape[0],
                               env.action_space.shape[0], N_ATOMS, Vmin,
                               Vmax).to(device)
    print(act_net)
    print(crt_net)
    tgt_act_net = common.TargetNet(act_net)
    tgt_crt_net = common.TargetNet(crt_net)
    writer = SummaryWriter(comment="-d4pg_")
    agent = model.AgentDDPG(act_net, device=device)
    exp_source = experience.ExperienceSourceFirstLast(env,
                                                      agent,
                                                      gamma=GAMMA,
                                                      steps_count=REWARD_STEPS)
    buffer = experience.ExperienceReplayBuffer(exp_source,
                                               buffer_size=REPLAY_SIZE)
    act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE)
    crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE)
    frame_idx = 0
    best_reward = None
    with common.RewardTracker(writer) as tracker:
        with common.TBMeanTracker(writer, batch_size=10) as tb_tracker:
            while True:
                frame_idx += 1
                #print("populate")
                buffer.populate(1)
                rewards_steps = exp_source.pop_rewards_steps()
                #print(rewards_steps)
                if rewards_steps:
                    rewards, steps = zip(*rewards_steps)
                    tb_tracker.track("episode_steps", steps[0], frame_idx)
                    tracker.reward(rewards[0], frame_idx)

                if len(buffer) < 100:
                    continue
                batch = buffer.sample(BATCH_SIZE)
                #print("infer")
                states_v, actions_v, rewards_v, dones_mask, last_states_v = common.unpack_batch_ddqn(
                    batch, device)
                #print("train critic")# train critic
                crt_opt.zero_grad()
                crt_distr_v = crt_net(states_v, actions_v)
                last_act_v = tgt_act_net.target_model(last_states_v)
                last_distr_v = F.softmax(tgt_crt_net.target_model(
                    last_states_v, last_act_v),
                                         dim=1)
                proj_distr_v = distr_projection(last_distr_v,
                                                rewards_v,
                                                dones_mask,
                                                gamma=GAMMA**REWARD_STEPS,
                                                device=device)
                prob_dist_v = -F.log_softmax(crt_distr_v, dim=1) * proj_distr_v
                critic_loss_v = prob_dist_v.sum(dim=1).mean()
                critic_loss_v.backward()
                crt_opt.step()
                tb_tracker.track("loss_critic", critic_loss_v, frame_idx)
                #print("train actor")
                # train actor
                act_opt.zero_grad()
                act_opt.zero_grad()
                cur_actions_v = act_net(states_v)
                crt_distr_v = crt_net(states_v, cur_actions_v)
                actor_loss_v = -crt_net.distr_to_q(crt_distr_v)
                actor_loss_v = actor_loss_v.mean()
                actor_loss_v.backward()
                act_opt.step()
                tb_tracker.track("loss_actor", actor_loss_v, frame_idx)
                tgt_act_net.alpha_sync(alpha=1 - 1e-3)
                tgt_crt_net.alpha_sync(alpha=1 - 1e-3)
                if frame_idx % TEST_ITERS == 0:
                    print("testing")
                    env.reset()
                    ts = time.time()
                    rewards, steps = test_net(act_net, env, device=device)
                    print("Test done in %.2f sec, reward %.3f, steps %d" %
                          (time.time() - ts, rewards, steps))
                    writer.add_scalar("test_reward", rewards, frame_idx)
                    writer.add_scalar("test_steps", steps, frame_idx)
                    if best_reward is None or best_reward < rewards:
                        if best_reward is not None:
                            print("Best reward updated: %.3f -> %.3f" %
                                  (best_reward, rewards))
                            name = "best_%+.3f_%d.dat" % (rewards, frame_idx)
                            fname = os.path.join(save_path, name)
                            torch.save(act_net.state_dict(), fname)
                        best_reward = rewards
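
The actor update above depends on crt_net.distr_to_q to collapse the critic's categorical output over N_ATOMS support points into a scalar Q-value. Below is a minimal sketch of that conversion, assuming an evenly spaced support on [Vmin, Vmax]; it is an illustration of the idea, not the model's actual method.

import torch
import torch.nn.functional as F


def distr_to_q_sketch(distr_logits, v_min, v_max, n_atoms):
    # Q(s, a) = sum_i p_i * z_i, with atoms z_i evenly spaced on [v_min, v_max]
    delta_z = (v_max - v_min) / (n_atoms - 1)
    support = v_min + delta_z * torch.arange(
        n_atoms, dtype=torch.float32, device=distr_logits.device)
    probs = F.softmax(distr_logits, dim=1)
    return (probs * support.unsqueeze(0)).sum(dim=1, keepdim=True)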
Exemplo n.º 26
0
def main():
    global params_save_file

    game = 'spaceinvaders'
    params_save_file += '-' + game

    params = config.HYPERPARAMS[game]
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda")
    parser.add_argument("--double", default=True, action="store_true", help="Enable double DQN")
    args = parser.parse_args()

    env = gym.make(params['env_name'])
    env = ptan.common.wrappers.wrap_dqn(env, skip=params['skip-frames'])

    print("Parameters:")
    print(params)
    sys.stdout.flush()

    writer = SummaryWriter(comment="-" + params['run_name'] + "-double=" + str(args.double))
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    if args.cuda:
        net.cuda()

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=args.cuda)

    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size'])
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0
    eval_states = None

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += params['steps']
            buffer.populate(params['steps'])
            epsilon_tracker.frame(frame_idx)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon):
                    break

            if len(buffer) < params['replay_initial']:
                continue
            if eval_states is None:
                eval_states = buffer.sample(STATES_TO_EVALUATE)
                eval_states = [np.array(transition.state, copy=False) for transition in eval_states]
                eval_states = np.array(eval_states, copy=False)

            optimizer.zero_grad()
            batch = buffer.sample(params['batch_size'] * params['steps'])
            loss_v = calc_loss(batch, net, tgt_net.target_model, gamma=params['gamma'], cuda=args.cuda,
                               double=args.double)
            loss_v.backward()
            optimizer.step()

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()
            if frame_idx % EVAL_EVERY_FRAME == 0:
                mean_val = calc_values_of_states(eval_states, net, cuda=args.cuda)
                writer.add_scalar("values_mean", mean_val, frame_idx)

            if frame_idx % params['save_params_every'] == 0:
                torch.save(net.state_dict(), params_save_file + str(frame_idx))

    torch.save(net.state_dict(), params_save_file + str(frame_idx))
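
The --double flag above selects the double-DQN target inside calc_loss: the online net picks the greedy next action and the target net evaluates it, which reduces the overestimation bias of plain DQN. A minimal sketch of that target computation follows; the helper name and signature are assumptions.

import torch


@torch.no_grad()
def next_state_values(next_states_v, done_mask, net, tgt_net, double=True):
    if double:
        # Double DQN: action selection by the online net, evaluation by the target net
        next_actions = net(next_states_v).argmax(dim=1)
        next_q = tgt_net(next_states_v).gather(
            1, next_actions.unsqueeze(-1)).squeeze(-1)
    else:
        # Vanilla DQN: maximize over the target net's own estimates
        next_q = tgt_net(next_states_v).max(dim=1)[0]
    next_q[done_mask] = 0.0  # no bootstrap from terminal states
    return next_q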
Exemplo n.º 27
0
def play_func(params, net, cuda, exp_queue, device_id):
    """
    The paper suggests sampling actions from the learner net, so this requires little change
    from the multi-env implementation.

    *** The environments are deliberately re-created inside this function: each child
    process needs its own environment instances for the parallelization to work. ***
    """
    run_name = params['run_name']
    if 'max_games' not in params:
        max_games = 16000
    else:
        max_games = params['max_games']

    envSI = gym.make('SpaceInvadersNoFrameskip-v4')
    envSI = ptan.common.wrappers.wrap_dqn(envSI)

    envDA = gym.make('DemonAttackNoFrameskip-v4')
    envDA = ptan.common.wrappers.wrap_dqn(envDA)

    device = torch.device("cuda:{}".format(device_id) if cuda else "cpu")

    writer = SummaryWriter(comment="-" + run_name + "-03_parallel")

    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ExperienceSourceFirstLast_AM([envSI, envDA],
                                              agent,
                                              gamma=params['gamma'],
                                              steps_count=1)
    exp_source_iter = iter(exp_source)

    fh = open('mimic_models/{}_metadata.csv'.format(run_name), 'w')
    out_csv = csv.writer(fh)

    frame_idx = 0
    game_idx = 1
    model_count = 0
    model_stats = []
    mean_rewards = []
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)

            epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                status, num_games, mean_reward, epsilon_str = reward_tracker.reward(
                    new_rewards[0], frame_idx, selector.epsilon)
                mean_rewards.append(mean_reward)
                if status:
                    break
                if game_idx and (game_idx % 500 == 0):
                    # write to disk
                    print("Saving model...")
                    model_name = 'mimic_models/{}_{}.pth'.format(
                        run_name, game_idx)
                    net.to(torch.device('cpu'))
                    torch.save(net, model_name)
                    net.to(device)
                    new_row = [model_name, num_games, mean_reward, epsilon_str]
                    out_csv.writerow(new_row)
                    np.savetxt('mimic_models/{}_reward.txt'.format(run_name),
                               np.array(mean_rewards))
                if game_idx == max_games:
                    break
                game_idx += 1

    print("Saving final model...")
    model_name = 'mimic_models/{}_{}.pth'.format(run_name, game_idx)
    net.to(torch.device('cpu'))
    torch.save(net, model_name)
    net.to(device)
    new_row = [model_name, num_games, mean_reward, epsilon_str]
    out_csv.writerow(new_row)
    np.savetxt('mimic_models/{}_reward.txt'.format(run_name),
               np.array(mean_rewards))
    # (optional) plot mean_rewards over time and save the figure to 'mimic_models/{run_name}_reward.png'
    fh.close()

    exp_queue.put(None)
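
play_func is meant to run in a child process, which is why the environments are created inside it. Below is a minimal sketch of the launcher/trainer side that would consume exp_queue, written under assumptions drawn from the other examples here (queue size, the buffer feeding via ptan's _add, and the training step are placeholders, not the original code).

import torch.multiprocessing as mp


def launch_play_proc(params, net, cuda, buffer):
    # Call this under "if __name__ == '__main__':" when using the spawn start method
    net.share_memory()  # the child process then sees in-place weight updates
    exp_queue = mp.Queue(maxsize=4)
    play_proc = mp.Process(target=play_func,
                           args=(params, net, cuda, exp_queue, 0))
    play_proc.start()

    while True:
        exp = exp_queue.get()
        if exp is None:      # play_func signals that it has finished
            break
        buffer._add(exp)     # feed the trainer-side replay buffer
        # ... sample a batch and run an optimizer step as in the other examples ...

    play_proc.join()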
Exemplo n.º 28
0
    tgt_act_net = ptan.agent.TargetNet(act_net)
    tgt_crt_net = ptan.agent.TargetNet(crt_net)

    writer = SummaryWriter(comment=f"-ddpg_{args.name}")
    agent = dd_utils.AgentDDPG(act_net, device=device, clip_actions=params["clip_actions"])

    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params["gamma"], steps_count=1)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params["replay_size"])

    # the original paper used a separate optimizer for each network
    act_opt = torch.optim.Adam(act_net.parameters(), lr=params["lr"])
    crt_opt = torch.optim.Adam(crt_net.parameters(), lr=params["lr"])

    frame_idx = 0
    best_reward = None
    with common.RewardTracker(writer, stop_reward=params["stopping_reward"]) as tracker:
        with ptan.common.utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
            while True:
                frame_idx += 1
                buffer.populate(1)
                rewards_steps = exp_source.pop_rewards_steps()
                if rewards_steps:
                    rewards, steps = zip(*rewards_steps)
                    tb_tracker.track("episode_steps", steps[0], frame_idx)
                    tracker.reward(rewards[0], frame_idx)

                if len(buffer) < params["replay_init"]:
                    continue

                batch = buffer.sample(params["batch_size"])
                states_v, actions_v, rewards_v, dones_mask, last_states_v = dd_utils.unpack_batch(batch, device)
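
The snippet ends right after unpacking the batch. For reference, here is a sketch of the standard DDPG update that typically follows, written against the names defined above in this example; it shows the generic algorithm and is not necessarily the original file's continuation.

import torch
import torch.nn.functional as F

# critic: regress Q(s, a) towards r + gamma * Q_tgt(s', mu_tgt(s'))
crt_opt.zero_grad()
q_v = crt_net(states_v, actions_v)
with torch.no_grad():
    last_act_v = tgt_act_net.target_model(last_states_v)
    q_last_v = tgt_crt_net.target_model(last_states_v, last_act_v)
    q_last_v[dones_mask] = 0.0  # no bootstrap from terminal states
    q_ref_v = rewards_v.unsqueeze(dim=-1) + q_last_v * params["gamma"]
critic_loss_v = F.mse_loss(q_v, q_ref_v)
critic_loss_v.backward()
crt_opt.step()

# actor: maximize Q(s, mu(s)) by minimizing its negative
act_opt.zero_grad()
cur_actions_v = act_net(states_v)
actor_loss_v = -crt_net(states_v, cur_actions_v).mean()
actor_loss_v.backward()
act_opt.step()

# soft-update the target networks
tgt_act_net.alpha_sync(alpha=1 - 1e-3)
tgt_crt_net.alpha_sync(alpha=1 - 1e-3)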