Example #1
def main(to_train, save_path):
    torch.manual_seed(1234)
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda",
                        default=False,
                        action="store_true",
                        help="Enable cuda computation")
    parser.add_argument("--env",
                        default=DEFAULT_ENV_NAME,
                        help="default env name")
    args = parser.parse_args()
    device = torch.device(
        "cuda" if args.cuda and torch.cuda.is_available() else "cpu")

    os.makedirs(save_path, exist_ok=True)
    env = wrappers.make_env(args.env)
    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)
    target_net = dqn_model.DQN(env.observation_space.shape,
                               env.action_space.n).to(device)
    print(net)

    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START

    optimizer = optim.Adam(net.parameters(),
                           lr=LEARNING_RATE)  # only need one optimizer

    if to_train:
        train(env, net, target_net, buffer, agent, optimizer, device,
              save_path)
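
Several of these examples construct an `ExperienceBuffer` and an `Agent` from helper modules that are not shown. As a point of reference, here is a minimal sketch of what such a replay buffer typically looks like (a deque with uniform sampling; the `Experience` tuple and its field names are assumptions, not taken from the original `lib` package):

import collections
import numpy as np

# Hypothetical transition record; the real helper modules may name these fields differently.
Experience = collections.namedtuple(
    'Experience', ['state', 'action', 'reward', 'done', 'new_state'])


class ExperienceBuffer:
    """Fixed-size FIFO replay buffer with uniform random sampling."""

    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        # Draw indices without replacement, then unpack the transitions into arrays.
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, dones, new_states = zip(
            *[self.buffer[idx] for idx in indices])
        return (np.array(states), np.array(actions),
                np.array(rewards, dtype=np.float32),
                np.array(dones, dtype=np.uint8),
                np.array(new_states))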
Example #2
    if not os.path.exists('mimic_models'):
        os.makedirs('mimic_models')

    envSI = gym.make('SpaceInvadersNoFrameskip-v4')
    envSI = ptan.common.wrappers.wrap_dqn(envSI)

    envDA = gym.make('DemonAttackNoFrameskip-v4')
    envDA = ptan.common.wrappers.wrap_dqn(envDA)

    assert envSI.action_space.n == envDA.action_space.n, "Different Action Space Lengths"
    assert envSI.observation_space.shape == envDA.observation_space.shape, "Different Obs. Space Shapes"

    print("Loaded Environments: {}l {}".format(envSI.unwrapped.spec.id,
                                               envDA.unwrapped.spec.id))

    expertSI = dqn_model.DQN(envSI.observation_space.shape,
                             envSI.action_space.n)
    expertSI.load_state_dict(
        torch.load(args.si, map_location=device).state_dict())
    expertSI_hidden = dqn_model.DQN_Hidden(envSI.observation_space.shape,
                                           envSI.action_space.n,
                                           expertSI).to(device)
    expertSI = expertSI.to(device)
    expertSI.eval()
    expertSI_hidden.eval()

    expertDA = dqn_model.DQN(envSI.observation_space.shape,
                             envSI.action_space.n)
    expertDA.load_state_dict(
        torch.load(args.da, map_location=device).state_dict())
    expertDA_hidden = dqn_model.DQN_Hidden(envSI.observation_space.shape,
                                           envSI.action_space.n,
                                           expertDA).to(device)
    expertDA = expertDA.to(device)
    expertDA.eval()
    expertDA_hidden.eval()
Example #3

if __name__ == "__main__":
    mkdir('.', 'checkpoints')
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda")
    parser.add_argument("--env", default=DEFAULT_ENV_NAME,
                        help="Name of the environment, default=" + DEFAULT_ENV_NAME)
    parser.add_argument("--reward", type=float, default=MEAN_REWARD_GOAL,
                        help="Mean reward goal to stop training, default=%.2f" % MEAN_REWARD_GOAL)
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    env = wrappers.make_env(args.env)

    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    tgt_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    writer = SummaryWriter(comment="-" + args.env)
    print(net)

    buffer = ExperienceBuffer(REPLAY_BUFFER_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    total_rewards = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
    best_mean_reward = None
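
Every example builds `dqn_model.DQN(env.observation_space.shape, env.action_space.n)`, but the model class itself is never shown. Below is a sketch of the Nature-DQN style convolutional network such a class usually implements; the exact layer sizes are assumptions:

import torch
import torch.nn as nn


class DQN(nn.Module):
    """Three convolutional layers followed by two fully connected layers."""

    def __init__(self, input_shape, n_actions):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        )
        conv_out_size = self._get_conv_out(input_shape)
        self.fc = nn.Sequential(
            nn.Linear(conv_out_size, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        )

    def _get_conv_out(self, shape):
        # Run a dummy forward pass to infer the flattened size of the conv output.
        out = self.conv(torch.zeros(1, *shape))
        return int(out.numel())

    def forward(self, x):
        conv_out = self.conv(x).view(x.size()[0], -1)
        return self.fc(conv_out)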
Example #4
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", required=True, help="Model file to load")
    parser.add_argument("-e", "--env", default=DEFAULT_ENV_NAME,
                        help="Environment name to use, default=" + DEFAULT_ENV_NAME)
    parser.add_argument("-r", "--record", help="Directory to store video recording")
    parser.add_argument("--no-visualize", default=True, action='store_false', dest='visualize',
                        help="Disable visualization of the game play")
    args = parser.parse_args()

    env = wrappers.make_env(args.env)
    if args.record:
        mkdir('.', args.record)
        env = gym.wrappers.Monitor(env, args.record)
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    net.load_state_dict(torch.load(args.model, map_location=lambda storage, loc: storage))

    state = env.reset()
    total_reward = 0.0
    c = collections.Counter()

    while True:
        start_ts = time.time()
        if args.visualize:
            env.render()
        state_v = torch.tensor(np.array([state], copy=False))
        q_vals = net(state_v).data.numpy()[0]
        action = np.argmax(q_vals)
        c[action] += 1
        state, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    print("Total reward: %.2f" % total_reward)
    print("Action counts:", c)
Example #5
def main(cuda: bool, env_name: str, reward_stop: float, render: bool,
         weights_fn: str, fps: float, epsilon_fixed: float, no_learn: bool):
    device = torch.device("cuda" if cuda else "cpu")
    # create environment
    env: gym.Env = wrappers.make_env(env_name)

    # create both neural networks
    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)
    tgt_net = dqn_model.DQN(env.observation_space.shape,
                            env.action_space.n).to(device)

    if weights_fn:
        assert os.path.isfile(weights_fn), "File {0} does not exist.".format(
            weights_fn)
        state_dict = torch.load(weights_fn, map_location=device)
        net.load_state_dict(state_dict)
        tgt_net.load_state_dict(state_dict)

    # create summary writer for tensorboard
    writer = SummaryWriter(comment="-" + env_name)

    # create buffer and agent and init epsilon
    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer, render=render)

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    total_rewards: List[float] = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
    best_mean_reward: Optional[float] = None

    while True:
        frame_idx += 1
        # update epsilon
        if epsilon_fixed:
            epsilon = epsilon_fixed
        else:
            epsilon = max(EPSILON_FINAL,
                          EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)
        # play one step
        t_step_0 = time.time()
        reward = agent.play_step(net, epsilon, device)
        if fps:
            # throttle to the requested frames per second
            while time.time() - t_step_0 < 1.0 / fps:
                time.sleep(0.01)

        if reward is not None:
            # add reward to total and calculate mean
            total_rewards.append(reward)
            mean_reward = np.mean(total_rewards[-100:])

            # meter speed
            speed = (frame_idx - ts_frame) / (time.time() - ts)
            ts_frame = frame_idx
            ts = time.time()

            # print and write information
            print(
                "%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s"
                % (frame_idx, len(total_rewards), float(mean_reward), epsilon,
                   speed))
            writer.add_scalar("epsilon", epsilon, frame_idx)
            writer.add_scalar("speed", speed, frame_idx)
            writer.add_scalar("reward_100", mean_reward, frame_idx)
            writer.add_scalar("reward", reward, frame_idx)

            if best_mean_reward is None or best_mean_reward < mean_reward:
                torch.save(net.state_dict(), env_name + "-best.dat")
                if best_mean_reward is not None:
                    print(
                        "Best mean reward updated %.3f -> %.3f, model saved" %
                        (best_mean_reward, float(mean_reward)))
                best_mean_reward = float(mean_reward)
            if mean_reward > reward_stop:
                print("Solved in {0} frames!".format(frame_idx))
                break

        if len(buffer) < REPLAY_START_SIZE or no_learn:
            continue

        # sync target net with training net
        if frame_idx % SYNC_TARGET_FRAMES == 0:
            tgt_net.load_state_dict(net.state_dict())

        optimizer.zero_grad()
        batch = buffer.sample(BATCH_SIZE)
        loss_t = calc_loss(batch, net, tgt_net, device=device)
        loss_t.backward()
        optimizer.step()

    writer.close()
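
Example #5 trains with a `calc_loss` helper that is not included above. A minimal sketch of the one-step DQN loss it presumably computes, using the batch layout from the buffer sketch after Example #1 and an assumed discount factor `GAMMA`:

import torch
import torch.nn as nn

GAMMA = 0.99  # assumed discount factor


def calc_loss(batch, net, tgt_net, device="cpu"):
    states, actions, rewards, dones, next_states = batch

    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

    # Q(s, a) for the actions that were actually taken
    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        # max_a' Q_target(s', a'), zeroed for terminal transitions
        next_state_values = tgt_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0

    expected_values = rewards_v + GAMMA * next_state_values
    return nn.MSELoss()(state_action_values, expected_values)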
Example #6
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda")
    parser.add_argument("-n", type=int, default=STEP_COUNT, help="Steps to do on Bellman unroll")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    env = gym.make(params.env_name)
    env = drl.common.wrappers.wrap_dqn(env)
    env.seed(common.SEED)
    input_shape = env.observation_space.shape
    n_actions = env.action_space.n

    selector = dac.EpsilonGreedySelector()
    eps_tracker = dac.EpsilonTracker(selector, params.epsilon_start, params.epsilon_final, params.epsilon_frames)

    net = dqn_model.DQN(input_shape, n_actions).to(device)
    agent = dag.DQNAgent(net, selector, device)
    tgt_net = dag.TargetNet(net)

    buffer = dexp.ReplayBuffer(params.replay_size)
    exp_source = dexp.ExperienceSource(env, agent, buffer, args.n, params.gamma)

    writer = SummaryWriter(comment="-" + params.env_name)
    print(net)

    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)
    total_reward = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
    best_m_reward = None
Example #7
from lib import dqn_model, common


# Select hyperparameters
params       = common.HYPERPARAMS['star_gunner']
total_frames = params['total_frames']
rep_init     = params['replay_initial']
tgt_net_sync = params['target_net_sync']
batch_size   = params['batch_size']
HIST_LENGTH = 4
UPDATE_FREQ = 4

# Initialise environment and use dqn wrappers
env = rl.common.wrappers.make_atari(params['env_name'])
env = rl.common.wrappers.wrap_deepmind(env=env, stack_frames=HIST_LENGTH)
makeDQN = dqn_model.DQN(env.action_space.n)

# Placeholders
state  = tf.placeholder(tf.float32, shape=[None, HIST_LENGTH, 84, 84], name='state')
action = tf.placeholder(tf.float32, shape=[None, env.action_space.n], name='action')
reward = tf.placeholder(tf.float32, shape=[None], name='reward')
done   = tf.placeholder(tf.float32, shape=[None], name='done')
state2 = tf.placeholder(tf.float32, shape=[None, HIST_LENGTH, 84, 84], name='next_state')

# Loss function
net_q, net_vars = makeDQN.create_model(state, name='online')
tgt_q, tgt_vars = makeDQN.create_model(state2, name='target')
q = tf.reduce_sum(net_q * action, axis=1)
max_tgt_q = tf.reduce_max(tgt_q, axis=1)
tgt = reward + (1. - done) * params['gamma'] * max_tgt_q
delta = tf.stop_gradient(tgt) - q
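
Example #7 ends at the TD error `delta`. A plausible continuation in the same TF1 style (assumed, not shown in the original) applies a Huber loss to `delta` and minimizes it over the online network's variables; the learning rate and the target-sync op are assumptions:

# Huber loss on the TD error: quadratic near zero, linear in the tails.
abs_delta = tf.abs(delta)
huber = tf.where(abs_delta < 1.0, 0.5 * tf.square(delta), abs_delta - 0.5)
loss = tf.reduce_mean(huber)

learning_rate = 1e-4  # assumed value; the original presumably reads this from params
train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss, var_list=net_vars)

# Periodically copy the online weights into the target network.
sync_target_ops = [tgt.assign(src) for src, tgt in zip(net_vars, tgt_vars)]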
Example #8
def main():
    global params_save_file

    game = 'spaceinvaders'
    params_save_file += '-' + game

    params = config.HYPERPARAMS[game]
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda")
    args = parser.parse_args()

    env = gym.make(params['env_name'])
    env = ptan.common.wrappers.wrap_dqn(env, skip=params['skip-frames'])

    print("Parameters:")
    print(params)
    sys.stdout.flush()

    writer = SummaryWriter(comment="-" + params['run_name'] + "-prio-replay")
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    if args.cuda:
        net.cuda()

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=args.cuda)

    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'],
                    steps_count=1)
    buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA)
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0
    beta = BETA_START

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += params['steps']
            buffer.populate(params['steps'])
            epsilon_tracker.frame(frame_idx)
            beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                writer.add_scalar("beta", beta, frame_idx)
                if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon, last_dq_losses):
                    break

            if len(buffer) < params['replay_initial']:
                continue

            optimizer.zero_grad()
            batch, batch_indices, batch_weights = buffer.sample(params['batch_size'] * params['steps'], beta)
            loss_v, sample_prios = calc_loss(batch, batch_weights, net, tgt_net.target_model,
                                                params["gamma"], cuda=args.cuda)
            loss_v.backward()
            optimizer.step()
            buffer.update_priorities(batch_indices, sample_prios)

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()

            if frame_idx % params['save_params_every'] == 0:
                torch.save(net.state_dict(), params_save_file + str(frame_idx))

    torch.save(net.state_dict(), params_save_file + str(frame_idx))
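
Example #8's `calc_loss` returns both a weighted loss and new per-sample priorities that feed `buffer.update_priorities`. A sketch of that importance-weighted loss; the batch layout, the `PRIO_EPS` offset and the simplified `device` argument (instead of the `cuda` flag used above) are assumptions:

import torch

PRIO_EPS = 1e-5  # assumed small constant keeping priorities strictly positive


def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"):
    # Batch layout assumed to mirror the earlier buffer sketch.
    states, actions, rewards, dones, next_states = batch

    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)
    weights_v = torch.tensor(batch_weights, dtype=torch.float32).to(device)

    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        next_state_values = tgt_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0
        expected_values = rewards_v + gamma * next_state_values

    # Importance-sampling weights correct the bias introduced by prioritised sampling.
    losses_v = weights_v * (state_action_values - expected_values) ** 2
    return losses_v.mean(), (losses_v + PRIO_EPS).detach().cpu().numpy()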
Example #9
if __name__ == "__main__":
    print('\n\n***********************************************************')
    print("* RELINE model's training on MsPacman game is starting... *")
    print('***********************************************************\n')
    # set the device -> cuda or cpu
    device = "cpu"
    # create the wrapped environment
    env = wrappers.make_env(DEFAULT_ENV_NAME)
    num_actions = 5  # exclude actions: 5 6 7 8
    # 0 -> none
    # 1 -> up
    # 2 -> right
    # 3 -> left
    # 4 -> down
    net = dqn_model.DQN(env.observation_space.shape, num_actions).to(device)
    tgt_net = dqn_model.DQN(env.observation_space.shape,
                            num_actions).to(device)
    print(net)

    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    total_rewards = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
    best_mean_reward = None
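
The `Agent` driving Examples #1, #3, #5 and #9 is likewise defined in the unshown helper modules. A minimal sketch of its epsilon-greedy `play_step`, which stores one transition per call and returns the episode reward only when an episode finishes; it reuses the hypothetical `Experience` tuple from the sketch after Example #1, and omits the `render` flag that Example #5 passes in:

import numpy as np
import torch


class Agent:
    """Plays one environment step per call and stores the transition in the replay buffer."""

    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        self.state = self.env.reset()
        self.total_reward = 0.0

    def play_step(self, net, epsilon=0.0, device="cpu"):
        done_reward = None
        if np.random.random() < epsilon:
            action = self.env.action_space.sample()  # explore
        else:
            state_v = torch.tensor(np.array([self.state], copy=False)).to(device)
            q_vals_v = net(state_v)
            action = int(torch.argmax(q_vals_v, dim=1).item())  # exploit

        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward
        self.exp_buffer.append(
            Experience(self.state, action, reward, is_done, new_state))
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward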
Example #10
File: ddqn.py, Project: stefanzzz22/ptan
def main():
    global params_save_file

    game = 'spaceinvaders'
    params_save_file += '-' + game

    params = config.HYPERPARAMS[game]
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda")
    parser.add_argument("--double", default=True, action="store_true", help="Enable double DQN")
    args = parser.parse_args()

    env = gym.make(params['env_name'])
    env = ptan.common.wrappers.wrap_dqn(env, skip=params['skip-frames'])

    print("Parameters:")
    print(params)
    sys.stdout.flush()

    writer = SummaryWriter(comment="-" + params['run_name'] + "-double=" + str(args.double))
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    if args.cuda:
        net.cuda()

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=args.cuda)

    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size'])
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0
    eval_states = None

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += params['steps']
            buffer.populate(params['steps'])
            epsilon_tracker.frame(frame_idx)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon):
                    break

            if len(buffer) < params['replay_initial']:
                continue
            if eval_states is None:
                eval_states = buffer.sample(STATES_TO_EVALUATE)
                eval_states = [np.array(transition.state, copy=False) for transition in eval_states]
                eval_states = np.array(eval_states, copy=False)

            optimizer.zero_grad()
            batch = buffer.sample(params['batch_size'] * params['steps'])
            loss_v = calc_loss(batch, net, tgt_net.target_model, gamma=params['gamma'], cuda=args.cuda,
                               double=args.double)
            loss_v.backward()
            optimizer.step()

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()
            if frame_idx % EVAL_EVERY_FRAME == 0:
                mean_val = calc_values_of_states(eval_states, net, cuda=args.cuda)
                writer.add_scalar("values_mean", mean_val, frame_idx)

            if frame_idx % params['save_params_every'] == 0:
                torch.save(net.state_dict(), params_save_file + str(frame_idx))

    torch.save(net.state_dict(), params_save_file + str(frame_idx))
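
Example #10 passes `double=args.double` into its `calc_loss`, which is also not shown. A sketch of how the double-DQN branch typically differs from the vanilla one (action selection by the online net, evaluation by the target net); the batch layout and the simplified `device` argument are again assumptions:

import torch
import torch.nn as nn


def calc_loss(batch, net, tgt_net, gamma, device="cpu", double=True):
    states, actions, rewards, dones, next_states = batch

    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        if double:
            # Double DQN: pick a' with the online net, evaluate it with the target net.
            next_actions = net(next_states_v).max(1)[1]
            next_state_values = tgt_net(next_states_v).gather(
                1, next_actions.unsqueeze(-1)).squeeze(-1)
        else:
            # Vanilla DQN: the target net both selects and evaluates a'.
            next_state_values = tgt_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0

    expected_values = rewards_v + gamma * next_state_values
    return nn.MSELoss()(state_action_values, expected_values)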