Example #1
    def __init__(self, env, params, net=None, reward=None, loss=None):
        self.params = params
        # Avoid mutable default arguments: fall back to fresh lists here.
        self.r_sums = reward if reward is not None else []
        self.l_means = loss if loss is not None else []
        # PARAMS
        self.gamma = params["gamma"]
        self.freq_copy = params["freq_copy"]
        self.tau = params["max_tau"]
        self.tau_decay = params["tau_decay"]
        self.min_tau = params["min_tau"]
        self.exploration = params["exploration"]
        self.sigma = params["sigma"]
        self.alpha = params["alpha"]
        self.m = params["m"]
        self.frame_skip = params["frame_skip"]
        self.target_update_strategy = params["target_update_strategy"]
        self.batch_size = params["batch_size"]
        self.cuda = False
        # NEURAL NETWORK
        self.n_action = env.action_space.n
        self.net = QModel(self.n_action)
        if net is not None:
            self.net.load_state_dict(net)
        self.target = copy.deepcopy(self.net)
        self.optimizer = params["optimizer"](self.net.parameters(),
                                             lr=self.sigma)
        self.criterion = params["criterion"]()
        self.buff = Buffer(params["buffer_size"])

        self.env = wrappers.AtariPreprocessing(env,
                                               frame_skip=self.frame_skip,
                                               screen_size=84,
                                               grayscale_obs=True,
                                               scale_obs=True)
        self.env = wrappers.FrameStack(self.env, self.m)
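A minimal construction sketch for the class above. It assumes the original file's imports (gym, copy, torch, gym.wrappers, QModel, Buffer) and that the class is named Agent; every concrete value in the params dict below is an illustrative guess, not taken from the project.

import torch
import gym

# Hypothetical parameter dictionary: the keys match what __init__ reads above,
# the values are only illustrative.
params = {
    "gamma": 0.99,
    "freq_copy": 1000,
    "max_tau": 1.0,
    "tau_decay": 0.999,
    "min_tau": 0.05,
    "exploration": "softmax",
    "sigma": 1e-4,                        # used as the optimizer learning rate
    "alpha": 0.6,
    "m": 4,                               # number of stacked frames
    "frame_skip": 4,
    "target_update_strategy": "hard",
    "batch_size": 32,
    "optimizer": torch.optim.Adam,        # __init__ instantiates these classes
    "criterion": torch.nn.MSELoss,
    "buffer_size": 100_000,
}

# frame_skip=4 in AtariPreprocessing requires a NoFrameskip ROM variant.
env = gym.make("PongNoFrameskip-v4")
agent = Agent(env, params)                # "Agent" is an assumed class name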
Example #2
    def __init__(self,
                 variant="dueling",
                 use_gpus=False,
                 learning_rate=0.00025,
                 headless=False,
                 epoch_length=50000):
        super().__init__()
        #self.env = gym.make("Pong-v0")
        self.headless = headless
        self.env = gym.make("Pong-v0")
        self.env = wrappers.FrameStack(
            wrappers.ResizeObservation(wrappers.GrayScaleObservation(self.env),
                                       84), 4)

        if self.headless:
            self.virtual_display = Display(visible=0, size=(1400, 900))
            self.virtual_display.start()

        self.env = wrappers.Monitor(self.env,
                                    "./video/Pong-v0",
                                    video_callable=lambda episode_id: True,
                                    force=True)
        self.observation = self.env.reset()
        self.epsilon = 1
        self.discount_factor = 0.99
        self.network = DuelingGameNet(
            self.env.action_space.n,
            use_gpus) if variant == "dueling" else StandardGameNet(
                self.env.action_space.n, use_gpus)
        self.target_network = DuelingGameNet(
            self.env.action_space.n,
            use_gpus) if variant == "dueling" else StandardGameNet(
                self.env.action_space.n, use_gpus)
        self.target_update_rate = 1000
        self.reward = 0
        self.game_reward = 0
        self.games = 0
        self.epoch_length = epoch_length
        self.learning_rate = learning_rate
        self.steps_to_train = 4
        self.last_ten = Memory(10)
        self.variant = variant
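A small standalone check of what the wrapper chain above produces, for the gym version the snippet targets (where Pong-v0, GrayScaleObservation, ResizeObservation and FrameStack are available). This is an added sketch, not part of the original class.

import gym
import numpy as np
from gym import wrappers

env = gym.make("Pong-v0")
env = wrappers.FrameStack(
    wrappers.ResizeObservation(wrappers.GrayScaleObservation(env), 84), 4)

obs = env.reset()
# FrameStack returns LazyFrames; converting to an array exposes the stacked
# shape, expected here to be (4, 84, 84): four 84x84 grayscale frames.
print(np.asarray(obs).shape)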
Example #3
File: train.py (project: nk53/dqn)
print("Using environment:", env_name)
if 'StarGunner' in env_name:
    use_conv = True
    env = gym.wrappers.AtariPreprocessing(env,
                                          frame_skip=4,
                                          grayscale_obs=False,
                                          screen_size=84)
    #env = wrappers.EpisodicLifeEnv(env)
    env = wrappers.GrayScaleObservation(env)
    env = ResizeObservationKeepDims(env, (84, 84))
else:
    use_conv = False

env = wrappers.FrameStack(env, 4, lz4_compress=True)
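# Added note: lz4_compress=True makes FrameStack keep the stacked frames
# LZ4-compressed, trading a little CPU time for a smaller memory footprint.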

if calc_rolling_avg:
    env = RollingMeanReturn(env, window=100)
    env = RecordInfo(env, rolling_avgs_file, ["episode"], overwritefile=False)

import DQN
import GymTrainer as gt

# try to resume training, or start a new training run
try:
    model = DQN.DQNmodel(env.action_space.n,
                         folder, {"minutes": 1},
                         use_convolutions=use_conv)
    replay_mem_file = model.load(backup_folder=folder)
    assert replay_mem_file
Example #4
    with open(os.path.join(eval_args.path, 'args.json'), 'r') as f:
        train_args = ArgsStruct(**json.load(f))

    env_name = eval_args.env if eval_args.env is not None else train_args.env
    episodes = eval_args.episodes if eval_args.episodes is not None else train_args.optimize_freq

    timestamp = time.strftime("%Y-%m-%d-%H%M")
    log_path = os.path.join(exp_dir, f'eval-{timestamp}.log')
    logging.basicConfig(filename=log_path, level=logging.INFO)
    logging.getLogger('').addHandler(logging.StreamHandler())

    if env_name == 'pong':
        env_id = 'PongNoFrameskip-v0'
        env = gym.make(env_id)
    else:
        raise NotImplementedError(env_name)

    device = torch.device('cuda:0' if not eval_args.no_cuda
                          and torch.cuda.is_available() else 'cpu')

    env = wrappers.FrameStack(wrappers.AtariPreprocessing(env), num_stack=4)
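    # Added note: AtariPreprocessing applies its own frame skip (4 by default),
    # which is why the NoFrameskip variant of Pong is selected above.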

    model = load_agent(train_args, env).to(device)
    model.load_state_dict(
        torch.load(os.path.join(exp_dir, 'checkpoint_best.pt')))

    eval_res = evaluate(model, env, train_args, device, episodes)

    logging.info(pformat(eval_res))
Example #5
    args = parser.parse_args()

    env = gym.make(args.env_id)
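    # Added note: appending " NoFrameskip" to the spec id works around
    # AtariPreprocessing's check that frame_skip > 1 is only used with
    # NoFrameskip ROM variants.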
    env.spec.id += " NoFrameskip"

    # You provide the directory to write to (can be an existing
    # directory, including one with existing data -- all monitor files
    # will be namespaced). You can also dump to a tempdir if you'd
    # like: tempfile.mkdtemp().
    outdir = '/tmp/random-agent-results'
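    # The tempdir alternative mentioned above would look like this
    # (added sketch, not in the original):
    #   import tempfile
    #   outdir = tempfile.mkdtemp()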
    env = wrappers.Monitor(env, directory=outdir, force=True)
    env = wrappers.AtariPreprocessing(env,
                                      screen_size=84,
                                      frame_skip=4,
                                      grayscale_obs=True)
    env = wrappers.FrameStack(env, 4)
    env.seed(0)

    neural_network = ConvNet()
    target_neural_network = copy.deepcopy(neural_network)
    print(list(neural_network.parameters()))
    criterion = nn.MSELoss()
    optim = torch.optim.SGD(neural_network.parameters(),
                            lr=LEARNING_RATE,
                            momentum=MOMENTUM)
    optim.zero_grad()
    reward = 0
    buffer = deque(maxlen=10000)
    done = False
    reward_evolution = []
    epsilon_greedy_evolution = []
Example #6
    # You can set the level to logger.DEBUG or logger.WARN if you
    # want to change the amount of output.
    logger.set_level(logger.WARN)

    env = gym.make(args.env_id)

    # You provide the directory to write to (can be an existing
    # directory, including one with existing data -- all monitor files
    # will be namespaced). You can also dump to a tempdir if you'd
    # like: tempfile.mkdtemp().
    outdir = './videos/' + module
    env = wrappers.Monitor(env, directory=outdir, force=True)

    env.seed(0)

    preproc = wrappers.AtariPreprocessing(env)
    preproc = wrappers.FrameStack(preproc, 4)
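    # Added note: FrameStack(..., 4) stacks 4 frames, matching the 4 passed to
    # CNN(4, ...) and AgentAtari(4, ...) below (presumably the input depth).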

    neur = CNN(4, env.action_space.n)
    neur.load_state_dict(torch.load('./trained_networks/' + module + '.n'))
    neur.eval()
    agent = AgentAtari(4, env.action_space.n, False, neur)

    episode_count = 5

    reward = 0
    done = False

    reward_accumulee = 0

    for _ in range(episode_count):
        ob = preproc.reset()
Example #7
def train(args: argparse.Namespace, env: gym.Env, exp_dir: str):
    seed = args.seed
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)

    device = torch.device(
        'cuda:0' if not args.no_cuda and torch.cuda.is_available() else 'cpu')

    env = wrappers.FrameStack(wrappers.AtariPreprocessing(env),
                              num_stack=args.stacked_frames)

    writer = SummaryWriter(log_dir=exp_dir)
    with open(os.path.join(exp_dir, 'args.json'), 'w') as f:
        json.dump(args.__dict__, f, indent=2)

    logging.info(args)

    n_actions = env.action_space.n

    current_model = load_agent(args, env).to(device)
    current_model.eval()
    target_model = load_agent(args, env).to(device)
    target_model.eval()

    if args.curiosity:
        curiosity = load_icm(args, env).to(device)
        curiosity.eval()

    target_model.load_state_dict(
        current_model.state_dict())  # Sync/update target model

    # rms-prop? https://www.reddit.com/r/reinforcementlearning/comments/ei9p3y/using_rmsprop_over_adam/
    if args.optimizer == 'adam':
        optimizer = optim.Adam(current_model.parameters(),
                               lr=args.lr,
                               weight_decay=args.weight_decay)
        if args.curiosity:
            curiosity_optimizer = optim.Adam(curiosity.parameters(),
                                             lr=args.lr,
                                             weight_decay=args.weight_decay)
    else:
        logging.error('Optimizer not implemented')
        raise NotImplementedError()
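    # The "rms-prop?" comment above asks about RMSprop; a hypothetical extra
    # branch (added sketch, not in the original) might look like:
    #
    #   elif args.optimizer == 'rmsprop':
    #       optimizer = optim.RMSprop(current_model.parameters(),
    #                                 lr=args.lr,
    #                                 weight_decay=args.weight_decay)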

    logging.info(current_model)
    if args.curiosity:
        logging.info(curiosity)
    n_params = sum(p.numel() for p in current_model.parameters()
                   if p.requires_grad)
    logging.info(f'Training {n_params} parameters')
    if args.curiosity:
        n_params = sum(p.numel() for p in curiosity.parameters()
                       if p.requires_grad)
        logging.info(f'Training {n_params} parameters')

    criterion = nn.SmoothL1Loss if args.criterion == 'huber' else None
    if criterion is None:
        raise NotImplementedError(args.criterion)

    buffer = ReplayBuffer(capacity=args.replay_size, seed=args.seed)

    best_mean_reward = env.reward_range[0]
    updates_without_improvement = 0

    # Adapted from Mario Martin's Notebook
    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 10000
    epsilon_by_episode = lambda e: epsilon_final + (
        epsilon_start - epsilon_final) * math.exp(-1. * e / epsilon_decay)
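    # Added note: the schedule above decays epsilon exponentially from 1.0
    # towards 0.01 with a time constant of 10000 episodes.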

    if args.curiosity:
        epsilon_by_episode = lambda e: 0.0  # No epsilon needed if curiosity is used

    t0 = time.time()
    all_rewards = []
    all_steps = []
    all_mean_rewards = []
    all_mean_steps = []

    episode_set_rewards = 0.0
    episode_set_curiosity_rewards = 0.0
    episode_set_steps = 0

    updates = 0
    optimizations = 0

    initial_counter = 0

    for episode in range(args.episodes):
        state = env.reset()
        episode_reward = 0.0
        episode_curiosity_reward = 0.0
        steps = 0
        epsilon = epsilon_by_episode(episode)

        while True:
            current_model.eval()
            if args.curiosity:
                curiosity.eval()
            action = current_model.act(
                torch.tensor(transform(
                    state.__array__())).unsqueeze(0).to(device), epsilon,
                torch.rand(1)[0].to(device),
                torch.randint(0, n_actions, (1, ))[0].to(device))
            current_model.train()

            next_state, reward, done, _ = env.step(action)

            episode_reward += reward

            curiosity_reward = None
            if args.curiosity:
                with torch.no_grad():
                    curiosity_reward, _ = \
                        curiosity(torch.tensor(transform(state.__array__())).unsqueeze(0).to(device),
                                  torch.tensor(transform(next_state.__array__())).unsqueeze(0).to(device),
                                  torch.tensor([action]).long().to(device))
                episode_curiosity_reward += curiosity_reward

            buffer.push(
                LazyTransition(
                    state, action, next_state, reward, done,
                    curiosity_reward.cpu().numpy()
                    if curiosity_reward is not None else None))

            if done:
                initial_counter += 1
                writer.add_scalar('Reward/train', episode_reward, episode + 1)
                writer.add_scalar('Steps/train', steps, episode + 1)
                writer.add_scalar('Epsilon/train', epsilon, episode + 1)
                all_rewards.append(episode_reward)
                all_steps.append(steps)
                episode_set_rewards += episode_reward
                episode_set_steps += steps
                if args.curiosity:
                    writer.add_scalar('Curiosity/train',
                                      episode_curiosity_reward, episode + 1)
                    episode_set_curiosity_rewards += episode_curiosity_reward

            state = next_state
            steps += 1

            if args.render:
                env.render()

            if done:
                logging.info(
                    f'Finished episode {episode+1} with reward = {episode_reward:.2f} | steps = {steps+1} | '
                    f'epsilon = {epsilon:.2f}')
                if args.curiosity:
                    logging.info(f'curiosity = {curiosity_reward:.2f}')
                break

        if buffer.full and (
                episode + 1
        ) % args.optimize_freq == 0:  # len(buffer) >= args.batch_size:

            transitions = buffer.sample(args.batch_size)

            if not args.curiosity:
                q_loss, _ = optimize(transitions, current_model, target_model,
                                     optimizer, device, epsilon,
                                     args.criterion)
            else:
                q_loss, curiosity_loss = optimize(transitions, current_model,
                                                  target_model, optimizer,
                                                  device, epsilon,
                                                  args.criterion, curiosity,
                                                  curiosity_optimizer)

            denominator = args.optimize_freq - 1 if optimizations > 0 else initial_counter
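            # Added note: the denominator above is the number of episodes being
            # averaged; before the first optimization it is the count of episodes
            # finished so far (initial_counter), afterwards optimize_freq - 1.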
            mean_episode_set_rewards = episode_set_rewards / denominator
            mean_episode_set_steps = episode_set_steps / denominator
            writer.add_scalar('Mean-Reward/train', mean_episode_set_rewards,
                              optimizations + 1)
            writer.add_scalar('Mean-Steps/train', mean_episode_set_steps,
                              optimizations + 1)
            writer.add_scalar('Q-Loss/train', q_loss, optimizations + 1)
            if args.curiosity:
                writer.add_scalar('Curiosity-Loss/train', curiosity_loss,
                                  optimizations + 1)
            all_mean_rewards.append(mean_episode_set_rewards)
            all_mean_steps.append(mean_episode_set_steps)
            episode_set_rewards = 0.0
            episode_set_steps = 0

            torch.save(current_model.state_dict(),
                       os.path.join(exp_dir, 'checkpoint_last.pt'))
            if args.curiosity:
                torch.save(
                    curiosity.state_dict(),
                    os.path.join(exp_dir, 'curiosity_checkpoint_last.pt'))

            logging.info(f'Optimized model ({optimizations+1} optimizations)')
            optimizations += 1

            if mean_episode_set_rewards > best_mean_reward:
                updates_without_improvement = 0
                best_mean_reward = mean_episode_set_rewards
                torch.save(current_model.state_dict(),
                           os.path.join(exp_dir, 'checkpoint_best.pt'))
                logging.info(f'NEW: Best mean reward: {best_mean_reward:.2f}')
                if best_mean_reward == env.reward_range[1]:
                    logging.info('Reached max reward')
                    break
            else:
                updates_without_improvement += 1
                logging.info(f'Best mean reward: {best_mean_reward:.2f}')
                if args.early_stop != -1 and updates_without_improvement == args.early_stop:
                    break
            logging.info(
                f'{updates_without_improvement} updates without improvement')

        if buffer.full and (episode + 1) % args.update_target_freq == 0:
            target_model.load_state_dict(current_model.state_dict())
            logging.info(f'Updated target model (updates {updates+1})')
            updates += 1

    t1 = time.time()
    logging.info(f'Finished training in {t1-t0:.1f}s')
    if args.render:
        env.close()
    model = load_agent(args, env).to(device)
    # Load the best checkpoint from exp_dir, where it was saved above.
    model.load_state_dict(
        torch.load(os.path.join(exp_dir, 'checkpoint_best.pt')))
    eval_res = evaluate(model, env, args, device, episodes=args.optimize_freq)

    logging.info(pformat(eval_res))