Example #1
    def __init__(self, env, params, net=None, reward=None, loss=None):
        self.params = params
        # Avoid shared mutable defaults: fall back to fresh lists.
        self.r_sums = reward if reward is not None else []
        self.l_means = loss if loss is not None else []
        # PARAMS
        self.gamma = params["gamma"]
        self.freq_copy = params["freq_copy"]
        self.tau = params["max_tau"]
        self.tau_decay = params["tau_decay"]
        self.min_tau = params["min_tau"]
        self.exploration = params["exploration"]
        self.sigma = params["sigma"]
        self.alpha = params["alpha"]
        self.m = params["m"]
        self.frame_skip = params["frame_skip"]
        self.target_update_strategy = params["target_update_strategy"]
        self.batch_size = params["batch_size"]
        self.cuda = False
        # NEURAL NETWORK
        self.n_action = env.action_space.n
        self.net = QModel(self.n_action)
        if net is not None:
            self.net.load_state_dict(net)
        self.target = copy.deepcopy(self.net)
        self.optimizer = params["optimizer"](self.net.parameters(),
                                             lr=self.sigma)
        self.criterion = params["criterion"]()
        self.buff = Buffer(params["buffer_size"])

        self.env = wrappers.AtariPreprocessing(env,
                                               frame_skip=self.frame_skip,
                                               screen_size=84,
                                               grayscale_obs=True,
                                               scale_obs=True)
        self.env = wrappers.FrameStack(self.env, self.m)
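A minimal sketch of what the wrapped environment above hands to the agent, assuming a registered NoFrameskip Atari id ("BreakoutNoFrameskip-v4" is only a placeholder here): AtariPreprocessing produces 84x84 grayscale frames scaled to [0, 1], and FrameStack keeps the last m of them.

import gym
import numpy as np
from gym import wrappers

# Placeholder id and m=4; the class above reads these values from `params`.
env = gym.make("BreakoutNoFrameskip-v4")
env = wrappers.AtariPreprocessing(env, frame_skip=4, screen_size=84,
                                  grayscale_obs=True, scale_obs=True)
env = wrappers.FrameStack(env, 4)

obs = env.reset()
print(np.array(obs).shape)  # (4, 84, 84): m stacked, preprocessed frames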
Example #2
def get_env_fn():
    return wrappers.AtariPreprocessing(gym.make(env_id))
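A sketch of one common way a zero-argument factory like this is consumed, assuming env_id names a NoFrameskip Atari environment (the id and the 4-copy count below are illustrative): each callable builds an independent preprocessed environment for a vectorized runner.

import gym
from gym import wrappers
from gym.vector import AsyncVectorEnv

env_id = "PongNoFrameskip-v4"  # placeholder id

def get_env_fn():
    return wrappers.AtariPreprocessing(gym.make(env_id))

# Four independent, preprocessed copies stepped in parallel.
vec_env = AsyncVectorEnv([get_env_fn for _ in range(4)])
obs = vec_env.reset()  # batch of observations, one per copy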
Example #3
    with open(os.path.join(eval_args.path, 'args.json'), 'r') as f:
        train_args = ArgsStruct(**json.load(f))

    env_name = eval_args.env if eval_args.env is not None else train_args.env
    episodes = eval_args.episodes if eval_args.episodes is not None else train_args.optimize_freq

    timestamp = time.strftime("%Y-%m-%d-%H%M")
    log_path = os.path.join(exp_dir, f'eval-{timestamp}.log')
    logging.basicConfig(filename=log_path, level=logging.INFO)
    logging.getLogger('').addHandler(logging.StreamHandler())

    if env_name == 'pong':
        env_id = 'PongNoFrameskip-v0'
        env = gym.make(env_id)
    else:
        raise NotImplementedError(env_name)

    device = torch.device('cuda:0' if not eval_args.no_cuda
                          and torch.cuda.is_available() else 'cpu')

    env = wrappers.FrameStack(wrappers.AtariPreprocessing(env), num_stack=4)

    model = load_agent(train_args, env).to(device)
    model.load_state_dict(
        torch.load(os.path.join(exp_dir, 'checkpoint_best.pt')))

    eval_res = evaluate(model, env, train_args, device, episodes)

    logging.info(pformat(eval_res))
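If checkpoint_best.pt was written on a GPU machine, the torch.load call above can fail on a CPU-only host; a hedged variant of the same load (reusing the exp_dir, model, and device variables from the snippet) remaps the tensors explicitly:

state_dict = torch.load(os.path.join(exp_dir, 'checkpoint_best.pt'),
                        map_location=device)
model.load_state_dict(state_dict)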
Example #4
                        nargs='?',
                        default='Breakout-v0',
                        help='Select the environment to run')
    args = parser.parse_args()

    env = gym.make(args.env_id)
    # AtariPreprocessing rejects frame_skip > 1 unless "NoFrameskip" appears in
    # the spec id, so the id is patched before wrapping.
    env.spec.id += " NoFrameskip"

    # You provide the directory to write to (can be an existing
    # directory, including one with existing data -- all monitor files
    # will be namespaced). You can also dump to a tempdir if you'd
    # like: tempfile.mkdtemp().
    outdir = '/tmp/random-agent-results'
    env = wrappers.Monitor(env, directory=outdir, force=True)
    env = wrappers.AtariPreprocessing(env,
                                      screen_size=84,
                                      frame_skip=4,
                                      grayscale_obs=True)
    env = wrappers.FrameStack(env, 4)
    env.seed(0)

    neural_network = ConvNet()
    target_neural_network = copy.deepcopy(neural_network)
    print(list(neural_network.parameters()))
    criterion = nn.MSELoss()
    optim = torch.optim.SGD(neural_network.parameters(),
                            lr=LEARNING_RATE,
                            momentum=MOMENTUM)
    optim.zero_grad()
    reward = 0
    buffer = deque(maxlen=10000)
    done = False
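The deque above serves as a fixed-size replay buffer. A minimal sketch of how transitions are typically pushed and sampled from such a deque (the function and field names here are illustrative, not part of the example):

import random
from collections import deque

BATCH_SIZE = 32                      # illustrative value
buffer = deque(maxlen=10000)

def push(state, action, reward, next_state, done):
    # Oldest transitions are evicted automatically once maxlen is reached.
    buffer.append((state, action, reward, next_state, done))

def sample_batch():
    # Uniform random minibatch; call only when len(buffer) >= BATCH_SIZE.
    batch = random.sample(buffer, BATCH_SIZE)
    states, actions, rewards, next_states, dones = zip(*batch)
    return states, actions, rewards, next_states, dones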
Example #5
    parser.add_argument('env_id',
                        nargs='?',
                        default='BreakoutNoFrameskip-v4',
                        help='Select the environment to run')
    args = parser.parse_args()

    # You can set the level to logger.DEBUG or logger.WARN if you
    # want to change the amount of output.
    logger.set_level(logger.INFO)

    env = gym.make(args.env_id)
    rewards = []

    #env = wrappers.Monitor(env, force=True)
    #env.seed(0)
    env = wrappers.AtariPreprocessing(env)
    env = wrappers.FrameStack(env, 4)

    #env.seed(0)

    agent = Agent(env)

    episode_count = 200
    reward = 0
    done = False

    for i in range(episode_count):
        ob = env.reset()
        prev_ob = ob
        episode_reward = 0
        print(ob)
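Note that after FrameStack, `print(ob)` shows a LazyFrames wrapper rather than raw pixels. Continuing from the loop above, a small sketch of materializing it before feeding it to a network:

import numpy as np

frames = np.array(ob)          # LazyFrames -> ndarray of shape (4, 84, 84)
print(frames.shape, frames.dtype)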
Example #6
def train(args: argparse.Namespace, env: gym.Env, exp_dir: str):
    seed = args.seed
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)

    device = torch.device(
        'cuda:0' if not args.no_cuda and torch.cuda.is_available() else 'cpu')

    env = wrappers.FrameStack(wrappers.AtariPreprocessing(env),
                              num_stack=args.stacked_frames)

    writer = SummaryWriter(log_dir=exp_dir)
    with open(os.path.join(exp_dir, 'args.json'), 'w') as f:
        json.dump(args.__dict__, f, indent=2)

    logging.info(args)

    n_actions = env.action_space.n

    current_model = load_agent(args, env).to(device)
    current_model.eval()
    target_model = load_agent(args, env).to(device)
    target_model.eval()

    if args.curiosity:
        curiosity = load_icm(args, env).to(device)
        curiosity.eval()

    target_model.load_state_dict(
        current_model.state_dict())  # Sync/update target model

    # rms-prop? https://www.reddit.com/r/reinforcementlearning/comments/ei9p3y/using_rmsprop_over_adam/
    if args.optimizer == 'adam':
        optimizer = optim.Adam(current_model.parameters(),
                               lr=args.lr,
                               weight_decay=args.weight_decay)
        if args.curiosity:
            curiosity_optimizer = optim.Adam(curiosity.parameters(),
                                             lr=args.lr,
                                             weight_decay=args.weight_decay)
    else:
        logging.error('Optimizer not implemented')
        raise NotImplementedError()

    logging.info(current_model)
    if args.curiosity:
        logging.info(curiosity)
    n_params = sum(p.numel() for p in current_model.parameters()
                   if p.requires_grad)
    logging.info(f'Training {n_params} parameters')
    if args.curiosity:
        n_params = sum(p.numel() for p in curiosity.parameters()
                       if p.requires_grad)
        logging.info(f'Training {n_params} parameters')

    # nn.SmoothL1Loss is only used here to validate the flag; optimize() is
    # called with args.criterion directly.
    criterion = nn.SmoothL1Loss if args.criterion == 'huber' else None
    if criterion is None:
        raise NotImplementedError(args.criterion)

    buffer = ReplayBuffer(capacity=args.replay_size, seed=args.seed)

    best_mean_reward = env.reward_range[0]
    updates_without_improvement = 0

    # Adapted from Mario Martin's Notebook
    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 10000
    epsilon_by_episode = lambda e: epsilon_final + (
        epsilon_start - epsilon_final) * math.exp(-1. * e / epsilon_decay)

    if args.curiosity:
        epsilon_by_episode = lambda e: 0.0  # No epsilon needed if curiosity is used

    t0 = time.time()
    all_rewards = []
    all_steps = []
    all_mean_rewards = []
    all_mean_steps = []

    episode_set_rewards = 0.0
    episode_set_curiosity_rewards = 0.0
    episode_set_steps = 0

    updates = 0
    optimizations = 0

    initial_counter = 0

    for episode in range(args.episodes):
        state = env.reset()
        episode_reward = 0.0
        episode_curiosity_reward = 0.0
        steps = 0
        epsilon = epsilon_by_episode(episode)

        while True:
            current_model.eval()
            if args.curiosity:
                curiosity.eval()
            action = current_model.act(
                torch.tensor(transform(
                    state.__array__())).unsqueeze(0).to(device), epsilon,
                torch.rand(1)[0].to(device),
                torch.randint(0, n_actions, (1, ))[0].to(device))
            current_model.train()

            next_state, reward, done, _ = env.step(action)

            episode_reward += reward

            curiosity_reward = None
            if args.curiosity:
                with torch.no_grad():
                    curiosity_reward, _ = \
                        curiosity(torch.tensor(transform(state.__array__())).unsqueeze(0).to(device),
                                  torch.tensor(transform(next_state.__array__())).unsqueeze(0).to(device),
                                  torch.tensor([action]).long().to(device))
                episode_curiosity_reward += curiosity_reward

            buffer.push(
                LazyTransition(
                    state, action, next_state, reward, done,
                    curiosity_reward.cpu().numpy()
                    if curiosity_reward is not None else None))

            if done:
                initial_counter += 1
                writer.add_scalar('Reward/train', episode_reward, episode + 1)
                writer.add_scalar('Steps/train', steps, episode + 1)
                writer.add_scalar('Epsilon/train', epsilon, episode + 1)
                all_rewards.append(episode_reward)
                all_steps.append(steps)
                episode_set_rewards += episode_reward
                episode_set_steps += steps
                if args.curiosity:
                    writer.add_scalar('Curiosity/train',
                                      episode_curiosity_reward, episode + 1)
                    episode_set_curiosity_rewards += episode_curiosity_reward

            state = next_state
            steps += 1

            if args.render:
                env.render()

            if done:
                logging.info(
                    f'Finished episode {episode+1} with reward = {episode_reward:.2f} | steps = {steps+1} | '
                    f'epsilon = {epsilon:.2f}')
                if args.curiosity:
                    logging.info(f'curiosity = {curiosity_reward:.2f}')
                break

        if buffer.full and (
                episode + 1
        ) % args.optimize_freq == 0:  # len(buffer) >= args.batch_size:

            transitions = buffer.sample(args.batch_size)

            if not args.curiosity:
                q_loss, _ = optimize(transitions, current_model, target_model,
                                     optimizer, device, epsilon,
                                     args.criterion)
            else:
                q_loss, curiosity_loss = optimize(transitions, current_model,
                                                  target_model, optimizer,
                                                  device, epsilon,
                                                  args.criterion, curiosity,
                                                  curiosity_optimizer)

            denominator = args.optimize_freq - 1 if optimizations > 0 else initial_counter
            mean_episode_set_rewards = episode_set_rewards / denominator
            mean_episode_set_steps = episode_set_steps / denominator
            writer.add_scalar('Mean-Reward/train', mean_episode_set_rewards,
                              optimizations + 1)
            writer.add_scalar('Mean-Steps/train', mean_episode_set_steps,
                              optimizations + 1)
            writer.add_scalar('Q-Loss/train', q_loss, optimizations + 1)
            if args.curiosity:
                writer.add_scalar('Curiosity-Loss/train', curiosity_loss,
                                  optimizations + 1)
            all_mean_rewards.append(mean_episode_set_rewards)
            all_mean_steps.append(mean_episode_set_steps)
            episode_set_rewards = 0.0
            episode_set_steps = 0

            torch.save(current_model.state_dict(),
                       os.path.join(exp_dir, 'checkpoint_last.pt'))
            if args.curiosity:
                torch.save(
                    curiosity.state_dict(),
                    os.path.join(exp_dir, 'curiosity_checkpoint_last.pt'))

            logging.info(f'Optimized model ({optimizations+1} optimizations)')
            optimizations += 1

            if mean_episode_set_rewards > best_mean_reward:
                updates_without_improvement = 0
                best_mean_reward = mean_episode_set_rewards
                torch.save(current_model.state_dict(),
                           os.path.join(exp_dir, 'checkpoint_best.pt'))
                logging.info(f'NEW: Best mean reward: {best_mean_reward:.2f}')
                if best_mean_reward == env.reward_range[1]:
                    logging.info('Reached max reward')
                    break
            else:
                updates_without_improvement += 1
                logging.info(f'Best mean reward: {best_mean_reward:.2f}')
                if args.early_stop != -1 and updates_without_improvement == args.early_stop:
                    break
            logging.info(
                f'{updates_without_improvement} updates without improvement')

        if buffer.full and (episode + 1) % args.update_target_freq == 0:
            target_model.load_state_dict(current_model.state_dict())
            logging.info(f'Updated target model (updates {updates+1})')
            updates += 1

    t1 = time.time()
    logging.info(f'Finished training in {t1-t0:.1f}s')
    if args.render:
        env.close()
    model = load_agent(args, env).to(device)
    model.load_state_dict(
        torch.load(os.path.join(exp_dir, 'checkpoint_best.pt')))
    eval_res = evaluate(model, env, args, device, episodes=args.optimize_freq)

    logging.info(pformat(eval_res))
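For reference, the exponential schedule defined in train() decays epsilon per episode; a quick standalone check of the values it yields with the constants used above (epsilon_start=1.0, epsilon_final=0.01, epsilon_decay=10000):

import math

epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.01, 10000
epsilon_by_episode = lambda e: epsilon_final + (
    epsilon_start - epsilon_final) * math.exp(-1. * e / epsilon_decay)

for e in (0, 1000, 10000, 30000):
    print(e, round(epsilon_by_episode(e), 3))
# 0 1.0 | 1000 0.906 | 10000 0.374 | 30000 0.059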