Example #1
import argparse

import gym
import ptan
import torch
import torch.nn.functional as F
import torch.optim as optim
# SummaryWriter source is an assumption; torch.utils.tensorboard works equally well
from tensorboardX import SummaryWriter

from lib import common

GAMMA = 0.99
LEARNING_RATE = 0.001
ENTROPY_BETA = 0.01
BATCH_SIZE = 128
NUM_ENVS = 50

BELLMAN_STEPS = 4   # how many steps of the Bellman equation to unroll for the n-step return
CLIP_GRAD = 0.1     # gradient clipping threshold

if __name__ == "__main__":
    common.mkdir('.', 'checkpoints')
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda",
                        default=False,
                        action="store_true",
                        help="Enable cuda")
    parser.add_argument("-n", "--name", required=True, help="Name of the run")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    make_env = lambda: ptan.common.wrappers.wrap_dqn(
        gym.make("PongNoFrameskip-v4"))
    envs = [make_env() for _ in range(NUM_ENVS)]
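    # running NUM_ENVS environments side by side lets each training batch mix
    # transitions from many independent episodes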
    writer = SummaryWriter(comment="-pong-a2c_" + args.name)

    net = common.AtariA2C(envs[0].observation_space.shape,
Example #2
            sum_loss_total += loss
            sum_entropy += entropy
            
            count_steps += 1
    
    # Write to tensorboard output file
    writer.add_scalar("returns", sum_returns / count_steps, frame_idx)
    writer.add_scalar("advantage", sum_advantage / count_steps, frame_idx)
    writer.add_scalar("loss_actor", sum_loss_actor / count_steps, frame_idx)
    writer.add_scalar("loss_critic", sum_loss_critic / count_steps, frame_idx)
    writer.add_scalar("entropy", sum_entropy / count_steps, frame_idx)
    writer.add_scalar("loss_total", sum_loss_total / count_steps, frame_idx)


if __name__ == "__main__":
    mkdir('.', 'checkpoints')
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--name", default=ENV_ID, help="Name of the run")
    args = parser.parse_args()
    writer = SummaryWriter(comment="ppo_" + args.name)
    
    # Autodetect CUDA
    use_cuda = torch.cuda.is_available()
    device   = torch.device("cuda" if use_cuda else "cpu")
    print('Device:', device)
    
    # Prepare environments
    envs = [make_env() for i in range(NUM_ENVS)]   # make multiple envs (ENV_ID) for training
    envs = SubprocVecEnv(envs)   # run the training envs in parallel subprocesses, stepped as one vectorized env
    env = gym.make(ENV_ID)       # make env for testing
    num_inputs  = envs.observation_space.shape[0]
Example #3
        action='store',
        default='%s/reports' % os.path.dirname(os.path.realpath(__file__)),
        required=False)

    ob_group.add_argument('-d', '--debug',
        help='enable full traceback on exceptions',
        action='store_true',
        default=False,
        required=False)

    args = parser.parse_args()

    config = '%s/etc/omnibus.conf' % os.path.dirname(os.path.realpath(__file__))
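    # like the default report directory above, the omnibus.conf path is resolved relative to this script's location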

    output_dir = args.output
    DEBUG = args.debug

    info('Using configuration file (%s) ...' % config)
    info('Debug: %s' % DEBUG)

    if os.path.exists(output_dir):
        if not os.path.isdir(output_dir):
            error('Specified report output location is not a directory; exiting ...')
            sys.exit(1)
    else:
        info('Creating report output directory (%s) ...' % output_dir)
        mkdir(output_dir)

    console = Console()
    console.cmdloop()
Example #4
            obs, reward, done, _ = env.step(action)
            rewards += reward
            steps += 1
            if done:
                break
    return rewards / count, steps / count   # mean reward and mean episode length over the test episodes


def calc_logprob(mu_v, var_v, actions_v):
    # log-probability of the taken actions under a Gaussian policy N(mu_v, var_v);
    # the variance is clamped away from zero to keep the division stable
    p1 = -((mu_v - actions_v)**2) / (2 * var_v.clamp(min=1e-3))
    p2 = -torch.log(torch.sqrt(2 * math.pi * var_v))
    return p1 + p2
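    # Illustrative equivalence (not from the original source): for var_v >= 1e-3
    # this matches torch.distributions.Normal, e.g.
    #   ref = torch.distributions.Normal(mu_v, torch.sqrt(var_v)).log_prob(actions_v)
    #   torch.allclose(calc_logprob(mu_v, var_v, actions_v), ref)  # -> True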


if __name__ == "__main__":
    common.mkdir(".", "checkpoints")
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda",
                        default=False,
                        action="store_true",
                        help="Enable CUDA")
    parser.add_argument("-n", "--name", required=True, help="Name of the run")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    save_path = os.path.join("./checkpoints/", "a2c-" + args.name)
    os.makedirs(save_path, exist_ok=True)

    env = gym.make(ENV_ID)
    test_env = gym.make(ENV_ID)
Example #5
    def Start(self, debugOutputQueue, pauseQueue, fromSavedModel=''):
        mkdir('.', 'checkpoints')
        parser = argparse.ArgumentParser()
        parser.add_argument("-n",
                            "--name",
                            default=self.settings['ENV_NAME'],
                            help="Name of the run")
        args = parser.parse_args()

        # Autodetect CUDA
        use_cuda = torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")
        self.logQueue.put(pprint.pformat('Device:' + device.type))

        # Prepare environments
        envs = RemoteVecEnv(NUM_ENVS)
        num_inputs = envs.observation_space.shape[0]
        num_outputs = envs.action_space.shape[0]

        frame_idx = 0
        train_epoch = 0
        best_reward = None

        self.model = ActorCritic(num_inputs, num_outputs,
                                 HIDDEN_SIZE).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=LEARNING_RATE)
        self.writer = SummaryWriter(comment="ppo_" + args.name)

        if fromSavedModel == '':
            self.logQueue.put('Successfully made 8 remote environments')
            self.logQueue.put(pprint.pformat(self.model))
        else:
            check_point = torch.load(fromSavedModel)
            self.model.load_state_dict(check_point['state_dict'])
            self.optimizer.load_state_dict(check_point['optimizer'])
            train_epoch = check_point['epoch']
            frame_idx = check_point['frame_idx']
            self.logQueue.put('Successfully load model from ' + fromSavedModel)

        state = envs.reset()
        early_stop = False
        save_count = 0

        while not early_stop:
            log_probs = []
            values = []
            states = []
            actions = []
            rewards = []
            masks = []

            for _ in range(PPO_STEPS):
                state = torch.FloatTensor(state).to(device)
                dist, value = self.model(state)

                action = dist.sample()
                # each state, reward, done is a list of results from each parallel environment
                action_exp = action.cpu().numpy()
                action_exp = np.clip(action_exp, -10, 10)
                next_state, reward, done, _ = envs.step(action_exp)
                log_prob = dist.log_prob(action)

                log_probs.append(log_prob)
                values.append(value)
                rewards.append(
                    torch.FloatTensor(reward).unsqueeze(1).to(device))
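                # done flags become masks that zero out bootstrapping across episode boundaries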
                masks.append(
                    torch.FloatTensor(1 - done).unsqueeze(1).to(device))

                states.append(state)
                actions.append(action)

                state = next_state
                frame_idx += 1

                debugData = (next_state, reward, done, action_exp)
                debugOutputQueue.put(debugData)

                # pause control: a single message on pauseQueue holds training
                # here; a second message drains the queue and resumes
                while pauseQueue.qsize() > 0:
                    if pauseQueue.qsize() == 1:
                        time.sleep(1)
                    else:
                        while not pauseQueue.empty():
                            pauseQueue.get()

            next_state = torch.FloatTensor(next_state).to(device)
            _, next_value = self.model(next_state)
            returns = self.compute_gae(next_value, rewards, masks, values)
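            # compute_gae is not shown in this fragment; it is expected to implement
            # GAE(lambda) and return per-step lambda-returns, i.e.
            #   delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
            #   A_t     = delta_t + gamma * lam * mask_t * A_{t+1}
            #   ret_t   = A_t + V(s_t)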

            returns = torch.cat(returns).detach()
            log_probs = torch.cat(log_probs).detach()
            values = torch.cat(values).detach()
            states = torch.cat(states)
            actions = torch.cat(actions)
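            # advantage = lambda-returns minus the value baseline; normalizing it
            # per batch is a common PPO trick to stabilize the update step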
            advantage = returns - values
            advantage = self.normalize(advantage)

            self.ppo_update(frame_idx, states, actions, log_probs, returns,
                            advantage)
            train_epoch += 1

            if train_epoch % TEST_EPOCHS == 0:
                test_reward = np.mean([
                    self.test_env(envs, self.model, device)
                    for _ in range(NUM_TESTS)
                ])
                self.writer.add_scalar("test_rewards", test_reward, frame_idx)
                self.logQueue.put(
                    pprint.pformat('Frame %s. reward: %s' %
                                   (frame_idx, test_reward)))
                # Save a checkpoint every time we achieve a best reward
                if best_reward is None or best_reward < test_reward:
                    if best_reward is not None:
                        self.logQueue.put(
                            pprint.pformat(
                                "Best reward updated: %.3f -> %.3f" %
                                (best_reward, test_reward)))
                        name = "%s_best_%+.3f_%d.dat" % (
                            args.name, test_reward, frame_idx)
                        fname = os.path.join('.', 'checkpoints', name)
                        check_point = {
                            'epoch': train_epoch,
                            'state_dict': self.model.state_dict(),
                            'optimizer': self.optimizer.state_dict(),
                            'frame_idx': frame_idx,
                        }
                        # self.save_ckp(check_point, fname)
                        # torch.save(self.model.state_dict(), fname)
                        torch.save(check_point, fname)
                    best_reward = test_reward
                if test_reward > TARGET_REWARD:
                    early_stop = True

                save_count += 1
                if save_count >= 15:
                    self.logQueue.put(
                        pprint.pformat('Saving checkpoint for frame: ' +
                                       str(frame_idx)))
                    name = "%s_frame_%d.dat" % (args.name, frame_idx)
                    fname = os.path.join('.', 'checkpoints', name)
                    check_point = {
                        'epoch': train_epoch,
                        'state_dict': self.model.state_dict(),
                        'optimizer': self.optimizer.state_dict(),
                        'frame_idx': frame_idx,
                    }
                    torch.save(check_point, fname)
                    save_count = 0
Example #6
                        default=True,
                        action='store_false',
                        dest='visualize',
                        help="Disable visualization of the game play")
    args = parser.parse_args()
    params = PARAMS[args.config]

    if args.config == 'doom':
        import vizdoomgym
        env = make_doom_env(params['env_name'])
    else:
        env = gym.make(params['env_name'])
        env = ptan.common.wrappers.wrap_dqn(env)

    if args.record:
        mkdir('.', args.record)
        env = wrappers.Monitor(env, args.record, force=True)
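        # gym's Monitor wrapper records the rollout to args.record; force=True clears recordings from earlier runs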
    net = model.NoisyDuelingDQN(env.observation_space.shape,
                                env.action_space.n)
    net.load_state_dict(
        torch.load(args.model, map_location=lambda storage, loc: storage))
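    # the map_location above keeps all tensors on CPU, so weights trained on a GPU load on CPU-only machines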

    state = env.reset()
    total_reward = 0.0
    c = collections.Counter()

    while True:
        start_ts = time.time()
        if args.visualize:
            env.render()
        state_v = torch.tensor(