Example #1
    def __init__(self, env_name: str, worker_id: int, global_model: ActorCriticNetwork, seed: int, T: Value,
                 lr: float = 1e-4, n_steps: int = 0, t_max: int = 100000, gamma: float = .99,
                 tau: float = 1, beta: float = .01, value_loss_coef: float = .5,
                 optimizer: Optimizer = None, is_train: bool = True, use_gae: bool = True,
                 is_discrete: bool = False) -> None:
        """
        Initialize Worker thread for A3C algorithm
        :param use_gae: use Generalize Advantage Estimate
        :param t_max: maximum episodes for training
        :param env_name: gym environment name
        :param worker_id: number of workers
        :param T: global shared counter
        :param optimizer: torch optimizer instance, either shared Optimizer or None for individual
        :param beta: entropy weight factor
        :param tau: TODO hyperparam for GAE
        :param gamma: discount factor
        :param global_model: shared global model to get the parameters from
        :param seed: seed to ensure reproducibility
        :param lr: learning rate for the workers NN
        :param n_steps: amount of steps for training
        :param value_loss_coef: factor for scaling the value loss
        """
        super(Worker, self).__init__()

        self.is_discrete = is_discrete

        # separate env for each worker
        self.env_name = env_name

        # check if the requested environment is a quanser robot env
        if self.env_name in ['CartpoleStabShort-v0']:
            self.env = quanser_robots.GentlyTerminating(gym.make(self.env_name))
        else:
            # use the official gym env as default
            self.env = gym.make(self.env_name)

        # training params
        self.n_steps = n_steps
        self.tau = tau
        self.gamma = gamma
        self.beta = beta
        self.value_loss_coef = value_loss_coef
        self.use_gae = use_gae

        # training and testing params
        self.seed = seed
        self.lr = lr
        self.t_max = t_max
        self.is_train = is_train

        # shared params
        self.optimizer = optimizer
        self.global_model = global_model
        self.worker_id = worker_id
        self.T = T

        # logging instance
        self.logger = logging.getLogger(__name__)
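Usage sketch (not from the original source): a hedged illustration of how this Worker might be constructed and started, assuming Worker is a torch.multiprocessing Process/Thread subclass, that Worker and ActorCriticNetwork are importable from the project, and that ActorCriticNetwork takes the arguments shown in Example #2; the environment name and hyperparameters are placeholders.

import gym
import quanser_robots
import torch.multiprocessing as mp

env_name = 'CartpoleStabShort-v0'          # placeholder environment
env = quanser_robots.GentlyTerminating(gym.make(env_name))

# global model shared between all workers (constructor arguments as in Example #2)
global_model = ActorCriticNetwork(env.observation_space.shape[0],
                                  env.action_space, False)
global_model.share_memory()

T = mp.Value('i', 0)                       # global shared step counter

worker = Worker(env_name=env_name, worker_id=0, global_model=global_model,
                seed=42, T=T, lr=1e-4, n_steps=20, t_max=1000,
                optimizer=None, is_train=True, is_discrete=False)
worker.start()
worker.join()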
Example #2
    def run(self):
        torch.manual_seed(self.seed)
        env = quanser_robots.GentlyTerminating(gym.make(self.env_name))
        # env = gym.make(self.env_name)
        global_model = ActorCriticNetwork(env.observation_space.shape[0],
                                          env.action_space, self.is_discrete)
        global_model.share_memory()

        # TODO
        optimizer = SharedRMSProp(global_model.parameters(), lr=self.lr)
        optimizer.share_memory()

        # start the test worker, which renders the environment to visualize the current progress
        #w = Worker(env_name=self.env_name, worker_id=self.n_worker, global_model=global_model, T=self.T, seed=self.seed,
        #           lr=self.lr, t_max=200, optimizer=None, is_train=False, is_discrete=self.is_discrete)
        w = Worker(env_name=self.env_name,
                   worker_id=self.n_worker,
                   global_model=global_model,
                   T=self.T,
                   seed=self.seed,
                   lr=self.lr,
                   n_steps=0,
                   t_max=200,
                   gamma=.99,
                   tau=1,
                   beta=.01,
                   value_loss_coef=.5,
                   optimizer=None,
                   is_train=False,
                   is_discrete=self.is_discrete)
        w.start()
        self.worker_pool.append(w)

        # start all training workers which update the model parameters
        for wid in range(0, self.n_worker):
            self.logger.info("Worker {} created".format(wid))
            w = Worker(env_name=self.env_name,
                       worker_id=wid,
                       global_model=global_model,
                       T=self.T,
                       seed=self.seed,
                       lr=self.lr,
                       n_steps=20,
                       t_max=1000,
                       gamma=.99,
                       tau=1,
                       beta=.01,
                       value_loss_coef=.5,
                       optimizer=None,
                       is_train=True,
                       is_discrete=self.is_discrete)
            w.start()
            self.worker_pool.append(w)

        for w in self.worker_pool:
            w.join()
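A hedged sketch (not part of the original source) of how the attributes this run() method reads (self.T, self.n_worker, self.worker_pool, self.seed, self.lr, self.is_discrete, self.logger) might be initialized; the class name and defaults are illustrative only.

import logging
from torch.multiprocessing import Value

class A3CTrainer:
    def __init__(self, env_name, n_worker, lr=1e-4, seed=42, is_discrete=False):
        self.env_name = env_name
        self.n_worker = n_worker        # number of training workers
        self.lr = lr
        self.seed = seed
        self.is_discrete = is_discrete
        self.T = Value('i', 0)          # global shared step counter
        self.worker_pool = []           # holds the started Worker instances
        self.logger = logging.getLogger(__name__)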
Example #3
def get_env(env_name, monitor=False):
    if 'RR' in env_name:
        env = quanser_robots.GentlyTerminating(gym.make(env_name))
    else:
        if monitor:
            env = Monitor(gym.make(env_name),
                          'experiments/100_test_runs',
                          video_callable=lambda count: count % 100 == 0,
                          force=True)
        else:
            # use the official gym env as default
            env = gym.make(env_name)
    return env
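Usage sketch (assumption, not from the original source); the environment names are placeholders:

env = get_env('CartpoleStabRR-v0')               # name containing 'RR' -> GentlyTerminating wrapper
sim_env = get_env('Pendulum-v0', monitor=True)   # gym env wrapped in a Monitor, recording every 100th episode

state = sim_env.reset()
state, reward, done, info = sim_env.step(sim_env.action_space.sample())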
Example #4
    def __init__(self, env_name, seed, n_features):
        # general
        self.env_name = env_name
        self.seed = seed
        self.n_features = n_features
        self.noise_var = None

        # env: use the GentlyTerminating wrapper for Quanser real-robot envs,
        # otherwise fall back to the plain gym env
        if 'RR' in self.env_name:
            self.env = quanser_robots.GentlyTerminating(gym.make(self.env_name))
        else:
            self.env = gym.make(self.env_name)
        self.env.seed(self.seed)

        # dynamics model
        # TODO learn length scale by evidence maximization
        self.mgp = MGPR(dim=self.env.observation_space.shape[0])

        self.states = []
        self.actions = []
Example #5
    def run(self):
        """
        Start A3C worker and test thread
        :return:
        """
        torch.manual_seed(self.args.seed)

        if "RR" in self.args.env_name:
            env = quanser_robots.GentlyTerminating(gym.make(
                self.args.env_name))
        else:
            env = gym.make(self.args.env_name)

        optimizer = None
        critic_optimizer = None
        model_critic = None

        if self.args.shared_model:
            model = get_model(env=env,
                              shared=self.args.shared_model,
                              path=self.args.path,
                              T=self.T,
                              global_reward=self.global_reward)
            if not self.args.no_shared_optimizer:
                optimizer = get_shared_optimizer(
                    model=model,
                    optimizer_name=self.args.optimizer,
                    lr=self.args.lr,
                    path=self.args.path)
        else:
            model, model_critic = get_model(env=env,
                                            shared=self.args.shared_model,
                                            path=self.args.path,
                                            T=self.T,
                                            global_reward=self.global_reward)
            if not self.args.no_shared_optimizer:
                optimizer, critic_optimizer = get_shared_optimizer(
                    model=model,
                    optimizer_name=self.args.optimizer,
                    lr=self.args.lr,
                    path=self.args.path,
                    model_critic=model_critic,
                    optimizer_name_critic=self.args.optimizer,
                    lr_critic=self.args.lr_critic)

        lr_scheduler = None
        lr_scheduler_critic = None

        if not self.args.no_shared_optimizer and self.args.lr_scheduler == "exponential":
            lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                                  gamma=0.99)
            if critic_optimizer:
                lr_scheduler_critic = torch.optim.lr_scheduler.ExponentialLR(
                    critic_optimizer, gamma=0.99)

        p = Process(target=test,
                    args=(self.args, self.args.worker, model, self.T,
                          self.global_reward, optimizer, model_critic,
                          critic_optimizer))
        p.start()
        self.worker_pool.append(p)

        if not self.args.test:
            for wid in range(0, self.args.worker):
                p = Process(target=train,
                            args=(self.args, wid, model, self.T,
                                  self.global_reward, optimizer, model_critic,
                                  critic_optimizer, lr_scheduler,
                                  lr_scheduler_critic))
                p.start()
                self.worker_pool.append(p)
                time.sleep(1)

            for p in self.worker_pool:
                p.join()
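A hedged sketch (assumption, not from the original source) of an argparse namespace supplying the fields read from self.args above and from args in the test worker of Example #6; the flag names mirror the attribute accesses, the defaults are illustrative only.

import argparse

parser = argparse.ArgumentParser(description='A3C')
parser.add_argument('--env-name', default='CartpoleStabShort-v0')
parser.add_argument('--worker', type=int, default=4)
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--lr', type=float, default=1e-4)
parser.add_argument('--lr-critic', type=float, default=1e-3)
parser.add_argument('--optimizer', default='rmsprop')
parser.add_argument('--lr-scheduler', default=None)
parser.add_argument('--shared-model', action='store_true')
parser.add_argument('--no-shared-optimizer', action='store_true')
parser.add_argument('--path', default=None, help='checkpoint to resume from')
parser.add_argument('--test', action='store_true')
args = parser.parse_args()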
Example #6
def test(args,
         worker_id: int,
         global_model: torch.nn.Module,
         T: Value,
         global_reward: Value = None,
         optimizer: torch.optim.Optimizer = None,
         global_model_critic: CriticNetwork = None,
         optimizer_critic: torch.optim.Optimizer = None):
    """
    Start worker in _test mode, i.e. no training is done, only testing is used to validate current performance
    loosely based on https://github.com/ikostrikov/pytorch-a3c/blob/master/_test.py
    :param args: console arguments
    :param worker_id: id of worker to differentiatethem and init different seeds
    :param global_model: global model, which is optimized/ for split models: actor
    :param T: global counter of steps
    :param global_reward: global running reward value
    :param optimizer: optimizer for shared model/ for split models: actor model
    :param global_model_critic: optional global critic model for split networks
    :param optimizer_critic: optional critic optimizer for split networks
    :return: None
    """

    logging.info("test worker started.")
    torch.manual_seed(args.seed + worker_id)

    if "RR" in args.env_name:
        env = quanser_robots.GentlyTerminating(gym.make(args.env_name))
    else:
        if args.monitor:
            env = Monitor(gym.make(args.env_name),
                          '100_test_runs',
                          video_callable=lambda count: count % 100 == 0,
                          force=True)
        else:
            env = gym.make(args.env_name)

    env.seed(args.seed + worker_id)

    normalizer = get_normalizer(args.normalizer, env)

    # get an instance of the current global model state
    model = copy.deepcopy(global_model)
    model.eval()

    model_critic = None
    if global_model_critic:
        model_critic = copy.deepcopy(global_model_critic)
        model_critic.eval()

    state = torch.from_numpy(env.reset())

    writer = SummaryWriter(comment='_test', log_dir='experiments/runs/')
    start_time = time.time()

    t = 0
    episode_reward = 0

    done = False
    global_iter = 0
    best_global_reward = -np.inf
    best_test_reward = -np.inf

    while True:

        # Get params from shared global model
        model.load_state_dict(global_model.state_dict())
        if not args.shared_model:
            model_critic.load_state_dict(global_model_critic.state_dict())

        rewards = []
        eps_len = []

        sleep = True

        # perform args.test_runs rollouts to estimate the current average performance
        for i in range(args.test_runs):
            while not done:
                t += 1

                if not args.no_render:
                    if i == 0 and t % 1 == 0 and "RR" not in args.env_name:
                        env.render()
                        if args.monitor and sleep:  # add a small delay to do a screen capture of the test run if needed
                            time.sleep(1)
                            sleep = False

                # apply min/max scaling on the environment

                with torch.no_grad():

                    # select mean of normal dist as action --> Expectation
                    if args.shared_model:
                        _, mu, _ = model(normalizer(state))
                    else:
                        mu, _ = model(normalizer(state))

                    action = mu.detach()

                state, reward, done, _ = env.step(
                    np.clip(action.numpy(), -args.max_action, args.max_action))

                done = done or t >= args.max_episode_length
                episode_reward += reward

                if done:
                    # reset current cumulated reward and episode counter as well as env
                    rewards.append(episode_reward)
                    episode_reward = 0

                    eps_len.append(t)
                    t = 0

                    state = env.reset()

                state = torch.from_numpy(state)

            # reset the done flag so that more than one run can be made
            done = False

        time_print = time.strftime("%Hh %Mm %Ss",
                                   time.gmtime(time.time() - start_time))

        std_reward = np.std(rewards)
        rewards = np.mean(rewards)

        new_best = rewards > best_test_reward
        writer.add_scalar("reward/test", rewards, int(T.value))
        writer.add_scalar("episode/length", np.mean(eps_len), int(T.value))

        log_string = f"Time: {time_print}, T={T.value} -- n_runs={args.test_runs} -- mean total reward={rewards:.5f} " \
            f" +/- {std_reward:.5f} -- mean episode length={np.mean(eps_len):.5f}" \
            f" +/- {np.std(eps_len):.5f} -- global reward={global_reward.value:.5f}"

        if new_best:
            # highlight messages if progress was made
            logging.info(log_string)

            best_global_reward = global_reward.value if global_reward.value > best_global_reward else best_global_reward
            best_test_reward = rewards if rewards > best_test_reward else best_test_reward
            model_type = 'shared' if args.shared_model else 'split'

            save_checkpoint(
                {
                    'epoch':
                    T.value,
                    'model':
                    model.state_dict(),
                    'model_critic':
                    model_critic.state_dict()
                    if model_critic is not None else None,
                    'global_reward':
                    global_reward.value,
                    # only save optimizers if shared ones are used
                    'optimizer':
                    optimizer.state_dict() if optimizer else None,
                    'optimizer_critic':
                    optimizer_critic.state_dict()
                    if optimizer_critic else None,
                },
                path=
                f"./experiments/checkpoints/model_{model_type}_T-{T.value}_global-{global_reward.value:.5f}_test-{rewards:.5f}.pth.tar"
            )
        else:
            # by default only log debug messages if no progress was made
            logging.debug(log_string)

        global_iter += 1

        # run evaluation only once in test mode
        if args.test:
            break
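A hedged sketch (assumption, not part of the original source) of how a checkpoint written by save_checkpoint above could be restored later; the file path is a placeholder and the dictionary keys follow the call above.

import torch

checkpoint = torch.load('./experiments/checkpoints/model_shared_T-1000_global-0.00000_test-0.00000.pth.tar')  # placeholder path
model.load_state_dict(checkpoint['model'])
if checkpoint['model_critic'] is not None:
    model_critic.load_state_dict(checkpoint['model_critic'])
if optimizer is not None and checkpoint['optimizer'] is not None:
    optimizer.load_state_dict(checkpoint['optimizer'])
print("resuming at T =", checkpoint['epoch'], "with global reward", checkpoint['global_reward'])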