def run(self):
        self.env = make_env(self.env_name)

        score = 0
        while True:
            action = self.queue.get()
            if action is None:
                break
            elif action == -1: # reset
                state = np.array(self.env.reset())
                self.state[self.idx, :, :, :] = (state - self.mean) / self.std
            else:
                lazy_state, reward, done, _ = self.env.step(action)

                state = np.array(lazy_state)
                self.state[self.idx, :, :, :] = (state - self.mean) / self.std
                
                score += reward
                if done:
                    state = np.array(self.env.reset())
                    self.state[self.idx, :, :, :] = (state - self.mean) / self.std
                    self.channel.put(score)
                    score = 0

            self.barrier.put(None)
    def run(self):
        self.env = make_env(self.env_name)

        score = 0
        step = 0
        while True:
            action = self.queue.get()
            if action is None:
                break
            elif action == -1:  # reset
                state = self.env.reset()
                self.state[self.idx] = state
                self.barrier.put(True)
            else:
                step += 1
                state, reward, done, _ = self.env.step(action)
                score += reward
                if done:
                    state = self.env.reset()
                    self.score_channel.put((score, step))
                    score = 0
                    step = 0

                self.state[self.idx] = state
                self.reward[self.idx] = reward
                self.finished[self.idx] = done

                self.barrier.put(True)
    def __init__(self, args):
        tmp_env = make_env(args.env)
        self.obs_shape = tmp_env.observation_space.shape
        self.num_actions = tmp_env.action_space.n
        self.c_in = self.obs_shape[0]
        del tmp_env

        self.horizon = args.horizon
        self.eta = args.eta
        self.epoch = args.epoch
        self.batch_size = args.batch * args.actors
        self.gamma = args.gamma
        self.lam = args.lam
        self.num_actors = args.actors
        self.eps = args.eps
        self.num_iter = (
            args.epoch * args.actors * args.horizon
        ) // self.batch_size  # how many times to run SGD on the buffer

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.queues = [Queue() for i in range(self.num_actors)]
        self.barrier = Queue(
        )  # This is used as a waiting mechanism, to wait for all the agents to env.step()
        self.score_channel = Queue()

        # these are shmem np.arrays
        self.state, self.reward, self.finished = self.init_shared()

        self.workers = [
            Worker(i, args.env, self.queues[i], self.barrier, self.state,
                   self.reward, self.finished, self.score_channel)
            for i in range(self.num_actors)
        ]
        self.start_workers()

        self.model = Policy(self.c_in, self.num_actions).to(self.device)
        self.optim = torch.optim.Adam(self.model.parameters(), lr=self.eta)

        # used for logging and graphing
        self.stat = {
            'scores': [],
            'steps': [],
            'clip_losses': [],
            'value_losses': [],
            'entropies': []
        }
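The constructor above only wires up the workers; the code that actually drives them is not shown in this snippet. Below is a minimal sketch, assuming the trainer broadcasts one action per worker and then drains the barrier queue once per worker before reading the shared state/reward/finished arrays (the helper name broadcast_and_wait is hypothetical, not from the original):

    def broadcast_and_wait(self, actions):
        # Hypothetical helper: send one action to each worker's queue ...
        for q, a in zip(self.queues, actions):
            q.put(int(a))
        # ... then block until every worker has stepped its env and written
        # its result into the shared state/reward/finished arrays.
        for _ in range(self.num_actors):
            self.barrier.get()

Sending -1 through the same queues (as in the workers' run methods above) would reset every environment before a rollout starts.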
Example #4
    def __init__(self, env_name, batch_size, gamma, use_random_features):

        self.random = use_random_features
        self.batch_size = batch_size  # batch_size == number of envs

        self.queues = [Queue() for i in range(batch_size)]
        self.barrier = Queue(
        )  # use to block Trainer until all envs finish updating
        self.channel = Queue(
        )  # envs send their total scores after each episode

        tmp_env = make_env(env_name)
        self.c_in = tmp_env.observation_space.shape[0]
        self.num_actions = tmp_env.action_space.n
        mean, std = self.mean_std_from_random_agent(tmp_env, 10000)

        # sh_state is shared between processes
        self.sh_state = self.init_shared(tmp_env.observation_space.shape)

        self.workers = [
            Worker(i, env_name, self.queues[i], self.barrier, self.channel,
                   self.sh_state, mean, std) for i in range(batch_size)
        ]
        self.start_workers()

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.gamma = gamma  # reward discounting factor

        self.model = Policy(self.c_in, self.num_actions).to(self.device)
        self.icm = IntrinsicCuriosityModule(self.c_in, self.num_actions,
                                            self.random).to(self.device)

        self.optim = torch.optim.Adam(list(self.model.parameters()) +
                                      list(self.icm.parameters()),
                                      lr=1e-3)
        self.cross_entropy = torch.nn.CrossEntropyLoss()
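mean_std_from_random_agent is called above but not shown. A minimal sketch of such a helper, assuming it steps a uniformly random policy for the requested number of frames and returns the per-pixel mean and standard deviation used by the workers for (state - mean) / std normalization, and assuming numpy is imported as np in that module (this body is an assumption, not the original implementation):

    def mean_std_from_random_agent(self, env, num_steps):
        # Assumed implementation: roll out a random policy and collect
        # observations, then compute their statistics for normalization.
        observations = []
        env.reset()
        for _ in range(num_steps):
            state, _, done, _ = env.step(env.action_space.sample())
            observations.append(np.array(state, dtype=np.float32))
            if done:
                env.reset()
        observations = np.stack(observations)
        # Small epsilon keeps the division well-defined for constant pixels.
        return observations.mean(axis=0), observations.std(axis=0) + 1e-8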
Example #5
    def __init__(self,
                 env,
                 config,
                 train):

        # Class name
        class_name = type(self).__name__.lower()

        # Gym environment
        self.env = make_env(env)

        # Are we in training mode
        self._train = train

        if train:
            # Parameters
            self.gamma = config.gamma
            self.batch_size = config.batch_size
            self.step_target_update = config.target_update
            self.freq_learning = config.freq_learning
            self.epsilon_decay = config.epsilon_decay
            self.epsilon_start = config.epsilon_start
            self.epsilon_end = config.epsilon_end
            self.num_steps = config.num_steps
            self.start_learning = config.start_learning

            # Experience-Replay
            self.memory = Memory(config.memory_capacity)

        # List to save the rewards
        self.plot_reward = []
        self.plot_eval = []

        # Placeholder for the model (built below)
        self.model = None

        # Error function
        self.__loss_fn = torch.nn.SmoothL1Loss(reduction='mean')

        # Architecture of the neural networks
        self.model = Dense_NN(self.env.observation_space,
                              self.env.action_space.n)
        if train:
            self.qtarget = Dense_NN(
                self.env.observation_space, self.env.action_space.n)

        # Backpropagation function
        self.__optimizer = torch.optim.Adam(
            self.model.parameters(), lr=config.learning_rate)

        # Make the model using the GPU if available
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
            self.model.cuda()
            if train:
                self.qtarget.cuda()
        else:
            self.device = torch.device('cpu')

        # Path for the saves
        self.path_log = class_name + '.txt'
        self.path_save = class_name
        self.path_fig = class_name
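The constructor stores an epsilon schedule (epsilon_start, epsilon_end, epsilon_decay) but the action-selection code is not part of this snippet. A minimal sketch of a matching epsilon-greedy policy, assuming an exponential decay and that math, random and torch are imported in the module (the method names and the decay form are assumptions):

    def get_epsilon(self, step):
        # Assumed exponential decay from epsilon_start down to epsilon_end.
        return self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-step / self.epsilon_decay)

    def select_action(self, state, step):
        # Epsilon-greedy over the online network; act greedily in eval mode.
        if self._train and random.random() < self.get_epsilon(step):
            return self.env.action_space.sample()
        with torch.no_grad():
            state = torch.as_tensor(state, dtype=torch.float32,
                                    device=self.device).unsqueeze(0)
            return int(self.model(state).argmax(dim=1).item())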
Example #6
    LR = args.lr
    n_step = args.n_step
    env_name = args.env
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Using ", device)

    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if "-ram" in args.env or args.env == "CartPole-v0" or args.env == "LunarLander-v2":
        envs = MultiPro.SubprocVecEnv(
            [lambda: gym.make(args.env) for i in range(args.worker)])
        eval_env = gym.make(args.env)
    else:
        envs = MultiPro.SubprocVecEnv(
            [lambda: wrapper.make_env(args.env) for i in range(args.worker)])
        eval_env = wrapper.make_env(args.env)
    envs.seed(seed)
    eval_env.seed(seed + 1)

    action_size = eval_env.action_space.n
    state_size = eval_env.observation_space.shape

    agent = IQN_Agent(state_size=state_size,
                      action_size=action_size,
                      network=args.agent,
                      munchausen=args.munchausen,
                      layer_size=args.layer_size,
                      n_step=n_step,
                      BATCH_SIZE=BATCH_SIZE,
                      BUFFER_SIZE=BUFFER_SIZE,
Example #7
import torch
import numpy as np

from collections import Counter
import time

import wrapper
import dqn_model

### Play Pong with a trained DQN agent
LOAD_PATH = './models/pong/400_pong_policy_net.pt'
RENDER = True
FPS = 25

## for playing, first we initialize the env
env = wrapper.make_env("PongNoFrameskip-v4")

## initialize a model
policy_net = dqn_model.DQN(env.observation_space.shape,
                           env.action_space.n).eval()
## load the trained model
#print(torch.load(LOAD_PATH))
policy_net.load_state_dict(torch.load(LOAD_PATH))

## get the initial state
state = env.reset()
state = torch.FloatTensor(state).unsqueeze(0)

total_reward = 0.0
action_count = Counter()
## play the game
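The snippet stops right before the play loop. A minimal sketch of how it would continue, using only names already defined above (the loop body itself is an assumption):

while True:
    start_ts = time.time()
    if RENDER:
        env.render()

    # Greedy action from the trained network.
    with torch.no_grad():
        q_vals = policy_net(state)
    action = int(torch.argmax(q_vals, dim=1).item())
    action_count[action] += 1

    next_state, reward, done, _ = env.step(action)
    total_reward += reward
    if done:
        break
    state = torch.FloatTensor(next_state).unsqueeze(0)

    if RENDER:
        # Cap rendering speed at roughly FPS frames per second.
        delta = 1 / FPS - (time.time() - start_ts)
        if delta > 0:
            time.sleep(delta)

print("Total reward: %.2f" % total_reward)
print("Action counts:", action_count)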
Example #8
    parser.add_argument(
        "--reward",
        type=float,
        default=MEAN_REWARD_BOUND,
        help="Mean reward boundary for stop of training, default=%.2f" %
        MEAN_REWARD_BOUND)
    parser.add_argument('--double', default=False, action="store_true")
    args = parser.parse_args()

    double = args.double
    print('Double Q learning mode: {}'.format('True' if double else 'False'))
    print('The target reward: {}'.format(args.reward))

    args.cuda = torch.cuda.is_available()
    device = torch.device("cuda" if args.cuda else "cpu")
    env = wrapper.make_env(args.env)

    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)
    tgt_net = dqn_model.DQN(env.observation_space.shape,
                            env.action_space.n).to(device)
    writer = SummaryWriter(comment="-" + args.env)
    print(net)

    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    total_rewards = []
    frame_idx = 0
Example #9
File: train.py Project: YuechengLiu/PARL
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_env(args.env_name, args.seed, args.gamma)

    model = MujocoModel(envs.observation_space.shape[0],
                        envs.action_space.shape[0])
    model.to(device)

    algorithm = PPO(model,
                    args.clip_param,
                    args.value_loss_coef,
                    args.entropy_coef,
                    initial_lr=args.lr,
                    eps=args.eps,
                    max_grad_norm=args.max_grad_norm)

    agent = MujocoAgent(algorithm, device)

    rollouts = RolloutStorage(args.num_steps, envs.observation_space.shape[0],
                              envs.action_space.shape[0])

    obs = envs.reset()
    rollouts.obs[0] = np.copy(obs)

    episode_rewards = deque(maxlen=10)

    num_updates = int(args.num_env_steps) // args.num_steps
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(algorithm.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = agent.sample(
                    rollouts.obs[step])  # condition on the observation stored in rollouts for this step

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.append(obs, action, action_log_prob, value, reward, masks,
                            bad_masks)

        with torch.no_grad():
            next_value = agent.value(rollouts.obs[-1])

        value_loss, action_loss, dist_entropy = agent.learn(
            next_value, args.gamma, args.gae_lambda, args.ppo_epoch,
            args.num_mini_batch, rollouts)

        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_steps
            print(
                "Updates {}, num timesteps {},\n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.3f}, value loss {:.3f}, action loss {:.3f}\n"
                .format(j, total_num_steps, len(episode_rewards),
                        np.mean(episode_rewards), np.median(episode_rewards),
                        np.min(episode_rewards), np.max(episode_rewards),
                        dist_entropy, value_loss, action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            eval_mean_reward = evaluate(agent, ob_rms, args.env_name,
                                        args.seed, device)