def run(self):
    self.env = make_env(self.env_name)
    score = 0
    while True:
        action = self.queue.get()
        if action is None:  # shutdown sentinel
            break
        elif action == -1:  # reset
            state = np.array(self.env.reset())
            self.state[self.idx, :, :, :] = (state - self.mean) / self.std
        else:
            lazy_state, reward, done, _ = self.env.step(action)
            state = np.array(lazy_state)
            self.state[self.idx, :, :, :] = (state - self.mean) / self.std
            score += reward
            if done:
                state = np.array(self.env.reset())
                self.state[self.idx, :, :, :] = (state - self.mean) / self.std
                self.channel.put(score)
                score = 0
        # signal the trainer that this worker has handled the command
        self.barrier.put(None)
def run(self):
    self.env = make_env(self.env_name)
    score = 0
    step = 0
    while True:
        action = self.queue.get()
        if action is None:  # shutdown sentinel
            break
        elif action == -1:  # reset
            state = self.env.reset()
            self.state[self.idx] = state
            self.barrier.put(True)
        else:
            step += 1
            state, reward, done, _ = self.env.step(action)
            score += reward
            if done:
                state = self.env.reset()
                self.score_channel.put((score, step))
                score = 0
                step = 0
            # write this worker's transition into the shared buffers
            self.state[self.idx] = state
            self.reward[self.idx] = reward
            self.finished[self.idx] = done
            self.barrier.put(True)
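# The two Worker.run() variants above are driven entirely through queues: the
# trainer pushes one command per worker (-1 to reset, None to shut down, or an
# action index to step) and drains the barrier once per worker before reading
# the shared state/reward/finished arrays. A minimal driver sketch, assuming
# the trainer-side attribute names used in this repo (queues, barrier); treat
# it as illustrative, not the canonical implementation.

def broadcast_and_wait(queues, barrier, actions):
    """Send one action per worker and block until every worker has stepped."""
    for q, a in zip(queues, actions):
        q.put(int(a))
    for _ in queues:
        barrier.get()  # one token per worker signals its step is done

def reset_all(queues, barrier):
    """Ask every worker to reset its env and wait for the shared state update."""
    for q in queues:
        q.put(-1)
    for _ in queues:
        barrier.get()

def shutdown(queues):
    for q in queues:
        q.put(None)  # sentinel: worker breaks out of its loop and exits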
def __init__(self, args):
    tmp_env = make_env(args.env)
    self.obs_shape = tmp_env.observation_space.shape
    self.num_actions = tmp_env.action_space.n
    self.c_in = self.obs_shape[0]
    del tmp_env

    self.horizon = args.horizon
    self.eta = args.eta
    self.epoch = args.epoch
    self.batch_size = args.batch * args.actors
    self.gamma = args.gamma
    self.lam = args.lam
    self.num_actors = args.actors
    self.eps = args.eps
    # how many times to run SGD on the buffer
    self.num_iter = (args.epoch * args.actors * args.horizon) // self.batch_size

    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    self.queues = [Queue() for i in range(self.num_actors)]
    # used as a waiting mechanism, to wait for all the actors to env.step()
    self.barrier = Queue()
    self.score_channel = Queue()

    # these are shmem np.arrays
    self.state, self.reward, self.finished = self.init_shared()

    self.workers = [
        Worker(i, args.env, self.queues[i], self.barrier, self.state,
               self.reward, self.finished, self.score_channel)
        for i in range(self.num_actors)
    ]
    self.start_workers()

    self.model = Policy(self.c_in, self.num_actions).to(self.device)
    self.optim = torch.optim.Adam(self.model.parameters(), lr=self.eta)

    # used for logging and graphing
    self.stat = {
        'scores': [],
        'steps': [],
        'clip_losses': [],
        'value_losses': [],
        'entropies': []
    }
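# The trainer stores gamma and lam, which suggests advantages are computed with
# Generalized Advantage Estimation over each horizon-length rollout. Below is a
# sketch of that computation under the usual conventions (per-step rewards,
# value estimates, and done flags of shape [horizon, num_actors]); the exact
# shapes and helper names in this repo may differ.

import numpy as np

def compute_gae(rewards, values, dones, last_value, gamma, lam):
    """rewards, values, dones: [T, num_actors] arrays; last_value: [num_actors]."""
    rewards = np.asarray(rewards, dtype=np.float32)
    values = np.asarray(values, dtype=np.float32)
    dones = np.asarray(dones, dtype=np.float32)
    T = rewards.shape[0]
    advantages = np.zeros_like(rewards)
    gae = 0.0
    next_value = np.asarray(last_value, dtype=np.float32)
    for t in reversed(range(T)):
        not_done = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_value * not_done - values[t]
        gae = delta + gamma * lam * not_done * gae
        advantages[t] = gae
        next_value = values[t]
    returns = advantages + values
    return returns, advantages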
def __init__(self, env_name, batch_size, gamma, use_random_features):
    self.random = use_random_features
    self.batch_size = batch_size  # batch_size == number of envs
    self.queues = [Queue() for i in range(batch_size)]
    # used to block the Trainer until all envs finish updating
    self.barrier = Queue()
    # envs send their total scores here after each episode
    self.channel = Queue()

    tmp_env = make_env(env_name)
    self.c_in = tmp_env.observation_space.shape[0]
    self.num_actions = tmp_env.action_space.n
    mean, std = self.mean_std_from_random_agent(tmp_env, 10000)

    # sh_state is shared between processes
    self.sh_state = self.init_shared(tmp_env.observation_space.shape)
    self.workers = [
        Worker(i, env_name, self.queues[i], self.barrier, self.channel,
               self.sh_state, mean, std)
        for i in range(batch_size)
    ]
    self.start_workers()

    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.gamma = gamma  # reward discounting factor

    self.model = Policy(self.c_in, self.num_actions).to(self.device)
    self.icm = IntrinsicCuriosityModule(self.c_in, self.num_actions,
                                        self.random).to(self.device)
    self.optim = torch.optim.Adam(
        list(self.model.parameters()) + list(self.icm.parameters()), lr=1e-3)
    self.cross_entropy = torch.nn.CrossEntropyLoss()
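# mean_std_from_random_agent(tmp_env, 10000) presumably estimates observation
# statistics by rolling a uniform-random policy for a fixed number of steps, so
# workers can normalize frames before writing them into shared memory. A hedged
# sketch of such a helper; the repo's own version may aggregate differently
# (e.g. with a running mean instead of buffering all frames).

import numpy as np

def mean_std_from_random_agent(env, num_steps):
    obs_buffer = []
    env.reset()
    for _ in range(num_steps):
        state, _, done, _ = env.step(env.action_space.sample())
        obs_buffer.append(np.array(state))
        if done:
            env.reset()
    obs = np.stack(obs_buffer).astype(np.float32)
    return obs.mean(axis=0), obs.std(axis=0) + 1e-8  # avoid division by zero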
def __init__(self, env, config, train):
    # Class name, used for the save/log paths below
    class_name = type(self).__name__.lower()

    # Gym environment
    self.env = make_env(env)

    # Whether we are in training mode (as opposed to evaluation mode)
    self._train = train

    if train:
        # Hyperparameters
        self.gamma = config.gamma
        self.batch_size = config.batch_size
        self.step_target_update = config.target_update
        self.freq_learning = config.freq_learning
        self.epsilon_decay = config.epsilon_decay
        self.epsilon_start = config.epsilon_start
        self.epsilon_end = config.epsilon_end
        self.num_steps = config.num_steps
        self.start_learning = config.start_learning

        # Experience replay
        self.memory = Memory(config.memory_capacity)

        # Lists to save the rewards
        self.plot_reward = []
        self.plot_eval = []

    # Loss function
    self.__loss_fn = torch.nn.SmoothL1Loss(reduction='mean')

    # Architecture of the neural networks
    self.model = Dense_NN(self.env.observation_space, self.env.action_space.n)
    if train:
        self.qtarget = Dense_NN(self.env.observation_space,
                                self.env.action_space.n)

        # Optimizer for backpropagation
        self.__optimizer = torch.optim.Adam(self.model.parameters(),
                                            lr=config.learning_rate)

    # Run the model on the GPU if available
    if torch.cuda.is_available():
        self.device = torch.device('cuda')
        self.model.cuda()
        if train:
            self.qtarget.cuda()
    else:
        self.device = torch.device('cpu')

    # Paths for the saves
    self.path_log = class_name + '.txt'
    self.path_save = class_name
    self.path_fig = class_name
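# The config carries epsilon_start, epsilon_end and epsilon_decay, which implies
# an annealed epsilon-greedy policy over self.model's Q-values. The exact
# schedule used by this agent is not shown here; a common exponential form is
# sketched below as an assumption, with select_action as a hypothetical helper.

import math
import random
import torch

def epsilon_at(step, eps_start, eps_end, eps_decay):
    """Exponentially anneal epsilon from eps_start toward eps_end."""
    return eps_end + (eps_start - eps_end) * math.exp(-step / eps_decay)

def select_action(model, state, step, num_actions,
                  eps_start, eps_end, eps_decay, device):
    if random.random() < epsilon_at(step, eps_start, eps_end, eps_decay):
        return random.randrange(num_actions)  # explore
    with torch.no_grad():
        state_v = torch.as_tensor(state, dtype=torch.float32,
                                  device=device).unsqueeze(0)
        return int(model(state_v).argmax(dim=1).item())  # exploit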
LR = args.lr
n_step = args.n_step
env_name = args.env
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using ", device)

np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)

if "-ram" in args.env or args.env == "CartPole-v0" or args.env == "LunarLander-v2":
    envs = MultiPro.SubprocVecEnv(
        [lambda: gym.make(args.env) for i in range(args.worker)])
    eval_env = gym.make(args.env)
else:
    envs = MultiPro.SubprocVecEnv(
        [lambda: wrapper.make_env(args.env) for i in range(args.worker)])
    eval_env = wrapper.make_env(args.env)
envs.seed(seed)
eval_env.seed(seed + 1)

action_size = eval_env.action_space.n
state_size = eval_env.observation_space.shape

agent = IQN_Agent(state_size=state_size,
                  action_size=action_size,
                  network=args.agent,
                  munchausen=args.munchausen,
                  layer_size=args.layer_size,
                  n_step=n_step,
                  BATCH_SIZE=BATCH_SIZE,
                  BUFFER_SIZE=BUFFER_SIZE,
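# The IQN_Agent constructor call above is truncated in this listing; the
# remaining keyword arguments are left as in the source. Once the agent exists,
# the usual pattern with a SubprocVecEnv is a frame-based loop that steps all
# workers in lockstep. A sketch under assumed agent methods (act, step) and a
# hypothetical eps_fn schedule; the repo's own training loop may differ.

def run(envs, agent, frames, eps_fn):
    states = envs.reset()
    for frame in range(1, frames + 1):
        eps = eps_fn(frame)
        actions = agent.act(states, eps)
        next_states, rewards, dones, _ = envs.step(actions)
        for s, a, r, ns, d in zip(states, actions, rewards, next_states, dones):
            agent.step(s, a, r, ns, d)  # store transition, learn when ready
        states = next_states  # SubprocVecEnv auto-resets finished workers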
import torch
import numpy as np
from collections import Counter
import time

import wrapper
import dqn_model

### Play the Pong game with a trained DQN agent

LOAD_PATH = './models/pong/400_pong_policy_net.pt'
RENDER = True
FPS = 25

## for playing, first we initialize the env
env = wrapper.make_env("PongNoFrameskip-v4")

## initialize a model
policy_net = dqn_model.DQN(env.observation_space.shape,
                           env.action_space.n).eval()

## load the trained model
# print(torch.load(LOAD_PATH))
policy_net.load_state_dict(torch.load(LOAD_PATH))

## get the initial state
state = env.reset()
state = torch.FloatTensor(state).unsqueeze(0)

total_reward = 0.0
action_count = Counter()

## play the game
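## The play loop itself is not shown above; a sketch of what presumably follows:
## act greedily from the Q-network, render at roughly FPS frames per second,
## and stop when the episode ends. Variable names reuse the ones defined above.

while True:
    start_ts = time.time()
    if RENDER:
        env.render()

    with torch.no_grad():
        q_values = policy_net(state)
    action = int(q_values.argmax(dim=1).item())
    action_count[action] += 1

    new_state, reward, done, _ = env.step(action)
    total_reward += reward
    if done:
        break
    state = torch.FloatTensor(new_state).unsqueeze(0)

    if RENDER:
        # crude frame limiter to keep playback watchable
        delta = 1.0 / FPS - (time.time() - start_ts)
        if delta > 0:
            time.sleep(delta)

print("Total reward: %.2f" % total_reward)
print("Action counts:", action_count)
env.close()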
parser.add_argument(
    "--reward",
    type=float,
    default=MEAN_REWARD_BOUND,
    help="Mean reward boundary to stop training, default=%.2f" %
    MEAN_REWARD_BOUND)
parser.add_argument('--double', default=False, action="store_true")
args = parser.parse_args()

double = args.double
print('Double Q-learning mode: {}'.format('True' if double else 'False'))
print('The target reward: {}'.format(args.reward))

args.cuda = True  # force CUDA regardless of the command line
device = torch.device("cuda" if args.cuda else "cpu")

env = wrapper.make_env(args.env)

net = dqn_model.DQN(env.observation_space.shape,
                    env.action_space.n).to(device)
tgt_net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)
writer = SummaryWriter(comment="-" + args.env)
print(net)

buffer = ExperienceBuffer(REPLAY_SIZE)
agent = Agent(env, buffer)
epsilon = EPSILON_START

optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
total_rewards = []
frame_idx = 0
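# With `double` toggling Double Q-learning, the loss step typically differs only
# in how the next-state value is bootstrapped: the online net picks the argmax
# action and the target net evaluates it. A hedged sketch of such a calc_loss,
# assuming the replay buffer returns (states, actions, rewards, dones,
# next_states) batches and relying on the torch import already present in this
# script; the repo's own function may differ in details.

def calc_loss(batch, net, tgt_net, gamma, double, device):
    states, actions, rewards, dones, next_states = batch

    states_v = torch.as_tensor(states, dtype=torch.float32, device=device)
    next_states_v = torch.as_tensor(next_states, dtype=torch.float32, device=device)
    actions_v = torch.as_tensor(actions, dtype=torch.int64, device=device)
    rewards_v = torch.as_tensor(rewards, dtype=torch.float32, device=device)
    done_mask = torch.as_tensor(dones, dtype=torch.bool, device=device)

    # Q(s, a) for the actions actually taken
    q_sa = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    with torch.no_grad():
        if double:
            # online net selects, target net evaluates
            next_actions = net(next_states_v).argmax(dim=1)
            next_q = tgt_net(next_states_v).gather(
                1, next_actions.unsqueeze(-1)).squeeze(-1)
        else:
            next_q = tgt_net(next_states_v).max(dim=1)[0]
        next_q[done_mask] = 0.0
        target = rewards_v + gamma * next_q

    return torch.nn.functional.mse_loss(q_sa, target)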
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_env(args.env_name, args.seed, args.gamma)

    model = MujocoModel(envs.observation_space.shape[0],
                        envs.action_space.shape[0])
    model.to(device)

    algorithm = PPO(model,
                    args.clip_param,
                    args.value_loss_coef,
                    args.entropy_coef,
                    initial_lr=args.lr,
                    eps=args.eps,
                    max_grad_norm=args.max_grad_norm)

    agent = MujocoAgent(algorithm, device)

    rollouts = RolloutStorage(args.num_steps, envs.observation_space.shape[0],
                              envs.action_space.shape[0])

    obs = envs.reset()
    rollouts.obs[0] = np.copy(obs)

    episode_rewards = deque(maxlen=10)

    num_updates = int(args.num_env_steps) // args.num_steps
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(algorithm.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = agent.sample(
                    rollouts.obs[step])  # why use the obs stored in rollouts here? this seems wrong

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done, then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.append(obs, action, action_log_prob, value, reward, masks,
                            bad_masks)

        with torch.no_grad():
            next_value = agent.value(rollouts.obs[-1])

        value_loss, action_loss, dist_entropy = agent.learn(
            next_value, args.gamma, args.gae_lambda, args.ppo_epoch,
            args.num_mini_batch, rollouts)

        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_steps
            print(
                "Updates {}, num timesteps {},\n Last {} training episodes: "
                "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, "
                "entropy {:.2f}, value loss {:.2f}, action loss {:.2f}\n"
                .format(j, total_num_steps, len(episode_rewards),
                        np.mean(episode_rewards), np.median(episode_rewards),
                        np.min(episode_rewards), np.max(episode_rewards),
                        dist_entropy, value_loss, action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            eval_mean_reward = evaluate(agent, ob_rms, args.env_name,
                                        args.seed, device)
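# evaluate() is referenced above but not shown in this listing. A plausible
# sketch, under the assumptions that the agent exposes a deterministic
# agent.predict(obs), that make_env accepts the same (env_name, seed, gamma)
# signature used in main(), and that the eval env is normalized with the
# training-time ob_rms statistics; the actual helper in the repo may differ.

def evaluate(agent, ob_rms, env_name, seed, device, num_episodes=10):
    eval_env = make_env(env_name, seed + 100, None)
    vec_norm = utils.get_vec_normalize(eval_env)
    if vec_norm is not None:
        vec_norm.eval()            # freeze running statistics
        vec_norm.ob_rms = ob_rms   # reuse training-time normalization

    episode_rewards = []
    obs = eval_env.reset()
    while len(episode_rewards) < num_episodes:
        with torch.no_grad():
            action = agent.predict(obs)
        obs, reward, done, infos = eval_env.step(action)
        for info in infos:
            if 'episode' in info.keys():
                episode_rewards.append(info['episode']['r'])
    eval_env.close()
    return np.mean(episode_rewards)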