def __init__(self, env, lr=0.8, y=0.95, step_cost=.0, living_cost=.0,
             episode_length=100, memory_capacity=100, batch_size=25,
             eps=0.5, eps_decay=0.999):
    """Build the agent: hyper-parameters, Q-network, loss, optimizer, memory.

    ``eps`` and ``eps_decay`` are forwarded to ``AbstractAgent.__init__``;
    every other argument is stored on the instance under the same name
    (``memory_capacity`` is passed to ``ReplayMemory``).

    Note that ``lr`` is only stored: the Adam optimizer created here uses
    its own fixed learning rate of 0.01.
    """
    AbstractAgent.__init__(self, eps, eps_decay)

    # Environment handle, index of the 's' cell and number of grid cells.
    self.env = env
    self.s0 = env.field.index('s')
    self.state_len = env.width * env.height

    # Hyper-parameters, kept verbatim.
    self.lr, self.y = lr, y
    self.step_cost, self.living_cost = step_cost, living_cost
    self.episode_length = episode_length
    self.batch_size = batch_size

    # Per-run bookkeeping.
    self.rewards = []
    self.losses = []

    # The network takes 2 input features and emits one value per action;
    # both hidden layers are as wide as the number of grid cells.
    hidden_layers = [self.state_len] * 2
    self.nn = Model(in_features=2,
                    hidden=hidden_layers,
                    out_features=len(Agent.actions))
    self.criterion = nn.MSELoss()
    self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.01)

    self.memory = ReplayMemory(memory_capacity)
def __init__(self, env, model, lr=0.8, y=0.95, step_cost=.0, living_cost=.0,
             episode_length=100, memory_capacity=100, batch_size=10,
             eps=0.5, eps_decay=0.999):
    """Store the environment, the externally built model and all settings.

    ``eps`` and ``eps_decay`` go to ``AbstractAgent.__init__``; the other
    arguments are kept on the instance under the same name, except
    ``memory_capacity`` which sizes the ``ReplayMemory``.
    """
    AbstractAgent.__init__(self, eps, eps_decay)

    # Collaborators supplied by the caller.
    self.env, self.model = env, model

    # Hyper-parameters, kept verbatim.
    self.lr, self.y = lr, y
    self.step_cost, self.living_cost = step_cost, living_cost
    self.episode_length = episode_length
    self.batch_size = batch_size

    # Index of the cell marked 's' in the environment field.
    self.s0 = env.field.index('s')

    # Per-run bookkeeping and the replay buffer.
    self.rewards = []
    self.losses = []
    self.memory = ReplayMemory(memory_capacity)
def __init__(self, inputs, n_actions):
    """Create the online/target networks, optimizer and replay memory.

    Both networks are ``DeepQNetwork(inputs, 16, 16, outputNum=n_actions)``;
    the target network starts as a copy of the online one and is switched
    to evaluation mode.
    """
    def build_network():
        return DeepQNetwork(inputs, 16, 16, outputNum=n_actions)

    self.brain = build_network()
    self.target_brain = build_network()

    # Start the target network from the online network's weights.
    initial_weights = self.brain.state_dict()
    self.target_brain.load_state_dict(initial_weights)
    self.target_brain.eval()

    self.set_params()

    # Adam with its default settings, over the online network only.
    self.optimizer = torch.optim.Adam(self.brain.parameters())
    self.memory = ReplayMemory(50000)
    self.action_space = [0, 1]
def __init__(self, env, input_size, output_size, hidden_size, mix_hidden = 32, batch_size = 128, lr = 0.001, gamma = .999, eps_start = 0.9, eps_end = 0.05, eps_decay = 750, replay_capacity = 10000, num_save = 200, num_episodes = 10000, mode="random", training = False, load_file = None):
    """Set up the agent: environment views, networks, memory and optimizer.

    Args:
        env: environment object; must expose ``grid_map`` with ``cars`` and
            ``passengers``. A deep copy is kept as ``orig_env``.
        input_size, output_size, hidden_size: forwarded to ``DQN``.
        mix_hidden: forwarded to ``QMixer``; only used when
            ``mode == "qmix"``.
        batch_size, gamma, eps_start, eps_end, eps_decay, replay_capacity,
            num_save, num_episodes, training: stored unchanged on the
            instance (``replay_capacity`` also sizes the ``ReplayMemory``).
        lr: learning rate of the RMSprop optimizer.
        mode: ``"qmix"`` additionally builds a ``QMixer`` whose parameters
            are optimised together with the policy network. The value is
            also the prefix of the default checkpoint file name.
        load_file: optional checkpoint path. When given, the policy network
            (and, in qmix mode, the mixer from ``"mixer_" + load_file``) is
            restored and switched to evaluation mode.
    """
    self.env = env
    self.orig_env = copy.deepcopy(env)

    # Shortcuts into the environment's grid map.
    self.grid_map = env.grid_map
    self.cars = env.grid_map.cars
    self.num_cars = len(self.cars)
    self.passengers = env.grid_map.passengers
    self.num_passengers = len(self.passengers)

    self.input_size = input_size
    self.output_size = output_size
    self.hidden_size = hidden_size
    self.batch_size = batch_size
    self.gamma = gamma
    self.eps_start = eps_start
    self.eps_end = eps_end
    self.eps_decay = eps_decay
    self.replay_capacity = replay_capacity
    self.num_episodes = num_episodes
    self.steps_done = 0
    self.lr = lr
    self.mode = mode
    self.num_save = num_save
    self.training = training

    self.algorithm = PairAlgorithm()

    self.episode_durations = []
    self.loss_history = []

    self.memory = ReplayMemory(self.replay_capacity)

    # The device is deliberately pinned to CPU (CUDA selection is disabled).
    self.device = torch.device("cpu")
    print("Device being used:", self.device)

    self.policy_net = DQN(self.input_size, self.output_size , self.hidden_size).to(self.device)
    self.params = list(self.policy_net.parameters())

    is_qmix = self.mode == "qmix"
    if is_qmix:
        self.mixer = QMixer(self.input_size, self.num_passengers, mix_hidden).to(self.device)
        self.params += list(self.mixer.parameters())

    if load_file:
        # map_location remaps tensors that were saved on another device
        # (e.g. a CUDA checkpoint) onto the device used here; without it the
        # load fails on a machine that lacks the original device.
        self.policy_net.load_state_dict(
            torch.load(load_file, map_location=self.device))
        self.policy_net.eval()
        if is_qmix:
            self.mixer.load_state_dict(
                torch.load("mixer_" + load_file, map_location=self.device))
            self.mixer.eval()
        self.load_file = "Trained_" + load_file
        print("Checkpoint loaded")
    else:
        self.load_file = self.mode + "_model_num_cars_" + str(self.num_cars) + "_num_passengers_" + str(self.num_passengers) + \
            "_num_episodes_" + str(self.num_episodes) + "_hidden_size_" + str(self.hidden_size) + ".pth"

    self.optimizer = optim.RMSprop(self.params, lr = self.lr)
def __init__(self):
    """Create models, replay memory, loss, optimizer and environment.

    All settings are fixed in the body; nothing is read from a config.
    """
    self.gamma = 0.4
    self.screen_width = 600

    # Policy and target networks.
    self.policy_model = DQN()
    self.target_model = DQN()

    # Replay memory, loss function and optimizer (policy network only).
    self.memory = ReplayMemory()
    self.loss = HuberLoss()
    self.optim = torch.optim.Adam(self.policy_model.parameters(), lr=0.01)

    # Environment. TODO carried over from the original code.
    self.env = PyCar()

    # Counters and training settings.
    self.current_episode = 0
    self.current_iteration = 0
    self.episode_durations = []
    self.batch_size = 250

    # Use CUDA when available; both flags are kept as attributes.
    self.is_cuda = torch.cuda.is_available()
    self.cuda = self.is_cuda
    self.device = torch.device("cuda" if self.cuda else "cpu")

    self.policy_model = self.policy_model.to(self.device)
    self.target_model = self.target_model.to(self.device)
    self.loss = self.loss.to(self.device)

    # The target network starts as a frozen copy of the policy network.
    policy_weights = self.policy_model.state_dict()
    self.target_model.load_state_dict(policy_weights)
    self.target_model.eval()

    # NOTE(review): absolute, machine-specific directory.
    self.savepath = "/home/sk002/Documents/RL-Project/model/"
def testReplayMemory(self):
    """Fill a ReplayMemory with one repeated transition and check batch shapes."""
    obs_dim, act_dim, rew_dim = [84, 84, 4], [8, 10], [5]
    capacity = 10000
    batch = 32

    buffer = ReplayMemory(obs_dim=obs_dim, act_dim=act_dim,
                          r_dim=rew_dim, size=capacity)

    obs = self.get_rand(obs_dim)
    act = self.get_rand(act_dim)
    rew = self.get_rand(rew_dim)
    done = 0

    # The same observation is stored as both state and next state.
    for _ in range(1000):
        buffer.store(obs, act, rew, obs, done)

    obs_batch, act_batch, rew_batch, next_obs_batch, done_batch = buffer.sample(batch)

    checks = (
        (obs_batch, combined_shape(batch, obs_dim)),
        (act_batch, combined_shape(batch, act_dim)),
        (rew_batch, combined_shape(batch, rew_dim)),
        (next_obs_batch, combined_shape(batch, obs_dim)),
        (done_batch, combined_shape(batch)),
    )
    for sampled, expected_shape in checks:
        self.assertEqual(sampled.shape, expected_shape)
def __init__(self, env, p=1.0, lr=0.8, y=0.95, step_cost=.0, living_cost=.0,
             episode_length=100, memory_capacity=100, batch_size=10,
             target_update=10, eps=0.5, eps_decay=0.999):
    """Build the agent with policy/target networks and a stochastic action table.

    ``p`` is the probability that a requested action is the one carried
    out; each of the two alternatives listed for that action in
    ``stochastic_actions`` gets ``(1 - p) / 2``. ``eps`` and ``eps_decay``
    are forwarded to ``AbstractAgent.__init__``; the remaining arguments
    are stored on the instance (``memory_capacity`` sizes the replay
    memory). ``lr`` is only stored: Adam is created with a fixed 0.05.
    """
    AbstractAgent.__init__(self, eps, eps_decay)

    self.env = env
    self.lr, self.y = lr, y
    self.step_cost, self.living_cost = step_cost, living_cost

    # For every requested action: candidate action indices (requested
    # action first) paired with their probabilities.
    side_probability = (1.0 - p) / 2
    candidates = {
        '←': [0, 2, 3],
        '→': [1, 2, 3],
        '↑': [2, 0, 1],
        '↓': [3, 0, 1],
    }
    self.stochastic_actions = {
        arrow: [indices, [p, side_probability, side_probability]]
        for arrow, indices in candidates.items()
    }

    self.s0 = env.field.index('s')
    self.episode_length = episode_length
    self.rewards = []
    self.losses = []
    self.state_len = env.width * env.height

    # Policy and target networks share the same architecture (no hidden
    # layers); the target starts as a copy and stays in evaluation mode.
    n_actions = len(Agent.actions)
    self.nn = Model(in_features=self.state_len, hidden=[],
                    out_features=n_actions)
    self.target_nn = Model(in_features=self.state_len, hidden=[],
                           out_features=n_actions)
    self.target_nn.load_state_dict(self.nn.state_dict())
    self.target_nn.eval()

    self.criterion = nn.MSELoss()
    self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.05)
    self.memory = ReplayMemory(memory_capacity)
    self.batch_size = batch_size
    self.target_update = target_update
class RLAgent(Player):
    """DQN-based player that learns from the move history of other players."""

    def __init__(self, name, others=None, last_n=10, load_path=None,
                 checkpoint=5000, fixed_strategy=False, eps_decay=0.00005):
        """Create networks, replay memory, exploration strategy and optimizer.

        Args:
            name: forwarded to ``Player.__init__``.
            others: keys used to pick the observed players' moves out of
                each history entry; defaults to ``[1, 2]``.
            last_n: size of the recent-history window used for the state.
            load_path: optional checkpoint; when given, the policy network
                is restored, exploration is set to 0 and training is
                switched off.
            checkpoint: a checkpoint is saved whenever the step counter is
                a multiple of this value.
            fixed_strategy: use ``FixedStrategy`` instead of
                ``EpsilonGreedyStrategy``. ``FixedStrategy`` is expected to
                offer ``get_exploration_rate(step)`` like the epsilon-greedy
                strategy does.
            eps_decay: decay passed to ``EpsilonGreedyStrategy``.
        """
        if others is None:
            others = [1, 2]
        self.others = others
        self.last_n = last_n
        self.prev_points = 0
        self.batch_size = 32
        self.gamma = 0.9
        self.eps_start = 1
        self.eps_end = 0.01
        self.eps_decay = eps_decay
        self.target_update = 100
        self.plot_at = 1000
        self.q_max = []
        self.q_list = []
        self.checkpoint = checkpoint
        self.memory_size = 1000
        self.lr = 0.00001
        self.train = True
        self.input_dim = len(others) * 6
        self.output_dim = 3
        self.current_step = 1
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.memory = ReplayMemory(self.memory_size)

        # Policy and target networks; the target starts as a frozen copy.
        self.policy_net = DQN(self.input_dim, self.output_dim).to(self.device)
        self.target_net = DQN(self.input_dim, self.output_dim).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        if load_path is not None:
            # A separate local name keeps the ``checkpoint`` argument intact;
            # map_location lets a CUDA-saved checkpoint load on a CPU host.
            saved = torch.load(load_path, map_location=self.device)
            self.policy_net.load_state_dict(saved['model_state_dict'])
            self.policy_net.eval()
            self.eps_start = 0
            self.eps_end = 0
            self.train = False

        # Previously the fixed strategy was always overwritten by the
        # epsilon-greedy one, so ``fixed_strategy`` had no effect.
        if fixed_strategy:
            self.strategy = FixedStrategy()
        else:
            self.strategy = EpsilonGreedyStrategy(self.eps_start,
                                                  self.eps_end,
                                                  self.eps_decay)

        self.optimizer = optim.Adam(params=self.policy_net.parameters(),
                                    lr=self.lr)
        self.loss = None

        # Pieces of the transition that is pushed to replay memory.
        self.prev_state = None
        self.action = None
        self.reward = None
        self.current_state = None
        super().__init__(name)

    def select_action(self, valid_actions, history):
        """Optionally train, record the last transition, and pick an action.

        While ``history`` has at most ``last_n + 1`` entries a uniformly
        random valid action is returned and nothing is stored.
        """
        if self.memory.can_provide_sample(self.batch_size) and self.train:
            self.train_model()

        if len(history) <= self.last_n + 1:
            return np.random.choice(valid_actions)

        self.prev_state, self.current_state = self.get_states(history)
        self.reward = self.get_reward()
        # The stored action is the one chosen on the previous call.
        if self.action is not None and self.train:
            self.memory.push(
                Experience(self.prev_state, self.action, self.current_state,
                           self.reward))
        self.action = self.get_action(valid_actions)
        return self.action.item()

    def get_states(self, history):
        """Return ``(prev_state, current_state)`` tensors built from history.

        For every observed player the state is ``get_policy`` over the whole
        history followed by ``get_policy`` over the last ``last_n`` moves;
        the previous state is the same with the newest move left out. Each
        tensor gets one extra leading dimension.
        """
        prev_state, current_state = [], []
        if len(history) > self.last_n + 1:
            for other in self.others:
                other_history = [entry[other] for entry in history]
                recent = other_history[-self.last_n:]
                recent_prev = other_history[-self.last_n - 1:-1]
                prev_state.extend(get_policy(other_history[:-1])
                                  + get_policy(recent_prev))
                current_state.extend(get_policy(other_history)
                                     + get_policy(recent))
        return (torch.as_tensor(prev_state).unsqueeze(-2),
                torch.as_tensor(current_state).unsqueeze(-2))

    def get_reward(self):
        """Return the points gained since the previous call as a 1-element tensor."""
        reward = self.points - self.prev_points
        self.prev_points = self.points
        return torch.tensor([reward])

    def get_action(self, valid_actions):
        """Choose an action for ``self.current_state``.

        With probability given by the strategy's exploration rate a random
        valid action is taken; otherwise the arg-max of the policy network.
        The step counter is advanced on every call.
        """
        rate = self.strategy.get_exploration_rate(self.current_step)
        self.current_step += 1
        if rate > random.random():
            # Explore: pick uniformly among the valid actions.
            action = np.random.choice(valid_actions)
            return torch.tensor([action]).to(self.device)

        # Exploit: one forward pass serves both the Q-max log and the
        # action choice. The state is built on CPU, so move it to the
        # network's device first.
        with torch.no_grad():
            q_values = self.policy_net(self.current_state.to(self.device))
            self.q_max.append(q_values.max().item())
            return q_values.max(1)[1].to(self.device)

    def train_model(self):
        """Run one optimisation step on a sampled batch.

        Depending on the step counter this also refreshes the target
        network, prints/plots progress and saves a checkpoint. It relies on
        the module-level ``q_max_list`` and ``checkpoint_*`` names.
        """
        experiences = self.memory.sample(self.batch_size)
        states, actions, rewards, next_states = extract_tensors(experiences)

        if self.current_step % self.target_update == 0:
            print('UPDATE TARGET NET', self.current_step)
            self.q_list.extend(self.q_max)
            mean_q_max = sum(self.q_max) / self.target_update
            print('Q Max', mean_q_max)
            q_max_list.append(mean_q_max)
            self.q_max = []
            self.target_net.load_state_dict(self.policy_net.state_dict())

        if self.current_step % self.plot_at == 0:
            # Summarise the 100 most recent experiences.
            recent = self.memory.memory[-100:]
            batch = Experience(*zip(*recent))
            print('\n', '*' * 42)
            print('EXPLORATION RATE',
                  self.strategy.get_exploration_rate(self.current_step))
            print('REWARD', sum(batch.reward).item())
            print('POLICY', get_policy([i.item() for i in batch.action]))
            print('*' * 42, '\n')
            plt.plot(range(len(q_max_list)), q_max_list)
            plt.show()

        if self.current_step % self.checkpoint == 0:
            print('SAVE CHECKPOINT AT', self.current_step)
            checkpoint_path = checkpoint_folder + checkpoint_prefix + str(
                self.current_step) + checkpoint_suffix
            torch.save({'model_state_dict': self.policy_net.state_dict()},
                       checkpoint_path)

        current_q_values = QValues.get_current(self.policy_net, states,
                                               actions)
        next_q_values = QValues.get_next(self.policy_net, self.target_net,
                                         next_states)
        target_q_values = (next_q_values * self.gamma) + rewards

        self.loss = F.mse_loss(current_q_values, target_q_values)
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
# Initialize environment and config.
env = gym.make(args.env)
env_config = ENV_CONFIGS[args.env]
env = gym.wrappers.AtariPreprocessing(env, screen_size=84, grayscale_obs=True, frame_skip=1, noop_max=30, scale_obs=True)

# Initialize deep Q-networks.
dqn = DQN(env_config=env_config).to(device)

# Create the target Q-network and initialise it from the online network, so
# that both start from identical weights instead of two independent random
# initialisations. It is only used for inference, hence eval mode.
target_dqn = DQN(env_config=env_config).to(device)
target_dqn.load_state_dict(dqn.state_dict())
target_dqn.eval()

# Create replay memory.
memory = ReplayMemory(env_config['memory_size'])

# Initialize optimizer used for training the DQN. We use Adam rather than RMSProp.
optimizer = torch.optim.Adam(dqn.parameters(), lr=env_config['lr'])

# Keep track of best evaluation mean return achieved so far.
best_mean_return = -float("Inf")

for episode in range(env_config['n_episodes']):
    done = False

    # Start every episode with a stack made of the first observation
    # repeated ``obs_stack_size`` times.
    obs = preprocess(env.reset(), envID=args.env, env=env).unsqueeze(0)
    obs_stack = torch.cat(env_config['obs_stack_size'] * [obs]).unsqueeze(0).to(device)
    count = 0

    while not done:
        # TODO: Get action from DQN.
        action = dqn.act(obs_stack)
def __init__(self):
    """Create models, memory, loss, optimizer, environment and output dirs.

    All settings are fixed in the body. Two directories below the current
    working directory are created when missing: ``model/`` for saved
    weights and ``tensorboard_record/run_<timestamp>/`` for the
    ``SummaryWriter``.
    """
    self.gamma = 0.75
    self.screen_width = 600

    # Policy and target networks.
    self.policy_model = DQN()
    self.target_model = DQN()

    # Replay memory, loss and optimizer (policy network only).
    self.memory = ReplayMemory()
    self.loss = HuberLoss()
    self.optim = torch.optim.Adam(self.policy_model.parameters(), lr=0.0001)

    # Environment. TODO carried over from the original code.
    self.env = PyCar()

    # Counters and training settings.
    self.current_episode = 0
    self.current_iteration = 0
    self.episode_durations = []
    self.batch_size = 250

    # Use CUDA when available; both flags are kept as attributes.
    self.is_cuda = torch.cuda.is_available()
    self.cuda = self.is_cuda
    self.device = torch.device("cuda" if self.cuda else "cpu")

    self.policy_model = self.policy_model.to(self.device)
    self.target_model = self.target_model.to(self.device)
    self.loss = self.loss.to(self.device)

    # The target network starts as a frozen copy of the policy network.
    self.target_model.load_state_dict(self.policy_model.state_dict())
    self.target_model.eval()

    self.savepath = os.path.join(os.getcwd(), "model") + "/"
    os.makedirs(self.savepath, exist_ok=True)

    self.save_tensorboard_path = os.path.join(
        os.getcwd(), "tensorboard_record") + "/run_" + time.strftime(
            "%d_%m_%Y_%H_%M", time.localtime()) + "/"
    # The original re-checked ``savepath`` here (copy-paste slip), so the
    # tensorboard directory itself was never created by this code.
    os.makedirs(self.save_tensorboard_path, exist_ok=True)

    self.writer = SummaryWriter(self.save_tensorboard_path)
class DQNAgent:
    """DQN agent for the ``PyCar`` environment with tensorboard logging."""

    def __init__(self):
        """Create models, memory, loss, optimizer, environment and output dirs.

        Two directories below the current working directory are created
        when missing: ``model/`` for saved weights and
        ``tensorboard_record/run_<timestamp>/`` for the ``SummaryWriter``.
        """
        self.gamma = 0.75
        self.screen_width = 600

        # Policy and target networks.
        self.policy_model = DQN()
        self.target_model = DQN()

        # Replay memory, loss and optimizer (policy network only).
        self.memory = ReplayMemory()
        self.loss = HuberLoss()
        self.optim = torch.optim.Adam(self.policy_model.parameters(),
                                      lr=0.0001)

        # Environment. TODO carried over from the original code.
        self.env = PyCar()

        # Counters and training settings.
        self.current_episode = 0
        self.current_iteration = 0
        self.episode_durations = []
        self.batch_size = 250

        # Use CUDA when available; both flags are kept as attributes.
        self.is_cuda = torch.cuda.is_available()
        self.cuda = self.is_cuda
        self.device = torch.device("cuda" if self.cuda else "cpu")

        self.policy_model = self.policy_model.to(self.device)
        self.target_model = self.target_model.to(self.device)
        self.loss = self.loss.to(self.device)

        # The target network starts as a frozen copy of the policy network.
        self.target_model.load_state_dict(self.policy_model.state_dict())
        self.target_model.eval()

        self.savepath = os.path.join(os.getcwd(), "model") + "/"
        os.makedirs(self.savepath, exist_ok=True)

        self.save_tensorboard_path = os.path.join(
            os.getcwd(), "tensorboard_record") + "/run_" + time.strftime(
                "%d_%m_%Y_%H_%M", time.localtime()) + "/"
        # The original re-checked ``savepath`` here (copy-paste slip).
        os.makedirs(self.save_tensorboard_path, exist_ok=True)

        self.writer = SummaryWriter(self.save_tensorboard_path)

    def run(self):
        """Run training; a keyboard interrupt is printed instead of raised."""
        try:
            self.train()
        except KeyboardInterrupt as e:
            print(e)

    def select_action(self, state, random_only=False):
        """Pick an action with the policy network or uniformly at random.

        The logged ``epsilon`` value is the probability of using the
        *network*: it starts at 0.35 and approaches 0.90 as
        ``current_iteration`` grows. ``random_only=True`` always samples one
        of the 5 actions at random.

        :param state: current state tensor
        :param random_only: never consult the policy network
        :return: action index as a ``(1, 1)`` long tensor
        """
        self.eps_start = 0.90
        self.eps_end = 0.35
        self.eps_decay = 500

        state = state.to(self.device)

        sample = random.random()
        eps_threshold = self.eps_start - (
            self.eps_start - self.eps_end) * math.exp(
                -1. * self.current_iteration / self.eps_decay)
        self.writer.add_scalar('epsilon', eps_threshold,
                               self.current_iteration)

        if sample < eps_threshold and not random_only:
            with torch.no_grad():
                return self.policy_model(state).max(1)[1].view(1, 1)
        return torch.tensor([[random.randrange(5)]],
                            device=self.device,
                            dtype=torch.long)

    def get_action(self, state):
        """Return the policy network's arg-max action as a ``(1, 1)`` tensor."""
        state = state.to(self.device)
        with torch.no_grad():
            return self.policy_model(state).max(1)[1].view(1, 1)

    def optimize_policy_model(self):
        """Run one pass over the replay memory in batches.

        Does nothing until the memory holds at least ``batch_size``
        transitions. The mean batch loss is logged under ``'loss'``.
        """
        if self.memory.length() < self.batch_size:
            return

        self.memory.setup_epoch_training()
        # Plain float accumulator: summing the loss tensors themselves would
        # modify the first loss in place and keep every batch graph alive.
        total_loss = 0.0
        training_len = math.ceil(self.memory.length() / self.batch_size)

        for i in range(training_len):
            transitions = self.memory.sample_batch(self.batch_size, i)
            if not transitions:
                continue
            len_transitions = len(transitions)
            one_batch = Transition(*zip(*transitions))

            # ``next_state`` is None for terminal transitions. A bool mask
            # replaces the deprecated uint8 mask for indexing.
            non_final_mask = torch.tensor(
                [s is not None for s in one_batch.next_state],
                device=self.device,
                dtype=torch.bool)
            non_final_next = [
                s for s in one_batch.next_state if s is not None
            ]

            state_batch = torch.cat(one_batch.state).to(self.device)
            action_batch = torch.cat(one_batch.action).to(self.device)
            reward_batch = torch.cat(one_batch.reward).to(self.device)

            # Q(s, a) of the actions that were actually taken.
            curr_state_action_values = self.policy_model(state_batch).gather(
                1, action_batch)

            # max_a' Q_target(s', a'); stays 0 for terminal transitions.
            # The guard avoids torch.cat([]) on an all-terminal batch.
            next_state_values = torch.zeros(len_transitions,
                                            device=self.device)
            if non_final_next:
                non_final_next_states = torch.cat(non_final_next).to(
                    self.device)
                with torch.no_grad():
                    next_state_values[non_final_mask] = self.target_model(
                        non_final_next_states).max(1)[0]

            expected_state_action_values = (
                next_state_values * self.gamma) + reward_batch

            # Temporal-difference error.
            loss = self.loss(curr_state_action_values,
                             expected_state_action_values.unsqueeze(1))
            total_loss += loss.item()

            self.optim.zero_grad()
            loss.backward()
            # Clamp every gradient element to [-1, 1]; parameters without a
            # gradient are skipped instead of raising.
            torch.nn.utils.clip_grad_value_(self.policy_model.parameters(), 1)
            self.optim.step()

        self.writer.add_scalar('loss', total_loss / training_len,
                               self.current_iteration)

    def train(self):
        """Training loop over episodes with periodic target sync and saving.

        Before training, 100 purely random games are played and their score
        statistics are logged at step 0.
        """
        self.num_episodes = 2000
        self.target_update = 1

        mean_score, max_score, min_score = self.run_sim(100, random_only=True)
        self.writer.add_scalar('mean_score', mean_score, 0)
        self.writer.add_scalar('max_score', max_score, 0)
        self.writer.add_scalar('min_score', min_score, 0)

        for episode in tqdm(range(self.current_episode, self.num_episodes)):
            self.current_iteration += 1
            self.current_episode = episode

            self.train_one_epoch()

            # Copy the policy weights into the target network.
            if self.current_episode % self.target_update == 0:
                self.target_model.load_state_dict(
                    self.policy_model.state_dict())

            if self.current_episode % 25 == 0:
                torch.save(
                    self.policy_model.state_dict(), self.savepath +
                    "policy_epoch" + str(self.current_episode) + ".pth")
                torch.save(
                    self.target_model.state_dict(), self.savepath +
                    "target_epoch" + str(self.current_episode) + ".pth")

    def run_sim(self, count=20, random_only=False):
        """Play ``count`` games, storing every transition in replay memory.

        :param count: number of games to play
        :param random_only: forwarded to ``select_action``
        :return: ``(mean, max, min)`` of the final scores
        """
        score_list = []
        for _ in range(count):
            self.env.reset_game()
            curr_state = torch.Tensor(self.env.get_state()).permute(
                2, 0, 1).unsqueeze(0)

            while True:
                action = self.select_action(curr_state, random_only)
                images, reward, done, score = self.env.step(action.item())
                reward = torch.Tensor([reward]).to(self.device)

                # A terminal transition is marked by ``next_state`` = None.
                if done:
                    next_state = None
                else:
                    next_state = torch.Tensor(images).permute(
                        2, 0, 1).unsqueeze(0)

                self.memory.push_transition(curr_state, action, next_state,
                                            reward)
                curr_state = next_state

                if done:
                    score_list.append(score)
                    break

        scores = np.array(score_list)
        return np.mean(scores), np.max(scores), np.min(scores)

    def train_one_epoch(self):
        """Play a batch of games, log the scores, then optimise the policy."""
        mean_score, max_score, min_score = self.run_sim()
        self.writer.add_scalar('mean_score', mean_score,
                               self.current_iteration)
        self.writer.add_scalar('max_score', max_score,
                               self.current_iteration)
        self.writer.add_scalar('min_score', min_score,
                               self.current_iteration)

        self.optimize_policy_model()

    def validate(self):
        """Play one greedy game from the environment's current state.

        The final score is printed. The environment is not reset here.
        The original incremented an uninitialised ``episode_duration``
        counter and raised ``UnboundLocalError`` on the first step; that
        unused counter and the unused reward tensor are gone.
        """
        curr_state = torch.Tensor(self.env.get_state()).permute(
            2, 0, 1).unsqueeze(0)
        while True:
            action = self.get_action(curr_state)
            images, _reward, done, score = self.env.step(action.item())

            if done:
                print(score)
                break
            curr_state = torch.Tensor(images).permute(2, 0, 1).unsqueeze(0)
class Agent(AbstractAgent):
    """Grid-world Q-learning agent with replay memory and a small Q-network.

    States are cell indices into ``env.field``; the network sees them as
    ``(x, y)`` coordinates.
    """

    actions = ['←', '→', '↑', '↓']

    def __init__(self, env, lr=0.8, y=0.95, step_cost=.0, living_cost=.0,
                 episode_length=100, memory_capacity=100, batch_size=25,
                 eps=0.5, eps_decay=0.999):
        """Store settings and build the network, loss, optimizer and memory.

        ``eps``/``eps_decay`` are forwarded to ``AbstractAgent.__init__``.
        ``lr`` is only stored: Adam is created with a fixed rate of 0.01.
        ``y`` is the discount factor used in ``optimize``.
        """
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.lr = lr
        self.y = y
        self.step_cost = step_cost
        self.living_cost = living_cost
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.state_len = env.width * env.height
        self.nn = Model(in_features=2,
                        hidden=[self.state_len, self.state_len],
                        out_features=len(Agent.actions))
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.01)
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size

    def step(self, state, action):
        """Delegate one environment step; returns what ``env.step`` returns."""
        return self.env.step(state, action)

    def print_policy(self):
        """Print the grid, showing the greedy action on '.' and 's' cells."""
        for y in range(self.env.height):
            for x in range(self.env.width):
                s = y * self.env.width + x
                cell = self.env.field[s]
                if cell not in ('.', 's'):
                    print(cell, end='')
                    continue
                with torch.no_grad():
                    a = torch.argmax(self._predict_q(s), 0).item()
                print(Agent.actions[a], end='')
            print()

    def _encode_state(self, s):
        """Turn cell index ``s`` into a float tensor ``[x, y]``."""
        w = self.env.width
        x, y = s % w, s // w
        return torch.tensor([x, y], dtype=torch.float)

    def _predict_q(self, s):
        """Return the network output for state ``s`` (one value per action)."""
        return self.nn(self._encode_state(s))

    def optimize(self):
        """Do one gradient step per transition of a sampled batch.

        Nothing happens until the memory holds at least ``batch_size``
        transitions. Losses are recorded as plain floats so that
        ``self.losses`` does not keep autograd graphs alive.
        """
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        for s, a, s1, r in transitions:
            q_predicted = self._predict_q(s)
            # Only the taken action's target differs from the prediction,
            # so only that entry contributes to the loss.
            q_target = q_predicted.clone().detach()
            with torch.no_grad():
                best_next = self._predict_q(s1).max().item()
            q_target[a] = r + self.y * best_next
            loss = self.criterion(q_predicted, q_target)
            self.losses.append(loss.item())
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def run_episode(self):
        """Play one episode of at most ``episode_length`` steps.

        Every step is stored in replay memory and followed by ``optimize``;
        the accumulated reward is kept in ``self.rewards[-1]``.
        """
        AbstractAgent.run_episode(self)
        s = self.s0
        self.rewards.append(.0)
        for _ in range(self.episode_length):
            # Greedy proposal; ``select_action`` (inherited) decides the
            # action that is actually taken.
            with torch.no_grad():
                a = torch.argmax(self._predict_q(s), 0).item()
            a = self.select_action(a)
            s1, r, over = self.step(s, Agent.actions[a])
            # The step cost is charged only when the agent actually moved.
            if s != s1:
                r -= self.step_cost
            r -= self.living_cost
            self.memory.push(s, a, s1, r)
            s = s1
            self.optimize()
            self.rewards[-1] += r
            if over:
                break
class DQNAgent:
    """DQN agent with an online network, a target network and replay memory."""

    def __init__(self, inputs, n_actions):
        """Build both networks, the optimizer and the replay memory."""
        self.brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
        self.target_brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
        # The target network starts as a copy and stays in evaluation mode.
        self.target_brain.load_state_dict(self.brain.state_dict())
        self.target_brain.eval()

        self.set_params()
        self.optimizer = torch.optim.Adam(self.brain.parameters())
        self.memory = ReplayMemory(50000)
        self.action_space = [0, 1]

    def set_params(self):
        """Set the batch size and exploration schedule; reset the step counter."""
        self.batch_size = 64
        self.max_exploration_rate = 1
        self.min_exploration_rate = 0.05
        self.exploration_decay_rate = 0.0005
        self.steps_done = 0

    def select_action(self, state):
        """Return an epsilon-greedy action for ``state``.

        The exploration rate decays exponentially with ``steps_done`` from
        ``max_exploration_rate`` towards ``min_exploration_rate``.
        """
        sample = np.random.random()
        exploration_rate = self.min_exploration_rate + (
            self.max_exploration_rate - self.min_exploration_rate) * np.exp(
                -self.steps_done * self.exploration_decay_rate)
        self.steps_done += 1

        if sample > exploration_rate:
            with torch.no_grad():
                actions = self.brain(state)
            return torch.argmax(actions).item()
        return np.random.choice(self.action_space)

    def learn(self):
        """Run one optimisation step on a batch sampled from replay memory.

        Does nothing until the memory holds at least ``batch_size``
        transitions. The loss tensor is kept in ``self.loss``.
        """
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        # ``next_state`` is None for terminal transitions.
        non_final_mask = torch.tensor(
            [s is not None for s in batch.next_state], dtype=torch.bool)

        state_batch = torch.tensor(batch.state)
        action_batch = torch.tensor(batch.action)
        reward_batch = torch.tensor(batch.reward, dtype=torch.float)

        # Q(s, a) of the actions that were actually taken.
        state_action_values = self.brain(state_batch).gather(
            1, action_batch.unsqueeze(-1))

        # max_a' Q_target(s', a'); stays 0 for terminal transitions. The
        # guard avoids feeding an empty tensor to the network, and no_grad
        # avoids building a graph through the target network.
        next_state_values = torch.zeros(self.batch_size)
        if non_final_mask.any():
            non_final_next_states = torch.tensor(
                [s for s in batch.next_state if s is not None])
            with torch.no_grad():
                next_state_values[non_final_mask] = self.target_brain(
                    non_final_next_states).max(1)[0]

        # Rewards are scaled by the batch maximum, but only when it is
        # positive: dividing by 0 gives NaN/inf and a negative maximum
        # would flip the sign of every reward.
        reward_scale = reward_batch.max()
        if reward_scale > 0:
            reward_batch = reward_batch / reward_scale

        gamma = 0.99
        expected_state_action_values = gamma * next_state_values + reward_batch

        self.loss = torch.nn.MSELoss()(
            state_action_values,
            expected_state_action_values.unsqueeze(-1))
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
class Agent(AbstractAgent):
    """Grid-world Q-learning agent backed by an external ``predict``/``fit`` model."""

    actions = ['←', '→', '↑', '↓']

    def __init__(self, env, model, lr=0.8, y=0.95, step_cost=.0,
                 living_cost=.0, episode_length=100, memory_capacity=100,
                 batch_size=10, eps=0.5, eps_decay=0.999):
        """Store the environment, the model and all settings.

        ``eps``/``eps_decay`` go to ``AbstractAgent.__init__``;
        ``memory_capacity`` sizes the replay memory.
        """
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env, self.model = env, model
        self.lr, self.y = lr, y
        self.step_cost, self.living_cost = step_cost, living_cost
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size

    def step(self, state, action):
        """Forward one step to the environment and return its result."""
        return self.env.step(state, action)

    def print_policy(self):
        """Print the grid with the greedy action on every '.' and 's' cell."""
        width = self.env.width
        for row in range(self.env.height):
            for col in range(width):
                s = row * width + col
                cell = self.env.field[s]
                if cell in ('.', 's'):
                    best = np.argmax(self.predict_q(s))
                    print(Agent.actions[best], end='')
                else:
                    print(cell, end='')
            print()

    def _encode_state(self, s):
        """One-hot encode cell index ``s`` as an array of shape ``(1, env.length)``."""
        one_hot = np.zeros((1, self.env.length))
        one_hot[0, s] = 1.0
        return one_hot

    def predict_q(self, s):
        """Return the model's first prediction row for state ``s``."""
        encoded = self._encode_state(s)
        return self.model.predict(encoded)[0]

    def optimize(self):
        """Fit the model once per transition of a sampled batch.

        Skipped until the memory holds at least ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return
        for s, a, s1, r in self.memory.sample(self.batch_size):
            # The prediction itself is reused as the target; only the
            # entry of the taken action is overwritten.
            q_target = self.predict_q(s)
            q_target[a] = r + self.y * self.predict_q(s1).max()
            history = self.model.fit(x=self._encode_state(s),
                                     y=np.array([q_target]),
                                     epochs=1,
                                     verbose=False)
            self.losses.append(history.history["loss"][-1])

    def run_episode(self):
        """Play one episode, storing every step and optimising after it."""
        AbstractAgent.run_episode(self)
        state = self.s0
        self.rewards.append(.0)
        for _ in range(self.episode_length):
            greedy = np.argmax(self.predict_q(state))
            chosen = self.select_action(greedy)
            next_state, reward, over = self.step(state, Agent.actions[chosen])
            # The step cost applies only when the agent actually moved.
            if state != next_state:
                reward -= self.step_cost
            reward -= self.living_cost
            self.memory.push(state, chosen, next_state, reward)
            state = next_state
            self.optimize()
            self.rewards[-1] += reward
            if over:
                break
class Agent(AbstractAgent):
    """Grid-world DQN agent with a target network and stochastic moves."""

    actions = ['←', '→', '↑', '↓']

    def __init__(self, env, p=1.0, lr=0.8, y=0.95, step_cost=.0,
                 living_cost=.0, episode_length=100, memory_capacity=100,
                 batch_size=10, target_update=10, eps=0.5, eps_decay=0.999):
        """Store settings and build the action table, networks and memory.

        ``p`` is the probability that the requested action is carried out;
        each of the two alternatives listed in ``stochastic_actions`` gets
        ``(1 - p) / 2``. ``lr`` is only stored: Adam uses a fixed 0.05.
        """
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.lr, self.y = lr, y
        self.step_cost, self.living_cost = step_cost, living_cost

        # Requested action -> [candidate action indices, probabilities].
        side_probability = (1.0 - p) / 2
        candidates = {
            '←': [0, 2, 3],
            '→': [1, 2, 3],
            '↑': [2, 0, 1],
            '↓': [3, 0, 1],
        }
        self.stochastic_actions = {
            arrow: [indices, [p, side_probability, side_probability]]
            for arrow, indices in candidates.items()
        }

        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.state_len = env.width * env.height

        n_actions = len(Agent.actions)
        self.nn = Model(in_features=self.state_len, hidden=[],
                        out_features=n_actions)
        self.target_nn = Model(in_features=self.state_len, hidden=[],
                               out_features=n_actions)
        # The target network starts as a copy of the policy network.
        self.target_nn.load_state_dict(self.nn.state_dict())
        self.target_nn.eval()

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.05)
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size
        self.target_update = target_update

    def step(self, state, action):
        """Take a step where the executed action may differ from the request.

        The executed action is drawn from the candidates registered for
        ``action`` in ``stochastic_actions``.
        """
        choices, probabilities = self.stochastic_actions[action]
        executed = np.random.choice(choices, p=probabilities)
        return self.env.step(state, Agent.actions[executed])

    def print_policy(self):
        """Print the grid with the greedy action on every '.' and 's' cell."""
        width = self.env.width
        for row in range(self.env.height):
            for col in range(width):
                s = row * width + col
                cell = self.env.field[s]
                if cell in ('.', 's'):
                    q_values = self._predict_q_policy(s)
                    best = torch.argmax(q_values, 0).item()
                    print(Agent.actions[best], end='')
                else:
                    print(cell, end='')
            print()

    def _encode_state(self, s):
        """One-hot encode cell index ``s`` as a float tensor of length ``state_len``."""
        one_hot = np.zeros(self.state_len)
        one_hot[s] = 1
        return torch.tensor(one_hot, dtype=torch.float)

    def _predict_q_policy(self, s):
        """Q-values of state ``s`` from the policy network."""
        encoded = self._encode_state(s)
        return self.nn(encoded)

    def _predict_q_target(self, s):
        """Q-values of state ``s`` from the target network."""
        encoded = self._encode_state(s)
        return self.target_nn(encoded)

    def optimize(self):
        """Do one gradient step per transition of a sampled batch.

        Skipped until the memory holds at least ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return
        for state, action, next_state, reward in self.memory.sample(self.batch_size):
            prediction = self._predict_q_policy(state)
            # Only the taken action's entry differs from the prediction.
            target = prediction.clone().detach()
            bootstrap = self._predict_q_target(next_state).max().item()
            target[action] = reward + self.y * bootstrap
            loss = self.criterion(prediction, target)
            self.losses.append(loss.item())
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def run_episode(self):
        """Play one episode; sync the target network every ``target_update`` episodes."""
        AbstractAgent.run_episode(self)
        state = self.s0
        episode_number = len(self.rewards)
        self.rewards.append(.0)
        for _ in range(self.episode_length):
            greedy = torch.argmax(self._predict_q_policy(state), 0).item()
            chosen = self.select_action(greedy)
            next_state, reward, over = self.step(state, Agent.actions[chosen])
            # The step cost applies only when the agent actually moved.
            if state != next_state:
                reward -= self.step_cost
            reward -= self.living_cost
            self.memory.push(state, chosen, next_state, reward)
            state = next_state
            self.optimize()
            self.rewards[-1] += reward
            if over:
                break
        if episode_number % self.target_update == 0:
            self.target_nn.load_state_dict(self.nn.state_dict())
def __init__(self, name, others=None, last_n=10, load_path=None,
             checkpoint=5000, fixed_strategy=False, eps_decay=0.00005):
    """Set up hyperparameters, networks, replay memory and strategy.

    Args:
        name: Forwarded to the parent class initialiser.
        others: Stored on the instance; its length fixes the network input
            size (6 features per entry).  Defaults to ``[1, 2]``.
        last_n: Stored on the instance.
        load_path: Optional checkpoint file.  When given, the policy
            weights are restored from its ``'model_state_dict'`` entry,
            exploration is switched off and ``self.train`` is set to False.
        checkpoint: Stored on the instance as ``self.checkpoint``.
        fixed_strategy: Use ``FixedStrategy`` instead of the epsilon-greedy
            strategy.
        eps_decay: Decay passed to ``EpsilonGreedyStrategy``.
    """
    if others is None:
        others = [1, 2]
    self.others = others
    self.last_n = last_n
    self.prev_points = 0

    self.batch_size = 32
    self.gamma = 0.9
    self.eps_start = 1
    self.eps_end = 0.01
    self.eps_decay = eps_decay
    self.target_update = 100
    self.plot_at = 1000
    self.q_max = []
    self.q_list = []
    self.checkpoint = checkpoint
    self.memory_size = 1000
    self.lr = 0.00001
    self.train = True

    self.input_dim = len(others) * 6
    self.output_dim = 3
    self.current_step = 1

    self.device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")
    self.memory = ReplayMemory(self.memory_size)

    # Initialize the policy and target networks
    self.policy_net = DQN(self.input_dim, self.output_dim).to(self.device)
    self.target_net = DQN(self.input_dim, self.output_dim).to(self.device)
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()

    if load_path is not None:
        # map_location lets a checkpoint written on a GPU machine be
        # restored on a CPU-only host.  A separate local name keeps the
        # ``checkpoint`` argument from being shadowed.
        saved = torch.load(load_path, map_location=self.device)
        self.policy_net.load_state_dict(saved['model_state_dict'])
        self.policy_net.eval()
        # Keep the target network consistent with the restored weights.
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.eps_start = 0
        self.eps_end = 0
        self.train = False

    # Previously the fixed strategy was always overwritten by the
    # epsilon-greedy one, so ``fixed_strategy=True`` had no effect.
    if fixed_strategy:
        self.strategy = FixedStrategy()
    else:
        self.strategy = EpsilonGreedyStrategy(self.eps_start, self.eps_end,
                                              self.eps_decay)

    # Set the optimizer
    self.optimizer = optim.Adam(params=self.policy_net.parameters(),
                                lr=self.lr)
    self.loss = None

    # Pieces of the transition that gets pushed to replay memory
    self.prev_state = None
    self.action = None
    self.reward = None
    self.current_state = None

    super().__init__(name)
dtype=torch.long)
# Script entry point: define the training constants, build the policy and
# target networks, the optimizer and the replay memory, then call simulate().
if __name__ == "__main__":
    # Training constants.  NOTE(review): none of them is read inside this
    # block; presumably simulate() and its helpers use them — confirm.
    BATCH_SIZE = 128
    GAMMA = 0.999
    EPS_START = 0.9
    EPS_END = 0.05
    EPS_DECAY = 200
    TARGET_UPDATE = 10
    MAX_T = 9999
    steps_done = 0
    timer = Timer()
    # Convert the screen rectangle into (rect[0], rect[1], width, height).
    # Presumably rect is (left, top, right, bottom) — verify against
    # util.get_screen_rect().
    rect = util.get_screen_rect()
    region = (rect[0], rect[1], rect[2] - rect[0], rect[3] - rect[1])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    monitor = Monitor(device, region)
    env = gym.make("Game-v0")
    # The last two dimensions of one captured frame size the networks.
    init_screen = monitor.get_screen(pytorch=True)
    _, _, height, width = init_screen.shape
    n_actions = env.action_space.n
    policy_net = DQN(width, height, n_actions).to(device)
    target_net = DQN(width, height, n_actions).to(device)
    # The target network starts as a copy of the policy network and is kept
    # in evaluation mode.
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    optimizer = torch.optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(3000)
    simulate()
ACTION_BUILD_BARRACKS,
    ACTION_ATTACK,
    ACTION_SELECT_BARRACKS,
    ACTION_BUILD_MARINE,
]
# Reward constants.  NOTE(review): judging by the names these are the
# rewards for killing a unit / a building; they are not used in this block.
KILL_UNIT_REWARD = 0.2
KILL_BUILDING_REWARD = 0.5
reward_check = []
# Module-level model, optimizer and replay memory; every DQNAgent instance
# below references these same objects.
# NOTE(review): DQN(6, 8) is presumably (inputs, actions) — confirm.
model = DQN(6, 8)
# The second positional argument of RMSprop is the learning rate.
optimizer = optim.RMSprop(model.parameters(), 1e-3)
memory = ReplayMemory(10000)


class DQNAgent(base_agent.BaseAgent):
    """Agent built on ``base_agent.BaseAgent`` that uses the shared DQN."""

    def __init__(self):
        """Bind the module-level model/memory/optimizer and reset state."""
        super(DQNAgent, self).__init__()
        self.previous_state = None
        self.previous_action = None
        # References to the module-level objects, not copies.
        self.model = model
        self.memory = memory
        self.optimizer = optimizer
        # Eight counters, all starting at zero.
        self.diagnostics = [0, 0, 0, 0, 0, 0, 0, 0]
        self.base_top_left = None
        self.supply_depot_built = False
        self.scv_selected = False
# Record the environment's output under ``outdir`` (force=True is passed to
# the Monitor wrapper) and seed it.
outdir = 'results/selfx-billard'
env = wrappers.Monitor(env, directory=outdir, force=True)
env.seed(0)
env.reset()
# The last two dimensions of one rendered screen size the networks.
init_screen = get_screen(env, device)
_, _, screen_height, screen_width = init_screen.shape
n_actions = len(env.action_space)
policy_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
# The target network starts as a copy of the policy network and is kept in
# evaluation mode.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
steps_done = 0
memory = ReplayMemory(10000)
optimizer = optim.RMSprop(policy_net.parameters())


def select_action(state):
    """Pick an action epsilon-greedily and advance the step counter.

    The exploration threshold decays exponentially from EPS_START towards
    EPS_END as the global ``steps_done`` grows.  Above the threshold the
    index of the highest policy-network output is returned as a (1, 1)
    tensor; otherwise a uniformly random action index is returned.
    """
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        # Greedy branch: no gradients are needed for action selection.
        with torch.no_grad():
            expected_reward = policy_net(state)
            return expected_reward.max(1)[1].view(1, 1)
    else:
        return torch.tensor([[random.randrange(n_actions)]],
class DQNAgent:
    """DQN agent that learns to play the ``PyCar`` environment from frames.

    A policy network is trained from replayed transitions against a target
    network that is refreshed every ``target_update`` episodes.
    """

    def __init__(self):
        """Create models, memory, loss, optimiser and the environment."""
        self.gamma = 0.4
        self.screen_width = 600

        # Models (policy and target), memory, loss and optimiser.
        self.policy_model = DQN()
        self.target_model = DQN()
        self.memory = ReplayMemory()
        self.loss = HuberLoss()
        self.optim = torch.optim.Adam(self.policy_model.parameters(), lr=0.01)

        self.env = PyCar()

        # Counters and training schedule.
        self.current_episode = 0
        self.current_iteration = 0
        self.episode_durations = []
        self.batch_size = 250
        self.num_episodes = 1000
        self.target_update = 5

        # Action-selection schedule; see select_action for how it is used.
        self.eps_start = 0.95
        self.eps_end = 0.65
        self.eps_decay = 2000
        self.num_actions = 5

        self.is_cuda = torch.cuda.is_available()
        self.cuda = self.is_cuda
        self.device = torch.device("cuda" if self.cuda else "cpu")

        self.policy_model = self.policy_model.to(self.device)
        self.target_model = self.target_model.to(self.device)
        self.loss = self.loss.to(self.device)

        # The target model starts as a copy of the policy model and is never
        # trained directly.
        self.target_model.load_state_dict(self.policy_model.state_dict())
        self.target_model.eval()

        self.savepath = "/home/sk002/Documents/RL-Project/model/"

    def run(self):
        """Run training; a keyboard interrupt stops it and is printed."""
        try:
            self.train()
        except KeyboardInterrupt as e:
            print(e)

    def select_action(self, state):
        """Choose an action for ``state`` during training.

        The model's best action is taken with a probability that rises from
        ``eps_end`` towards ``eps_start`` as ``current_iteration`` grows;
        otherwise one of the ``num_actions`` actions is sampled uniformly.

        :param state: current state tensor
        :return: (1, 1) long tensor holding the action index
        """
        state = state.to(self.device)
        sample = random.random()
        greedy_threshold = self.eps_start - (
            self.eps_start - self.eps_end) * math.exp(
                -1. * self.current_iteration / self.eps_decay)
        self.current_iteration += 1

        if sample < greedy_threshold:
            with torch.no_grad():
                return self.policy_model(state).max(1)[1].view(1, 1)
        return torch.tensor([[random.randrange(self.num_actions)]],
                            device=self.device, dtype=torch.long)

    def get_action(self, state):
        """Return the policy model's best action as a (1, 1) tensor."""
        state = state.to(self.device)
        with torch.no_grad():
            return self.policy_model(state).max(1)[1].view(1, 1)

    def optimize_policy_model(self):
        """Perform a single optimisation step on the policy model.

        :return: the loss tensor, or None while the memory holds fewer than
            ``batch_size`` transitions
        """
        if self.memory.length() < self.batch_size:
            return None

        transitions = self.memory.sample_batch(self.batch_size)
        one_batch = Transition(*zip(*transitions))

        # Boolean mask of transitions that have a successor state
        # (uint8 masks are deprecated for indexing).
        non_final_mask = torch.tensor(
            tuple(s is not None for s in one_batch.next_state),
            device=self.device, dtype=torch.bool)

        state_batch = torch.cat(one_batch.state).to(self.device)
        action_batch = torch.cat(one_batch.action)
        reward_batch = torch.cat(one_batch.reward)

        curr_state_action_values = self.policy_model(state_batch).gather(
            1, action_batch)

        # Terminal successors keep a value of zero.  The guard avoids
        # calling torch.cat on an empty list when the whole batch is
        # terminal.
        next_state_values = torch.zeros(self.batch_size, device=self.device)
        if non_final_mask.any():
            non_final_next_states = torch.cat(
                [s for s in one_batch.next_state if s is not None]
            ).to(self.device)
            with torch.no_grad():
                next_state_values[non_final_mask] = self.target_model(
                    non_final_next_states).max(1)[0]

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch

        # Temporal-difference error.
        loss = self.loss(curr_state_action_values,
                         expected_state_action_values.unsqueeze(1))

        self.optim.zero_grad()
        loss.backward()
        for param in self.policy_model.parameters():
            # Parameters that did not take part in the loss have no grad.
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self.optim.step()
        return loss

    def train(self):
        """Train for ``num_episodes`` episodes.

        The target model is refreshed every ``target_update`` episodes and
        both models are saved under ``savepath`` every 50 episodes.
        """
        for episode in tqdm(range(self.current_episode, self.num_episodes)):
            self.current_episode = episode
            self.env.reset_game()
            self.train_one_epoch()

            # The target network has its weights kept frozen most of the time
            if self.current_episode % self.target_update == 0:
                self.target_model.load_state_dict(
                    self.policy_model.state_dict())

            if self.current_episode % 50 == 0:
                torch.save(
                    self.policy_model.state_dict(), self.savepath +
                    "policy_epoch" + str(self.current_episode) + ".pth")
                torch.save(
                    self.target_model.state_dict(), self.savepath +
                    "target_epoch" + str(self.current_episode) + ".pth")

    def train_one_epoch(self):
        """Play one training episode, optimising after every step.

        Each step selects an action, stores the transition and runs one
        optimisation step.  The episode length is appended to
        ``episode_durations`` and the final score is printed.
        """
        episode_duration = 0
        # Move the last axis of the environment's state first and add a
        # leading axis of size one.
        curr_state = torch.Tensor(self.env.get_state()).permute(
            2, 0, 1).unsqueeze(0)

        while True:
            episode_duration += 1
            action = self.select_action(curr_state)
            images, reward, done, score = self.env.step(action.item())
            reward = torch.Tensor([reward]).to(self.device)

            # A terminal transition is stored with no successor state.
            if done:
                next_state = None
            else:
                next_state = torch.Tensor(images).permute(2, 0,
                                                          1).unsqueeze(0)

            self.memory.push_transition(curr_state, action, next_state,
                                        reward)
            curr_state = next_state
            self.optimize_policy_model()

            if done:
                self.episode_durations.append(episode_duration)
                print(score)
                break

    def validate(self):
        """Play one greedy episode without learning.

        Prints the final score and also returns it.
        """
        curr_state = torch.Tensor(self.env.get_state()).permute(
            2, 0, 1).unsqueeze(0)

        while True:
            action = self.get_action(curr_state)
            images, _reward, done, score = self.env.step(action.item())
            if done:
                print(score)
                return score
            curr_state = torch.Tensor(images).permute(2, 0, 1).unsqueeze(0)
class Agent:
    """Assigns cars to passengers with a DQN (optionally a QMIX mixer).

    Every episode draws a new number of cars and passengers (up to
    ``max_cars`` / ``max_passengers``); the state and the action are padded
    to those maxima so the network size stays fixed.
    """

    def __init__(self,
                 env,
                 input_size,
                 output_size,
                 hidden_size,
                 max_cars=10,
                 max_passengers=10,
                 mix_hidden=32,
                 batch_size=128,
                 lr=0.001,
                 gamma=.999,
                 eps_start=0.9,
                 eps_end=0.05,
                 eps_decay=750,
                 replay_capacity=10000,
                 num_save=200,
                 num_episodes=10000,
                 mode="random",
                 training=False,
                 load_file=None):
        """Store the configuration and build networks and optimiser.

        Args:
            env: Environment exposing ``grid_map``, ``step`` and ``reset``.
            input_size, output_size, hidden_size: Sizes passed to ``DQN``.
            max_cars, max_passengers: Padding limits for state and action.
            mix_hidden: Hidden size passed to ``QMixer`` (mode ``"qmix"``).
            batch_size: Replay batch size.
            lr: Learning rate for RMSprop.
            gamma: Stored on the instance.
            eps_start, eps_end, eps_decay: Exploration schedule.
            replay_capacity: Capacity of the replay memory.
            num_save: Checkpoint interval in episodes.
            num_episodes: Number of episodes run by ``train``.
            mode: One of ``"dqn"``, ``"qmix"``, ``"random"``, ``"greedy"``.
            training: Whether transitions are stored and networks updated.
            load_file: Optional checkpoint file to restore the policy
                network from (and ``"mixer_" + load_file`` for the mixer).
        """
        self.env = env
        self.orig_env = copy.deepcopy(env)

        self.grid_map = env.grid_map
        self.cars = env.grid_map.cars
        self.num_cars = len(self.cars)
        self.passengers = env.grid_map.passengers
        self.num_passengers = len(self.passengers)
        self.max_cars = max_cars
        self.max_passengers = max_passengers

        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay
        self.replay_capacity = replay_capacity
        self.num_episodes = num_episodes
        self.steps_done = 0
        self.lr = lr
        self.mode = mode
        self.num_save = num_save
        self.training = training
        self.algorithm = PairAlgorithm()

        self.episode_durations = []
        # Running mean duration and episode count, indexed by
        # (num_passengers - 1, num_cars - 1).
        self.duration_matrix = np.zeros((self.max_passengers, self.max_cars))
        self.count_matrix = np.zeros((self.max_passengers, self.max_cars))
        self.loss_history = []

        self.memory = ReplayMemory(self.replay_capacity)

        self.device = torch.device("cpu")
        print("Device being used:", self.device)
        self.policy_net = DQN(self.input_size, self.output_size,
                              self.hidden_size).to(self.device)

        self.params = list(self.policy_net.parameters())

        if self.mode == "qmix":
            self.mixer = QMixer(self.input_size, self.max_passengers,
                                mix_hidden).to(self.device)
            self.params += list(self.mixer.parameters())

        if load_file:
            # map_location lets a checkpoint written on a GPU be restored on
            # this agent's device.
            self.policy_net.load_state_dict(
                torch.load(load_file, map_location=self.device))
            if self.mode == "qmix":
                self.mixer.load_state_dict(
                    torch.load("mixer_" + load_file,
                               map_location=self.device))
                self.mixer.eval()
            self.policy_net.eval()
            self.load_file = "Pretrained_" + load_file
            print("Checkpoint loaded")
        else:
            self.load_file = (
                f"{self.mode}_model_num_cars_{self.num_cars}"
                f"_num_passengers_{self.num_passengers}"
                f"_num_episodes_{self.num_episodes}"
                f"_hidden_size_{self.hidden_size}.pth")

        self.optimizer = optim.RMSprop(self.params, lr=self.lr)

    def select_action(self, state):
        """Pick one car per passenger slot, epsilon-greedily.

        Returns a ``(1, max_passengers)`` long tensor.  Slots beyond the
        current number of passengers hold ``max_cars`` as padding.  When not
        training the choice is always greedy.
        """
        sample = random.random()
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \
            math.exp(-1. * self.steps_done / self.eps_decay)
        # NOTE(review): per-step debug output; kept to preserve behaviour.
        print(eps_threshold)
        self.steps_done += 1

        if not self.training:
            eps_threshold = 0.0

        if sample > eps_threshold:
            # Greedy: best of the currently available cars per passenger.
            with torch.no_grad():
                self.policy_net.eval()
                q_table = self.policy_net(state).view(self.max_passengers,
                                                      self.max_cars)
                action = q_table[:, :self.num_cars].max(1)[1].view(
                    1, self.max_passengers)
        else:
            action = torch.tensor([[
                random.randrange(self.num_cars)
                for _ in range(self.max_passengers)
            ]],
                                  device=self.device,
                                  dtype=torch.long)

        action[0, self.num_passengers:] = self.max_cars
        return action

    def random_action(self, state):
        """Return a uniformly random car index per current passenger."""
        return torch.tensor([[
            random.randrange(self.num_cars)
            for _ in range(self.num_passengers)
        ]],
                            device=self.device,
                            dtype=torch.long)

    def get_state(self):
        """Encode cars and passengers as a ``(1, 3*C + 5*P)`` float tensor.

        Layout, with C = ``max_cars`` and P = ``max_passengers``: car
        positions (2 per car), car presence flags, passenger pick-up and
        drop-off points (4 per passenger), passenger presence flags.
        Unused slots stay zero.
        """
        # Float buffers: integer buffers would silently truncate
        # non-integer coordinates on assignment.
        cars_vec = np.zeros(2 * self.max_cars)
        indicator_cars_vec = np.zeros(self.max_cars)
        for i, car in enumerate(self.cars):
            cars_vec[2 * i:2 * i + 2] = [car.position[0], car.position[1]]
            indicator_cars_vec[i] = 1

        passengers_vec = np.zeros(4 * self.max_passengers)
        indicator_passengers_vec = np.zeros(self.max_passengers)
        for i, passenger in enumerate(self.passengers):
            passengers_vec[4 * i:4 * i + 4] = [
                passenger.pick_up_point[0], passenger.pick_up_point[1],
                passenger.drop_off_point[0], passenger.drop_off_point[1]
            ]
            indicator_passengers_vec[i] = 1

        return torch.tensor(np.concatenate(
            (cars_vec, indicator_cars_vec, passengers_vec,
             indicator_passengers_vec)),
                            device=self.device,
                            dtype=torch.float).unsqueeze(0)

    def train(self):
        """Run ``num_episodes`` single-step episodes.

        Each episode re-samples the scenario, picks an assignment according
        to ``mode``, steps the environment once and records the duration.
        When ``training`` is set the transition is stored, the model is
        optimised and checkpoints are written every ``num_save`` episodes
        and once more at the end.

        Raises:
            ValueError: If ``mode`` is not one of the supported modes.
        """
        duration_sum = 0.0

        for episode in range(self.num_episodes):
            self.reset_different_num()
            state = self.get_state()

            if self.mode in ("dqn", "qmix"):
                action = self.select_action(state)
            elif self.mode == "random":
                action = self.random_action([state])
            elif self.mode == "greedy":
                action = [self.algorithm.greedy_fcfs(self.grid_map)]
                action = torch.tensor(action,
                                      device=self.device,
                                      dtype=torch.long)
            else:
                raise ValueError(f"Unknown mode: {self.mode!r}")

            reward, duration = self.env.step(
                action[:, :self.num_passengers], self.mode)

            if self.mode == "dqn":
                # Pad the per-passenger rewards to the fixed action width.
                reward.extend(
                    [0] * (self.max_passengers - self.num_passengers))

            self.episode_durations.append(duration)

            # Incremental mean of the duration for this scenario size.
            row, col = self.num_passengers - 1, self.num_cars - 1
            count = self.count_matrix[row, col]
            self.duration_matrix[row, col] = (
                self.duration_matrix[row, col] * (count / (count + 1)) +
                duration / (count + 1))
            self.count_matrix[row, col] += 1
            duration_sum += duration

            if self.training:
                self.memory.push(
                    state, action,
                    torch.tensor(reward,
                                 device=self.device,
                                 dtype=torch.float).unsqueeze(0))
                self.optimize_model()

            self.plot_durations(self.mode)
            self.plot_loss_history(self.mode)

            if self.training and episode % self.num_save == 0:
                torch.save(self.policy_net.state_dict(),
                           "episode_" + str(episode) + "_" + self.load_file)
                if self.mode == "qmix":
                    torch.save(
                        self.mixer.state_dict(),
                        "mixer_episode_" + str(episode) + "_" +
                        self.load_file)
                print("Checkpoint saved")

            print("Episode: ", episode)

        if self.training:
            torch.save(self.policy_net.state_dict(), self.load_file)
            if self.mode == "qmix":
                torch.save(self.mixer.state_dict(),
                           "mixer_" + self.load_file)
            print("Checkpoint saved")

        # max(..., 1) avoids a ZeroDivisionError when no episode was run.
        print("Average duration was ",
              duration_sum / max(self.num_episodes, 1))
        print("Finished")
        np.save("Duration_matrix", self.duration_matrix)
        np.save("Count_matrix", self.count_matrix)
        print(self.duration_matrix)
        print(self.count_matrix)

    def _sync_from_env(self):
        """Refresh the cached grid map, cars and passengers from the env."""
        self.grid_map = self.env.grid_map
        self.cars = self.env.grid_map.cars
        self.passengers = self.env.grid_map.passengers

    def reset(self):
        """Reset the environment and refresh the cached references."""
        self.env.reset()
        self._sync_from_env()

    def reset_different_num(self):
        """Rebuild the scenario with random numbers of cars/passengers."""
        grid_map = self.env.grid_map
        grid_map.cars = []
        grid_map.passengers = []
        grid_map.num_passengers = random.randint(1, self.max_passengers)
        grid_map.num_cars = random.randint(1, self.max_cars)
        grid_map.add_passenger(grid_map.num_passengers)
        grid_map.add_cars(grid_map.num_cars)

        self.grid_map = self.env.grid_map
        self.num_passengers = self.env.grid_map.num_passengers
        self.num_cars = self.env.grid_map.num_cars
        self.cars = self.env.grid_map.cars
        self.passengers = self.env.grid_map.passengers

    def reset_orig_env(self):
        """Restore a fresh copy of the environment given at construction."""
        self.env = copy.deepcopy(self.orig_env)
        self._sync_from_env()
        self.grid_map.init_zero_map_cost()

    def optimize_model(self):
        """Run one optimisation step on a sampled replay batch.

        Does nothing until the memory holds ``batch_size`` transitions.  The
        target is the stored reward itself; no bootstrapped term is added.
        """
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        self.policy_net.train()
        q_values = self.policy_net(state_batch).view(self.batch_size,
                                                     self.max_passengers,
                                                     self.max_cars)
        # Extra zero column so the padding action index ``max_cars`` gathers
        # a Q-value of zero.
        q_values = torch.cat((q_values,
                              torch.zeros(
                                  (self.batch_size, self.max_passengers, 1),
                                  device=self.device)), 2)

        # squeeze(2) removes only the gathered axis; a bare squeeze() would
        # also drop the batch or passenger axis when its size is 1.
        state_action_values = q_values.gather(
            2, action_batch.unsqueeze(2)).squeeze(2)

        expected_state_action_values = reward_batch

        # Huber loss
        if self.mode == "dqn":
            loss = F.smooth_l1_loss(state_action_values,
                                    expected_state_action_values)
        elif self.mode == "qmix":
            self.mixer.train()
            chosen_action_qvals = self.mixer(state_action_values,
                                             state_batch)
            loss = F.smooth_l1_loss(chosen_action_qvals,
                                    reward_batch.view(-1, 1, 1))

        self.loss_history.append(loss.item())

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            # Parameters that did not take part in the loss have no grad.
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    @staticmethod
    def _moving_average(values, window_size):
        """Average ``values`` over sliding windows of ``window_size``.

        Returns ``values`` unchanged when it is shorter than the window;
        otherwise an array of ``len(values) - window_size`` window means.
        """
        count = len(values)
        if count < window_size:
            return values
        smoothed = np.zeros(count - window_size)
        for i in range(count - window_size):
            smoothed[i] = np.average(values[i:i + window_size])
        return smoothed

    def plot_durations(self, filename):
        """Plot the smoothed episode durations and save them as ``.npy``."""
        print("Saving durations plot ...")
        plt.figure(2)
        plt.clf()

        total_steps_smoothed = self._moving_average(
            np.array(self.episode_durations), 200)

        plt.title('Episode Duration history')
        plt.xlabel('Episode')
        plt.ylabel('Duration')
        plt.plot(total_steps_smoothed)
        np.save("Duration_" + filename, total_steps_smoothed)

    def plot_loss_history(self, filename):
        """Plot the loss history and save its smoothed version as ``.npy``."""
        print("Saving loss history ...")
        plt.figure(2)
        plt.clf()

        total_loss_smoothed = self._moving_average(
            np.array(self.loss_history), 50)

        plt.title('Loss history')
        plt.xlabel('Episodes')
        plt.ylabel('Loss')
        # NOTE(review): the raw history is plotted while the smoothed one is
        # saved; plot_durations plots the smoothed curve — confirm intent.
        plt.plot(self.loss_history)
        np.save("Loss_" + filename, total_loss_smoothed)
def train(args):
    """Train a DQN on ``Environment`` until the mean reward goal is reached.

    Runs an epsilon-greedy loop with a linearly decaying epsilon, stores
    every transition in a replay buffer and, once the buffer holds
    ``args.replay_start_step`` transitions, performs one optimisation step
    per iteration.  The policy weights are saved whenever the mean reward of
    the last 80 episodes improves, and training stops when that mean exceeds
    ``args.goal_reward``.

    Args:
        args: Namespace providing ``gpu``, ``fps``, ``debug``,
            ``dist_to_pipe``, ``dist_between_pipes``, ``obs_this_pipe``,
            ``lr``, ``replay_capacity``, ``inference``, ``final_eps``,
            ``start_eps``, ``eps_decay_final_step``, ``goal_reward``,
            ``replay_start_step``, ``target_update_iterations``,
            ``batch_size`` and ``gamma``.
    """
    import os  # local import: only needed to create the checkpoint folder

    device = torch.device("cuda" if args.gpu else "cpu")
    env = Environment(draw=False,
                      fps=args.fps,
                      debug=args.debug,
                      dist_to_pipe=args.dist_to_pipe,
                      dist_between_pipes=args.dist_between_pipes,
                      obs_this_pipe=args.obs_this_pipe)

    observation_space = env.get_observation_size_buffer()
    action_space = env.get_action_size()

    policy_network = DQN(observation_space, action_space).to(device)
    target_network = DQN(observation_space, action_space).to(device)

    optimizer = torch.optim.Adam(policy_network.parameters(), lr=args.lr)
    replay_buffer = ReplayMemory(args.replay_capacity)
    writer = SummaryWriter()

    if args.inference:
        # NOTE(review): the checkpoint is loaded into the target network
        # only, while actions come from the policy network — confirm.
        target_network.load_checkpoint()

    # torch.save does not create missing directories.
    os.makedirs("./models", exist_ok=True)

    best_reward = None
    iteration = 0
    total_reward = 0.0
    rewards = []
    state = env.reset()

    try:
        while True:
            epsilon = max(args.final_eps,
                          args.start_eps -
                          iteration / args.eps_decay_final_step)
            iteration += 1
            episode_reward = None

            if np.random.rand() < epsilon:
                action = env.get_action_random()
            else:
                # np.asarray replaces np.array(..., copy=False), which
                # raises ValueError on NumPy >= 2 because wrapping the state
                # in a list always needs a copy.
                state_v = torch.tensor(np.asarray([state])).to(device)
                # Action selection needs no autograd graph.
                with torch.no_grad():
                    q_vals_v = policy_network(state_v.float())
                _, act_v = torch.max(q_vals_v, dim=1)
                action = int(act_v.item())

            next_state, reward, done = env.step(action)
            total_reward += reward

            replay_buffer.push(state, action, next_state, reward, done)
            state = next_state

            if done:
                episode_reward = total_reward
                state = env.reset()
                total_reward = 0.0

            if episode_reward is not None:
                rewards.append(episode_reward)
                mean_reward = np.mean(rewards[-80:])
                print(
                    f"Episode {iteration}:  eps {epsilon}  mean reward {mean_reward}  episode reward {episode_reward}"
                )

                writer.add_scalar("epsilon", epsilon, iteration)
                writer.add_scalar("mean_reward", mean_reward, iteration)
                writer.add_scalar("reward", episode_reward, iteration)

                if best_reward is None or best_reward < mean_reward:
                    torch.save(policy_network.state_dict(),
                               f"./models/checkpoint_{iteration}")
                    print(
                        f"New best reward found: {best_reward} -> {mean_reward}"
                    )
                    best_reward = mean_reward
                if mean_reward > args.goal_reward:
                    print(f"Achieved in {iteration} steps.")
                    break

            # No learning until the buffer has been warmed up.
            if len(replay_buffer) < args.replay_start_step:
                continue

            if iteration % args.target_update_iterations == 0:
                target_network.load_state_dict(policy_network.state_dict())

            optimizer.zero_grad()
            batch = replay_buffer.sample(args.batch_size)
            loss = calculate_loss(batch,
                                  policy_network,
                                  target_network,
                                  args.gamma,
                                  device=device)
            loss.backward()
            optimizer.step()
    finally:
        # Flush and close the summary writer even when training is
        # interrupted or fails.
        writer.close()