def train(env, model, args):
    # separate optimizers: one for the first 20 parameter tensors of the model,
    # one for the position-evaluation MLP
    model.optim = torch.optim.Adam(islice(model.parameters(), 20), lr=0.0005)
    model.pos_optim = torch.optim.Adam(model.eval_mlp.parameters(), lr=0.0005)
    replay_buffer = MemoryBuffer(int(args.batch))
    agent = RandomAgent(env.action_spec())
    max_steps = args.num_steps
    env.reset()
    step = 0
    sub_trajectory = SubTrajectory(100)
    pbar = tqdm(total=max_steps)
    while step < max_steps:
        action = agent.step()
        # for _ in range(np.random.randint(1, 5)):
        rgb = env.observations()['RGB']
        pos = env.observations()['DEBUG.POS.TRANS']
        orientation = env.observations()['DEBUG.POS.ROT']
        reward = env.step(action)
        if not env.is_running():
            env.reset()
        else:
            new_rgb = env.observations()['RGB']
            new_pos = env.observations()['DEBUG.POS.TRANS']
            new_orientation = env.observations()['DEBUG.POS.ROT']
            if sub_trajectory.len == 100:
                tmp = copy.deepcopy(sub_trajectory)
                # Send initial belief to replay buffer
                o_0 = torch.from_numpy(tmp.new_rgb[0]).to(dtype=torch.float32).unsqueeze(0).to(device)
                a_0 = torch.from_numpy(tmp.action[0]).to(dtype=torch.float32).unsqueeze(0).to(device)
                z_0 = model.conv(o_0)
                bgru_input = torch.cat((z_0, a_0), dim=1)
                _, tmp.belief = model.belief_gru.gru1(torch.unsqueeze(bgru_input, 1))
                replay_buffer.add(tmp)
                sub_trajectory.clear()
            sub_trajectory.add(rgb, pos, orientation, action,
                               new_rgb, new_pos, new_orientation)
            # Train using replay_buffer once enough sub-trajectories were collected
            if step >= args.batch * 100:
                train_batch = replay_buffer.sample(64)
                if None in train_batch:
                    raise Exception("Training batch contains None object")
                model.update(train_batch)
        step += 1
        pbar.update(1)
    pbar.close()
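# The training loop above relies on a SubTrajectory container that is not defined in this
# snippet. The sketch below only illustrates the interface the loop assumes: a fixed-length
# store with .len, .add(), .clear(), per-field lists such as .new_rgb and .action, and a
# .belief slot that gets filled in before the deep copy enters the replay buffer. The class
# name, field names and capacity handling here are assumptions, not the original code.
class SubTrajectorySketch:
    def __init__(self, capacity):
        self.capacity = capacity
        self.belief = None
        self.clear()

    def clear(self):
        # reset all per-step storage
        self.rgb, self.pos, self.orientation = [], [], []
        self.action = []
        self.new_rgb, self.new_pos, self.new_orientation = [], [], []
        self.len = 0

    def add(self, rgb, pos, orientation, action, new_rgb, new_pos, new_orientation):
        # append one transition; the caller is expected to flush once .len hits capacity
        self.rgb.append(rgb)
        self.pos.append(pos)
        self.orientation.append(orientation)
        self.action.append(action)
        self.new_rgb.append(new_rgb)
        self.new_pos.append(new_pos)
        self.new_orientation.append(new_orientation)
        self.len += 1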
def train(rank, device, args):
    current_time = datetime.now().strftime('%b%d_%H-%M')
    LOGGER_DIR = os.path.join(args.log_dir, args.env, current_time, 'Agent:{}'.format(rank))
    writer = SummaryWriter(LOGGER_DIR)
    MODEL_DIR = os.path.join(LOGGER_DIR, 'models')
    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)

    env = create_env(args.env, args)
    if args.pri:
        ram = PrioMemoryBuffer(args.buffer_size)
    else:
        ram = MemoryBuffer(args.buffer_size)
    player = DDPGAgent(env.observation_space, env.action_space, ram, writer, device, args)
    if args.model_dir is not None:
        player.load_models(args.model_dir)

    steps_done = 0
    episode_rewards = []
    max_score = -9999
    count_eps = 0
    for _ep in range(1, args.max_eps):
        observation = env.reset()
        total_reward = 0
        count_eps += 1
        for r in range(10000):
            if 'img' in args.obs:
                state = np.expand_dims(observation, axis=0)
            else:
                state = np.float32(observation)
            action, action_rescale = player.get_exploration_action(state)
            new_observation, reward, done, info = env.step(action_rescale)
            steps_done += 1
            total_reward += reward
            ram.add(observation, np.expand_dims(action, axis=0), reward, new_observation)
            observation = new_observation
            # perform optimization
            if steps_done > args.start_learning:
                player.optimize()
            if done:
                break

        # logger
        writer.add_scalar('episode/reward', total_reward, steps_done)
        writer.add_scalar('episode/length', r, steps_done)
        episode_rewards.append(total_reward)
        if _ep % args.eval_eps == 0:
            reward_ave = np.array(episode_rewards).mean()
            print('Train, episode %d, steps: %d, reward: %.3f, ave_reward: %.3f' %
                  (count_eps, steps_done, episode_rewards[-1], reward_ave))
            if reward_ave > max_score:
                player.save_models(os.path.join(MODEL_DIR, 'best'))
                max_score = reward_ave
                print('Save Best!')
            else:
                player.save_models(os.path.join(MODEL_DIR, 'new'))
            episode_rewards = []

        # check memory consumption and clear memory
        gc.collect()
def test(device, args):
    env = create_env(args.env, args)
    ram = MemoryBuffer(1)
    player = DDPGAgent(env.observation_space, env.action_space, ram, None, device, args)
    if args.model_dir is not None:
        player.load_models(args.model_dir, test=True)

    steps_done = 0
    count_eps = 0
    count_success = 0
    while True:
        episode_rewards = []
        episode_lengths = []
        for _ep in range(1, args.eval_eps):
            if args.ar:
                env.seed(True)
            observation = env.reset()
            total_reward = 0
            episode_action = []
            for steps in range(1000):
                if 'img' in args.obs:
                    state = np.expand_dims(observation, axis=0)
                else:
                    state = np.float32(observation)
                action, action_rescale = player.get_exploitation_action(state)
                episode_action.append(action)
                new_observation, reward, done, info = env.step(action_rescale)
                observation = new_observation
                total_reward += reward
                steps_done += 1
                if args.render:
                    env.render()
                if done:
                    episode_rewards.append(total_reward)
                    count_eps += 1
                    episode_lengths.append(steps)
                    if reward > 1:
                        count_success += 1.0
                    break

            # check memory consumption and clear memory
            gc.collect()

        reward_ave = np.array(episode_rewards).mean()
        length_ave = np.array(episode_lengths).mean()
        print('Test, episode %d, steps: %d, Success_rate: %.3f, ave_reward: %.3f, ave_length: %.3f' %
              (count_eps, steps_done, count_success / count_eps, reward_ave, length_ave))
    env.close()
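# Both DDPG functions above construct a MemoryBuffer that is not shown in this excerpt.
# Below is a minimal sketch of the uniform replay buffer interface they appear to assume:
# a fixed capacity, add(state, action, reward, next_state), and random-batch sampling used
# during player.optimize(). The sample() return layout is an assumption, and the prioritized
# variant (PrioMemoryBuffer) would differ.
import random
from collections import deque

import numpy as np


class UniformReplayBufferSketch:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def add(self, state, action, reward, next_state):
        # store one transition, evicting the oldest once capacity is reached
        self.buffer.append((state, action, reward, next_state))

    def sample(self, batch_size):
        # uniform sampling without prioritisation
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        states, actions, rewards, next_states = map(np.asarray, zip(*batch))
        return states, actions, rewards, next_states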
class PAgent(Agent):
    def __init__(self,
                 chance_coeff=1,
                 chance_coeff_hl=False,
                 mem_size=1000,
                 mem_batch_size=4,
                 penalize_if_repeats=True,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_func = torch.nn.KLDivLoss()
        # chance_coeff: whether to just choose the max-probability event (0.0) or to sample
        # from the probability distribution produced by the PolicyNet (1.0).
        # It is a float, so it controls how much exploring the agent should do.
        # chance_coeff_hl: if set, chance_coeff decays exponentially with this half-life (in steps).
        self.chance_coeff = chance_coeff
        self.chance_coeff_hl = chance_coeff_hl
        # memory buffer
        self.memory = MemoryBuffer(size=mem_size,
                                   batch_size=mem_batch_size,
                                   replace=False)
        self.curr_state = self.get_game_state()
        self.prev_state = self.curr_state
        self.max_event = ""
        self._reward = 0
        self._loss = 0

    def step(self):
        if self.chance_coeff_hl:
            hl_ratio = self.step_counter / self.chance_coeff_hl
            self.chance_coeff = math.pow(2, -hl_ratio)
        self.count_step()
        reward = self.compute_reward()
        self.curr_state = self.get_game_state()
        probs = self.model(self.curr_state)
        # probs is the probability distribution produced by the policy net
        probs_list = probs.detach().numpy()[0]
        self.max_idx = np.argmax(probs_list)
        self._real_event = globals.EVENTS[self.max_idx]
        if random.random() > self.chance_coeff:
            chosen_idx = self.max_idx
        else:
            chosen_idx = np.random.choice(range(LEN_EVENTS), p=probs_list)
        chosen_event = globals.EVENTS[chosen_idx]
        if chosen_event != self.last_event:
            self.vm.reset_keys()
            self.vm.keyDown(chosen_event)
        self.memory.add(self.prev_state, chosen_idx, reward, self.curr_state)
        self.last_event = chosen_event
        self._reward = reward  # for info
        if len(self.memory.buffer) >= self.memory.size:
            self.learning_step()
        self.prev_state = self.curr_state

    def learning_step(self):
        self.optimizer.zero_grad()
        batch = self.memory.get_batch()
        probs = self.model(batch["prev_state"])  # (b, 8)
        target = probs.detach().clone()  # copy without grad
        for i, p in enumerate(probs):
            prev_a = batch["prev_action"][i]
            if batch["reward"][i] > 0:
                target[i] = torch.zeros(LEN_EVENTS)
                target[i][prev_a] = 1.0
            else:
                target[i][prev_a] = 0.0
        # KLDivLoss expects log-probabilities as its input
        loss = self.loss_func(probs.log(), target)
        self._loss = float(loss)
        loss.backward()
        self.optimizer.step()

    def __learning_step(self):
        if len(self.event_buffer) > 0:
            self.optimizer.zero_grad()
            target = torch.zeros((len(self.event_buffer), LEN_EVENTS))
            probs_list = []
            for i, event_probs in enumerate(self.event_buffer):
                event, probs = event_probs
                event_idx = globals.EVENTS_IDX[event]
                target[i][event_idx] = 1
                probs_list.append(probs)
            probs_batch = torch.cat(probs_list)
            loss = self.loss_func(probs_batch.log(), target)
            loss.backward()
            self.optimizer.step()
            # end learning step
            self.clear_event_past()
            self.end_learning_step()
            self.positive_step_counter += 1
            print("POSITIVE STEP +++")

    def negative_step_for_repeating(self):
        # used for penalizing the agent if it repeats itself
        self.optimizer.zero_grad()
        for event, probs in self.event_buffer:
            event_idx = globals.EVENTS_IDX[event]
            # target: probability distribution where the event we're penalizing is 0
            target = probs.detach().clone()
            # make sure all elements still add up to 1 after we zero the unwanted event
            target += target[0][event_idx] / (LEN_EVENTS - 1)
            target[0][event_idx] = 0
            loss = self.loss_func(probs.log(), target)
            loss.backward()
        self.negative_step_counter += 1
        print("NEGATIVE STEP ---")
        self.clear_event_past()
        self.optimizer.step()

    def clear_event_past(self):
        self.event_buffer = []
        self.event_history = []
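# Both PAgent (chance_coeff_hl) and DQAgent below (learn_epsilon_half_life) decay their
# exploration coefficient with the same exponential half-life schedule,
# coeff = 2 ** -(step / half_life). A small stand-alone illustration of that schedule,
# with a hypothetical helper name:
import math


def exploration_coeff(step, half_life):
    # falls from 1.0 to 0.5 after `half_life` steps, to 0.25 after 2 * half_life, etc.
    return math.pow(2, -(step / half_life))


if __name__ == "__main__":
    for step in (0, 3000, 6000, 12000):
        print(step, exploration_coeff(step, half_life=3000))  # 1.0, 0.5, 0.25, 0.0625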
class DQAgent(Agent):
    def __init__(self,
                 target_model=None,  # must have the same architecture as the normal model;
                                     # target q-values are calculated with it
                 update_target_model_every=1000,  # sync target model every N steps
                 learn_epsilon_half_life=3000,
                 discount_factor=0.9,
                 mem_size=1000,
                 mem_batch_size=4,
                 mem_bias_prob=0.9,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.target_model = target_model
        self.update_target_model_every = update_target_model_every
        self.discount_factor = discount_factor
        self.learn_epsilon = 1  # for epsilon-greedy search
        self.learn_epsilon_half_life = learn_epsilon_half_life  # steps it takes epsilon to fall to half
        self.memory = MemoryBuffer(size=mem_size,
                                   batch_size=mem_batch_size,
                                   bias_prob=mem_bias_prob)
        self.prev_state = self.get_game_state()
        self.prev_score = 0
        # info vars
        self._real_event = ""
        self._reward = 0
        self._last_q = 0
        self._loss = 0

    def step(self):
        if self.learn_epsilon_half_life:
            hl_ratio = self.step_counter / self.learn_epsilon_half_life
            self.learn_epsilon = math.pow(2, -hl_ratio)
        # update target model
        if self.step_counter % self.update_target_model_every == 0:
            self.target_model.load_state_dict(self.model.state_dict())
        self.count_step()  # computes score and counts step
        # compute reward
        reward = self.compute_reward()
        qvals_prev = self.model(self.prev_state)
        qvals_prev = qvals_prev.detach()
        if self.learn_epsilon_half_life and random.random() < self.learn_epsilon:
            # random action
            chosen_idx = random.randrange(len(globals.EVENTS))
        else:
            # pick best action
            chosen_idx = np.argmax(qvals_prev.numpy())
        self.curr_state = self.get_game_state()
        self.memory.add(self.prev_state, chosen_idx, reward, self.curr_state)
        chosen_event = globals.EVENTS[chosen_idx]
        self._real_event = globals.EVENTS[np.argmax(qvals_prev.numpy())]  # for info
        if chosen_event != self.last_event:
            self.vm.reset_keys()
            self.vm.keyDown(chosen_event)
        self.last_event = chosen_event
        self._reward = reward  # for info
        if len(self.memory.buffer) >= self.memory.size:
            self.learning_step()
        self.prev_state = self.curr_state

    def learning_step(self):
        self.optimizer.zero_grad()
        batch = self.memory.get_batch()
        # implement pseudo-Double-DQN
        # target-network q-values |Q|(s', a') for all possible actions in curr_state
        with torch.no_grad():
            tqvals_curr = self.target_model(batch["curr_state"])
        # online-network q-values Q(s', a'), used only to pick the argmax action
        qvals_curr = self.model(batch["curr_state"])
        argmax_qval_curr = torch.argmax(qvals_curr.detach(), dim=1)
        # Q(s, a)
        qvals_prev = self.model(batch["prev_state"])  # shape = (b, 8)
        # don't touch actions that weren't activated
        target = qvals_prev.detach().clone()
        for i, prev_a in enumerate(batch["prev_action"]):
            argmax_curr = argmax_qval_curr[i]
            target[i][prev_a] = batch["reward"][i] + \
                self.discount_factor * tqvals_curr[i][argmax_curr]
        loss = self.loss_func(qvals_prev, target)
        self._loss = float(loss)
        loss.backward()
        if batch["reward"][0] > 0:
            self.positive_step_counter += 1
        self.optimizer.step()
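# PAgent and DQAgent both depend on a MemoryBuffer whose get_batch() returns a dict with
# "prev_state", "prev_action", "reward" and "curr_state" entries, and both check
# len(self.memory.buffer) >= self.memory.size before learning. The sketch below shows one
# way such a buffer could look; the bias_prob handling (favouring rewarded transitions),
# the replace flag, and the torch.cat-based batching are assumptions about the original
# implementation, not a copy of it.
import random

import torch


class AgentMemoryBufferSketch:
    def __init__(self, size, batch_size, bias_prob=None, replace=True):
        self.size = size
        self.batch_size = batch_size
        self.bias_prob = bias_prob
        self.replace = replace
        self.buffer = []

    def add(self, prev_state, action, reward, curr_state):
        # keep at most `size` transitions, dropping the oldest first
        self.buffer.append((prev_state, action, reward, curr_state))
        if len(self.buffer) > self.size:
            self.buffer.pop(0)

    def get_batch(self):
        # optionally bias sampling towards transitions with positive reward
        if self.bias_prob is not None and random.random() < self.bias_prob:
            pool = [t for t in self.buffer if t[2] > 0] or self.buffer
        else:
            pool = self.buffer
        k = min(self.batch_size, len(pool))
        picks = random.choices(pool, k=k) if self.replace else random.sample(pool, k)
        prev_states, actions, rewards, curr_states = zip(*picks)
        return {
            "prev_state": torch.cat(prev_states, dim=0),
            "prev_action": list(actions),
            "reward": list(rewards),
            "curr_state": torch.cat(curr_states, dim=0),
        }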