def step(self, action):
    """Advance the wrapped environment by one (frame-repeated) agent step.

    Repeats ``action`` for ``self.action_repeat`` raw frames, accumulating a
    shaped reward, then slides the newest grayscale frame into the frame
    stack and returns the stack as a uint8 image.

    Args:
        action: Action forwarded unchanged to ``self.env.step``.

    Returns:
        Tuple ``(obs, total_reward, done, die)`` where ``obs`` is a
        ``uint8`` array of shape (H, W, img_stack), ``total_reward`` is the
        summed shaped reward over the repeated frames, ``done`` flags
        episode termination (no recent reward, or ``die``), and ``die`` is
        the raw terminal flag from the underlying env.
    """
    total_reward = 0
    done = False  # guard: keep `done` defined even if action_repeat == 0
    for _ in range(self.action_repeat):
        img_rgb, reward, die, _ = self.env.step(action)
        # Cancel the env's large negative reward on the "die" state so
        # that state is not over-penalized.
        if die:
            reward += 100
        # Penalize frames dominated by green, i.e. the car is off track.
        # NOTE(review): 185.0 threshold assumes uint8 RGB frames — confirm.
        if np.mean(img_rgb[:, :, 1]) > 185.0:
            reward -= 0.05
        total_reward += reward
        # End the episode early when the running reward average
        # (self.av_r) shows no recent progress.
        done = bool(self.av_r(reward) <= -0.1)
        if done or die:
            break
    img_gray = self.rgb2gray(img_rgb)
    if self.resize:
        img_gray = rsz(img_gray, (64, 64))
    # Slide the frame stack: drop the oldest frame, append the newest.
    self.stack.pop(0)
    self.stack.append(img_gray)
    assert len(self.stack) == self.img_stack
    if die:
        done = True
    out_img_stack = np.array(self.stack).astype(np.float64)
    # Scale to the full 0-255 range. Guard against an all-zero stack,
    # which would otherwise divide by zero and cast NaNs to uint8.
    peak = out_img_stack.max()
    if peak > 0:
        out_img_stack = (out_img_stack / peak) * 255
    out_img_stack = out_img_stack.astype(np.uint8).transpose(1, 2, 0)
    return out_img_stack, total_reward, done, die
def reset(self):
    """Reset the wrapped env and return the initial stacked observation.

    Resets per-episode bookkeeping (step counter, reward-memory average,
    die flag), grabs the first frame, and fills the frame stack with
    copies of it so the agent sees a full stack from step one.

    Returns:
        ``uint8`` array of shape (H, W, img_stack).
    """
    self.counter = 0
    self.av_r = self.reward_memory()
    self.die = False
    img_rgb = self.env.reset()
    img_gray = self.rgb2gray(img_rgb)
    if self.resize:
        img_gray = rsz(img_gray, (64, 64))
    # Seed the stack with img_stack references to the same first frame
    # (safe: frames are replaced on step(), never mutated in place).
    self.stack = [img_gray] * self.img_stack
    out_img_stack = np.array(self.stack).astype(np.float64)
    # Scale to the full 0-255 range. Guard against an all-zero first
    # frame, which would otherwise divide by zero and cast NaNs to uint8.
    peak = out_img_stack.max()
    if peak > 0:
        out_img_stack = (out_img_stack / peak) * 255
    out_img_stack = out_img_stack.astype(np.uint8).transpose(1, 2, 0)
    return out_img_stack