# These methods rely on module-level imports of numpy, random, time, logging,
# copy and torch, plus project helpers (stop_motion, DeadException,
# inverse_priority_sample, iterative_avg) and the DIST, BLOCK_TYPE and HEIGHT
# indices defined elsewhere in the project.
def _random_turn(self):
    """ Apply a random turn and pitch, then stop motion. """
    turn = numpy.random.random() * random.choice([-1, 1])
    pitch = numpy.random.random() * random.choice([-0.5, 0.5])
    self.act(["turn {0}".format(turn)])
    self.act(["pitch {0}".format(pitch)])
    time.sleep(0.5)
    stop_motion(self.mc)
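
# Illustrative helper, not part of the original code: every run_episode
# variant below applies the same multiplicative epsilon-greedy decay,
# clamped from below at eps_end. Factored out, it would look like:
def _decay_epsilon(eps, eps_decay=0.99, eps_end=0.05):
    """ Return the exploration rate after one decay step. """
    return max(eps * eps_decay, eps_end)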

def run_episode(self):
    """ Deep Q-Learning episode """
    self.agent.clear_state()
    mc = self.mc
    # apply random turn and pitch
    self._random_turn()
    logging.debug('memory: %i', self.agent.memory.position)
    self.agent.train()
    max_t = 60
    eps_start = self.eps
    eps_end = 0.05
    eps_decay = 0.99
    eps = eps_start
    total_reward = 0
    t = 0
    # pitch, yaw, xpos, ypos, zpos
    prev_pos = None
    prev_target_dist = None
    prev_life = 20
    solved = False
    mean_loss = numpy.mean(
        [self.learn(self.agent, self.optimizer) for _ in range(5)])
    logging.info('loss %f', mean_loss)
    while True:
        t += 1
        reward = 0
        try:
            data = self.collect_state()
            new_pos = data['position']
            target = self.is_tree_visible()
        except DeadException:
            stop_motion(mc)
            # the agent should not die in this mission,
            # so don't add this event to the replay buffer
            reward = 0
            logging.warning("died at step %i", t)
            break
        if prev_pos is None:
            prev_pos = new_pos
        else:
            # use only the dist change for now
            life = mc.getLife()
            logging.debug('current life %f', life)
            if life == 0:
                # the agent should not die in this mission
                continue
            reward += (life - prev_life) * 2
            prev_life = life
        if target is not None:
            if target[0] == 'log':
                reward = 100
                self.agent.push_final(reward)
                logging.debug('solved in %i steps', t)
                mc.sendCommand("quit")
                solved = True
                break
            elif target[0] == 'leaves':
                reward = -0.05
        if reward == 0:
            reward -= 1
        data['prev_pos'] = prev_pos
        logging.debug("current reward %f", reward)
        new_actions = self.agent(data, reward=reward, epsilon=eps)
        eps = max(eps * eps_decay, eps_end)
        logging.debug('epsilon %f', eps)
        self.act(new_actions)
        time.sleep(0.4)
        stop_motion(mc)
        time.sleep(0.1)
        prev_pos = new_pos
        if t == max_t:
            logging.debug("too long")
            stop_motion(mc)
            reward = -10
            self.agent.push_final(-10)
            self.mc.sendCommand("quit")
            self.learn(self.agent, self.optimizer)
            break
        total_reward += reward
    # in the terminal state the reward is not added due to the loop breaking
    total_reward += reward
    logging.info("Final reward: %f", reward)
    return total_reward, t, solved

def run_episode(self):
    """ Deep Q-Learning episode """
    self.agent.clear_state()
    max_t = 250
    self._random_turn()
    mc = self.mc
    logging.debug('memory: %i', self.agent.memory.position)
    self.agent.train()
    eps_start = self.eps
    eps_end = 0.05
    eps_decay = 0.999
    eps = eps_start
    total_reward = 0
    t = 0
    state = self.collect_state()
    if state['ypos'] < 0:
        raise DeadException('started in a pit')
    # pitch, yaw, xpos, ypos, zpos
    prev_pos = None
    prev_target_dist = None
    prev_life = 20
    solved = False
    while True:
        t += 1
        logging.debug('\n\n\nstep %i', t)
        # target = search4blocks(mc, ['lapis_block'], run=False)
        reward = 0
        try:
            data = self.collect_state()
        except DeadException:
            stop_motion(mc)
            self.agent.push_final(-100)
            reward = -100
            logging.debug("failed at step %i", t)
            self.learn(self.agent, self.optimizer)
            break
        if self.state_queue:
            life = mc.getLife()
            logging.debug('current life %f', life)
            if life == 0:
                reward = -100
                stop_motion(mc)
                if t > 2:
                    self.agent.push_final(reward)
                    self.learn(self.agent, self.optimizer)
                break
            if 'visible' in self.state_queue[-1]:
                prev_action = self.agent.policy_net.actions[0].to_string(
                    self.state_queue[-1]['action'])
                prev_item = self.state_queue[-1]['visible']
                logging.debug(prev_item)
                prev_dist = prev_item[DIST]
                prev_block = prev_item[BLOCK_TYPE]
                if prev_block in ('water', 'lava', 'flowing_lava'):
                    reward -= 2
                if prev_action == 'attack 1':
                    h_target = 24
                    if prev_dist <= 4:
                        reward += 0.5
                    else:
                        reward -= 1
                    if 'visible' in data:
                        current_dist = data['visible'][-1]
                        # if the block was removed, 'visible' would change
                        if 0.1 < (current_dist - prev_dist):
                            logging.debug('distance is more than before!')
                            reward += 1
                            if prev_block in ('double_plant', 'tallgrass'):
                                reward -= 0.5
                            tmp = ((30 - h_target)
                                   - abs(prev_item[HEIGHT] - h_target))
                            logging.info('tmp dist %f', tmp)
                            tmp = max(tmp, 0) ** 2
                            if h_target < prev_item[HEIGHT]:
                                tmp /= 3
                            reward += tmp
                            if prev_block not in ('dirt', 'grass', 'stone',
                                                  'double_plant', 'tallgrass',
                                                  'leaves', 'log'):
                                reward += 25
                        else:
                            # give a small reward for removing the block
                            # under self
                            prev_height = 30 - self.state_queue[-1]['ypos']
                            curr_height = 30 - data['ypos']
                            if curr_height < prev_height:
                                tmp = ((30 - h_target)
                                       - abs(prev_height - h_target))
                                logging.debug('removed block!')
                                logging.debug('tmp dist %f', tmp)
                                reward += max(tmp, 0) ** 2 / 2
                else:
                    if 'visible' not in data:
                        logging.debug('not visible')
                        reward -= 1
            reward -= 1
            reward += (life - prev_life) * 2
            prev_life = life
        if not mc.is_mission_running():
            logging.debug('failed in %i steps', t)
            reward = -100
        logging.debug("current reward %f", reward)
        new_actions = self.agent(data, reward=reward, epsilon=eps)
        eps = max(eps * eps_decay, eps_end)
        logging.debug('epsilon %f', eps)
        data['action'] = self.agent.prev_action
        self.state_queue.append(copy.copy(data))
        if 'visible' in data:
            data.pop('visible')
        self.act(new_actions)
        time.sleep(0.4)
        stop_motion(mc)
        time.sleep(0.1)
        if t == max_t:
            logging.debug("too long")
            stop_motion(mc)
            self.agent.push_final(reward)
            self.mc.sendCommand("quit")
            self.learn(self.agent, self.optimizer)
            break
        total_reward += reward
    # in the terminal state the reward is not added due to the loop breaking
    total_reward += reward
    aPos = self.mc.getAgentPos()
    if aPos is not None and aPos[1] <= 25:
        solved = True
    logging.debug("Final reward: %f", reward)
    self._end()
    return total_reward, t, solved
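
# Worked example of the height-shaping bonus above, assuming HEIGHT indexes
# the visible block's height: with h_target = 24 the bonus is
# max((30 - 24) - |height - 24|, 0) ** 2 = max(6 - |height - 24|, 0) ** 2,
# peaking at 36 for a block exactly at the target height, and divided by 3
# when the block sits above it. E.g. height 26 gives max(6 - 2, 0) ** 2 = 16,
# then 16 / 3 ~= 5.3 since h_target = 24 < 26.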

def run_episode(self):
    """ Deep Q-Learning episode """
    self.agent.clear_state()
    mc = self.mc
    logging.debug('memory: %i', self.agent.memory.position)
    self.agent.train()
    max_t = 80
    eps_start = self.eps
    eps_end = 0.05
    eps_decay = 0.99
    eps = eps_start
    total_reward = 0
    t = 0
    # pitch, yaw, xpos, ypos, zpos
    prev_pos = None
    prev_target_dist = None
    prev_life = 20
    solved = False
    mean_loss = numpy.mean(
        [self.learn(self.agent, self.optimizer) for _ in range(5)])
    logging.info('loss %f', mean_loss)
    while True:
        t += 1
        # target = search4blocks(mc, ['lapis_block'], run=False)
        reward = 0
        try:
            data = self.collect_state()
            target_enc = data['target']
            new_pos = data['pos']
        except DeadException:
            stop_motion(mc)
            self.agent.push_final(-100)
            reward = -100
            logging.debug("failed at step %i", t)
            self.learn(self.agent, self.optimizer)
            break
        if prev_pos is None:
            prev_pos = new_pos
        else:
            # use only the dist change for now
            life = mc.getLife()
            logging.debug('current life %f', life)
            if life == 0:
                reward = -100
                stop_motion(mc)
                self.agent.push_final(reward)
                self.learn(self.agent, self.optimizer)
                break
            reward += ((prev_target_dist - target_enc)[2]
                       + (life - prev_life) * 2)
            prev_life = life
            grid = mc.getNearGrid()
        if target_enc[2] < 0.53:
            reward = 100
            self.agent.push_final(reward)
            logging.debug('solved in %i steps', t)
            mc.sendCommand("quit")
            solved = True
            break
        if not mc.is_mission_running():
            logging.debug('failed in %i steps', t)
            reward = -100
            self.agent.push_final(reward)
            break
        if reward == 0:
            reward -= 2
        logging.debug("current reward %f", reward)
        new_actions = self.agent(data, reward=reward, epsilon=eps)
        eps = max(eps * eps_decay, eps_end)
        logging.debug('epsilon %f', eps)
        self.act(new_actions)
        time.sleep(0.4)
        stop_motion(mc)
        time.sleep(0.1)
        prev_pos = new_pos
        prev_target_dist = target_enc
        if t == max_t:
            logging.debug("too long")
            stop_motion(mc)
            reward = -10
            self.agent.push_final(-10)
            self.mc.sendCommand("quit")
            self.learn(self.agent, self.optimizer)
            break
        total_reward += reward
    # in the terminal state the reward is not added due to the loop breaking
    total_reward += reward
    logging.debug("Final reward: %f", reward)
    return total_reward, t, solved

def run_episode(self):
    """ Deep Q-Learning episode """
    self.agent.clear_state()
    start = self._start()
    end = self.end
    max_t = 3000
    self._random_turn()
    logging.info('current target (%i, %i)', self.target_x, self.target_y)
    mc = self.mc
    logging.debug('memory: %i', self.agent.memory.position)
    self.agent.train()
    logging.info('max dist %i', max_t)
    eps_start = self.eps
    eps_end = 0.05
    eps_decay = 0.999
    eps = eps_start
    total_reward = 0
    t = 0
    # pitch, yaw, xpos, ypos, zpos
    prev_pos = None
    prev_target_dist = None
    prev_life = 20
    solved = False
    while True:
        t += 1
        # target = search4blocks(mc, ['lapis_block'], run=False)
        reward = 0
        try:
            data = self.collect_state()
        except DeadException:
            stop_motion(mc)
            self.agent.push_final(-100)
            reward = -100
            logging.debug("failed at step %i", t)
            self.learn(self.agent, self.optimizer)
            break
        if self.state_queue:
            life = mc.getLife()
            logging.debug('current life %f', life)
            if life == 0:
                reward = -100
                stop_motion(mc)
                if t > 2:
                    self.agent.push_final(reward)
                    self.learn(self.agent, self.optimizer)
                break
            logging.debug('distance %f', data['dist'])
            prev_target_dist = self.state_queue[-1]['dist']
            dist_diff = (prev_target_dist - data['dist'])
            if dist_diff > 1:
                reward += 1
            if dist_diff < 0:
                reward -= 1
            reward += dist_diff + (life - prev_life) * 2
            prev_life = life
            grid = mc.getNearGrid()
        if not mc.is_mission_running():
            logging.debug('failed in %i steps', t)
            reward = -100
        if data['dist'] < 0.88:
            time.sleep(1)
            mc.observeProc()
            life = mc.getLife()
            mc.sendCommand("quit")
            if life == prev_life:
                reward += 25
            self.agent.push_final(reward)
            logging.debug('solved in %i steps', t)
            solved = True
            break
        if reward == 0:
            reward -= 0.5
        if 'visible' in data:
            d = data['visible'][-1]
            if d < 1:
                logging.debug('visible {0}'.format(d))
                reward -= 3
        logging.debug("current reward %f", reward)
        new_actions = self.agent(data, reward=reward, epsilon=eps)
        eps = max(eps * eps_decay, eps_end)
        logging.debug('epsilon %f', eps)
        data['action'] = self.agent.prev_action
        if 'visible' in data:
            data.pop('visible')
        self.state_queue.append(data)
        self.act(new_actions)
        time.sleep(0.4)
        stop_motion(mc)
        time.sleep(0.1)
        if t == max_t or total_reward < -200:
            reward -= 1
            logging.debug("too long")
            stop_motion(mc)
            self.agent.push_final(reward)
            self.mc.sendCommand("quit")
            self.learn(self.agent, self.optimizer)
            break
        total_reward += reward
    # in the terminal state the reward is not added due to the loop breaking
    total_reward += reward
    logging.debug("Final reward: %f", reward)
    self._end(start, end, solved, t, total_reward)
    return total_reward, t, solved

def run_episode(self):
    """ Deep Q-Learning episode """
    self.agent.clear_state()
    mc = self.mc
    # apply random turn and pitch
    self._random_turn()
    logging.debug('memory: %i', self.agent.memory.position)
    self.agent.train()
    max_t = 80
    eps_start = self.eps
    eps_end = 0.05
    eps_decay = 0.99
    eps = eps_start
    total_reward = 0
    t = 0
    # pitch, yaw, xpos, ypos, zpos
    prev_pos = None
    prev_target_dist = None
    prev_life = 20
    solved = False
    while True:
        t += 1
        reward = -0.2
        try:
            data = self.collect_state()
            target = self.is_tree_visible()
        except DeadException:
            stop_motion(mc)
            # the agent should not die in this mission,
            # so don't add this event to the replay buffer
            reward = 0
            logging.warning("died at step %i", t)
            break
        if self.state_queue:
            # use only the dist change for now
            life = mc.getLife()
            logging.debug('current life %f', life)
            if life == 0:
                # the agent should not die in this mission
                break
            solved, reward = self.is_solved(target)
            if solved:
                logging.info('solved in %i steps', t)
                self.agent.push_final(reward)
                self.mc.sendCommand("quit")
                break
            else:
                r = reward
                if reward > 0:
                    for item in self.state_queue:
                        if 'reward' in item and item['reward'] > 0:
                            r += item['reward']
                if r >= 3:
                    solved = True
                    logging.info('solved in %i steps', t)
                    reward = 45
                    self.agent.push_final(reward)
                    self.mc.sendCommand("quit")
                    break
        data['reward'] = torch.as_tensor(reward)
        # if 'stop' in new_actions:
        #     # either it solved, or the tree is blocked
        #     if not solved:
        #         mc.sendCommand('move 1')
        #         time.sleep(5)
        #         stop_motion(mc)
        #         self.collect_state()
        #         target = self.is_tree_visible()
        #         solved, reward = self.is_solved(target)
        #         if solved:
        #             logging.debug('solved in %i steps', t)
        #             reward += 5
        #         else:
        #             logging.debug('actually not solved!')
        #             reward -= 2
        #         self.mc.sendCommand("quit")
        #         self.agent.push_final(reward)
        #         break
        # elif solved:
        #     logging.debug('solved but not signaling about that')
        #     reward -= 2
        #     self.mc.sendCommand("quit")
        #     self.agent.push_final(reward)
        #     break
        logging.debug('reward %f', reward)
        new_actions = self.agent(data, reward=reward, epsilon=eps)
        data['action'] = self.agent.prev_action
        eps = max(eps * eps_decay, eps_end)
        logging.debug('epsilon %f', eps)
        self.state_queue.append(data)
        if t == max_t and reward <= 0:
            reward -= 1
            logging.debug("too long")
            stop_motion(mc)
            self.agent.push_final(reward)
            self.mc.sendCommand("quit")
            self.learn(self.agent, self.optimizer)
            break
        self.act(new_actions)
        time.sleep(0.4)
        stop_motion(mc)
        time.sleep(0.1)
        total_reward += reward
    # in the terminal state the reward is not added due to the loop breaking
    total_reward += reward
    logging.info("Final reward: %f", reward)
    mean_loss = numpy.mean(
        [self.learn(self.agent, self.optimizer) for _ in range(3)])
    logging.info('loss %f', mean_loss)
    return total_reward, t, solved

def run_episode(self):
    """ Deep Q-Learning episode """
    from_queue = False
    self.agent.clear_state()
    if random.random() < 0.4 and (self.failed_queue or self.episode_stats):
        self.mc.sendCommand("quit")
        time.sleep(1)
        if self.failed_queue:
            from_queue = True
            start, end = self.failed_queue.pop()
            x, y = end
            start_x = x + random.choice(numpy.arange(-20, 20))
            start_y = y + random.choice(numpy.arange(-20, 20))
            logging.info('evaluating from queue {0}, {1}'.format(start, end))
        else:
            pairs = list(self.episode_stats.items())
            r = numpy.asarray([p[1][0] for p in pairs])
            idx = inverse_priority_sample(r)
            logging.debug('priority sample idx=%i', idx)
            start, end = pairs[idx][0]
            logging.info('evaluating from stats {0}, {1}'.format(start, end))
            start_x, start_y = start
        # start somewhere near the end point
        self.mc = self.init_mission(0, self.mc,
                                    start_x=start_x, start_y=start_y)
        self.mc.safeStart()
        self.target_x, self.target_y = end
    else:
        self.mc.observeProc()
        aPos = self.mc.getAgentPos()
        while aPos is None:
            time.sleep(0.05)
            self.mc.observeProc()
            aPos = self.mc.getAgentPos()
        XPos, _, YPos = aPos[:3]
        self.target_x = XPos + random.choice(
            numpy.arange(-self.dist, self.dist))
        self.target_y = YPos + random.choice(
            numpy.arange(-self.dist, self.dist))
        start = (XPos, YPos)
        end = self.target_x, self.target_y
    logging.info('current target (%i, %i)', self.target_x, self.target_y)
    mc = self.mc
    logging.debug('memory: %i', self.agent.memory.position)
    self.agent.train()
    max_t = self.dist * 4
    eps_start = self.eps
    eps_end = 0.05
    eps_decay = 0.9999
    eps = eps_start
    total_reward = 0
    t = 0
    # pitch, yaw, xpos, ypos, zpos
    prev_pos = None
    prev_target_dist = None
    prev_life = 20
    solved = False
    mean_loss = numpy.mean(
        [self.learn(self.agent, self.optimizer) for _ in range(10)])
    logging.info('loss %f', mean_loss)
    while True:
        t += 1
        # target = search4blocks(mc, ['lapis_block'], run=False)
        reward = 0
        try:
            data = self.collect_state()
            target_enc = data['target']
            new_pos = data['pos']
        except DeadException:
            stop_motion(mc)
            self.agent.push_final(-100)
            reward = -100
            logging.debug("failed at step %i", t)
            self.learn(self.agent, self.optimizer)
            break
        if prev_pos is None:
            prev_pos = new_pos
        else:
            # use only the dist change for now
            life = mc.getLife()
            logging.debug('current life %f', life)
            if life == 0:
                reward = -100
                stop_motion(mc)
                if t > 2:
                    self.agent.push_final(reward)
                    self.learn(self.agent, self.optimizer)
                break
            logging.debug('distance %f', target_enc[2])
            reward += ((prev_target_dist - target_enc)[2]
                       + (life - prev_life) * 2)
            prev_life = life
            grid = mc.getNearGrid()
        if target_enc[2] < 0.58:
            time.sleep(1)
            mc.observeProc()
            life = mc.getLife()
            mc.sendCommand("quit")
            if life == prev_life:
                self.agent.push_final(reward)
                logging.debug('solved in %i steps', t)
                solved = True
            break
        if not mc.is_mission_running():
            logging.debug('failed in %i steps', t)
            reward = -100
        if reward == 0:
            reward -= 0.5
        logging.debug("current reward %f", reward)
        data['prev_pos'] = prev_pos
        new_actions = self.agent(data, reward=reward, epsilon=eps)
        eps = max(eps * eps_decay, eps_end)
        logging.debug('epsilon %f', eps)
        self.act(new_actions)
        time.sleep(0.4)
        stop_motion(mc)
        time.sleep(0.1)
        prev_pos = new_pos
        prev_target_dist = target_enc
        if t == max_t:
            logging.debug("too long")
            stop_motion(mc)
            self.agent.push_final(reward)
            self.mc.sendCommand("quit")
            self.learn(self.agent, self.optimizer)
            break
        total_reward += reward
    # in the terminal state the reward is not added due to the loop breaking
    total_reward += reward
    logging.debug("Final reward: %f", reward)
    if from_queue:
        # if the episode failed, check its length: if 15 < t, start from a
        # different point near the target; if it succeeded, add it to
        # episode_stats
        if not solved:
            # failed
            pass
        else:
            logging.info('from queue run succeeded')
            self.episode_stats[(start, end)] = [0, 0]
    else:
        if (start, end) in self.episode_stats:
            r, l = self.episode_stats[(start, end)]
            r = iterative_avg(r, total_reward)
            l = iterative_avg(l, t)
            self.episode_stats[(start, end)] = (r, l)
            logging.info('new episode stats reward {0} length {1}'.format(
                r, l))
        elif 10 < t and not solved:
            self.failed_queue.append((start, end))
            logging.info('adding to failed queue {0}, {1}'.format(
                start, end))
    return total_reward, t, solved
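
# Illustrative usage, assumed rather than taken from the original source: an
# outer training loop would drive any of these run_episode variants roughly as
#
#     for episode in range(num_episodes):
#         total_reward, steps, solved = trainer.run_episode()
#         logging.info('episode %i: reward %f steps %i solved %s',
#                      episode, total_reward, steps, solved)
#
# where trainer is a hypothetical instance of the class defining these methods.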