def env_step(self, action):
    """Advance one step and return a constant-reward observation.

    Episodes whose ``self.whichEpisode`` is even report 50000-element
    arrays and terminate at step 200; all other episodes report
    5-element arrays and terminate at step 5000.

    Args:
        action: Not read by this environment.

    Returns:
        Reward_observation_terminal with ``r == 1.0``, ``o`` set to the
        shared ``self.o`` observation and ``terminal`` set to 0 or 1.
    """
    self.stepCount = self.stepCount + 1

    if self.whichEpisode % 2 == 0:
        array_size, final_step = 50000, 200
    else:
        array_size, final_step = 5, 5000

    # cheating, might break something: the double array is filled with ints.
    # list(...) guarantees real lists on Python 3, where range() is lazy.
    self.o.intArray = list(range(0, array_size))
    self.o.doubleArray = list(range(0, array_size))

    ro = Reward_observation_terminal()
    ro.r = 1.0
    ro.o = self.o
    ro.terminal = 1 if self.stepCount == final_step else 0
    return ro
def env_step(self, action):
    """Return a scripted observation that depends on the step counter.

    While ``self.stepCount`` is below 5 the observation carries only the
    current step number, the counter is incremented, and the reward is
    1.0; the step that brings the counter to 5 is terminal.  From then on
    a fixed set of boundary values is returned with reward -2.0.

    Args:
        action: Not read by this environment.

    Returns:
        Reward_observation_terminal wrapping the shared ``self.o``.
    """
    result = Reward_observation_terminal()
    finished = False

    if self.stepCount < 5:
        self.o.doubleArray = []
        self.o.charArray = []
        self.o.intArray = [self.stepCount]
        self.stepCount += 1
        finished = self.stepCount == 5
        result.r = 1.0
    else:
        # Fixed payload: small/large doubles, assorted chars, 32-bit int limits.
        self.o.doubleArray = [
            0.0078125, -0.0078125, 0.0, 0.0078125e150, -0.0078125e150,
        ]
        self.o.charArray = ['g', 'F', '?', ' ', '&']
        self.o.intArray = [173, -173, 2147483647, 0, -2147483648]
        result.r = -2.0

    result.o = self.o
    result.terminal = finished
    return result
def env_step(self, action):
    """Advance one step and return a constant-reward observation.

    Even values of ``self.whichEpisode`` use 50000-element arrays and end
    at step 200; odd values use 5-element arrays and end at step 5000.

    Args:
        action: Not read by this environment.

    Returns:
        Reward_observation_terminal with reward 1.0 and the shared
        ``self.o`` observation.
    """
    self.stepCount += 1

    short_arrays = self.whichEpisode % 2 != 0
    size = 5 if short_arrays else 50000
    last_step = 5000 if short_arrays else 200

    # cheating, might break something
    self.o.intArray = list(range(0, size))
    self.o.doubleArray = list(range(0, size))

    result = Reward_observation_terminal()
    result.r = 1.0
    result.o = self.o
    result.terminal = int(self.stepCount == last_step)
    return result
def step(self):
    """Build the reward/observation/terminal triple for the current step.

    The observation always comes from ``self.observe()``.  Reward and
    termination are computed from ``self.latestObs`` when the matching
    ``problemSpec`` entry has type ``"glue"``; otherwise the cached
    ``self.latestReward`` / ``self.latestTermination`` values are used.

    Returns:
        The populated Reward_observation_terminal.
    """
    result = Reward_observation_terminal()
    result.o = self.observe()

    reward_from_glue = self.problemSpec["reward"]["type"] == "glue"
    result.r = (
        get_reward(self.latestObs) if reward_from_glue else self.latestReward
    )

    terminal_from_glue = self.problemSpec["termination"]["type"] == "glue"
    result.terminal = (
        check_termination_conditions(self.latestObs)
        if terminal_from_glue
        else self.latestTermination
    )
    return result
def env_step(self, thisAction):
    """Move one position along a 0..20 line and report the outcome.

    Action 0 decrements ``self.currentState`` and action 1 increments it;
    any other value leaves it unchanged.  Reaching 0 or below ends the
    episode with reward -1, reaching 20 or above ends it with reward 1,
    and the state is clamped to that boundary.

    Args:
        thisAction: Action whose ``intArray[0]`` selects the move.

    Returns:
        Reward_observation_terminal whose observation holds the new state.
    """
    move = thisAction.intArray[0]
    if move == 0:
        self.currentState -= 1
    elif move == 1:
        self.currentState += 1

    reward, done = 0, 0
    if self.currentState <= 0:
        self.currentState, reward, done = 0, -1, 1
    elif self.currentState >= 20:
        self.currentState, reward, done = 20, 1, 1

    observation = Observation()
    observation.intArray = [self.currentState]

    result = Reward_observation_terminal()
    result.r = reward
    result.o = observation
    result.terminal = done
    return result
def env_step(self, thisAction):
    """Move the player, score the move, redraw, and return the screen state.

    Args:
        thisAction: Action passed to ``self.player.update``; its
            ``intArray[0]`` is also forwarded to ``self.field.decision``.

    Returns:
        Reward_observation_terminal whose observation is 128 zeros
        followed by the flattened ``self.img_state``.
    """
    # Move the player.
    self.player.update(thisAction)

    # Score the position reached after the move (coordinates are rounded).
    cell_x = int(self.player.x + 0.5)
    cell_y = int(self.player.y + 0.5)
    reward = self.field.decision(cell_x, cell_y, thisAction.intArray[0])
    game_over = self.field.get_gameover()

    # Redraw the field.
    self.draw_field()

    observation = Observation()
    flat_pixels = [value for row in self.img_state for value in row]
    observation.intArray = np.append(np.zeros(128), flat_pixels)

    result = Reward_observation_terminal()
    result.r = reward
    result.o = observation
    result.terminal = game_over
    return result
def env_step(self, thisAction):
    """Apply the agent's action and return reward, state and terminal flag.

    Gaussian noise with scale ``self.reward_noise`` is added to the
    reward when that scale is positive.

    Args:
        thisAction: Action whose ``intArray[0]`` is passed to
            ``self.takeAction``.

    Returns:
        Reward_observation_terminal with the state in ``doubleArray``.
    """
    logger = logging.getLogger('pyrl.environments.gridworld.env_step')
    chosen = thisAction.intArray[0]
    logger.debug("Action to take: %d", chosen)

    reward = self.takeAction(chosen)

    done = 0
    if self.isAtGoal():
        logger.info("Episode completed!!")
        done = 1

    if self.reward_noise > 0:
        reward += numpy.random.normal(scale=self.reward_noise)

    observation = Observation()
    observation.doubleArray = self.getState()

    result = Reward_observation_terminal()
    result.r = reward
    result.o = observation
    result.terminal = done

    logger.info("(Action - State - Reward): (%d - %s - %f)",
                chosen, pformat(observation), reward)
    return result
def step(self):
    """Assemble the observation, reward and terminal flag for this step.

    ``self.observe()`` supplies the observation.  For reward and
    termination, a ``problemSpec`` type of ``'glue'`` means the value is
    derived from ``self.latestObs``; any other type means the stored
    ``self.latestReward`` / ``self.latestTermination`` is used.

    Returns:
        The populated Reward_observation_terminal.
    """
    outcome = Reward_observation_terminal()
    outcome.o = self.observe()

    spec = self.problemSpec
    if spec['reward']['type'] != 'glue':
        outcome.r = self.latestReward
    else:
        outcome.r = get_reward(self.latestObs)

    if spec['termination']['type'] != 'glue':
        outcome.terminal = self.latestTermination
    else:
        outcome.terminal = check_termination_conditions(self.latestObs)

    return outcome
def env_step(self, thisAction):
    """Move the agent and report the new flat state, options and reward.

    Args:
        thisAction: Action whose ``intArray`` must hold exactly one
            integer in the range 0..3.

    Returns:
        Reward_observation_terminal whose observation carries the flat
        state in ``intArray`` and four flags in ``charArray``.

    Raises:
        AssertionError: If the action is malformed or out of range.
    """
    # Make sure the action is valid.  The checks accept 0..3, so the
    # message now states that range (it previously said [0,4]).
    assert len(thisAction.intArray) == 1, "Expected 1 integer action."
    assert thisAction.intArray[0] >= 0, "Expected action to be in [0,3]"
    assert thisAction.intArray[0] < 4, "Expected action to be in [0,3]"

    lastActionValue = thisAction.intArray[0]
    self.updatePosition(lastActionValue)

    theObs = Observation()
    theObs.intArray = [self.calculateFlatState()]
    theObs.charArray = ["T", "T", "T", "T"]
    # NOTE(review): every slot already starts as "T", so this loop does not
    # change the array contents -- confirm the intended initial value.
    for option in self.optionsArray[self.agentRow][self.agentCol]:
        theObs.charArray[2 + option] = "T"

    returnRO = Reward_observation_terminal()
    returnRO.r = self.calculateReward(lastActionValue)
    returnRO.o = theObs
    returnRO.terminal = self.checkCurrentTerminal()
    return returnRO
def env_step(self, thisAction):
    """Step left or right on a 0..20 line and report the result.

    Action 0 moves down by one, action 1 moves up by one.  Hitting the
    lower bound (<= 0) yields reward -1, hitting the upper bound (>= 20)
    yields reward 1; both end the episode and clamp the state.

    Args:
        thisAction: Action whose ``intArray[0]`` selects the direction.

    Returns:
        Reward_observation_terminal with the new state in ``intArray``.
    """
    direction = thisAction.intArray[0]
    if direction == 0:
        self.currentState = self.currentState - 1
    elif direction == 1:
        self.currentState = self.currentState + 1

    if self.currentState <= 0:
        self.currentState = 0
        reward, finished = -1, 1
    elif self.currentState >= 20:
        self.currentState = 20
        reward, finished = 1, 1
    else:
        reward, finished = 0, 0

    obs = Observation()
    obs.intArray = [self.currentState]

    outcome = Reward_observation_terminal()
    outcome.r = reward
    outcome.o = obs
    outcome.terminal = finished
    return outcome
def env_step(self, action):
    """Apply a per-reach action vector and return the resulting cost.

    An action that is not allowable in the current state, or whose cost
    exceeds the budget, leaves the state untouched and yields
    ``self.Bad_Action_Penalty`` with the observation ``[-1]``.  Otherwise
    the state is advanced with ``simulateNextState`` and the reward is
    the negated sum of the action cost and the state cost.

    Args:
        action: Action whose ``intArray`` holds one integer per reach.

    Returns:
        Reward_observation_terminal.  ``terminal`` is never assigned here,
        so it keeps whatever value the constructor gives it.

    Raises:
        AssertionError: If the action length differs from
            ``self.simulationParameterObj.nbrReaches``.
    """
    action = action.intArray
    if len(action) != 3:
        # Debug output; written with % formatting so the text is the same
        # under the Python 2 print statement and the Python 3 function.
        print("%s %s" % (action, len(action)))
    assert len(action) == self.simulationParameterObj.nbrReaches, "Expected " + str(
        self.simulationParameterObj.nbrReaches) + " integer action."

    def penalty_response():
        # Shared reply for disallowed and over-budget actions.
        theObs = Observation()
        # NOTE(review): the result of this call is discarded; it is kept
        # only in case is_action_allowable has side effects -- confirm.
        InvasiveUtility.is_action_allowable(action, self.state)
        theObs.intArray = [-1]
        returnRO = Reward_observation_terminal()
        returnRO.r = self.Bad_Action_Penalty
        returnRO.o = theObs
        return returnRO

    if not InvasiveUtility.is_action_allowable(action, self.state):
        return penalty_response()

    cost_state_unit = InvasiveUtility.get_unit_invaded_reaches(
        self.state,
        self.simulationParameterObj.habitatSize) * self.actionParameterObj.costPerReach
    stateCost = cost_state_unit + InvasiveUtility.get_invaded_reaches(
        self.state) * self.actionParameterObj.costPerTree
    stateCost = stateCost + InvasiveUtility.get_empty_slots(
        self.state) * self.actionParameterObj.emptyCost

    costAction = InvasiveUtility.get_budget_cost_actions(
        action, self.state, self.actionParameterObj)
    if costAction > self.actionParameterObj.budget:
        return penalty_response()

    self.state = simulateNextState(
        self.state, action, self.simulationParameterObj,
        self.actionParameterObj, self.dispertionTable, self.germinationObj)

    theObs = Observation()
    theObs.intArray = self.state
    returnRO = Reward_observation_terminal()
    returnRO.r = -1 * (costAction + stateCost)
    returnRO.o = theObs
    return returnRO
def env_step(self, action):
    """Forward the action to the wrapped environment and repackage the result.

    Args:
        action: Action object, converted with ``self.get_action`` before
            being passed to ``self.environment.step``.

    Returns:
        Reward_observation_terminal holding the reward, the observation
        built by ``self.create_observation`` and the terminal flag.
    """
    converted = self.get_action(action)
    next_state, step_reward, is_terminal = self.environment.step(converted)

    result = Reward_observation_terminal()
    result.r = step_reward
    result.o = self.create_observation(next_state)
    result.terminal = is_terminal
    return result
def env_step(self, thisAction):
    """Apply the integer action and return the reward and new state.

    This environment never signals termination (``terminal`` is always 0).

    Args:
        thisAction: Action whose ``intArray[0]`` is cast to ``int`` and
            passed to ``self.takeAction``.

    Returns:
        Reward_observation_terminal with the state in ``intArray``.
    """
    reward = self.takeAction(int(thisAction.intArray[0]))

    observation = Observation()
    observation.intArray = self.getState()

    result = Reward_observation_terminal()
    result.r = reward
    result.o = observation
    result.terminal = 0
    return result
def env_step(self, thisAction):
    """Execute one non-terminating step.

    Args:
        thisAction: Action whose first integer (cast with ``int``) is
            handed to ``self.takeAction``.

    Returns:
        Reward_observation_terminal carrying the reward from
        ``takeAction``, the state from ``getState`` and ``terminal == 0``.
    """
    chosen = int(thisAction.intArray[0])
    step_reward = self.takeAction(chosen)

    obs = Observation()
    obs.intArray = self.getState()

    outcome = Reward_observation_terminal()
    outcome.o = obs
    outcome.r = step_reward
    outcome.terminal = 0
    return outcome
def env_step(self, thisAction):
    """Play one move and report the observation, reward and game-over flag.

    Args:
        thisAction: Action whose ``intArray[0]`` is passed to
            ``self.takeAction``.

    Returns:
        Reward_observation_terminal using the observation and reward
        returned by ``takeAction`` and ``mdptetris.isgameover()`` as the
        terminal flag.
    """
    observation, reward = self.takeAction(thisAction.intArray[0])

    result = Reward_observation_terminal()
    result.r = reward
    result.o = observation
    result.terminal = mdptetris.isgameover()
    return result
def env_step(self, thisAction):
    """Apply the chosen move and wrap what ``takeAction`` returns.

    Args:
        thisAction: Action whose first integer selects the move.

    Returns:
        Reward_observation_terminal; ``terminal`` is whatever
        ``mdptetris.isgameover()`` reports after the move.
    """
    move = thisAction.intArray[0]
    step_result = self.takeAction(move)
    new_obs, step_reward = step_result

    outcome = Reward_observation_terminal()
    outcome.r = step_reward
    outcome.o = new_obs
    outcome.terminal = mdptetris.isgameover()
    return outcome
def env_step(self, thisAction):
    """Apply the action and return reward, state and terminal flag.

    Args:
        thisAction: Action whose ``intArray[0]`` is passed to
            ``self.takeAction``, which returns ``(reward, episode_over)``.

    Returns:
        Reward_observation_terminal with ``self.state.tolist()`` in
        ``doubleArray`` and the terminal flag cast to ``int``.
    """
    reward, finished = self.takeAction(thisAction.intArray[0])

    observation = Observation()
    observation.doubleArray = self.state.tolist()

    result = Reward_observation_terminal()
    result.r = reward
    result.o = observation
    result.terminal = int(finished)
    return result
def env_step(self, thisAction):
    """Move the agent by a continuous action and report its position.

    Args:
        thisAction: Action whose ``doubleArray[0]`` is passed to
            ``self.updatePosition``.

    Returns:
        Reward_observation_terminal with ``[agentRow, agentCol]`` as the
        observation; the reward is computed from the value that
        ``updatePosition`` returned.
    """
    boundary_hit = self.updatePosition(thisAction.doubleArray[0])

    position = Observation()
    position.doubleArray = [self.agentRow, self.agentCol]

    result = Reward_observation_terminal()
    result.r = self.calculateReward(boundary_hit)
    result.o = position
    result.terminal = self.checkCurrentTerminal()
    return result
def env_step(self, thisAction):
    """Apply the action and return a single-value observation.

    This environment never signals termination.

    Args:
        thisAction: Action whose ``intArray[0]`` is passed to
            ``self.takeAction``, which returns ``(obs, reward)``.

    Returns:
        Reward_observation_terminal with ``[obs]`` in ``doubleArray``
        and ``terminal == 0``.
    """
    value, reward = self.takeAction(thisAction.intArray[0])

    observation = Observation()
    observation.doubleArray = [value]

    result = Reward_observation_terminal()
    result.r = reward
    result.o = observation
    result.terminal = 0
    return result
def env_step(self, thisAction):
    """Execute one non-terminating step.

    Args:
        thisAction: Action whose first integer is given to
            ``self.takeAction``.

    Returns:
        Reward_observation_terminal whose observation wraps the single
        value returned by ``takeAction`` in ``doubleArray``; ``terminal``
        is always 0.
    """
    chosen = thisAction.intArray[0]
    step_obs, step_reward = self.takeAction(chosen)

    wrapped = Observation()
    wrapped.doubleArray = [step_obs]

    outcome = Reward_observation_terminal()
    outcome.o = wrapped
    outcome.r = step_reward
    outcome.terminal = 0
    return outcome
def env_step(self, action):
    """Hand the action to the bot, advance the game, and return the screen.

    Args:
        action: Stored on ``self.agent.botAction`` before ``self.step()``
            is called.

    Returns:
        Reward_observation_terminal whose observation is the ``screen``
        surface resized to 84x84 and flattened to a list.  The reward is
        the placeholder value 1 and ``terminal`` is always 0.
    """
    self.agent.botAction = action
    self.step()

    frame = pygame.surfarray.array2d(screen)
    resized = misc.imresize(frame, (84, 84))

    observation = Observation()
    observation.intArray = resized.flatten().tolist()

    result = Reward_observation_terminal()
    result.r = 1  # reward goes here
    result.o = observation
    result.terminal = 0
    return result
def env_step(self, action):
    """Apply a per-reach action vector and return the resulting cost.

    Disallowed or over-budget actions leave the state unchanged and
    return ``self.Bad_Action_Penalty`` with observation ``[-1]``.
    Otherwise the state is advanced via ``simulateNextState`` and the
    reward is ``-1 * (action cost + state cost)``.

    Args:
        action: Action whose ``intArray`` holds one integer per reach.

    Returns:
        Reward_observation_terminal; ``terminal`` is not assigned here.

    Raises:
        AssertionError: If the action length differs from
            ``self.simulationParameterObj.nbrReaches``.
    """
    action = action.intArray
    sim_params = self.simulationParameterObj
    act_params = self.actionParameterObj
    assert len(action) == sim_params.nbrReaches, "Expected " + str(
        sim_params.nbrReaches) + " integer action."

    def rejected():
        # Reply used for both kinds of invalid action.
        bad_obs = Observation()
        # Result unused in the original as well; the call is retained.
        InvasiveUtility.is_action_allowable(action, self.state)
        bad_obs.intArray = [-1]
        reply = Reward_observation_terminal()
        reply.r = self.Bad_Action_Penalty
        reply.o = bad_obs
        return reply

    if not InvasiveUtility.is_action_allowable(action, self.state):
        return rejected()

    reach_cost = InvasiveUtility.get_unit_invaded_reaches(
        self.state, sim_params.habitatSize) * act_params.costPerReach
    tree_cost = InvasiveUtility.get_invaded_reaches(
        self.state) * act_params.costPerTree
    empty_cost = InvasiveUtility.get_empty_slots(
        self.state) * act_params.emptyCost
    state_cost = reach_cost + tree_cost + empty_cost

    action_cost = InvasiveUtility.get_budget_cost_actions(
        action, self.state, act_params)
    if action_cost > act_params.budget:
        return rejected()

    self.state = simulateNextState(
        self.state, action, sim_params, act_params,
        self.dispertionTable, self.germinationObj)

    observation = Observation()
    observation.intArray = self.state

    reply = Reward_observation_terminal()
    reply.r = -1 * (action_cost + state_cost)
    reply.o = observation
    return reply
def env_step(self, thisAction):
    """Apply a two-component continuous action and return the outcome.

    Args:
        thisAction: Action whose ``doubleArray`` must hold exactly two
            values; the array is passed to ``self.takeAction``.

    Returns:
        Reward_observation_terminal with ``self.getState().tolist()`` as
        the observation and the reward/terminal pair from
        ``self.getReward()`` (terminal cast to ``int``).

    Raises:
        AssertionError: If the action does not contain two doubles.
    """
    # Validate the action.  The check requires 2 values, so the message
    # now says 2 (it previously said 4).
    assert len(thisAction.doubleArray) == 2, "Expected 2 double actions."

    self.takeAction(thisAction.doubleArray)

    theObs = Observation()
    theObs.doubleArray = self.getState().tolist()

    theReward, terminate = self.getReward()

    returnRO = Reward_observation_terminal()
    returnRO.r = theReward
    returnRO.o = theObs
    returnRO.terminal = int(terminate)
    return returnRO
def env_step(self, thisAction):
    """Take one step driven by a two-value continuous action.

    Args:
        thisAction: Action whose ``doubleArray`` must contain exactly two
            values; it is forwarded unchanged to ``self.takeAction``.

    Returns:
        Reward_observation_terminal built from ``self.getState()`` and
        ``self.getReward()``; the terminal flag is cast to ``int``.

    Raises:
        AssertionError: If ``doubleArray`` does not have length 2.
    """
    # Validate the action.  The message previously claimed 4 values were
    # expected, contradicting the length-2 check.
    assert len(thisAction.doubleArray) == 2, "Expected 2 double actions."

    self.takeAction(thisAction.doubleArray)

    observation = Observation()
    observation.doubleArray = self.getState().tolist()

    reward, terminate = self.getReward()

    result = Reward_observation_terminal()
    result.r = reward
    result.o = observation
    result.terminal = int(terminate)
    return result
def env_step(self, actions):
    """Adopt the agent's board, play the environment's reply, and report.

    The agent's ``intArray`` replaces ``self.state`` wholesale; no
    validation of the move is performed.  If that board is already won or
    full the episode ends with reward 1.  Otherwise ``self.env_play()``
    makes the environment's move; a full board then ends the episode with
    reward 1 and a win ends it with reward 0.

    NOTE(review): the source indentation was lost; the post-move checks
    are assumed to belong to the branch that calls ``env_play`` --
    confirm against the original file.

    Args:
        actions: Action whose ``intArray`` is the complete new board.

    Returns:
        Reward_observation_terminal with the board in ``intArray``.
    """
    reward = 0
    terminal = 0

    # Change our current state to the new board.
    self.state = actions.intArray

    if self.is_victory():
        # The agent made a winning move.
        print("WE LOST")
        reward = 1
        terminal = 1
    elif self.is_full():
        # This was a bare string expression (a no-op); it is now printed
        # like the sibling status messages.
        print("AGENT FILLED")
        reward = 1
        terminal = 1
    else:
        # Otherwise keep on playing.
        print("PLAY")
        self.env_play()
        if self.is_full():
            print("WE FILLED")
            reward = 1
            terminal = 1
        # Checked last so that a win on a full board takes precedence.
        if self.is_victory():
            print("WE WON")
            reward = 0
            terminal = 1

    # Set up the observation object and return it.
    obs = Observation()
    obs.intArray = self.state
    reward_obs = Reward_observation_terminal()
    reward_obs.r = reward
    reward_obs.o = obs
    reward_obs.terminal = terminal
    return reward_obs
def env_step(self, thisAction):
    """Move the agent on the grid and report the new flat state.

    Args:
        thisAction: Action whose ``intArray`` must hold exactly one
            integer in the range 0..3.

    Returns:
        Reward_observation_terminal with the flat state in ``intArray``.

    Raises:
        AssertionError: If the action is malformed or out of range.
    """
    # Make sure the action is valid.
    actions = thisAction.intArray
    assert len(actions) == 1, "Expected 1 integer action."
    assert 0 <= actions[0] < 4, "Expected action to be in [0,3]"

    self.updatePosition(actions[0])

    observation = Observation()
    observation.intArray = [self.calculateFlatState()]

    result = Reward_observation_terminal()
    result.r = self.calculateReward()
    result.o = observation
    result.terminal = self.checkCurrentTerminal()
    return result
def env_step(self, thisAction):
    """Apply the action and end the episode at the goal or when out of fuel.

    Args:
        thisAction: Action whose ``intArray[0]`` is passed to
            ``self.takeAction``.

    Returns:
        Reward_observation_terminal with the observation from
        ``self.makeObservation()``.
    """
    intAction = thisAction.intArray[0]
    theReward = self.takeAction(intAction)

    # Negative fuel ends the episode only when a fuel location is set.
    # The original wrote ``(fuel_loc is not None and fuel) < 0``, which
    # compared a bool to 0 when fuel_loc was None and gave the right
    # answer only by accident; the comparison now sits inside the ``and``.
    out_of_fuel = self.fuel_loc is not None and self.fuel < 0
    episodeOver = 1 if (self.isAtGoal() or out_of_fuel) else 0

    theObs = self.makeObservation()
    returnRO = Reward_observation_terminal()
    returnRO.r = theReward
    returnRO.o = theObs
    returnRO.terminal = episodeOver
    return returnRO
def env_step(self, thisAction):
    """Validate the action, move the agent, and return the step result.

    Args:
        thisAction: Action carrying a single integer in 0..3.

    Returns:
        Reward_observation_terminal whose observation is the one-element
        list ``[self.calculateFlatState()]``.

    Raises:
        AssertionError: If the action is malformed or out of range.
    """
    # Make sure the action is valid.
    assert len(thisAction.intArray) == 1, "Expected 1 integer action."
    move = thisAction.intArray[0]
    assert move >= 0, "Expected action to be in [0,3]"
    assert move < 4, "Expected action to be in [0,3]"

    self.updatePosition(move)

    obs = Observation()
    obs.intArray = [self.calculateFlatState()]

    outcome = Reward_observation_terminal()
    outcome.r = self.calculateReward()
    outcome.o = obs
    outcome.terminal = self.checkCurrentTerminal()
    return outcome
def env_step(self, thisAction):
    """Redraw the game, process pending pygame events, return a blank step.

    The returned reward is always 0, the observation is 50816 zeros and
    ``terminal`` is always 0; ``thisAction`` is not read.

    Args:
        thisAction: Not used by this implementation.

    Returns:
        Reward_observation_terminal with the placeholder values above.
    """
    self.screen.fill((0, 0, 0))
    if self.gameover:
        # NOTE(review): the source shows a space between "%d" and "Press";
        # the original may have had a line break there -- confirm.
        self.center_msg("""Game Over!\nYour score: %d Press space to continue""" % self.score)
    elif self.paused:
        self.center_msg("Paused")
    else:
        pygame.draw.line(self.screen, (255, 255, 255),
                         (self.rlim + 1, 0),
                         (self.rlim + 1, self.height - 1))
        self.disp_msg("Next:", (self.rlim + cell_size, 2))
        self.disp_msg("Score: %d\n\nLevel: %d\nLines: %d"
                      % (self.score, self.level, self.lines),
                      (self.rlim + cell_size, cell_size * 5))
        self.draw_matrix(self.bground_grid, (0, 0))
        self.draw_matrix(self.board, (0, 0))
        self.draw_matrix(self.stone, (self.stone_x, self.stone_y))
        self.draw_matrix(self.next_stone, (cols + 1, 2))
    pygame.display.update()

    for event in pygame.event.get():
        if event.type == pygame.USEREVENT + 1:
            self.drop(False)
        elif event.type == pygame.QUIT:
            self.quit()
        elif event.type == pygame.KEYDOWN:
            for key in key_actions:
                # Attribute lookup replaces eval("pygame.K_" + key): same
                # constant, without evaluating a constructed string.
                if event.key == getattr(pygame, "K_" + key):
                    key_actions[key]()

    theObs = Observation()
    theObs.intArray = np.zeros(50816)
    returnRO = Reward_observation_terminal()
    returnRO.r = 0
    returnRO.o = theObs
    returnRO.terminal = 0
    return returnRO
def env_step(self, action):
    """Play the agent's move through ``self.game`` and report the result.

    The flat action index is converted to a ``(row, col)`` pair using
    ``self.n_cols``; an action of -1 is sent as ``(-1, -1)`` (a pass).
    A text dump of the board is appended to ``self.history`` on every
    step, and the whole history is appended to ``history.txt`` whenever
    the reward equals ``self.game.r_lose``.

    Args:
        action: Action whose ``intArray[0]`` is the flat board index
            chosen by the agent, or -1.

    Returns:
        Reward_observation_terminal with the rebuilt map as observation
        and the reward / done flag returned by ``self.game.step``.
    """
    # Position chosen by the agent; (-1, -1) is used for a pass.
    int_action_agent = action.intArray[0]
    if int_action_agent == -1:
        step_raw_col = (-1, -1)
    else:
        step_raw_col = (int_action_agent // self.n_cols,
                        int_action_agent % self.n_cols)

    # Execute the step; the first returned element is not used here.
    _, step_r, step_done = self.game.step(step_raw_col)

    rot = Reward_observation_terminal()

    # Rebuild the map from the game state.
    self.map = self.build_map_from_game()
    observation = Observation()
    observation.intArray = self.map
    rot.o = observation

    # step_r is the reward, step_done tells whether the game has ended.
    rot.r = step_r
    rot.terminal = step_done

    # Text snapshot of the board, one row per line.
    current_map = ''.join(
        ' '.join(map(str, self.map[i:i + self.n_cols])) + '\n'
        for i in range(0, len(self.map), self.n_cols)
    )
    self.history.append(current_map)

    # Record the game when the reward marks a loss.  ``with`` closes the
    # file even if the write raises.
    if rot.r == self.game.r_lose:
        history = '\n'.join(self.history)
        with open('history.txt', 'a') as f:
            f.write('# START\n' + history + '# END\n\n')

    # If the game is decided the agent's agent_end follows; otherwise
    # its agent_step follows.
    return rot
def env_step(self, thisAction):
    """Take one step; the episode ends at the goal or on negative fuel.

    Args:
        thisAction: Action whose first integer is handed to
            ``self.takeAction``.

    Returns:
        Reward_observation_terminal carrying the reward from
        ``takeAction`` and the observation from ``makeObservation``.
    """
    theReward = self.takeAction(thisAction.intArray[0])

    episodeOver = 0
    # The fuel test applies only when ``fuel_loc`` is set.  The closing
    # parenthesis used to sit before ``< 0``, so a bool was compared with
    # 0 when ``fuel_loc`` was None; the result was the same only by
    # accident.
    if self.isAtGoal() or (self.fuel_loc is not None and self.fuel < 0):
        episodeOver = 1

    returnRO = Reward_observation_terminal()
    returnRO.r = theReward
    returnRO.o = self.makeObservation()
    returnRO.terminal = episodeOver
    return returnRO
def env_step(self, thisAction):
    """Print and validate the action, move the agent, and report the step.

    Args:
        thisAction: Action whose ``intArray`` must hold exactly one
            integer in the range 0..3.

    Returns:
        Reward_observation_terminal with ``[self.rolloutstate()]`` as the
        observation, ``self.current_reward()`` as the reward and
        ``self.goalcheck()`` as the terminal flag.

    Raises:
        AssertionError: If the action is malformed or out of range.
    """
    # Debug trace; the call form runs on both Python 2 and Python 3.
    print(thisAction.intArray[0])

    assert len(thisAction.intArray) == 1, "Expected 1 integer action."
    assert thisAction.intArray[0] >= 0, "Expected action to be in [0,3]"
    assert thisAction.intArray[0] < 4, "Expected action to be in [0,3]"

    self.updatePosition(thisAction.intArray[0])

    observation = Observation()
    observation.intArray = [self.rolloutstate()]

    result = Reward_observation_terminal()
    result.r = self.current_reward()
    result.o = observation
    result.terminal = self.goalcheck()
    return result
def env_step(self, action):
    """Take a step in the environment.

    :param action: The action that the agent wants to take; its
        ``intArray[0]`` is passed to ``self.pinball.take_action``
    :returns: The reward, the next state and whether the episode has ended
    :rtype: :class:`Reward_observation_terminal`
    """
    result = Reward_observation_terminal()
    result.r = self.pinball.take_action(action.intArray[0])

    state = Observation()
    state.doubleArray = self.pinball.get_state()

    result.o = state
    result.terminal = self.pinball.episode_ended()
    return result
def env_step(self, thisAction):
    """Apply the action and return the cart/pole state with its reward.

    Gaussian noise with scale ``self.reward_noise`` is added to the
    reward when that scale is positive.

    Args:
        thisAction: Action whose ``intArray[0]`` is passed to
            ``self.takeAction``.

    Returns:
        Reward_observation_terminal whose ``doubleArray`` is the cart
        location and velocity followed by the pole angles and velocities.
    """
    reward = self.takeAction(thisAction.intArray[0])
    done = int(self.terminate())

    if self.reward_noise > 0:
        reward += numpy.random.normal(scale=self.reward_noise)

    cart_part = [self.cart_location, self.cart_velocity]
    observation = Observation()
    observation.doubleArray = (
        cart_part + self.pole_angle.tolist() + self.pole_velocity.tolist()
    )

    result = Reward_observation_terminal()
    result.r = reward
    result.o = observation
    result.terminal = done
    return result
def env_step(self, thisAction):
    """Advance the simulation by one action and package the new state.

    Args:
        thisAction: Action whose first integer is given to
            ``self.takeAction``.

    Returns:
        Reward_observation_terminal.  The reward comes from
        ``takeAction`` (plus Gaussian noise when ``self.reward_noise`` is
        positive) and ``terminal`` is ``int(self.terminate())``.
    """
    chosen = thisAction.intArray[0]
    step_reward = self.takeAction(chosen)
    finished = int(self.terminate())

    noise_scale = self.reward_noise
    if noise_scale > 0:
        step_reward += numpy.random.normal(scale=noise_scale)

    # Observation layout: cart location, cart velocity, angles, velocities.
    values = [self.cart_location, self.cart_velocity]
    values = values + self.pole_angle.tolist()
    values = values + self.pole_velocity.tolist()

    obs = Observation()
    obs.doubleArray = values

    outcome = Reward_observation_terminal()
    outcome.r = step_reward
    outcome.o = obs
    outcome.terminal = finished
    return outcome
def env_step(self, thisAction):
    """Advance the model one step and compute the penalty-based reward.

    The reward is ``self.stim_penalty`` when the action equals 1 (0.0
    otherwise), plus ``self.seizure_penalty`` when the label of
    ``self.current_neighbor`` equals ``self.seiz_label``.  The episode
    never terminates here.

    Args:
        thisAction: Action whose ``intArray[0]`` is passed, together with
            ``self.noise``, to ``self.step``.

    Returns:
        Reward_observation_terminal with ``self.state.tolist()`` as the
        observation and ``terminal == 0``.
    """
    chosen = thisAction.intArray[0]
    self.step(chosen, self.noise)

    if chosen == 1:
        reward = self.stim_penalty
    else:
        reward = 0.0
    if self.getLabel(self.current_neighbor) == self.seiz_label:
        reward += self.seizure_penalty

    observation = Observation()
    observation.doubleArray = self.state.tolist()

    result = Reward_observation_terminal()
    result.r = reward
    result.o = observation
    result.terminal = 0
    return result
def env_step(self, thisAction):
    """Advance the traffic simulation by one step and report the car state.

    An action of 0 moves ``self.stageIndex`` to the next value produced by
    ``self.licycle``; any other action keeps the current stage.  The
    simulation is then stepped, the vehicle state is refreshed, and the
    episode is flagged terminal (and the traci connection closed) once
    ``traci.simulation.getMinExpectedNumber()`` reaches 0.

    NOTE(review): the source indentation was lost; the traffic-light
    update is assumed to run on every step, not only when the stage
    changes -- confirm against the original file.

    Args:
        thisAction: Action whose ``intArray[0]`` selects hold or advance.

    Returns:
        Reward_observation_terminal with the flattened
        ``self.state.carState`` as the observation.
    """
    # Process the action.
    if thisAction.intArray[0] == 0:
        # Built-in next() works on both Python 2.6+ and Python 3, unlike
        # the Python 2-only ``.next()`` method.
        self.stageIndex = next(self.licycle)
    traci.trafficlights.setRedYellowGreenState(
        "1", self.Stages[self.stageIndex])

    traci.simulationStep()
    self.simStep += 1

    self.currentVehList = traci.vehicle.getIDList()
    self.state.updateState(self.currentVehList)

    # The state is terminal once no vehicles remain expected.
    episodeTerminal = 0
    if traci.simulation.getMinExpectedNumber() == 0:
        episodeTerminal = 1
        traci.close()

    # Built once here; the original also built an identical observation
    # inside the terminal branch and immediately overwrote it.
    theObs = Observation()
    theObs.intArray = self.state.carState.flatten()

    returnRO = Reward_observation_terminal()
    returnRO.r = self.calculate_reward()
    returnRO.o = theObs
    returnRO.terminal = episodeTerminal

    # Drop vehicles that checkVehKill reports from the bookkeeping dict.
    for vehicle in checkVehKill(self.vehicleDict):
        del self.vehicleDict[vehicle]
    self.previousVehList = self.currentVehList

    return returnRO
def env_step(self, thisAction):
    """Apply the action and return reward, state and terminal flag.

    The episode ends when ``self.isAtGoal()`` is true.  Gaussian noise
    with scale ``self.reward_noise`` is added to the reward when that
    scale is positive.

    Args:
        thisAction: Action whose ``intArray[0]`` is passed to
            ``self.takeAction``.

    Returns:
        Reward_observation_terminal with ``self.getState()`` in
        ``doubleArray``.
    """
    reward = self.takeAction(thisAction.intArray[0])
    done = 1 if self.isAtGoal() else 0

    if self.reward_noise > 0:
        reward += numpy.random.normal(scale=self.reward_noise)

    observation = Observation()
    observation.doubleArray = self.getState()

    result = Reward_observation_terminal()
    result.r = reward
    result.o = observation
    result.terminal = done
    return result
def env_step(self, action):
    """Run one taxi step and optionally render it to the terminal.

    Args:
        action: Action whose ``intArray[0]`` must be in the range 0..5.

    Returns:
        Reward_observation_terminal with the reward as a float, the new
        state wrapped in a one-element ``intArray`` and the done flag
        returned by ``self.step``.

    Raises:
        AssertionError: If the action fails the checks below.
    """
    # NOTE(review): the length check allows up to 2 entries (and 0) while
    # the message says 1 -- confirm which is intended.
    assert len(action.intArray) <= 2, "Expected 1 integer action."
    assert action.intArray[0] >= 0, "Expected action to be in [0,5]"
    assert action.intArray[0] < 6, "Expected action to be in [0,5]"

    # The fourth element returned by step() is not used.
    s1, r1, d1, _ = self.step(action.intArray[0])

    returnRO = Reward_observation_terminal()
    returnRO.r = r1 * 1.0
    returnRO.o = Observation()
    returnRO.o.intArray = [s1]
    returnRO.terminal = d1

    if self.toprint == 1:
        self.clearscreen()
        x = taxi.TaxiEnv.render(self, taxi.TaxiEnv.metadata['render.modes'][0])
        # Call form is valid on both Python 2 and Python 3.
        print(x)
        time.sleep(0.08)

    return returnRO
def env_step(self, thisAction):
    """Validate the action, move the agent, and return state plus options.

    Args:
        thisAction: Action carrying a single integer in 0..3.

    Returns:
        Reward_observation_terminal whose observation holds the flat
        state in ``intArray`` and four flags in ``charArray``; the reward
        is computed from the action that was just taken.

    Raises:
        AssertionError: If the action is malformed or out of range.
    """
    # Make sure the action is valid.  The accepted range is 0..3; the
    # message previously said [0,4].
    assert len(thisAction.intArray) == 1, "Expected 1 integer action."
    assert thisAction.intArray[0] >= 0, "Expected action to be in [0,3]"
    assert thisAction.intArray[0] < 4, "Expected action to be in [0,3]"

    self.updatePosition(thisAction.intArray[0])
    lastActionValue = thisAction.intArray[0]

    theObs = Observation()
    theObs.intArray = [self.calculateFlatState()]
    theObs.charArray = ["T", "T", "T", "T"]
    # NOTE(review): all four flags are initialised to "T", so marking the
    # available options with "T" has no visible effect -- confirm intent.
    cell_options = self.optionsArray[self.agentRow][self.agentCol]
    for option in cell_options:
        theObs.charArray[2 + option] = "T"

    returnRO = Reward_observation_terminal()
    returnRO.r = self.calculateReward(lastActionValue)
    returnRO.o = theObs
    returnRO.terminal = self.checkCurrentTerminal()
    return returnRO
def env_step(self, thisAction):
    """Draw one frame, score the player's position, and return the screen.

    The rendered frame is also written to ``screen.png``.  Pending pygame
    events are drained at the end of the step, and a QUIT event exits the
    process.

    Args:
        thisAction: Action passed to ``self.player.setAction``.

    Returns:
        Reward_observation_terminal whose observation is 128 zeros
        followed by the flattened screen pixels.
    """
    self.bg.draw(self.screen)

    self.player.setAction(thisAction)
    self.player.update()
    reward = self.bg.decision(self.player.rect.x, self.player.rect.y)
    game_over = self.bg.get_gameover()
    self.player.draw(self.screen)

    # Refresh the display.
    pygame.display.update()

    observation = Observation()
    pixels = pygame.surfarray.array2d(self.screen)
    flattened = [value for column in pixels for value in column]
    observation.intArray = np.append(np.zeros(128), flattened)
    scipy.misc.imsave('screen.png', pixels)

    result = Reward_observation_terminal()
    result.r = reward
    result.o = observation
    result.terminal = game_over

    # Event handling: leave on a quit event.
    for event in pygame.event.get():
        if event.type == QUIT:
            sys.exit()

    return result
# env_step: play the agent's piece, let the environment answer, report the result.
#
# Flow, as far as the flattened source shows:
#   1. The agent's piece is written to self.map at
#      self.get_drop_ball_point(action.intArray[0]).
#   2. Every entry of self.lines is checked; a line holding self.n_rows agent
#      pieces gives self.r_win, one holding self.n_rows environment pieces
#      gives self.r_lose, and either ends the episode.
#   3. If undecided: no free top cell means a draw (self.r_draw).  With exactly
#      one free top cell the environment plays there and the episode is marked
#      terminal.  Otherwise, with probability self.opp, the environment first
#      looks for a line it can complete and then for an agent line to block;
#      if neither yields a move it picks a random free top cell.
#   4. The environment's piece is written to self.map and the win/loss/draw
#      checks are repeated.
#   5. The board is returned as the observation, a text dump is appended to
#      self.history, and when rot.r == -1 the history is appended to
#      history.txt.
#
# NOTE(review): this definition reached review with its line structure
# flattened, so the code below is left untouched.  In particular the nesting of
# the second "int_action_env = line[index[0]] break" in the blocking loop
# cannot be determined from this text -- restore the indentation from the
# original file before changing any logic.
def env_step(self, action): # First, tell the agent the outcome. # Where to place the O received from the agent int_action_agent = self.get_drop_ball_point(action.intArray[0]) # Place the O on the board and get the free cells self.map[int_action_agent] = self.flg_agent # this is the board free_top = self.get_free_top_of_map() #free = [i for i, v in enumerate(self.map) if v == self.flg_free] n_free = len(free_top) rot = Reward_observation_terminal() rot.r = 0.0 rot.terminal = False # Check for a win/loss after placing the O for line in self.lines: state = np.array(self.map)[line] point = sum(state == self.flg_agent) if point == self.n_rows: rot.r = self.r_win rot.terminal = True break point = sum(state == self.flg_env) if point == self.n_rows: rot.r = self.r_lose rot.terminal = True break # If undecided, choose where to place the X if not rot.terminal: # No free cell means a draw if n_free == 0: rot.r = self.r_draw rot.terminal = True else: int_action_env = None # If exactly one cell is free, place the X there if n_free == 1: int_action_env = self.get_drop_ball_point(free_top[0]) rot.terminal = True else: # Decide the X position (75%) if np.random.rand() < self.opp: # Go for the win if one is available. # todo: change algorithm; try all n_free moves and check. for line in self.lines: state = np.array(self.map)[line] point = sum(state == self.flg_env) # environment if point == self.n_rows - 1: # environment is about to win! index = np.where(state == self.flg_free)[0] if len(index) != 0: want_to_put = line[index[0]] i_top = want_to_put % 16 # Can the piece dropped from the top land where we want it? if (want_to_put == self.get_drop_ball_point(i_top)): int_action_env = want_to_put break # Avoid a loss if one is imminent. # todo: change algorithm; concede if there are multiple losing spots. if int_action_env is None: for line in self.lines: state = np.array(self.map)[line] point = sum(state == self.flg_agent) # agent if point == self.n_rows - 1: index = np.where(state == self.flg_free)[0] if len(index) != 0: want_to_put = line[index[0]] i_top = want_to_put % 16 # Can the piece dropped from the top land where we want it? if (want_to_put == self. 
get_drop_ball_point(i_top)): int_action_env = want_to_put break int_action_env = line[index[0]] break # Choose the X position at random (25%) if int_action_env is None: int_action_env = self.get_drop_ball_point( free_top[np.random.randint(n_free)]) # Place the X on the board self.map[int_action_env] = self.flg_env # fine as is. free_top = self.get_free_top_of_map() # searches for cells equal to 0. n_free = len(free_top) # Check for a win/loss after placing the X for line in self.lines: state = np.array(self.map)[line] point = sum(state == self.flg_agent) if point == self.n_rows: rot.r = self.r_win rot.terminal = True break point = sum(state == self.flg_env) if point == self.n_rows: rot.r = self.r_lose rot.terminal = True break if not rot.terminal and n_free == 0: rot.r = self.r_draw rot.terminal = True # Bundle the board state, reward and terminal flag and send them to the agent. observation = Observation() observation.intArray = self.map rot.o = observation current_map = 'map\n' for i in range(0, len(self.map), self.n_cols): current_map += ' '.join(map(str, self.map[i:i + self.n_cols])) + '\n' if (i % 16 == 0): current_map += "\n" self.history.append(current_map) if rot.r == -1: f = open('history.txt', 'a') history = '\n'.join(self.history) f.writelines('# START\n' + history + '# END\n\n') f.close() # If the game is decided, the agent's agent_end follows # otherwise the agent's agent_step follows return rot