def agent_step(self, reward, observation):
    import math  # used by the sigmoid-based epsilon schedule
    if self.Episode_Counter > 10000:
        self.Epsilon = 0
    else:
        self.Epsilon = util.randomSigmoidEpsilon(reward, 0.02, 50)
    self.Steps += 1
    print reward
    thisDoubleAction = self.agent_action_step(reward, observation.doubleArray)
    returnAction = Action()
    returnAction.doubleArray = thisDoubleAction
    self.lastObservation = copy.deepcopy(observation)
    self.lastAction = copy.deepcopy(returnAction)
    return returnAction
def agent_step(self, reward, observation):
    """
    This method is called each time step.

    Arguments:
       reward      - Real valued reward.
       observation - An observation of type rlglue.types.Observation

    Returns:
       An action of type rlglue.types.Action
    """
    # Generate random action
    this_int_action = self.randGenerator.randint(0, self.num_actions - 1)
    return_action = Action()
    return_action.intArray = [this_int_action]

    if self.show_ale:
        self._show_ale_color()
        #self._show_ale_gray()

    if self.saving:
        if self.int_states:
            self.states.append(self.last_observation.intArray)
        else:
            self.states.append(self.last_observation.doubleArray)
        self.actions.append(self.last_action.intArray[0])
        self.rewards.append(reward)
        self.absorbs.append(False)

    self.last_action = copy.deepcopy(return_action)
    self.last_observation = copy.deepcopy(observation)

    return return_action
def agent_step(self, reward, obs):
    """
    This function is called by the environment while the episode lasts.

    If learning is not frozen, the option-value function Q(s, o) is
    updated using intra-option learning.

    :param reward: The reward obtained as a result of the last transition.
    :param obs: An observation from the environment
    :type obs: :class:`rlglue.types.Observation`
    :returns: The primitive action to execute in the environment according
              to the behavior policy.
    :rtype: a primitive action under the form of a :class:`rlglue.types.Action`
    """
    observation = np.array(obs.doubleArray)
    current_features = self.basis.computeFeatures(observation)

    if not self.finished_learning:
        self.intraoption_update(reward, current_features, observation)

    self.last_observation = observation
    self.last_features = current_features
    self.last_action = self.mu(observation, current_features).pi(observation)

    action = Action()
    action.intArray = [self.last_action]
    return action
def agent_step(self, reward, observation):
    lastState = self.lastObservation.intArray
    lastAction = self.lastAction.intArray
    lastStateId = SamplingUtility.getStateId(lastState)
    lastActionIdx = self.all_allowed_actions[lastStateId].index(tuple(lastAction))

    # The last action was invalid: drop it from the allowed set and pick again.
    if reward == self.Bad_Action_Penalty:
        self.all_allowed_actions[lastStateId].pop(lastActionIdx)
        self.Q_value_function[lastStateId].pop(lastActionIdx)
        newAction = self.egreedy(self.lastObservation.intArray)
        returnAction = Action()
        returnAction.intArray = newAction
        self.lastAction = copy.deepcopy(returnAction)
        return returnAction

    newState = observation.intArray
    newAction = self.egreedy(newState)
    if type(newAction) is tuple:
        newAction = list(newAction)

    # SARSA update: Q(s,a) += alpha * (r + gamma * Q(s',a') - Q(s,a))
    Q_sa = self.Q_value_function[lastStateId][lastActionIdx]
    Q_sprime_aprime = self.Q_value_function[SamplingUtility.getStateId(newState)][
        self.all_allowed_actions[SamplingUtility.getStateId(newState)].index(tuple(newAction))]
    new_Q_sa = Q_sa + self.sarsa_stepsize * (reward + self.sarsa_gamma * Q_sprime_aprime - Q_sa)
    if not self.policyFrozen:
        self.Q_value_function[SamplingUtility.getStateId(lastState)][
            self.all_allowed_actions[SamplingUtility.getStateId(lastState)].index(tuple(lastAction))] = new_Q_sa

    returnAction = Action()
    returnAction.intArray = newAction
    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
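# The SARSA snippet above relies on an `egreedy` helper that is not shown.
# A minimal sketch of what it might look like for this per-state
# allowed-action layout; the `random` import and the `self.Epsilon`
# attribute name are assumptions, not part of the original code.
import random

def egreedy(self, state):
    # Epsilon-greedy over the actions still allowed in this state (sketch).
    stateId = SamplingUtility.getStateId(state)
    allowed = self.all_allowed_actions[stateId]
    if random.random() < self.Epsilon:
        return list(random.choice(allowed))
    qs = self.Q_value_function[stateId]
    return list(allowed[qs.index(max(qs))])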
def agent_start(self, observation):
    """
    This method is called once at the beginning of each episode.
    No reward is provided, because reward is only available after
    an action has been taken.

    Arguments:
       observation - An observation of type rlglue.types.Observation

    Returns:
       An action of type rlglue.types.Action
    """
    self.step_counter = 0
    self.batch_counter = 0

    # We report the mean loss for every epoch.
    self.loss_averages = []

    self.start_time = time.time()

    #this_int_action = self.randGenerator.randint(0, self.num_actions-1)
    observation_matrix = np.asmatrix(observation.doubleArray, dtype='float32')
    actions = self.action_network.fprop(observation_matrix)
    return_action = Action()
    return_action.doubleArray = [actions]

    self.last_action = copy.deepcopy(return_action)
    self.last_observation = observation.doubleArray

    return return_action
def agent_step(self, reward, observation):
    # Preprocess
    tmp = np.bitwise_and(
        np.asarray(observation.intArray[128:]).reshape([210, 160]),
        0b0001111)  # Get intensity from the observation
    obs_array = (spm.imresize(tmp, (110, 84)))[110 - 84 - 8:110 - 8, :]  # Scaling
    obs_processed = np.maximum(obs_array, self.last_observation)  # Take maximum of two frames

    # Compose state: 4-step sequential observation
    self.state = np.asanyarray(
        [self.state[1], self.state[2], self.state[3], obs_processed],
        dtype=np.uint8)
    state_ = cuda.to_gpu(
        np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32))

    # Exploration decays along the time sequence
    if self.policyFrozen is False:  # Learning ON/OFF
        if self.DQN.initial_exploration < self.time:
            self.epsilon -= 1.0 / 10**6
            if self.epsilon < 0.1:
                self.epsilon = 0.1
            eps = self.epsilon
        else:  # Initial exploration phase
            print "Initial Exploration : %d/%d steps" % (
                self.time, self.DQN.initial_exploration)
            eps = 1.0
    else:  # Evaluation
        print "Policy is Frozen"
        eps = 0.05

    # Generate an action from e-greedy action selection
    returnAction = Action()
    action, Q_now = self.DQN.e_greedy(state_, eps)
    returnAction.intArray = [action]

    # Learning phase
    if self.policyFrozen is False:  # Learning ON/OFF
        self.DQN.stockExperience(self.time, self.last_state,
                                 self.lastAction.intArray[0], reward,
                                 self.state, False)
        self.DQN.experienceReplay(self.time)

    # Simple text-based visualization
    print ' Time Step %d / ACTION %d / REWARD %.1f / EPSILON %.6f / Q_max %3f' % (
        self.time, self.DQN.action_to_index(action), np.sign(reward), eps,
        np.max(Q_now.get()))

    # Updates for next step
    self.last_observation = obs_array
    if self.policyFrozen is False:
        self.lastAction = copy.deepcopy(returnAction)
        self.last_state = self.state.copy()
        self.time += 1

    return returnAction
def agent_step(self, reward, observation):
    import math
    self.Rewards += reward
    # Act greedily once the training runs are over.
    if self.Episode_Counter > Training_Runs:
        self.Epsilon = 0
    self.Steps += 1
    thisDoubleAction = self.agent_action_step(reward, observation.doubleArray)
    returnAction = Action()
    returnAction.doubleArray = thisDoubleAction
    self.lastObservation = copy.deepcopy(observation)
    self.lastAction = copy.deepcopy(returnAction)
    return returnAction
def agent_step(self, reward, observation):
    self.step_counter += 1
    self.total_reward += reward
    cur_img = self.resize_image(observation.intArray)

    if self.is_testing:
        int_action = self.choose_action(self.test_table, cur_img,
                                        np.clip(reward, -1, 1),
                                        testing_ep=0.05)
    else:
        if self.step_counter % self.reset_after == 0:
            self.network.reset_q_hat()
        int_action = self.choose_action(self.train_table, cur_img,
                                        np.clip(reward, -1, 1),
                                        testing_ep=None)
        if self.train_table.num_entries > max(self.learn_start, self.batch_size):
            states, actions, rewards, next_states, terminals = \
                self.train_table.get_minibatch(self.batch_size)
            loss, qvals = self.network.train(states, actions, rewards,
                                             next_states, terminals)
            self.losses.append(loss)
            self.qvals.append(np.mean(qvals))
            self.batch_counter += 1

    return_action = Action()
    return_action.intArray = [int_action]
    self.last_action = int_action
    self.last_img = cur_img
    return return_action
def do_step(self, state, reward=None):
    """
    Runs the actual learning algorithm.
    In a separate function so it can be called both on start and on step.
    """
    #self.debug('do_step(', state, ',', reward, ')')
    #if not state in self.Q:
    #    # State not yet visited, initialize randomly
    #    self.Q[state] = self.random_actions()

    # Run the Q update if this isn't the first step
    action = None
    if reward is not None:
        action = self.update_Q(self.last_state, self.last_action, reward, state)

    # Action object
    a_obj = Action()
    if action is None:
        # Query the policy to find the best action
        action = self.policy(state)
    a_obj.charArray = list(action)

    # Save the current state-action pair for the next step's Q update.
    self.last_state = state
    self.last_action = action

    # And we're done
    return a_obj
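# `update_Q` is referenced above but not shown. A minimal sketch of a
# tabular Q-learning update consistent with the charArray actions used by
# `do_step`; the `self.alpha`, `self.gamma`, and dict-of-dicts `self.Q`
# names are assumptions for illustration.
def update_Q(self, last_state, last_action, reward, state):
    # One-step Q-learning: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    best_next = max(self.Q[state].values())
    td_error = reward + self.gamma * best_next - self.Q[last_state][last_action]
    self.Q[last_state][last_action] += self.alpha * td_error
    return None  # let the caller fall through to the policy for the next action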
def agent_start(self, observation):
    if self.debug_flag:
        print('agent start')
    # Increment the step counter.
    self.step_counter += 1

    # Shouldn't the state be cleared at the start of an episode?
    #self.state = np.zeros((1, self.n_frames, self.bdim)).astype(np.float32)
    self.state = np.zeros(
        (1, 2, self.n_rows, self.n_cols)).astype(np.float32)

    # kmori: update the state using our own observation format.
    # Partly follows the sample code; the rest was built differently.
    self.update_state(observation)
    self.update_targetQ()

    if self.debug_flag:
        print('Deciding which move to play.')
    # Decide which move to play.
    int_action = self.select_int_action()
    action = Action()
    action.intArray = [int_action]

    if self.debug_flag:
        print('Updating eps.')
    # Update eps: the probability of playing a stone at random.
    self.update_eps()

    # Stash state (the board position) and action (where the stone goes).
    self.last_state2 = copy.deepcopy(self.last_state)    # state two moves back
    self.last_action2 = copy.deepcopy(self.last_action)  # action two moves back
    self.last_state = copy.deepcopy(self.state)
    self.last_action = copy.deepcopy(int_action)

    return action
def agent_start(self, observation):
    """
    This method is called once at the beginning of each episode.
    No reward is provided, because reward is only available after
    an action has been taken.

    Arguments:
       observation - An observation of type rlglue.types.Observation

    Returns:
       An action of type rlglue.types.Action
    """
    self.step_counter = 0
    self.batch_counter = 0

    # We report the mean loss for every epoch.
    self.loss_averages = []

    self.start_time = time.time()

    # this_int_action = self.randGenerator.randint(0, self.num_actions-1)
    observation_matrix = np.asmatrix(observation.doubleArray, dtype='float32')
    actions = self.action_network.fprop(observation_matrix)
    return_action = Action()
    return_action.doubleArray = actions

    self.last_action = copy.deepcopy(actions)
    self.last_observation = observation.doubleArray

    return return_action
class test_empty_agent(Agent):
    whichEpisode = 0
    emptyAction = Action(0, 0, 0)
    nonEmptyAction = Action(7, 3, 1)

    def agent_init(self, taskSpec):
        self.whichEpisode = 0
        self.nonEmptyAction.intArray = (0, 1, 2, 3, 4, 5, 6)
        self.nonEmptyAction.doubleArray = (0.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0)
        self.nonEmptyAction.charArray = ('a')

    def agent_start(self, observation):
        self.whichEpisode = self.whichEpisode + 1
        if self.whichEpisode % 2 == 0:
            return self.emptyAction
        else:
            return self.nonEmptyAction

    def agent_step(self, reward, observation):
        if self.whichEpisode % 2 == 0:
            return self.emptyAction
        else:
            return self.nonEmptyAction

    def agent_end(self, reward):
        pass

    def agent_cleanup(self):
        pass

    def agent_message(self, inMessage):
        return ""
def agent_start(self, observation):
    # Generate a random action within the per-dimension action bounds.
    return_action = Action()
    return_action.intArray = []
    for i in xrange(0, self.action_size):
        return_action.intArray += [self.rng.randint(self.action_bounds[i][0],
                                                    self.action_bounds[i][1])]
    return return_action
def agent_step(self, reward, observation):
    """Take one step in an episode for the agent, as the result of taking the last action.

    Args:
        reward: The reward received for taking the last action from the previous state.
        observation: The next observation of the episode, which is the consequence of taking the previous action.

    Returns:
        The next action the RL agent chooses to take, represented as an RLGlue Action object.
    """
    new_state = numpy.array(list(observation.doubleArray))
    last_state = numpy.array(list(self.last_observation.doubleArray))
    last_action = self.last_action.intArray[0]

    new_disc_state = self.getDiscState(observation.intArray)
    last_disc_state = self.getDiscState(self.last_observation.intArray)

    # Update eligibility traces
    phi_t = numpy.zeros(self.traces.shape)
    phi_t[last_disc_state, :, last_action] = self.basis.computeFeatures(last_state)
    self.update_traces(phi_t, None)
    self.update(phi_t, new_state, new_disc_state, reward)

    # QLearning can choose action after update
    new_int_action = self.getAction(new_state, new_disc_state)
    return_action = Action()
    return_action.intArray = [new_int_action]

    self.last_action = copy.deepcopy(return_action)
    self.last_observation = copy.deepcopy(observation)
    return return_action
def agent_step(self, reward, observation):
    import math
    # Try to set epsilon inversely to the reward received so far.
    if self.Episode_Counter > 10000:
        self.Epsilon = 0.0
    self.Steps += 1
    print reward
    thisDoubleAction = self.agent_action_step(reward, observation.doubleArray)
    returnAction = Action()
    returnAction.doubleArray = thisDoubleAction
    self.lastObservation = copy.deepcopy(observation)
    self.lastAction = copy.deepcopy(returnAction)
    return returnAction
def agent_step(self, Reward, Obs):
    new_state = Obs.intArray[0]
    last_state = self.lastObs.intArray[0]
    last_action = self.lastaction.intArray[0]

    # Q-learning update: bootstrap from the best action in the new state.
    Q_sa = self.qfunction[last_state][last_action]
    Q_saprime = self.maxim(new_state)
    Q_new = Q_sa + self.learningrate * (Reward + self.gamma * Q_saprime - Q_sa)
    #if not self.pause:
    self.qfunction[last_state][last_action] = Q_new

    # Action to be taken
    new_action = self.epsilon_greedy(new_state)
    returnaction = Action()
    returnaction.intArray = [new_action]
    self.lastaction = copy.deepcopy(returnaction)
    self.lastObs = copy.deepcopy(Obs)
    return returnaction
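# `maxim` and `epsilon_greedy` are assumed helpers in the snippet above.
# A minimal sketch under the same tabular layout; the `random` import and
# the `self.epsilon` attribute name are assumptions.
import random

def maxim(self, state):
    # Largest Q-value reachable from `state` (sketch).
    return max(self.qfunction[state])

def epsilon_greedy(self, state):
    # Explore with probability epsilon, otherwise act greedily (sketch).
    qs = self.qfunction[state]
    if random.random() < self.epsilon:
        return random.randrange(len(qs))
    return max(range(len(qs)), key=lambda a: qs[a])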
def agent_step(self, Reward, Obs):
    new_state = Obs.intArray[0]
    last_state = self.lastObs.intArray[0]
    last_action = self.lastaction.intArray[0]

    # SARSA(lambda): pick a' first, then spread the TD error over all
    # state-action pairs through the eligibility traces.
    new_action = self.epsilon_greedy(new_state)
    Q_sa = self.qfunction[last_state][last_action]
    Q_saprime = self.qfunction[new_state][new_action]
    delta = Reward + self.gamma * Q_saprime - Q_sa

    self.efunction[last_state][last_action] = self.efunction[last_state][last_action] + 1
    self.qfunction = np.array(self.qfunction)
    self.efunction = np.array(self.efunction)
    self.qfunction = self.qfunction + self.learningrate * delta * self.efunction
    self.efunction = self.gamma * self.lamda * self.efunction

    returnaction = Action()
    returnaction.intArray = [new_action]
    self.lastaction = copy.deepcopy(returnaction)
    self.lastObs = copy.deepcopy(Obs)
    return returnaction
def agent_step(self, reward, observation):
    self.reward += reward
    self.step += 1
    self.total_reward += reward

    thisDoubleAction = self.agent_step_action(observation.doubleArray)

    # If the proposed action is risky, fall back to the baseline policy and
    # train the network toward the best-known controller on this state.
    if self.isRisk(observation.doubleArray, thisDoubleAction):
        self.times += 1
        thisDoubleAction = util.baselinePolicy(observation.doubleArray)

        from pybrain.supervised.trainers import BackpropTrainer
        from pybrain.datasets import SupervisedDataSet
        ds = SupervisedDataSet(12, 4)
        ds.addSample(observation.doubleArray,
                     self.best.activate(observation.doubleArray))
        trainer = BackpropTrainer(self.network, ds)
        trainer.train()

    returnAction = Action()
    returnAction.doubleArray = thisDoubleAction
    self.lastObservation = copy.deepcopy(observation)
    self.lastAction = copy.deepcopy(returnAction)
    self.lastReward = reward
    return returnAction
def agent_step(self, reward, observation):
    self.states_diff_list.append(
        [a - b for (a, b) in zip(observation.doubleArray,
                                 self.lastObservation.doubleArray)])
    self.lastObservation = copy.deepcopy(observation)
    self.approximateValueFunction()
    #end of test

    print reward

    # Test how reward approximation works
    self.approximateRewardFunction(reward, observation)
    #end of test

    thisDoubleAction = self.approximateAction()
    returnAction = Action()
    returnAction.doubleArray = thisDoubleAction

    # Approximate value function
    self.action_list.append(thisDoubleAction)
    #end of test

    self.lastAction = copy.deepcopy(returnAction)
    return returnAction
def agent_step(self, reward, observation):
    """Take one step in an episode for the agent, as the result of taking the last action.

    Args:
        reward: The reward received for taking the last action from the previous state.
        observation: The next observation of the episode, which is the consequence of taking the previous action.

    Returns:
        The next action the RL agent chooses to take, represented as an RLGlue Action object.
    """
    newState = numpy.array(list(observation.doubleArray))
    lastState = numpy.array(list(self.lastObservation.doubleArray))
    lastAction = self.lastAction.intArray[0]

    newDiscState = self.getDiscState(observation.intArray)
    lastDiscState = self.getDiscState(self.lastObservation.intArray)
    newIntAction = self.getAction(newState, newDiscState)

    phi_t = numpy.zeros((self.weights.shape[0], self.weights.shape[1]))
    phi_tp = numpy.zeros((self.weights.shape[0], self.weights.shape[1]))
    phi_t[lastDiscState, :] = self.basis.computeFeatures(lastState)
    phi_tp[newDiscState, :] = self.basis.computeFeatures(newState)

    self.step_count += 1
    self.update(phi_t, phi_tp, reward,
                self.getCompatibleFeatures(phi_t, lastAction, reward,
                                           phi_tp, newIntAction))

    returnAction = Action()
    returnAction.intArray = [newIntAction]
    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def agent_step(self, reward, observation):
    self.step_counter += 1
    self.total_reward += reward
    cur_img = self.resize_image(observation.intArray)

    if self.is_testing:
        int_action = self.choose_action(self.test_table, cur_img,
                                        np.clip(reward, -1, 1),
                                        testing_ep=0.05)
    else:
        if self.step_counter % self.reset_after == 0:
            self.network.reset_q_hat()
        int_action = self.choose_action(self.train_table, cur_img,
                                        np.clip(reward, -1, 1),
                                        testing_ep=None)
        if self.train_table.num_entries > max(self.learn_start, self.batch_size):
            states, actions, rewards, next_states, terminals = \
                self.train_table.get_minibatch(self.batch_size)
            loss, qvals = self.network.train(states, actions, rewards,
                                             next_states, terminals)
            self.losses.append(loss)
            self.qvals.append(np.mean(qvals))
            self.batch_counter += 1

    return_action = Action()
    return_action.intArray = [int_action]
    self.last_action = int_action
    self.last_img = cur_img
    return return_action
def agent_step(self, reward, observation):
    """Take one step in an episode for the agent, as the result of taking the last action.

    Args:
        reward: The reward received for taking the last action from the previous state.
        observation: The next observation of the episode, which is the consequence of taking the previous action.

    Returns:
        The next action the RL agent chooses to take, represented as an RLGlue Action object.
    """
    newState = numpy.array(list(observation.doubleArray))
    lastState = numpy.array(list(self.lastObservation.doubleArray))
    lastAction = self.lastAction.intArray[0]

    newDiscState = self.getDiscState(observation.intArray)
    lastDiscState = self.getDiscState(self.lastObservation.intArray)
    newIntAction = self.getAction(newState, newDiscState)

    phi_t = numpy.zeros((self.weights.shape[0], self.weights.shape[1]))
    phi_tp = numpy.zeros((self.weights.shape[0], self.weights.shape[1]))
    phi_t[lastDiscState, :] = self.basis.computeFeatures(lastState)
    phi_tp[newDiscState, :] = self.basis.computeFeatures(newState)

    self.step_count += 1
    self.update(phi_t, phi_tp, reward,
                self.getCompatibleFeatures(phi_t, lastAction, reward,
                                           phi_tp, newIntAction))

    returnAction = Action()
    returnAction.intArray = [newIntAction]
    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def agent_step(self, reward, observation):
    action = None

    self.window.erase()
    self.window.addstr('STATE: %s\n' % (observation.intArray))
    self.window.addstr('REWARD: %s\n' % (reward))
    self.window.addstr('HIT UP, DOWN, LEFT or RIGHT to move...\n')
    self.window.refresh()

    try:
        c = self.window.getch()
        if c == curses.KEY_UP:
            action = 'N'
        elif c == curses.KEY_DOWN:
            action = 'S'
        elif c == curses.KEY_LEFT:
            action = 'W'
        elif c == curses.KEY_RIGHT:
            action = 'E'
        self.window.refresh()
    except KeyboardInterrupt:
        RLGlue.RL_cleanup()

    a = Action()
    if action:
        a.charArray = [action]
    return a
def agent_start(self, observation):
    self.P = np.asarray([[0.0 for j in range(self.N_AC)]
                         for i in range(self.N_PC)])
    theState = observation.doubleArray

    if dynamicEpsilon == '1':
        self.q_epsilon = 0.3 - 0.005 * self.episode
    else:
        self.q_epsilon = 0.3

    r_PC = self.getProbGaussians(theState[0], theState[1])
    res = self.egreedy(theState, r_PC)
    phi_AC = res[0]
    r_1_AC = res[1]

    r_2_AC = []
    for i in xrange(self.N_AC):
        r_2_AC.append(math.exp(
            (-1 * (phi_AC - 2 * math.pi * i / self.N_AC)**2) /
            (2 * self.sigma_AC**2)))

    # Update P_ij
    for i in xrange(self.N_AC):
        for j in xrange(self.N_PC):
            self.P[j, i] = self.q_stepsize * self.P[j, i] + r_2_AC[i] * r_PC[j]

    returnAction = Action()
    returnAction.doubleArray = [phi_AC]

    # Finding closest AC
    closest_AC = r_2_AC.index(max(r_2_AC))
    self.lastQ = r_1_AC[closest_AC]

    self.episode += 1
    return returnAction
def agent_start(self, observation):
    """
    This method is called once at the beginning of each episode.
    No reward is provided, because reward is only available after
    an action has been taken.

    Arguments:
       observation - An observation of type rlglue.types.Observation

    Returns:
       An action of type rlglue.types.Action
    """
    self.step_counter = 0
    self.batch_counter = 0

    # We report the mean loss for every epoch.
    self.loss_averages = []

    self.start_time = time.time()
    this_int_action = self.randGenerator.randint(0, self.num_actions - 1)
    return_action = Action()
    return_action.intArray = [this_int_action]

    self.last_action = copy.deepcopy(return_action)

    self.last_img = np.array(self._resize_observation(observation.intArray))
    self.last_img = self.last_img.reshape(CROPPED_WIDTH, CROPPED_HEIGHT).T

    return return_action
def agent_start(self, observation):
    """Start an episode for the RL agent.

    Args:
        observation: The first observation of the episode. Should be an RLGlue Observation object.

    Returns:
        The first action the RL agent chooses to take, represented as an RLGlue Action object.
    """
    log = logging.getLogger('pyrl.agents.sarsa_lambda.agent_start')
    theState = numpy.array(list(observation.doubleArray))
    thisIntAction = self.getAction(theState, self.getDiscState(observation.intArray))

    returnAction = Action()
    returnAction.intArray = [thisIntAction]

    # Clear traces
    self.traces.fill(0.0)

    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)

    log.debug("Action: %d", thisIntAction)
    log.debug("Start State: %s", theState)
    log.debug("Traces: %s", self.traces)
    return returnAction
def agent_step(self, reward, observation):
    """
    This method is called each time step.

    Arguments:
       reward      - Real valued reward.
       observation - An observation of type rlglue.types.Observation

    Returns:
       An action of type rlglue.types.Action
    """
    # Generate random action
    this_int_action = self.randGenerator.randint(0, self.num_actions - 1)
    return_action = Action()
    return_action.intArray = [this_int_action]

    if self.show_ale:
        self._show_ale_color()
        #self._show_ale_gray()

    if self.saving:
        if self.int_states:
            self.states.append(self.last_observation.intArray)
        else:
            self.states.append(self.last_observation.doubleArray)
        self.actions.append(self.last_action.intArray[0])
        self.rewards.append(reward)
        self.absorbs.append(False)

    self.last_action = copy.deepcopy(return_action)
    self.last_observation = copy.deepcopy(observation)

    return return_action
def agent_step(self, reward, observation):
    theState = observation.doubleArray
    r_PC = self.getProbGaussians(theState[0], theState[1])
    res = self.egreedy(theState, r_PC)
    phi_AC = res[0]
    r_1_AC = res[1]

    r_2_AC = []
    for i in xrange(self.N_AC):
        r_2_AC.append(math.exp(
            (-1 * (phi_AC - 2 * math.pi * i / self.N_AC)**2) /
            (2 * self.sigma_AC**2)))

    # Calculate reward prediction error
    delta = reward + self.q_gamma * max(r_1_AC) - self.lastQ
    #print self.q_gamma*max(r_1_AC), self.lastQ

    # Update synaptic weights
    for i in xrange(self.N_PC):
        for j in xrange(self.N_AC):
            self.W[i, j] = self.q_stepsize * delta * self.P[i, j]

    # Update P_ij
    for i in xrange(self.N_AC):
        for j in xrange(self.N_PC):
            self.P[j, i] = self.q_stepsize * self.P[j, i] + r_2_AC[i] * r_PC[j]

    returnAction = Action()
    returnAction.doubleArray = [phi_AC]

    # Finding closest AC
    closest_AC = r_2_AC.index(max(r_2_AC))
    self.lastQ = r_1_AC[closest_AC]

    self.episode += 1
    return returnAction
def agent_step(self, reward, observation):
    """Take one step in an episode for the agent, as the result of taking the last action.

    Args:
        reward: The reward received for taking the last action from the previous state.
        observation: The next observation of the episode, which is the consequence of taking the previous action.

    Returns:
        The next action the RL agent chooses to take, represented as an RLGlue Action object.
    """
    newState = numpy.array(list(observation.doubleArray))
    lastState = numpy.array(list(self.lastObservation.doubleArray))
    lastAction = self.lastAction.intArray[0]

    newDiscState = self.getDiscState(observation.intArray)
    lastDiscState = self.getDiscState(self.lastObservation.intArray)

    phi_t = numpy.zeros((self.numStates + 1,))
    phi_tp = numpy.zeros((self.numStates + 1,))
    phi_t[0] = lastDiscState
    phi_t[1:] = lastState
    phi_tp[0] = newDiscState
    phi_tp[1:] = newState

    #print ','.join(map(str, lastState))
    self.planner.updateExperience(phi_t, lastAction, phi_tp, reward)

    newIntAction = self.getAction(newState, newDiscState)
    returnAction = Action()
    returnAction.intArray = [newIntAction]

    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def create_action(self, action):
    if np.isscalar(action):
        action = np.array([action])
    return_action = Action()
    return_action.intArray = [
        action[:self.learner.dim_action()].astype(int)]
    return return_action
def agent_start(self, observation):
    # Get intensity from current observation array
    tmp = np.bitwise_and(
        np.asarray(observation.intArray[128:]).reshape([210, 160]),
        0b0001111)  # Get Intensity from the observation
    obs_array = (spm.imresize(tmp, (110, 84)))[110 - 84 - 8:110 - 8, :]  # Scaling

    # Initialize State
    self.state = np.zeros((4, 84, 84), dtype=np.uint8)
    self.state[0] = obs_array
    state_ = cuda.to_gpu(
        np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32))

    # Generate an Action e-greedy
    returnAction = Action()
    action, Q_now = self.DQN.e_greedy(state_, self.epsilon)
    returnAction.intArray = [action]

    # Update for next step
    self.lastAction = copy.deepcopy(returnAction)
    self.last_state = self.state.copy()
    self.last_observation = obs_array

    return returnAction
def agent_start(self, observation):
    """
    This method is called once at the beginning of each episode.
    No reward is provided, because reward is only available after
    an action has been taken.

    Arguments:
       observation - An observation of type rlglue.types.Observation

    Returns:
       An action of type rlglue.types.Action
    """
    self.step_counter = 0
    self.batch_counter = 0

    # We report the mean loss for every epoch.
    self.loss_averages = []

    self.start_time = time.time()
    this_int_action = self.randGenerator.randint(0, self.num_actions - 1)
    return_action = Action()
    return_action.intArray = [this_int_action]

    self.last_action = copy.deepcopy(return_action)
    self.last_img = self._resize_observation(observation.intArray)

    return return_action
def agent_start(self, observation):
    '''
    Initialize the episode strategy.
    '''
    # Generate an action: query (0, 0)
    action = Action()
    action.charArray.append('q')
    action.intArray = [1, 0, 0]

    # Increment the strategy
    self.strategyIndex += 1

    # Add the 1st node (0, 0), facing North, to the partial nodes
    initPartialNode = Node()
    self.partialStateNodes = [initPartialNode]

    # Initialize a new queue according to the strategy
    self.newQueu()

    # Set the agenda
    self.agenda = self.EAGLE

    # Reset the pointer to the action path
    self.pathToGoalIndex = -1

    self.visited.fill(False)
    self.depthq = []

    # To measure performance
    self.numExpandedNodes = 0
    self.startTime = time.time()

    #print 'End the method start'
    return action
def agent_step(self, reward, observation):
    observed_screen = self.preprocess_screen(observation)
    self.state = np.roll(self.state, 1, axis=0)
    self.state[0] = observed_screen

    ########################### DEBUG ###############################
    # if self.total_time_step % 500 == 0 and self.total_time_step != 0:
    #     self.dump_state()

    self.learn(reward)

    return_action = Action()
    q_max = None
    q_min = None
    if self.time_step % config.rl_action_repeat == 0:
        action, q_max, q_min = self.dqn.eps_greedy(
            self.reshape_state_to_conv_input(self.state),
            self.exploration_rate)
    else:
        action = self.last_action.intArray[0]
    return_action.intArray = [action]

    self.dump_result(reward, q_max, q_min)

    if self.policy_frozen is False:
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state
        self.time_step += 1
        self.total_time_step += 1

    return return_action
def agent_start(self, observation):
    """
    This method is called once at the beginning of each episode.
    No reward is provided, because reward is only available after
    an action has been taken.

    Arguments:
       observation - An observation of type rlglue.types.Observation

    Returns:
       An action of type rlglue.types.Action
    """
    # this_int_action = self.randGenerator.randint(0, self.num_actions-1)
    observation_matrix = np.asmatrix(observation.doubleArray, dtype='float32')
    actions = self.action_network.predict(observation_matrix)
    return_action = Action()
    return_action.doubleArray = actions

    self.last_action = copy.deepcopy(actions)
    self.last_state = np.asmatrix(observation.doubleArray, dtype=floatX)

    return return_action
def agent_step(self, reward, observation):
    #print "actual reward", reward
    #print "actual observation", observation.doubleArray

    # Approximate the value function
    self.lastObservation = copy.deepcopy(observation)
    self.next_observation_list.append(observation.doubleArray)
    self.approximateKernelFunction()
    #end of test

    print reward

    # Test how reward approximation works
    self.approximateRewardFunction(reward, observation)
    #end of test

    thisDoubleAction = self.approximateAction()
    returnAction = Action()
    returnAction.doubleArray = thisDoubleAction

    # Approximate the value function
    self.action_list.append(thisDoubleAction)
    self.last_observation_list.append(observation.doubleArray)
    #end of test

    self.lastAction = copy.deepcopy(returnAction)
    return returnAction
def do_step(self, state, reward=None):
    """
    Runs the actual learning algorithm.
    In a separate function so it can be called both on start and on step.
    """
    #self.debug('do_step(', state, ',', reward, ')')
    #if not state in self.Q:
    #    # State not yet visited, initialize randomly
    #    self.Q[state] = self.random_actions()

    # Run the Q update if this isn't the first step
    action = None
    if reward is not None:
        action = self.update_Q(self.last_state, self.last_action, reward, state)

    # Action object
    a_obj = Action()
    if action is None:
        # Query the policy to find the best action
        action = self.policy(state)
    a_obj.charArray = list(action)

    # Save the current state-action pair for the next step's Q update.
    self.last_state = state
    self.last_action = action

    # And we're done
    return a_obj
def _select_action(self, phi=None):
    """
    Utility function for selecting an action.

    phi: ndarray
        Memory from which action should be selected.
    """
    if self.action_count % self.k == 0:
        if (np.random.rand() > self.epsilon) and phi:
            # Get action from Q-function
            phi = np.array(phi)[:, :, :, None]
            action_int = self.action_func(phi)[0]
        else:
            # Get random action
            action_int = np.random.randint(0, len(self.action_map))

        self.action_log[action_int] += 1
        self.cmd = [0] * len(self.action_map)
        self.cmd[action_int] = 1

        # Map cmd to ALE action
        # 18 is the number of commands ALE accepts
        action = Action()
        action.intArray = [self.action_map[action_int]]
        self.action = action
def agent_step(self, reward, observation):
    import math
    if self.Episode_Counter > 5000:
        self.Epsilon = 0
    self.Steps += 1
    print reward
    thisDoubleAction = self.agent_action_step(reward, observation.doubleArray)
    returnAction = Action()
    returnAction.doubleArray = thisDoubleAction
    self.lastObservation = copy.deepcopy(observation)
    self.lastAction = copy.deepcopy(returnAction)
    return returnAction
def agent_step(self, reward, observation):
    newState = observation.intArray[0]
    lastState = self.lastObservation.intArray[0]
    lastAction = self.lastAction.intArray[0]

    Q_sa = self.value_function[lastState][lastAction]

    # max_a' Q(s', a')
    Q_sprime_aprime = -500000
    for a in range(self.numberOfActions):
        if self.value_function[newState][a] > Q_sprime_aprime:
            Q_sprime_aprime = self.value_function[newState][a]

    # Updating Q function
    new_Q_sa = Q_sa + self.sarsa_stepsize * (
        reward + self.sarsa_gamma * Q_sprime_aprime - Q_sa)

    newIntAction = self.egreedy(newState)

    if not self.policyFrozen:
        self.value_function[lastState][lastAction] = new_Q_sa

    returnAction = Action()
    returnAction.intArray = [newIntAction]

    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)

    return returnAction
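# The `egreedy` helper used above is not shown. A plausible sketch over
# `self.value_function`; the `self.randGenerator` (a random.Random) and
# `self.sarsa_epsilon` attribute names are assumptions.
def egreedy(self, state):
    # Random action with probability epsilon, greedy otherwise (sketch).
    if not self.policyFrozen and self.randGenerator.random() < self.sarsa_epsilon:
        return self.randGenerator.randint(0, self.numberOfActions - 1)
    return max(range(self.numberOfActions),
               key=lambda a: self.value_function[state][a])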
def agent_step(self, reward, observation):
    """Take one step in an episode for the agent, as the result of taking the last action.

    Args:
        reward: The reward received for taking the last action from the previous state.
        observation: The next observation of the episode, which is the consequence of taking the previous action.

    Returns:
        The next action the RL agent chooses to take, represented as an RLGlue Action object.
    """
    newState = numpy.array(list(observation.doubleArray))
    lastState = numpy.array(list(self.lastObservation.doubleArray))
    lastAction = self.lastAction.intArray[0]

    newDiscState = self.getDiscState(observation.intArray)
    lastDiscState = self.getDiscState(self.lastObservation.intArray)

    phi_t = numpy.zeros((self.numStates + 1,))
    phi_tp = numpy.zeros((self.numStates + 1,))
    phi_t[0] = lastDiscState
    phi_t[1:] = lastState
    phi_tp[0] = newDiscState
    phi_tp[1:] = newState

    #print ','.join(map(str, lastState))
    self.planner.updateExperience(phi_t, lastAction, phi_tp, reward)

    newIntAction = self.getAction(newState, newDiscState)
    returnAction = Action()
    returnAction.intArray = [newIntAction]

    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def agent_step(self, reward, observation):
    # Increment the step counter.
    self.step_counter += 1

    self.update_state(observation)
    self.update_targetQ()

    # Decide which move to play. A return value of -1 means "pass".
    int_action = self.select_int_action()
    action = Action()
    action.intArray = [int_action]
    self.reward = reward

    # Update eps.
    self.update_eps()

    # Store the transition (state, action, reward, outcome).
    self.store_transition(terminal=False)

    if not self.frozen:
        # Run a learning step.
        if self.step_counter > self.learn_start:
            self.replay_experience()

    self.last_state = copy.deepcopy(self.state)
    self.last_action = copy.deepcopy(int_action)

    # Hand the stone position back to the environment.
    return action
def agent_step(self, reward, observation):
    """Take one step in an episode for the agent, as the result of taking the last action.

    Args:
        reward: The reward received for taking the last action from the previous state.
        observation: The next observation of the episode, which is the consequence of taking the previous action.

    Returns:
        The next action the RL agent chooses to take, represented as an RLGlue Action object.
    """
    newState = numpy.array(list(observation.doubleArray))
    lastState = numpy.array(list(self.lastObservation.doubleArray))
    lastAction = self.lastAction.intArray[0]

    newDiscState = self.getDiscState(observation.intArray)
    lastDiscState = self.getDiscState(self.lastObservation.intArray)

    # Update eligibility traces
    phi_t = numpy.zeros(self.traces.shape)
    phi_t[lastDiscState, :, lastAction] = self.basis.computeFeatures(lastState)
    self.update_traces(phi_t, None)
    self.update(phi_t, newState, newDiscState, reward)

    # QLearning can choose action after update
    newIntAction = self.getAction(newState, newDiscState)
    returnAction = Action()
    returnAction.intArray = [newIntAction]

    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
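# The `update` call above is left to the surrounding class. A minimal
# sketch of a Q(lambda) weight update consistent with the trace shape
# (discrete state, feature, action); a `self.weights` array shaped like
# `self.traces` and the `self.alpha`/`self.gamma` hyperparameter names
# are assumptions.
def update(self, phi_t, new_state, new_disc_state, reward):
    # TD target bootstraps from the best action in the new state.
    q_t = numpy.sum(self.weights * phi_t)
    new_features = self.basis.computeFeatures(new_state)
    q_tp = numpy.max(numpy.dot(new_features, self.weights[new_disc_state]))
    delta = reward + self.gamma * q_tp - q_t
    # The traces spread the correction over recently visited state-actions.
    self.weights += self.alpha * delta * self.traces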
def agent_step(self, reward, observation):
    self.stepCount = self.stepCount + 1
    action = Action()
    action.intArray = observation.intArray
    action.doubleArray = observation.doubleArray
    action.charArray = observation.charArray
    return action
def agent_start(self, observation):
    self.stepCount = 0
    action = Action()
    action.intArray = observation.intArray
    action.doubleArray = observation.doubleArray
    action.charArray = observation.charArray
    return action
def agent_start(self, observation):
    self.stepCount = 0
    action = Action()
    action.intArray = observation.intArray
    action.doubleArray = observation.doubleArray
    action.charArray = observation.charArray
    return action
def agent_step(self, reward, observation):
    self.stepCount = self.stepCount + 1
    action = Action()
    action.intArray = observation.intArray
    action.doubleArray = observation.doubleArray
    action.charArray = observation.charArray
    return action
def agent_step(self, reward, observation):
    # Observation
    obs_array = np.array(observation.doubleArray)
    #print "state: %3f %3f %3f %3f" % (obs_array[0], obs_array[1], obs_array[2], obs_array[3])

    # Compose state
    #self.state = self.rescale_value(obs_array)
    self.state = obs_array
    state_ = cuda.to_gpu(np.asanyarray(self.state.reshape(1, 12), dtype=np.float32))

    # Exploration decays along the time sequence
    if self.policyFrozen is False:  # Learning ON/OFF
        if self.DQN.initial_exploration < self.time:
            self.epsilon -= 1.0 / 10**6
            if self.epsilon < 0.1:
                self.epsilon = 0.1
            eps = self.epsilon
        else:  # Initial exploration phase
            print "Initial Exploration : %d / %d steps" % (self.time, self.DQN.initial_exploration)
            eps = 1.0
    else:  # Evaluation
        print "Policy is Frozen"
        eps = 0.05

    # Generate an action by e-greedy action selection
    returnAction = Action()
    action = self.DQN.e_greedy(state_, eps)
    returnAction.doubleArray = action[0].tolist()

    # Learning phase
    if self.policyFrozen is False:  # Learning ON/OFF
        self.DQN.stockExperience(self.time, self.last_state,
                                 np.asarray(self.lastAction.doubleArray, dtype=np.float32),
                                 reward, self.state, False)
        self.DQN.experienceReplay(self.time)

    # Target model update
    # if self.DQN.initial_exploration < self.time and np.mod(self.time, self.DQN.target_model_update_freq) == 0:
    #     print "########### MODEL UPDATED ######################"
    #     self.DQN.hard_target_model_update()

    # Simple text-based visualization
    print 'Time Step %d / ACTION %s / REWARD %.5f / EPSILON %.5f' % (self.time, str(action[0]), reward, eps)

    # Updates for next step
    self.last_observation = obs_array
    if self.policyFrozen is False:
        self.lastAction = copy.deepcopy(returnAction)
        self.last_state = self.state.copy()
        self.time += 1

    return returnAction
def _getAction(self):
    """
    Return an RL-Glue action built from the numpy array yielded by the
    held pybrain agent.
    """
    action = RLGlueAction()
    action.doubleArray = self.agent.getAction().tolist()
    action.intArray = []
    return action
def getCellsNeededForDiscovery(self, node):
    '''
    Build the list of locations to discover and send an action with them.
    '''
    newPosition = self.newPosition(node.state.position, node.state.orintation)[1]
    action = Action()
    action.intArray = [1, newPosition[0], newPosition[1]]
    action.charArray.append('q')
    return action
def getRandomAction(self, mindir=-1, run=0):
    action = Action(3, 0)
    # direction (left: -1, right: 1, neither: 0)
    action.intArray[0] = random.randint(mindir, 1)
    # jumping (yes: 1, no: 0)
    action.intArray[1] = random.randint(0, 1)
    # speed button (on: 1, off: 0)
    action.intArray[2] = random.randint(run, 1)
    return action
def agent_step(self, reward, observation):
    global save_flg2

    # Preprocess
    tmp = np.bitwise_and(np.asarray(observation.intArray[128:]).reshape([210, 160]),
                         0b0001111)  # Get intensity from the observation
    obs_array = (spm.imresize(tmp, (110, 84)))[110 - 84 - 8:110 - 8, :]  # Scaling

    # Overlay the previous frame onto the current one
    obs_processed = np.maximum(obs_array, self.last_observation)  # Elementwise maximum of two frames

    # Compose state: 4-step sequential observation
    self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_processed],
                               dtype=np.uint8)  # Shift the frames, dropping index 0
    state_ = np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32)

    # Exploration decays along the time sequence
    if self.policyFrozen is False:  # Learning ON/OFF
        if self.DQN.initial_exploration < self.time:
            self.epsilon -= 1.0 / 10**6
            if self.epsilon < 0.1:
                self.epsilon = 0.1
            eps = self.epsilon
        else:  # Initial exploration phase
            print "Initial Exploration : %d/%d steps" % (self.time, self.DQN.initial_exploration)
            eps = 1.0
    else:  # Evaluation
        print "Policy is Frozen"
        eps = 0.05

    # Generate an action by e-greedy action selection
    returnAction = Action()
    action, Q_now = self.DQN.e_greedy(state_, eps)
    returnAction.intArray = [action]

    # Learning phase
    loss_val = 0
    if self.policyFrozen is False:  # Learning ON/OFF
        self.DQN.stockExperience(self.time, self.last_state, self.lastAction.intArray[0],
                                 reward, self.state, False)
        loss_val = self.DQN.experienceReplay(self.time)

        # Target model update
        if self.DQN.initial_exploration < self.time and np.mod(self.time, self.DQN.target_model_update_freq) == 0:
            print "########### MODEL UPDATED ######################"
            self.DQN.target_model_update()

    # Simple text-based visualization
    print ' Time Step %d / ACTION %d / REWARD %.1f / EPSILON %.6f / Q_max %3f' % (
        self.time, self.DQN.action_to_index(action), np.sign(reward), eps, np.max(Q_now))
    logger.info("{},{},{},{},{},{}".format(dt.now().strftime("%Y-%m-%d_%H:%M:%S"),
                                           self.time, self.DQN.action_to_index(action),
                                           np.sign(reward), eps, np.max(Q_now)))

    # Updates for next step
    self.last_observation = obs_array
    if self.policyFrozen is False:
        self.lastAction = copy.deepcopy(returnAction)
        self.last_state = self.state.copy()
        self.time += 1

    return returnAction
def agent_start(self, observation):
    self.optionCurrentlyOn = False
    theState = observation.intArray[0]
    s = self.valid_states.index(theState)  # row index

    # Choose either a primitive action or an option
    a = self.egreedy(s)
    if a < self.numActions:
        # Primitive action
        thisIntAction = a
        self.optionCurrentlyOn = False
        print 'Primitive action chosen'
    else:
        # Composing an option from S_i to S_j
        self.optionCurrentlyOn = True
        self.currentOptionTime = 0
        self.curentOptionStartState = s
        self.currentOptionReward = 0.0

        # 1. Find the abstract state you belong to & are going to
        self.option_S_i = self.absStateMembership[s]  # initiation step
        self.option_S_j = a - self.numActions  # actually, we will have to choose S_j based on SMDP
        #print 'Shape of first term: ', self.p_mat[s][0].shape
        #print self.option_S_j
        #print 'Shape of second term: ', (self.chi_mat.T[self.option_S_j]).T.shape
        #print 'Debug:'
        #print self.chi_mat[0, 0]

        # 2. Choose the action by membership ascent
        thisIntAction = 1
        maxVal = 0
        for a in xrange(4):
            action_pref = max(self.normalizationC * (
                np.sum(np.dot(np.array(self.p_mat[s][a]),
                              np.array(self.chi_mat.T[self.option_S_j].T))) -
                self.chi_mat[s, self.option_S_j]), 0)
            print 'Action: ', a, ' ', action_pref
            if action_pref > maxVal:
                thisIntAction = a
                maxVal = action_pref
        print 'Option chosen'
        self.currentOptionTime += 1

    print 'Action chosen: ', thisIntAction
    returnAction = Action()
    returnAction.intArray = [thisIntAction]

    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    self.episode += 1
    return returnAction
def agent_start(self, Obs):
    State = Obs.intArray[0]
    action = self.epsilon_greedy(State)
    returnaction = Action()
    returnaction.intArray = [action]
    self.lastaction = copy.deepcopy(returnaction)
    self.lastObs = copy.deepcopy(Obs)
    return returnaction
def agent_start(self, observation):
    theState = observation.intArray[0]
    thisIntAction = self.egreedy(theState)
    returnAction = Action()
    returnAction.intArray = [thisIntAction]
    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction