    def agent_step(self, reward, observation):
        lastState = self.lastObservation.intArray
        lastAction = self.lastAction.intArray
        lastStateId = SamplingUtility.getStateId(lastState)
        lastActionIdx = self.all_allowed_actions[lastStateId].index(tuple(lastAction))
        # The environment signalled an illegal move: prune that action from the
        # allowed set and pick a replacement action for the same state.
        if reward == self.Bad_Action_Penalty:
            self.all_allowed_actions[lastStateId].pop(lastActionIdx)
            self.Q_value_function[lastStateId].pop(lastActionIdx)
            newAction = self.egreedy(self.lastObservation.intArray)
            returnAction = Action()
            returnAction.intArray = newAction
            self.lastAction = copy.deepcopy(returnAction)
            return returnAction

        newState = observation.intArray
        newAction = self.egreedy(newState)
        if type(newAction) is tuple:
            newAction = list(newAction)
        Q_sa = self.Q_value_function[lastStateId][lastActionIdx]
        Q_sprime_aprime = self.Q_value_function[SamplingUtility.getStateId(newState)][
                          self.all_allowed_actions[SamplingUtility.getStateId(newState)].index(tuple(newAction))]
        # SARSA backup: Q(s,a) <- Q(s,a) + alpha * (reward + gamma * Q(s',a') - Q(s,a))
        new_Q_sa = Q_sa + self.sarsa_stepsize * (reward + self.sarsa_gamma * Q_sprime_aprime - Q_sa)
        if not self.policyFrozen:
            self.Q_value_function[lastStateId][lastActionIdx] = new_Q_sa
        returnAction = Action()
        returnAction.intArray = newAction
        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)
        return returnAction
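
A minimal sketch of the egreedy helper this example relies on but does not show, working over the same per-state lists of allowed actions and Q-values; the name and signature are assumptions, not the original code.

import random

def egreedy_sketch(q_values, allowed_actions, epsilon=0.1):
    """With probability epsilon return a random allowed action, otherwise a greedy one."""
    if random.random() < epsilon:
        return random.choice(allowed_actions)
    best_idx = max(range(len(allowed_actions)), key=lambda i: q_values[i])
    return allowed_actions[best_idx]
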
	def agent_step(self, reward, observation):
		'''
		Called by RL-Glue at each time step.
		'''
		action = Action()
		
		#eagle mode 
		if self.agenda == self.EAGLE:
			# combine observation with partial nodes and enqueue
			self.successorStates = self.updateWorkingNodeSet(self.partialStateNodes, observation)			
			self.enqueue(self.successorStates)
			
			
			if self.heapQueue.empty():
				# For iterative deepening, check whether the iteration limit has been exceeded
				if self.strategyIndex==2 and self.iteration <= self.MAX_DEPTH: # if ID
					self.iteration+=1
					action.charArray.append('q')
					action.intArray = [1, 0, 0]
					initPartialNode = Node()
					self.partialStateNodes = [initPartialNode]
					self.heapQueue = Queue.LifoQueue()
					self.visited.fill(False)
					return action
				# if not ID 
				print 'fail'
				print 'elapsed time:', time.time()-self.startTime,'s'
				action.charArray.append('x')
				action.intArray = []
				return action
		
			#Get first element from list
			first=self.heapQueue.get()[1]
			self.numExpandedNodes+=1
			#for debug
			self.depthq.append(first.depth)
			self.setVisited(first)
			# if the goal is reached, send a dummy action and switch to agent mode
			if self.goal(first):
				self.pathToGoal = self.createPathToGoal(first)
				self.agenda = self.AGENT
				print self.pathToGoal,'number of steps',len(self.pathToGoal)
				print 'number of expanded nodes:',self.numExpandedNodes
				print 'elapsed time:', time.time()-self.startTime,'s'
				#print max(self.depthq)
				action.charArray.append('.')
				action.intArray = []
				return action
			
			self.partialStateNodes = self.getSuccessorStates(first)
			return self.getCellsNeededForDiscovery(first) 

		# Agent mode just send actions
		if self.agenda == self.AGENT:
			self.pathToGoalIndex = self.pathToGoalIndex + 1
			action.charArray.append(self.pathToGoal[self.pathToGoalIndex])
			action.intArray = []
			return action
Example No. 3
	def agent_step(self,Reward,Obs):

		new_state = Obs.intArray[0]
		last_state = self.lastObs.intArray[0]
		last_action = self.lastaction.intArray[0]

		new_action = self.epsilon_greedy(new_state)

		

		Q_sa = self.qfunction[last_state][last_action]
		Q_saprime = self.qfunction[new_state][new_action]

		# TD error for the SARSA(lambda) backup
		delta = Reward + self.gamma*Q_saprime - Q_sa

		# Accumulate the eligibility trace of the last state-action pair
		self.efunction[last_state][last_action] = self.efunction[last_state][last_action] + 1

		self.qfunction = np.array(self.qfunction)
		self.efunction = np.array(self.efunction)

		# Update all state-action pairs in proportion to their traces, then decay the traces
		self.qfunction = self.qfunction + self.learningrate*delta*self.efunction
		self.efunction = self.gamma*self.lamda*self.efunction
		

		returnaction = Action()
		returnaction.intArray = [new_action]

		self.lastaction = copy.deepcopy(returnaction)
		self.lastObs = copy.deepcopy(Obs)

		return returnaction
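
The example above is a tabular SARSA(lambda) step with accumulating traces. The same backup as a self-contained sketch (hypothetical names; Q and the trace table E are assumed to be numpy arrays):

import numpy as np

def sarsa_lambda_update(Q, E, s, a, r, s_next, a_next, alpha, gamma, lam):
    """One accumulating-trace SARSA(lambda) backup on tabular Q and trace table E."""
    delta = r + gamma * Q[s_next, a_next] - Q[s, a]  # TD error
    E[s, a] += 1.0                                   # accumulate the trace
    Q += alpha * delta * E                           # update every (s, a) pair
    E *= gamma * lam                                 # decay the traces
    return Q, E
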
Example No. 4
    def agent_step(self,reward, observation):
        """Take one step in an episode for the agent, as the result of taking the last action.

        Args:
            reward: The reward received for taking the last action from the previous state.
            observation: The next observation of the episode, which is the consequence of taking the previous action.

        Returns:
            The next action the RL agent chooses to take, represented as an RLGlue Action object.
        """
        newState = numpy.array(list(observation.doubleArray))
        lastState = numpy.array(list(self.lastObservation.doubleArray))
        lastAction = self.lastAction.intArray[0]

        newDiscState = self.getDiscState(observation.intArray)
        lastDiscState = self.getDiscState(self.lastObservation.intArray)

        phi_t = numpy.zeros((self.numStates+1,))
        phi_tp = numpy.zeros((self.numStates+1,))
        phi_t[0] = lastDiscState
        phi_t[1:] = lastState
        phi_tp[0] = newDiscState
        phi_tp[1:] = newState

        #print ','.join(map(str, lastState))

        self.planner.updateExperience(phi_t, lastAction, phi_tp, reward)

        newIntAction = self.getAction(newState, newDiscState)
        returnAction=Action()
        returnAction.intArray=[newIntAction]

        self.lastAction=copy.deepcopy(returnAction)
        self.lastObservation=copy.deepcopy(observation)
        return returnAction
Example No. 5
    def agent_step(self, reward, obs):
        """ This function is called by the environment while the episode lasts.

        If learning is not frozen, the option-value function Q(s, o) is updated
        using intra-option learning.

        :param reward: The reward obtained as a result of the last transition.
        :param obs: An observation from the environment
        :type obs: :class:`rlglue.types.Observation`
        :returns: The primitive action to execute in the environment according to the
        behavior policy.
        :rtype: a primitive action under the form of a :class:`rlglue.types.Action`

        """
        observation = np.array(obs.doubleArray)
        current_features = self.basis.computeFeatures(observation)

        if not self.finished_learning:
            self.intraoption_update(reward, current_features, observation)

        self.last_observation = observation
        self.last_features = current_features
        self.last_action = self.mu(observation,
                                   current_features).pi(observation)

        action = Action()
        action.intArray = [self.last_action]
        return action
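
The docstring above refers to intra-option learning, but the update itself is hidden inside self.intraoption_update (and this agent works with linear features). A tabular sketch of the general intra-option Q-learning backup, as an illustration of the technique rather than this agent's actual code:

import numpy as np

def intra_option_update(Q, options, s, a, r, s_next, alpha, gamma):
    """Update Q[s, o] for every option o whose policy would take action a in s.

    options is a list of (pi, beta) pairs: pi(s) -> primitive action,
    beta(s) -> termination probability in state s."""
    for o, (pi, beta) in enumerate(options):
        if pi(s) != a:
            continue  # this option is not consistent with the action actually taken
        # Value of arriving in s_next: continue the option or terminate and switch to the best one.
        U = (1.0 - beta(s_next)) * Q[s_next, o] + beta(s_next) * np.max(Q[s_next])
        Q[s, o] += alpha * (r + gamma * U - Q[s, o])
    return Q
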
    def agent_step(self, reward, observation):
        """
        This method is called each time step. 

        Arguments: 
           reward      - Real valued reward.
           observation - An observation of type rlglue.types.Observation

        Returns: 
           An action of type rlglue.types.Action
        
        """
        # Generate random action
        this_int_action=self.randGenerator.randint(0,self.num_actions-1)
        return_action=Action()
        return_action.intArray=[this_int_action]
        
        if self.show_ale:
            self._show_ale_color()
            #self._show_ale_gray()

        if self.saving:
            if self.int_states:
                self.states.append(self.last_observation.intArray)
            else:
                self.states.append(self.last_observation.doubleArray)

            self.actions.append(self.last_action.intArray[0])
            self.rewards.append(reward)
            self.absorbs.append(False)

        self.last_action=copy.deepcopy(return_action)
        self.last_observation=copy.deepcopy(observation)

        return return_action
Example No. 7
	def agent_step(self,Reward,Obs):

		new_state = Obs.intArray[0]
		last_state = self.lastObs.intArray[0]
		last_action = self.lastaction.intArray[0]



		Q_sa = self.qfunction[last_state][last_action]
		Q_saprime = self.maxim(new_state)

		Q_new = Q_sa + self.learningrate*( Reward + self.gamma*Q_saprime - Q_sa)
		
		#if not self.pause:
		self.qfunction[last_state][last_action] = Q_new

		# Choose the next action to take
		new_action = self.epsilon_greedy(new_state)

		returnaction = Action()
		returnaction.intArray = [new_action]

		self.lastaction = copy.deepcopy(returnaction)
		self.lastObs = copy.deepcopy(Obs)

		return returnaction
Example No. 8
    def agent_step(self, reward, observation):
        """Take one step in an episode for the agent, as the result of taking the last action.

        Args:
            reward: The reward received for taking the last action from the previous state.
            observation: The next observation of the episode, which is the consequence of taking the previous action.

        Returns:
            The next action the RL agent chooses to take, represented as an RLGlue Action object.
        """

        newState = numpy.array(list(observation.doubleArray))
        lastState = numpy.array(list(self.lastObservation.doubleArray))
        lastAction = self.lastAction.intArray[0]

        newDiscState = self.getDiscState(observation.intArray)
        lastDiscState = self.getDiscState(self.lastObservation.intArray)

        # Update eligibility traces
        phi_t = numpy.zeros(self.traces.shape)
        phi_t[lastDiscState, :,
              lastAction] = self.basis.computeFeatures(lastState)

        self.update_traces(phi_t, None)
        self.update(phi_t, newState, newDiscState, reward)

        # QLearning can choose action after update
        newIntAction = self.getAction(newState, newDiscState)
        returnAction = Action()
        returnAction.intArray = [newIntAction]

        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)
        return returnAction
Example No. 9
    def agent_start(self, observation):
        """
        This method is called once at the beginning of each episode.
        No reward is provided, because reward is only available after
        an action has been taken.

        Arguments:
           observation - An observation of type rlglue.types.Observation

        Returns:
           An action of type rlglue.types.Action
        """

        self.step_counter = 0
        self.batch_counter = 0

        # We report the mean loss for every epoch.
        self.loss_averages = []

        self.start_time = time.time()
        this_int_action = self.randGenerator.randint(0, self.num_actions-1)
        return_action = Action()
        return_action.intArray = [this_int_action]

        self.last_action = copy.deepcopy(return_action)

        self.last_img = self._resize_observation(observation.intArray)

        return return_action
Example No. 10
 def agent_step(self, reward, observation):
     # Increment the step counter
     self.step_counter += 1
     
     self.update_state(observation)
     self.update_targetQ()
     
     # Decide which move to play.
     int_action = self.select_int_action() # a return value of -1 means pass
     action = Action()
     action.intArray = [int_action]
     self.reward = reward
     
     # Update eps
     self.update_eps()
     
     # Store the transition (state, action, reward, result)
     self.store_transition(terminal=False)
     
     if not self.frozen:
         # Run a training step
         if self.step_counter > self.learn_start:
             self.replay_experience()
     
     self.last_state = copy.deepcopy(self.state)
     self.last_action = copy.deepcopy(int_action)
     
     # Return the position of ○ (the chosen move)
     return action
Example No. 11
    def agent_step(self, reward, observation):
        self.step_counter += 1
        self.total_reward += reward
        cur_img = self.resize_image(observation.intArray)

        if self.is_testing:
            int_action = self.choose_action(self.test_table,
                                            cur_img,
                                            np.clip(reward, -1, 1),
                                            testing_ep=0.05)
        else:
            if self.step_counter % self.reset_after == 0:
                self.network.reset_q_hat()

            int_action = self.choose_action(self.train_table,
                                            cur_img,
                                            np.clip(reward, -1, 1),
                                            testing_ep=None)
            if self.train_table.num_entries > max(self.learn_start,
                                                  self.batch_size):
                states, actions, rewards, next_states, terminals = self.train_table.get_minibatch(
                    self.batch_size)
                loss, qvals = self.network.train(states, actions, rewards,
                                                 next_states, terminals)
                self.losses.append(loss)
                self.qvals.append(np.mean(qvals))
                self.batch_counter += 1

        return_action = Action()
        return_action.intArray = [int_action]

        self.last_action = int_action
        self.last_img = cur_img

        return return_action
	def agent_start(self, observation):
		'''
		initialize the episode strategy
		'''
		#Generate action, query 0,0
		action = Action()
		action.charArray.append('q')
		action.intArray = [1, 0, 0]
		# increment strategy
		self.strategyIndex += 1
		# add the first node (position (0,0), facing North) to the partial nodes
		initPartialNode = Node()
		self.partialStateNodes = [initPartialNode]
		#initialize new queue according to strategy
		self.newQueu()
		#set the agenda
		self.agenda = self.EAGLE
		#reset the pointer to the action path
		self.pathToGoalIndex = -1 
		self.visited.fill(False)
		self.depthq=[]
		# to measure performance
		self.numExpandedNodes=0
		self.startTime=time.time()
		#print 'End the method start'
		return action
Example No. 13
    def _select_action(self, phi=None):
        """
        Utility function for selecting an action.

        phi: ndarray
            Memory from which action should be selected.
        """
        if self.action_count % self.k == 0:
            if (np.random.rand() > self.epsilon) and phi:
                # Get action from Q-function
                phi = np.array(phi)[:, :, :, None]
                action_int = self.action_func(phi)[0]
            else:
                # Get random action
                action_int = np.random.randint(0, len(self.action_map))
            self.action_log[action_int] += 1

            self.cmd = [0]*len(self.action_map)
            self.cmd[action_int] = 1

            # Map cmd to ALE action
            # 18 is the number of commands ALE accepts
            action = Action()
            action.intArray = [self.action_map[action_int]]
            self.action = action
Example No. 14
    def agent_start(self, observation):
        if self.debug_flag: print('agent start')

        # Increment the step counter
        self.step_counter += 1

        # Shouldn't the state be cleared at the start of an episode?
        #self.state = np.zeros((1, self.n_frames, self.bdim)).astype(np.float32)
        self.state = np.zeros(
            (1, 2, self.n_rows, self.n_cols)).astype(np.float32)

        # kmori: update the state using our own observation.
        # Partly follows the sample; the rest was built a different way.
        self.update_state(observation)
        self.update_targetQ()

        if self.debug_flag: print('Deciding which move to play.')

        # Decide which move to play.
        int_action = self.select_int_action()
        action = Action()
        action.intArray = [int_action]
        if self.debug_flag: print('Updating eps.')

        # Update eps, the probability of placing ○ at random.
        self.update_eps()

        # Save state (the board position) and action (where ○ was placed)
        self.last_state2 = copy.deepcopy(self.last_state)  # state two moves ago
        self.last_action2 = copy.deepcopy(self.last_action)  # action two moves ago
        self.last_state = copy.deepcopy(self.state)
        self.last_action = copy.deepcopy(int_action)

        return action
Example No. 15
    def agent_step(self,reward, observation):
        """Take one step in an episode for the agent, as the result of taking the last action.

        Args:
            reward: The reward received for taking the last action from the previous state.
            observation: The next observation of the episode, which is the consequence of taking the previous action.

        Returns:
            The next action the RL agent chooses to take, represented as an RLGlue Action object.
        """

        newState = numpy.array(list(observation.doubleArray))
        lastState = numpy.array(list(self.lastObservation.doubleArray))
        lastAction = self.lastAction.intArray[0]
        newDiscState = self.getDiscState(observation.intArray)
        lastDiscState = self.getDiscState(self.lastObservation.intArray)
        newIntAction = self.getAction(newState, newDiscState)

        phi_t = numpy.zeros((self.weights.shape[0], self.weights.shape[1]))
        phi_tp = numpy.zeros((self.weights.shape[0], self.weights.shape[1]))
        phi_t[lastDiscState, :] = self.basis.computeFeatures(lastState)
        phi_tp[newDiscState, :] = self.basis.computeFeatures(newState)

        self.step_count += 1
        self.update(phi_t, phi_tp, reward, self.getCompatibleFeatures(phi_t, lastAction, reward, phi_tp, newIntAction))

        returnAction=Action()
        returnAction.intArray=[newIntAction]
        self.lastAction=copy.deepcopy(returnAction)
        self.lastObservation=copy.deepcopy(observation)
        return returnAction
Example No. 16
    def agent_step(self, reward, observation):
        """Take one step in an episode for the agent, as the result of taking the last action.

        Args:
            reward: The reward received for taking the last action from the previous state.
            observation: The next observation of the episode, which is the consequence of taking the previous action.

        Returns:
            The next action the RL agent chooses to take, represented as an RLGlue Action object.
        """

        newState = numpy.array(list(observation.doubleArray))
        lastState = numpy.array(list(self.lastObservation.doubleArray))
        lastAction = self.lastAction.intArray[0]
        newDiscState = self.getDiscState(observation.intArray)
        lastDiscState = self.getDiscState(self.lastObservation.intArray)
        newIntAction = self.getAction(newState, newDiscState)

        phi_t = numpy.zeros((self.weights.shape[0], self.weights.shape[1]))
        phi_tp = numpy.zeros((self.weights.shape[0], self.weights.shape[1]))
        phi_t[lastDiscState, :] = self.basis.computeFeatures(lastState)
        phi_tp[newDiscState, :] = self.basis.computeFeatures(newState)

        self.step_count += 1
        self.update(
            phi_t, phi_tp, reward,
            self.getCompatibleFeatures(phi_t, lastAction, reward, phi_tp,
                                       newIntAction))

        returnAction = Action()
        returnAction.intArray = [newIntAction]
        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)
        return returnAction
Example No. 17
 def agent_start(self, observation):
     # Generate a random action within the configured bounds for each action dimension
     return_action = Action()
     return_action.intArray = []
     for i in xrange(0,self.action_size):
         return_action.intArray += [self.rng.randint(self.action_bounds[i][0],self.action_bounds[i][1])]
     return return_action
Example No. 18
    def agent_step(self, reward, observation):
        """Take one step in an episode for the agent, as the result of taking the last action.

        Args:
            reward: The reward received for taking the last action from the previous state.
            observation: The next observation of the episode, which is the consequence of taking the previous action.

        Returns:
            The next action the RL agent chooses to take, represented as an RLGlue Action object.
        """
        newState = numpy.array(list(observation.doubleArray))
        lastState = numpy.array(list(self.lastObservation.doubleArray))
        lastAction = self.lastAction.intArray[0]

        newDiscState = self.getDiscState(observation.intArray)
        lastDiscState = self.getDiscState(self.lastObservation.intArray)

        phi_t = numpy.zeros((self.numStates + 1, ))
        phi_tp = numpy.zeros((self.numStates + 1, ))
        phi_t[0] = lastDiscState
        phi_t[1:] = lastState
        phi_tp[0] = newDiscState
        phi_tp[1:] = newState

        #print ','.join(map(str, lastState))

        self.planner.updateExperience(phi_t, lastAction, phi_tp, reward)

        newIntAction = self.getAction(newState, newDiscState)
        returnAction = Action()
        returnAction.intArray = [newIntAction]

        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)
        return returnAction
Example No. 19
File: agent.py  Project: npow/atari
  def agent_step(self, reward, observation):
    self.step_counter += 1
    self.total_reward += reward
    cur_img = self.resize_image(observation.intArray)

    if self.is_testing:
      int_action = self.choose_action(self.test_table, cur_img, np.clip(reward, -1, 1), testing_ep=0.05)
    else:
      if self.step_counter % self.reset_after == 0:
        self.network.reset_q_hat()

      int_action = self.choose_action(self.train_table, cur_img, np.clip(reward, -1, 1), testing_ep=None)
      if self.train_table.num_entries > max(self.learn_start, self.batch_size):
        states, actions, rewards, next_states, terminals = self.train_table.get_minibatch(self.batch_size)
        loss, qvals = self.network.train(states, actions, rewards, next_states, terminals)
        self.losses.append(loss)
        self.qvals.append(np.mean(qvals))
        self.batch_counter += 1

    return_action = Action()
    return_action.intArray = [int_action]

    self.last_action = int_action
    self.last_img = cur_img

    return return_action
Example No. 20
    def agent_step(self, reward, observation):
        """
        This method is called each time step. 

        Arguments: 
           reward      - Real valued reward.
           observation - An observation of type rlglue.types.Observation

        Returns: 
           An action of type rlglue.types.Action
        
        """
        # Generate random action
        this_int_action = self.randGenerator.randint(0, self.num_actions - 1)
        return_action = Action()
        return_action.intArray = [this_int_action]

        if self.show_ale:
            self._show_ale_color()
            #self._show_ale_gray()

        if self.saving:
            if self.int_states:
                self.states.append(self.last_observation.intArray)
            else:
                self.states.append(self.last_observation.doubleArray)

            self.actions.append(self.last_action.intArray[0])
            self.rewards.append(reward)
            self.absorbs.append(False)

        self.last_action = copy.deepcopy(return_action)
        self.last_observation = copy.deepcopy(observation)

        return return_action
Example No. 21
    def agent_start(self, observation):
        """Start an episode for the RL agent.

        Args:
            observation: The first observation of the episode. Should be an RLGlue Observation object.

        Returns:
            The first action the RL agent chooses to take, represented as an RLGlue Action object.
        """
        log = logging.getLogger('pyrl.agents.sarsa_lambda.agent_start')
        theState = numpy.array(list(observation.doubleArray))
        thisIntAction = self.getAction(theState,
                                       self.getDiscState(observation.intArray))

        returnAction = Action()
        returnAction.intArray = [thisIntAction]

        # Clear traces
        self.traces.fill(0.0)

        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)
        log.debug("Action: %d", thisIntAction)
        log.debug("Start State: %s", theState)
        log.debug("Traces: %s", self.traces)
        return returnAction
Example No. 22
    def agent_step(self, reward, observation):
        """Take one step in an episode for the agent, as the result of taking the last action.

        Args:
            reward: The reward received for taking the last action from the previous state.
            observation: The next observation of the episode, which is the consequence of taking the previous action.

        Returns:
            The next action the RL agent chooses to take, represented as an RLGlue Action object.
        """

        new_state = numpy.array(list(observation.doubleArray))
        last_state = numpy.array(list(self.last_observation.doubleArray))
        last_action = self.last_action.intArray[0]

        new_disc_state = self.getDiscState(observation.intArray)
        last_disc_state = self.getDiscState(self.last_observation.intArray)

        # Update eligibility traces
        phi_t = numpy.zeros(self.traces.shape)
        phi_t[last_disc_state, :, last_action] = self.basis.computeFeatures(last_state)

        self.update_traces(phi_t, None)
        self.update(phi_t, new_state, new_disc_state, reward)

        # QLearning can choose action after update
        new_int_action = self.getAction(new_state, new_disc_state)
        return_action = Action()
        return_action.intArray = [new_int_action]

        self.last_action = copy.deepcopy(return_action)
        self.last_observation = copy.deepcopy(observation)
        return return_action
Example No. 23
 def create_action(self, action):
     if np.isscalar(action):
         action = np.array([action])
     return_action = Action()
     return_action.intArray = [
         action[:self.learner.dim_action()].astype(int)]
     return return_action
Example No. 24
    def agent_start(self, observation):
        """
        This method is called once at the beginning of each episode.
        No reward is provided, because reward is only available after
        an action has been taken.

        Arguments:
           observation - An observation of type rlglue.types.Observation

        Returns:
           An action of type rlglue.types.Action
        """

        self.step_counter = 0
        self.batch_counter = 0

        # We report the mean loss for every epoch.
        self.loss_averages = []

        self.start_time = time.time()
        this_int_action = self.randGenerator.randint(0, self.num_actions-1)
        return_action = Action()
        return_action.intArray = [this_int_action]

        self.last_action = copy.deepcopy(return_action)

        self.last_img = np.array(self._resize_observation(observation.intArray))
        self.last_img = self.last_img.reshape(CROPPED_WIDTH, CROPPED_HEIGHT).T

        return return_action
Example No. 25
    def agent_start(self, observation):

        # Get intensity from current observation array
        tmp = np.bitwise_and(
            np.asarray(observation.intArray[128:]).reshape([210, 160]),
            0b0001111)  # Get Intensity from the observation
        obs_array = (spm.imresize(tmp, (110, 84)))[110 - 84 - 8:110 -
                                                   8, :]  # Scaling

        # Initialize State
        self.state = np.zeros((4, 84, 84), dtype=np.uint8)
        self.state[0] = obs_array
        state_ = cuda.to_gpu(
            np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32))

        # Generate an Action e-greedy
        returnAction = Action()
        action, Q_now = self.DQN.e_greedy(state_, self.epsilon)
        returnAction.intArray = [action]

        # Update for next step
        self.lastAction = copy.deepcopy(returnAction)
        self.last_state = self.state.copy()
        self.last_observation = obs_array

        return returnAction
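
The same preprocessing appears again in agent_step below; pulled out into a standalone helper it reads more clearly. A sketch: spm is scipy.misc, whose imresize exists only in older SciPy releases, and the first 128 entries of the observation are treated as console RAM, as in the example above.

import numpy as np
import scipy.misc as spm

def preprocess_atari_frame(int_array):
    """Luminance-only 84x84 frame from an RL-Glue ALE observation intArray."""
    # Skip the first 128 entries and keep the low luminance nibble of each
    # 210x160 palette byte.
    screen = np.bitwise_and(np.asarray(int_array[128:]).reshape(210, 160), 0b0001111)
    resized = spm.imresize(screen, (110, 84))   # downscale
    return resized[110 - 84 - 8:110 - 8, :]     # crop to 84x84
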
Example No. 26
    def agent_step(self, reward, observation):

        # Preproces
        tmp = np.bitwise_and(
            np.asarray(observation.intArray[128:]).reshape([210, 160]),
            0b0001111)  # Get Intensity from the observation
        obs_array = (spm.imresize(tmp, (110, 84)))[110 - 84 - 8:110 -
                                                   8, :]  # Scaling
        obs_processed = np.maximum(
            obs_array, self.last_observation)  # Take maximum from two frames

        # Compose State : 4-step sequential observation
        self.state = np.asanyarray(
            [self.state[1], self.state[2], self.state[3], obs_processed],
            dtype=np.uint8)
        state_ = cuda.to_gpu(
            np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32))

        # Exploration decays along the time sequence
        if self.policyFrozen is False:  # Learning ON/OFF
            if self.DQN.initial_exploration < self.time:
                self.epsilon -= 1.0 / 10**6
                if self.epsilon < 0.1:
                    self.epsilon = 0.1
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print "Initial Exploration : %d/%d steps" % (
                    self.time, self.DQN.initial_exploration)
                eps = 1.0
        else:  # Evaluation
            print "Policy is Frozen"
            eps = 0.05

        # Generate an Action from e-greedy action selection
        returnAction = Action()
        action, Q_now = self.DQN.e_greedy(state_, eps)
        returnAction.intArray = [action]

        # Learning Phase
        if self.policyFrozen is False:  # Learning ON/OFF
            self.DQN.stockExperience(self.time, self.last_state,
                                     self.lastAction.intArray[0], reward,
                                     self.state, False)
            self.DQN.experienceReplay(self.time)

        # Simple text based visualization
        print ' Time Step %d /   ACTION  %d  /   REWARD %.1f   / EPSILON  %.6f  /   Q_max  %3f' % (
            self.time, self.DQN.action_to_index(action), np.sign(reward), eps,
            np.max(Q_now.get()))

        # Updates for next step
        self.last_observation = obs_array

        # Only advance the learning bookkeeping while the policy is not frozen
        if self.policyFrozen is False:
            self.lastAction = copy.deepcopy(returnAction)
            self.last_state = self.state.copy()
            self.time += 1

        return returnAction
Example No. 27
    def agent_step(self, reward, observation):
        newState = observation.intArray[0]
        lastState = self.lastObservation.intArray[0]
        lastAction = self.lastAction.intArray[0]

        Q_sa = self.value_function[lastState][lastAction]
        # Q-learning target: the maximum Q-value over all actions in the new state
        Q_sprime_aprime = -500000
        for a in range(self.numberOfActions):
            if self.value_function[newState][a] > Q_sprime_aprime:
                Q_sprime_aprime = self.value_function[newState][a]
        #updating Q function
        new_Q_sa = Q_sa + self.sarsa_stepsize * (
            reward + self.sarsa_gamma * Q_sprime_aprime - Q_sa)

        newIntAction = self.egreedy(newState)
        if not self.policyFrozen:
            self.value_function[lastState][lastAction] = new_Q_sa

        returnAction = Action()
        returnAction.intArray = [newIntAction]

        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)

        return returnAction
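
The loop above computes the maximum of Q(s', a) over actions using a sentinel value; the full backup it feeds, written as a self-contained sketch with hypothetical names:

def q_learning_update(Q, s, a, r, s_next, alpha, gamma):
    """Q[s][a] += alpha * (r + gamma * max_a' Q[s_next][a'] - Q[s][a])."""
    target = r + gamma * max(Q[s_next])
    Q[s][a] += alpha * (target - Q[s][a])
    return Q
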
Example No. 28
    def agent_step(self, reward, obs):
        """ This function is called by the environment while the episode lasts.

        If learning is not frozen, the option-value function Q(s, o) is updated
        using intra-option learning.

        :param reward: The reward obtained as a result of the last transition.
        :param obs: An observation from the environment
        :type obs: :class:`rlglue.types.Observation`
        :returns: The primitive action to execute in the environment according to the
        behavior policy.
        :rtype: a primitive action under the form of a :class:`rlglue.types.Action`

        """
        observation = np.array(obs.doubleArray)
        current_features = self.basis.computeFeatures(observation)

        if not self.finished_learning:
            self.intraoption_update(reward, current_features, observation)

        self.last_observation = observation
        self.last_features = current_features
        self.last_action = self.mu(observation, current_features).pi(observation)

        action = Action()
        action.intArray = [self.last_action]
        return action
Example No. 29
	def agent_step(self, reward, observation):
		observed_screen = self.preprocess_screen(observation)
		self.state = np.roll(self.state, 1, axis=0)
		self.state[0] = observed_screen

		########################### DEBUG ###############################
		# if self.total_time_step % 500 == 0 and self.total_time_step != 0:
		# 	self.dump_state()

		self.learn(reward)
		
		return_action = Action()
		q_max = None
		q_min = None
		if self.time_step % config.rl_action_repeat == 0:
			action, q_max, q_min = self.dqn.eps_greedy(self.reshape_state_to_conv_input(self.state), self.exploration_rate)
		else:
			action = self.last_action.intArray[0]
		return_action.intArray = [action]

		self.dump_result(reward, q_max, q_min)

		if self.policy_frozen is False:
			self.last_action = copy.deepcopy(return_action)
			self.last_state = self.state
			self.time_step += 1
			self.total_time_step += 1

		return return_action
Example No. 30
    def agent_step(self, reward, observation):
        self.stepCount = self.stepCount + 1
        action = Action()
        action.intArray = observation.intArray
        action.doubleArray = observation.doubleArray
        action.charArray = observation.charArray

        return action
Example No. 31
    def agent_start(self, observation):
        self.stepCount = 0
        action = Action()
        action.intArray = observation.intArray
        action.doubleArray = observation.doubleArray
        action.charArray = observation.charArray

        return action
Example No. 32
    def agent_step(self,reward, observation):
        self.stepCount=self.stepCount+1
        action=Action()
        action.intArray=observation.intArray
        action.doubleArray=observation.doubleArray
        action.charArray=observation.charArray

        return action
Example No. 33
    def agent_start(self,observation):
        self.stepCount=0
        action=Action()
        action.intArray=observation.intArray
        action.doubleArray=observation.doubleArray
        action.charArray=observation.charArray

        return action
Example No. 34
 def _getAction(self):
     """
     Return an RLGlue action built from the numpy array yielded by
     the underlying pybrain agent.
     """
     action = RLGlueAction()
     action.doubleArray = self.agent.getAction().tolist()
     action.intArray = []
     return action
    def agent_step(self, reward, observation):

        # Preproces
        tmp = np.bitwise_and(np.asarray(observation.intArray[128:]).reshape([210, 160]), 0b0001111)  # Get Intensity from the observation
        obs_array = (spm.imresize(tmp, (110, 84)))[110-84-8:110-8, :]  # Scaling
        obs_processed = np.maximum(obs_array, self.last_observation)  # Take maximum from two frames

        # Compose State : 4-step sequential observation
        self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_processed], dtype=np.uint8)
        state_ = cuda.to_gpu(np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32))

        # Exploration decays along the time sequence
        if self.policyFrozen is False:  # Learning ON/OFF
            if self.DQN.initial_exploration < self.time:
                self.epsilon -= 1.0/10**6
                if self.epsilon < 0.1:
                    self.epsilon = 0.1
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print "Initial Exploration : %d/%d steps" % (self.time, self.DQN.initial_exploration)
                eps = 1.0
        else:  # Evaluation
            print "Policy is Frozen"
            eps = 0.05

        # Generate an Action by e-greedy action selection
        returnAction = Action()
        action, Q_now = self.DQN.e_greedy(state_, eps)
        returnAction.intArray = [action]

        # Learning Phase
        if self.policyFrozen is False:  # Learning ON/OFF
            self.DQN.stockExperience(self.time, self.last_state, self.lastAction.intArray[0], reward, self.state, False)
            self.DQN.experienceReplay(self.time)

        # Target model update
        if self.DQN.initial_exploration < self.time and np.mod(self.time, self.DQN.target_model_update_freq) == 0:
            print "########### MODEL UPDATED ######################"
            self.DQN.target_model_update()
            np.save('params/l1_W.npy',self.DQN.CNN_model.l1.W.get())
            np.save('params/l1_b.npy',self.DQN.CNN_model.l1.b.get())
            np.save('params/l2_W.npy',self.DQN.CNN_model.l2.W.get())
            np.save('params/l2_b.npy',self.DQN.CNN_model.l2.b.get())
            np.save('params/l3_W.npy',self.DQN.CNN_model.l3.W.get())
            np.save('params/l3_b.npy',self.DQN.CNN_model.l3.b.get())
        # Simple text based visualization
        print ' Time Step %d /   ACTION  %d  /   REWARD %.1f   / EPSILON  %.6f  /   Q_max  %3f' % (self.time, self.DQN.action_to_index(action), np.sign(reward), eps, np.max(Q_now.get()))

        # Updates for next step
        self.last_observation = obs_array

        if self.policyFrozen is False:
            self.lastAction = copy.deepcopy(returnAction)
            self.last_state = self.state.copy()
            self.time += 1

        return returnAction
Example No. 36
	def agent_start(self, observation):
		thisIntAction=self.randGenerator.randint(0,1)
		returnAction=Action()
		returnAction.intArray=[thisIntAction]
		
		self.lastAction = copy.deepcopy(returnAction)
		self.lastObservation = copy.deepcopy(observation)

		return returnAction
	def getCellsNeededForDiscovery(self, node):
		'''
		Generate the list of locations to discover and send an action containing them
		'''
		newPosition = self.newPosition(node.state.position, node.state.orintation)[1]
		action = Action()
		action.intArray = [1, newPosition[0], newPosition[1]]
		action.charArray.append('q')
		return action
Example No. 38
 def _getAction(self):
     """
     Return an RLGlue action built from the numpy array yielded by
     the underlying pybrain agent.
     """
     action = RLGlueAction()
     action.doubleArray = self.agent.getAction().tolist()
     action.intArray = []
     return action
Example No. 39
    def agent_step(self, reward, observation):
        global  save_flg2
        # Preproces
        tmp = np.bitwise_and(np.asarray(observation.intArray[128:]).reshape([210, 160]), 0b0001111)  # Get Intensity from the observation
        obs_array = (spm.imresize(tmp, (110, 84)))[110-84-8:110-8, :]  # Scaling

        # Overlay the previous frame to suppress sprite flicker
        obs_processed = np.maximum(obs_array, self.last_observation)  # element-wise maximum of the two frames

        # Compose State : 4-step sequential observation
        self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_processed], dtype=np.uint8)  # shift the 4-frame stack, dropping the oldest frame
        state_ = np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32)

        # Exploration decays along the time sequence
        if self.policyFrozen is False:  # Learning ON/OFF
            if self.DQN.initial_exploration < self.time:
                self.epsilon -= 1.0/10**6
                if self.epsilon < 0.1:
                    self.epsilon = 0.1
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print "Initial Exploration : %d/%d steps" % (self.time, self.DQN.initial_exploration)
                eps = 1.0
        else:  # Evaluation
            print "Policy is Frozen"
            eps = 0.05

        # Generate an Action by e-greedy action selection
        returnAction = Action()
        action, Q_now = self.DQN.e_greedy(state_, eps)
        returnAction.intArray = [action]

        # Learning Phase
        loss_val = 0
        if self.policyFrozen is False:  # Learning ON/OFF
            self.DQN.stockExperience(self.time, self.last_state, self.lastAction.intArray[0], reward, self.state, False)
            loss_val = self.DQN.experienceReplay(self.time)

        # Target model update
        if self.DQN.initial_exploration < self.time and np.mod(self.time, self.DQN.target_model_update_freq) == 0:
            print "########### MODEL UPDATED ######################"
            self.DQN.target_model_update()

        # Simple text based visualization
        print ' Time Step %d /   ACTION  %d  /   REWARD %.1f   / EPSILON  %.6f  /   Q_max  %3f' % (self.time, self.DQN.action_to_index(action), np.sign(reward), eps, np.max(Q_now))
        logger.info("{},{},{},{},{},{}".format(dt.now().strftime("%Y-%m-%d_%H:%M:%S"), \
              self.time, self.DQN.action_to_index(action), np.sign(reward), eps, np.max(Q_now)))

        # Updates for next step
        self.last_observation = obs_array

        if self.policyFrozen is False:
            self.lastAction = copy.deepcopy(returnAction)
            self.last_state = self.state.copy()
            self.time += 1

        return returnAction
Example No. 40
    def agent_start(self,observation):

        self.optionCurrentlyOn = False
        theState=observation.intArray[0]
        s = self.valid_states.index(theState) # row index

        # Choose either a primitive action or option
        a = self.egreedy(s)

        if a<self.numActions:
            # Primitive action
            thisIntAction = a
            self.optionCurrentlyOn = False
            print 'Primitive action chosen'

        else:    
            # Composing an option from S_i to S_j
            self.optionCurrentlyOn = True
            self.currentOptionTime = 0
            self.curentOptionStartState = s
            self.currentOptionReward = 0.0

            # 1. Find the abstract state you belong to and the one you are going to
            self.option_S_i = self.absStateMembership[s] # initiation step
            self.option_S_j = a-self.numActions # actually, we will have to choose S_j based on SMDP

            #print 'Shape of first term: ',self.p_mat[s][0].shape
            #print self.option_S_j
            #print 'Shape of second term: ', (self.chi_mat.T[self.option_S_j]).T.shape

            #print 'Debug:'
            #print self.chi_mat[0,0]

            # 2. Choose action based on membership ascent
            thisIntAction=1
            maxVal = 0
            for a in xrange(4): 
                print 'Action: ',a,' ',max(self.normalizationC*(np.sum(np.dot(np.array(self.p_mat[s][a]),np.array(self.chi_mat.T[self.option_S_j].T))) - self.chi_mat[s,self.option_S_j]),0)
                action_pref = max(self.normalizationC*(np.sum(np.dot(np.array(self.p_mat[s][a]),np.array(self.chi_mat.T[self.option_S_j].T))) - self.chi_mat[s,self.option_S_j]),0)
                if action_pref > maxVal:
                    thisIntAction = a
                    maxVal = action_pref
                print 'Option chosen'

            self.currentOptionTime += 1

        print 'Action chosen: ',thisIntAction

        returnAction=Action()
        returnAction.intArray=[thisIntAction]
        
        self.lastAction=copy.deepcopy(returnAction)
        self.lastObservation=copy.deepcopy(observation)

        self.episode += 1
        return returnAction
Example No. 41
    def agent_start(self, observation):
        #Generate random action, 0 or 1
        thisIntAction = self.randGenerator.randint(0, 1)
        returnAction = Action()
        returnAction.intArray = [thisIntAction]

        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)

        return returnAction
Example No. 42
    def agent_start(self, observation):
        newState = observation.intArray[0]
        x = self.egreedy(newState)
        returnAction = Action()
        returnAction.intArray = [x]

        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)

        return returnAction
Example No. 43
    def agent_step(self,reward, observation):
        #Generate random action, 0 or 1
        thisIntAction=self.randGenerator.randint(0,1)
        returnAction=Action()
        returnAction.intArray=[thisIntAction]
        
        last_action=copy.deepcopy(returnAction)
        last_observation=copy.deepcopy(observation)

        return returnAction
Example No. 44
    def agent_start(self,observation):
        theState = observation.intArray[0]
        thisIntAction = self.egreedy(theState)
        returnAction=Action()
        returnAction.intArray=[thisIntAction]
        
        self.lastAction=copy.deepcopy(returnAction)
        self.lastObservation=copy.deepcopy(observation)

        return returnAction
Example No. 45
    def agent_start(self, Obs):
        State = Obs.intArray[0]
        action = self.epsilon_greedy(State)
        returnaction = Action()
        returnaction.intArray = [action]
        self.lastaction = copy.deepcopy(returnaction)
        self.lastObs = copy.deepcopy(Obs)

        return returnaction
Example No. 46
    def agent_start(self, observation):
        theState = observation.intArray[0]
        thisIntAction = self.egreedy(theState)
        returnAction = Action()
        returnAction.intArray = [thisIntAction]

        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)

        return returnAction
Example No. 47
    def agent_step(self, reward, observation):

        lastState = self.lastObservation.intArray
        lastAction = self.lastAction.intArray
        lastStateId = SamplingUtility.getStateId(lastState)
        lastActionIdx = self.all_allowed_actions[lastStateId].index(
            tuple(lastAction))
        if reward == self.Bad_Action_Penalty:
            self.all_allowed_actions[lastStateId].pop(lastActionIdx)
            self.Q_value_function[lastStateId].pop(lastActionIdx)
            newAction = self.egreedy(self.lastObservation.intArray)
            returnAction = Action()
            returnAction.intArray = newAction
            self.lastAction = copy.deepcopy(returnAction)
            return returnAction

        newState = observation.intArray
        newAction = self.egreedy(
            newState)  #for random player, egreedy=random_player

        if type(newAction) is tuple:
            newAction = list(newAction)
            #print newAction
        # We kept the SARSA variable names for convenience; to test SARSA again, replace max(...) with Q_sprime_aprime and uncomment the code below.
        Q_sprime_aprime = self.Q_value_function[SamplingUtility.getStateId(
            newState)][self.all_allowed_actions[SamplingUtility.getStateId(
                newState)].index(tuple(newAction))]
        # ------> comment out lines 133-139 when you want a random player
        Q_sa = self.Q_value_function[lastStateId][lastActionIdx]
        new_Q_sa = Q_sa + self.stepsize * (
            reward + self.discount * Q_sprime_aprime - Q_sa)

        if not self.policyFrozen:
            self.Q_value_function[SamplingUtility.getStateId(lastState)][
                self.all_allowed_actions[SamplingUtility.getStateId(
                    lastState)].index(tuple(lastAction))] = new_Q_sa
        # ------> end of the lines to comment out <------
        returnAction = Action()
        returnAction.intArray = newAction
        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)
        return returnAction
	def agent_step(self, reward, observation):
		screen = observation.intArray[128:]
		screen = np.reshape(screen, (210, -1))
		self.image.new_image(screen)
		return_action = Action()
		action = randrange(self.numActions)
		return_action.intArray = [action]
		self.lastAction=copy.deepcopy(return_action)
		self.lastObservation=copy.deepcopy(observation)

		return return_action
	def agent_start(self, observation):
		screen = observation.intArray[128:]
		screen = np.reshape(screen, (210, -1))
		maze = detect_maze(screen)
		self.image = pacman_image(maze)
		return_action = Action()
		action = randrange(self.numActions)
		return_action.intArray = [action]
		self.lastAction = copy.deepcopy(return_action)
		self.lastObservation = copy.deepcopy(observation)

		return return_action
Example No. 50
    def agent_start(self, observation):
        theState = observation.intArray
        thisIntAction = self.egreedy(theState)
        if type(thisIntAction) is tuple:
            thisIntAction = list(thisIntAction)
        returnAction = Action()
        returnAction.intArray = thisIntAction

        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)

        return returnAction
Example No. 51
File: agent.py  Project: npow/atari
  def agent_start(self, observation):
    this_int_action = np.random.randint(0, self.num_actions)
    return_action = Action()
    return_action.intArray = [this_int_action]
    self.start_time = time.time()
    self.batch_counter = 0
    self.last_action = 0
    self.losses = []

    self.last_img = self.resize_image(observation.intArray)

    return return_action
Example No. 52
    def agent_step(self, reward, observation):
        """
        This method is called each time step.

        Arguments:
           reward      - Real valued reward.
           observation - An observation of type rlglue.types.Observation

        Returns:
           An action of type rlglue.types.Action

        """

        self.step_counter += 1
        return_action = Action()

        cur_img = self._resize_observation(observation.intArray)

        #TESTING---------------------------
        if self.testing:
            self.total_reward += reward
            int_action = self._choose_action(self.test_data_set, .05,
                                             cur_img, np.clip(reward, -1, 1))
            if self.pause > 0:
                time.sleep(self.pause)

        #NOT TESTING---------------------------
        else:

            if len(self.data_set) > self.replay_start_size:
                self.epsilon = max(self.epsilon_min,
                                   self.epsilon - self.epsilon_rate)

                int_action = self._choose_action(self.data_set, self.epsilon,
                                                 cur_img,
                                                 np.clip(reward, -1, 1))

                if self.step_counter % self.update_frequency == 0:
                    loss = self._do_training()
                    self.batch_counter += 1
                    self.loss_averages.append(loss)

            else: # Still gathering initial random data...
                int_action = self._choose_action(self.data_set, 1.0,
                                                 cur_img,
                                                 np.clip(reward, -1, 1))

        return_action.intArray = [int_action]

        self.last_action = copy.deepcopy(return_action)
        self.last_img = cur_img

        return return_action
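
The schedule above anneals epsilon linearly by epsilon_rate per step, floored at epsilon_min, once the replay buffer holds replay_start_size samples. A standalone sketch of that schedule (hypothetical names):

def annealed_epsilon(steps_since_start, epsilon_start, epsilon_min, epsilon_rate):
    """Linear epsilon annealing: start high, decrease by epsilon_rate per step."""
    return max(epsilon_min, epsilon_start - epsilon_rate * steps_since_start)
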
	def agent_start(self, observation):
		state = observation.intArray[0]

		actionChoice = self.get_action(state)

		action = Action()
		action.intArray = [actionChoice]

		self.lastAction = copy.deepcopy(action)
		self.lastObservation = copy.deepcopy(observation)

		return action
Example No. 54
 def create_action(self,act):
     self.last_act=act
     if np.isscalar(act):
         act = np.array([act])
     assert (act.size == self.action_dims()),'illegal action dimension'
     return_action=Action()
     if self.int_action_dims() > 0:
         return_action.intArray=[act[:self.int_action_dims()].astype(int)] 
     if self.double_action_dims() > 0:
         return_action.doubleArray=[
             act[self.double_action_dims():].astype(float)]
     return return_action
Example No. 55
 def create_action(self,act):
     self.last_act=act
     if np.isscalar(act):
         act = np.array([act])
     assert (act.size == self.action_dims()),'illegal action dimension'
     return_action=Action()
     if self.int_action_dims() > 0:
         return_action.intArray=[act[:self.int_action_dims()].astype(int)] 
     if self.double_action_dims() > 0:
         return_action.doubleArray=[
             act[self.double_action_dims():].astype(float)]
     return return_action
Example No. 56
    def agent_start(self, observation):
        state = observation.intArray[0]

        actionChoice = self.get_action(state)

        action = Action()
        action.intArray = [actionChoice]

        self.lastAction = copy.deepcopy(action)
        self.lastObservation = copy.deepcopy(observation)

        return action
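
All of the examples on this page implement the RL-Glue agent interface (agent_start / agent_step, plus the agent_end, agent_cleanup and agent_message callbacks not shown here). Based on the Python codec's sample code, an agent class is typically served to RL-Glue as sketched below; MyAgent is a placeholder for any of the classes excerpted above.

from rlglue.agent import AgentLoader as AgentLoader

if __name__ == "__main__":
    # MyAgent is a placeholder, not a real class from this page.
    AgentLoader.loadAgent(MyAgent())  # connects to rl_glue and dispatches the agent_* callbacks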