Example #1
    def _step(self, action):

        self._counter += 1

        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and start
            # a new episode.
            return self.reset()

        self._state[self._counter - 1, 0] = action[0]
        self._state[self._counter - 1, 1] = action[1]

        # Make sure episodes don't go on forever.
        if self._counter > self._END:
            self._episode_ended = True

        s1 = self._state[11, 0]
        a1 = self._state[self._counter - 1, 0] * s1
        s2 = self._state[11, 1]
        a2 = self._state[self._counter - 1, 1] * s2
        self._state[11, 0] = s1 - a1 + (a1 + a2) * self._MULT / 2
        self._state[11, 1] = s2 - a2 + (a1 + a2) * self._MULT / 2

        if self._episode_ended:
            reward_self = (self._state[11, 0] - s1) / s1
            reward_other = (self._state[11, 1] - s2) / s2
            my_reward = reward_fun(reward_self, reward_other)

            ret = ts.termination(np.array(self._state, dtype=np.float32),
                                 my_reward)
            self._counter += 1

            return ret
        else:
            reward_self = (self._state[11, 0] - s1) / s1
            reward_other = (self._state[11, 1] - s2) / s2
            my_reward = reward_fun(reward_self, reward_other)
            ret = ts.transition(np.array(self._state, dtype=np.float32),
                                reward=my_reward,
                                discount=1.0)

            for i in range(2):
                self._state[self._counter + 1, i] += self._state[self._counter,
                                                                 i]

            #print("transition:", ret)
            return ret
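Most of the snippets in this listing are _step methods of tf_agents py_environment.PyEnvironment subclasses and assume numpy as np plus "from tf_agents.trajectories import time_step as ts". For context, here is a minimal self-contained sketch of such a subclass; the class name, specs, and reward scheme are illustrative assumptions, not taken from any of the examples:

import numpy as np
from tf_agents.environments import py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts


class CountingSketchEnv(py_environment.PyEnvironment):
    """Illustrative environment: count up to a target, reward 1.0 at the end."""

    def __init__(self, final_state=10):
        super().__init__()
        self._final_state = final_state
        self._state = 0
        self._episode_ended = False
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(1,), dtype=np.int32, minimum=0, name='observation')

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        self._state = 0
        self._episode_ended = False
        return ts.restart(np.array([self._state], dtype=np.int32))

    def _step(self, action):
        if self._episode_ended:
            # The previous action ended the episode; start a new one.
            return self.reset()
        self._state += int(action)
        if self._state >= self._final_state:
            self._episode_ended = True
            return ts.termination(np.array([self._state], dtype=np.int32), reward=1.0)
        return ts.transition(
            np.array([self._state], dtype=np.int32), reward=0.0, discount=1.0)


# Quick structural check of the sketch against its own specs.
utils.validate_py_environment(CountingSketchEnv(), episodes=3)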
  def step_adversary(self, action):
    action = action.item() if self._action_is_discrete else action

    observation, reward, self._done, self._info = self._gym_env.step_adversary(
        action)

    if self._match_obs_space_dtype:
      observation = self._adversary_to_obs_space_dtype(observation)

    reward = np.asarray(reward, dtype=self.reward_spec().dtype)
    outer_dims = nest_utils.get_outer_array_shape(reward, self.reward_spec())

    if self._done:
      return ts_lib.termination(observation, reward, outer_dims=outer_dims)
    else:
      return ts_lib.transition(observation, reward, self._discount,
                               outer_dims=outer_dims)
    def _step(self, action):
        if self._game.is_episode_finished():
            # The last action ended the episode. Ignore the current action and start a new episode.
            return self.reset()

        for i in range(self._frame_skip):
            if i == 0:
                self.take_action(action)
            else:
                self.take_action(0)

            if self._game.is_episode_finished():
                return ts.termination(self.get_screen_buffer_preprocessed(),
                                      self.get_reward())

        return ts.transition(self.get_screen_buffer_preprocessed(),
                             self.get_reward())
Example #4
    def _step(self, action):
        """
        Performs the action as a move on the board.
        :param action: the move to perform, a color from 0-4 referring to the REGULARIZED board.
        :return: the next time_step.
        """
        if self._episode_ended:
            return self.reset()

        if self.state.move_count > 200:
            print("Canceling game due to stalling.")
            reward = -1
            self._episode_ended = True
        else:
            # We need to map to the normal colors using the reverse mapping.
            new_owned_count = self.state.move(self.mapping.index(action))
            if self.state.last_move_illegal:
                reward = -1
                print("INFO: Illegal move!")
            else:
                reward = new_owned_count
            self.total_score += new_owned_count
            if not self.state.is_final_state:
                self.state.move(random.randint(0, 5))
            if self.state.is_final_state:
                self._episode_ended = True
                if self.total_score > 28:
                    # print("Won!")
                    pass
                    # reward += 10
                elif self.total_score < 28:
                    pass
                    # print("Lost!")
                # elif self.total_score < 28:
                #     reward -= 10
                # print(str_board(self.state.board))
                # print(self.total_score)

        self.rewards.append(reward)
        self.__update_mapping()
        self.__regularize_board()
        if self._episode_ended:
            return ts.termination(self.regularized_board, reward)
        else:
            # TODO: what discount should be used here?
            return ts.transition(self.regularized_board, reward, 0.7)
Example #5
 def visualize(self, num_episodes=1):
     # Ticket (https://github.com/tensorflow/agents/issues/59) recommends
     # doing the rendering in the original environment
     if self._unwrapped_runtime is not None:
         for _ in range(0, num_episodes):
             state = self._unwrapped_runtime.reset()
             is_terminal = False
             while not is_terminal:
                 print(state)
                 action_step = self._agent._eval_policy.action(
                     ts.transition(state, reward=0.0, discount=1.0))
                 # print(action_step)
                 # TODO(@hart); make generic for multi agent planning
                 state, reward, is_terminal, _ = self._unwrapped_runtime.step(
                     action_step.action.numpy())
                 # print(reward)
                 self._unwrapped_runtime.render()
Example #6
    def _step(self, action):
        if action < self._action_spec.minimum or action > self._action_spec.maximum:
            raise ValueError(
                'Action should be in [{0}, {1}], but saw: {2}'.format(
                    self._action_spec.minimum, self._action_spec.maximum,
                    action))

        if self._state >= self._final_state:
            # Start a new episode. Ignore action
            self._state = 0
            return ts.restart(self._state)

        self._state += action
        if self._state < self._final_state:
            return ts.transition(self._state, 1.)
        else:
            return ts.termination(self._state, 1.)
Example #7
 def _step(self, action):
     """ Step, action is velocities of left/right wheel """
     # Reset if ended
     if self._episode_ended:
         return self.reset()
     self._num_steps += 1
     # Step the environment
     self._env.step(self._actions[action])
     done, reward = self._compute_reward()
     # Compute and save states
     self.set_state()
     self._episode_ended = done
     # Transition
     if self._episode_ended:
         return ts.termination(self._state, reward)
     else:
         return ts.transition(self._state, reward)
Example #8
    def _step(self, action):
        """
        Execute one time step in the environment.
        
        Input:
            action -- dictionary of batched actions
        
        Output:
            TimeStep object (see tf-agents docs)  
            
        """
        self._state, info, obs = self.control_circuit(self._state, action)
        self.info['psi_cached'] = info
        # Calculate rewards
        self._elapsed_steps += 1
        self._episode_ended = (self._elapsed_steps == self.episode_length)

        # Add dummy time dimension to tensors and append them to history
        for a in action.keys():
            self.history[a].append(action[a])
        self.history['msmt'].append(obs)

        # Make observations of 'msmt' of horizon H, shape=[batch_size,H]
        # measurements are selected with hard-coded attention step.
        # Also add clock of period 'T' to observations, shape=[batch_size,T]
        observation = {}
        H = [
            self.history['msmt'][-self.attn_step * i - 1]
            for i in range(self.H)
        ]
        H.reverse()  # to keep chronological order of measurements
        observation['msmt'] = tf.concat(H, axis=1)
        C = tf.one_hot([self._elapsed_steps % self.T] * self.batch_size,
                       self.T)
        observation['clock'] = C
        observation['const'] = tf.ones(shape=[self.batch_size, 1])

        reward = self.calculate_reward(action)
        self._episode_return += reward

        if self._episode_ended:
            self._epoch += 1
            self._current_time_step_ = ts.termination(observation, reward)
        else:
            self._current_time_step_ = ts.transition(observation, reward)
        return self.current_time_step()
Example #9
File: gym_wrapper.py  Project: yyht/agents
  def _step(self, action):
    # Automatically reset the environments on step if they need to be reset.
    if self._auto_reset and self._done:
      return self.reset()

    # TODO(oars): Figure out how tuple or dict actions will be generated by the
    # agents and if we can pass them through directly to gym.

    observation, reward, self._done, self._info = self._gym_env.step(action)

    if self._match_obs_space_dtype:
      observation = self._to_obs_space_dtype(observation)

    if self._done:
      return ts.termination(observation, reward)
    else:
      return ts.transition(observation, reward, self._discount)
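As a usage note (not part of the excerpt above), this wrapper's _step is normally reached through suite_gym; a minimal sketch, assuming gym is installed and the 'CartPole-v1' id is available:

import numpy as np
from tf_agents.environments import suite_gym

env = suite_gym.load('CartPole-v1')   # returns a GymWrapper-based PyEnvironment
time_step = env.reset()
while not time_step.is_last():
    # Always take the smallest legal action; env.step() ends up in _step above.
    action = np.array(env.action_spec().minimum, dtype=np.int32)
    time_step = env.step(action)
print('last reward:', time_step.reward)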
Example #10
  def _step(self, action):
    if action < self._action_spec.minimum or action > self._action_spec.maximum:
      raise ValueError('Action should be in [{0}, {1}], but saw: {2}'.format(
          self._action_spec.minimum, self._action_spec.maximum, action))
    if action.shape != ():  # pylint: disable=g-explicit-bool-comparison
      raise ValueError('Action should be a scalar.')

    if self._state >= self._final_state:
      # Start a new episode. Ignore action
      return self.reset()

    self._state += action
    self._state = np.int32(self._state)
    if self._state < self._final_state:
      return ts.transition(self._state, 1.)
    else:
      return ts.termination(self._state, 1.)
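A hypothetical driver for the counting environment above; CountingEnv and its constructor are assumed names, since the excerpt only shows _step:

import numpy as np

env = CountingEnv()                    # assumed constructor; specs and _reset() live elsewhere
time_step = env.reset()
episode_return = 0.0
while not time_step.is_last():
    time_step = env.step(np.int32(1))  # scalar action, as the shape check above requires
    episode_return += time_step.reward
print('episode return:', episode_return)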
Example #11
    def _step(self, action):

        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and
            # start a new episode.
            return self.reset()

        instructionint = action[0]
        self.game.instruction = gl.decode[instructionint]
        status = self.game.dqn_update()
        # Make sure episodes don't go on forever.
        if status == "end_episode" or self.game.stepcount == 1000:
            self._episode_ended = True
        # Score for clearing lines is weighted 10x compared to just keeping the
        # game going for one more move.
        reward = self.game.stepcount + 10 * self.game.score
        if self._episode_ended:
            return ts.termination(self.game.screenmat.astype("int32"), reward)
        else:
            return ts.transition(self.game.screenmat.astype("int32"),
                                 reward,
                                 discount=1.0)
Example #12
    def _step(self, action):
        if self._episode_ended:
            self._send(Action(value=action, phase=self._phase))
            return self.reset()

        self._send(Action(value=action, phase=self._phase))
        message = self._receive()
        finished = message.finished
        reward = message.reward

        if finished:
            self._episode_ended = True
            return ts.termination(np.array(self._state, dtype=float),
                                  reward)

        observation = np.array(message.observable, dtype=float)
        return ts.transition(observation, reward, discount=self.discount)
    def testCriticLossWithMaskedActions(self):
        # Observations are now a tuple of the usual observation and an action mask.
        observation_spec_with_mask = (self._obs_spec,
                                      tensor_spec.BoundedTensorSpec([2],
                                                                    tf.int32,
                                                                    0, 1))
        time_step_spec = ts.time_step_spec(observation_spec_with_mask)
        dummy_categorical_net = DummyCategoricalNet(self._obs_spec)
        agent = categorical_dqn_agent.CategoricalDqnAgent(
            time_step_spec,
            self._action_spec,
            dummy_categorical_net,
            self._optimizer,
            observation_and_action_constraint_splitter=lambda x: (x[0], x[1]))

        # For `observations`, the masks are set up so that only one action is valid
        # for each element in the batch.
        observations = (tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
                        tf.constant([[1, 0], [0, 1]], dtype=tf.int32))
        time_steps = ts.restart(observations, batch_size=2)

        actions = tf.constant([0, 1], dtype=tf.int32)
        action_steps = policy_step.PolicyStep(actions)

        rewards = tf.constant([10, 20], dtype=tf.float32)
        discounts = tf.constant([0.9, 0.9], dtype=tf.float32)

        # For `next_observations`, the masks are set up so the opposite actions as
        # before are valid.
        next_observations = (tf.constant([[5, 6], [7, 8]], dtype=tf.float32),
                             tf.constant([[0, 1], [1, 0]], dtype=tf.int32))
        next_time_steps = ts.transition(next_observations, rewards, discounts)

        experience = test_utils.stacked_trajectory_from_transition(
            time_steps, action_steps, next_time_steps)

        # Due to the constant initialization of the DummyCategoricalNet, we can
        # expect the same loss every time. Note this is different from the loss in
        # testCriticLoss above due to previously optimal actions being masked out.
        expected_loss = 5.062895
        loss_info = agent._loss(experience)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        evaluated_loss = self.evaluate(loss_info).loss
        self.assertAllClose(evaluated_loss, expected_loss, atol=1e-4)
 def step(self, action):
     """Apply action and return new time_step."""
     data = []
     for df in self.dfs:
         data.append(np.array([df['volume'].values[range(self.current_step-self.look_back_window, self.current_step)],
                               df['open'].values[range(self.current_step-self.look_back_window, self.current_step)],
                               df['high'].values[range(self.current_step-self.look_back_window, self.current_step)],
                               df['low'].values[range(self.current_step-self.look_back_window, self.current_step)],
                               df['close'].values[range(self.current_step-self.look_back_window, self.current_step)]]))
     
     self._state = np.array(data)
     
     # separate the action list into its component variables
     coin = action[0]
     action_type = action[1]
     amount = action[2]/10.0
         
     # initialize the reward to 0 for each timestep
     reward = 0
     
     if self.wallet[0]<0.01*self.initial_balance:
         self._episode_ended = True
         return ts.termination(self._state, reward)
     
     if action_type==0:
         #Buy coin
         current_price = data[coin][1, self.look_back_window-1]
         usd_val = amount*self.wallet[0]
         self.wallet[0] -= usd_val
         self.wallet[coin+1] += usd_val/current_price
         self.moves[coin].append(['buy', self.current_step, current_price])
         
     
     if action_type==1:
         #Sell coin
         current_price = data[coin][1, self.look_back_window-1]
         coin_val = amount*self.wallet[coin+1]
         self.wallet[coin+1] -= coin_val
         self.wallet[0] += coin_val*current_price
         self.moves[coin].append(['sell', self.current_step, current_price])
     
     #increment the step in our environment
     self.current_step+=1
     #return the timestep as a transition containing the state, reward, and discount value
     return ts.transition(self._state, reward, 1.0)
Example #15
  def testLossWithChangedOptimalActions(self, agent_class):
    q_net = DummyNet(self._observation_spec, self._action_spec)
    agent = agent_class(
        self._time_step_spec,
        self._action_spec,
        q_network=q_net,
        optimizer=None)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)

    actions = tf.constant([0, 1], dtype=tf.int32)
    action_steps = policy_step.PolicyStep(actions)

    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)

    # Note that instead of [[5, 6], [7, 8]] as before, we now have -5 and -7.
    next_observations = tf.constant([[-5, 6], [-7, 8]], dtype=tf.float32)
    next_time_steps = ts.transition(next_observations, rewards, discounts)

    experience = trajectories_test_utils.stacked_trajectory_from_transition(
        time_steps, action_steps, next_time_steps)

    # Using the kernel initializer [[2, 1], [1, 1]] and bias initializer
    # [[1], [1]] from DummyNet above, we can calculate the following values:
    # Q-value for first observation/action pair: 2 * 1 + 1 * 2 + 1 = 5
    # Q-value for second observation/action pair: 1 * 3 + 1 * 4 + 1 = 8
    # (Here we use the second row of the kernel initializer above, since the
    # chosen action is now 1 instead of 0.)
    #
    # For the target Q-values here, note that since we've replaced 5 and 7 with
    # -5 and -7, it is better to use action 1 with a kernel of [1, 1] instead of
    # action 0 with a kernel of [2, 1].
    # Target Q-value for first next_observation: 1 * -5 + 1 * 6 + 1 = 2
    # Target Q-value for second next_observation: 1 * -7 + 1 * 8 + 1 = 2
    # TD targets: 10 + 0.9 * 2 = 11.8 and 20 + 0.9 * 2 = 21.8
    # TD errors: 11.8 - 5 = 6.8 and 21.8 - 8 = 13.8
    # TD loss: 6.3 and 13.3 (Huber loss subtracts 0.5)
    # Overall loss: (6.3 + 13.3) / 2 = 9.8
    expected_loss = 9.8
    loss, _ = agent._loss(experience)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAllClose(self.evaluate(loss), expected_loss)
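The arithmetic in the comment block can be sanity-checked without TensorFlow; a small sketch that just replays the numbers from the comment:

# Plain-Python replay of the expected_loss arithmetic above.
q_chosen = [5.0, 8.0]                              # Q-values of the chosen actions
td_targets = [10 + 0.9 * 2, 20 + 0.9 * 2]          # 11.8 and 21.8
td_errors = [t - q for t, q in zip(td_targets, q_chosen)]   # 6.8 and 13.8
huber = [abs(e) - 0.5 for e in td_errors]          # Huber loss for |error| > 1
print(sum(huber) / len(huber))                     # 9.8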
Example #16
    def testCriticLoss(self, include_critic_entropy_term,
                       reward_noise_variance, use_tf_variable, td_targets):
        if use_tf_variable:
            reward_noise_variance = tf.Variable(reward_noise_variance)
        agent = cql_sac_agent.CqlSacAgent(
            self._time_step_spec,
            self._action_spec,
            critic_network=DummyCriticNet(),
            actor_network=None,
            actor_optimizer=None,
            critic_optimizer=None,
            alpha_optimizer=None,
            cql_alpha=1.0,
            num_cql_samples=1,
            include_critic_entropy_term=include_critic_entropy_term,
            use_lagrange_cql_alpha=False,
            reward_noise_variance=reward_noise_variance,
            actor_policy_ctor=DummyActorPolicy)

        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_steps = ts.restart(observations, batch_size=2)
        actions = tf.constant([[5], [6]], dtype=tf.float32)

        rewards = tf.constant([10, 20], dtype=tf.float32)
        discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
        next_observations = tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
        next_time_steps = ts.transition(next_observations, rewards, discounts)

        pred_td_targets = [7., 10.]
        self.evaluate(tf.compat.v1.global_variables_initializer())

        # Expected critic loss has factor of 2, for the two TD3 critics.
        expected_loss = self.evaluate(
            2 * tf.compat.v1.losses.mean_squared_error(
                tf.constant(td_targets), tf.constant(pred_td_targets)))

        loss = agent._critic_loss_with_optional_entropy_term(
            time_steps,
            actions,
            next_time_steps,
            td_errors_loss_fn=tf.math.squared_difference)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        loss_ = self.evaluate(loss)
        self.assertAllClose(loss_, expected_loss)
Example #17
    def _step(self, actions):
        next_actions = transform_actions(actions, self.obs, self.config)

        self.last_obs = self.obs
        self.obs, reward, done, info = self.env.step(next_actions)

        x_obs = transform_observation(self.obs, self.config)
        x_reward = transform_reward(done, self.last_obs, self.obs, self.config)


        # final
        if x_reward <= REWARD_LOST:
            done, info = True, {}
            return_object = ts.termination(np.array(x_obs, dtype=np.int32), x_reward)
            return return_object
        else:
            return_object = ts.transition(np.array(x_obs, dtype=np.int32), reward=x_reward, discount=1.0)
            return return_object
Example #18
File: podagent.py  Project: Kricket/tf-pods
    def _step(self, action):
        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and start
            # a new episode.
            return self.reset()

        if self._player.pod.nextCheckId > 1 or self._player.pod.turns > 100:
            # That's enough for training...
            self._episode_ended = True
        else:
            # Play the given action
            self._player.controller.set_play(action, self._player.pod)
            self._player.step(self._board)

        if self._episode_ended:
            return ts.termination(self._to_observation(), self._get_reward())
        else:
            return ts.transition(self._to_observation(), reward = self._get_reward(), discount = np.asarray(100, dtype=np.float32))
Example #19
    def _step(self, action):
        prices = self.productsCosts * action
        observation = np.round(
            (self.placeSize * self.productsUsualBuyingRates) *
            (self.productsPriceFlexibility**
             ((self.productsUsualPrices - prices) / self.productsUsualPrices)))

        marginPerProduct = (prices - self.productsCosts) * observation

        reward = marginPerProduct.sum()
        # convert to numpy array of float32, otherwise not accepted by specs
        observation = np.array(observation, dtype=np.float32)

        if self._state < self.duration:
            self._state += 1
            return ts.transition(observation, reward)
        else:
            return ts.termination(observation, reward)
Example #20
    def _step(self, action):

        if self._episode_ended:
            return self.reset()

        cumulative_reward = 0
        cumulative_done = False
        obs, reward, done, info = self._env.step(action)
        cumulative_reward += reward
        self._state = obs
        if (done):
            cumulative_done = True

        if cumulative_done:
            self._episode_ended = True
            return ts.termination(self._state, reward)
        else:
            return ts.transition(self._state, reward, discount=0.98)
Example #21
    def _step(self, action):
        self._state = self.states[self.time_flow][:-1]
        raw_reward = self.states[self.time_flow][-1]
        reward = 1. + raw_reward * tf.sign(action)

        if self.delay:
            if self.delay_counter < self.delay_threshold:
                self.delay_counter += 1
                self.total_reward *= (1. + (raw_reward * self.curr_position))
            else:
                if self.curr_position == tf.sign(action):  # action not changed
                    self.total_reward *= (1. +
                                          (raw_reward * self.curr_position))
                else:  # action changed; so delay counter reset
                    self.curr_position = tf.sign(action)
                    self.actions.append(
                        [self.curr_position.numpy(), self.time_flow])
                    self.delay_counter = 0
                    self.total_reward *= reward
                    self.action_counter += 1

        else:
            self.total_reward *= reward
            if self.curr_position * tf.sign(action) < 0:  # position changed
                self.action_counter += 1
                self.actions.append([tf.sign(action).numpy(), self.time_flow])
            self.curr_position = tf.sign(action)  # current position changed

        self.time_flow += 1

        if self.time_flow == self.states.shape[0]:
            termination = ts.termination(
                np.array(self._state, dtype=np.float32), reward)
            print('action counter is ', self.action_counter)
            print('total reward during training', self.total_reward)
            # if self.eval==True:
            #     print('action counter is ', self.action_counter)
            print('steps at which actions were taken:', self.actions)
            self.reset()
            return termination
        else:
            return ts.transition(np.array(self._state, dtype=np.float32),
                                 reward,
                                 discount=1.0)
Example #22
    def testLoss(self, agent_class, run_mode):
        if tf.executing_eagerly() and run_mode == context.graph_mode:
            self.skipTest('b/123778560')
        with run_mode(), tf.compat.v2.summary.record_if(False):
            q_net = DummyNet(self._observation_spec, self._action_spec)
            agent = agent_class(self._time_step_spec,
                                self._action_spec,
                                q_network=q_net,
                                optimizer=None)

            observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
            time_steps = ts.restart(observations, batch_size=2)

            actions = [tf.constant([[0], [1]], dtype=tf.int32)]
            action_steps = policy_step.PolicyStep(actions)

            rewards = tf.constant([10, 20], dtype=tf.float32)
            discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
            next_observations = [
                tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
            ]
            next_time_steps = ts.transition(next_observations, rewards,
                                            discounts)

            experience = test_utils.stacked_trajectory_from_transition(
                time_steps, action_steps, next_time_steps)

            # Using the kernel initializer [[2, 1], [1, 1]] and bias initializer
            # [[1], [1]] from DummyNet above, we can calculate the following values:
            # Q-value for first observation/action pair: 2 * 1 + 1 * 2 + 1 = 5
            # Q-value for second observation/action pair: 1 * 3 + 1 * 4 + 1 = 8
            # (Here we use the second row of the kernel initializer above, since the
            # chosen action is now 1 instead of 0.)
            # Q-value for first next_observation: 2 * 5 + 1 * 6 + 1 = 17
            # Q-value for second next_observation: 2 * 7 + 1 * 8 + 1 = 23
            # TD targets: 10 + 0.9 * 17 = 25.3 and 20 + 0.9 * 23 = 40.7
            # TD errors: 25.3 - 5 = 20.3 and 40.7 - 8 = 32.7
            # TD loss: 19.8 and 32.2 (Huber loss subtracts 0.5)
            # Overall loss: (19.8 + 32.2) / 2 = 26
            expected_loss = 26.0
            loss, _ = agent._loss(experience)

            self.evaluate(tf.compat.v1.initialize_all_variables())
            self.assertAllClose(self.evaluate(loss), expected_loss)
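The same kind of plain-Python replay works for the comment block above:

td_errors = [25.3 - 5, 40.7 - 8]                   # 20.3 and 32.7
print(sum(abs(e) - 0.5 for e in td_errors) / 2)    # Huber with |error| > 1 -> 26.0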
    def _step(self, action):
        #if done, reset
        if (self._episode_ended):
            return self.reset()

        #behavior for valid input
        if (action == 0 or self._state[8] != 0):
            self._episode_ended = True #stand or max of 4 hits
        elif (action == 1):
            newCard = self.drawNewCard()
            self._state[0] += newCard

            #start looking at 4th spot since first 3 are set by default
            for i in range(3,9):
                #once you find the first empty card slot, add new card then break
                if(self._state[i] == 0):
                    self._state[i] = newCard
                    break

            #when over 21, change any 11s (aces) to 1s (skip state[0] because it's a sum)
            if(self._state[0] > 21):
                for i in range(1,9):
                    if(self._state[i] == 11):
                        self._state[i] = 1 #fix card
                        self._state[0] -= 10 #fix sum

                    #break if successfully reduced
                    if(self._state[0] <= 21):
                        break
        else:
            raise ValueError('Invalid action, non-binary action detected')


        #if game is over, grant rewards, otherwise just transition
        if (self._episode_ended or self._state[0] >= 21):
            #score - 21 is the reward value; a bust counts as -21
            resultReward = self._state[0] - 21 if self._state[0] <= 21 else -21

            if(self._state[0] <= 11): #less than 12 (could have safely hit, so this is bad) 
                resultReward = -100

            return ts.termination(np.array([self._state], dtype=np.int32), resultReward)
        else:
            return ts.transition(np.array([self._state], dtype=np.int32), reward = 0.0, discount = 1.0)
Example #24
    def _step(self, action):
        if self._game.is_episode_finished():
            # The last action ended the episode. Ignore the current action and start a new episode.
            return self.reset()

        # construct one hot encoded action as required by ViZDoom
        one_hot = [0] * self._num_actions
        one_hot[action] = 1

        # execute action and receive reward
        reward = self._game.make_action(one_hot)

        # return transition depending on game state
        if self._game.is_episode_finished():
            return time_step.termination(self.get_screen_buffer_preprocessed(),
                                         reward)
        else:
            return time_step.transition(self.get_screen_buffer_preprocessed(),
                                        reward)
Example #25
    def _step(self, action: types.NestedArray) -> ts.TimeStep:
        if self._current_time_step.is_last():
            return self.reset()

        obs, legal_moves, rewards, done = self._env.step(action)
        self._cumulative_rewards += rewards

        observations_and_legal_moves = {
            'observations': obs,
            'cumulative_rewards': self._cumulative_rewards,
            'legal_moves': legal_moves
        }

        reward = self._utility_function(self._cumulative_rewards)
        if done:
            return ts.termination(observations_and_legal_moves, reward)
        else:
            return ts.transition(observations_and_legal_moves, reward,
                                 self.gamma)
Example #26
    def _step(self, player_actions):
        if self._episode_ended:
            # print("game already ended resetting")
            return self.reset()

        player_index = self.current_player_index

        player_action = Action(int(player_actions[0]))
        structure_index = int(player_actions[1])

        player = self.players[player_index]
        player_deck = self.player_deck(player_index)

        success_reward_modifier = 0
        if structure_index >= len(player_deck):
            structure_index = len(player_deck) - 1

        structure = player_deck.pop(structure_index)
        try:
            if player_action == Action.BUILD_STRUCTURE:
                player.build_structure(structure)
            elif player_action == Action.BUILD_WONDER_STAGE:
                player.build_wonder_stage()
            elif player_action == Action.DISCARD:
                player.discard_structure()
            # print("Player " + str(player_index) + " choose to " + player_action.name + " " + structure['name'])
        except ImpossibleBuildException:
            player.discard_structure()
            success_reward_modifier = -3

        self.finish_player_turn()

        observation = self.to_observation()
        reward = self.calculate_score_difference(
            player_index) + success_reward_modifier

        if self._episode_ended:
            # print("game terminated at age " + str(self.age) + " reward " + str(reward))
            return time_step.termination(observation, reward)
        else:
            # print("transition player " + str(self.current_player_index) + " action " + str(player_action.name)
            #      + " turn " + str(self.turn) + " age " + str(self.age) + " reward " + str(reward))
            return time_step.transition(observation, reward)
Example #27
 def Visualize(self, num_episodes=1):
     self._agent._training = False
     for _ in range(0, num_episodes):
         state = self._environment.reset()
         is_terminal = False
         while not is_terminal:
             action_step = self._agent._eval_policy.action(
                 ts.transition(state, reward=0.0, discount=1.0))
             action_shape = action_step.action.shape
             expected_shape = self._agent._eval_policy.action_spec.shape
             action = action_step.action.numpy()
             if action_shape != expected_shape:
                 logging.warning("Action shape" + str(action_shape) + \
                   " does not match with expected shape " + str(expected_shape) +\
                   " -> reshaping is tried")
                 action = np.reshape(action, expected_shape)
                 logging.info(action)
             state, reward, is_terminal, _ = self._environment.step(action)
             self._environment.render()
  def _step(self, action):

    trade_seq = np.argsort(np.floor(100*action[N:,:])).reshape(N, N)
    trade_ratio = action[:N, :]

    # iterate over all possible buying/selling pairs
    for i in np.arange(0,N*N):

      # determine buyer, seller and the amount to trade for this pair
      buyer = int(trade_seq.flat[i] // N)
      seller = int(trade_seq.flat[i] % N)
      trade_ratio_current = trade_ratio[buyer, seller]

      # stock to be traded
      s_tbt = min(np.floor(self._state[self._step_counter, buyer]/D),
                  self._state[N+self._step_counter, seller])
      w_tbt = s_tbt*D
      self._state[self._step_counter, buyer] -= w_tbt
      self._state[N+self._step_counter, buyer] += s_tbt
      self._state[self._step_counter, seller] += w_tbt
      self._state[N+self._step_counter, seller] -= s_tbt

    if self._step_counter == 10:
      # The last action ended the episode. Ignore the current action and start
      # a new episode.
      self._episode_ended = True
      return self.reset()

    # Make sure episodes don't go on forever.
    if action == 1:
      self._episode_ended = True
    elif action == 0:
      new_card = np.random.randint(1, 11)
      self._state += new_card
    else:
      raise ValueError('`action` should be 0 or 1.')

    if self._episode_ended or self._state >= 21:
      reward =  np.sum(self._state[N-1,:]) + d*(1+r)/r*np.sum(self._state[2*N,:])
      return ts.termination(np.array([self._state], dtype=np.int32), reward)
    else:
      return ts.transition(
        np.array([self._state], dtype=np.int32), reward=0.0, discount=1.0)
Example #29
    def test_resets_after_limit(self):
        max_steps = 5
        base_env = mock.MagicMock()
        wrapped_env = atari_wrappers.AtariTimeLimit(base_env, max_steps)

        base_env.gym.game_over = False
        base_env.reset.return_value = ts.restart(1)
        base_env.step.return_value = ts.transition(2, 0)
        action = 1

        for _ in range(max_steps + 1):
            wrapped_env.step(action)

        self.assertTrue(wrapped_env.game_over)
        self.assertEqual(1, base_env.reset.call_count)

        wrapped_env.step(action)
        self.assertFalse(wrapped_env.game_over)
        self.assertEqual(2, base_env.reset.call_count)
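The mocked environment above can return ts.restart / ts.transition values directly because TimeStep is a plain namedtuple-like container; a small sketch:

from tf_agents.trajectories import time_step as ts

step = ts.transition(observation=2, reward=0)   # mirrors base_env.step.return_value above
print(step.step_type, step.observation, step.reward, step.discount)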
Example #30
 def _success(self, action, action_letter):
     for index, l in enumerate(self.word_to_guess):
         if action == (ord(l) - 97):
             self._state[index] = action
     # 26 means the letter has not been found yet
     if 26 not in self._state:
         self._episode_ended = True
         logging.debug(f"You Found {self.word_to_guess}")
         return ts.termination(
             np.array([self._state], dtype=np.int32),
             self.number_of_life * self.reward_map["game_success_reward"],
         )
     else:
         self.render()
         return ts.transition(
             np.array([self._state], dtype=np.int32),
             reward=self.reward_map["guess_success_reward"],
             discount=1.0,
         )