Example #1
def env_load_fn(env_name):
  del env_name
  obs_spec = array_spec.BoundedArraySpec((2,), np.int32, -10, 10)
  action_spec = array_spec.BoundedArraySpec((2, 3), np.int32, -10, 10)
  return random_py_environment.RandomPyEnvironment(
      obs_spec, action_spec=action_spec, min_duration=2, max_duration=4)
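
A minimal usage sketch (added for illustration, not part of the original snippet): the factory above can be validated against its own specs and wrapped for TensorFlow training.
from tf_agents.environments import tf_py_environment, utils

py_env = env_load_fn('any-name')                    # the name argument is ignored by the factory above
utils.validate_py_environment(py_env, episodes=3)   # checks emitted time steps against the declared specs
tf_env = tf_py_environment.TFPyEnvironment(py_env)  # batched, TF-compatible wrapper
print(tf_env.action_spec())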
Example #2
    def __init__(
        self,
        start_state,
        target_state,
        min_observation=MIN_STATE,
        max_observation=MAX_STATE,
        min_action=MIN_VELOCITY,
        max_action=MAX_VELOCITY,
        low_process_noise_var=LOW_PROCESS_NOISE_VAR,
        high_process_noise_var=HIGH_PROCESS_NOISE_VAR,
        gating_bitmap=None,
        velocity_init=0.0,
        delta_time=DELTA_TIME,
        min_acceleration=MIN_ACCELERATION,
        max_acceleration=MAX_ACCELERATION,
    ):
        # self.gravitational_acceleration = tf.constant(9.81, dtype=float_type)
        self.e3 = tf.constant([[0.0], [0.0], [1.0]], dtype=float_type)
        self.mass = tf.constant(1.0, dtype=float_type)
        self.inertia_matrix = tf.constant([[1.0]], dtype=float_type)
        self.inv_inertia_matrix = 1.0 / self.inertia_matrix

        # simulation parameters
        self.start_state = start_state
        self.target_state = target_state
        self.state_dim = 6
        self.control_dim = 2

        self.state_init = start_state
        print("self.state_init")
        print(self.state_init)
        self._state = self.state_init
        self.delta_time = delta_time

        # environment parameters
        if isinstance(low_process_noise_var, np.ndarray):
            self.low_process_noise_var = low_process_noise_var
        else:
            print("low_process_noise_var isn't array so broadcasting")
            self.low_process_noise_var = low_process_noise_var * np.ones(
                self.state_dim)
        if isinstance(high_process_noise_var, np.ndarray):
            self.high_process_noise_var = high_process_noise_var
        else:
            print("high_process_noise_var isn't array so broadcasting")
            self.high_process_noise_var = high_process_noise_var * np.ones(
                self.state_dim)

        # # configure action spec
        min_action = np.array([-10, -10])
        max_action = np.array([10, 10])
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(self.control_dim, ),
            dtype=float_type,
            minimum=min_action,
            maximum=max_action,
            name="action",
        )
        # configure observation spec
        # if not isinstance(min_observation, np.ndarray):
        #     min_observation = min_observation * np.ones(state_dim)
        #     print("min_observation isn't array so broadcasting")
        # if not isinstance(max_observation, np.ndarray):
        #     max_observation = max_observation * np.ones(state_dim)
        #     print("max_observation isn't array so broadcasting")
        min_observation = np.array([-3.0, -3.0, -0.5, -0.5, -360, -5])
        max_observation = np.array([3.0, 3.0, 0.5, 0.5, 360, 5])
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(self.state_dim, ),
            dtype=float_type,
            minimum=min_observation,
            maximum=max_observation,
            name="observation",
        )
        self.episode_ended = False

        if gating_bitmap is None:
            resolution = BITMAP_RESOLUTION
            self.gating_bitmap = np.ones([resolution, resolution])
        elif isinstance(gating_bitmap, str):
            self.gating_bitmap = cv2.imread(gating_bitmap,
                                            cv2.IMREAD_GRAYSCALE)
            self.gating_bitmap = self.gating_bitmap / 255
        elif isinstance(gating_bitmap, np.ndarray):
            self.gating_bitmap = gating_bitmap
        else:
            raise TypeError(
                "gating_bitmap must be np.ndarray or filepath string for bitmap"
            )
        # TODO check x and y are the right way around
        self.num_pixels = np.array(
            # [self.gating_bitmap.shape[0] - 1, self.gating_bitmap.shape[1] - 1]
            [self.gating_bitmap.shape[1] - 1, self.gating_bitmap.shape[0] - 1])

        self.viewer = EnvRenderer(self)
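
A hedged sketch (an addition, not taken from the source above; np.float64 stands in for the snippet's float_type): drawing an in-bounds action from a spec shaped like the one configured in this constructor and verifying it.
import numpy as np
from tf_agents.specs import array_spec

action_spec = array_spec.BoundedArraySpec(
    shape=(2,), dtype=np.float64, minimum=[-10, -10], maximum=[10, 10], name="action")
rng = np.random.RandomState(0)
action = array_spec.sample_bounded_spec(action_spec, rng)  # uniform draw within the bounds
assert action_spec.check_array(action)                     # sampled action satisfies the spec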
Example #3
 def test_unbounded(self):
   obs_spec = array_spec.BoundedArraySpec((2, 3), np.int32, -10, 10)
   action_spec = array_spec.ArraySpec((2,), np.int32)
   with self.assertRaisesRegexp(ValueError, 'bounded action specs'):
     env = random_py_environment.RandomPyEnvironment(obs_spec, action_spec)
     env = wrappers.ActionOffsetWrapper(env)
Example #4
    def __init__(  # pylint: disable=W0231
        self,
        alphabet: str,
        starting_seq: str,
        model: flexs.Model,
        landscape: flexs.Landscape,
        max_num_steps: int,
    ):
        """
        Initialize DyNA-PPO agent environment.

        Based on this tutorial:
        https://www.mikulskibartosz.name/how-to-create-an-environment-for-a-tensorflow-agent

        Args:
            alphabet: Usually UCGA.
            starting_seq: When initializing the environment,
                the sequence which is initially mutated.
            model: Landscape or model which evaluates
                each sequence.
            max_num_steps: Maximum number of steps before
                episode is forced to terminate. Usually the
                `model_queries_per_batch`.

        """
        self.alphabet = alphabet

        # model/model/measurements
        self.model = model
        self.landscape = landscape
        self.fitness_model_is_gt = False
        self.previous_fitness = -float("inf")

        self.seq = starting_seq
        self._state = {
            "sequence":
            s_utils.string_to_one_hot(self.seq,
                                      self.alphabet).astype(np.float32),
            "fitness":
            self.model.get_fitness([starting_seq]).astype(np.float32),
        }
        self.episode_seqs = set()  # the sequences seen in the current episode
        self.all_seqs = {}
        self.measured_sequences = {}

        self.lam = 0.1

        # tf_agents environment
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(1, ),
            dtype=np.integer,
            minimum=0,
            maximum=len(self.seq) * len(self.alphabet) - 1,
            name="action",
        )
        self._observation_spec = {
            "sequence":
            array_spec.BoundedArraySpec(
                shape=(len(self.seq), len(self.alphabet)),
                dtype=np.float32,
                minimum=0,
                maximum=1,
            ),
            "fitness":
            array_spec.ArraySpec(shape=(1, ), dtype=np.float32),
        }

        self.num_steps = 0
        self.max_num_steps = max_num_steps
Example #5
    def __init__(self,
                 emulator,
                 balance,
                 logger=None,
                 start_time=1581434096,
                 test_time=12 * 3600,
                 indent=3600,
                 period=1.,
                 reset=True,
                 string_start='',
                 orderbook_depth=5,
                 action_ratio=0.25,
                 return_type='delta',
                 pair_list=None,
                 asset_list=None):
        super().__init__()
        self.action_ratio = action_ratio
        self.db = DB()
        self.emulator = emulator
        self.logger = logger
        self.indent = indent
        self.return_type = return_type
        self.period = period
        self.start_time = start_time
        self.test_time = test_time
        self.current_data = {}
        self.memory = {}
        self.data = {}
        self.somes = {}
        self.times = {}
        self.current_time = start_time
        self.agent_balance = balance.copy()
        self.start_balance = balance.copy()
        self.max_balance = balance.copy()
        self.currency_number = len(balance)
        self.orderbook_depth = orderbook_depth

        if pair_list is None:
            with open(string_start + 'settings/pairs.txt') as file:
                self.pairs = [a[:-1] for a in file.readlines()]
            self.pair_number = 11
        else:
            self.pairs = pair_list.copy()
            self.pair_number = len(self.pairs)

        if asset_list is None:
            with open(string_start + 'settings/cryptos.txt') as file:
                self.assets = [a[:-1] for a in file.readlines()]
        else:
            self.assets = asset_list.copy()

        assert len(self.agent_balance) == len(
            self.assets), 'Hey friend, what kind of machinations are you pulling?'

        n = self.orderbook_depth

        memory_columns = [f'depth_ask_price_{i + 1}' for i in range(n)] + \
                         [f'depth_bid_price_{i + 1}' for i in range(n)] + \
                         [f'depth_ask_quantity_{i + 1}' for i in range(n)] + \
                         [f'depth_bid_quantity_{i + 1}' for i in range(n)]
        self.memory_columns = {
            val: idx
            for idx, val in enumerate(memory_columns)
        }

        for pair in self.pairs:
            self.memory[pair] = self.db.fetch_pandas(start=start_time - indent,
                                                     end=start_time +
                                                     test_time,
                                                     pair_names={pair})
            self.times[pair] = self.memory[pair]['time'].copy()
            self.times[pair].index = self.times[pair].apply(
                datetime.datetime.fromtimestamp)
            self.memory[pair].index = self.memory[pair]['time'].apply(
                datetime.datetime.fromtimestamp)
            data = dp.basic_clean(self.memory[pair].copy())
            copy = data.copy()
            some = dp.make_x(copy)
            self.times[pair] = self.times[pair][some.index]
            self.somes[pair] = some

        common_index = self.times[self.pairs[0]].index
        for pair in self.pairs:
            common_index = common_index.intersection(self.times[pair].index)
        self.time = self.times[self.pairs[0]][common_index]

        for pair in self.pairs:
            some = self.somes[pair]
            if reset:
                scaler = StandardScaler()
                ok_cols = list(some.columns)
                scaler.fit(some)
                joblib.dump(
                    ok_cols, string_start + 'settings/Env_settings/' + pair +
                    '_columns.joblib')
                joblib.dump(
                    scaler, string_start + 'settings/Env_settings/' + pair +
                    '_scaler.joblib')
            else:
                scaler = joblib.load(string_start + 'settings/Env_settings/' +
                                     pair + '_scaler.joblib')
                ok_cols = joblib.load(string_start + 'settings/Env_settings/' +
                                      pair + '_columns.joblib')
                some = some[ok_cols]

            some = some.loc[common_index]
            self.memory[pair] = self.memory[pair].loc[common_index][
                memory_columns].values
            self.data[pair] = scaler.transform(some)

        time = self.time.reset_index(drop=True)
        current = pd.Series(range(start_time, start_time + test_time))
        timeta = pd.DataFrame(time, columns=['time'])
        timeta['index'] = timeta.index
        curta = pd.DataFrame(current, columns=['time'])
        merged = curta.merge(timeta, how='outer', sort=True)
        merged.ffill(inplace=True)
        final = curta.join(merged.set_index('time'), on='time')
        final = final.set_index('time')
        final['index'] = final['index'].apply(int)
        self.time_to_id = final

        del self.times, self.somes

        self._action_spec = array_spec.BoundedArraySpec(
            shape=(),
            dtype=np.int32,
            minimum=0,
            maximum=self.pair_number * 2,
            name='action')
        obs_shape = self.pair_number * 109 + self.currency_number
        self._observation_spec = array_spec.ArraySpec(shape=(obs_shape, ),
                                                      dtype=np.float64,
                                                      name='observation')
        self._episode_ended = False

        init_handle = self.emulator.handle([], self.agent_balance,
                                           self.form_orderbook())
        self.history = [(self.current_time, init_handle['new_usdt'])]
Example #6
 def testCheckArrayMatch(self, dtype):
   spec = array_spec.BoundedArraySpec((2,), dtype, minimum=5, maximum=15)
   self.assertTrue(spec.check_array(np.array([6, 7], dtype)))
   # Bounds should be inclusive.
   self.assertTrue(spec.check_array(np.array([5, 15], dtype)))
Example #7
 def testBoundedArraySpecSample(self, dtype):
   spec = array_spec.BoundedArraySpec((2, 3), dtype, -10, 10)
   sample = array_spec.sample_spec_nest(spec, self.rng)
   self.assertTrue(np.all(sample >= -10))
   self.assertTrue(np.all(sample <= 10))
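
A small extension sketch (an assumption mirroring the test above): sample_spec_nest also accepts nested structures of specs and returns a matching nest of arrays.
import numpy as np
from tf_agents.specs import array_spec

specs = {
    'obs': array_spec.BoundedArraySpec((2, 3), np.int32, -10, 10),
    'mask': array_spec.BoundedArraySpec((5,), np.int32, 0, 1),
}
rng = np.random.RandomState(42)
sample = array_spec.sample_spec_nest(specs, rng)  # dict of arrays, one per spec
assert specs['obs'].check_array(sample['obs'])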
Example #8
    def __init__(self, fake=False, metrics_key='001'):
        with open('running', 'w') as f:
            f.write(str(os.getpid()))
        
        self._episode_ended = False

        self.game = serpent.initialize_game('T4TF1')

        game_frame = self.game.screen_regions['GAME_REGION']
        self.width = 10
        self.height = 10

        self.state_shape = (int(self.height / 2), int(self.width / 2), 1)
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=self.state_shape, dtype=np.float32, minimum=0.0, name='observation')


        self._state = np.zeros(self.state_shape).astype(np.float32)

        if fake:
            return
        self.interrupted = False

        self.game.launch()
        self.game.start_frame_grabber()
        self.input_controller = InputController(game=self.game)
        # self.input_proc = 

        self.frame_buffer = FrameGrabber.get_frames([0])
        self.frame_buffer = self.extract_game_area(self.frame_buffer)

        self.width = self.frame_buffer[0].shape[1]
        self.height = self.frame_buffer[0].shape[0]
        print('width: %d' % self.width)
        print('height: %d' % self.height)
        self.state_shape = (self.height, self.width, 3)
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=self.state_shape, dtype=np.float32, minimum=0.0, name='observation')

        self._state = np.zeros(self.state_shape).astype(np.float32)

        # print('created input with pid: %s' % self.input_proc.pid)
        self.sell_keys = [KeyboardKey.KEY_LEFT_SHIFT, KeyboardKey.KEY_LEFT_CTRL, KeyboardKey.KEY_S]
        self.buy_keys = [KeyboardKey.KEY_LEFT_SHIFT, KeyboardKey.KEY_LEFT_CTRL, KeyboardKey.KEY_B]
        self.step_keys = [KeyboardKey.KEY_LEFT_SHIFT, KeyboardKey.KEY_LEFT_CTRL, KeyboardKey.KEY_F]


        self.visual_debugger = VisualDebugger()

        self.scraper = T4Scraper(game=self.game, visual_debugger=self.visual_debugger)
        frame = self.game.grab_latest_frame()
        self.scraper.current_frame = frame
        self.pl = 0
        self.working_trade = 0
        self.current_action = ''
        self.held = False
        self.fill_count = 0

        self.window_controller = WindowController()
        self.window_id = self.window_controller.locate_window(".*Mini-Dow .*")
        # self.window_id = self.window_controller.locate_window(".*S&P .*")

        self.keys = RedisKeys(metrics_key)
#         self.redis = redis.Redis(port=6001)
    
        self.number_of_trades = 0
        self.number_of_wins = 0
        self.buys = 0
        self.sells = 0
        self.holds = 0
        self.history = list()
        self.actions = 0
        self.last_action = ''

        self.previous_write = -1
        self.get_metadata()
        
        self.active_frame = None
        
        self.start_time = time.time()
        
        self.step_read_time = 0
        self.step_write_time = 0
Example #9
 def action_spec(self):
   spec = self._env.action_spec()
   minimum = np.zeros(shape=spec.shape, dtype=spec.dtype)
   maximum = spec.maximum - spec.minimum
   return array_spec.BoundedArraySpec(spec.shape, spec.dtype, minimum=minimum,
                                      maximum=maximum)
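
A hedged counterpart sketch (not shown in the source; the step logic is an assumption): a wrapper exposing the zero-based spec above would shift incoming actions back before delegating to the wrapped environment.
 def _step(self, action):
   # Map the zero-based action back into the wrapped environment's original range.
   spec = self._env.action_spec()
   return self._env.step(action + spec.minimum)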
Example #10
    def __init__(self, window_name, render_me=True):
        # game parameters
        self._board_size = 5
        self._max_turns = 400
        if self._max_turns > 20:
            self._frames = 20
        else:
            self._frames = self._max_turns
        self._agent_count = 2
        self._channels = 3
        self._action_def = {
            0: ShipAction.EAST,
            1: ShipAction.NORTH,
            2: "NOTHING",
            3: ShipAction.SOUTH,
            4: ShipAction.WEST
        }

        # runtime parameters
        self.turns_counter = 0
        self.episode_ended = False
        self.total_reward = 0
        self.ships_idle = []
        self.shipyards_idle = []
        self.last_reward = 0

        self.render_step = render_me

        # initialize game
        self.environment = make("halite",
                                configuration={
                                    "size": self._board_size,
                                    "startingHalite": 1000,
                                    "episodeSteps": self._max_turns
                                })
        self.environment.reset(self._agent_count)

        self._action_spec = array_spec.BoundedArraySpec(
            shape=(),
            dtype=np.int32,
            minimum=0,
            maximum=len(self._action_def) - 1,
            name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(self._frames, self._board_size, self._board_size,
                   self._channels),
            dtype=np.int32,
            minimum=0,
            maximum=1,
            name='observation')

        self.state = np.zeros(
            [self._board_size, self._board_size, self._channels])
        # 0 = Halite 0-1
        # 1 = Ships (This One Hot, rest are .5)
        # 2 = Shipyards (This One Hot, rest are .5)

        self.state_history = [self.state] * self._frames

        # get board
        self.board = self.get_board()
        self.prime_board()
        self.halite_image_render = image_render(self._board_size)
        self.previous_ship_count = 0
Example #11
 def action_spec(self):
   return array_spec.BoundedArraySpec(
       [7], dtype=np.float32, minimum=-1.0, maximum=1.0)
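
A possible follow-up (an assumption, not part of the snippet above): the same array spec can be converted to its tensor equivalent when the environment is wrapped for TensorFlow.
import numpy as np
from tf_agents.specs import array_spec, tensor_spec

arr_spec = array_spec.BoundedArraySpec([7], dtype=np.float32, minimum=-1.0, maximum=1.0)
tf_spec = tensor_spec.from_spec(arr_spec)  # BoundedTensorSpec with the same shape, dtype, and bounds
print(tf_spec)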
Example #12
    def __init__(self, simulator, discount_factor=1.0):
        self.simulator = simulator  # simulator providing state_transition for _step
        self.timestep = None  # time step counter; reset to 0 in _reset
        
        # These parameters need to be overwritten

        # 1. Action spec - i.e. what actions are allowed by the environment.
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(<INSERT_HERE>), dtype=np.float32, minimum=<INSERT_HERE>,
            maximum=<INSERT_HERE>, name='action')  # Actions specification

        # 2. Observation Spec - i.e. what an agent is allowed to observe in this environment
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(<INSERT_HERE>,), dtype=np.float32, minimum=<INSERT_HERE>,
            name='observation')  # States are [x y theta velocity]^T
        
        # 3. The "reset" state
        self.state0 = <INSERT_HERE>  # Store initial state for resets

        # 4. The general state
        self._state = <INSERT_HERE>  # Synchronize env <--> simulator

        # 5. Keep track of if an episode is completed
        self._episode_ended = False

        # 6. Discount factor
        self.discount_factor = discount_factor

    def action_spec(self):
        """Get action_spec class attributes.
        Getter method for the action_spec class attribute.
        Returns:
            Returns the action specification for this Python environment class.
        """

        return self._action_spec

    def observation_spec(self):
        """Get observation_spec class attributes.
        Getter method for the observation_spec class attribute.
        Returns:
            Returns the observation specification for this Python environment
            class.
        """

        return self._observation_spec

    def batch_size(self):
        return 1  # this template environment is not batched

    def batched(self):
        return self.batch_size() != 1

    def _reset(self):
        """Reset the environment back to its default state.
        This method is used for resetting at the end of episodes,
        and returns the environment state to its initialized state.
        Returns:
            A tf-agents function that carries information about
            resetting relevant environment variables back to their default
            settings.
        """
        self._state = <PICK_RANDOM_STATE>  # Reset this to a random state
        self.timestep = 0  # Reset time step counter
        self._episode_ended = False
        return ts.restart(np.array(self.state0, dtype=np.float32))

    def _step(self, action):
        """Main functionality for stepping the RL model in this environment.
        This function lets the agent take an action, then updates the
        agent's state appropriately and computes the agent's reward.
        Arguments:
            action (list): A list corresponding to the action componets
                           [a1, a2, ... , aN] the agent takes at each time step.
        Returns:
            A tf-agents function that carries information about the
            current observation and discounted reward.
        """

        # If episode is over, after terminating time step, reset environment
        if self._episode_ended:
            return self.reset()

        # Else, step the agent, update state, and compute reward
        position_x, position_y, theta, velocity = \
            self.simulator.state_transition(self._state, action, self._dt)
        position_x, position_y = self.check_bounding_box([position_x,
                                                          position_y])
        # Update state here
        self._state = <UPDATE_STATE_HERE>

        # Compute reward
        reward = <INSERT REWARD COMPUTATION>

        # Check if the episode has ended
        if self.timestep.is_last():
            self._episode_ended = True
            return ts.termination(np.array(self._state,
                                           dtype=np.float32), reward=reward)

        # Else, step the time step counter and transition
        self.timestep += 1
        return ts.transition(np.array(self._state, dtype=np.float32),
                             reward=reward,
                             discount=float(self.discount_factor))
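
A driving sketch under stated assumptions (SimulatorEnv and simulator are hypothetical names; the <INSERT_HERE> placeholders above are assumed filled in and the class assumed to subclass py_environment.PyEnvironment): the completed template can be stepped with random in-bounds actions.
import numpy as np
from tf_agents.specs import array_spec

rng = np.random.RandomState(0)
env = SimulatorEnv(simulator, discount_factor=0.99)  # hypothetical concrete environment
time_step = env.reset()
while not time_step.is_last():
    action = array_spec.sample_bounded_spec(env.action_spec(), rng)
    time_step = env.step(action)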
Example #13
    def __init__(self,
                 global_context_sampling_fn: Callable[[], types.Array],
                 arm_context_sampling_fn: Callable[[], types.Array],
                 num_actions: int,
                 reward_fn: Callable[[types.Array], Sequence[float]],
                 batch_size: Optional[int] = 1):
        """Initializes the environment.

    In each round, global context is generated by global_context_sampling_fn,
    per-arm contexts are generated by arm_context_sampling_fn.

    The two feature generating functions should output a single observation, not
    including either the batch_size or the number of actions.

    The reward_fn function takes a global and a per-arm feature, and outputs a
    possibly random reward.

    Example:
      def global_context_sampling_fn():
        return np.random.randint(0, 10, [2])  # 2-dimensional global features.

      def arm_context_sampling_fn():
        return {'armf1': np.random.randint(-3, 4, [3]),    # A dictionary of
                'armf2': np.random.randint(0, 2, [4, 5])}  # arm features.

      def reward_fn(global_obs, arm_obs):
        return sum(global_obs) + arm_obs['armf1'][0] + arm_obs['armf2'][3, 3]

      env = StationaryStochasticPyEnvironment(global_context_sampling_fn,
                                              arm_context_sampling_fn,
                                              5,
                                              reward_fn,
                                              batch_size=5)

    Args:
      global_context_sampling_fn: A function that outputs a possibly nested
        structure of features. This output is the global context. Its shapes and
        types must be consistent across calls.
      arm_context_sampling_fn: A function that outputs a possibly nested
        structure of features. This output is the per-arm context. Its shapes
        must be consistent across calls.
      num_actions: (int) the number of actions in every sample.
      reward_fn: A function that generates a reward when called with a global
        and a per-arm observation.
      batch_size: The batch size.
    """
        self._global_context_sampling_fn = global_context_sampling_fn
        self._arm_context_sampling_fn = arm_context_sampling_fn
        self._num_actions = num_actions
        self._reward_fn = reward_fn
        self._batch_size = batch_size

        global_example = global_context_sampling_fn()
        arm_example = arm_context_sampling_fn()
        observation_spec = {
            GLOBAL_KEY:
            tf.nest.map_structure(array_spec.ArraySpec.from_array,
                                  global_example),
            PER_ARM_KEY:
            array_spec.add_outer_dims_nest(
                tf.nest.map_structure(array_spec.ArraySpec.from_array,
                                      arm_example), (num_actions, ))
        }

        action_spec = array_spec.BoundedArraySpec(shape=(),
                                                  dtype=np.int32,
                                                  minimum=0,
                                                  maximum=num_actions - 1,
                                                  name='action')

        super(StationaryStochasticStructuredPyEnvironment,
              self).__init__(observation_spec, action_spec)
Example #14
    def __init__(self,
                 global_context_sampling_fn,
                 arm_context_sampling_fn,
                 max_num_actions,
                 reward_fn,
                 num_actions_fn=None,
                 batch_size=1,
                 variable_action_method=VariableActionMethod.FIXED):
        """Initializes the environment.

    In each round, global context is generated by global_context_sampling_fn,
    per-arm contexts are generated by arm_context_sampling_fn. The reward_fn
    function takes the concatenation of a global and a per-arm feature, and
    outputs a possibly random reward.
    In case `num_actions_fn` is specified, the number of actions will be dynamic.
    The actual number of actions can be encoded in multiple ways, specified by
    `variable_action_method`. The observation spec constructed by the
    environment will also reflect the method used. The below list explains how
    the observations are built for all the methods.

    The different values of `variable_action_method` and the corresponding
    behavior:
    -- `FIXED` (default): The number of actions per sample is fixed. In this
       case, `num_actions_fn` should be `None`.
    -- 'MASK': The actually available actions are encoded by an action mask
       added to the observation in the format of
       `(observation, [1 1 ... 1 0 ... 0])`. The length of the mask, as well as
       the number of arm observations, is `max_num_actions`.
    -- `NUM_ACTIONS_FEATURE`: An extra feature key `num_actions` is added to the
       observation, with an integer feature value indicating the number of
       available actions. The arm observation tensor has shape
       `[batch_size, max_num_actions, arm_feature_dim]`.
    -- `IN_BATCH_DIM`: The number of actions is folded into the batch dimension.
       In this case, the actual batch size should be 1, and the batch dimension
       is used to list all the actions for a sample. The global observation will
       internally be tiled to match this induced batch size. Also note that in
       this case, the `max_num_actions` parameter is ignored.


    Example:
      def global_context_sampling_fn():
        return np.random.randint(0, 10, [2])  # 2-dimensional global features.

      def arm_context_sampling_fn():
        return np.random.randint(-3, 4, [3])  # 3-dimensional arm features.

      def reward_fn(x):
        return sum(x)

      def num_actions_fn():
        return np.random.randint(2, 6)

      env = StationaryStochasticPerArmPyEnvironment(
          global_context_sampling_fn,
          arm_context_sampling_fn,
          5,
          reward_fn,
          num_actions_fn,
          VariableActionMethod.NUM_ACTIONS_FEATURE)

    Args:
      global_context_sampling_fn: A function that outputs a random 1d array or
        list of ints or floats. This output is the global context. Its shape and
        type must be consistent across calls.
      arm_context_sampling_fn: A function that outputs a random 1d array or list
        of ints or floats (same type as the output of
        `global_context_sampling_fn`). This output is the per-arm context. Its
        shape must be consistent across calls.
      max_num_actions: (int) the maximum number of actions in every sample. If
        `num_actions_fn` is not set, this many actions are available in every
        time step.
      reward_fn: A function that generates a reward when called with an
        observation.
      num_actions_fn: If set, it should be a function that outputs a single
        integer specifying the number of actions for a given time step. The
        value output by this function will be capped between 1 and
        `max_num_actions`. The number of actions will be encoded based on the
        method specified in `variable_action_method`. The different encodings
        are explained in the documentation above.
      batch_size: The batch size.
      variable_action_method: An instance of `VariableActionMethod`. Determines
        the way variable number of actions are handled.
    """
        self._global_context_sampling_fn = global_context_sampling_fn
        self._arm_context_sampling_fn = arm_context_sampling_fn
        self._max_num_actions = max_num_actions
        self._reward_fn = reward_fn
        self._batch_size = batch_size
        self._num_actions_fn = num_actions_fn
        self._variable_action_method = variable_action_method

        observation_spec = self._create_observation_spec()

        action_spec = array_spec.BoundedArraySpec(shape=(),
                                                  dtype=np.int32,
                                                  minimum=0,
                                                  maximum=max_num_actions - 1,
                                                  name='action')

        super(StationaryStochasticPerArmPyEnvironment,
              self).__init__(observation_spec, action_spec)
Example #15
 def testNotEqualDifferentMaximum(self):
   spec_1 = array_spec.BoundedArraySpec(
       (1, 2), np.int32, minimum=0.0, maximum=2.0)
   spec_2 = array_spec.BoundedArraySpec(
       (1, 2), np.int32, minimum=[0.0, 0.0], maximum=[1.0, 1.0])
   self.assertNotEqual(spec_1, spec_2)
Example #16
  def __init__(self,
               data_dir: Text,
               rank_k: int,
               batch_size: int = 1,
               num_actions: int = 50,
               csv_delimiter=',',
               name: Optional[Text] = 'movielens_per_arm'):
    """Initializes the Per-arm MovieLens Bandit environment.

    Args:
      data_dir: (string) Directory where the data lies (in text form).
      rank_k : (int) Which rank to use in the matrix factorization. This will
        also be the feature dimension of both the user and the movie features.
      batch_size: (int) Number of observations generated per call.
      num_actions: (int) How many movies to choose from per round.
      csv_delimiter: (string) The delimiter to use in loading the data csv file.
      name: (string) The name of this environment instance.
    """
    self._batch_size = batch_size
    self._context_dim = rank_k
    self._num_actions = num_actions

    # Compute the matrix factorization.
    self._data_matrix = dataset_utilities.load_movielens_data(
        data_dir, delimiter=csv_delimiter)
    self._num_users, self._num_movies = self._data_matrix.shape

    # Compute the SVD.
    u, s, vh = np.linalg.svd(self._data_matrix, full_matrices=False)

    # Keep only the largest singular values.
    self._u_hat = u[:, :rank_k].astype(np.float32)
    self._s_hat = s[:rank_k].astype(np.float32)
    self._v_hat = np.transpose(vh[:rank_k]).astype(np.float32)

    self._approx_ratings_matrix = np.matmul(self._u_hat * self._s_hat,
                                            np.transpose(self._v_hat))

    self._action_spec = array_spec.BoundedArraySpec(
        shape=(),
        dtype=np.int32,
        minimum=0,
        maximum=num_actions - 1,
        name='action')
    observation_spec = {
        GLOBAL_KEY:
            array_spec.ArraySpec(shape=[rank_k], dtype=np.float32),
        PER_ARM_KEY:
            array_spec.ArraySpec(
                shape=[num_actions, rank_k], dtype=np.float32),
    }
    self._time_step_spec = ts.time_step_spec(observation_spec)

    self._current_user_indices = np.zeros(batch_size, dtype=np.int32)
    self._previous_user_indices = np.zeros(batch_size, dtype=np.int32)

    self._current_movie_indices = np.zeros([batch_size, num_actions],
                                           dtype=np.int32)
    self._previous_movie_indices = np.zeros([batch_size, num_actions],
                                            dtype=np.int32)

    self._observation = {
        GLOBAL_KEY:
            np.zeros([batch_size, rank_k]),
        PER_ARM_KEY:
            np.zeros([batch_size, num_actions, rank_k]),
    }

    super(MovieLensPerArmPyEnvironment, self).__init__(
        observation_spec, self._action_spec, name=name)
Example #17
 def testRepr(self):
   as_string = repr(
       array_spec.BoundedArraySpec(
           (1, 2), np.int32, minimum=73.0, maximum=101.0))
   self.assertIn("101", as_string)
   self.assertIn("73", as_string)
Example #18
 def testInvalidMaximum(self):
   with self.assertRaisesRegexp(ValueError, "not compatible"):
     array_spec.BoundedArraySpec((3, 5), np.uint8, 0, (1, 1, 1))
Example #19
 def testCheckArrayNoMatch(self, array):
   spec = array_spec.BoundedArraySpec((2,), np.int64, minimum=5, maximum=15)
   self.assertFalse(spec.check_array(array))
Example #20
 def testMinLargerThanMax(self):
   with self.assertRaisesRegexp(ValueError, "min has values greater than max"):
     array_spec.BoundedArraySpec((3,), np.uint8, (1, 2, 3), (3, 2, 1))
Example #21
  def __init__(self,
               data_dir: Text,
               rank_k: int,
               batch_size: int = 1,
               num_movies: int = 20,
               csv_delimiter: Text = ',',
               name: Optional[Text] = 'movielens'):
    """Initializes the MovieLens Bandit environment.

    Args:
      data_dir: (string) Directory where the data lies (in text form).
      rank_k : (int) Which rank to use in the matrix factorization.
      batch_size: (int) Number of observations generated per call.
      num_movies: (int) Only the first `num_movies` movies will be used by the
        environment. The rest is cut out from the data.
      csv_delimiter: (string) The delimiter to use in loading the data csv file.
      name: The name of this environment instance.
    """
    self._num_actions = num_movies
    self._batch_size = batch_size
    self._context_dim = rank_k

    # Compute the matrix factorization.
    self._data_matrix = dataset_utilities.load_movielens_data(
        data_dir, delimiter=csv_delimiter)
    # Keep only the first items.
    self._data_matrix = self._data_matrix[:, :num_movies]
    # Filter out users with no items rated.
    nonzero_users = list(np.nonzero(np.sum(self._data_matrix, axis=1) > 0.0)[0])
    self._data_matrix = self._data_matrix[nonzero_users, :]
    self._effective_num_users = len(nonzero_users)

    # Compute the SVD.
    u, s, vh = np.linalg.svd(self._data_matrix, full_matrices=False)

    # Keep only the largest singular values.
    self._u_hat = u[:, :rank_k] * np.sqrt(s[:rank_k])
    self._v_hat = np.transpose(
        np.transpose(vh[:rank_k, :]) * np.sqrt(s[:rank_k]))
    self._approx_ratings_matrix = np.matmul(self._u_hat, self._v_hat)

    self._current_users = np.zeros(batch_size)
    self._previous_users = np.zeros(batch_size)

    self._action_spec = array_spec.BoundedArraySpec(
        shape=(),
        dtype=np.int32,
        minimum=0,
        maximum=self._num_actions - 1,
        name='action')
    observation_spec = array_spec.ArraySpec(
        shape=(self._context_dim,), dtype=np.float64, name='observation')
    self._time_step_spec = ts.time_step_spec(observation_spec)
    self._observation = np.zeros((self._batch_size, self._context_dim))

    self._optimal_action_table = np.argmax(
        self._approx_ratings_matrix, axis=1)
    self._optimal_reward_table = np.max(
        self._approx_ratings_matrix, axis=1)

    super(MovieLensPyEnvironment, self).__init__(
        observation_spec, self._action_spec, name=name)
Example #22
 def testMinMaxAttributes(self):
   spec = array_spec.BoundedArraySpec((1, 2, 3), np.float32, 0, (5, 5, 5))
   self.assertEqual(type(spec.minimum), np.ndarray)
   self.assertEqual(type(spec.maximum), np.ndarray)
Example #23
    def __init__(self,
                 global_context_sampling_fn,
                 arm_context_sampling_fn,
                 max_num_actions,
                 reward_fn,
                 num_actions_fn=None,
                 batch_size=1):
        """Initializes the environment.

    In each round, global context is generated by global_context_sampling_fn,
    per-arm contexts are generated by arm_context_sampling_fn. The reward_fn
    function takes the concatenation of a global and a per-arm feature, and
    outputs a possibly random reward.
    In case `num_actions_fn` is specified, the number of actions will be dynamic
    and a `num_actions` feature key indicates the number of actions in any given
    sample.

    Example:
      def global_context_sampling_fn():
        return np.random.randint(0, 10, [2])  # 2-dimensional global features.

      def arm_context_sampling_fn():
        return np.random.randint(-3, 4, [3])  # 3-dimensional arm features.

      def reward_fn(x):
        return sum(x)

      def num_actions_fn():
        return np.random.randint(2, 6)

      env = StationaryStochasticPerArmPyEnvironment(global_context_sampling_fn,
                                                    arm_context_sampling_fn,
                                                    5,
                                                    reward_fn,
                                                    num_actions_fn)

    Args:
      global_context_sampling_fn: A function that outputs a random 1d array or
        list of ints or floats. This output is the global context. Its shape and
        type must be consistent across calls.
      arm_context_sampling_fn: A function that outputs a random 1d array or list
        of ints or floats (same type as the output of
        `global_context_sampling_fn`). This output is the per-arm context. Its
        shape must be consistent across calls.
      max_num_actions: (int) the maximum number of actions in every sample. If
        `num_actions_fn` is not set, this many actions are available in every
        time step.
      reward_fn: A function that generates a reward when called with an
        observation.
      num_actions_fn: If set, it should be a function that outputs a single
        integer specifying the number of actions for a given time step. The
        value output by this function will be capped between 1 and
        `max_num_actions`. The number of actions will be encoded in the
        observation by the feature key `num_actions`.
      batch_size: The batch size.
    """
        self._global_context_sampling_fn = global_context_sampling_fn
        self._arm_context_sampling_fn = arm_context_sampling_fn
        self._max_num_actions = max_num_actions
        self._reward_fn = reward_fn
        self._batch_size = batch_size
        self._num_actions_fn = num_actions_fn

        observation_spec = {
            GLOBAL_KEY:
            array_spec.ArraySpec.from_array(global_context_sampling_fn()),
            PER_ARM_KEY:
            array_spec.add_outer_dims_nest(
                array_spec.ArraySpec.from_array(arm_context_sampling_fn()),
                (max_num_actions, ))
        }
        if self._num_actions_fn is not None:
            num_actions_spec = array_spec.BoundedArraySpec(
                shape=(),
                dtype=np.dtype(type(self._num_actions_fn())),
                minimum=1,
                maximum=max_num_actions)
            observation_spec.update({NUM_ACTIONS_KEY: num_actions_spec})

        action_spec = array_spec.BoundedArraySpec(shape=(),
                                                  dtype=np.int32,
                                                  minimum=0,
                                                  maximum=max_num_actions - 1,
                                                  name='action')

        super(StationaryStochasticPerArmPyEnvironment,
              self).__init__(observation_spec, action_spec)
Example #24
 def testNotWriteable(self):
   spec = array_spec.BoundedArraySpec((1, 2, 3), np.float32, 0, (5, 5, 5))
   with self.assertRaisesRegexp(ValueError, "read-only"):
     spec.minimum[0] = -1
   with self.assertRaisesRegexp(ValueError, "read-only"):
     spec.maximum[0] = 100
Example #25
 def observation_spec(self):
     return array_spec.BoundedArraySpec(
         shape=(360,),
         dtype=np.dtype('float64'),
         name='observation'
     )
Example #26
 def testEqualBroadcastingBounds(self):
   spec_1 = array_spec.BoundedArraySpec(
       (1, 2), np.int32, minimum=0.0, maximum=1.0)
   spec_2 = array_spec.BoundedArraySpec(
       (1, 2), np.int32, minimum=[0.0, 0.0], maximum=[1.0, 1.0])
   self.assertEqual(spec_1, spec_2)
Example #27
 def setUp(self):
     super(AgentPolicyTest, self).setUp()
     self._action_spec = array_spec.BoundedArraySpec(shape=(3, ),
                                                     dtype=np.float64,
                                                     minimum=[0, 0, 0],
                                                     maximum=[1, 1, 1])
Example #28
 def testReuseSpec(self):
   spec_1 = array_spec.BoundedArraySpec(
       (1, 2), np.int32, minimum=0.0, maximum=1.0)
   spec_2 = array_spec.BoundedArraySpec(spec_1.shape, spec_1.dtype,
                                        spec_1.minimum, spec_1.maximum)
   self.assertEqual(spec_1, spec_2)
Example #29
 def test_continuous(self):
   obs_spec = array_spec.BoundedArraySpec((2, 3), np.int32, -10, 10)
   action_spec = array_spec.BoundedArraySpec((2,), np.float32, -1, 1)
   with self.assertRaisesRegexp(ValueError, 'discrete action specs'):
     env = random_py_environment.RandomPyEnvironment(obs_spec, action_spec)
     env = wrappers.ActionOffsetWrapper(env)
Example #30
from random import randint

import numpy as np

from tf_agents.specs import array_spec

from generic_environment import GenericEnv
from dqn_agent import DqnAgent

#params
num_episode = 2000  # @param
board_size = 9

#env
dqn = DqnAgent(
    array_spec.BoundedArraySpec(shape=(),
                                dtype=np.int32,
                                minimum=0,
                                maximum=3,
                                name='action'),
    array_spec.BoundedArraySpec(shape=(2, ),
                                dtype=np.int32,
                                minimum=0,
                                maximum=board_size,
                                name='observation'),
    np.array([0, 0], dtype=np.int32))

#[row, column]
state = np.array([0, 0], dtype=np.int32)

episode_count = 0
step_count = 0
while episode_count < num_episode: