Example No. 1
 def request_output(self, obs):
     # obs must match the data structure defined in self.ds
     data = self.ds.flatten(obs)
     if self._compress:
         data = TensorZipper.compress(data)
     else:
         data = pickle.dumps(data)
     return self._request(data)
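For reference, a minimal sketch of the matching decode step on the receiving side, assuming (consistent with the other examples here) that TensorZipper.decompress reverses TensorZipper.compress, that self.ds.structure reverses self.ds.flatten, and that TensorZipper and pickle are imported as in the original module; the method name _decode_request is illustrative and not part of the original class.

 def _decode_request(self, payload):
     # payload is the raw bytes produced by request_output above
     if self._compress:
         flat = TensorZipper.decompress(payload)
     else:
         flat = pickle.loads(payload)
     # rebuild the nested observation defined by self.ds
     return self.ds.structure(flat)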
Example No. 2
 def run(self):
     self.replay_task = self._data_pool_apis.request_replay_task()
     while self.replay_task != "":
         game_version = self.replay_task.game_version or self._game_version
         self._adapt_system(game_version)
         if game_version != self._game_version:
             # need to re-init the replay converter
             self._game_version = game_version
             self.converter_config['game_version'] = game_version
             self._replay_converter = self.replay_converter_type(
                 **self.converter_config)
         game_core_config = self.converter_config.get('game_core_config', {})
         extractor = ReplayExtractor(
             replay_dir=self._replay_dir,
             replay_filename=self.replay_task.replay_name,
             player_id=self.replay_task.player_id,
             replay_converter=self._replay_converter,
             step_mul=self._step_mul,
             version=game_version,
             game_core_config=game_core_config,
             da_rate=self._da_rate,
             unk_mmr_dft_to=self._unk_mmr_dft_to)
         self._steps = 0
         first_frame = True
         if self._use_policy:
             self.agent.reset()
             self._update_agent_model()
         for frame in extractor.extract():
             if self._post_process_data:
                 obs, act = self._post_process_data(*frame[0])
             else:
                 obs, act = frame[0]
             if self._use_policy:
                 data = (obs, act, self.agent.state,
                         np.array(first_frame, np.bool_))
                 self.agent.update_state(obs)
                 first_frame = False
             else:
                 data = (obs, act)
             data = self.ds.flatten(self.ds.structure(data))
             if self._data_queue.full():
                 logger.log("Actor's queue is full.", level=logger.WARN)
             self._data_queue.put((TensorZipper.compress(data), frame[1]))
             logger.log('successfully put one tuple.', level=logger.DEBUG)
             self._steps += 1
             if self._steps % self._log_interval == 0:
                 logger.log(
                     "%d frames of replay task [%s] sent to learner." %
                     (self._steps, self.replay_task))
             if self._use_policy and self._steps % self._update_model_freq == 0:
                 self._update_agent_model()
         logger.log("Replay task [%s] done. %d frames sent to learner." %
                    (self.replay_task, self._steps))
         self.replay_task = self._data_pool_apis.request_replay_task()
     logger.log("All tasks done.")
Example No. 3
 def data_generator(self):
     pull_socket = self._zmq_context.socket(zmq.PULL)
     pull_socket.connect(self.req_ep)
     while True:
         msg = pull_socket.recv_multipart()
         if self._compress:
             data = TensorZipper.decompress(msg[-1])
         else:
             data = pickle.loads(msg[-1])
         yield data + (msg[0], )
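A possible sender-side counterpart, sketched under the assumption that each multipart message is [identity/routing frame, serialized payload], which is what the generator above reads via msg[0] and msg[-1]; the function and argument names are illustrative, TensorZipper is assumed to be imported from the same library as above, and the PUSH end binds here only because the PULL end connects (in ZeroMQ either side may bind).

import pickle
import zmq

def send_sample(zmq_context, endpoint, identity, flat_data, compress=True):
    # one PUSH socket per sender; the PULL-side generator connects, so bind here
    push_socket = zmq_context.socket(zmq.PUSH)
    push_socket.bind(endpoint)
    payload = (TensorZipper.compress(flat_data) if compress
               else pickle.dumps(flat_data))
    # frame 0: identity tag (bytes), last frame: serialized tensors
    push_socket.send_multipart([identity, payload])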
Example No. 4
 def data_generator(self):
     while True:
         while True:
             try:
                 msg = self._data_queue.get_nowait()
                 break
             except Exception:
                 # nothing in the queue yet; wait briefly and poll again
                 time.sleep(0.01)
         if self._compress:
             data = TensorZipper.decompress(msg[-1])
         else:
             data = pickle.loads(msg[-1])
         yield data + (msg[0].bytes, )
Example No. 5
 def request_output(self, obs):
     # obs must match the data structure defined in self.ds
     data = self.ds.flatten(obs)
     if self._compress:
         data = TensorZipper.compress(data)
     else:
         data = pickle.dumps(data)
     self._req_socket.send(data)
     while True:
         try:
             ret = self._req_socket.recv_pyobj()
             break
         except Exception as e:
             print(
                 f'Exception: {e}. No reply within {self.timeout} ms from '
                 f'inference service {self.server_addr}; rebuilding the socket '
                 f'and retrying.'
             )
             self._rebuild_socket()
             self._req_socket.send(data)
     return ret
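The retry above only works if _rebuild_socket recreates the REQ socket with a receive timeout. A minimal sketch of what it might look like, assuming pyzmq, that self._zmq_context holds the ZMQ context, that self.timeout is in milliseconds, and that self.server_addr is a full ZeroMQ endpoint; attribute names beyond those already used above are assumptions.

 def _rebuild_socket(self):
     # a REQ socket whose recv timed out is stuck in its send/recv state
     # machine, so drop it without lingering and connect a fresh one
     self._req_socket.close(linger=0)
     self._req_socket = self._zmq_context.socket(zmq.REQ)
     # make recv_pyobj raise instead of blocking forever (timeout in ms)
     self._req_socket.setsockopt(zmq.RCVTIMEO, self.timeout)
     self._req_socket.connect(self.server_addr)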
Example No. 6
    def _push_data_to_learner(self, data_queue):
        logger.log('entering _push_data_to_learner',
                   'steps: {}'.format(self._steps),
                   level=logger.DEBUG + 5)
        me_id = self._learning_agent_id  # short name
        oppo_id = self._oppo_agent_id  # short name

        # initialize
        last_obs, actions, reward, info, done, other_vars = data_queue.get()
        if self.distillation:
            self._update_distill_agent_model()
            self.distill_agent.reset(last_obs[me_id])

        # loop forever, producing one unroll after another
        while True:
            data_model_id = self.task.model_key1
            mb_rewards, mb_values, mb_dones, mb_skips = [], [], [], []
            unroll = []
            infos = []
            mask = False
            while True:
                if last_obs[me_id] is not None:
                    # extend the unroll until a desired length
                    me_action = actions[me_id]
                    if isinstance(me_action, list):
                        me_action = tuple(me_action)
                    # Build `data` for this time step; it is a PGData-compatible
                    # list (see the PGData definition).
                    data = [last_obs[me_id], me_action, other_vars['neglogp']]
                    if self.rnn:
                        # hidden state and temporal mask for rnn
                        data.extend(
                            [other_vars['state'],
                             np.array(mask, np.bool_)])
                    if self.distillation:
                        # teacher logits
                        head_param = (self.distill_agent.head_param(
                            last_obs[me_id], me_action)
                                      if last_obs[me_id] is not None else None)
                        data.append(head_param)
                    if self.use_oppo_obs:
                        # for fully centralized value net
                        data.append(last_obs[oppo_id])
                        if self.rnn:
                            # oppo hidden state for rnn; mask same as self_agent
                            data.append(other_vars['oppo_state'])
                    data = self.ds.structure(data)
                    data.V = other_vars['v']
                    data.R = 0.0  # filled later by td_lambda return
                    mb_values.append(other_vars['v'])
                    mb_rewards.append(reward)
                    mb_dones.append(done)
                    # Note: a new episode must start with a valid obs (not a None obs),
                    # which currently holds. Otherwise the mask will be incorrect, since
                    # it is decided by the last frame's done flag.
                    mask = done
                    unroll.append(data)
                    mb_skips.append(0)
                else:
                    mb_skips[-1] += 1
                    mb_rewards[-1] += (self._gamma**mb_skips[-1]) * reward
                    mb_dones[-1] += done

                last_obs, actions, reward, info, done, other_vars = data_queue.get(
                )
                if done:
                    infos.append(info)
                if mask and self.distillation:
                    self._update_distill_agent_model()
                    self.distill_agent.reset(last_obs[me_id])

                if len(unroll) >= self._unroll_length and last_obs[
                        me_id] is not None:
                    # need to collect a complete Noop duration
                    break

            last_gae_lam = 0
            for t in reversed(range(self._unroll_length)):
                next_values = (other_vars['v'] if t == self._unroll_length -
                               1 else mb_values[t + 1])
                delta = (mb_rewards[t] +
                         (self._gamma**(mb_skips[t] + 1)) * next_values *
                         (1 - mb_dones[t]) - mb_values[t])
                last_gae_lam = (delta +
                                (self._gamma**(mb_skips[t] + 1)) * self._lam *
                                (1 - mb_dones[t]) * last_gae_lam)
                unroll[t].R = np.array(last_gae_lam + mb_values[t], np.float32)
            compressed_unroll = [
                TensorZipper.compress(self.ds.flatten(_data))
                for _data in unroll
            ]
            self._learner_apis.push_data(
                (data_model_id, compressed_unroll, infos))
            logger.log(
                f"Pushed one unroll to learner at time "
                f"{time.strftime('%Y%m%d%H%M%S')}",
                level=logger.DEBUG + 5)
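The reversed loop above is GAE(lambda), with frame skips folded into the discount exponent. A standalone restatement of just that computation (the function and argument names are illustrative, not part of the original class):

import numpy as np

def lambda_returns(rewards, values, dones, skips, bootstrap_value, gamma, lam):
    # mirrors the loop above: R[t] = GAE advantage at t + V[t]
    T = len(rewards)
    returns = np.zeros(T, np.float32)
    last_gae_lam = 0.0
    for t in reversed(range(T)):
        next_values = bootstrap_value if t == T - 1 else values[t + 1]
        discount = gamma ** (skips[t] + 1)  # extra decay for skipped frames
        delta = rewards[t] + discount * next_values * (1 - dones[t]) - values[t]
        last_gae_lam = delta + discount * lam * (1 - dones[t]) * last_gae_lam
        returns[t] = last_gae_lam + values[t]
    return returns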
Example No. 7
    def _push_data_to_learner(self, data_queue):
        logger.log('entering _push_data_to_learner',
                   'steps: {}'.format(self._steps),
                   level=logger.DEBUG + 5)
        me_id = self._learning_agent_id  # short name
        oppo_id = self._oppo_agent_id  # short name

        # initialize
        last_obs, actions, reward, info, done, other_vars = data_queue.get()
        if self.distillation:
            self._update_distill_agent_model()
            self.distill_agent.reset(last_obs[me_id])
        if self.use_oppo_obs:
            value, state, neglogpac, oppo_state = other_vars
        else:
            value, state, neglogpac = other_vars
            oppo_state = None

        # loop forever, producing one unroll after another
        while True:
            data_model_id = self.task.model_key1
            mb_skips = []
            unroll = []
            infos = []
            mask = False
            while True:
                if last_obs[me_id] is not None:
                    # extend the unroll until a desired length
                    me_action = actions[me_id]
                    if isinstance(me_action, list):
                        me_action = tuple(me_action)
                    # Build `data` for this time step; it is a PGData-compatible
                    # list (see the PGData definition).
                    data = [last_obs[me_id], me_action, neglogpac]
                    if self.rnn:
                        # hidden state and temporal mask for rnn
                        data.extend([state, np.array(mask, np.bool_)])
                    if self.distillation:
                        # teacher logits
                        logits = (self.distill_agent.logits(
                            last_obs[me_id], me_action)
                                  if last_obs[me_id] is not None else None)
                        data.append(logits)
                    if self.use_oppo_obs:
                        # for fully centralized value net
                        data.append(last_obs[oppo_id])
                        if self.rnn:
                            # oppo hidden state for rnn; mask same as self_agent
                            data.append(oppo_state)
                    data = self.ds.structure(data)
                    data.r = reward
                    data.discount = 1.0
                    # Note: a new episode must start with a valid obs (not a None obs),
                    # which currently holds. Otherwise the mask will be incorrect, since
                    # it is decided by the last frame's done flag.
                    mask = done
                    unroll.append(data)
                    mb_skips.append(0)
                else:
                    mb_skips[-1] += 1
                    # accumulate reward and discount over the skipped frames
                    data.r += (self._gamma**mb_skips[-1]) * reward
                    data.discount *= (1 - done) * self._gamma

                last_obs, actions, reward, info, done, other_vars = data_queue.get(
                )
                if self.use_oppo_obs:
                    value, state, neglogpac, oppo_state = other_vars
                else:
                    value, state, neglogpac = other_vars
                if done:
                    info = deepcopy(info)
                    info['outcome'] = self.log_outcome(info)
                    infos.append(info)
                if mask and self.distillation:
                    self._update_distill_agent_model()
                    self.distill_agent.reset(last_obs[me_id])

                if len(unroll) >= self._unroll_length and last_obs[
                        me_id] is not None:
                    # need to collect a complete Noop duration
                    break

            compressed_unroll = [
                TensorZipper.compress(self.ds.flatten(_data))
                for _data in unroll
            ]
            self._learner_apis.push_data(
                (data_model_id, compressed_unroll, infos))
            logger.log(
                f"Pushed one unroll to learner at time "
                f"{time.strftime('%Y%m%d%H%M%S')}",
                level=logger.DEBUG + 5)
Example No. 8
 def _decode_sample(self, sample):
     return TensorZipper.decompress(sample)
Example No. 9
def main(_):
    policy = "tpolicies.net_zoo.mnet_v6.mnet_v6d6"
    policy_config = {
        'use_xla': True,
        'test': False,
        'use_loss_type': 'none',
        'use_value_head': False,
        'use_self_fed_heads': True,
        'use_lstm': True,
        'nlstm': 256,
        'hs_len': 256 * 2,
        'lstm_duration': 1,
        'lstm_dropout_rate': 0.0,
        'lstm_cell_type': 'lstm',
        'lstm_layer_norm': True,
        'weight_decay': 0.00002,
        'arg_scope_type': 'type_b',
        'endpoints_verbosity': 10,
        'n_v': 7,
        'distillation': True,
        'fix_all_embed': False,
        'use_base_mask': True,
        'zstat_embed_version': 'v3',
        'sync_statistics': 'horovod',
        'temperature': 0.8,
        'merge_pi': False,
    }
    converter_config = {
        'zstat_data_src': '/root/replay_ds/rp1522-mv-zstat',
        'input_map_size': (128, 128),
        'output_map_size': (128, 128),
        'delete_useless_selection': False,
        'dict_space': True,
        'max_bo_count': 50,
        'max_bobt_count': 20,
        'zstat_zeroing_prob': 0.1,
        'zmaker_version': 'v5',
    }
    policy = import_module_or_data(policy)
    replay_converter_name = "timitate.lib6.pb2all_converter.PB2AllConverter"
    converter_module, converter_name = replay_converter_name.rsplit(".", 1)
    replay_converter_type = getattr(importlib.import_module(converter_module),
                                    converter_name)
    replay_converter = replay_converter_type(**converter_config)
    ob_space, ac_space = replay_converter.space
    rnn = policy_config.get('use_lstm', False)
    if 'hs_len' in policy_config:
        hs_len = policy_config['hs_len']
    elif 'nlstm' in policy_config:
        hs_len = 2 * policy_config['nlstm']
    else:
        hs_len = 128
    ds = InfData(ob_space, ac_space, policy_config['use_self_fed_heads'], rnn,
                 hs_len)
    cached_ds = ILData(ob_space, ac_space, rnn, hs_len)

    if FLAGS.role == 'Server':
        S = InfServer(None,
                      None,
                      FLAGS.port,
                      ds,
                      FLAGS.batch_size,
                      ob_space,
                      ac_space,
                      policy,
                      policy_config=policy_config,
                      gpu_id=FLAGS.gpu_id,
                      pull_worker_num=FLAGS.pull_worker_num)
        S.run()
    elif FLAGS.role == 'Actor':
        with open('data', 'rb') as f:
            data = pickle.load(f)
        data_set = [
            cached_ds.make_structure(TensorZipper.decompress(d)) for d in data
        ]
        data_set = [ds.structure(d.X, d.S, d.M) for d in data_set]
        n = len(data_set)
        policy_config['batch_size'] = 1
        policy_config['rollout_len'] = 1
        policy_config['use_loss_type'] = 'none'
        if FLAGS.use_gpu_server:
            from tleague.actors.agent import PGAgentGPU
            agent = PGAgentGPU(FLAGS.server_addr, ds, hs_len)
        else:
            from tleague.actors.agent import PGAgent2
            agent = PGAgent2(policy,
                             ob_space,
                             ac_space,
                             policy_config=policy_config)
        while True:
            t0 = time.time()
            for sample in data_set:
                pred = agent.step(sample.X)
                # print(pred['A_AB'])
            cost = time.time() - t0
            print('Predict {} samples costs {} seconds, fps {}.'.format(
                n, cost, n / cost),
                  flush=True)
Example No. 10
 def data_generator():
     while True:
         while not rm.ready_for_sample():
             time.sleep(5)
         for sample, weight in rm.sample_rollout():
             yield (TensorZipper.decompress(sample), weight)