from collections import defaultdict
from copy import deepcopy
from itertools import starmap
from typing import Dict, List, Optional, Union
import time
import warnings

import networkx as nx
import numpy as np
import pytest
import torch

from tianshou.data import Batch, to_numpy


def test_batch():
    assert list(Batch()) == []
    assert Batch().is_empty()
    assert not Batch(b={'c': {}}).is_empty()
    assert Batch(b={'c': {}}).is_empty(recurse=True)
    assert not Batch(a=Batch(), b=Batch(c=Batch())).is_empty()
    assert Batch(a=Batch(), b=Batch(c=Batch())).is_empty(recurse=True)
    assert not Batch(d=1).is_empty()
    assert not Batch(a=np.float64(1.0)).is_empty()
    assert len(Batch(a=[1, 2, 3], b={'c': {}})) == 3
    assert not Batch(a=[1, 2, 3]).is_empty()
    b = Batch({'a': [4, 4], 'b': [5, 5]}, c=[None, None])
    assert b.c.dtype == object
    b = Batch(d=[None], e=[starmap], f=Batch)
    assert b.d.dtype == b.e.dtype == object and b.f == Batch
    b = Batch()
    b.update()
    assert b.is_empty()
    b.update(c=[3, 5])
    assert np.allclose(b.c, [3, 5])
    # mimic the behavior of dict.update, where kwargs can overwrite keys
    b.update({'a': 2}, a=3)
    assert 'a' in b and b.a == 3
    assert b.pop('a') == 3
    assert 'a' not in b
    with pytest.raises(AssertionError):
        Batch({1: 2})
    assert Batch(a=[np.zeros((2, 3)), np.zeros((3, 3))]).a.dtype == object
    with pytest.raises(TypeError):
        Batch(a=[np.zeros((3, 2)), np.zeros((3, 3))])
    with pytest.raises(TypeError):
        Batch(a=[torch.zeros((2, 3)), torch.zeros((3, 3))])
    with pytest.raises(TypeError):
        Batch(a=[torch.zeros((3, 3)), np.zeros((3, 3))])
    with pytest.raises(TypeError):
        Batch(a=[1, np.zeros((3, 3)), torch.zeros((3, 3))])
    batch = Batch(a=[torch.ones(3), torch.ones(3)])
    assert torch.allclose(batch.a, torch.ones(2, 3))
    batch.cat_(batch)
    assert torch.allclose(batch.a, torch.ones(4, 3))
    Batch(a=[])
    batch = Batch(obs=[0], np=np.zeros([3, 4]))
    assert batch.obs == batch["obs"]
    batch.obs = [1]
    assert batch.obs == [1]
    batch.cat_(batch)
    assert np.allclose(batch.obs, [1, 1])
    assert batch.np.shape == (6, 4)
    assert np.allclose(batch[0].obs, batch[1].obs)
    batch.obs = np.arange(5)
    for i, b in enumerate(batch.split(1, shuffle=False)):
        if i != 5:
            assert b.obs == batch[i].obs
        else:
            with pytest.raises(AttributeError):
                batch[i].obs
            with pytest.raises(AttributeError):
                b.obs
    print(batch)
    batch = Batch(a=np.arange(10))
    with pytest.raises(AssertionError):
        list(batch.split(0))
    data = [
        (1, False, [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]]),
        (1, True, [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]]),
        (3, False, [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]),
        (3, True, [[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]]),
        (5, False, [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]),
        (5, True, [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]),
        (7, False, [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9]]),
        (7, True, [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]),
        (10, False, [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]),
        (10, True, [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]),
        (15, False, [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]),
        (15, True, [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]),
        (100, False, [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]),
        (100, True, [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]),
    ]
    for size, merge_last, result in data:
        bs = list(batch.split(size, shuffle=False, merge_last=merge_last))
        assert [bs[i].a.tolist() for i in range(len(bs))] == result
    batch_dict = {'b': np.array([1.0]), 'c': 2.0, 'd': torch.Tensor([3.0])}
    batch_item = Batch({'a': [batch_dict]})[0]
    assert isinstance(batch_item.a.b, np.ndarray)
    assert batch_item.a.b == batch_dict['b']
    assert isinstance(batch_item.a.c, float)
    assert batch_item.a.c == batch_dict['c']
    assert isinstance(batch_item.a.d, torch.Tensor)
    assert batch_item.a.d == batch_dict['d']
    batch2 = Batch(a=[{
        'b': np.float64(1.0),
        'c': np.zeros(1),
        'd': Batch(e=np.array(3.0))}])
    assert len(batch2) == 1
    assert Batch().shape == []
    assert Batch(a=1).shape == []
    assert Batch(a=set((1, 2, 1))).shape == []
    assert batch2.shape[0] == 1
    assert 'a' in batch2 and all([i in batch2.a for i in 'bcd'])
    with pytest.raises(IndexError):
        batch2[-2]
    with pytest.raises(IndexError):
        batch2[1]
    assert batch2[0].shape == []
    with pytest.raises(IndexError):
        batch2[0][0]
    with pytest.raises(TypeError):
        len(batch2[0])
    assert isinstance(batch2[0].a.c, np.ndarray)
    assert isinstance(batch2[0].a.b, np.float64)
    assert isinstance(batch2[0].a.d.e, np.float64)
    batch2_from_list = Batch(list(batch2))
    batch2_from_comp = Batch([e for e in batch2])
    assert batch2_from_list.a.b == batch2.a.b
    assert batch2_from_list.a.c == batch2.a.c
    assert batch2_from_list.a.d.e == batch2.a.d.e
    assert batch2_from_comp.a.b == batch2.a.b
    assert batch2_from_comp.a.c == batch2.a.c
    assert batch2_from_comp.a.d.e == batch2.a.d.e
    for batch_slice in [batch2[slice(0, 1)], batch2[:1], batch2[0:]]:
        assert batch_slice.a.b == batch2.a.b
        assert batch_slice.a.c == batch2.a.c
        assert batch_slice.a.d.e == batch2.a.d.e
    batch2.a.d.f = {}
    batch2_sum = (batch2 + 1.0) * 2
    assert batch2_sum.a.b == (batch2.a.b + 1.0) * 2
    assert batch2_sum.a.c == (batch2.a.c + 1.0) * 2
    assert batch2_sum.a.d.e == (batch2.a.d.e + 1.0) * 2
    assert batch2_sum.a.d.f.is_empty()
    with pytest.raises(TypeError):
        batch2 += [1]
    batch3 = Batch(a={
        'c': np.zeros(1),
        'd': Batch(e=np.array([0.0]), f=np.array([3.0]))})
    batch3.a.d[0] = {'e': 4.0}
    assert batch3.a.d.e[0] == 4.0
    batch3.a.d[0] = Batch(f=5.0)
    assert batch3.a.d.f[0] == 5.0
    with pytest.raises(ValueError):
        batch3.a.d[0] = Batch(f=5.0, g=0.0)
    with pytest.raises(ValueError):
        batch3[0] = Batch(a={"c": 2, "e": 1})
    # auto convert
    batch4 = Batch(a=np.array(['a', 'b']))
    assert batch4.a.dtype == object  # auto convert to object
    batch4.update(a=np.array(['c', 'd']))
    assert list(batch4.a) == ['c', 'd']
    assert batch4.a.dtype == object  # auto convert to object
    batch5 = Batch(a=np.array([{'index': 0}]))
    assert isinstance(batch5.a, Batch)
    assert np.allclose(batch5.a.index, [0])
    batch5.b = np.array([{'index': 1}])
    assert isinstance(batch5.b, Batch)
    assert np.allclose(batch5.b.index, [1])
    # None is a valid object and can be stored in Batch
    a = Batch.stack([Batch(a=None), Batch(b=None)])
    assert a.a[0] is None and a.a[1] is None
    assert a.b[0] is None and a.b[1] is None
    # nx.Graph corner case
    assert Batch(
        a=np.array([nx.Graph(), nx.Graph()], dtype=object)).a.dtype == object
    g1 = nx.Graph()
    g1.add_nodes_from(list(range(10)))
    g2 = nx.Graph()
    g2.add_nodes_from(list(range(20)))
    assert Batch(a=np.array([g1, g2])).a.dtype == object
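
# A minimal usage sketch distilled from the assertions above, assuming the
# same tianshou Batch API (``demo``/``merged`` are illustrative names):
# Batch stores nested, dict-like data and indexes/concatenates along axis 0.
def test_batch_usage_sketch():
    demo = Batch(obs=np.zeros((4, 3)), info={'turn': np.arange(4)})
    assert demo[0].obs.shape == (3,)  # int indexing selects one row everywhere
    merged = Batch.cat([demo, demo])  # concatenation along the batch axis
    assert merged.obs.shape == (8, 3)
    assert merged.info.turn.shape == (8,)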
def collect(
    self,
    n_step: Optional[int] = None,
    n_episode: Optional[Union[int, List[int]]] = None,
    random: bool = False,
    render: Optional[float] = None,
    no_grad: bool = True,
) -> Dict[str, float]:
    """Collect a specified number of steps or episodes.

    :param int n_step: how many steps you want to collect.
    :param n_episode: how many episodes you want to collect. If it is an
        int, it means to collect at least ``n_episode`` episodes; if it is
        a list, it means to collect exactly ``n_episode[i]`` episodes in
        the i-th environment.
    :param bool random: whether to use random policy for collecting data,
        defaults to False.
    :param float render: the sleep time between rendering consecutive
        frames, defaults to None (no rendering).
    :param bool no_grad: whether to retain gradient in policy.forward,
        defaults to True (no gradient retaining).

    .. note::

        One and only one collection number specification is permitted,
        either ``n_step`` or ``n_episode``.

    :return: A dict including the following keys:

        * ``n/ep`` the collected number of episodes.
        * ``n/st`` the collected number of steps.
        * ``v/st`` the speed of steps per second.
        * ``v/ep`` the speed of episodes per second.
        * ``rew`` the mean reward over collected episodes.
        * ``rew_std`` the standard deviation of episode rewards.
        * ``len`` the mean length over collected episodes.
    """
    assert (n_step is not None and n_episode is None and n_step > 0) or (
        n_step is None and n_episode is not None and np.sum(n_episode) > 0
    ), "Only one of n_step or n_episode is allowed in Collector.collect, " \
        f"got n_step = {n_step}, n_episode = {n_episode}."
    start_time = time.time()
    step_count = 0
    # episode count of each environment
    episode_count = np.zeros(self.env_num)
    # If n_episode is a list, and some envs have collected the required
    # number of episodes, these envs will be recorded in this list, and
    # they will not be stepped.
    finished_env_ids = []
    rewards = []
    whole_data = Batch()
    if isinstance(n_episode, list):
        assert len(n_episode) == self.get_env_num()
        finished_env_ids = [
            i for i in self._ready_env_ids if n_episode[i] <= 0]
        self._ready_env_ids = np.array(
            [x for x in self._ready_env_ids if x not in finished_env_ids])
    while True:
        if step_count >= 100000 and episode_count.sum() == 0:
            warnings.warn(
                "There are already many steps in an episode. "
                "You should add a time limitation to your environment!",
                Warning)

        is_async = self.is_async or len(finished_env_ids) > 0
        if is_async:
            # self.data are the data for all environments in async
            # simulation or some envs have finished,
            # **only a subset of data are disposed**,
            # so we store the whole data in ``whole_data``, let self.data
            # to be the data available in ready environments, and finally
            # set these back into all the data
            whole_data = self.data
            self.data = self.data[self._ready_env_ids]

        # restore the state and the input data
        last_state = self.data.state
        if isinstance(last_state, Batch) and last_state.is_empty():
            last_state = None
        self.data.update(state=Batch(), obs_next=Batch(), policy=Batch())

        # calculate the next action
        if random:
            spaces = self._action_space
            result = Batch(
                act=[spaces[i].sample() for i in self._ready_env_ids])
        else:
            if no_grad:
                with torch.no_grad():  # faster than retain_grad version
                    result = self.policy(self.data, last_state)
            else:
                result = self.policy(self.data, last_state)

        state = result.get("state", Batch())
        # convert None to Batch(), since None is reserved for 0-init
        if state is None:
            state = Batch()
        self.data.update(state=state, policy=result.get("policy", Batch()))
        # save hidden state to policy._state, in order to save into buffer
        if not (isinstance(state, Batch) and state.is_empty()):
            self.data.policy._state = self.data.state

        self.data.act = to_numpy(result.act)
        if self._action_noise is not None:
            assert isinstance(self.data.act, np.ndarray)
            self.data.act += self._action_noise(self.data.act.shape)

        # step in env
        if not is_async:
            obs_next, rew, done, info = self.env.step(self.data.act)
        else:
            # store computed actions, states, etc
            _batch_set_item(
                whole_data, self._ready_env_ids, self.data, self.env_num)
            # fetch finished data
            obs_next, rew, done, info = self.env.step(
                self.data.act, id=self._ready_env_ids)
            self._ready_env_ids = np.array([i["env_id"] for i in info])
            # get the stepped data
            self.data = whole_data[self._ready_env_ids]
        # move data to self.data
        self.data.update(obs_next=obs_next, rew=rew, done=done, info=info)

        if render:
            self.env.render()
            time.sleep(render)

        # add data into the buffer
        if self.preprocess_fn:
            result = self.preprocess_fn(**self.data)  # type: ignore
            self.data.update(result)
        for j, i in enumerate(self._ready_env_ids):
            # j is the index in current ready_env_ids
            # i is the index in all environments
            if self.buffer is None:
                # users do not want to store data, so we store
                # small fake data here to make the code clean
                self._cached_buf[i].add(obs=0, act=0, rew=rew[j], done=0)
            else:
                self._cached_buf[i].add(**self.data[j])

            if done[j]:
                if not (isinstance(n_episode, list)
                        and episode_count[i] >= n_episode[i]):
                    episode_count[i] += 1
                    rewards.append(self._rew_metric(
                        np.sum(self._cached_buf[i].rew, axis=0)))
                    step_count += len(self._cached_buf[i])
                    if self.buffer is not None:
                        self.buffer.update(self._cached_buf[i])
                    if isinstance(n_episode, list) and \
                            episode_count[i] >= n_episode[i]:
                        # env i has collected enough data, it has finished
                        finished_env_ids.append(i)
                self._cached_buf[i].reset()
                self._reset_state(j)
        obs_next = self.data.obs_next
        if sum(done):
            env_ind_local = np.where(done)[0]
            env_ind_global = self._ready_env_ids[env_ind_local]
            obs_reset = self.env.reset(env_ind_global)
            if self.preprocess_fn:
                obs_reset = self.preprocess_fn(
                    obs=obs_reset).get("obs", obs_reset)
            obs_next[env_ind_local] = obs_reset
        self.data.obs = obs_next

        if is_async:
            # set data back
            whole_data = deepcopy(whole_data)  # avoid reference in ListBuf
            _batch_set_item(
                whole_data, self._ready_env_ids, self.data, self.env_num)
            # let self.data be the data in all environments again
            self.data = whole_data

        self._ready_env_ids = np.array(
            [x for x in self._ready_env_ids if x not in finished_env_ids])
        if n_step:
            if step_count >= n_step:
                break
        else:
            if isinstance(n_episode, int) and \
                    episode_count.sum() >= n_episode:
                break
            if isinstance(n_episode, list) and \
                    (episode_count >= n_episode).all():
                break

    # finished envs are ready, and can be used for the next collection
    self._ready_env_ids = np.array(
        self._ready_env_ids.tolist() + finished_env_ids)

    # generate the statistics
    episode_count = sum(episode_count)
    duration = max(time.time() - start_time, 1e-9)
    self.collect_step += step_count
    self.collect_episode += episode_count
    self.collect_time += duration
    return {
        "n/ep": episode_count,
        "n/st": step_count,
        "v/st": step_count / duration,
        "v/ep": episode_count / duration,
        "rew": np.mean(rewards),
        "rew_std": np.std(rewards),
        "len": step_count / episode_count,
    }
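
# A sketch of the loop's stopping rule, pulled out for illustration.
# ``should_stop`` is a hypothetical helper, not part of the Collector API;
# it mirrors the three ``break`` conditions in collect() above.
def should_stop(step_count, episode_count, n_step=None, n_episode=None):
    if n_step:
        return step_count >= n_step
    if isinstance(n_episode, int):
        return episode_count.sum() >= n_episode
    return (episode_count >= np.asarray(n_episode)).all()


def test_should_stop_sketch():
    assert should_stop(1000, np.zeros(2), n_step=1000)
    assert should_stop(0, np.array([1, 2]), n_episode=[1, 2])
    assert not should_stop(0, np.array([1, 0]), n_episode=[1, 2])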
def collect(
    self,
    n_step: Optional[int] = None,
    n_episode: Optional[Union[int, List[int]]] = None,  # number of episodes
    random: bool = False,
    render: Optional[float] = None,
    no_grad: bool = True,
) -> Dict[str, float]:
    """Collect a specified number of steps or episodes.

    :param int n_step: how many steps you want to collect.
    :param n_episode: how many episodes you want to collect. If it is an
        int, it means to collect at least ``n_episode`` episodes; if it is
        a list, it means to collect exactly ``n_episode[i]`` episodes in
        the i-th environment.
    :param bool random: whether to use random policy for collecting data,
        defaults to False.
    :param float render: the sleep time between rendering consecutive
        frames, defaults to None (no rendering).
    :param bool no_grad: whether to retain gradient in policy.forward,
        defaults to True (no gradient retaining).

    .. note::

        One and only one collection number specification is permitted,
        either ``n_step`` or ``n_episode``.

    :return: A dict including the following keys:

        * ``n/ep`` the collected number of episodes.
        * ``n/st`` the collected number of steps.
        * ``v/st`` the speed of steps per second.
        * ``v/ep`` the speed of episodes per second.
        * ``rew`` the mean reward over collected episodes.
        * ``rew_std`` the standard deviation of episode rewards.
        * ``len`` the mean length over collected episodes.
        * ``hit_rate`` the fraction of finished episodes whose final
          ``info['right']`` is true.
        * ``class_rate`` a per-answer (``info['ans']``) counter of right
          episodes.
        * ``mate_num`` the sum of ``info['mate_num']`` over finished
          episodes.
    """
    assert (n_step is not None and n_episode is None and n_step > 0) or (
        n_step is None and n_episode is not None and np.sum(n_episode) > 0
    ), "Only one of n_step or n_episode is allowed in Collector.collect, " \
        f"got n_step = {n_step}, n_episode = {n_episode}."
    start_time = time.time()
    step_count = 0
    # episode count of each environment
    episode_count = np.zeros(self.env_num)
    # If n_episode is a list, and some envs have collected the required
    # number of episodes, these envs will be recorded in this list, and
    # they will not be stepped.
    finished_env_ids = []
    rewards = []
    whole_data = Batch()
    if isinstance(n_episode, list):
        assert len(n_episode) == self.get_env_num()
        finished_env_ids = [
            i for i in self._ready_env_ids if n_episode[i] <= 0]
        self._ready_env_ids = np.array(
            [x for x in self._ready_env_ids if x not in finished_env_ids])
    right, wrong = 0., 0.
    mate_num = 0.
    right_index = defaultdict(int)
    while True:
        if step_count >= 100000 and episode_count.sum() == 0:
            warnings.warn(
                "There are already many steps in an episode. "
                "You should add a time limitation to your environment!",
                Warning)

        is_async = self.is_async or len(finished_env_ids) > 0
        if is_async:
            # self.data are the data for all environments in async
            # simulation or some envs have finished,
            # **only a subset of data are disposed**,
            # so we store the whole data in ``whole_data``, let self.data
            # to be the data available in ready environments, and finally
            # set these back into all the data
            whole_data = self.data
            self.data = self.data[self._ready_env_ids]

        # restore the state and the input data
        last_state = self.data.state
        if isinstance(last_state, Batch) and last_state.is_empty():
            last_state = None
        self.data.update(state=Batch(), obs_next=Batch(), policy=Batch())

        # calculate the next action
        if random:
            spaces = self._action_space
            result = Batch(
                act=[spaces[i].sample() for i in self._ready_env_ids])
        else:
            if no_grad:
                with torch.no_grad():  # faster than retain_grad version
                    result = self.policy(self.data, last_state)
            else:
                result = self.policy(self.data, last_state)

        state = result.get("state", Batch())
        # convert None to Batch(), since None is reserved for 0-init
        if state is None:
            state = Batch()
        self.data.update(state=state, policy=result.get("policy", Batch()))
        # save hidden state to policy._state, in order to save into buffer
        if not (isinstance(state, Batch) and state.is_empty()):
            self.data.policy._state = self.data.state

        self.data.act = to_numpy(result.act)
        if self._action_noise is not None:
            assert isinstance(self.data.act, np.ndarray)
            self.data.act += self._action_noise(self.data.act.shape)

        # step in env (if all else fails, the action could be modified here)
        if not is_async:
            obs_next, rew, done, info = self.env.step(self.data.act)
        else:
            # store computed actions, states, etc
            _batch_set_item(
                whole_data, self._ready_env_ids, self.data, self.env_num)
            # fetch finished data
            obs_next, rew, done, info = self.env.step(
                self.data.act, id=self._ready_env_ids)
            self._ready_env_ids = np.array([i["env_id"] for i in info])
            # get the stepped data
            self.data = whole_data[self._ready_env_ids]
        # move data to self.data; info cannot be updated here yet -- it has
        # to be updated together with obs below
        # self.data.update(obs_next=obs_next, rew=rew, done=done, info=info)
        self.data.update(obs_next=obs_next, rew=rew, done=done)

        if render:
            self.env.render()
            time.sleep(render)

        # add data into the buffer
        if self.preprocess_fn:
            result = self.preprocess_fn(**self.data)  # type: ignore
            self.data.update(result)
        for j, i in enumerate(self._ready_env_ids):
            # j is the index in current ready_env_ids
            # i is the index in all environments
            if self.buffer is None:
                # users do not want to store data, so we store
                # small fake data here to make the code clean
                self._cached_buf[i].add(obs=0, act=0, rew=rew[j], done=0)
            else:
                # each env has its own cached buffer; add() appends rather
                # than overwrites, so every sample along the way is kept
                self._cached_buf[i].add(**self.data[j])

            if done[j]:
                if not (isinstance(n_episode, list)
                        and episode_count[i] >= n_episode[i]):
                    episode_count[i] += 1
                    rewards.append(self._rew_metric(
                        np.sum(self._cached_buf[i].rew, axis=0)))
                    step_count += len(self._cached_buf[i])
                    if self.buffer is not None:
                        self.buffer.update(self._cached_buf[i])
                    if isinstance(n_episode, list) and \
                            episode_count[i] >= n_episode[i]:
                        # env i has collected enough data, it has finished
                        finished_env_ids.append(i)
                    mate_num += info[j]['mate_num']
                    if info[j]['right']:
                        right += 1
                        right_index[info[j]['ans']] += 1
                    else:
                        wrong += 1
                self._cached_buf[i].reset()
                self._reset_state(j)
        obs_next = self.data.obs_next
        self.data.info = info
        if sum(done):
            # the finished envs automatically get a fresh observation here
            env_ind_local = np.where(done)[0]
            env_ind_global = self._ready_env_ids[env_ind_local]
            obs_reset = self.env.reset(env_ind_global)
            self.data['info']['history'][env_ind_local] = np.where(
                obs_reset != 0, np.ones_like(obs_reset),
                np.zeros_like(obs_reset))
            self.data['info']['turn'][env_ind_local] = np.zeros(
                len(env_ind_global))
            if self.preprocess_fn:
                obs_reset = self.preprocess_fn(
                    obs=obs_reset).get("obs", obs_reset)
            obs_next[env_ind_local] = obs_reset
        self.data.obs = obs_next

        if is_async:
            # set data back
            whole_data = deepcopy(whole_data)  # avoid reference in ListBuf
            _batch_set_item(
                whole_data, self._ready_env_ids, self.data, self.env_num)
            # let self.data be the data in all environments again
            self.data = whole_data

        self._ready_env_ids = np.array(
            [x for x in self._ready_env_ids if x not in finished_env_ids])
        if n_step:
            if step_count >= n_step:
                break
        else:
            if isinstance(n_episode, int) and \
                    episode_count.sum() >= n_episode:
                break
            if isinstance(n_episode, list) and \
                    (episode_count >= n_episode).all():
                break

    # finished envs are ready, and can be used for the next collection
    self._ready_env_ids = np.array(
        self._ready_env_ids.tolist() + finished_env_ids)

    # generate the statistics
    episode_count = sum(episode_count)
    duration = max(time.time() - start_time, 1e-9)
    self.collect_step += step_count
    self.collect_episode += episode_count
    self.collect_time += duration
    return {
        "n/ep": episode_count,
        "n/st": step_count,
        "v/st": step_count / duration,
        "v/ep": episode_count / duration,
        "rew": np.mean(rewards),
        "rew_std": np.std(rewards),
        "len": step_count / episode_count,
        "hit_rate": right / (right + wrong),
        "class_rate": right_index,
        "mate_num": mate_num,
    }
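
# A self-contained sketch of the extra bookkeeping this variant adds on top
# of the stock Collector. The ``info`` dicts below are fabricated for
# illustration; the real ones come from the environment.
def test_hit_rate_bookkeeping_sketch():
    right, wrong, mate_num = 0.0, 0.0, 0.0
    right_index = defaultdict(int)
    for info in [{'right': True, 'ans': 3, 'mate_num': 1},
                 {'right': False, 'ans': 7, 'mate_num': 0},
                 {'right': True, 'ans': 3, 'mate_num': 2}]:
        mate_num += info['mate_num']
        if info['right']:
            right += 1
            right_index[info['ans']] += 1
        else:
            wrong += 1
    assert right / (right + wrong) == 2 / 3  # reported as "hit_rate"
    assert right_index[3] == 2               # reported under "class_rate"
    assert mate_num == 3                     # reported as "mate_num"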
def test_batch():
    assert list(Batch()) == []
    assert Batch().is_empty()
    assert not Batch(b={'c': {}}).is_empty()
    assert Batch(b={'c': {}}).is_empty(recurse=True)
    assert not Batch(a=Batch(), b=Batch(c=Batch())).is_empty()
    assert Batch(a=Batch(), b=Batch(c=Batch())).is_empty(recurse=True)
    assert not Batch(d=1).is_empty()
    assert not Batch(a=np.float64(1.0)).is_empty()
    assert len(Batch(a=[1, 2, 3], b={'c': {}})) == 3
    assert not Batch(a=[1, 2, 3]).is_empty()
    b = Batch()
    b.update()
    assert b.is_empty()
    b.update(c=[3, 5])
    assert np.allclose(b.c, [3, 5])
    # mimic the behavior of dict.update, where kwargs can overwrite keys
    b.update({'a': 2}, a=3)
    assert b.a == 3
    with pytest.raises(AssertionError):
        Batch({1: 2})
    with pytest.raises(TypeError):
        Batch(a=[np.zeros((2, 3)), np.zeros((3, 3))])
    with pytest.raises(TypeError):
        Batch(a=[np.zeros((3, 2)), np.zeros((3, 3))])
    with pytest.raises(TypeError):
        Batch(a=[torch.zeros((2, 3)), torch.zeros((3, 3))])
    with pytest.raises(TypeError):
        Batch(a=[torch.zeros((3, 3)), np.zeros((3, 3))])
    with pytest.raises(TypeError):
        Batch(a=[1, np.zeros((3, 3)), torch.zeros((3, 3))])
    batch = Batch(a=[torch.ones(3), torch.ones(3)])
    assert torch.allclose(batch.a, torch.ones(2, 3))
    Batch(a=[])
    batch = Batch(obs=[0], np=np.zeros([3, 4]))
    assert batch.obs == batch["obs"]
    batch.obs = [1]
    assert batch.obs == [1]
    batch.cat_(batch)
    assert np.allclose(batch.obs, [1, 1])
    assert batch.np.shape == (6, 4)
    assert np.allclose(batch[0].obs, batch[1].obs)
    batch.obs = np.arange(5)
    for i, b in enumerate(batch.split(1, shuffle=False)):
        if i != 5:
            assert b.obs == batch[i].obs
        else:
            with pytest.raises(AttributeError):
                batch[i].obs
            with pytest.raises(AttributeError):
                b.obs
    print(batch)
    batch_dict = {'b': np.array([1.0]), 'c': 2.0, 'd': torch.Tensor([3.0])}
    batch_item = Batch({'a': [batch_dict]})[0]
    assert isinstance(batch_item.a.b, np.ndarray)
    assert batch_item.a.b == batch_dict['b']
    assert isinstance(batch_item.a.c, float)
    assert batch_item.a.c == batch_dict['c']
    assert isinstance(batch_item.a.d, torch.Tensor)
    assert batch_item.a.d == batch_dict['d']
    batch2 = Batch(a=[{
        'b': np.float64(1.0),
        'c': np.zeros(1),
        'd': Batch(e=np.array(3.0))
    }])
    assert len(batch2) == 1
    assert Batch().shape == []
    assert Batch(a=1).shape == []
    assert batch2.shape[0] == 1
    with pytest.raises(IndexError):
        batch2[-2]
    with pytest.raises(IndexError):
        batch2[1]
    assert batch2[0].shape == []
    with pytest.raises(IndexError):
        batch2[0][0]
    with pytest.raises(TypeError):
        len(batch2[0])
    assert isinstance(batch2[0].a.c, np.ndarray)
    assert isinstance(batch2[0].a.b, np.float64)
    assert isinstance(batch2[0].a.d.e, np.float64)
    batch2_from_list = Batch(list(batch2))
    batch2_from_comp = Batch([e for e in batch2])
    assert batch2_from_list.a.b == batch2.a.b
    assert batch2_from_list.a.c == batch2.a.c
    assert batch2_from_list.a.d.e == batch2.a.d.e
    assert batch2_from_comp.a.b == batch2.a.b
    assert batch2_from_comp.a.c == batch2.a.c
    assert batch2_from_comp.a.d.e == batch2.a.d.e
    for batch_slice in [batch2[slice(0, 1)], batch2[:1], batch2[0:]]:
        assert batch_slice.a.b == batch2.a.b
        assert batch_slice.a.c == batch2.a.c
        assert batch_slice.a.d.e == batch2.a.d.e
    batch2_sum = (batch2 + 1.0) * 2
    assert batch2_sum.a.b == (batch2.a.b + 1.0) * 2
    assert batch2_sum.a.c == (batch2.a.c + 1.0) * 2
    assert batch2_sum.a.d.e == (batch2.a.d.e + 1.0) * 2
    batch3 = Batch(a={
        'c': np.zeros(1),
        'd': Batch(e=np.array([0.0]), f=np.array([3.0]))
    })
    batch3.a.d[0] = {'e': 4.0}
    assert batch3.a.d.e[0] == 4.0
    batch3.a.d[0] = Batch(f=5.0)
    assert batch3.a.d.f[0] == 5.0
    with pytest.raises(KeyError):
        batch3.a.d[0] = Batch(f=5.0, g=0.0)
    # auto convert
    batch4 = Batch(a=np.array(['a', 'b']))
    assert batch4.a.dtype == object  # auto convert to object
    batch4.update(a=np.array(['c', 'd']))
    assert list(batch4.a) == ['c', 'd']
    assert batch4.a.dtype == object  # auto convert to object
    batch5 = Batch(a=np.array([{'index': 0}]))
    assert isinstance(batch5.a, Batch)
    assert np.allclose(batch5.a.index, [0])
    batch5.b = np.array([{'index': 1}])
    assert isinstance(batch5.b, Batch)
    assert np.allclose(batch5.b.index, [1])
    # None is a valid object and can be stored in Batch
    a = Batch.stack([Batch(a=None), Batch(b=None)])
    assert a.a[0] is None and a.a[1] is None
    assert a.b[0] is None and a.b[1] is None
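
# A sketch of the auto-conversion rules exercised above, assuming the same
# Batch semantics: an array of dicts becomes a nested Batch, while a string
# array falls back to object dtype.
def test_auto_convert_sketch():
    b = Batch(a=np.array([{'index': 0}, {'index': 1}]))
    assert isinstance(b.a, Batch)
    assert b.a.index.tolist() == [0, 1]
    assert Batch(s=np.array(['x', 'y'])).s.dtype == object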
def collect(
    self,
    n_step: Optional[int] = None,
    n_episode: Optional[Union[int, List[int]]] = None,
    random: bool = False,
    render: Optional[float] = None,
    no_grad: bool = True,
) -> Dict[str, float]:
    """Collect a specified number of steps or episodes.

    :param int n_step: how many steps you want to collect.
    :param n_episode: how many episodes you want to collect. If it is an
        int, it means to collect at least ``n_episode`` episodes; if it is
        a list, it means to collect exactly ``n_episode[i]`` episodes in
        the i-th environment.
    :param bool random: whether to use random policy for collecting data,
        defaults to ``False``.
    :param float render: the sleep time between rendering consecutive
        frames, defaults to ``None`` (no rendering).
    :param bool no_grad: whether to retain gradient in policy.forward,
        defaults to ``True`` (no gradient retaining).

    .. note::

        One and only one collection number specification is permitted,
        either ``n_step`` or ``n_episode``.

    :return: A dict including the following keys:

        * ``n/ep`` the collected number of episodes.
        * ``n/st`` the collected number of steps.
        * ``v/st`` the speed of steps per second.
        * ``v/ep`` the speed of episodes per second.
        * ``rew`` the mean reward over collected episodes.
        * ``len`` the mean length over collected episodes.
    """
    assert (n_step is not None and n_episode is None and n_step > 0) or (
        n_step is None and n_episode is not None and np.sum(n_episode) > 0
    ), "Only one of n_step or n_episode is allowed in Collector.collect, " \
        f"got n_step = {n_step}, n_episode = {n_episode}."
    start_time = time.time()
    step_count = 0
    # episode count of each environment
    episode_count = np.zeros(self.env_num)
    # If n_episode is a list, and some envs have collected the required
    # number of episodes, these envs will be recorded in this list, and
    # they will not be stepped.
    finished_env_ids = []
    reward_total = 0.0
    whole_data = Batch()
    list_n_episode = False
    # handle per-environment episode targets
    if n_episode is not None and not np.isscalar(n_episode):
        assert len(n_episode) == self.get_env_num()
        # mark that different envs run different numbers of episodes
        list_n_episode = True
        finished_env_ids = [
            i for i in self._ready_env_ids if n_episode[i] <= 0]
        self._ready_env_ids = np.array(
            [x for x in self._ready_env_ids if x not in finished_env_ids])
    while True:
        if step_count >= 100000 and episode_count.sum() == 0:
            warnings.warn(
                'There are already many steps in an episode. '
                'You should add a time limitation to your environment!',
                Warning)

        # if the collector is asynchronous by design, or some envs have
        # already finished, collect asynchronously
        is_async = self.is_async or len(finished_env_ids) > 0
        if is_async:
            # self.data are the data for all environments in async
            # simulation or some envs have finished,
            # **only a subset of data are disposed**,
            # so we store the whole data in ``whole_data``, let self.data
            # to be the data available in ready environments, and finally
            # set these back into all the data
            whole_data = self.data
            self.data = self.data[self._ready_env_ids]

        # restore the state and the input data
        last_state = self.data.state
        if isinstance(last_state, Batch) and last_state.is_empty():
            last_state = None
        self.data.update(state=Batch(), obs_next=Batch(), policy=Batch())

        # calculate the next action
        if random:
            spaces = self._action_space
            result = Batch(
                act=[spaces[i].sample() for i in self._ready_env_ids])
        else:
            if no_grad:
                with torch.no_grad():  # faster than retain_grad version
                    result = self.policy(self.data, last_state)
            else:
                result = self.policy(self.data, last_state)

        # state is used by RNN-style RL methods
        state = result.get('state', Batch())
        # convert None to Batch(), since None is reserved for 0-init
        # (can this line be removed???)
        if state is None:
            state = Batch()
        self.data.update(state=state, policy=result.get('policy', Batch()))
        # save hidden state to policy._state, in order to save into buffer
        if not (isinstance(state, Batch) and state.is_empty()):
            self.data.policy._state = self.data.state

        self.data.act = to_numpy(result.act)
        if self._action_noise is not None:  # noqa
            self.data.act += self._action_noise(self.data.act.shape)

        # step in env
        if not is_async:
            obs_next, rew, done, info = self.env.step(self.data.act)
        else:
            # store computed actions, states, etc: write the new values
            # from self.data back into whole_data
            _batch_set_item(
                whole_data, self._ready_env_ids, self.data, self.env_num)
            # fetch finished data
            obs_next, rew, done, info = self.env.step(
                self.data.act, id=self._ready_env_ids)
            # (can this line be removed???)
            self._ready_env_ids = np.array([i['env_id'] for i in info])
            # get the stepped data
            self.data = whole_data[self._ready_env_ids]
        # move data to self.data
        self.data.update(obs_next=obs_next, rew=rew, done=done, info=info)

        if render:
            self.env.render()
            time.sleep(render)

        # preprocess the data before adding it to the buffer
        if self.preprocess_fn:
            result = self.preprocess_fn(**self.data)
            self.data.update(result)

        # add data into the buffer: first store this step's data in the
        # matching _cached_buf; when a step ends with done, the _cached_buf
        # is flushed into the main buffer and cleared
        for j, i in enumerate(self._ready_env_ids):
            # j is the index in current ready_env_ids
            # i is the index in all environments
            if self.buffer is None:
                # users do not want to store data, so we store
                # small fake data here to make the code clean
                self._cached_buf[i].add(obs=0, act=0, rew=rew[j], done=0)
            else:
                self._cached_buf[i].add(**self.data[j])

            if done[j]:
                if not (list_n_episode
                        and episode_count[i] >= n_episode[i]):
                    episode_count[i] += 1
                    reward_total += np.sum(self._cached_buf[i].rew, axis=0)
                    step_count += len(self._cached_buf[i])
                    if self.buffer is not None:
                        self.buffer.update(self._cached_buf[i])
                    if list_n_episode and \
                            episode_count[i] >= n_episode[i]:
                        # env i has collected enough data, it has finished
                        finished_env_ids.append(i)
                self._cached_buf[i].reset()
                self._reset_state(j)

        # update the current observation
        obs_next = self.data.obs_next
        # reset() the envs that have finished
        if sum(done):
            # np.where returns a tuple of index arrays, one per input
            # dimension
            env_ind_local = np.where(done)[0]
            env_ind_global = self._ready_env_ids[env_ind_local]
            obs_reset = self.env.reset(env_ind_global)
            if self.preprocess_fn:
                obs_next[env_ind_local] = self.preprocess_fn(
                    obs=obs_reset).get('obs', obs_reset)
            else:
                obs_next[env_ind_local] = obs_reset
        self.data.obs = obs_next

        if is_async:
            # set data back
            whole_data = deepcopy(whole_data)  # avoid reference in ListBuf
            _batch_set_item(
                whole_data, self._ready_env_ids, self.data, self.env_num)
            # let self.data be the data in all environments again
            self.data = whole_data

        self._ready_env_ids = np.array(
            [x for x in self._ready_env_ids if x not in finished_env_ids])
        if n_step:
            if step_count >= n_step:
                break
        else:
            if isinstance(n_episode, int) and \
                    episode_count.sum() >= n_episode:
                break
            if isinstance(n_episode, list) and \
                    (episode_count >= n_episode).all():
                break

    # finished envs are ready, and can be used for the next collection
    self._ready_env_ids = np.array(
        self._ready_env_ids.tolist() + finished_env_ids)

    # generate the statistics
    episode_count = sum(episode_count)
    duration = max(time.time() - start_time, 1e-9)
    self.collect_step += step_count
    self.collect_episode += episode_count
    self.collect_time += duration
    # average reward across the number of episodes
    reward_avg = reward_total / episode_count
    if np.asanyarray(reward_avg).size > 1:  # non-scalar reward_avg
        reward_avg = self._rew_metric(reward_avg)
    return {
        'n/ep': episode_count,
        'n/st': step_count,
        'v/st': step_count / duration,
        'v/ep': episode_count / duration,
        'rew': reward_avg,
        'len': step_count / episode_count,
    }
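
# A sketch of the reward averaging performed at the end of this variant.
# ``rew_metric`` stands in for ``self._rew_metric`` (e.g. a mean across
# agents for vector rewards); the helper itself is hypothetical.
def average_reward(reward_total, episode_count, rew_metric=np.mean):
    reward_avg = reward_total / episode_count
    if np.asanyarray(reward_avg).size > 1:  # non-scalar (vector) reward
        reward_avg = rew_metric(reward_avg)
    return reward_avg


def test_average_reward_sketch():
    assert average_reward(10.0, 4) == 2.5
    assert average_reward(np.array([2.0, 4.0]), 2) == 1.5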