def tf_gen(step=4):
    """Yield samples one at a time, materializing up to ``step`` samples per
    dask compute so the TensorFlow generator does not round-trip per sample.

    NOTE(review): ``self`` and ``max_text_len`` are closed over from the
    enclosing scope — this is a nested generator, not a free function.
    """
    with dask.config.set(scheduler="sync"):
        for index in range(0, len(self), step):
            # Pull one chunk of up to `step` samples; the final chunk may be
            # shorter when len(self) is not a multiple of `step`.
            arrs = [self[index : index + step].values() for i in range(1)]
            arrs = list(map(lambda x: x._array, _flatten(arrs)))
            arrs = dask.delayed(list, pure=False, nout=len(list(self.keys())))(
                arrs
            )
            arrs = arrs.compute()
            for ind, arr in enumerate(arrs):
                if arr.dtype.type is np.str_:
                    # Encode each string as its ord() codes, truncated to
                    # max_text_len, then pad with spaces (32) to fixed length
                    # so the column stacks into a rectangular int array.
                    arr = [
                        ([ord(x) for x in sample.tolist()[0:max_text_len]])
                        for sample in arr
                    ]
                    arr = np.array(
                        [
                            np.pad(
                                sample,
                                (0, max_text_len - len(sample)),
                                "constant",
                                constant_values=(32),
                            )
                            for sample in arr
                        ]
                    )
                    arrs[ind] = arr
            # BUG FIX: the original iterated `range(step)` unconditionally,
            # which raises IndexError on the final (short) chunk whenever
            # len(self) % step != 0. Bound the loop by the real chunk size.
            for i in range(min(step, len(self) - index)):
                sample = {key: r[i] for key, r in zip(self[index].keys(), arrs)}
                yield sample
def __getitem__(self, index): with dask.config.set(scheduler="sync", delayed_pure=True): arrs = [self._ds[index : index + 1].values() for i in range(1)] arrs = list(map(lambda x: x._array, _flatten(arrs))) arrs = dask.delayed(list, pure=True, nout=len(list(self._ds.keys())))(arrs) arrs = arrs.compute() arrs = {key: r[0] for key, r in zip(self._ds[index].keys(), arrs)} objs = self._do_transform(arrs) if isinstance(objs, dict): objs = {k: self._to_tensor(k, v) for k, v in objs.items()} elif isinstance(objs, list): objs = [self._to_tensor(v) for v in objs] return objs
def test_flatten_array():
    """_flatten merges one level of nesting into a single flat list."""
    assert _flatten([[1, 2], [3, 4, 5]]) == [1, 2, 3, 4, 5]