def read_sents(self, filename, filter_ids=None):
    """Yield one ``ArrayInput`` per array stored in an ``.npz`` archive.

    Entries are visited in ascending order of the integer that follows the
    last underscore in the entry name. Each array is optionally transposed
    (``self.transpose``) and then sliced: axis 0 by
    ``self.feat_from:self.feat_to:self.feat_skip`` and axis 1 by
    ``:self.timestep_truncate:self.timestep_skip``.

    Args:
        filename: Archive to read; handed to ``np.load`` unchanged.
        filter_ids: Optional iterable of positions into the sorted entry
            list. When given, only those entries are read, and they are
            still yielded in sorted entry order.

    Yields:
        ``ArrayInput`` wrapping the (possibly transposed and sliced) array.
    """
    def entry_number(name):
        # Entry names are expected to end in "_<int>".
        return int(name.split('_')[-1])

    # Memory-mapping is requested only when a subset of entries is selected.
    mmap_mode = None if filter_ids is None else "r"

    # The context manager guarantees the archive is closed even when the
    # consumer abandons the generator early or an exception is raised while
    # iterating; a trailing close() after the loop would be skipped then.
    with np.load(filename, mmap_mode=mmap_mode) as npz_file:
        keys = sorted(npz_file.files, key=entry_number)
        if filter_ids is not None:
            keys = sorted((keys[i] for i in filter_ids), key=entry_number)
        total = len(keys)

        for idx, key in enumerate(keys):
            inp = npz_file[key]
            if self.transpose:
                inp = inp.transpose()
            sub_inp = inp[self.feat_from:self.feat_to:self.feat_skip,
                          :self.timestep_truncate:self.timestep_skip]
            if sub_inp.size < inp.size:
                # The slice is a view onto the full array; copy it into its
                # own buffer (presumably so the result does not pin the
                # larger array in memory).
                inp = np.empty_like(sub_inp)
                np.copyto(inp, sub_inp)
            else:
                inp = sub_inp
            if idx % 1000 == 999:
                logger.info(
                    f"Read {idx+1} lines ({float(idx+1)/total*100:.2f}%) of {filename} at {key}"
                )
            yield ArrayInput(inp)
def read_sents(self, filename, filter_ids=None):
    """Yield one ``ArrayInput`` per array stored in an ``.npz`` archive.

    Entries are ordered by the integer that follows the last underscore in
    the entry name. Each array is transposed first when ``self.transpose``
    is truthy.

    Args:
        filename: Archive to read; handed to ``np.load`` unchanged.
        filter_ids: Optional iterable of positions into the sorted entry
            list. When given, only those entries are read and they are
            yielded in the order ``filter_ids`` lists them (no re-sort).
            NOTE(review): confirm that callers rely on this order.

    Yields:
        ``ArrayInput`` wrapping the (possibly transposed) array.
    """
    # Memory-mapping is requested only when a subset of entries is selected.
    mmap_mode = None if filter_ids is None else "r"

    # The context manager guarantees the archive is closed even when the
    # consumer abandons the generator early or an exception is raised while
    # iterating; a trailing close() after the loop would be skipped then.
    with np.load(filename, mmap_mode=mmap_mode) as npz_file:
        keys = sorted(npz_file.files, key=lambda name: int(name.split('_')[-1]))
        if filter_ids is not None:
            keys = [keys[i] for i in filter_ids]
        total = len(keys)

        for idx, key in enumerate(keys):
            inp = npz_file[key]
            if self.transpose:
                inp = inp.transpose()
            if idx % 1000 == 999:
                logger.info(f"Read {idx+1} lines ({float(idx+1)/total*100:.2f}%) of {filename} at {key}")
            yield ArrayInput(inp)
def read_sents(self, filename, filter_ids=None):
    """Yield one ``ArrayInput`` per dataset stored in an HDF5 file.

    Dataset names are interpreted as integers and visited in ascending
    numeric order. Each dataset is read fully, optionally transposed
    (``self.transpose``), and then sliced: axis 0 by
    ``self.feat_from:self.feat_to:self.feat_skip`` and axis 1 by
    ``:self.timestep_truncate:self.timestep_skip``.

    Args:
        filename: File opened read-only with ``h5py.File``.
        filter_ids: Optional iterable of positions into the sorted name
            list. When given, only those datasets are read, and they are
            still yielded in ascending numeric order.

    Yields:
        ``ArrayInput`` wrapping the (possibly transposed and sliced) array.
    """
    with h5py.File(filename, "r") as hf:
        all_keys = sorted(hf.keys(), key=int)
        h5_keys = (
            all_keys
            if filter_ids is None
            else sorted([all_keys[i] for i in filter_ids], key=int)
        )

        for idx, key in enumerate(h5_keys):
            arr = hf[key][:]
            if self.transpose:
                arr = arr.transpose()

            window = arr[
                self.feat_from:self.feat_to:self.feat_skip,
                :self.timestep_truncate:self.timestep_skip,
            ]
            # A strictly smaller slice is copied into its own buffer with the
            # view's layout; otherwise the view itself is passed on.
            arr = window.copy(order="K") if window.size < arr.size else window

            if idx % 1000 == 999:
                logger.info(f"Read {idx+1} lines ({float(idx+1)/len(h5_keys)*100:.2f}%) of {filename} at {key}")
            yield ArrayInput(arr)