def load_hdf5(data_fp):
    """Read every column listed in an HDF5 file and build a dataset from them.

    The column names are taken from the ``HDF5_COLUMNS_KEY`` entry of the
    file (stored as bytes, decoded here as UTF-8). Each named entry is then
    read in full and collected into a ``{column name: values}`` mapping.

    Args:
        data_fp: Location of the HDF5 file; handed to ``download_h5``.

    Returns:
        The result of ``from_numpy_dataset`` applied to the collected mapping.
    """
    with download_h5(data_fp) as h5_file:
        encoded_names = h5_file[HDF5_COLUMNS_KEY][()].tolist()
        column_names = [raw_name.decode("utf-8") for raw_name in encoded_names]
        # ``[()]`` reads the whole entry into memory, so nothing below
        # depends on the file staying open.
        arrays_by_column = {name: h5_file[name][()] for name in column_names}

    return from_numpy_dataset(arrays_by_column)
def load_hdf5(data_fp, clean_cols: bool = False):
    """Read every column listed in an HDF5 file and build a dataset from them.

    The column names are taken from the ``HDF5_COLUMNS_KEY`` entry of the
    file (stored as bytes, decoded here as UTF-8). Each named entry is read
    in full and stored under either its original name or, when
    ``clean_cols`` is set, the name with its last ``_``-separated suffix
    removed.

    Args:
        data_fp: Location of the HDF5 file; handed to ``download_h5``.
        clean_cols: When true, drop everything from the last underscore
            onwards in each column name (``'Survived_a2fv4'`` becomes
            ``'Survived'``). Names without an underscore are kept as they are.

    Returns:
        The result of ``from_numpy_dataset`` applied to the collected mapping.
    """

    def _output_name(stored_name):
        # Column names from training hdf5 will be in the form 'Survived_a2fv4'
        if clean_cols:
            return stored_name.rsplit("_", 1)[0]
        return stored_name

    with download_h5(data_fp) as h5_file:
        encoded_names = h5_file[HDF5_COLUMNS_KEY][()].tolist()
        stored_names = [raw_name.decode("utf-8") for raw_name in encoded_names]

        arrays_by_column = {}
        for stored_name in stored_names:
            # The file is always read with the stored name; only the key in
            # the resulting mapping is cleaned.
            arrays_by_column[_output_name(stored_name)] = h5_file[stored_name][()]

    return from_numpy_dataset(arrays_by_column)
def get(self, proc_column, idx=None):
    """Return the values of ``proc_column`` for the requested rows.

    When the column is held in memory, the values come straight from
    ``self.dataset``. Otherwise the entries of ``self.dataset[proc_column]``
    are treated as row numbers into the ``<proc_column>_data`` entry of the
    HDF5 file at ``self.data_hdf5_fp``, and the corresponding rows are read
    from that file.

    Args:
        proc_column: Key of the column in ``self.dataset`` / ``self.features``.
        idx: Row selection applied to ``self.dataset[proc_column]``.
            Defaults to every row (``range(self.size)``).

    Returns:
        ``self.dataset[proc_column][idx]`` for in-memory columns; otherwise
        the rows read from the HDF5 file, in the same order as the selected
        row numbers (repeated row numbers yield repeated rows).
    """
    if idx is None:
        idx = range(self.size)

    # A column counts as in-memory unless an HDF5 file is configured AND the
    # feature's preprocessing section explicitly sets 'in_memory' to a falsy
    # value. The checks short-circuit in this order on purpose: the feature
    # metadata is not touched at all when no HDF5 file is configured.
    held_in_memory = (
        self.data_hdf5_fp is None
        or PREPROCESSING not in self.features[proc_column]
        or 'in_memory' not in self.features[proc_column][PREPROCESSING]
        or self.features[proc_column][PREPROCESSING]['in_memory']
    )

    selected = self.dataset[proc_column][idx]
    if held_in_memory:
        return selected

    # h5py fancy indexing needs strictly increasing coordinates, so read the
    # sorted, de-duplicated row numbers once and then expand/re-order the
    # result back to the requested order with the inverse mapping.
    row_numbers = np.asarray(selected, dtype=np.int64)
    unique_rows, restore_order = np.unique(row_numbers, return_inverse=True)

    with download_h5(self.data_hdf5_fp) as h5_file:
        # The stored entry is indexed as 3-D (row, :, :).
        # NOTE(review): presumably per-row image data -- confirm with the writer.
        stored_rows = h5_file[proc_column + '_data'][unique_rows, :, :]

    return stored_rows[restore_order]