def test_create_dataset_with_format_change():
    h5_path = mkstemp(suffix=".h5")[1]
    sparse_matrix = ss.csr_matrix(
        [[0, 1, 0, 1],
         [0, 0, 1, 0],
         [0, 0, 0, 1],
         [1, 1, 0, 1]],
        dtype=np.float64)
    with h5sparse.File(h5_path, 'w') as h5f:
        h5f.create_dataset('sparse/matrix', data=sparse_matrix,
                           sparse_format='csc')
    with h5sparse.File(h5_path) as h5f:
        assert 'sparse' in h5f
        assert 'matrix' in h5f['sparse']
        assert h5f['sparse']['matrix'].format_str == 'csc'
        result_matrix = h5f['sparse']['matrix'][()]
        assert isinstance(result_matrix, ss.csc_matrix)
        assert (result_matrix != sparse_matrix).size == 0
        # CSC datasets are sliced along columns, so the slices are compared
        # against column slices of the original matrix.
        assert (h5f['sparse']['matrix'][1:3] != sparse_matrix[:, 1:3]).size == 0
        assert (h5f['sparse']['matrix'][2:] != sparse_matrix[:, 2:]).size == 0
        assert (h5f['sparse']['matrix'][:2] != sparse_matrix[:, :2]).size == 0
        assert (h5f['sparse']['matrix'][-2:] != sparse_matrix[:, -2:]).size == 0
        assert (h5f['sparse']['matrix'][:-2] != sparse_matrix[:, :-2]).size == 0
    os.remove(h5_path)
def __init__(self, gvm_part_fname, gvm_path, indices=None, batch_size=128,
             shuffle=True, verbose=True):
    'Initialization'
    with h5sparse.File(gvm_part_fname, 'r') as h:
        if gvm_path is None:
            self.GVD = h[()]
        else:
            self.GVD = h[gvm_path][()]
    # If no indices are specified, use all of them.
    if indices is None:
        indices = list(range(self.GVD.shape[0]))
    self.indices = list(map(int, indices))
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.verbose = verbose
    self.on_epoch_end()
def processdata(id):
    datalabels = array([1, 7, 41, 42, 43, 44, 61, 69, 71, 72, 74],
                       dtype=uint8)
    with h5py.File('/datadrive/musicnet.h5', 'r') as file:
        with h5sparse.File('/datadrive/labels.h5') as sparse:
            doit = join('sparse', 'matrix', id) not in sparse
        if doit:
            size = len(file[id]['data'])
            bar = Bar('Processing ' + str(id), max=len(file[id]['labels']))
            segments = lil_matrix((size, 11), dtype=uint8)
            for _, end_time, instrument, _, note, _, start_time in file[id][
                    'labels']:
                label_index = list(datalabels == instrument).index(True)
                for t in range(start_time, end_time):
                    segments[t, label_index] = note
                bar.next()
            with h5sparse.File('/datadrive/labels.h5') as sparse:
                sparse.create_dataset(join('sparse', 'matrix', id),
                                      data=segments.tocsr())
            bar.finish()
            print('Done!')
            del segments
            collect()
def test_marginal_likelihood_hdf5(self):
    """
    I don't think this gets us much, because `.value` will probably load
    the complete sparse matrix.
    :return:
    """
    transition_counts = csr_matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                                   dtype=np.float64)
    transition_probabilities = normalize(transition_counts, "l1", axis=1)
    pseudo_counts = transition_probabilities * 5
    if os.path.isfile("test.h5"):
        os.remove("test.h5")
    with h5sparse.File("test.h5") as h5f:
        h5f.create_dataset('transition/counts', data=transition_counts)
        h5f.create_dataset('pseudo/counts', data=pseudo_counts)
    h5f = h5sparse.File("test.h5")
    print(h5f['transition/counts'].h5py_group.attrs["h5sparse_shape"])
    print(h5f['transition/counts'][0:2])
    ml = MarkovChain.marginal_likelihood(h5f['transition/counts'].value,
                                         h5f['pseudo/counts'].value,
                                         smoothing=1.0)
    os.remove("test.h5")
    print(ml)
def test_create_empty_dataset():
    h5_path = mkstemp(suffix=".h5")[1]
    with h5sparse.File(h5_path) as h5f:
        h5f.create_dataset('empty_data', shape=(100, 200))
    with h5sparse.File(h5_path) as h5f:
        assert h5f['empty_data'].shape == (100, 200)
    os.remove(h5_path)
def merge_hdf5s(hdf5_paths, output_path):
    arrays_to_merge = []
    for hdf5_path in hdf5_paths:
        arrays_to_merge.append(h5sparse.File(hdf5_path)["data"].value)
    merged_array = scipy.sparse.hstack(arrays_to_merge, format="coo")
    output_file = h5sparse.File(output_path, "w", libver="latest")
    output_file.create_dataset("data", data=merged_array.toarray())
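# A hedged aside on the example above: create_dataset is handed
# merged_array.toarray(), so the merged matrix is written to disk dense.
# If the goal is to keep the result sparse on disk, h5sparse accepts a
# scipy CSR matrix directly (as the test examples in this collection show),
# so a sketch of the alternative would be:
#
#     output_file.create_dataset("data", data=merged_array.tocsr())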
def yPrepare(userNowDealing, user_id_dict, item_id_dict, filePlace, item_id,
             user_id, target_id, train):
    usrList = [user_id_dict[user] for user in user_id_dict]
    usrList.sort()
    usrList = np.array(usrList)

    # select the data from train which is related to usrNowDealing
    with h5sparse.File(filePlace + "userdot_cosine.h5") as user_net:
        # get the relationship between user u and the other users:
        usrRelationship = user_net['dot_cosineData/data'][
            userNowDealing:(userNowDealing + 1)].toarray().ravel()

    # nonzero entries mark related users (cast to bool for mask indexing)
    usrHasRelation = usrList[usrRelationship.astype(bool)]
    trainHasRelation = (train[train[user_id].isin(usrHasRelation)]
                        .sort_values(by=[item_id]))
    relatedItem = trainHasRelation[item_id].values
    relatedUser = trainHasRelation[user_id].values
    relatedTarget = trainHasRelation[target_id]

    # get the item-train-sized network
    with h5sparse.File(filePlace + "itemdot_cosine.h5") as item_net:
        itemRelationship = item_net['dot_cosineData/data']
        itemToitemNetRelated = [
            itemRelationship[itemRelated:(itemRelated + 1)]
            for itemRelated in relatedItem
        ]
        itemToitemNetRelated = vstack(itemToitemNetRelated).transpose()

    # get the item-train-sized kernel data
    # get the item-train-sized x data
    with h5sparse.File(filePlace + "itemdis.h5") as item_dis:
        itemDisRelationship = item_dis['disData/data']
        itemToitemDisRelated = [
            itemDisRelationship[itemDisRelated:(itemDisRelated + 1)]
            for itemDisRelated in relatedItem
        ]
        itemToitemDisRelated = vstack(itemToitemDisRelated).transpose()

    # broadcast with the user-train-sized x data
    with h5sparse.File(filePlace + "userdis.h5") as user_dis:
        userDisRelationship = user_dis['disData/data'][
            userNowDealing:(userNowDealing + 1)]
        userDisRelationship = userDisRelationship[:, relatedUser].todense()

    itemToitemDisRelated = itemToitemDisRelated + userDisRelationship
    itemToitemDisRelated = kernel(itemToitemDisRelated)

    weight = itemToitemNetRelated.multiply(itemToitemDisRelated)
    weight_sum = diags(1 / weight.sum(1).A.ravel())
    weight = weight_sum @ weight
    y = weight.dot(relatedTarget.transpose())
    return y
def largeMatrixDis(largeDisMatrix,
                   num=2,
                   netFilePlace="C:\\Users\\22560\\Desktop\\",
                   prefix="item"):
    # load the social network
    with h5sparse.File(netFilePlace + prefix + "dot_cosine.h5") as h5f:
        (rowNum, colNum) = largeDisMatrix.shape
        sep = np.linspace(0, rowNum, endpoint=True, dtype=np.int64, num=num)
        yTy = (largeDisMatrix * largeDisMatrix).sum(1)

        print("############# please be patient ############## \n \n")
        for i, j in enumerate(sep):
            if i + 1 < len(sep):
                blockSlice = slice(j, sep[i + 1])
                blockData = largeDisMatrix[blockSlice, :]
                negative2xTy = -2 * blockData.dot(largeDisMatrix.transpose())
                xTx = yTy[blockSlice]
                xTx = xTx.reshape((len(xTx), 1))
                dis = yTy + negative2xTy + xTx
                dis = csr_matrix(dis)
                sparse = h5f['dot_cosineData/data'][blockSlice]
                dis = dis.multiply(sparse)

                if i == 0:
                    # On the first loop iteration, check whether dis.h5
                    # already exists; if it does, clear it, then create
                    # the dataset with the first block.
                    with h5py.File(netFilePlace + prefix + "dis.h5") as h5file:
                        for key in h5file.keys():
                            del h5file[key]
                    with h5sparse.File(netFilePlace + prefix +
                                       "dis.h5") as h5file:
                        h5file.create_dataset("disData/data",
                                              data=dis,
                                              chunks=(10000, ),
                                              maxshape=(None, ))
                else:
                    with h5sparse.File(netFilePlace + prefix +
                                       "dis.h5") as h5file:
                        h5file['disData/data'].append(dis)

                print("Dis for " + prefix + " is now being prepared")
                preparePercent = round(100 * (1 + i) / (len(sep) - 1), 2)
                print(str(preparePercent),
                      "percent of the distance data is prepared")

    print("############# dis data for " + prefix +
          " prepared successfully!! ###########")
def test_create_and_read_dataset():
    h5_path = mkstemp(suffix=".h5")[1]
    sparse_matrix = ss.csr_matrix([[0, 1, 0],
                                   [0, 0, 1],
                                   [0, 0, 0],
                                   [1, 1, 0]],
                                  dtype=np.float64)
    with h5sparse.File(h5_path) as h5f:
        h5f.create_dataset('sparse/matrix', data=sparse_matrix)
    with h5sparse.File(h5_path) as h5f:
        assert (h5f['sparse']['matrix'][1:3] != sparse_matrix[1:3]).size == 0
        assert (h5f['sparse']['matrix'][2:] != sparse_matrix[2:]).size == 0
        assert (h5f['sparse']['matrix'][:2] != sparse_matrix[:2]).size == 0
        assert (h5f['sparse']['matrix'][-2:] != sparse_matrix[-2:]).size == 0
        assert (h5f['sparse']['matrix'][:-2] != sparse_matrix[:-2]).size == 0
        assert (h5f['sparse']['matrix'].value != sparse_matrix).size == 0
    os.remove(h5_path)
def test_create_dataset_from_dataset():
    from_h5_path = mkstemp(suffix=".h5")[1]
    to_h5_path = mkstemp(suffix=".h5")[1]
    sparse_matrix = ss.csr_matrix([[0, 1, 0],
                                   [0, 0, 1],
                                   [0, 0, 0],
                                   [1, 1, 0]],
                                  dtype=np.float64)
    with h5sparse.File(from_h5_path) as from_h5f:
        from_dset = from_h5f.create_dataset('sparse/matrix',
                                            data=sparse_matrix)
        with h5sparse.File(to_h5_path) as to_h5f:
            to_h5f.create_dataset('sparse/matrix', data=from_dset)
            assert (to_h5f['sparse/matrix'].value != sparse_matrix).size == 0
    os.remove(from_h5_path)
    os.remove(to_h5_path)
def run_model(mat_path, meta_path, model_instance, predictions_path,
              model_path, val, logger):
    with timer("read data"):
        meta = pd.read_hdf(meta_path, key="data")
        mat = h5sparse.File(mat_path, mode="r")["matrix"]

    with timer("split data"):
        if val:
            train_ind = np.where((meta.is_val == 0) & (meta.is_test == 0))[0]
            val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0]
        else:
            train_ind = np.where(meta.is_test == 0)[0]
            val_ind = np.where(meta.is_test == 1)[0]
        logger.info(f"Train shape {train_ind.shape[0]} Val shape {val_ind.shape[0]}")
        meta_train = meta.iloc[train_ind]
        meta_val = meta.iloc[val_ind]
        X_train = mat[train_ind.min():(train_ind.max() + 1)]
        X_val = mat[val_ind.min():(val_ind.max() + 1)]
        del mat
        gc.collect()

    with timer("fit model"):
        model_instance.fit(
            X_train,
            meta_train["was_clicked"].values,
            group=group_lengths(meta_train["clickout_id"].values)
        )
        joblib.dump(model_instance, model_path)
        val_pred = model_instance.predict(X_val)
        train_pred = model_instance.predict(X_train)
        logger.info("Train AUC {:.4f}".format(
            roc_auc_score(meta_train["was_clicked"].values, train_pred)))
        if val:
            logger.info("Val AUC {:.4f}".format(
                roc_auc_score(meta_val["was_clicked"].values, val_pred)))
        meta_val["click_proba"] = val_pred
        if val:
            logger.info("Val MRR {:.4f}".format(mrr_fast(meta_val, "click_proba")))
        meta_val.to_csv(predictions_path, index=False)
def open_gvm(fname):
    '''
    Returns a gvm obtained from the input file.
    fname : str
        The name of the file to be obtained as a gvm.
    '''
    with h5sparse.File(fname, 'r') as h:
        data = h['gvm'][()]
        if 'idx' in h:
            idx = h['idx'][()]
            try:
                idx = [i.decode('utf-8', 'ignore') for i in idx]
            except AttributeError:  # entries are already str
                pass
        else:
            idx = None
        if 'col' in h:
            col = h['col'][()]
            try:
                col = [i.decode('utf-8', 'ignore') for i in col]
            except AttributeError:  # entries are already str
                pass
        else:
            col = None
    return {'gvm': data, 'idx': np.array(idx), 'col': np.array(col)}
def open_gvm(fname):
    '''
    Opens and returns the gvm at `fname`.

    Parameters:
        fname (str): The name of the file to be obtained as a gvm.

    Returns:
        gvm (h5sparse): gvm
    '''
    with h5sparse.File(fname, 'r') as h:
        data = h['gvm'][()]
        if 'idx' in h:
            idx = h['idx'][()]
            try:
                idx = [i.decode('utf-8', 'ignore') for i in idx]
            except AttributeError:  # entries are already str
                pass
        else:
            idx = None
        if 'col' in h:
            col = h['col'][()]
            try:
                col = [i.decode('utf-8', 'ignore') for i in col]
            except AttributeError:  # entries are already str
                pass
        else:
            col = None
    return {'gvm': data, 'idx': np.array(idx), 'col': np.array(col)}
def __init__(self, h5_fname):
    """Creates a :class:`PredictSession` from a given HDF5 file

    Parameters
    ----------
    h5_fname : string
        Name of the HDF5 file.
    """
    self.h5_file = h5sparse.File(h5_fname, 'r')
    self.options = self.h5_file["config/options"].attrs
    self.nmodes = int(self.options['num_priors'])
    self.num_latent = int(self.options["num_latent"])
    self.data_shape = self.h5_file["config/train/data"].shape

    assert self.nmodes == 2

    self.samples = []
    for name in self.h5_file.keys():
        if not name.startswith("sample_"):
            continue
        sample = Sample(self.h5_file, name, self.num_latent)
        self.samples.append(sample)
        self.beta_shape = sample.beta_shape()

    if len(self.samples) == 0:
        raise ValueError("No samples found in " + h5_fname)

    self.samples.sort(key=lambda x: x.no)
    self.num_samples = len(self.samples)
def test_create_empty_sparse_dataset():
    h5_path = mkstemp(suffix=".h5")[1]
    with h5sparse.File(h5_path) as h5f:
        h5f.create_dataset('sparse/matrix', sparse_format='csr')
    with h5sparse.File(h5_path) as h5f:
        assert 'sparse' in h5f
        assert 'matrix' in h5f['sparse']
        assert h5f['sparse']['matrix'].format_str == 'csr'
        result_matrix = h5f['sparse']['matrix'][()]
        assert isinstance(result_matrix, ss.csr_matrix)
        assert result_matrix.shape == (0, 0)
        assert result_matrix.dtype == np.float64
        assert h5f['sparse']['matrix'].shape == (0, 0)
        assert h5f['sparse']['matrix'].dtype == np.float64
    os.remove(h5_path)
def test_numpy_array():
    h5_path = mkstemp(suffix=".h5")[1]
    matrix = np.random.rand(3, 5)
    with h5sparse.File(h5_path) as h5f:
        h5f.create_dataset('matrix', data=matrix)
        assert 'matrix' in h5f
        np.testing.assert_equal(h5f['matrix'][()], matrix)
    os.remove(h5_path)
def test_bytestring():
    h5_path = mkstemp(suffix=".h5")[1]
    strings = [str(i) for i in range(100)]
    data = json.dumps(strings).encode('utf8')
    with h5sparse.File(h5_path) as h5f:
        h5f.create_dataset('strings', data=data)
        assert 'strings' in h5f
        assert strings == json.loads(h5f['strings'][()].decode('utf8'))
    os.remove(h5_path)
def verify_hdf5(matrix_path, test_yaml_path):
    with open(test_yaml_path) as f:
        expected_values = yaml.safe_load(f)['expected_output']
    output_matrix = h5sparse.File(matrix_path)["data"].value
    assert numpy.count_nonzero(
        output_matrix) == expected_values["non_zero_count"]
    assert numpy.sum(output_matrix) == expected_values["sum"]
    assert tuple(output_matrix.shape) == tuple(expected_values["shape"])
def relationToNetwork(objectRelationship, numOfUser, ifHasitsOwn, ifBIGDATA,
                      prefix, fileplace):
    # objectRelationship = userRelationship
    # The relation looks like:
    #     userID  friendID  value
    #     1890    1625      1
    #     1890    1807      1
    #     1890    1816      1
    #     1891    548       1
    #     1891    564       1
    if ifBIGDATA == False:
        columns = objectRelationship.columns.tolist()
        objectNetwork = csr_matrix(
            (objectRelationship.loc[:, columns[2]],
             (objectRelationship.loc[:, columns[0]],
              objectRelationship.loc[:, columns[1]])),
            shape=(numOfUser, numOfUser),
            dtype=np.float32)
        ######### code only used for lastFM ##########################
        # #objectNetwork = objectNetwork
        # objectNetwork[objectNetwork > 0] = 0.8
        # objectNetwork[objectNetwork==0] = 0.5
        ##############################################################
        with h5py.File(fileplace + prefix + 'dot_cosine.h5') as h5f:
            for key in h5f.keys():
                del h5f[key]
        with h5sparse.File(fileplace + prefix + 'dot_cosine.h5') as h5f:
            if ifHasitsOwn:
                h5f.create_dataset("dot_cosineData/data",
                                   data=objectNetwork,
                                   chunks=(10000, ),
                                   maxshape=(None, ))
            else:
                objectNetwork = objectNetwork + diags(np.ones(numOfUser))
                ######### code only used for lastFM ##########################
                # objectNetwork[objectNetwork == 1.5] = 0.8
                ##############################################################
                h5f.create_dataset("dot_cosineData/data",
                                   data=objectNetwork,
                                   chunks=(10000, ),
                                   maxshape=(None, ))
    else:
        # Requires that the rows of the data are already sorted in order.
        pass
def get_gvm_size(fname):
    '''
    Returns the gvm shape.

    Parameters:
        fname (str): filepath of gvm

    Returns:
        gvm_size (tuple): width and height of gvm array
    '''
    with h5sparse.File(fname, 'r') as h:
        return h['gvm'].shape
def write_gvm(gvm, output_fname, fmt='h5'):
    if fmt == 'pd':
        if type(gvm) != pd.DataFrame:
            temp = pd.DataFrame(gvm['gvm'].todense())
            if ('idx' not in gvm) or np.all(np.array(gvm['idx'] == None)):
                gvm['idx'] = np.array(range(gvm['gvm'].shape[0]))
            if ('col' not in gvm) or np.all(np.array(gvm['col'] == None)):
                gvm['col'] = np.array(range(gvm['gvm'].shape[1]))
            temp.index = gvm['idx']
            temp.columns = gvm['col']
            gvm = temp
        gvm = gvm.replace(to_replace=False, value='')
        gvm.to_csv(output_fname, sep='\t')
    elif fmt == 'h5':
        if type(gvm) == pd.DataFrame:
            gvm = {
                'gvm': csc_matrix(gvm.values),
                'idx': gvm.index.values,
                'col': gvm.columns.values
            }
        if ('idx' not in gvm) or np.all(np.array(gvm['idx'] == None)):
            gvm['idx'] = np.array(range(gvm['gvm'].shape[0]))
        if ('col' not in gvm) or np.all(np.array(gvm['col'] == None)):
            gvm['col'] = np.array(range(gvm['gvm'].shape[1]))
        if not (np.all([isinstance(i, int) for i in gvm['idx']])
                or np.all([isinstance(i, float) for i in gvm['idx']])):
            try:
                gvm['idx'] = np.array(
                    [i.encode('utf-8', 'ignore') for i in gvm['idx']],
                    dtype=np.string_)
            except AttributeError:
                gvm['idx'] = np.array(gvm['idx'], dtype=np.string_)
        if not (np.all([isinstance(i, int) for i in gvm['col']])
                or np.all([isinstance(i, float) for i in gvm['col']])):
            try:
                gvm['col'] = np.array(
                    [i.encode('utf-8', 'ignore') for i in gvm['col']],
                    dtype=np.string_)
            except AttributeError:
                gvm['col'] = np.array(gvm['col'], dtype=np.string_)
        with h5sparse.File(output_fname, 'w') as h:
            h.create_dataset('gvm', data=gvm['gvm'])
            h.create_dataset('idx', data=np.array(gvm['idx']))
            h.create_dataset('col', data=np.array(gvm['col']))
    else:
        raise ValueError('unrecognized format %s' % fmt)
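# A minimal round-trip sketch using write_gvm and open_gvm from the examples
# above. The DataFrame input path of the fmt == 'h5' branch is assumed, and
# the file name 'toy_gvm.h5' is hypothetical.
import numpy as np
import pandas as pd

toy = pd.DataFrame(np.eye(3),
                   index=['gene_a', 'gene_b', 'gene_c'],
                   columns=['set_1', 'set_2', 'set_3'])
write_gvm(toy, 'toy_gvm.h5', fmt='h5')
roundtrip = open_gvm('toy_gvm.h5')
assert roundtrip['gvm'].shape == (3, 3)
assert list(roundtrip['idx']) == ['gene_a', 'gene_b', 'gene_c']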
def csr_matrix_to_h5(matrix, file, name):
    """
    :type matrix: csr_matrix
    :param matrix: the matrix to serialize
    :type file: str
    :param file: HDF5 file to hold the matrix
    :type name: str
    :param name: name of the matrix in the HDF5 file
    """
    with h5sparse.File(file) as h5f:
        h5f.create_dataset(name, data=matrix)
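# A hedged companion sketch, not part of the original module: read the matrix
# back. Indexing an h5sparse dataset with [()] returns the full scipy.sparse
# matrix, as the test examples in this collection rely on.
def h5_to_csr_matrix(file, name):
    """
    :type file: str
    :param file: HDF5 file holding the matrix
    :type name: str
    :param name: name of the matrix in the HDF5 file
    """
    with h5sparse.File(file, 'r') as h5f:
        return h5f[name][()]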
def to_sparse_hdf5(df, cell_name):
    """Convert a dataframe to a sparse representation in an hdf5 file."""
    for major in ("csc", "csr"):
        path = _create_path("h5sparse", cell_name, major, "h5")
        matrix_class = getattr(scipy.sparse, major + "_matrix")
        sparse_matrix = matrix_class(df.values)
        f = h5sparse.File(path, 'w')
        f.create_dataset("data", data=sparse_matrix)
        f.h5f.attrs["hdf5_version"] = h5py.version.hdf5_version
        f.h5f.attrs["h5py_version"] = h5py.version.version
        f.h5f.close()
def prepareY(doRegression, networkSavingplace, trainAfterTransform, userID,
             itemID, targetID, trainAfterDealingName):
    # read train data
    train = pd.read_csv(trainAfterTransform)
    train = train.sort_values(by=[userID, itemID])

    # do not do regression on the first try
    if doRegression == True:
        weight = regression(train)
        train['weight'] = weight

    train.to_csv(trainAfterDealingName, index=False)
    gc.collect()
    print("train data has been prepared!")

    # we now start to prepare y
    user_num = train[userID].max() + 1

    # calculate h later
    # h is the sum of the median of the useful user distances and the
    # median of the useful item distances
    with h5sparse.File(networkSavingplace + "itemdis.h5") as item_dis, \
            h5sparse.File(networkSavingplace + "userdis.h5") as user_dis:
        idis = item_dis['disData/data'].value.data.copy()
        idis = idis[idis > 0]
        idis.sort()
        udis = user_dis['disData/data'].value.data.copy()
        udis = udis[udis > 0]
        udis.sort()
        h = np.median(udis) + np.median(idis)
        del udis, idis

    yPrepareForSmallData(user_num, networkSavingplace, itemID, userID,
                         targetID, train, h)
def marginal_likelihood(transition_counts_h5file, transition_counts_h5name,
                        transition_probabilities_h5file,
                        transition_probabilities_h5name,
                        concentration_factors=None,
                        smoothing=1.0):
    # TODO: add chunking and parallelization?
    """
    :param transition_counts_h5file: path of the HDF5 file holding the
        transition counts
    :param transition_counts_h5name: name of the counts dataset in that file
    :param transition_probabilities_h5file: path of the HDF5 file holding
        the transition probabilities
    :param transition_probabilities_h5name: name of the probabilities
        dataset in that file
    :param concentration_factors: iterable of concentration factors to
        evaluate (must be provided despite the None default)
    :param smoothing: smoothing value passed through to
        HypTrailsMarkovChain.marginal_likelihood
    :return: array with one accumulated marginal likelihood per
        concentration factor
    """
    transition_counts_h5f = h5sparse.File(
        transition_counts_h5file)[transition_counts_h5name]
    transition_probabilities_h5f = h5sparse.File(
        transition_probabilities_h5file)[transition_probabilities_h5name]
    shape = transition_counts_h5f.h5py_group.attrs["h5sparse_shape"]
    ml = np.zeros(len(concentration_factors))
    for row in range(shape[0]):
        transition_counts_row = transition_counts_h5f[row:row + 1]
        transition_probabilities_row = transition_probabilities_h5f[
            row:row + 1]
        ml_row = np.array([
            HypTrailsMarkovChain.marginal_likelihood(
                transition_counts_row, transition_probabilities_row * cf,
                smoothing) for cf in concentration_factors
        ])
        ml += ml_row
    return ml
def convert_to_sparse_hdf5(df, major="csc"):
    """Convert a dataframe to a sparse representation in an hdf5 file."""
    path = _get_temp_path(".h5")
    matrix_class = getattr(scipy.sparse, major + "_matrix")
    sparse_matrix = matrix_class(df.values)
    f = h5sparse.File(path, 'w', libver='latest')
    f.create_dataset("data", data=sparse_matrix)
    f.h5f.attrs["hdf5_version"] = h5py.version.hdf5_version
    f.h5f.attrs["h5py_version"] = h5py.version.version
    f.h5f.close()
    return path
def __init__(self, dataset, mat=None):
    if isinstance(dataset, str):
        self.dataset = h5sparse.File(dataset)
    else:
        self.dataset = dataset
    if mat is not None:
        self.data = mat
    else:
        self.data = self.dataset['sparse/matrix'][:]
    self.reference = self.dataset['reference/data'][:]
    self.x = self.dataset['reference/x'][:]
    self.y = self.dataset['reference/y'][:]
    self.z = self.dataset['reference/z'][:]
    self.voxel_num = self.dataset['reference/voxel_num'][:]
def update_context(self, context, data_definition, **kwargs):
    args = H5pyDataHandlerArgs(**kwargs)
    if args.create_dataset_context is None:
        return
    functions = context.setdefault(args.create_dataset_context, {})
    assert data_definition.key not in functions

    # open h5
    assert data_definition not in self.h5f_dict
    hdf_path = self._get_hdf_path(data_definition)
    assert not hdf_path.exists()
    h5f = h5sparse.File(hdf_path, 'w')
    self.h5f_dict[data_definition] = h5f

    functions[data_definition.key] = partial(h5f.create_dataset, 'data')
def __init__(self, window=None, epoch_size=100000):
    self.labels = array([1, 7, 41, 42, 43, 44, 61, 69, 71, 72, 74],
                        dtype=uint8)
    self.sample_frequency = 44100
    self.window = int(
        window * self.sample_frequency) if window is not None else None
    self.size = epoch_size

    # initialising dataset
    self.data_path = '/datadrive/musicnet.h5'
    self.labels_path = '/datadrive/labels.h5'
    with h5sparse.File(self.labels_path, 'r') as file:
        self.ids = list(file['sparse/matrix'].h5py_group)
def test_dataset_append():
    h5_path = mkstemp(suffix=".h5")[1]
    sparse_matrix = ss.csr_matrix([[0, 1, 0],
                                   [0, 0, 1],
                                   [0, 0, 0],
                                   [1, 1, 0]],
                                  dtype=np.float64)
    to_append = ss.csr_matrix([[0, 1, 1],
                               [1, 0, 0]],
                              dtype=np.float64)
    appended_matrix = ss.vstack((sparse_matrix, to_append))

    with h5sparse.File(h5_path) as h5f:
        h5f.create_dataset('matrix',
                           data=sparse_matrix,
                           chunks=(100000, ),
                           maxshape=(None, ))
        h5f['matrix'].append(to_append)
        assert (h5f['matrix'].value != appended_matrix).size == 0
    os.remove(h5_path)
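# A hedged sketch generalizing the append pattern in test_dataset_append:
# stream a large CSR matrix to disk in row blocks instead of holding it all
# in memory. The chunks/maxshape settings mirror the test above;
# write_csr_in_blocks is a hypothetical helper, not part of h5sparse.
def write_csr_in_blocks(h5_path, blocks):
    """blocks: iterable of scipy.sparse.csr_matrix with matching column counts."""
    blocks = iter(blocks)
    with h5sparse.File(h5_path, 'w') as h5f:
        # The first block creates the appendable dataset...
        h5f.create_dataset('matrix', data=next(blocks),
                           chunks=(100000, ), maxshape=(None, ))
        # ...and every later block is appended in place.
        for block in blocks:
            h5f['matrix'].append(block)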