def create_datasets(output, source, size):
    """Prepare datasets for merged file (based on one of input files).

    For every key found in `source`, a dataset with the same name and dtype
    is created in `output`, gzip-compressed, whose shape equals the source
    shape except that the first dimension is replaced by `size`.

    Keyword arguments:
    output -- output merged hdf5 file
    source -- path to one of input hdf5 files
    size -- total number of entries per dataset
    """
    data = load(source)

    try:
        for key in data:
            template = data[key]
            shape = list(template.shape)
            shape[0] = size  # only the number of entries differs from the source
            output.create_dataset(key, shape,
                                  dtype=template.dtype,
                                  compression='gzip')
    finally:
        # release the source handle even if creating a dataset failed
        data.close()
""" import os from parser import get_args_split as parser import msg import hdf5 from combine_big import load from split import generate_uneven_filelist from split import save_filelist if __name__ == '__main__': msg.box("HDF5 MANIPULATOR: SPLIT") args = parser() data = load(args.input) # TODO - come up with a clever way to generalize this... new_sizes = [(0, 15000), (15000, 17500), (17500, 20000)] new_names_ext = ['_train.hdf5', '_valid.hdf5', '_test.hdf5'] new_filelist = zip(new_names_ext, new_sizes) filelist = generate_uneven_filelist( args.prefix or os.path.splitext(args.input)[0], new_filelist) print("\nSaving output files:\n") for f, r in filelist.items(): msg.list_fileinfo(f, r) hdf5.save_subset_big(f, data, r[0], r[1])
import os
from parser import get_args_split as parser
import msg
import hdf5
import h5py
import check
from combine_big import load
from split import generate_filelist
from split import save_filelist

if __name__ == '__main__':

    msg.box("HDF5 MANIPULATOR: SPLIT")

    args = parser()
    data = load(args.input)

    try:
        # output names are built from the given prefix, or from the input
        # path with its extension stripped (this is what needs `os`)
        filelist = generate_filelist(
            args.prefix or os.path.splitext(args.input)[0],
            check.get_size(data), int(args.size))

        # print() with a single string and dict.items() behave the same on
        # Python 2 and Python 3
        print("\nSaving output files:\n")

        for f, r in filelist.items():
            msg.list_fileinfo(f, r)
            hdf5.save_subset_big(f, data, r[0], r[1])

        if args.filelist:
            save_filelist(args.filelist, filelist.keys())
    finally:
        # close the input even if saving one of the subsets failed
        data.close()
Rename dataset """ import os import sys import h5py from parser import get_args_rename as parser from combine_big import load import msg if __name__ == '__main__': msg.box("HDF5 MANIPULATOR: COMBINE") args = parser() f = load(args.input, 'r+') if args.dataset not in f: msg.error("There is no %(key)s in %(file)s." % {"key": args.dataset, "file": args.input}) sys.exit(1) if args.name in f: msg.error("There is %(key)s already in %(file)s." % {"key": args.name, "file": args.input}) sys.exit(1) f[args.name] = f[args.dataset] del f[args.dataset] f.close()
Rename dataset """ import os import sys import h5py from parser import get_args_rename as parser from combine_big import load import msg if __name__ == '__main__': msg.box("HDF5 MANIPULATOR: COMBINE") args = parser() f = load(args.input, 'r+') if args.dataset not in f: msg.error("There is no %(key)s in %(file)s." % { "key": args.dataset, "file": args.input }) sys.exit(1) if args.name in f: msg.error("There is %(key)s already in %(file)s." % { "key": args.name, "file": args.input }) sys.exit(1)