def merge_data(data_list, attrs_list):
    """Merge dictionaries with data.

    Keyword arguments:
    data_list -- the dictionary with data dictionaries (keyed by file name)
    attrs_list -- the dictionary with attribute dictionaries (keyed by file name)
    """
    data = None
    attrs = None

    for f in data_list:
        size = check.get_size(data_list[f])
        if data is None:
            # first file: take its datasets and attributes as the base
            print("\nThe following datasets were found in %s:\n" % f)
            msg.list_dataset(data_list[f])
            data = data_list[f]
            attrs = attrs_list[f]
        else:
            # subsequent files: append entries dataset by dataset
            print("\nAdding %(n)d entries from %(f)s" % {"n": size, "f": f})
            check.check_keys(data, data_list[f])
            check.check_shapes(data, data_list[f])
            for key in data_list[f]:
                data[key] = np.append(data[key], data_list[f][key], axis=0)
            attrs['n_events'] += attrs_list[f]['n_events']

    return data, attrs
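# Usage sketch (not part of the original module): both arguments are
# dictionaries keyed by file name; the file names and toy arrays below are
# invented for illustration only.
def _merge_data_example():
    data_list = {
        "a.h5": {"x": np.zeros((2, 3)), "y": np.arange(2)},
        "b.h5": {"x": np.ones((4, 3)), "y": np.arange(4)},
    }
    attrs_list = {
        "a.h5": {"n_events": 2},
        "b.h5": {"n_events": 4},
    }
    # expected result: 6 entries per dataset and attrs['n_events'] == 6
    return merge_data(data_list, attrs_list)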
def load(filename, mode='r'):
    """Load hdf5 file and print included datasets.

    Keyword arguments:
    filename -- file to load
    mode -- h5py file mode (default: 'r', read-only)
    """
    f = h5py.File(filename, mode)

    print("\nThe following datasets were found in %s:\n" % filename)
    msg.list_dataset(f)

    return f
def get_data(filename, match, keys):
    """Load file, check if it contains match, update datasets based on
    command line options. Return data dictionary.

    Keyword arguments:
    filename -- input hdf5 file
    match -- common key used to order data
    keys -- user-chosen datasets to save
    """
    data = hdf5.load(filename)

    print("\nThe following datasets were found in %s:\n" % filename)
    msg.list_dataset(data)

    check.key_exists(match, data, filename)

    if keys:
        msg.info("Using only: " + keys)
        update_data(data, [k.strip() for k in keys.split(',')], match)

    return data
if __name__ == '__main__':

    msg.box("HDF5 MANIPULATOR: COMBINE")

    args = parser()

    data1 = get_data(args.input1, args.match, args.keys1)
    data2 = get_data(args.input2, args.match, args.keys2)

    check.different_keys(data1, data2, args.match)

    data = merge_data(data1, data2, args.match,
                      args.print_warnings, args.show_progress)

    print("\nThe following datasets will be saved in %s:\n" % args.output)
    msg.list_dataset(data)

    hdf5.save(args.output, data)

    msg.info("Done")
#!/usr/bin/env python
"""Print info on datasets in hdf5 file."""

import sys
sys.path.append('..')

import hdf5
import msg

if __name__ == '__main__':

    if len(sys.argv) != 2:
        print("usage: ./print file")
        sys.exit(1)

    print("\nThe following datasets were found in %s:\n" % sys.argv[1])
    msg.list_dataset(hdf5.load(sys.argv[1]))
def copy(source, output, keys):
    """Copy requested datasets from the source file to the output file."""
    for k in keys:
        if k not in source:
            msg.warning("%s requested, but not found." % k)
            continue
        else:
            msg.info("Copying %s" % k)
            source.copy(k, output)


if __name__ == '__main__':

    msg.box("HDF5 MANIPULATOR: EXTRACT")

    args = parser()

    f = h5py.File(args.input, 'r')
    o = h5py.File(args.output, 'w')

    print("The following datasets were found in %s:\n" % args.input)
    msg.list_dataset(f)

    copy(f, o, [k.strip() for k in args.keys.split(',')])

    if len(o):
        print("\nThe following datasets were saved in %s:\n" % args.output)
        msg.list_dataset(o)
    else:
        msg.warning("No datasets were copied.")

    f.close()
    o.close()

    msg.info("Done")
if __name__ == '__main__':

    if len(sys.argv) < 4:
        usage()

    train_frac, val_frac = get_fractions()

    if train_frac + val_frac > 1.0:
        msg.error("Total fraction must be <= 1.0")
        sys.exit(1)

    f = h5py.File(sys.argv[1], 'r+')

    print("\nThe following datasets were found in %s:\n" % sys.argv[1])
    msg.list_dataset(f)

    N = check.get_size(f)

    nof_train = int(train_frac * N)
    nof_val = int(val_frac * N)
    nof_test = N - nof_train - nof_val

    print("\nThe following split will be used:\n")
    print("\t - training: %d entries" % nof_train)
    print("\t - validation: %d entries" % nof_val)
    print("\t - testing: %d entries" % nof_test)

    train_dict = {name: (0, nof_train) for name in f.keys()}
    valid_dict = {name: (nof_train, nof_train + nof_val) for name in f.keys()}
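# Sketch only (the rest of the original script is not part of this excerpt):
# one way the (start, stop) ranges built above could be applied; the helper
# name `save_subset` and the output file names are assumptions.
def save_subset(source, ranges, filename):
    """Write the selected slice of every dataset to a new hdf5 file."""
    with h5py.File(filename, 'w') as out:
        for name, (start, stop) in ranges.items():
            out.create_dataset(name, data=source[name][start:stop])

# save_subset(f, train_dict, sys.argv[1] + "_train.h5")
# save_subset(f, valid_dict, sys.argv[1] + "_valid.h5")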
if __name__ == '__main__':

    msg.box("HDF5 MANIPULATOR: COMBINE")

    args = parser()

    in1, in2 = load(args.input1), load(args.input2)
    out = h5py.File(args.output, 'w')

    match(in1, in2, out, args.match)
    check.same_sizes(in1, in2)

    keys1 = get_keys(in1, args.keys1, args.match)
    keys2 = get_keys(in2, args.keys2, args.match)

    check.check_duplicates(keys1, keys2)

    copy(in1, out, keys1)
    copy(in2, out, keys2)

    print("\nThe following datasets were saved in %s:\n" % args.output)
    msg.list_dataset(out)

    in1.close()
    in2.close()
    out.close()

    msg.info("Done")


if __name__ == '__main__':

    if len(sys.argv) < 3:
        print("usage: ./diff file1 file2 [fullcheck]")
        sys.exit(1)

    in1 = h5py.File(sys.argv[1], 'r')
    in2 = h5py.File(sys.argv[2], 'r')

    print("\nThe following datasets were found in %s:\n" % sys.argv[1])
    msg.list_dataset(in1)
    print("\nThe following datasets were found in %s:\n" % sys.argv[2])
    msg.list_dataset(in2)

    check.check_keys(in1, in2)
    check.same_sizes(in1, in2)
    check.check_shapes(in1, in2)

    for key in in1:
        try:
            if not np.array_equal(in1[key], in2[key]):
                msg.error("%s datasets are different." % key)
                sys.exit(1)
            else:
import sys
sys.path.append('..')

import numpy as np

import hdf5
import msg
import check

if __name__ == '__main__':

    if len(sys.argv) != 3:
        print("usage: ./diff file1 file2")
        sys.exit(1)

    data1 = hdf5.load(sys.argv[1])
    data2 = hdf5.load(sys.argv[2])

    print("\nThe following datasets were found in %s:\n" % sys.argv[1])
    msg.list_dataset(data1)
    print("\nThe following datasets were found in %s:\n" % sys.argv[2])
    msg.list_dataset(data2)

    check.check_keys(data1, data2)

    if check.get_size(data1) != check.get_size(data2):
        msg.error("Different number of entries.")
        sys.exit(1)

    check.check_shapes(data1, data2)

    for key in data1:
        if not np.equal(data1[key], data2[key]).all():
            msg.error("Different entries for dataset: %s" % key)
            sys.exit(1)