def diff_h5(input1_path, input2_path, numdiff=10):
    """Print a per-entity, per-period comparison of two HDF5 files.

    Every node found under ``/entities`` in either file is listed by name.
    Entities present in only one file are reported as missing and skipped.
    For entities present in both files, each period indexed in both tables
    is read from each file and the two arrays are handed to ``diff_array``.

    Args:
        input1_path: path of the first HDF5 file ("file 1" in the output).
        input2_path: path of the second HDF5 file ("file 2" in the output).
        numdiff: forwarded unchanged to ``diff_array``; presumably the
            maximum number of differences to display (confirm there).
    """
    input1_file = input2_file = None
    try:
        input1_file = tables.open_file(input1_path, mode="r")
        input2_file = tables.open_file(input2_path, mode="r")

        input1_entities = input1_file.root.entities
        input2_entities = input2_file.root.entities

        # noinspection PyProtectedMember
        ent_names1 = {table._v_name for table in input1_entities}
        # noinspection PyProtectedMember
        ent_names2 = {table._v_name for table in input2_entities}

        for ent_name in sorted(ent_names1 | ent_names2):
            print()
            print(ent_name)

            if ent_name not in ent_names1:
                print("missing in file 1")
                continue
            if ent_name not in ent_names2:
                print("missing in file 2")
                continue

            table1 = getattr(input1_entities, ent_name)
            table2 = getattr(input2_entities, ent_name)
            # index_table_light is used here as a mapping
            # period -> (start, stop) row range within the table
            input1_rows = index_table_light(table1)
            input2_rows = index_table_light(table2)

            # compare as sets so that the check does not depend on the order
            # in which the periods were indexed
            input1_periods = set(input1_rows.keys())
            input2_periods = set(input2_rows.keys())
            if input1_periods != input2_periods:
                print("periods are different in both files for '%s'"
                      % ent_name)

            # only the periods available on both sides can be compared
            for period in sorted(input1_periods & input2_periods):
                print("* period:", period)
                start, stop = input1_rows[period]
                array1 = table1.read(start, stop)
                start, stop = input2_rows[period]
                array2 = table2.read(start, stop)
                diff_array(array1, array2, numdiff)
    finally:
        # release the handles even when opening the second file or the
        # comparison itself fails
        for h5file in (input1_file, input2_file):
            if h5file is not None:
                h5file.close()
def diff_h5(input1_path, input2_path, numdiff=10):
    """Report the differences between the entities of two HDF5 files.

    The names of the nodes below ``/entities`` are collected from both files
    and walked in sorted order. An entity which exists in only one file is
    flagged ("missing in file N") and skipped. Otherwise both tables are
    indexed with ``index_table_light`` and, for each period present in both
    indexes, the matching row ranges are read and passed to ``diff_array``.

    Args:
        input1_path: path of the first HDF5 file.
        input2_path: path of the second HDF5 file.
        numdiff: passed through to ``diff_array`` as its third argument
            (meaning defined by that helper -- TODO confirm).
    """
    # the context managers guarantee that both files are closed, including
    # when the second open or any later step raises
    with tables.open_file(input1_path, mode="r") as input1_file, \
            tables.open_file(input2_path, mode="r") as input2_file:
        input1_entities = input1_file.root.entities
        input2_entities = input2_file.root.entities

        # noinspection PyProtectedMember
        ent_names1 = set(table._v_name for table in input1_entities)
        # noinspection PyProtectedMember
        ent_names2 = set(table._v_name for table in input2_entities)

        for ent_name in sorted(ent_names1 | ent_names2):
            print()
            print(ent_name)

            if ent_name not in ent_names1:
                print("missing in file 1")
                continue
            elif ent_name not in ent_names2:
                print("missing in file 2")
                continue

            table1 = getattr(input1_entities, ent_name)
            input1_rows = index_table_light(table1)

            table2 = getattr(input2_entities, ent_name)
            input2_rows = index_table_light(table2)

            input1_periods = input1_rows.keys()
            input2_periods = input2_rows.keys()
            if set(input1_periods) != set(input2_periods):
                print("periods are different in both files for '%s'"
                      % ent_name)

            common_periods = set(input1_periods) & set(input2_periods)
            for period in sorted(common_periods):
                print("* period:", period)
                # each index maps a period to the (start, stop) rows to read;
                # the period is known to be present in both indexes
                start, stop = input1_rows[period]
                array1 = table1.read(start, stop)

                start, stop = input2_rows[period]
                array2 = table2.read(start, stop)

                diff_array(array1, array2, numdiff)
def merge_h5(input1_path, input2_path, output_path):
    """Merge the entities of two HDF5 files into a new HDF5 file.

    The ``/globals/periodic`` table of the first file is copied to the
    output with ``copyTable``. Then, for every entity found in either input
    (as reported by ``get_h5_fields``), an output table is created with the
    fields combined by ``merge_items`` and filled period by period: rows
    coming from both files are combined with ``mergeArrays``, rows available
    in a single file are appended unchanged.

    Args:
        input1_path: path of the first input file (also the source of the
            globals).
        input2_path: path of the second input file.
        output_path: path of the file to create (opened in "w" mode, so an
            existing file is overwritten).
    """
    input1_file = input2_file = output_file = None
    try:
        input1_file = tables.open_file(input1_path, mode="r")
        input2_file = tables.open_file(input2_path, mode="r")
        output_file = tables.open_file(output_path, mode="w")

        output_globals = output_file.create_group("/", "globals", "Globals")
        print("copying globals from", input1_path, end=' ')
        copyTable(input1_file.root.globals.periodic, output_file,
                  output_globals)
        print("done.")

        input1_entities = input1_file.root.entities
        input2_entities = input2_file.root.entities

        fields1 = get_h5_fields(input1_file)
        fields2 = get_h5_fields(input2_file)
        ent_names1 = set(fields1.keys())
        ent_names2 = set(fields2.keys())

        output_entities = output_file.create_group("/", "entities",
                                                   "Entities")
        for ent_name in sorted(ent_names1 | ent_names2):
            print()
            print(ent_name)

            ent_fields1 = fields1.get(ent_name, [])
            ent_fields2 = fields2.get(ent_name, [])
            output_fields = merge_items(ent_fields1, ent_fields2)
            output_table = output_file.create_table(output_entities,
                                                    ent_name,
                                                    np.dtype(output_fields))

            # table<N> stays None when the entity is absent from file N
            if ent_name in ent_names1:
                table1 = getattr(input1_entities, ent_name)
                print(" * indexing table from %s ..." % input1_path, end=' ')
                input1_rows = index_table_light(table1)
                print("done.")
            else:
                table1 = None
                input1_rows = {}

            if ent_name in ent_names2:
                table2 = getattr(input2_entities, ent_name)
                print(" * indexing table from %s ..." % input2_path, end=' ')
                input2_rows = index_table_light(table2)
                print("done.")
            else:
                table2 = None
                input2_rows = {}

            print(" * merging: ", end=' ')
            output_periods = sorted(set(input1_rows.keys()) |
                                    set(input2_rows.keys()))

            # The closure reads the variables of the current iteration; this
            # is safe because it is only handed to loop_wh_progress right
            # below, before the next entity rebinds them.
            def merge_period(period_idx, period):
                input1_array = input2_array = None
                if table1 is not None:
                    # a period missing from this file yields an empty read
                    start, stop = input1_rows.get(period, (0, 0))
                    input1_array = table1.read(start, stop)
                if table2 is not None:
                    start, stop = input2_rows.get(period, (0, 0))
                    input2_array = table2.read(start, stop)

                if table1 is not None and table2 is not None:
                    output_array, _ = mergeArrays(input1_array, input2_array)
                elif table1 is not None:
                    output_array = input1_array
                else:
                    # ent_name comes from the union of both name sets, so
                    # the entity necessarily exists in the second file here
                    output_array = input2_array

                output_table.append(output_array)
                output_table.flush()

            loop_wh_progress(merge_period, output_periods)
            print(" done.")
    finally:
        # close whatever was opened, even if a later open or the merge failed
        for h5file in (input1_file, input2_file, output_file):
            if h5file is not None:
                h5file.close()
def _reconcile_ids(array1, array2):
    """Explain a row-count mismatch between two arrays sharing an 'id' field.

    Ids present in only one of the arrays are printed. When both arrays hold
    the same set of ids, the mismatch can only come from duplicated ids:
    those are printed and the affected array is re-indexed with the first
    value returned by ``unique_dupes`` (presumably the positions of the
    unique rows -- confirm against that helper).

    Returns:
        The ``(array1, array2)`` pair, deduplicated where that was possible.
    """
    ids1 = array1['id']
    ids2 = array2['id']
    all_ids = np.union1d(ids1, ids2)
    # ids of the union which are absent from each file
    notin1 = np.setdiff1d(all_ids, ids1)
    notin2 = np.setdiff1d(all_ids, ids2)
    if len(notin1) or len(notin2):
        if len(notin1):
            print("the following ids are not present in file 1:", notin1)
        if len(notin2):
            print("the following ids are not present in file 2:", notin2)
        return array1, array2

    # both files hold the same set of ids, so some ids must be duplicated
    if len(ids1) > len(all_ids):
        print("file 1 contain duplicate ids:", end=' ')
        uniques, dupes = unique_dupes(ids1)
        print(dupes)
        array1 = array1[uniques]
    if len(ids2) > len(all_ids):
        print("file 2 contain duplicate ids:", end=' ')
        uniques, dupes = unique_dupes(ids2)
        print(dupes)
        array2 = array2[uniques]
    return array1, array2


def _diff_fields(array1, array2, fnames1, fnames2, numdiff):
    """Print, field by field, whether two structured arrays are equal.

    For a field which differs without a length mismatch, at most *numdiff*
    differing rows are displayed as a ``PrettyTable`` of (id, value in
    file 1, value in file 2).
    """
    for fname in sorted(fnames1 | fnames2):
        print(" - %s:" % fname, end=' ')
        if fname not in fnames1:
            print("missing in file 1")
            continue
        if fname not in fnames2:
            print("missing in file 2")
            continue

        col1, col2 = array1[fname], array2[fname]
        if np.array_equal(col1, col2):
            print("ok")
            continue

        print("different", end=' ')
        if len(col1) != len(col2):
            # an element-wise comparison is meaningless in this case
            print("(length)")
            continue

        diff = (col1 != col2).nonzero()[0]
        print("(%d differences)" % len(diff))
        ids = array1['id']
        header = ['id', fname + ' (file1)', fname + ' (file2)']
        rows = [[ids[idx], col1[idx], col2[idx]] for idx in diff[:numdiff]]
        print(PrettyTable([header] + rows))


def diff_h5(input1_path, input2_path, numdiff=10):
    """Print the differences between the entities of two HDF5 files.

    Entities (as reported by ``get_h5_fields``) which exist in only one file
    are flagged and skipped. For the others, each period indexed in both
    tables is read from both files; a row-count mismatch is explained in
    terms of missing or duplicated ids, then every field is compared.

    Args:
        input1_path: path of the first HDF5 file ("file 1" in the output).
        input2_path: path of the second HDF5 file ("file 2" in the output).
        numdiff: maximum number of differing rows displayed per field.
    """
    input1_file = input2_file = None
    try:
        input1_file = tables.open_file(input1_path, mode="r")
        input2_file = tables.open_file(input2_path, mode="r")

        input1_entities = input1_file.root.entities
        input2_entities = input2_file.root.entities

        fields1 = get_h5_fields(input1_file)
        fields2 = get_h5_fields(input2_file)
        ent_names1 = set(fields1.keys())
        ent_names2 = set(fields2.keys())

        for ent_name in sorted(ent_names1 | ent_names2):
            print()
            print(ent_name)
            if ent_name not in ent_names1:
                print("missing in file 1")
                continue
            if ent_name not in ent_names2:
                print("missing in file 2")
                continue

            # each field is a (name, type) pair; only the names matter here
            fnames1 = set(fname for fname, _ in fields1[ent_name])
            fnames2 = set(fname for fname, _ in fields2[ent_name])

            table1 = getattr(input1_entities, ent_name)
            input1_rows = index_table_light(table1)

            table2 = getattr(input2_entities, ent_name)
            input2_rows = index_table_light(table2)

            # sets make the comparison independent of the indexing order
            input1_periods = set(input1_rows.keys())
            input2_periods = set(input2_rows.keys())
            if input1_periods != input2_periods:
                print("periods are different in both files for '%s'"
                      % ent_name)

            for period in sorted(input1_periods & input2_periods):
                print("* period:", period)
                start, stop = input1_rows[period]
                array1 = table1.read(start, stop)

                start, stop = input2_rows[period]
                array2 = table2.read(start, stop)

                if len(array1) != len(array2):
                    print("length is different: %d vs %d"
                          % (len(array1), len(array2)))
                    array1, array2 = _reconcile_ids(array1, array2)

                _diff_fields(array1, array2, fnames1, fnames2, numdiff)
    finally:
        # close both handles even if an open or the comparison failed
        for h5file in (input1_file, input2_file):
            if h5file is not None:
                h5file.close()
def merge_group(parent1, parent2, name, output_file, index_col):
    """Merge the tables of the group *name* of two files into *output_file*.

    The group is looked up as an attribute of *parent1* and *parent2*; when
    neither has it, a message is printed and nothing is created. Otherwise a
    group of the same name is created at the root of *output_file* and, for
    each table found in either input group, an output table is created with
    the fields combined by ``merge_items`` and filled period by period.

    Args:
        parent1: node of the first file holding the group (if any).
        parent2: node of the second file holding the group (if any).
        name: name of the group to merge.
        output_file: open, writable file receiving the merged group.
        index_col: forwarded to ``index_table_light`` as its second
            argument.
    """
    print()
    print(name)
    print('=' * len(name))

    group1 = getattr(parent1, name, None)
    group2 = getattr(parent2, name, None)
    if all(group is None for group in (group1, group2)):
        print("node not found in either input files, skipped")
        return

    output_group = output_file.create_group("/", name)
    fields_by_table1 = get_group_fields(group1)
    fields_by_table2 = get_group_fields(group2)
    ent_names1 = set(fields_by_table1.keys())
    ent_names2 = set(fields_by_table2.keys())

    def load_indexed(group, table_name, present):
        """Return (table, period index) or (None, {}) for an absent table."""
        if not present:
            return None, {}
        table = getattr(group, table_name)
        # noinspection PyProtectedMember
        print(" * indexing table from %s ..." % group._v_file.filename,
              end=' ')
        rows = index_table_light(table, index_col)
        print("done.")
        return table, rows

    for ent_name in sorted(ent_names1.union(ent_names2)):
        print()
        print(ent_name)

        merged_fields = merge_items(fields_by_table1.get(ent_name, []),
                                    fields_by_table2.get(ent_name, []))
        output_table = output_file.create_table(output_group, ent_name,
                                                np.dtype(merged_fields))

        in_first = ent_name in ent_names1
        in_second = ent_name in ent_names2
        table1, input1_rows = load_indexed(group1, ent_name, in_first)
        table2, input2_rows = load_indexed(group2, ent_name, in_second)

        print(" * merging: ", end=' ')
        output_periods = sorted(
            set(input1_rows.keys()).union(input2_rows.keys()))

        # noinspection PyUnusedLocal
        def merge_period(period_idx, period):
            first = second = None
            if in_first:
                # a period unknown to this table reads as an empty range
                lo, hi = input1_rows.get(period, (0, 0))
                first = table1.read(lo, hi)
            if in_second:
                lo, hi = input2_rows.get(period, (0, 0))
                second = table2.read(lo, hi)

            if in_first and in_second:
                if 'id' in first.dtype.names:
                    assert 'id' in second.dtype.names
                    merged, _ = merge_arrays(first, second)
                else:
                    merged = merge_array_records(first, second)
            elif in_first:
                merged = first
            elif in_second:
                merged = second
            else:
                raise Exception("this shouldn't have happened")

            output_table.append(merged)
            output_table.flush()

        loop_wh_progress(merge_period, output_periods)
        print(" done.")
def merge_h5(input1_path, input2_path, output_path):
    """Merge the entities of two HDF5 files into a new HDF5 file.

    The whole ``/globals`` node of the first file is copied recursively to
    the output. Then, for every entity reported by ``get_h5_fields`` for
    either input, an output table is created under ``/entities`` with the
    fields combined by ``merge_items`` and filled period by period: rows
    available in both files are combined with ``merge_arrays``, rows
    available in a single file are appended unchanged.

    Args:
        input1_path: path of the first input file (also the source of the
            globals).
        input2_path: path of the second input file.
        output_path: path of the file to create (opened in "w" mode, so an
            existing file is overwritten).
    """
    # the context managers close all three files, including when a later
    # open or the merge itself raises
    with tables.open_file(input1_path, mode="r") as input1_file, \
            tables.open_file(input2_path, mode="r") as input2_file, \
            tables.open_file(output_path, mode="w") as output_file:
        print("copying globals from", input1_path, end=' ')
        # noinspection PyProtectedMember
        input1_file.root.globals._f_copy(output_file.root, recursive=True)
        print("done.")

        input1_entities = input1_file.root.entities
        input2_entities = input2_file.root.entities

        fields1 = get_h5_fields(input1_file)
        fields2 = get_h5_fields(input2_file)
        ent_names1 = set(fields1.keys())
        ent_names2 = set(fields2.keys())

        output_entities = output_file.create_group("/", "entities",
                                                   "Entities")
        for ent_name in sorted(ent_names1 | ent_names2):
            print()
            print(ent_name)

            ent_fields1 = fields1.get(ent_name, [])
            ent_fields2 = fields2.get(ent_name, [])
            output_fields = merge_items(ent_fields1, ent_fields2)
            output_table = output_file.create_table(output_entities,
                                                    ent_name,
                                                    np.dtype(output_fields))

            # table<N> stays None when the entity is absent from file N
            if ent_name in ent_names1:
                table1 = getattr(input1_entities, ent_name)
                print(" * indexing table from %s ..." % input1_path, end=' ')
                input1_rows = index_table_light(table1)
                print("done.")
            else:
                table1 = None
                input1_rows = {}

            if ent_name in ent_names2:
                table2 = getattr(input2_entities, ent_name)
                print(" * indexing table from %s ..." % input2_path, end=' ')
                input2_rows = index_table_light(table2)
                print("done.")
            else:
                table2 = None
                input2_rows = {}

            print(" * merging: ", end=' ')
            output_periods = sorted(set(input1_rows.keys()) |
                                    set(input2_rows.keys()))

            # The closure reads the variables of the current iteration; this
            # is safe because it is only handed to loop_wh_progress right
            # below, before the next entity rebinds them.
            # noinspection PyUnusedLocal
            def merge_period(period_idx, period):
                input1_array = input2_array = None
                if table1 is not None:
                    # a period missing from this file yields an empty read
                    start, stop = input1_rows.get(period, (0, 0))
                    input1_array = table1.read(start, stop)
                if table2 is not None:
                    start, stop = input2_rows.get(period, (0, 0))
                    input2_array = table2.read(start, stop)

                if table1 is not None and table2 is not None:
                    output_array, _ = merge_arrays(input1_array,
                                                   input2_array)
                elif table1 is not None:
                    output_array = input1_array
                else:
                    # ent_name comes from the union of both name sets, so
                    # the entity necessarily exists in the second file here
                    output_array = input2_array

                output_table.append(output_array)
                output_table.flush()

            loop_wh_progress(merge_period, output_periods)
            print(" done.")