def main(hdf5_file_name, paths=None):
    """Export the "core_array" datasets of an HDF5 file to one CSV file.

    Each group is expected to hold a "column_annotations" string matrix (one
    column per core-array column) and a matching "core_array" dataset. The
    non-empty annotation rows of each column are joined with "." to form the
    CSV header; the rows of all core arrays are then written side by side to
    "<hdf5_file_name>.csv".

    NOTE(review): annotations_paths[i] is paired with core_array_paths[i]
    purely by position — confirm both lists always align for these files.

    :param hdf5_file_name: path to the HDF5 file to export.
    :param paths: optional list of HDF5 group paths to restrict the export;
        each entry is expanded recursively via get_all_paths().
    """
    # Context manager releases the HDF5 handle even on error
    # (the original never closed the file).
    with h5py.File(hdf5_file_name, "r") as hf5p:
        if paths is None:
            paths = get_all_paths(hf5p["/"])
        else:
            expanded_paths = []
            for path in paths:
                expanded_paths += get_all_paths(hf5p[path])
            paths = expanded_paths

        annotations_paths = [hp for hp in paths if "column_annotations" in hp]
        core_array_paths = [hp for hp in paths if "core_array" in hp]

        annotations_list = []
        number_of_rows = hf5p[core_array_paths[0]].shape[0]

        for i, path in enumerate(annotations_paths):
            annotations = hf5p[path][...]
            nrows, ncolumns = annotations.shape
            for j in range(ncolumns):
                # Join the non-blank annotation rows with "." (e.g.
                # "table.field") — replaces the original's quadratic
                # string concatenation.
                name_parts = [annotations[k, j] for k in range(nrows)
                              if len(annotations[k, j].strip()) > 0]
                field_name = ".".join(name_parts)
                annotations_list += [{
                    "path": core_array_paths[i],
                    "index": j,
                    "field_name": field_name
                }]

        import pprint
        pprint.pprint(annotations_list)

        header = [ca["field_name"] for ca in annotations_list]
        # "wb" + csv.writer is the Python 2 convention; under Python 3 this
        # would need open(..., "w", newline="") instead.
        with open(hdf5_file_name + ".csv", "wb") as fw:
            csv_writer = csv.writer(fw)
            csv_writer.writerow(header)
            for l in range(number_of_rows):
                row_to_write = []
                paths_core_vector = {}
                for path in core_array_paths:
                    paths_core_vector[path] = hf5p[path][l, :]
                for annotation in annotations_list:
                    row_to_write += [
                        paths_core_vector[annotation["path"]][annotation["index"]]
                    ]
                csv_writer.writerow(row_to_write)
                if l % 1000 == 0 and l > 0:
                    print("Wrote %s rows" % l)
def main(hdf5_file_name, paths=None):
    """Dump every "core_array" dataset of an HDF5 file into a single CSV.

    Each group must contain a "column_annotations" string matrix paired
    with a "core_array" dataset. Column headers are built by joining the
    non-blank annotation entries of each column with "."; the data rows of
    all core arrays are concatenated horizontally and written to
    "<hdf5_file_name>.csv".

    NOTE(review): the i-th annotations path is matched with the i-th core
    array path by list position — verify the two lists stay aligned.

    :param hdf5_file_name: HDF5 file to export.
    :param paths: optional group paths limiting the export; expanded with
        get_all_paths().
    """
    # Use a context manager so the HDF5 handle is always closed
    # (the original leaked it).
    with h5py.File(hdf5_file_name, "r") as hf5p:
        if paths is None:
            paths = get_all_paths(hf5p["/"])
        else:
            expanded = []
            for path in paths:
                expanded += get_all_paths(hf5p[path])
            paths = expanded

        annotations_paths = [hp for hp in paths if "column_annotations" in hp]
        core_array_paths = [hp for hp in paths if "core_array" in hp]

        annotations_list = []
        number_of_rows = hf5p[core_array_paths[0]].shape[0]

        for i, path in enumerate(annotations_paths):
            annotations = hf5p[path][...]
            nrows, ncolumns = annotations.shape
            for j in range(ncolumns):
                # Build "part1.part2..." via join instead of the original's
                # quadratic concatenate-then-trim loop.
                parts = [annotations[k, j] for k in range(nrows)
                         if len(annotations[k, j].strip()) > 0]
                annotations_list += [{"path": core_array_paths[i],
                                      "index": j,
                                      "field_name": ".".join(parts)}]

        import pprint
        pprint.pprint(annotations_list)

        header = [ca["field_name"] for ca in annotations_list]
        # "wb" mode matches the Python 2 csv convention this file uses;
        # Python 3 would require open(..., "w", newline="").
        with open(hdf5_file_name + ".csv", "wb") as fw:
            csv_writer = csv.writer(fw)
            csv_writer.writerow(header)
            for l in range(number_of_rows):
                paths_core_vector = {}
                for path in core_array_paths:
                    paths_core_vector[path] = hf5p[path][l, :]
                row_to_write = []
                for annotation in annotations_list:
                    vector = paths_core_vector[annotation["path"]]
                    row_to_write += [vector[annotation["index"]]]
                csv_writer.writerow(row_to_write)
                if l % 1000 == 0 and l > 0:
                    print("Wrote %s rows" % l)
def main(starting_directory="X:\\healthfacts\\20160808\\"):
    """Walk a directory tree and summarize every HDF5 "core_array" dataset.

    Writes "hdf5_files_summary.csv" in starting_directory with one row per
    core_array: its location, shape, cell count, and the count and fraction
    of entries strictly greater than zero.

    :param starting_directory: root of the tree to walk (a Windows share
        path by default).
    """
    file_summary_csv = os.path.join(starting_directory,
                                    "hdf5_files_summary.csv")
    header = ["full_directory", "directory", "file_name", "hdf5_path",
              "number_of_rows", "number_of_columns", "number_of_cells",
              "non_zero_entries", "fraction_non_zero"]
    # "wb" + csv.writer is the Python 2 convention this file follows;
    # Python 3 would need open(..., "w", newline="").
    with open(file_summary_csv, "wb") as fw:
        csv_writer = csv.writer(fw)
        csv_writer.writerow(header)
        for dir_name, subdir_list, file_list in os.walk(starting_directory):
            for file_name in file_list:
                base_name, ext = os.path.splitext(file_name)
                if ext != ".hdf5":
                    continue
                hdf5_file_name = os.path.join(dir_name, file_name)
                # Open read-only and close deterministically: the original
                # used h5py's default mode (historically append/read-write,
                # which can lock or even modify the file) and never closed
                # the handle.
                with h5py.File(hdf5_file_name, "r") as h5:
                    group_paths = upx.get_all_paths(h5["/"])
                    if group_paths is None:
                        continue
                    for group_path in group_paths:
                        if group_path.split("/")[-1] != "core_array":
                            continue
                        numeric_array = h5[group_path]
                        non_zero = np.where(numeric_array[...] > 0)
                        n_rows, n_columns = numeric_array.shape
                        n_cells = n_rows * n_columns
                        n_non_zero = len(non_zero[0])
                        # Guard against empty arrays (0 rows or columns);
                        # n_cells can never be None, so that original check
                        # was dead code.
                        if n_cells == 0:
                            fraction_non_zero = None
                        else:
                            fraction_non_zero = 1.0 * n_non_zero / n_cells
                        row_to_write = [dir_name,
                                        os.path.split(dir_name)[-1],
                                        file_name, group_path, n_rows,
                                        n_columns, n_cells, n_non_zero,
                                        fraction_non_zero]
                        print(row_to_write)
                        csv_writer.writerow(row_to_write)
def main(hdf5_file_name, csv_file_name=None, threshold_value_to_include=0.01):
    """Write a per-column sparsity summary for each "core_array" group.

    For every group holding a "core_array" dataset (with its sibling
    "column_annotations" matrix), one CSV row is written per column: the
    group path, the column's annotation values, the count and fraction of
    entries > 0, and a "1" flag when that fraction meets the threshold.

    :param hdf5_file_name: HDF5 file to summarize.
    :param csv_file_name: output CSV path; defaults to
        "<hdf5_file_name>.summary.csv".
    :param threshold_value_to_include: minimum non-zero fraction for a
        column to be flagged "1" in the "to_include" field.
    """
    # Open read-only and close deterministically: the original relied on
    # h5py's default mode and never closed the handle.
    with h5py.File(hdf5_file_name, "r") as fp5:
        paths = get_all_paths(fp5["/"])
        core_array_paths = [p for p in paths
                            if p.split("/")[-1] == "core_array"]
        stripped_paths = ["/".join(p.split("/")[:-1])
                          for p in core_array_paths]
        if csv_file_name is None:
            csv_file_name = hdf5_file_name + ".summary.csv"
        # "wb" + csv.writer is the Python 2 convention; Python 3 would need
        # open(..., "w", newline="").
        with open(csv_file_name, "wb") as fw:
            csv_writer = csv.writer(fw)
            header = ["path", "c1", "c2", "c3", "non-zero", "to_include",
                      "fraction non-zero"]
            csv_writer.writerow(header)
            for stripped_path in stripped_paths:
                print(stripped_path)
                column_annotation_path = stripped_path + "/column_annotations"
                core_array_path = stripped_path + "/core_array"
                column_annotations = fp5[column_annotation_path][...]
                for j in range(column_annotations.shape[1]):
                    slice_of_interest = fp5[core_array_path][:, j]
                    number_of_rows = slice_of_interest.shape[0]
                    non_zero_values = np.where(slice_of_interest > 0)
                    n_non_zero_values = len(non_zero_values[0])
                    column_names = \
                        column_annotations[:, j].transpose().tolist()
                    # Guard an empty core array: the original divided by
                    # number_of_rows unconditionally and would raise
                    # ZeroDivisionError on zero rows.
                    if number_of_rows:
                        fraction_non_zero = \
                            (1.0 * n_non_zero_values) / number_of_rows
                    else:
                        fraction_non_zero = 0.0
                    if fraction_non_zero >= threshold_value_to_include:
                        to_include = "1"
                    else:
                        to_include = ""
                    row_to_write = ([stripped_path] + column_names +
                                    [n_non_zero_values, to_include,
                                     fraction_non_zero])
                    csv_writer.writerow(row_to_write)
def main(hdf5_file_name, csv_file_name=None, threshold_value_to_include=0.01):
    """Summarize how sparse each column of every "core_array" group is.

    Emits one CSV row per core-array column: the group path, the column's
    annotation values, the count and fraction of entries > 0, and a "1"
    marker when the fraction reaches the inclusion threshold.

    :param hdf5_file_name: HDF5 file to inspect.
    :param csv_file_name: destination CSV; "<hdf5_file_name>.summary.csv"
        when omitted.
    :param threshold_value_to_include: non-zero fraction at or above which
        the column's "to_include" field is set to "1".
    """
    # Explicit read-only open with a context manager — the original used
    # the default h5py mode and leaked the handle.
    with h5py.File(hdf5_file_name, "r") as fp5:
        all_paths = get_all_paths(fp5["/"])
        core_array_paths = [p for p in all_paths
                            if p.split("/")[-1] == "core_array"]
        group_paths = ["/".join(p.split("/")[:-1]) for p in core_array_paths]
        if csv_file_name is None:
            csv_file_name = hdf5_file_name + ".summary.csv"
        # "wb" matches the Python 2 csv convention used throughout this
        # file; Python 3 would need open(..., "w", newline="").
        with open(csv_file_name, "wb") as fw:
            csv_writer = csv.writer(fw)
            csv_writer.writerow(["path", "c1", "c2", "c3", "non-zero",
                                 "to_include", "fraction non-zero"])
            for group_path in group_paths:
                print(group_path)
                annotations = fp5[group_path + "/column_annotations"][...]
                core_array_path = group_path + "/core_array"
                for j in range(annotations.shape[1]):
                    column_values = fp5[core_array_path][:, j]
                    n_rows = column_values.shape[0]
                    n_non_zero = len(np.where(column_values > 0)[0])
                    column_names = annotations[:, j].transpose().tolist()
                    # Avoid the original's ZeroDivisionError on an empty
                    # core array.
                    if n_rows:
                        fraction_non_zero = (1.0 * n_non_zero) / n_rows
                    else:
                        fraction_non_zero = 0.0
                    if fraction_non_zero >= threshold_value_to_include:
                        to_include = "1"
                    else:
                        to_include = ""
                    csv_writer.writerow([group_path] + column_names +
                                        [n_non_zero, to_include,
                                         fraction_non_zero])
def main(file_name):
    """Recode the coded "core_array" groups of an HDF5 file into CSVs.

    For each group with a "column_annotations" matrix (row 0 = field names,
    row 1 = coded values) and a "core_array" dataset, a per-group CSV named
    "<file_name>.<i>.csv" is written: a cell equal to 1 is replaced by the
    annotated code value, other positive cells keep their numeric value
    (as int when the value is whole). The per-group CSVs are then merged
    column-wise into "<file_name>.recode.csv".

    NOTE(review): groups are paired with core arrays positionally
    (column_annotation_paths[i] <-> core_array_paths[i]), and the merge
    step assumes every core array has the same row count — confirm both
    against the writer that produced these files.
    """
    f5 = h5py.File(file_name, "r")
    try:
        all_paths = upx.get_all_paths(f5["./"])
        annotations_suffix = "/column_annotations"
        core_suffix = "/core_array"
        column_annotation_paths = [p for p in all_paths
                                   if p.endswith(annotations_suffix)]
        core_array_paths = [c for c in all_paths if c.endswith(core_suffix)]

        annotation_csv_dict = {}
        annotation_path_list = []
        core_array_rows = 0
        for i, column_annotation_path in enumerate(column_annotation_paths):
            annotation_path = column_annotation_path[:-len(annotations_suffix)]
            print(annotation_path)
            annotation_path_list += [annotation_path]
            annotation_csv_dict[annotation_path] = (
                file_name + "." + str(i) + ".csv")

            column_annotation = f5[column_annotation_path][...]
            column_annotation_list = column_annotation[0, :].tolist()
            column_annotation_fields = column_annotation[1, :].tolist()
            fields = np.unique(column_annotation_list).tolist()

            with open(annotation_csv_dict[annotation_path], "wb") as fw:
                csv_writer = csv.writer(fw)
                csv_writer.writerow(fields)
                core_array_rows = f5[core_array_paths[i]].shape[0]
                for j in range(core_array_rows):
                    result = {}
                    core_array_row = f5[core_array_paths[i]][j, :]
                    for k in range(len(column_annotation_list)):
                        value = core_array_row[k]
                        if value == 1:
                            result[column_annotation_list[k]] = \
                                column_annotation_fields[k]
                        elif value > 0:
                            if column_annotation_list[k] not in result:
                                # BUG FIX: the original tested
                                # int(v) - v > 0.0, which is never true for
                                # positive v, so fractional values were
                                # always truncated to int. Keep the float
                                # only when a fractional part exists.
                                if value - int(value) > 0.0:
                                    result[column_annotation_list[k]] = value
                                else:
                                    result[column_annotation_list[k]] = \
                                        int(value)
                    if j % 1000 == 0 and j > 0:
                        print("Read '%s' rows" % j)
                    row_to_write = []
                    for field in fields:
                        if field in result:
                            row_to_write += [result[field]]
                        else:
                            row_to_write += ['']
                    csv_writer.writerow(row_to_write)

        # Build a single CSV file by pasting the per-group CSVs side by side.
        open_files = []
        csv_dict_path = {}
        try:
            for annotation_path in annotation_csv_dict:
                # Track the handles so they can be closed (the original
                # leaked every reader file).
                f = open(annotation_csv_dict[annotation_path], "rb")
                open_files.append(f)
                csv_dict_path[annotation_path] = csv.reader(f)

            header = []
            for annotation_path in annotation_path_list:
                # next() is portable across Python 2/3; the original used
                # the Python-2-only reader.next().
                header += next(csv_dict_path[annotation_path])

            master_csv_file_name = file_name + ".recode.csv"
            with open(master_csv_file_name, "wb") as fw:
                master_csv_writer = csv.writer(fw)
                master_csv_writer.writerow(header)
                for _ in range(core_array_rows):
                    master_row = []
                    for path in annotation_path_list:
                        master_row += next(csv_dict_path[path])
                    master_csv_writer.writerow(master_row)
        finally:
            for f in open_files:
                f.close()
    finally:
        f5.close()
def main(file_name):
    """Expand coded HDF5 "core_array" groups back into readable CSV files.

    Each group supplies a "column_annotations" matrix (row 0 = field names,
    row 1 = coded values) plus a "core_array" dataset. Per group, a CSV
    "<file_name>.<i>.csv" is produced in which a 1-valued cell becomes the
    annotated code value and other positive cells keep their number (int
    when whole). All per-group CSVs are then stitched together column-wise
    into "<file_name>.recode.csv".

    NOTE(review): annotation and core-array paths are matched by list
    position, and the merge assumes all core arrays share one row count —
    verify both assumptions against the file producer.
    """
    f5 = h5py.File(file_name, "r")
    try:
        all_paths = upx.get_all_paths(f5["./"])
        ann_suffix = "/column_annotations"
        core_suffix = "/core_array"
        column_annotation_paths = [p for p in all_paths
                                   if p.endswith(ann_suffix)]
        core_array_paths = [c for c in all_paths if c.endswith(core_suffix)]

        annotation_csv_dict = {}
        annotation_path_list = []
        core_array_rows = 0
        for i, column_annotation_path in enumerate(column_annotation_paths):
            annotation_path = column_annotation_path[:-len(ann_suffix)]
            print(annotation_path)
            annotation_path_list += [annotation_path]
            annotation_csv_dict[annotation_path] = (
                file_name + "." + str(i) + ".csv")

            column_annotation = f5[column_annotation_path][...]
            names = column_annotation[0, :].tolist()
            coded_values = column_annotation[1, :].tolist()
            fields = np.unique(names).tolist()

            with open(annotation_csv_dict[annotation_path], "wb") as fw:
                csv_writer = csv.writer(fw)
                csv_writer.writerow(fields)
                core_array_rows = f5[core_array_paths[i]].shape[0]
                for j in range(core_array_rows):
                    result = {}
                    row = f5[core_array_paths[i]][j, :]
                    for k in range(len(names)):
                        value = row[k]
                        if value == 1:
                            result[names[k]] = coded_values[k]
                        elif value > 0 and names[k] not in result:
                            # BUG FIX: the original's int(v) - v > 0.0 can
                            # never hold for positive v, so every
                            # fractional value was silently truncated.
                            # Test v - int(v) instead to detect a
                            # fractional part.
                            if value - int(value) > 0.0:
                                result[names[k]] = value
                            else:
                                result[names[k]] = int(value)
                    if j % 1000 == 0 and j > 0:
                        print("Read '%s' rows" % j)
                    row_to_write = []
                    for field in fields:
                        if field in result:
                            row_to_write += [result[field]]
                        else:
                            row_to_write += ['']
                    csv_writer.writerow(row_to_write)

        # Build a single CSV file from the per-group CSVs.
        handles = []
        csv_dict_path = {}
        try:
            for annotation_path in annotation_csv_dict:
                # Keep the handles so they can be closed afterwards
                # (the original never closed them).
                handle = open(annotation_csv_dict[annotation_path], "rb")
                handles.append(handle)
                csv_dict_path[annotation_path] = csv.reader(handle)

            header = []
            for annotation_path in annotation_path_list:
                # Portable next(reader) replaces the Python-2-only
                # reader.next().
                header += next(csv_dict_path[annotation_path])

            master_csv_file_name = file_name + ".recode.csv"
            with open(master_csv_file_name, "wb") as fw:
                master_csv_writer = csv.writer(fw)
                master_csv_writer.writerow(header)
                for _ in range(core_array_rows):
                    master_row = []
                    for path in annotation_path_list:
                        master_row += next(csv_dict_path[path])
                    master_csv_writer.writerow(master_row)
        finally:
            for handle in handles:
                handle.close()
    finally:
        f5.close()