def find_triangle_headers(dh, **kwargs):
    """Find and insert headers for data structs that look like triangles.

    Data structs are modified in place; columns are re-sorted after the
    headers are inserted.

    :param dh: DataHolder whose data_struct_list is processed in place
    :param kwargs: optional flags:
        - test_settings (bool, default False): process every ds regardless
          of its row count
        - return_meta (bool, default False): also collect the non-empty
          column splits of the stray-row remainder into a meta DataHolder
    :return: dh, or (dh, meta_dh) when return_meta is truthy
    """
    # idiomatic dict.get replaces the manual "if key in kwargs" checks
    test_settings = kwargs.get('test_settings', False)
    return_meta = kwargs.get('return_meta', False)
    meta_dh = DataHolder(dh.name + "_meta")
    for ds in dh.data_struct_list:
        # only do this for potential triangles:
        if ds.df_data.shape[0] >= pp.MIN_ROWS_TRIANGLE or test_settings:
            headers, pd_ind = TriangleHeaderFinder.find_ds_headers(ds)
            HeaderFinder.insert_headers(headers, pd_ind, ds.df_data, ds.df_profiles)
            # keep data and profiles column orders in sync
            ds.df_data = ds.df_data.reindex(sorted(ds.df_data.columns), axis=1)
            ds.df_profiles = ds.df_profiles.reindex(sorted(ds.df_profiles.columns), axis=1)
            # now remove unnecessary rows
            ds = TriangleHeaderFinder.remove_stray_rows(ds, pd_ind)
            # remove_stray_rows may return None; only then collect meta splits
            if return_meta and ds is not None:
                for split in ds.col_split_ds():
                    # skip splits that are entirely empty in both views
                    if not np.all(split.df_profiles == SheetTypeDefinitions.EMPTY_STRING) and not (np.all(split.df_data == "")):
                        meta_dh.add_ds(split)
    if return_meta:
        return dh, meta_dh
    else:
        return dh
def chop_triangles_horizontally(dh):
    """Split data structs whose columns show repeating headers.

    :param dh: DataHolder
    :return: DataHolder (the original object when nothing needs chopping)
    """
    chop, chop_lists = TriangleChopper.make_occurrence_list(dh)
    if not chop:
        return dh
    result = DataHolder(dh.name)
    for ds, occurrences in zip(dh, chop_lists):
        if not any(occurrences):
            # nothing repeated in this ds; carry it over unchanged
            result.add_ds(ds)
            continue
        for group in range(1, np.max(occurrences) + 1):
            # keep the shared (0) columns plus those of the current group
            keep = np.logical_or(occurrences == 0, occurrences == group)
            data = ds.df_data[ds.df_data.columns[keep]].copy()
            profiles = ds.df_profiles[ds.df_profiles.columns[keep]].copy()
            result.add_sheet(ds.name, data, profiles,
                             orig_sheet_name=ds.orig_sheet_name)
    return result
def find_headers(dh):
    """Pick the most header-like leading row of each data struct and promote it.

    Rows are scored by how many of their profile cells equal 1, minus a
    penalty for duplicated strings in the data row; the best-scoring row
    becomes the header.

    :param dh: DataHolder iterated over its data structs
    :return: tuple (dh, meta_dh) where meta_dh collects the structs returned
        by HeaderFinder.remove_leading_rows
    """
    meta_dh = DataHolder(dh.name + "_meta")
    for ds in dh:
        data, profiles = ds.df_data, ds.df_profiles
        # score = number of cells whose profile value is 1, per candidate row
        scores = np.sum(profiles.values[:pp.N_POSSIBLE_HEADER_ROWS, ] == 1, axis=1)
        # subtract identical strings
        for row in range(pp.N_POSSIBLE_HEADER_ROWS):
            scores[row] -= data.shape[1] - len(data.iloc[row, :].unique())
        best = np.argmax(scores)
        label = profiles.index[best]
        HeaderFinder.insert_headers(data.loc[[label]], label, data, profiles)
        meta_dh.add_ds(HeaderFinder.remove_leading_rows(ds, label))
    return dh, meta_dh
def perform_vertical_chop(dh, chop_bools, chop_lists):
    """Split flagged data structs row-wise at the given cut positions.

    :param dh: DataHolder
    :param chop_bools: per-ds flags saying whether the ds should be chopped
    :param chop_lists: per-ds arrays of row positions to cut at
    :return: new DataHolder containing the chopped (or untouched) structs
    """
    new_dh = DataHolder(dh.name)
    for ds, do_chop, cuts in zip(dh, chop_bools, chop_lists):
        # Don't cut too much (short-circuit keeps len() unevaluated when
        # do_chop is false, matching the original nested checks)
        if do_chop and len(cuts) < pp.MAX_NUM_VERTICAL_CHOPS:
            bounds = [0] + cuts.tolist()
            for start, stop in zip(bounds, bounds[1:]):
                piece_data = ds.df_data.iloc[start:stop, :]
                piece_profiles = ds.df_profiles.iloc[start:stop, :]
                new_dh.add_ds(DataStruct(piece_data, piece_profiles, ds.name,
                                         orig_sheet_name=ds.orig_sheet_name))
        else:
            new_dh.add_ds(ds)
    return new_dh