예제 #1
0
    def find_triangle_headers(dh, **kwargs):
        """
        Locates and inserts headers for every potential triangle in dh.

        A data struct is processed when its df_data has at least
        pp.MIN_ROWS_TRIANGLE rows, or unconditionally when test_settings is
        truthy. For each processed struct the headers found by
        TriangleHeaderFinder.find_ds_headers are passed to
        HeaderFinder.insert_headers, the columns of df_data and df_profiles
        are put in sorted order, and TriangleHeaderFinder.remove_stray_rows
        is called.

        :param dh: DataHolder
        :param kwargs: optional ``test_settings`` (process every struct) and
            ``return_meta`` (also return the collected meta DataHolder);
            both default to False
        :return: dh, or the tuple (dh, meta_dh) when return_meta is truthy
        """
        process_all = kwargs.get('test_settings', False)
        meta_dh = DataHolder(dh.name + "_meta")
        want_meta = kwargs.get("return_meta", False)

        for ds in dh.data_struct_list:
            # only potential triangles are handled (or everything under test settings)
            is_candidate = ds.df_data.shape[0] >= pp.MIN_ROWS_TRIANGLE or process_all
            if not is_candidate:
                continue

            headers, pd_ind = TriangleHeaderFinder.find_ds_headers(ds)
            HeaderFinder.insert_headers(headers, pd_ind, ds.df_data, ds.df_profiles)

            # put the columns of both frames in sorted order
            data_order = sorted(ds.df_data.columns)
            ds.df_data = ds.df_data.reindex(data_order, axis=1)
            profile_order = sorted(ds.df_profiles.columns)
            ds.df_profiles = ds.df_profiles.reindex(profile_order, axis=1)

            # drop the rows that are no longer needed; the helper may return None
            trimmed = TriangleHeaderFinder.remove_stray_rows(ds, pd_ind)
            if not want_meta or trimmed is None:
                continue

            for piece in trimmed.col_split_ds():
                # skip pieces whose profiles are all EMPTY_STRING ...
                if np.all(piece.df_profiles == SheetTypeDefinitions.EMPTY_STRING):
                    continue
                # ... and pieces whose data cells are all empty strings
                if np.all(piece.df_data == ""):
                    continue
                meta_dh.add_ds(piece)

        return (dh, meta_dh) if want_meta else dh
    def chop_triangles_horizontally(dh):
        """
        Splits data structs column-wise according to the occurrence lists
        produced by TriangleChopper.make_occurrence_list.

        For a struct whose occurrence list has any non-zero entry, one new
        sheet is added per occurrence number 1..max; each sheet keeps the
        columns marked 0 together with the columns marked with that number.
        Structs with an all-zero occurrence list are carried over unchanged.

        :param dh: DataHolder
        :return: dh itself when nothing needs chopping, otherwise a new
            DataHolder with the same name
        """
        should_chop, occurrences_per_ds = TriangleChopper.make_occurrence_list(dh)
        if not should_chop:
            return dh

        chopped_dh = DataHolder(dh.name)
        for position, ds in enumerate(dh):
            occurrences = occurrences_per_ds[position]
            if not any(occurrences):
                # nothing repeats in this struct, keep it as it is
                chopped_dh.add_ds(ds)
                continue

            last_occurrence = np.max(occurrences)
            for number in range(1, last_occurrence + 1):
                # columns marked 0 go into every piece, the rest only into their own
                keep = np.logical_or(occurrences == 0, occurrences == number)

                data_labels = ds.df_data.columns[keep]
                data_piece = ds.df_data[data_labels].copy()

                profile_labels = ds.df_profiles.columns[keep]
                profile_piece = ds.df_profiles[profile_labels].copy()

                chopped_dh.add_sheet(
                    ds.name,
                    data_piece,
                    profile_piece,
                    orig_sheet_name=ds.orig_sheet_name,
                )
        return chopped_dh
예제 #3
0
 def find_headers(dh):
     """
     Picks a header row for every data struct in dh and inserts it.

     The first pp.N_POSSIBLE_HEADER_ROWS rows are scored: a row's score is
     the number of cells whose profile equals 1, minus the number of
     repeated values in the corresponding data row. The best scoring row
     (first one on ties) is passed to HeaderFinder.insert_headers, and the
     struct returned by HeaderFinder.remove_leading_rows is collected.

     :param dh: DataHolder
     :return: tuple (dh, meta_dh) where meta_dh is a DataHolder named
         dh.name + "_meta" holding the results of remove_leading_rows
     """
     meta_dh = DataHolder(dh.name + "_meta")
     for ds in dh:
         df_data, df_profiles = ds.df_data, ds.df_profiles
         bin_mat = df_profiles.values[:pp.N_POSSIBLE_HEADER_ROWS, ] == 1
         one_nums = np.sum(bin_mat, axis=1)
         # The slice above silently truncates when the sheet has fewer rows
         # than pp.N_POSSIBLE_HEADER_ROWS, so only iterate over the rows that
         # were actually scored; indexing past them would raise IndexError.
         n_candidates = len(one_nums)
         # subtract identical strings
         for i in range(n_candidates):
             sub = df_data.shape[1] - len(df_data.iloc[i, :].unique())
             one_nums[i] -= sub
         # argmax returns the first maximum, i.e. the topmost best row
         header_ind = np.argmax(one_nums)
         # translate the positional row number into the frame's index label
         pd_ind = df_profiles.index[header_ind]
         headers = df_data.loc[[pd_ind]]
         # return value is not used here - presumably the frames are updated
         # in place; confirm against HeaderFinder.insert_headers
         HeaderFinder.insert_headers(headers, pd_ind, df_data, df_profiles)
         meta_ds = HeaderFinder.remove_leading_rows(ds, pd_ind)
         meta_dh.add_ds(meta_ds)
     return dh, meta_dh
 def perform_vertical_chop(dh, chop_bools, chop_lists):
     """
     Splits data structs row-wise at the given cut positions.

     A struct is split only when its entry in chop_bools is truthy and its
     cut list has fewer than pp.MAX_NUM_VERTICAL_CHOPS entries; otherwise it
     is added to the result unchanged. A split struct is replaced by one
     DataStruct per pair of consecutive positions in [0] + cuts.

     :param dh: DataHolder
     :param chop_bools: per-struct flags, indexed like dh
     :param chop_lists: per-struct cut positions, indexed like dh; entries
         must offer .tolist()
     :return: new DataHolder with the same name as dh
     """
     result_dh = DataHolder(dh.name)
     for position, ds in enumerate(dh):
         if not chop_bools[position]:
             result_dh.add_ds(ds)
             continue

         cuts = chop_lists[position]
         # Don't cut too much
         if len(cuts) >= pp.MAX_NUM_VERTICAL_CHOPS:
             result_dh.add_ds(ds)
             continue

         bounds = [0] + cuts.tolist()
         # NOTE(review): rows at or beyond the last cut position end up in no
         # piece - presumably the cut list ends at the row count; confirm
         # against the caller.
         for start, stop in zip(bounds[:-1], bounds[1:]):
             data_piece = ds.df_data.iloc[start:stop, :]
             profile_piece = ds.df_profiles.iloc[start:stop, :]
             piece = DataStruct(
                 data_piece,
                 profile_piece,
                 ds.name,
                 orig_sheet_name=ds.orig_sheet_name,
             )
             result_dh.add_ds(piece)
     return result_dh