def build_triangle_from_table(dh):
    """
    Build triangle sheets from table-formatted sheets.

    For each DataStruct an id column and horizontal/vertical date columns are
    located; every triangle-element column is then cut per id and reassembled
    into its own triangle sheet in a fresh DataHolder.

    :param dh: DataHolder with table-formatted sheets
    :return: new DataHolder with one sheet per produced triangle
    """
    new_dh = DataHolder(dh.name)
    pool = DebuggablePool(pp.N_CORES)
    # First find all date cols and see if one of them has target structure.
    for dh_ind, ds in enumerate(dh.data_struct_list):
        id_col, hori_date_col, vert_date_col = TriangleFromTableBuilder.do_the_magic(
            ds, pool)
        # cut each id into one row
        cut_list = TriangleFromTableBuilder.make_cut_list(
            ds.df_data[id_col])
        # use the cut_list to insert all elements
        # boolean mask over columns whose first profile row marks a triangle element
        tr_cols = pd.Series(
            ds.df_profiles.iloc[0, :] == SheetTypeDefinitions.TRIANGLE_ELEMENT,
            index=ds.df_profiles.columns)
        pad_header_mapping = TriangleFromTableBuilder.make_pad_header_mapping(
            ds, hori_date_col)
        # (column name, column data) tuples handed to the worker function
        vert_col_tup = (vert_date_col, ds.df_data[vert_date_col])
        hori_col_tup = (hori_date_col, ds.df_data[hori_date_col])
        id_col_tup = (id_col, ds.df_data[id_col])
        func = partial(TriangleFromTableBuilder.apply_cuts, cut_list,
                       vert_col_tup, hori_col_tup, id_col_tup,
                       pad_header_mapping)
        tr_col_tup_list = [(col_name, ds.df_data[col_name])
                           for col_name in tr_cols.index[tr_cols]]
        # one triangle per triangle-element column, built in parallel
        out = pool.map(func, tr_col_tup_list)
        #for name, tr_col in ds.df_data[tr_cols.index[tr_cols]].iteritems():
        for temp_df_data, temp_df_profiles, name in out:
            new_dh.add_sheet(name, temp_df_data, temp_df_profiles)
            #new_dh.add_sheet(name, temp_df_data, temp_df_profiles)
    pool.close()
    return new_dh
def name_and_scrub_triangle(dh, new_dh_dict, meta_dh=None):
    """
    Re-name every sheet of dh with its most distinguishing words and store the
    renamed DataHolder in new_dh_dict under dh.name.

    Candidate name words are collected per sheet; words shared with other
    sheets that came from the same original Excel sheet are removed so the
    remainder uniquely identifies each triangle.

    :param dh: DataHolder whose sheets are to be renamed
    :param new_dh_dict: dict receiving the renamed DataHolder keyed by dh.name
    :param meta_dh: optional meta DataHolder used to refine naming
    """
    new_dh = DataHolder(dh.name)
    word_set_list = list()
    for ds in dh:
        word_set_list.append(
            SubTriangler.identify_category_name(ds, meta_dh))
    # FIX: use `is not None` (identity) instead of `!= None`; also fold the
    # nested condition into one guard.
    if meta_dh is not None and meta_dh.n > 0:
        SubTriangler.divide_meta_data(dh, meta_dh, word_set_list)
    # Find the most unique name for each sheet
    for i in range(len(word_set_list)):
        ds = dh.data_struct_list[i]
        difference = word_set_list[i].copy()
        for j in range(len(word_set_list)):
            # only compete with sheets originating from the same Excel sheet
            if j != i and ds.orig_sheet_name == dh.data_struct_list[
                    j].orig_sheet_name:
                difference = difference.difference(word_set_list[j])
        if len(difference) > 0:
            # sort for a deterministic name, then strip digits
            stringified = sorted([str(el) for el in difference])
            name = " ".join(stringified)
            name = name.translate(SubTriangler.remove_digits)
        else:
            name = str(i)  # fall back to the positional index
        if ds.name != ds.orig_sheet_name:
            name = ds.name + " " + name
        new_dh.add_sheet(name,
                         ds.df_data,
                         ds.df_profiles,
                         orig_sheet_name=ds.orig_sheet_name)
    new_dh_dict[dh.name] = new_dh
def setUp(self):
    """Create a DataHolder fixture with five trivially small, named sheets.

    Only the sheet names matter to the tests; every sheet gets an identical
    1x1 data frame and profile frame.
    """
    # Removed a large block of dead, commented-out template definitions that
    # duplicated the live fixture in a sibling test case.
    self.names = ["Premium_", "Premium", "Total Outstanding 2004", "Paid",
                  "Total Incurred"]
    self.dh = DataHolder("test")
    # identical frames per sheet; a fresh DataFrame is built for each call,
    # matching the original one-call-per-sheet behavior
    for name in self.names:
        self.dh.add_sheet(name, pd.DataFrame(data=[0]),
                          pd.DataFrame(data=[0]))
def make_sol_dict():
    """
    Run present pipeline and save the merge results.

    Loads a fixed set of raw Excel test files, runs component separation and
    horizontal merging on each, and pickles both the raw (pre-merge) encodings
    and the merge solutions to the test resources directory.

    :return: None (writes merge_solutions.obj and raw_test.obj to disc)
    """
    file_names = ["FORMAT3_Copy of KommuneMTPLforTriangle.xls",
                  "C Triangulations analysis R2017 GC20161109.xls",
                  "EVOLUTION 2017 _ M+F - Triangles cat nat brut net.xls",
                  "Bsp8 _ Dreiecke aus GCNA für CU1.4.1.xls",
                  "Analysis MTPL MOD.xls",
                  "Bsp6 _ Dreiecke aus GCNA für CU1.4.1.xls",
                  "FORMAT6_sinistres.xls",
                  "FORMAT1_LOSSES-MTPL-OVER-500-GROUP-2005_modified.xls"]
    solutions_dict = dict()
    raw_dict = dict()
    for file_name in file_names:
        # load_excel also returns the (possibly normalized) file name
        sr_list, file_name = ExcelLoader.load_excel(
            pdir.RESOURCES_DIR + "/raw_test_files/" + file_name)
        dh = DataHolder()
        for sr in sr_list:
            dh.add_sheet(sr.sheet_name,
                         pd.DataFrame(columns=sr.headers, data=sr.row_vals),
                         pd.DataFrame(columns=sr.headers, data=sr.xls_types),
                         orig_sheet_name=sr.sheet_name)
        dh = SheetPreProcessor.separate_components(dh)
        # snapshot taken before the merge so tests can replay the input
        raw_dict[file_name] = dh.encode()
        dh = HorizontalMerger.horizontal_merge(dh)
        #temp_path = pdir.RESOURCES_DIR + "/temp/"
        #dh.write_excel(temp_path + file_name)
        solutions_dict[file_name] = dh
    solutions_dict = MergePararametersOptimizer.make_ind_col_dict(
        solutions_dict)
    with open(pdir.RESOURCES_DIR + "/test/merge_solutions.obj",
              "wb") as temp_file:
        pickle.dump(solutions_dict, temp_file)
    with open(pdir.RESOURCES_DIR + "/test/raw_test.obj", "wb") as temp_file:
        pickle.dump(raw_dict, temp_file)
def numify_dates(dh):
    """
    Convert marked date columns to plain integers, backing up originals.

    Columns identified as date columns are coerced to int; any column holding
    non-float entries is first copied into a per-sheet "date_backup" meta
    sheet, then its string entries are reduced to their longest numeral.

    :param dh: DataHolder to process in place
    :return: (dh, meta_dh) where meta_dh holds the backed-up date columns
    """
    meta_dh = DataHolder(dh.name + "_meta")
    for ds in dh:
        d_cols = DateColIdentifier.identify_marked_date_cols(ds)
        date_data = ds.df_data[d_cols.index[d_cols]]
        meta_data = pd.DataFrame()
        meta_profiles = pd.DataFrame()
        for name, col in date_data.iteritems():
            # re-tag the whole column as a string date in the profiles
            ds.df_profiles.loc[:, name] = SheetTypeDefinitions.STRING_DATE
            types = col.map(lambda x: isinstance(x, float))
            if types.all():
                #do nothing basically
                ds.df_data[name] = col.astype(int)
            else:
                # keep the raw column so the original values can be restored
                meta_data[name] = col.copy()
                meta_profiles[name] = ds.df_profiles[name].copy()
                temp_col = col.copy()
                # NOTE(review): self-assignment is a no-op; the original
                # comment suggests a workaround for int-typing was intended
                # here — confirm whether this line can be removed.
                temp_col[types] = temp_col[types]  #SOME PROBLEM WITH KEEPING TYEPS INT IN SHEET W
                temp_col[np.logical_not(types)] = temp_col[np.logical_not(types)].map(lambda x: longest_numeral(x))
                temp_col = temp_col.astype(int)
                ds.df_data[name] = temp_col
        # meta_backed = False
        # for index, val in col.iteritems():
        #     if not isinstance(val, int):
        #         if not meta_backed:
        #             meta_backed = True
        #             meta_data[name] = col.copy()
        #             meta_profiles[name] = ds.df_profiles[name].copy()
        #         num = longest_numeral(val)
        #         ds.df_data.loc[index, name] = num
        meta_dh.add_sheet("date_backup", meta_data, meta_profiles,
                          orig_sheet_name=ds.orig_sheet_name)
    return dh, meta_dh
def find_triangle_headers(dh, **kwargs):
    """
    Locate and install header rows for every potential-triangle sheet.

    :param dh: DataHolder to process in place
    :param kwargs:
        test_settings (bool): process sheets regardless of row count (tests)
        return_meta (bool): also return a DataHolder of removed stray rows
    :return: dh, or (dh, meta_dh) when return_meta is True
    """
    # idiomatic kwargs access replaces the membership-check + assignment pairs
    test_settings = kwargs.get('test_settings', False)
    return_meta = kwargs.get('return_meta', False)
    meta_dh = DataHolder(dh.name + "_meta")
    for ds in dh.data_struct_list:
        # only do this for potential triangles:
        if ds.df_data.shape[0] >= pp.MIN_ROWS_TRIANGLE or test_settings:
            headers, pd_ind = TriangleHeaderFinder.find_ds_headers(ds)
            HeaderFinder.insert_headers(headers, pd_ind, ds.df_data,
                                        ds.df_profiles)
            # keep data and profiles column-aligned after header insertion
            ds.df_data = ds.df_data.reindex(sorted(ds.df_data.columns),
                                            axis=1)
            ds.df_profiles = ds.df_profiles.reindex(
                sorted(ds.df_profiles.columns), axis=1)
            # now remove unnecessary rows
            ds = TriangleHeaderFinder.remove_stray_rows(ds, pd_ind)
            if return_meta and ds is not None:
                for split in ds.col_split_ds():
                    # only keep splits that carry actual content
                    if not np.all(split.df_profiles ==
                                  SheetTypeDefinitions.EMPTY_STRING) and not (
                                      np.all(split.df_data == "")):
                        meta_dh.add_ds(split)
    if return_meta:
        return dh, meta_dh
    else:
        return dh
def merge_with_merges_list(dh, merges):
    """
    Vertically concatenate groups of sheets according to a merges list.

    :param dh: DataHolder whose id_dict maps sheet ids to DataStructs
    :param merges: iterable of groups, each group an iterable of sheet ids
    :return: new DataHolder with one sheet per merge group
    """
    new_dh = DataHolder(dh.name)
    for merge in merges:
        profiles = None
        data = None
        name_set = set()
        for ind in merge:
            ds = dh.id_dict[ind]
            name_set.update([ds.name])
            if profiles is None:
                # first sheet of the group seeds the accumulators
                profiles = ds.df_profiles.copy()
                data = ds.df_data.copy()
            else:
                temp_profiles = ds.df_profiles.copy()
                temp_data = ds.df_data.copy()
                # TODO: generalize to other positions then the first position
                #if profiles.shape[1] > temp_profiles.shape[1]:
                #    for header in profiles.columns[temp_profiles.shape[1]:]:
                #        temp_profiles[header] = SheetTypeDefinitions.ZERO_FLOAT
                #        temp_data[header] = 0.0
                profiles = pd.concat([profiles, temp_profiles], sort=True)
                # columns missing from one side become ZERO_FLOAT in profiles
                profiles.fillna(SheetTypeDefinitions.ZERO_FLOAT, inplace=True)
                data = pd.concat([data, temp_data], sort=True)
        # NOTE(review): orig_sheet_name is taken from the LAST ds of the
        # group (loop variable after the loop) — confirm this is intended.
        new_dh.add_sheet("_".join(list(name_set)),
                         data,
                         profiles,
                         orig_sheet_name=ds.orig_sheet_name)
    return new_dh
def vertical_category_division(ds, new_dh_dict, meta_dh):
    """
    Split a sheet into sub-sheets along a category-like string column.

    A column qualifies as a category column when it is mostly strings and its
    most frequent value occurs with a ratio between the configured bounds.
    Rows are grouped by matched category names; tiny or unnamed groups go to
    meta_dh, the rest become sheets in a per-column DataHolder in new_dh_dict.

    :param ds: DataStruct to divide
    :param new_dh_dict: dict of DataHolders keyed by category column name
    :param meta_dh: DataHolder receiving unnamed/too-small splits
    """
    # find the category column
    # Should be strings (for now) (used)
    # Kind of periodic (thus repetitive entries) (used)
    # some entries may change slightly (used)
    # period may change slightly (not checked for now)(should be checked in new if statment)
    # Should get tag matches in dict (not checked for now)
    df_data = ds.df_data
    df_profiles = ds.df_profiles
    orig_name = ds.orig_sheet_name
    for col_name, col in df_data.iteritems():
        # fraction of entries typed as STRING in the profiles
        string_ratio = np.sum(
            df_profiles[col_name].values == SheetTypeDefinitions.STRING
        ) / df_profiles[col_name].values.size
        if string_ratio > pp.MIN_STRING_RATIO_CAT_COL:
            # check periodic potential
            string_col = col.astype(str)
            unique, counts = np.unique(string_col, return_counts=True)
            ratio = np.max(counts) / col.size
            if ratio < pp.MAX_RATIO_LARGEST_CAT and ratio > pp.MIN_RATIO_LARGEST_CAT and len(
                    unique) < pp.MAX_N_CATS:
                # reuse (or create) the DataHolder collecting splits for
                # this category column name across sheets
                if col_name in new_dh_dict:
                    new_dh = new_dh_dict[col_name]
                else:
                    new_dh = DataHolder(col_name)
                    new_dh_dict[col_name] = new_dh
                #period_label_bool = counts * period > string_col.size - period
                # now get the remaining
                #sub_period_label = unique[period_label_bool == False]
                match_dict = SubTriangler.component_finder(unique)
                # now load the new_dh
                for name in match_dict:
                    # rows whose category value belongs to this match group
                    cond = np.array([
                        string_col.values == sub_name
                        for sub_name in match_dict[name]
                    ]).any(axis=0)
                    sub_df_data = df_data[cond].drop(
                        columns=[string_col.name])
                    sub_df_profiles = df_profiles[cond].drop(
                        columns=[string_col.name])
                    if name == "" or np.sum(cond) < 4:
                        # unnamed or too small: keep only non-empty column
                        # splits, routed to the meta holder
                        new_ds = DataStruct(sub_df_data,
                                            sub_df_profiles,
                                            name,
                                            orig_sheet_name=orig_name)
                        for split in new_ds.col_split_ds():
                            if not np.all(split.df_profiles ==
                                          SheetTypeDefinitions.EMPTY_STRING
                                          ) and not (np.all(
                                              split.df_data == "")):
                                meta_dh.add_ds(split)
                    else:
                        new_dh.add_sheet(ds.name + " - " + name,
                                         sub_df_data,
                                         sub_df_profiles,
                                         orig_sheet_name=orig_name)
def testTurnTriangle(self):
    """turn_triangle on the triangle columns must equal a plain transpose
    of those columns, for every sheet addressed by tr_cols_dict."""
    dh = DataHolder("test")
    data = pd.DataFrame(data={'col1': ["", "", 0], 'col2': [0, 0, 2],
                              '1991': [0, 0, 0], '1992': [1, 0, 0]})
    prof = pd.DataFrame(data={'col1': [1, 1, 6], 'col2': [6, 6, 6],
                              '1991': [6, 6, 6], '1992': [2, 6, 6]})
    dh.add_sheet("test", data, prof)
    data = pd.DataFrame(data={'col1': ["", "", 0], 'col2': [1, 1, 0],
                              '1991': [0, 0, 0], '1992': [1, 0, 0]})
    prof = pd.DataFrame(data={'col1': [1, 1, 6], 'col2': [6, 6, 6],
                              '1991': [6, 6, 6], '1992': [2, 6, 6]})
    dh.add_sheet("test", data, prof)
    data = pd.DataFrame(data={'col1': ["", "", 0], 'col2': [3, 1, 0],
                              '1991': [0, 0, 0], '1992-': [1, 0, 0]})
    prof = pd.DataFrame(data={'col1': [1, 1, 6], 'col2': [6, 6, 6],
                              '1991': [6, 6, 6], '1992-': [2, 6, 6]})
    dh.add_sheet("test", data, prof)
    # keys are (columns tuple + shape tuple); sheet 1 has no entry so it
    # should pass through untouched.
    # NOTE(review): the second key uses data_struct_list[0].df_data.shape with
    # data_struct_list[2]'s columns — likely a copy-paste slip; harmless here
    # only because all three fixture sheets share the same shape. Confirm.
    tr_cols_dict = {tuple(dh.data_struct_list[0].df_data.columns) + dh.data_struct_list[0].df_data.shape:
                        pd.Series([False, True, True, True],
                                  index=dh.data_struct_list[0].df_data.columns),
                    tuple(dh.data_struct_list[2].df_data.columns) + dh.data_struct_list[0].df_data.shape:
                        pd.Series([False, True, True, True],
                                  index=dh.data_struct_list[2].df_data.columns)
                    }
    dh_copy = dh.copy_without_memory()
    TriangleStripper.turn_triangle(dh, tr_cols_dict, alt_min_score=0.6)
    # build the expected result: transpose the triangle block of each copy
    for ds in dh_copy.data_struct_list:
        tr_cols = tr_cols_dict[tuple(ds.df_data.columns) + ds.df_data.shape]
        tri_part = ds.df_data[tr_cols.index[tr_cols]].values
        ds.df_data[tr_cols.index[tr_cols]] = np.transpose(tri_part)
    for ds, ds_copy in zip(dh.data_struct_list, dh_copy.data_struct_list):
        self.assertTrue(ds.df_data.equals(ds_copy.df_data))
        self.assertTrue(ds.df_profiles.equals(ds_copy.df_profiles))
def test_fill_hollow_str_cols(self):
    """fill_hollow_str_cols must forward-fill sparse string columns (col1)
    while leaving columns that do not qualify as hollow ('1992') unchanged."""
    dh = DataHolder("test")
    df_data = pd.DataFrame(data={'col1': [1, "", "", "j", "", "", "6", "", "b", "g",
                                          "", "", "j", "", "", "6", "", "b", "", ""],
                                 '1992': ["", "", "", "j", "", "", "6", "", "b", "g",
                                          "", "", "j", "", "", "6", "", "b", "g", "hrumpff"]})
    df_profiles = pd.DataFrame(data={'col1': [2, 0, 0, 1, 0, 0, 1, 0, 1, 1,
                                              0, 0, 1, 0, 0, 1, 0, 1, 0, 0],
                                     '1992': [0, 0, 0, 1, 0, 0, 1, 0, 1, 1,
                                              0, 0, 1, 0, 0, 1, 0, 1, 1, 1]})
    dh.add_sheet("test", df_data, df_profiles)
    StringFiller.fill_hollow_str_cols(dh)
    # expected: col1 values and profiles forward-filled over the gaps
    dh_sol = DataHolder("test")
    df_data_sol = pd.DataFrame(data={'col1': [1, 1, 1, "j", "j", "j", "6", "6", "b", "g",
                                              "g", "g", "j", "j", "j", "6", "6", "b", "b", "b"],
                                     '1992': ["", "", "", "j", "j", "j", "6", "6", "b", "g",
                                              "g", "g", "j", "j", "j", "6", "6", "b", "g", "hrumpff"]})
    df_profiles_sol = pd.DataFrame(data={'col1': [2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
                                                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                                         '1992': [0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
                                                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})
    dh_sol.add_sheet("test", df_data_sol, df_profiles_sol)
    self.assertTrue(dh.equals(dh_sol))
def setUp(self):
    """Build a three-sheet DataHolder fixture: string data frames paired with
    numeric profile frames, two sheets sharing the first name."""
    self.names = ["first", "second"]
    self.dh = DataHolder("test")
    sheet_specs = [
        (self.names[0],
         {'col1': ["1", "2"], 'col2': ["3", "4"]},
         {'col1': [1, 2], 'col2': [3, 4]}, "1"),
        (self.names[0],
         {'col1': ["1", "2"], 'col2': ["1", "1"]},
         {'col1': [1, 2], 'col2': [1, 1]}, "2"),
        (self.names[1],
         {'col1': ["1", "2"], 'col2': ["15", "16"]},
         {'col1': [1, 2], 'col2': [15, 16]}, "2"),
    ]
    for sheet_name, data_dict, profile_dict, orig in sheet_specs:
        self.dh.add_sheet(sheet_name,
                          pd.DataFrame(data=data_dict),
                          pd.DataFrame(data=profile_dict),
                          orig_sheet_name=orig)
def main(file, settings):
    """Run the table-triangle pipeline on a single Excel file and write the
    resulting DataHolder back out as an Excel file in the temp directory."""
    print(file)
    sr_list, file_name = ExcelLoader.load_excel(file)
    dh = DataHolder(file_name)
    # one sheet per sheet-reader result, preserving the original sheet name
    for sr in sr_list:
        df_data = pd.DataFrame(columns=sr.headers, data=sr.row_vals)
        df_profiles = pd.DataFrame(columns=sr.headers, data=sr.xls_types)
        dh.add_sheet(sr.sheet_name, df_data, df_profiles,
                     orig_sheet_name=sr.sheet_name)
    dummy, new_dh = TrianglePipeline.table_triangle_pipeline_dh(dh)
    temp_path = pdir.RESOURCES_DIR + "/temp/"
    new_dh.write_excel(temp_path + file_name)
def run_test_per_file_name(file_name):
    """Load one Excel file into a DataHolder, pre-strip it, and run date
    column identification with the svm_writer logger attached."""
    print(file_name)
    logger = logging.getLogger("svm_writer")
    sr_list, file_name = ExcelLoader.load_excel(file_name)
    dh = DataHolder(file_name.split(".")[0])
    for sr in sr_list:
        df_data = pd.DataFrame(columns=sr.headers, data=sr.row_vals)
        df_profiles = pd.DataFrame(columns=sr.headers, data=sr.xls_types)
        dh.add_sheet(sr.sheet_name, df_data, df_profiles,
                     orig_sheet_name=sr.sheet_name)
    dh = SheetPreProcessor.pre_strip(dh)
    DateColIdentifier.identify_and_gen_date_cols(dh,
                                                 replace_col=False,
                                                 svm_logger=logger)
def testFindTriangleHeaders(self):
    """find_triangle_headers must promote the first data row into the column
    headers (placeholder suffix replaced by the found header value)."""
    names = ["First", "Second"]
    dh = DataHolder("test")
    # column names carry the placeholder suffix that the finder replaces
    d1 = pd.DataFrame(data={'col1' + ps.HEADER_PLACE_HOLDER: ["1", "2", 1],
                            'col2' + ps.HEADER_PLACE_HOLDER: [3, "2", "3b"],
                            'col3' + ps.HEADER_PLACE_HOLDER: ["brum2", "4", 4],
                            'col4' + ps.HEADER_PLACE_HOLDER: [24, "4", "brum25"],
                            })
    # profiles all set to 1 (string type) so the first row wins the header vote
    d2 = d1.copy()
    d2.iloc[:, :]=1
    dh.add_sheet(names[0], d1, d2)
    # test_settings=True bypasses the minimum-row-count gate
    dh = TriangleHeaderFinder.find_triangle_headers(dh, test_settings=True)
    headers = list(dh.data_struct_list[0].df_data.columns)
    self.assertEqual(headers, ["col11", "col23", "col3brum2", "col424"])
def setUp(self):
    """Define two output-triangle templates (incurred = paid + reserved, plus
    a premium triangle) and a DataHolder with five trivially small sheets."""
    incurred_categories = [
        {'name': 'Claim - Incurred',
         'type': 'sum',
         'from': [ps.CAT_PAID_NAME, ps.CAT_RESERVED_NAME]},
        {'name': ps.CAT_PAID_NAME,
         'type': 'independent',
         'from': []},
        {'name': ps.CAT_RESERVED_NAME,
         'type': 'independent',
         'from': []},
    ]
    premium_categories = [
        {'name': ps.CAT_PREMIUM_NAME,
         'type': 'independent',
         'from': []},
    ]
    self.trngs = [
        {'headers': ["Year", "unit"],
         'categories': incurred_categories,
         "group_id": 0,
         "type": "single loss"},
        {'headers': ["Year", "unit"],
         'categories': premium_categories,
         "group_id": 0,
         "type": "single loss"},
    ]
    self.names = ["Premium_", "Premium", "Total Outstanding 2004", "Paid",
                  "Total Incurred"]
    self.dh = DataHolder()
    # identical 1x1 frames per sheet; only the names matter to the tests
    for sheet_name in self.names:
        self.dh.add_sheet(sheet_name, pd.DataFrame(data=[0]),
                          pd.DataFrame(data=[0]))
def find_headers(dh):
    """
    Pick the most string-like early row of each sheet as its header row.

    For the first N_POSSIBLE_HEADER_ROWS rows, the count of string-typed
    cells (profile == 1) is scored, penalized by duplicate values in the row;
    the best row is installed as column headers and the leading rows above it
    are moved to a meta DataHolder.

    :param dh: DataHolder to process in place
    :return: (dh, meta_dh) where meta_dh holds the removed leading rows
    """
    meta_dh = DataHolder(dh.name + "_meta")
    for ds in dh:
        df_data, df_profiles = ds.df_data, ds.df_profiles
        # per-row count of string-typed cells among the candidate rows
        bin_mat = df_profiles.values[:pp.N_POSSIBLE_HEADER_ROWS, ] == 1
        one_nums = np.sum(bin_mat, axis=1)
        # subtract identical strings
        for i in range(pp.N_POSSIBLE_HEADER_ROWS):
            sub = df_data.shape[1] - len(df_data.iloc[i, :].unique())
            one_nums[i] -= sub
        header_ind = np.argmax(one_nums)
        # translate positional winner back to the pandas index label
        pd_ind = df_profiles.index[header_ind]
        headers = df_data.loc[[pd_ind]]
        HeaderFinder.insert_headers(headers, pd_ind, df_data, df_profiles)
        meta_ds = HeaderFinder.remove_leading_rows(ds, pd_ind)
        meta_dh.add_ds(meta_ds)
    return dh, meta_dh
def post(self, request):
    """Decode the posted sheet readers and return an encoded DataHolder
    containing only the sheets the user selected."""
    sr_list = jsonpickle.decode(request.data['sr_list'])
    dhName = request.data['dhName']
    selected_sheets = request.data['selected_sheets']
    data_holder = DataHolder(dhName)
    for sr in sr_list:
        # skip sheets that were not selected
        if sr.sheet_name not in selected_sheets:
            continue
        frame = pd.DataFrame(columns=sr.headers, data=sr.row_vals)
        profile_frame = pd.DataFrame(columns=sr.headers, data=sr.xls_types)
        data_holder.add_sheet(sr.sheet_name,
                              frame,
                              profile_frame,
                              orig_sheet_name=sr.sheet_name)
    return Response(data_holder.encode(), status=200)
def test_col_identification(self):
    """Column type identification should tag the date, triangle-element and
    id-element columns in the expected profile positions."""
    df_data = pd.DataFrame(data={'col1': ["1", "", "1991", "1992", "2007", "rew", "1993", "1994", "1995x"],
                                 'col2': [43, 1994, 2015, 1994, 7, 2015, 1994, 1999, 2015],
                                 '1991': [1993, 1, 6, 1993, 1, 6, 1993, 1, 6],
                                 '1992': ["g", "r", "h", "j", "t", "f", "6", "p", "6"],
                                 '1993': ["1", "1993", "6", "1993", "", "rew", "1993", "1994", ""]})
    df_profiles = pd.DataFrame(data={'col1': [1, 1, 1, 1, 1, 1, 1, 1, 1],
                                     'col2': [2, 2, 2, 2, 2, 2, 2, 2, 2],
                                     '1991': [2, 2, 2, 2, 2, 2, 2, 2, 2],
                                     '1992': [1, 1, 1, 1, 1, 1, 1, 1, 1],
                                     '1993': [1, 1, 1, 1, 1, 1, 1, 1, 1]})
    dh = DataHolder('test')
    dh.add_sheet('test', df_data, df_profiles)
    dh = DateFiller.identify_and_gen_date_cols(dh, replace_col=False)
    dh = ColTypeIdentifier.identify_col_types(dh)
    profiles = dh.data_struct_list[0].df_profiles
    # FIX: the original used assertTrue(value, expected), which treats the
    # expected profile constant as the failure *message* and passes for any
    # truthy value; assertEqual performs the intended comparison.
    self.assertEqual(profiles.iloc[1, 1], SheetTypeDefinitions.STRING_DATE)
    self.assertEqual(profiles.iloc[0, 2], SheetTypeDefinitions.TRIANGLE_ELEMENT)
    self.assertEqual(profiles.iloc[0, 3], SheetTypeDefinitions.ID_ELEMENT)
def chop_triangles_horizontally(dh):
    """
    checks for repeating header and splits ds:s
    :param dh: DataHolder
    :return: DataHolder (dh unchanged when nothing needs chopping)
    """
    chop, chop_lists = TriangleChopper.make_occurrence_list(dh)
    if not chop:
        return dh
    else:
        new_dh = DataHolder(dh.name)
        for ind, ds in enumerate(dh):
            occurrence_list = chop_lists[ind]
            if any(occurrence_list):
                # one new sheet per occurrence group i, always keeping the
                # shared columns (occurrence 0) alongside group i's columns
                for i in range(1, np.max(occurrence_list) + 1):
                    bools = np.logical_or(occurrence_list == 0,
                                          occurrence_list == i)
                    df_data = ds.df_data[ds.df_data.columns[bools]].copy()
                    df_profiles = ds.df_profiles[
                        ds.df_profiles.columns[bools]].copy()
                    new_dh.add_sheet(ds.name,
                                     df_data,
                                     df_profiles,
                                     orig_sheet_name=ds.orig_sheet_name)
            else:
                # nothing repeats in this sheet: carry it over unchanged
                new_dh.add_ds(ds)
        return new_dh
def post(self, request):
    """
    Apply a user-requested change to the output triangles and rewrite the
    output Excel file.

    Expects in request.data: "output" (triangle forms), "input" (encoded
    DataHolder), "change" (the modification to apply) and "filename".
    Returns the re-parsed output triangles.
    """
    user_defined_triangles = request.data.get("output")
    input_json = request.data.get("input")
    dh = DataHolder.decode(input_json)
    #dh, group_ids, sheet_names = RowParser.set_card_ids(user_defined_triangles, dh)
    change = request.data.get("change")
    filename = request.data.get("filename")
    # Update connection with the change variable
    RowParser.make_changes(dh, user_defined_triangles, change)
    user_defined_triangles = RowParser.parse_output_from_triangle_forms(
        user_defined_triangles, dh)
    SheetWriter.trngs_to_existing_excel(
        user_defined_triangles, pdir.TEMP_DIR + ps.OUTPUT_NAME + filename)
    return Response({'output': user_defined_triangles})
def get_context_data(self, **kwargs):
    """
    Add cleaning-pipeline diff dictionaries to the template context.

    When the session holds an encoded 'data_holder', it is decoded and run
    through clean_data_dh; otherwise the user's stored DataSheet objects are
    cleaned instead.
    """
    context = super().get_context_data(**kwargs)
    if self.request.session.get('data_holder'):
        # session path: rebuild the DataHolder from the pickled sheet readers
        sr_list = jsonpickle.decode(
            self.request.session.get('data_holder'))
        data_holder = DataHolder()
        for sr in sr_list:
            data_holder.add_sheet(
                sr.sheet_name,
                pd.DataFrame(columns=sr.headers, data=sr.row_vals),
                pd.DataFrame(columns=sr.headers, data=sr.xls_types))
        context[
            "diff_dicts"], data_holder = CleaningPipeline.clean_data_dh(
                data_holder)
    else:
        # database path: clean the user's stored sheets
        sheets = DataSheet.objects.filter(
            owner=self.request.user).order_by('sheet_name')
        context["diff_dicts"], data_holder = CleaningPipeline.clean_data(
            sheets)
    #sheets = DataSheet.objects.filter(owner=self.request.user).order_by('sheet_name')
    #context["diff_dicts"], mem_dict = CleaningPipeline.clean_data(sheets)
    return context
def find_triangles(dh, **kwargs):
    """
    Split a DataHolder into triangle-shaped sheets and the rest.

    After the direct triangle test, sheets similar to found triangles are
    promoted from the rest holder in a second pass.

    :param dh: DataHolder to search
    :param kwargs: return_meta (bool) — also return the non-triangular rest;
        remaining kwargs are forwarded to TriangleFinder.is_triangle
    :return: triangle_dh, or (triangle_dh, rest_dh) when return_meta is True
    """
    # idiomatic kwargs access instead of membership check + assignment
    return_meta = kwargs.get('return_meta', False)
    triangle_dh = DataHolder(dh.name)
    rest_dh = DataHolder(dh.name + '_non-triangular')
    for dh_ind, ds in enumerate(dh.data_struct_list):
        df_data, df_profiles = ds.df_data, ds.df_profiles
        # now select triangles in som smart way
        # FIX: renamed local from `bool`, which shadowed the builtin
        is_tri = TriangleFinder.is_triangle(ds, **kwargs)
        if is_tri:
            TriangleFinder.add_triangle_to_dh(ds, triangle_dh)
        else:
            rest_dh.add_sheet(ds.name,
                              df_data,
                              df_profiles,
                              orig_sheet_name=ds.orig_sheet_name)
    # Now get the triangle similiar data structs
    triangle_similar = TriangleFinder.find_triangles_by_similarity(
        triangle_dh, rest_dh)
    if len(triangle_similar) > 0:
        # rebuild rest_dh without the promoted sheets
        rest_copy = rest_dh.copy_without_memory()
        rest_dh = DataHolder(rest_copy.name)
        for ds in rest_copy:
            if ds.id in triangle_similar:
                TriangleFinder.add_triangle_to_dh(ds, triangle_dh)
            else:
                rest_dh.add_sheet(ds.name,
                                  ds.df_data,
                                  ds.df_profiles,
                                  orig_sheet_name=ds.orig_sheet_name)
    if return_meta:
        return triangle_dh, rest_dh
    else:
        return triangle_dh
def run_test_per_file_name(in_obj, in_tup, form):
    """
    Performs a fixture test for one file. The encapsulators determine the
    scope of the test.

    :param in_obj: test-case object handed to ToDiscComparer for assertions
    :param in_tup: (file name, info dict) — info dict supplies tri_type and
        n_outputs for the "triangle" form
    :param form: pipeline selector: "triangle_table", "triangle" or "cleaning"
    :return: None (comparison against the on-disc solution raises on mismatch)
    """
    if not pp.LOG_SVM_FEATURES:
        print(in_tup)
    sr_list, file_name = ExcelLoader.load_excel(
        pdir.RESOURCES_DIR + "/raw_test_files/" + in_tup[0])
    dh = DataHolder(file_name.split(".")[0])
    for sr in sr_list:
        dh.add_sheet(sr.sheet_name,
                     pd.DataFrame(columns=sr.headers, data=sr.row_vals),
                     pd.DataFrame(columns=sr.headers, data=sr.xls_types),
                     orig_sheet_name=sr.sheet_name)
    #Choose encapsulator class to determine test scope
    #dhce = DataHolderCallTestEncapsulator(pdir.RESOURCES_DIR + '/temp/pickles/', pdir.RESOURCES_DIR + '/test/pickles/')
    #dhce = DataHolderCallSaveEncapsulator(pdir.RESOURCES_DIR + '/test/pickles/')
    #dhce = DataHolderCallOutputEncapsulator(pdir.RESOURCES_DIR + '/left_triangles/' + in_tup[0], in_tup[1])
    dhce = DataHolderCallEncapsulator()
    #dhce = DataHolderCallTimeEncapsulator()
    if form == "triangle_table":
        dummy, dh = TrianglePipeline.table_triangle_pipeline_dh(dh, dhce)
    elif form == "triangle":
        dummy, dh = TrianglePipeline.triangle_pipeline_dh(
            dh,
            dhce,
            tri_type=in_tup[1]["tri_type"],
            n_outputs=in_tup[1]["n_outputs"])
    elif form == "cleaning":
        dummy, dh = CleaningPipeline.clean_data_dh(dh)
    ToDiscComparer.compare_to_disc(in_obj, file_name, dh)
def test_serialization(self):
    """Round-trip the fixture DataHolder through encode/decode and verify
    data frames, profile frames, roles and sheet ids are preserved."""
    first_ds = self.dh.data_struct_list[0]
    first_ds.roles.append("Claims Paid")
    # sort in place so a non-trivial row order must survive the round trip
    first_ds.df_data.sort_values("col1", ascending=False, inplace=True)
    serialized = self.dh.encode()
    data_framed = DataHolder.decode(serialized)
    # data frames of all three sheets must match exactly
    for ind in range(3):
        assert_frame_equal(self.dh.data_struct_list[ind].df_data,
                           data_framed.data_struct_list[ind].df_data)
    # profile frames likewise
    for ind in range(3):
        assert_frame_equal(self.dh.data_struct_list[ind].df_profiles,
                           data_framed.data_struct_list[ind].df_profiles)
    self.assertEqual(data_framed.data_struct_list[0].roles[0], "Claims Paid")
    # Test conservation of ids
    for ind in range(len(self.dh.data_struct_list)):
        self.assertEqual(data_framed.data_struct_list[ind].id,
                         self.dh.data_struct_list[ind].id)
def encapsulate_call(self, function, dh, **kwargs):
    """
    Run one pipeline step and assert its result equals the pickled solution.

    The step counter selects the solution file; on mismatch both the
    candidate and the solution are written to Excel for manual diffing
    before the assertion fails.

    :param function: pipeline step taking a DataHolder (plus kwargs)
    :param dh: DataHolder input for the step
    :return: the step's raw output (DataHolder or tuple), unchanged
    """
    out = function(dh, **kwargs)
    # steps may return either a DataHolder or a tuple starting with one
    if isinstance(out, DataHolder):
        dh = out
    elif isinstance(out, tuple):
        dh = out[0]
    # Read the true solution
    sol = DataHolder.from_pickle_file(self.sol_path + dh.name + "_" +
                                      str(self.counter) + ".pickle")
    dh = dh.merge_in_original_sheets(save_sheet_names=True)
    if not dh.equals(sol):
        # dump both sides for manual inspection before failing
        print(function)
        dh.write_excel(self.out_path + "candidate.xls")
        sol.write_excel(self.out_path + "solution.xls")
    self.assertTrue(dh.equals(sol))
    self.counter += 1
    return out
def perform_horizontal_merge(dh, distances): new_dh = DataHolder(dh.name) # make a greedy merge merged_set = set() for part in distances: if part[0] > pp.MAX_HORIZONTAL_MERGE_DISTANCE: break merge = [part[1]] + part[2] if len(set(merge).intersection(merged_set)) == 0: if len(part[2]) > 1: # Merge vertically! df_data_list = [ dh.id_dict[df_id].df_data for df_id in part[2] ] df_profiles_list = [ dh.id_dict[df_id].df_profiles for df_id in part[2] ] df_data = pd.concat(df_data_list, axis=0, sort=True) df_profiles = pd.concat(df_profiles_list, axis=0, sort=True) else: df_data = dh.id_dict[part[2][0]].df_data df_profiles = dh.id_dict[part[2][0]].df_profiles df_data = pd.concat([dh.id_dict[part[1]].df_data, df_data], axis=1, sort=True) df_profiles = pd.concat( [dh.id_dict[part[1]].df_profiles, df_profiles], axis=1, sort=True) df_data = df_data.reindex(sorted(df_data.columns), axis=1) df_profiles = df_profiles.reindex(sorted(df_profiles.columns), axis=1) merged_set.update(merge) new_dh.add_sheet( dh.id_dict[merge[0]].name, df_data, df_profiles, orig_sheet_name=dh.id_dict[merge[0]].orig_sheet_name) # add the remaining for id_key in dh.id_dict: if id_key not in merged_set: new_dh.add_sheet( dh.id_dict[id_key].name, dh.id_dict[id_key].df_data, dh.id_dict[id_key].df_profiles, orig_sheet_name=dh.id_dict[id_key].orig_sheet_name) return new_dh
def perform_vertical_chop(dh, chop_bools, chop_lists):
    """
    Split sheets into row-wise segments at the given cut positions.

    :param dh: DataHolder to process
    :param chop_bools: per-sheet flags — whether the sheet should be chopped
    :param chop_lists: per-sheet arrays of row cut positions
    :return: new DataHolder with chopped (or carried-over) sheets
    """
    new_dh = DataHolder(dh.name)
    for ind, ds in enumerate(dh):
        if chop_bools[ind]:
            cut = chop_lists[ind]
            # Don't cut too much
            if len(cut) < pp.MAX_NUM_VERTICAL_CHOPS:
                cut = [0] + cut.tolist()
                # NOTE(review): segments run only up to the last cut position;
                # rows after cut[-1] are not added — confirm the cut lists
                # always end at the sheet's final row.
                for i in range(len(cut) - 1):
                    temp_df_data = ds.df_data.iloc[cut[i]:cut[i + 1], :]
                    temp_df_profiles = ds.df_profiles.iloc[
                        cut[i]:cut[i + 1], :]
                    new_ds = DataStruct(temp_df_data,
                                        temp_df_profiles,
                                        ds.name,
                                        orig_sheet_name=ds.orig_sheet_name)
                    new_dh.add_ds(new_ds)
            else:
                # too many cuts: keep the sheet intact
                new_dh.add_ds(ds)
        else:
            new_dh.add_ds(ds)
    return new_dh
def testDistrMatching(self):
    """A sheet whose values resemble the reference distribution must score a
    better match than a dissimilar one."""
    # normalize the reference sample: scale by MAD, then center on the mean
    reference = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5])
    reference = reference / mad(reference)
    reference = reference - np.mean(reference)
    distr = {ps.CAT_RESERVED_NAME: reference}
    dh = DataHolder("test")
    dh.add_sheet(self.names[0],
                 pd.DataFrame(data=[1, 1, 1, 1, 2, 2, 2, 3, 3, 4]),
                 pd.DataFrame(data=[SheetTypeDefinitions.TRIANGLE_ELEMENT] * 10))
    dh.add_sheet(self.names[0],
                 pd.DataFrame(data=[5, 9, 3, 7, 18]),
                 pd.DataFrame(data=[SheetTypeDefinitions.TRIANGLE_ELEMENT] * 5))
    matches = [
        InputMatcher.compare_with_distribution(sheet_id,
                                               ps.CAT_RESERVED_NAME, dh,
                                               distr)
        for sheet_id in dh.id_dict
    ]
    # First entry is more similar to the reference, therefore, first match should be better
    self.assertTrue(matches[0] > matches[1])
def test_output(self): path = pdir.RESOURCES_DIR + "left_triangles/outtake/" #path = pdir.RESOURCES_DIR + "left_triangles/" for file in os.listdir(path): if file.endswith(".pickle"): print(file) with open(path + file, 'rb') as f: read_data = pickle.load(f) dh = DataHolder.decode(read_data["DataHolder"]) info_dict = read_data["extra_content"] print(info_dict) user_defined_triangles = OutputTriangleParser.generate_output_triangles( info_dict) data_holder, group_ids, sheet_names = RowParser.set_card_ids( user_defined_triangles, dh) user_defined_triangles = InputMatcher.match_triangles_to_output( user_defined_triangles, data_holder) user_defined_triangles = RowParser.parse_output_from_triangle_forms( user_defined_triangles, data_holder) head, sep, tail = file.partition(".xls") SheetWriter.trngs_to_excel(user_defined_triangles, head + sep)
def post(self, request):
    """
    Match an uploaded DataHolder against user-defined output triangles.

    Expects 'str_data_holder' (encoded DataHolder) and 'templates' (output
    triangle forms) in request.data. Returns group ids, the filled output
    triangles and the unit triangle list; on DataHolderException a structured
    error response is returned instead.

    :raises ValueError: when no DataHolder is posted or it has no sheets
    """
    # Need to post - str_data_holder, output triangles (templates)
    str_data_holder = request.data.get('str_data_holder')
    data_holder = DataHolder.decode(str_data_holder)
    response_data = {}
    if data_holder is None:
        raise ValueError("No data holder found")
    elif data_holder.n == 0:
        raise ValueError("No sheets in data holder")
    #Recieve triangle formats
    user_defined_triangles = request.data.get('templates')
    try:
        #DataHolder manipulation
        data_holder, group_ids, sheet_names = RowParser.set_card_ids(
            user_defined_triangles, data_holder)
        user_defined_triangles = InputMatcher.match_triangles_to_output(
            user_defined_triangles, data_holder)
        user_defined_triangles = RowParser.parse_output_from_triangle_forms(
            user_defined_triangles, data_holder)
    except DataHolderException as err:
        # ship the exception's message and holder back to the client
        data = {}
        data['message'] = err.message
        data['dh'] = err.dh
        return Response({'response_error': data})
    #SheetWriter.trngs_to_existing_excel(user_defined_triangles, pdir.TEMP_DIR + ps.OUTPUT_NAME + filename)
    response_data["group_ids"] = group_ids
    response_data['output_triangles'] = user_defined_triangles
    response_data[
        "unit_triangles"] = ChangeDimensionAPIView.make_unit_triangle_list(
            data_holder)
    return Response({'data': response_data})