def explore_1():
    dataset_name = "rki_ed_1"
    rdc_threshold = 0.3
    min_instances_slice = 0.01

    # Learn and persist the SPN only if it does not exist yet
    if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
        df, value_dict, parametric_types = ed_data.get_rki_ed_1()
        spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name,
                                           [rdc_threshold], [min_instances_slice], value_dict)

    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)

    # Marginalize out all variables except the listed scope indices
    spn = fn.marg(spn, keep=[0, 2, 3, 4, 5])
    fn.print_statistics(spn)

    p = io.get_path("_results/ed_data_explore")
    # vz.visualize_overall_distribution(spn, value_dict)

    from spn.experiments.AQP.Ranges import NominalRange
    # Compare weekends (values 5, 6 of variable 0) against weekdays (values 0-4)
    target_conds = [{0: NominalRange([5, 6])},
                    {0: NominalRange([0, 1, 2, 3, 4])}]
    # target_conds = [{0: NominalRange([5, 6]), 1: NominalRange([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])},
    #                 {0: NominalRange([0, 1, 2, 3, 4]), 1: NominalRange([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])}]
    vz.visualize_target_based_conds_overall_distribution_compact(
        spn, target_conds, value_dict,
        target_names=["Weekend", "During the week"],
        save_path=p + dataset_name + "_weekend_measures.pdf")
def test_rule_clustering():
    # TODO: spflow automatically summarizes chains of sums
    dataset_name = 'gender'
    recalc_SPN = True
    rdc_threshold, min_instances_slice = 0.2, 0.1

    if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice) or recalc_SPN:
        print("Creating SPN ...")
        # Get data
        df, value_dict, parametric_types = synthetic_data.get_synthetic_data(dataset_name)
        # Create the SPN and save it to a file
        spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name,
                                           [rdc_threshold], [min_instances_slice],
                                           clustering='rule_clustering')

    # Load the SPN
    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)
    fn.print_statistics(spn)
def explore_2():
    dataset_name = "rki_ed_2"
    rdc_threshold = 0.3
    min_instances_slice = 0.01

    if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
        df, value_dict, parametric_types = ed_data.get_rki_ed_2()
        spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name,
                                           [rdc_threshold], [min_instances_slice], value_dict)

    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)
    fn.print_statistics(spn)
    vz.visualize_overall_distribution(spn, value_dict)
def explore_3():
    dataset_name = "rki_ed_3"
    rdc_threshold = 0.3
    min_instances_slice = 0.01

    if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
        df, value_dict, parametric_types = ed_data.get_rki_ed_3()
        spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name,
                                           [rdc_threshold], [min_instances_slice], value_dict)

    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)
    fn.print_statistics(spn)
    print(value_dict)

    p = io.get_path("_results/ed_data_explore")
    # Pairwise likeliness heatmaps of variable 0 against variables 1, 5, and 6
    vz.visualize_likeliness_heatmap(spn, target_id_x=0, target_id_y=1, value_dict=value_dict,
                                    save_path=p + dataset_name + "_hour_dep.pdf")
    vz.visualize_likeliness_heatmap(spn, target_id_x=0, target_id_y=5, value_dict=value_dict,
                                    save_path=p + dataset_name + "_hour_day.pdf")
    vz.visualize_likeliness_heatmap(spn, target_id_x=0, target_id_y=6, value_dict=value_dict,
                                    save_path=p + dataset_name + "_hour_month.pdf")
    print('Data not compatible for topdown')
    # labeled = rule_ex.df2labeled(intra_df, value_dict)
    rules_intra = intra_df.head(len(topdown_rules))

    # Evaluate both rule sets on the same metrics
    metrics = ['sup', 'conf', 'conviction', 'F']
    eval_intra = evaluate_rules(df, rules_intra, value_dict, metrics=metrics, beta=beta)
    eval_top = evaluate_rules(onehot_df, topdown_rules, vd_onehot, metrics=metrics, beta=beta)
    eval_intra['method'] = 'IntraNode'
    eval_top['method'] = 'Topdown'

    comp = pd.concat([eval_intra, eval_top])
    comp.drop_duplicates(['head', 'body'], inplace=True)
    comp = comp.sort_values('F', ascending=False)
    comp.to_csv(res_path + 'comparison_{}.csv'.format(dataset_name))

    # mean F of the first N rules
    print(comp.groupby('method').mean())

    hyperparam_grid_search(df, spn, value_dict)
    fn.print_statistics(spn)
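# A minimal sketch of an entry point for running the experiments above.
# The function names match those defined in this module, but which ones
# to run, and in what order, is an assumption, not part of the original.
if __name__ == '__main__':
    explore_1()
    explore_2()
    explore_3()
    test_rule_clustering()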