def test_rule_clustering():
    # todo: spflow automatically summarizes chains of sums
    dataset_name = 'gender'
    recalc_SPN = True
    rdc_threshold, min_instances_slice = 0.2, 0.1
    if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice) or recalc_SPN:
        print("Creating SPN ...")
        # get data
        df, value_dict, parametric_types = synthetic_data.get_synthetic_data(dataset_name)
        # Creates the SPN and saves it to a file
        spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name,
                                           [rdc_threshold], [min_instances_slice],
                                           clustering='rule_clustering')
    # Load SPN
    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)
    fn.print_statistics(spn)
def explore_1():
    dataset_name = "rki_ed_1"
    rdc_threshold = 0.3
    min_instances_slice = 0.01
    if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
        df, value_dict, parametric_types = ed_data.get_rki_ed_1()
        spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name,
                                           [rdc_threshold], [min_instances_slice], value_dict)
    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)
    spn = fn.marg(spn, keep=[0, 2, 3, 4, 5])
    fn.print_statistics(spn)

    p = io.get_path("_results/ed_data_explore")
    # vz.visualize_overall_distribution(spn, value_dict)

    from spn.experiments.AQP.Ranges import NominalRange
    target_conds = [{0: NominalRange([5, 6])}, {0: NominalRange([0, 1, 2, 3, 4])}]
    # target_conds = [{0: NominalRange([5, 6]), 1: NominalRange([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])},
    #                 {0: NominalRange([0, 1, 2, 3, 4]), 1: NominalRange([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])}]
    vz.visualize_target_based_conds_overall_distribution_compact(
        spn, target_conds, value_dict,
        target_names=["Weekend", "Weekday"],
        save_path=p + dataset_name + "_weekend_measures.pdf")
def spn_hyperparam_opt(df, value_dict, test_frac=0.5):
    print('============= SPN Hyperparameter Optimization ================')
    error_types = ['AE', 'MAE', 'MRE']
    rows = []
    train, test = train_test_split(df, test_size=test_frac, random_state=100)
    dataset_name = 'UCI_half'
    np.random.seed(5)
    from spn.structure.Base import get_nodes_by_type, Node

    # grid search over rdc_threshold and min_instances_slice
    # rdc, mis = [0.1, 0.2, 0.3], [0.001, 0.01, 0.1]
    rdc, mis = np.linspace(0., 0.7, 5), np.linspace(0., 0.5, 5)
    for rdc_threshold in rdc:
        for min_instances_slice in mis:
            # alternative: random search
            # for i in range(100):
            #     rdc_threshold, min_instances_slice = np.random.uniform(0., 0.7), np.random.uniform(0., 0.5)
            row = {'rdc_threshold': rdc_threshold, 'min_instances_slice': min_instances_slice}
            # the 'True or' short-circuits the cache check, so the SPN is always rebuilt
            if True or not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
                print("======================== Creating SPN ... ===============")
                parametric_types = [Categorical for _ in train.columns]
                # Creates the SPN and saves it to a file
                # (only_n_rows and min_sup below are module-level globals from the main script)
                spn_handler.create_parametric_spns(
                    train.values, parametric_types, dataset_name,
                    value_dict=value_dict,
                    rdc_thresholds=[rdc_threshold],
                    min_instances_slices=[min_instances_slice],
                    silence_warnings=True,
                    nrows=only_n_rows,
                )
            spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)
            row['num_nodes'] = len(get_nodes_by_type(spn, Node))
            error_values = get_error_totals(
                calc_itemsets_df(train, spn, min_sup, test=test, value_dict=value_dict),
                min_sup, errors=error_types)
            for e_name, e_val in zip(error_types, error_values):
                row[e_name] = e_val
            rows.append(row)
    spn_hyperparam_results = pd.DataFrame(rows)
    spn_hyperparam_results.sort_values(by=['rdc_threshold', 'min_instances_slice'], inplace=True)
    return spn_hyperparam_results
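# Hedged usage sketch, not part of the original script: one way spn_hyperparam_opt
# might be driven. The dataset name 'UCI' and the output file name are assumptions;
# the call relies on the module-level globals (min_sup, only_n_rows) defined in the
# main script section below.
def run_spn_hyperparam_opt_example():
    df, value_dict, _ = real_data.get_real_data('UCI', only_n_rows=only_n_rows)
    results = spn_hyperparam_opt(df, value_dict, test_frac=0.5)
    io.print_pretty_table(results)  # error totals per (rdc_threshold, min_instances_slice) pair
    results.to_csv('spn_hyperparam_opt_results.csv')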
def explore_2():
    dataset_name = "rki_ed_2"
    rdc_threshold = 0.3
    min_instances_slice = 0.01
    if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
        df, value_dict, parametric_types = ed_data.get_rki_ed_2()
        spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name,
                                           [rdc_threshold], [min_instances_slice], value_dict)
    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)
    fn.print_statistics(spn)
    vz.visualize_overall_distribution(spn, value_dict)
def explore_3():
    dataset_name = "rki_ed_3"
    rdc_threshold = 0.3
    min_instances_slice = 0.01
    if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
        df, value_dict, parametric_types = ed_data.get_rki_ed_3()
        spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name,
                                           [rdc_threshold], [min_instances_slice], value_dict)
    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)
    fn.print_statistics(spn)
    print(value_dict)

    p = io.get_path("_results/ed_data_explore")
    vz.visualize_likeliness_heatmap(spn, target_id_x=0, target_id_y=1, value_dict=value_dict,
                                    save_path=p + dataset_name + "_hour_dep.pdf")
    vz.visualize_likeliness_heatmap(spn, target_id_x=0, target_id_y=5, value_dict=value_dict,
                                    save_path=p + dataset_name + "_hour_day.pdf")
    vz.visualize_likeliness_heatmap(spn, target_id_x=0, target_id_y=6, value_dict=value_dict,
                                    save_path=p + dataset_name + "_hour_month.pdf")
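# Minimal driver sketch (not in the original file): run the three exploration
# routines above in sequence when this module is executed as a script.
if __name__ == '__main__':
    explore_1()
    explore_2()
    explore_3()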
def cross_eval(transactional_df, dataset_name, min_sup_steps, value_dict,
               recalc_spn=False, rdc_threshold=0.1, min_instances_slice=0.05):
    print('================= Cross Eval =====================')
    # 1: apriori on train   2: apriori on test   3: SPN trained on train
    # Compared: 1v1 (= 0), 3v1 (generalization error), 1v2 (GT difference between train/test),
    #           3v2 (does the SPN generalize apriori? compare with 1v2)
    train, test = train_test_split(transactional_df, test_size=0.5,
                                   random_state=100)  # fixed random_state for reproducibility
    if recalc_spn or not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
        print("======================== Creating SPN ... ===============")
        parametric_types = [Categorical for _ in train.columns]
        spn_handler.create_parametric_spns(train.values, parametric_types, dataset_name,
                                           value_dict=value_dict,
                                           rdc_thresholds=[rdc_threshold],
                                           min_instances_slices=[min_instances_slice])
    spn_train, _, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)
    print('Num. nodes: {}'.format(fn.get_num_nodes(spn_train)))

    rows = []
    error_names = ['AE', 'MAE', 'MRE', 'Missing Sets', 'Excess Sets', 'Number of Sets']
    for min_sup_eval in min_sup_steps:
        # one_v_one = get_error_totals(calc_itemsets_df(train, spn_train, min_sup_eval,
        #                                               GT_use='apriori', PRED_use='apriori'),
        #                              min_sup=min_sup_eval)
        one_v_two = calc_itemsets_df(train, spn_train, min_sup_eval, test=test,
                                     test_use='apriori', train_use='apriori')
        three_v_one = calc_itemsets_df(train, spn_train, min_sup_eval, value_dict=value_dict)
        three_v_two = calc_itemsets_df(train, spn_train, min_sup_eval, test, value_dict=value_dict)
        if min_sup_eval == min(min_sup_steps):
            # scatter plots for train_vs_test, spn_vs_train and spn_vs_test
            scatter_plots(one_v_two, 'train_vs_test.pdf',
                          reg_line=False, dataset_name=dataset_name)
            scatter_plots(three_v_one, 'rdc={}_mis={}_GT=train.pdf'.format(rdc_threshold, min_instances_slice),
                          reg_line=False, dataset_name=dataset_name)
            scatter_plots(three_v_two, 'rdc={}_mis={}_GT=test.pdf'.format(rdc_threshold, min_instances_slice),
                          reg_line=False, dataset_name=dataset_name)
        results = {
            ind: get_error_totals(df, min_sup_eval, error_names)
            for ind, df in {'train_vs_test': one_v_two,
                            'spn_vs_train': three_v_one,
                            'spn_vs_test': three_v_two}.items()
        }
        for ind, errors in results.items():
            d = dict(zip(error_names, errors))
            d.update({'compare': ind, 'min_sup': min_sup_eval})
            rows.append(d)
    evals = pd.DataFrame(data=rows).set_index(['min_sup', 'compare'])
    return evals
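# Hedged usage sketch (an assumption, not original code): cross-evaluating an SPN
# trained on half of 'adult_one_hot' against apriori itemsets on the held-out half,
# over a small grid of minimum-support thresholds. The dataset name suffix '_half'
# and the min_sup grid are illustrative choices.
def run_cross_eval_example():
    df, value_dict, _ = real_data.get_real_data('adult_one_hot', only_n_rows=None)
    evals = cross_eval(df, 'adult_one_hot_half', min_sup_steps=[0.01, 0.05, 0.1, 0.2],
                       value_dict=value_dict, recalc_spn=True,
                       rdc_threshold=0.1, min_instances_slice=0.05)
    io.print_pretty_table(evals.reset_index())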
dataset_name = 'adult_one_hot'
rdc_threshold, min_instances_slice = 0.1, 0.05
min_sup = 0.1
recalc_spn = True
only_n_rows = None

df, value_dict, parametric_types = real_data.get_real_data(dataset_name, only_n_rows=only_n_rows)

# SPN generation
if recalc_spn or not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
    print("======================== Creating SPN ... ===============")
    parametric_types = [Categorical for _ in df.columns]
    # Creates the SPN and saves it to a file
    spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name,
                                       value_dict=value_dict,
                                       rdc_thresholds=[rdc_threshold],
                                       min_instances_slices=[min_instances_slice],
                                       silence_warnings=True,
                                       nrows=only_n_rows)

# Load SPN
spn, value_dict, parametric_types = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)

all_itemsets = calc_itemsets_df(df, spn, min_sup, value_dict=value_dict)

print('================ Calculating Rules and Metrics ===============')
spn_apriori_df = all_itemsets.reset_index()[['itemsets', 'support_pred']].rename(
    columns={'support_pred': 'support'})
normal_apriori_df = all_itemsets.reset_index()[['itemsets', 'support']]
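# Plausible follow-up sketch (an assumption, not the original metric code): both
# DataFrames above share the itemsets/support layout, so the SPN-estimated and
# apriori supports can be aligned per itemset to inspect the largest deviations.
support_cmp = spn_apriori_df.merge(normal_apriori_df, on='itemsets',
                                   suffixes=('_spn', '_apriori'))
support_cmp['abs_diff'] = (support_cmp['support_spn'] - support_cmp['support_apriori']).abs()
print(support_cmp.sort_values('abs_diff', ascending=False).head(10))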
# correlation matrix of the raw data
df = pd.DataFrame(data, columns=[value_dict[i][1] for i in range(num_vars)])
print(df.corr())

# parameters for the construction
rdc_threshold = 0.1
min_instances_slice = 0.1

if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
    print("Creating SPN ...")
    # get data
    # df, value_dict, parametric_types = real_data.get_titanic()
    # with save=False the SPN is only kept in memory, not written to disk
    spn, value_dict, _ = spn_handler.create_parametric_spns(
        data, data_types, dataset_name, [rdc_threshold], [min_instances_slice],
        value_dict, save=False)

# # Load SPN
# spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)

# Print some statistics
fn.print_statistics(spn)

visualize_expected_sub_populations(spn, value_dict, 10)
visualize_sub_populations(spn, value_dict, 10)

subpops = fn.get_sub_populations(spn)
print(subpops)
print('============')
pprint(subpops)

fn.plot_spn(spn, "icecream_spn.pdf", value_dict)
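# Small sketch (not in the original script): the correlation matrix is only printed
# above; matplotlib can also render it as a heatmap. The output file name is a
# placeholder chosen to match the "icecream" naming used above.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
cax = ax.matshow(df.corr(), vmin=-1, vmax=1, cmap='coolwarm')
fig.colorbar(cax)
ax.set_xticks(range(len(df.columns)))
ax.set_yticks(range(len(df.columns)))
ax.set_xticklabels(df.columns, rotation=90)
ax.set_yticklabels(df.columns)
fig.tight_layout()
fig.savefig("icecream_corr_matrix.pdf")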
if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
    print("Creating SPN ...")

    # get data
    df, value_dict, parametric_types = real_data.get_titanic()

    # print data (top 5 rows)
    io.print_pretty_table(df.head(5))

    # print value-dict
    print(value_dict)

    # Creates the SPN and saves it to a file
    spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name)

# Load SPN
spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)

# Print some statistics
fn.print_statistics(spn)

# Example value dict generation
path = os.path.dirname(os.path.realpath(__file__)) + "/../../_data/titanic/train.csv"
df = pd.read_csv(path)
# print data (top 5 rows)