Example #1
def test_rule_clustering():  # TODO: SPFlow automatically summarizes chains of sums
    dataset_name = 'gender'
    recalc_SPN = True
    rdc_threshold, min_instances_slice = 0.2, 0.1

    if not spn_handler.exist_spn(dataset_name, rdc_threshold,
                                 min_instances_slice) or recalc_SPN:
        print("Creating SPN ...")

        # get data
        df, value_dict, parametric_types = synthetic_data.get_synthetic_data(
            dataset_name)

        # Creates the SPN and saves to a file
        spn_handler.create_parametric_spns(df.values,
                                           parametric_types,
                                           dataset_name, [rdc_threshold],
                                           [min_instances_slice],
                                           clustering='rule_clustering')

    # Load SPN
    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold,
                                              min_instances_slice)
    fn.print_statistics(spn)

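# What spn_handler.create_parametric_spns presumably wraps: a minimal sketch
# using SPFlow's public learner directly. The exact parameter mapping (e.g.
# min_instances_slice passed as a fraction of the data) is an assumption.
import numpy as np
from spn.structure.Base import Context
from spn.structure.leaves.parametric.Parametric import Categorical
from spn.algorithms.LearningWrappers import learn_parametric

toy_data = np.random.randint(0, 2, size=(1000, 3))   # toy categorical dataset
ctx = Context(parametric_types=[Categorical] * 3).add_domains(toy_data)
toy_spn = learn_parametric(toy_data, ctx,
                           threshold=0.2,             # RDC column-split threshold
                           min_instances_slice=int(0.1 * len(toy_data)))
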
def explore_1():
    dataset_name = "rki_ed_1"
    rdc_threshold = 0.3
    min_instances_slice = 0.01
    if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
        df, value_dict, parametric_types = ed_data.get_rki_ed_1()
        spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name, [rdc_threshold], [min_instances_slice], value_dict)
    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)
    
    spn = fn.marg(spn, keep=[0, 2, 3, 4, 5])
    fn.print_statistics(spn)
    
    p = io.get_path("_results/ed_data_explore")
    
    #vz.visualize_overall_distribution(spn, value_dict)
    
    from spn.experiments.AQP.Ranges import NominalRange
    
    target_conds = [{0 : NominalRange([5,6])}, {0 : NominalRange([0,1,2,3,4])}]
    #target_conds = [{0 : NominalRange([5,6]), 1 : NominalRange([0,1,2,3,4,5,6,7,8,9,10,11])}, {0 : NominalRange([0,1,2,3,4]), 1 : NominalRange([0,1,2,3,4,5,6,7,8,9,10,11])}]
    vz.visualize_target_based_conds_overall_distribution_compact(
        spn, target_conds, value_dict,
        target_names=["Weekend", "Weekdays"],
        save_path=p + dataset_name + "_weekend_measures.pdf")
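
# fn.marg presumably delegates to SPFlow's marginalization; a minimal sketch of
# the direct call, keeping the variable ids of the fn.marg call in explore_1.
# `spn` stands for the network loaded above.
from spn.algorithms.Marginalization import marginalize

spn_marg = marginalize(spn, [0, 2, 3, 4, 5])  # keep these scopes, sum out the rest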
Example #3
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from spn.structure.leaves.parametric.Parametric import Categorical


def spn_hyperparam_opt(df, value_dict, min_sup, test_frac=0.5, only_n_rows=None):
    print('============= SPN Hyperparameter Optimization ================')
    error_types = ['AE', 'MAE', 'MRE']
    rows = []
    train, test = train_test_split(df, test_size=test_frac, random_state=100)
    dataset_name = 'UCI_half'
    np.random.seed(5)
    from spn.structure.Base import get_nodes_by_type, Node
    #rdc, mis = [0.1, 0.2, 0.3], [0.001, 0.01, 0.1]
    rdc, mis = np.linspace(0., 0.7, 5), np.linspace(0., 0.5, 5)
    for rdc_threshold in rdc:
        for min_instances_slice in mis:
            # for i in range(100):
            # rdc_threshold, min_instances_slice = np.random.uniform(0., 0.7), np.random.uniform(0., 0.5)
            row = {
                'rdc_threshold': rdc_threshold,
                'min_instances_slice': min_instances_slice
            }
            # the exist_spn cache check is bypassed on purpose so that every
            # hyperparameter combination gets a freshly trained SPN
            if True or not spn_handler.exist_spn(dataset_name, rdc_threshold,
                                                 min_instances_slice):
                print("======================== Creating SPN ... ===============")
                parametric_types = [Categorical for _ in train.columns]
                # Creates the SPN and saves to a file
                spn_handler.create_parametric_spns(
                    train.values,
                    parametric_types,
                    dataset_name,
                    value_dict=value_dict,
                    rdc_thresholds=[rdc_threshold],
                    min_instances_slices=[min_instances_slice],
                    silence_warnings=True,
                    nrows=only_n_rows,
                )
            spn, value_dict, _ = spn_handler.load_spn(dataset_name,
                                                      rdc_threshold,
                                                      min_instances_slice)
            num_nodes = len(get_nodes_by_type(spn, Node))
            row['num_nodes'] = num_nodes
            itemsets = calc_itemsets_df(train, spn, min_sup, test=test,
                                        value_dict=value_dict)
            error_values = get_error_totals(itemsets, min_sup,
                                            errors=error_types)
            for e_name, e_val in zip(error_types, error_values):
                row[e_name] = e_val
            rows.append(row)
    spn_hyperparam_results = pd.DataFrame(rows)
    spn_hyperparam_results.sort_values(
        by=['rdc_threshold', 'min_instances_slice'], inplace=True)
    return spn_hyperparam_results
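
# Hypothetical driver for the sweep above; the dataset loader and the chosen
# min_sup value are assumptions for illustration, not part of the original script.
df, value_dict, _ = real_data.get_real_data('UCI', only_n_rows=None)
results = spn_hyperparam_opt(df, value_dict, min_sup=0.1)
# lowest mean relative error first, ties broken by SPN size
print(results.sort_values(by=['MRE', 'num_nodes']).head())
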
def explore_2():
    dataset_name = "rki_ed_2"
    rdc_threshold = 0.3
    min_instances_slice = 0.01
    if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
        df, value_dict, parametric_types = ed_data.get_rki_ed_2()
        spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name, [rdc_threshold], [min_instances_slice], value_dict)
    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)
    
    fn.print_statistics(spn)

    vz.visualize_overall_distribution(spn, value_dict)
def explore_3():
    
    dataset_name = "rki_ed_3"
    rdc_threshold = 0.3
    min_instances_slice = 0.01
    if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
        df, value_dict, parametric_types = ed_data.get_rki_ed_3()
        spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name, [rdc_threshold], [min_instances_slice], value_dict)
    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)
    
    fn.print_statistics(spn)
    print(value_dict)
    
    p = io.get_path("_results/ed_data_explore")
    
    vz.visualize_likeliness_heatmap(spn, target_id_x=0, target_id_y=1, value_dict=value_dict, save_path=p+dataset_name+"_hour_dep.pdf")
    vz.visualize_likeliness_heatmap(spn, target_id_x=0, target_id_y=5, value_dict=value_dict, save_path=p+dataset_name+"_hour_day.pdf")  
    vz.visualize_likeliness_heatmap(spn, target_id_x=0, target_id_y=6, value_dict=value_dict, save_path=p+dataset_name+"_hour_month.pdf")  
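
# What a likeliness heatmap presumably computes underneath: joint probabilities
# over a grid of two variables via SPFlow's inference API. The 24x7 grid for
# hour (id 0) vs. day of week (id 5) is an assumption based on the calls above.
import numpy as np
from spn.algorithms.Inference import likelihood

num_vars = len(value_dict)                    # one column per SPN variable
grid = np.full((24 * 7, num_vars), np.nan)    # NaN = variable marginalized out
grid[:, 0] = np.repeat(np.arange(24), 7)      # hour of day
grid[:, 5] = np.tile(np.arange(7), 24)        # day of week
probs = likelihood(spn, grid).reshape(24, 7)  # P(hour, day) matrix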
Example #6
import pandas as pd
from sklearn.model_selection import train_test_split
from spn.structure.leaves.parametric.Parametric import Categorical


def cross_eval(transactional_df, dataset_name, min_sup_steps, value_dict,
               recalc_spn=False, rdc_threshold=0.1, min_instances_slice=0.05):
    print('================= Cross Eval =====================')
    # 1: apriori on train; 2: apriori on test; 3: SPN learned on train
    # calc: 1v1 (=0), 3v1 (generalization error), 1v2 (GT difference between train/test),
    # 3v2 (does the SPN generalize apriori? compare with 1v2)
    train, test = train_test_split(transactional_df, test_size=0.5,
                                   random_state=100)  # fixed seed for reproducibility
    if recalc_spn or not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
        print("======================== Creating SPN ... ===============")
        parametric_types = [Categorical for _ in train.columns]
        spn_handler.create_parametric_spns(train.values, parametric_types, dataset_name, value_dict=value_dict,
                                           rdc_thresholds=[rdc_threshold],
                                           min_instances_slices=[min_instances_slice])
    spn_train, _, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)
    print('Num. nodes: {}'.format(fn.get_num_nodes(spn_train)))

    rows, error_names = [], ['AE', 'MAE', 'MRE', 'Missing Sets', 'Excess Sets', 'Number of Sets']
    for min_sup_eval in min_sup_steps:
        # one_v_one = get_error_totals(calc_itemsets_df(train, spn_train, min_sup_eval, GT_use='apriori', PRED_use='apriori'),
        #                              min_sup=min_sup_eval)
        one_v_two = calc_itemsets_df(train, spn_train, min_sup_eval, test=test, test_use='apriori', train_use='apriori')
        three_v_one = calc_itemsets_df(train, spn_train, min_sup_eval, value_dict=value_dict)
        three_v_two = calc_itemsets_df(train, spn_train, min_sup_eval, test, value_dict=value_dict)

        if min_sup_eval == min(min_sup_steps):
            # do scatter plots for spn_vs_train and spn_vs_test
            scatter_plots(one_v_two, 'train_vs_test.pdf', reg_line=False, dataset_name=dataset_name)
            scatter_plots(three_v_one, 'rdc={}_mis={}_GT=train.pdf'.format(rdc_threshold,min_instances_slice), reg_line=False, dataset_name=dataset_name)
            scatter_plots(three_v_two, 'rdc={}_mis={}_GT=test.pdf'.format(rdc_threshold,min_instances_slice), reg_line=False, dataset_name=dataset_name)

        results = {
            ind: get_error_totals(df, min_sup_eval, error_names) for ind, df in
            {'train_vs_test': one_v_two, 'spn_vs_train': three_v_one, 'spn_vs_test': three_v_two}.items()
        }
        for ind, errors in results.items():
            d = dict(zip(error_names, errors))
            d.update({'compare': ind,  'min_sup': min_sup_eval,})
            rows.append(d)

    evals = pd.DataFrame(data=rows).set_index(['min_sup', 'compare'])
    return evals
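
# Hypothetical invocation of cross_eval; the dataset loader and the support
# grid are assumptions for illustration.
df, value_dict, _ = real_data.get_real_data('adult_one_hot')
evals = cross_eval(df, 'adult_one_hot', min_sup_steps=[0.01, 0.05, 0.1],
                   value_dict=value_dict, recalc_spn=True)
print(evals.loc[(0.01, 'spn_vs_test')])  # SPN errors against held-out data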
Example #7
dataset_name = 'adult_one_hot'
rdc_threshold, min_instances_slice = 0.1, 0.05
min_sup = 0.1
recalc_spn = True
only_n_rows = None
df, value_dict, parametric_types = real_data.get_real_data(dataset_name, only_n_rows=only_n_rows)


# SPN generation
if recalc_spn or not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
    print("======================== Creating SPN ... ===============")
    parametric_types = [Categorical for _ in df.columns]
    # Creates the SPN and saves to a file
    spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name, value_dict=value_dict,
                                       rdc_thresholds=[rdc_threshold],
                                       min_instances_slices=[min_instances_slice],
                                       silence_warnings=True,
                                       nrows=only_n_rows)

# Load SPN
spn, value_dict, parametric_types = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)


all_itemsets = calc_itemsets_df(df, spn, min_sup, value_dict=value_dict)

print('================ Calculating Rules and Metrics ===============')
spn_apriori_df = all_itemsets.reset_index()[['itemsets', 'support_pred']].rename(columns={'support_pred': 'support'})
normal_apriori_df = all_itemsets.reset_index()[['itemsets', 'support']]
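
# A minimal sketch of the rule-mining step announced above, assuming the
# 'itemsets' column holds frozensets as mlxtend expects; the confidence
# threshold is illustrative only.
from mlxtend.frequent_patterns import association_rules

spn_rules = association_rules(spn_apriori_df, metric='confidence', min_threshold=0.8)
normal_rules = association_rules(normal_apriori_df, metric='confidence', min_threshold=0.8)
print(spn_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head())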
Example #8
from pprint import pprint  # used below to pretty-print the sub-populations

# correlation matrix of the raw data (plotted in the sketch below)
df = pd.DataFrame(data, columns=[value_dict[i][1] for i in range(num_vars)])
print(df.corr())
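
# A minimal matplotlib sketch (assumed, not part of the original project code)
# for the correlation-matrix plot the comment above refers to.
import matplotlib.pyplot as plt

corr = df.corr()
fig, ax = plt.subplots()
im = ax.imshow(corr.values, cmap='coolwarm', vmin=-1, vmax=1)
ax.set_xticks(range(len(corr)))
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticks(range(len(corr)))
ax.set_yticklabels(corr.columns)
fig.colorbar(im, ax=ax)
plt.tight_layout()
plt.show()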

# parameters for the construction
rdc_threshold = 0.1
min_instances_slice = 0.1
if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
    print("Creating SPN ...")

    # get data
    # df, value_dict, parametric_types = real_data.get_titanic()

    spn, value_dict, _ = spn_handler.create_parametric_spns(
        data,
        data_types,
        dataset_name, [rdc_threshold], [min_instances_slice],
        value_dict,
        save=False)
else:
    # load the previously saved SPN so `spn` is defined on both branches
    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold,
                                              min_instances_slice)
# Print some statistics
fn.print_statistics(spn)
visualize_expected_sub_populations(spn, value_dict, 10)
visualize_sub_populations(spn, value_dict, 10)
subpops = fn.get_sub_populations(spn)

print(subpops)
print('============')
pprint(subpops)
fn.plot_spn(spn, "icecream_spn.pdf", value_dict)
Example #9
    if not spn_handler.exist_spn(dataset_name, rdc_threshold,
                                 min_instances_slice):
        print("Creating SPN ...")

        # get data
        df, value_dict, parametric_types = real_data.get_titanic()

        # print data (top 5 rows)
        io.print_pretty_table(df.head(5))

        # print value-dict
        print(value_dict)

        # Creates the SPN and saves to a file
        spn_handler.create_parametric_spns(df.values, parametric_types,
                                           dataset_name)

    # Load SPN
    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold,
                                              min_instances_slice)

    # Print some statistics
    fn.print_statistics(spn)

    # Example value dict generation
    path = os.path.dirname(
        os.path.realpath(__file__)) + "/../../_data/titanic/train.csv"
    df = pd.read_csv(path)

    # print data (top 5 rows)