def explore_1():
    dataset_name = "rki_ed_1"
    rdc_threshold = 0.3
    min_instances_slice = 0.01

    # Learn and persist the SPN only if none exists yet for this parameter combination.
    if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
        df, value_dict, parametric_types = ed_data.get_rki_ed_1()
        spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name, [rdc_threshold], [min_instances_slice], value_dict)
    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)
    
    # Marginalize the SPN to the variables of interest (keep feature indices 0, 2, 3, 4, 5).
    spn = fn.marg(spn, keep=[0,2,3,4,5])
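    # (fn.marg presumably wraps SPFlow's marginalization, e.g.
    # spn.algorithms.Marginalization.marginalize(spn, keep=[...]), summing out
    # every variable whose index is not listed in keep.)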
    fn.print_statistics(spn)
    
    p = io.get_path("_results/ed_data_explore")
    
    #vz.visualize_overall_distribution(spn, value_dict)
    
    from spn.experiments.AQP.Ranges import NominalRange
    
    # Feature 0 appears to encode the day of week: values 5,6 = weekend, 0-4 = weekdays.
    target_conds = [{0 : NominalRange([5,6])}, {0 : NominalRange([0,1,2,3,4])}]
    #target_conds = [{0 : NominalRange([5,6]), 1 : NominalRange([0,1,2,3,4,5,6,7,8,9,10,11])}, {0 : NominalRange([0,1,2,3,4]), 1 : NominalRange([0,1,2,3,4,5,6,7,8,9,10,11])}]

    # German plot labels: "Wochenende" = weekend, "Unter der Woche" = weekdays.
    vz.visualize_target_based_conds_overall_distribution_compact(spn, target_conds, value_dict, target_names=["Wochenende", "Unter der Woche"], save_path=p+dataset_name+"_weekend_measures.pdf")
def test_rule_clustering():  # TODO: spflow automatically summarizes chains of sums
    dataset_name = 'gender'
    recalc_SPN = True
    rdc_threshold, min_instances_slice = 0.2, 0.1

    if not spn_handler.exist_spn(dataset_name, rdc_threshold,
                                 min_instances_slice) or recalc_SPN:
        print("Creating SPN ...")

        # get data
        df, value_dict, parametric_types = synthetic_data.get_synthetic_data(
            dataset_name)

        # Create the SPN and save it to a file
        spn_handler.create_parametric_spns(df.values,
                                           parametric_types,
                                           dataset_name, [rdc_threshold],
                                           [min_instances_slice],
                                           clustering='rule_clustering')
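        # Note: the clustering keyword presumably selects the row-splitting strategy
        # used during structure learning (rule-based clustering instead of a default
        # such as KMeans); this is an assumption about spn_handler's API.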

    # Load SPN
    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold,
                                              min_instances_slice)
    fn.print_statistics(spn)

def explore_2():
    dataset_name = "rki_ed_2"
    rdc_threshold = 0.3
    min_instances_slice = 0.01

    # Learn and persist the SPN only if none exists yet for this parameter combination.
    if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
        df, value_dict, parametric_types = ed_data.get_rki_ed_2()
        spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name, [rdc_threshold], [min_instances_slice], value_dict)
    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)
    
    fn.print_statistics(spn)
    
    
    vz.visualize_overall_distribution(spn, value_dict)
def explore_3():
    dataset_name = "rki_ed_3"
    rdc_threshold = 0.3
    min_instances_slice = 0.01

    # Learn and persist the SPN only if none exists yet for this parameter combination.
    if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
        df, value_dict, parametric_types = ed_data.get_rki_ed_3()
        spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name, [rdc_threshold], [min_instances_slice], value_dict)
    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)
    
    fn.print_statistics(spn)
    print(value_dict)
    
    p = io.get_path("_results/ed_data_explore")
    
    # Pairwise likeliness heatmaps against the hour feature (index 0); the other
    # indices appear to be department, weekday and month, judging by the file names.
    vz.visualize_likeliness_heatmap(spn, target_id_x=0, target_id_y=1, value_dict=value_dict, save_path=p+dataset_name+"_hour_dep.pdf")
    vz.visualize_likeliness_heatmap(spn, target_id_x=0, target_id_y=5, value_dict=value_dict, save_path=p+dataset_name+"_hour_day.pdf")
    vz.visualize_likeliness_heatmap(spn, target_id_x=0, target_id_y=6, value_dict=value_dict, save_path=p+dataset_name+"_hour_month.pdf")

# NOTE: the remainder of this example is a fragment of a separate rule-comparison
# function; its original definition and the code that builds intra_df, topdown_rules,
# onehot_df, vd_onehot, df, beta and res_path are missing from this snippet.
def _rule_comparison_fragment():  # placeholder name, not from the original snippet
    print('Data not compatible for topdown')  # tail of a compatibility check whose condition is missing
    # labeled = rule_ex.df2labeled(intra_df, value_dict)

    # Compare only as many IntraNode rules as there are top-down rules.
    rules_intra = intra_df.head(len(topdown_rules))
    metrics = ['sup', 'conf', 'conviction', 'F']

    # Score both rule sets with the same metrics on their respective data representations.
    eval_intra = evaluate_rules(df,
                                rules_intra,
                                value_dict,
                                metrics=metrics,
                                beta=beta)
    eval_top = evaluate_rules(onehot_df,
                              topdown_rules,
                              vd_onehot,
                              metrics=metrics,
                              beta=beta)
    eval_intra['method'] = 'IntraNode'
    eval_top['method'] = 'Topdown'
    comp = pd.concat([eval_intra, eval_top])
    # Drop rules found by both methods (keeps the first, i.e. IntraNode, occurrence) and rank by F score.
    comp.drop_duplicates(['head', 'body'], inplace=True)
    comp = comp.sort_values('F', ascending=False)

    comp.to_csv(res_path + 'comparison_{}.csv'.format(dataset_name))

    # Mean metric values (incl. F) per method over the N rules compared above;
    # numeric_only skips the non-numeric head/body/method columns in newer pandas.
    print(comp.groupby('method').mean(numeric_only=True))

    hyperparam_grid_search(df, spn, value_dict)

    fn.print_statistics(spn)
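

# Minimal entry point (not part of the original snippets), assuming the explore_*
# functions above live in one script with spn_handler, ed_data, fn, vz and io
# imported at module level.
if __name__ == "__main__":
    explore_1()
    explore_2()
    explore_3()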