Exemplo n.º 1
0
# Script-level settings; the code that consumes them is outside this excerpt.
beta = 0.2
# Names of the metrics handled by the rest of the script.
metrics = [
    'sup',
    'conf',
    'lift',
    'F',
    'leverage',
    'recall',
    'interestingness',
    'PiSh',
    'jaccard',
    'cosine_distance',
]

# Only the 'lending' data set gets a row limit here; for any other data set
# n_rows must already have been set elsewhere.
if dataset_name == 'lending':
    n_rows = 9000

def load_pickle(p):
    """Return the object stored in the pickle file at path ``p``.

    The file is opened in binary mode and closed again before returning.
    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(p, 'rb') as source:
        loaded = pickle.load(source)
    return loaded

def dump_pickle(obj, p):
    """Pickle ``obj`` into the file at path ``p``.

    The file is opened in binary write mode, so an existing file at ``p`` is
    overwritten. Nothing is returned.
    """
    with open(p, 'wb') as sink:
        pickle.dump(obj, sink)

# Load the data set without one-hot encoding, then load (or build) its SPN.
df, value_dict, parametric_types = real_data.get_real_data(
    dataset_name,
    only_n_rows=n_rows,
    seed=1,
    onehot=False,
)
spn = spn_handler.load_or_create_spn(
    df,
    value_dict,
    parametric_types,
    dataset_name,
    rdc_threshold,
    min_instances_slice,
    nrows=n_rows,
    seed=1,
    force_create=recalc_SPN,
    clustering='km_rule_clustering',
)

# Same data set requested with onehot=True; its SPN is registered under the
# data set name plus a '_one_hot' suffix and uses a different clustering mode.
onehot_df, vd_onehot, pt_onehot = real_data.get_real_data(
    dataset_name,
    only_n_rows=n_rows,
    seed=1,
    onehot=True,
)
spn_one_hot = spn_handler.load_or_create_spn(
    onehot_df,
    vd_onehot,
    pt_onehot,
    dataset_name + '_one_hot',
    rdc_threshold,
    min_instances_slice,
    nrows=n_rows,
    seed=1,
    force_create=recalc_SPN,
    clustering='rule_clustering',
)

# Disabled alternative: hard-coded targets for the lending data set.
# if dataset_name == 'lending':
#     targts = [0, 7]
# else:
# Target columns are the positional indices of every column of df that has
# fewer than 4 distinct values. nunique() returns one count per column in
# column order, so it is computed once and enumerated instead of being
# evaluated twice and searched with list.index for every match.
# Assumes column labels are unique, as the label-based lookup it replaces did.
distinct_counts = df.nunique()
targts = [pos for pos, n_distinct in enumerate(distinct_counts) if n_distinct < 4]
# else:
#     for t in df.nunique()[df.nunique() < 8].index:
Exemplo n.º 2
0
import pandas as pd
import numpy as np

from data import real_data
from data import synthetic_data
from simple_spn import spn_handler
from spn.structure.leaves.parametric.Parametric import Categorical
from spn_apriori.itemsets_utils import simple_interpretable_rules, _get_interpretable_best_lift_rules, calc_itemsets_df
import simple_spn.functions as fn

# Experiment configuration.
dataset_name = 'adult_one_hot'
rdc_threshold = 0.1
min_instances_slice = 0.05
min_sup = 0.1
recalc_spn = True
only_n_rows = None
df, value_dict, parametric_types = real_data.get_real_data(dataset_name, only_n_rows=only_n_rows)


# SPN generation: build when forced, or when no SPN exists for these settings.
# exist_spn is only consulted when recalc_spn is false.
if recalc_spn or not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
    print("======================== Creating SPN ... ===============")
    # Every column is treated as Categorical, replacing the types loaded above.
    parametric_types = [Categorical] * len(df.columns)
    # Creates the SPN and saves to a file
    spn_handler.create_parametric_spns(
        df.values,
        parametric_types,
        dataset_name,
        value_dict=value_dict,
        rdc_thresholds=[rdc_threshold],
        min_instances_slices=[min_instances_slice],
        silence_warnings=True,
        nrows=only_n_rows,
    )


Exemplo n.º 3
0
def test_get_lending():
    """Load 'lending' (only_n_rows=10000, seed=5) and check value_dict has one entry per column."""
    frame, values, _param_types = real_data.get_real_data('lending', only_n_rows=10000, seed=5)
    preview = frame.head(10)
    io.print_pretty_table(preview)
    n_columns = len(frame.columns)
    assert n_columns == len(values)
Exemplo n.º 4
0
def test_get_Ecommerce():
    """Load 'Ecommerce' with default options and check value_dict has one entry per column."""
    frame, values, _param_types = real_data.get_real_data('Ecommerce')
    preview = frame.head(10)
    io.print_pretty_table(preview)
    n_columns = len(frame.columns)
    assert n_columns == len(values)
Exemplo n.º 5
0
def test_get_OnlineRetail():
    """Load 'OnlineRetail' with default options and print its first ten rows."""
    frame, _values, _param_types = real_data.get_real_data('OnlineRetail')
    preview = frame.head(10)
    io.print_pretty_table(preview)
import os

import pandas as pd

from data import real_data, synthetic_data
from simple_spn import spn_handler
from spn_apriori.itemsets_utils import cross_eval
from spn.structure.leaves.parametric.Parametric import Categorical

# Cross-evaluation configuration.
dataset_name = 'adult_one_hot'
recalc_spn = False


# Value grids: every (rdc, mis) pair is evaluated, and min_sup_range is handed
# to cross_eval as a whole for each pair.
rdc_range = [0.1, 0.2, 0.3]
mis_range = [0.1, 0.01, 0.001]
min_sup_range = [0.01, 0.03, 0.1, 0.3]

transactional_df, value_dict, parametric_types = real_data.get_real_data(dataset_name)
# transactional_df, value_dict, parametric_types = synthetic_data.get_synthetic_data(dataset_name)


# eval different hyper params
cross_eval_hyperparams = []
for rdc in rdc_range:
    for mis in mis_range:
        print('SPN Params: rdc {}, mis {}'.format(rdc, mis))
        eval_spn_params = cross_eval(transactional_df, dataset_name, min_sup_range, value_dict,
                                     recalc_spn=recalc_spn, min_instances_slice=mis, rdc_threshold=rdc)
        # Turn the index levels into ordinary columns so they can be part of
        # the combined index built after the loop. The non-inplace form leaves
        # the frame returned by cross_eval untouched.
        eval_spn_params = eval_spn_params.reset_index()
        # Tag every row with its (rdc, mis) pair. The tuple is repeated once per
        # row because assigning a single list/tuple directly doesn't work.
        eval_spn_params['SPN Params'] = [(rdc, mis)] * len(eval_spn_params)
        cross_eval_hyperparams.append(eval_spn_params)
# Stack all per-setting results into one frame indexed by setting, min_sup and
# comparison. `pd` comes from `import pandas as pd` in the file's import block;
# without it this line fails with NameError after all evaluations have run.
cross_eval_hyperparams = pd.concat(cross_eval_hyperparams, ignore_index=True).set_index(
    ['SPN Params', 'min_sup', 'compare'])