import os

import numpy as np
import pandas as pd

# NOTE: the project-internal module paths below are assumptions inferred from the
# local imports used elsewhere in this file (e.g. "from util import io");
# adjust them to the actual project layout.
from util import io
from data import real_data
from simple_spn import functions as fn
from simple_spn import spn_handler


def test_get_titanic():
    df, value_dict, param_types = real_data.get_titanic()
    io.print_pretty_table(df.head(10))
    print(value_dict)
    print(param_types)
def __print_items(freq_candidates):
    freq_sets = []
    for (sup, conds) in freq_candidates:
        str_conds = []
        for cond in conds:
            str_conds.append(cond[0] + "=" + cond[1])
        freq_sets.append(["(" + ", ".join(str_conds) + ")", sup])
    #freq_sets = sorted(freq_sets, key=lambda x: x[1], reverse=True)
    candidate_df = pd.DataFrame(freq_sets, columns=["frequent set", "s_support"])
    io.print_pretty_table(candidate_df)
def test_value_dict():
    import os
    import pandas as pd
    from util import io

    path = os.path.dirname(os.path.realpath(__file__)) + "/../../_data/titanic/train.csv"
    df = pd.read_csv(path)
    df = df[["Survived", "Sex", "Age", "Fare", "Pclass"]]
    df, val_dict, param_types = fn.transform_dataset(df)
    io.print_pretty_table(df)
    print(val_dict)
    print(param_types)
def _print_items(freq_items):
    # Maps each feature id to a short name and the string labels of its values.
    feature_dict = {0: ("g", ("m ", "w ")),
                    1: ("c", ("no ", "yes")),
                    2: ("s", ("no ", "yes")),
                    3: ("w", ("no ", "yes"))}
    freq_sets = []
    for (sup, conds) in freq_items:
        str_conds = []
        for cond in conds:
            str_conds.append(feature_dict[cond[0]][0] + "=" + feature_dict[cond[0]][1][cond[1]])
        freq_sets.append(["(" + ", ".join(str_conds) + ")", sup])
    #freq_sets = sorted(freq_sets, key=lambda x: x[1], reverse=True)
    rule_df = pd.DataFrame(freq_sets, columns=["frequent set", "s_support"])
    io.print_pretty_table(rule_df)
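# Hedged usage sketch for _print_items (the values are illustrative, not real
# results): freq_items is a list of (support, conditions) tuples, where each
# condition is a (feature_id, value_id) pair indexing into feature_dict above.
def _example_print_items():
    example_freq_items = [
        (0.42, [(0, 1)]),          # prints as "(g=w )"
        (0.17, [(0, 0), (2, 1)]),  # prints as "(g=m , s=yes)"
    ]
    _print_items(example_freq_items)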
def get_rki_export():
    path = os.path.dirname(os.path.realpath(__file__)) + "/../../_data/ed/rki_pre_process/epias_of_rki.2018-11.350000.json"
    df = pd.read_json(path)
    cols_circumstances = ["aufnahmezeitpunkt_datum", "aufnahmezeitpunkt_stunde", "behandlung_fachabteilung",
                          "id_einrichtung", "zuweisungsart"]
    cols_patient = ["geschlecht", "altersklasse", "plzbereich"]
    cols_pain = ["schmerz", "diagnosen", "leitsymptom", "leitsymptom_gruppe", "tetanus", "triagesystem"]
    cols_measures = ["untersuchung_bga", "untersuchung_echokardiographie", "untersuchung_ekg",
                     "untersuchung_roentgen_thorax", "vitalwerte"]
    io.print_pretty_table(df[cols_circumstances].head(10))
    io.print_pretty_table(df[cols_patient].head(10))
    io.print_pretty_table(df[cols_pain].head(10))
    io.print_pretty_table(df[cols_measures].head(10))
def get_eseg_export():
    path = os.path.dirname(os.path.realpath(__file__)) + "/../../_data/ed/eseg_export_sample/example2019-09.json"
    df = pd.read_json(path)
    cols_circumstances = ["sys_date", "sys_hour", "sys_department", "sys_transport",
                          "sys_disposition", "sys_ed", "sys_lab", "sys_labinfections"]
    cols_patient = ["sys_gender", "sys_age16", "sys_plz3"]
    cols_pain = ["sys_complaint", "sys_diagnosis_icd4", "sys_triage", "sys_isolation",
                 "sys_ecg", "sys_echo", "sys_xraythorax"]
    cols_measures = ["sys_heartrate", "sys_temperature", "sys_respiratoryrate", "sys_bloodpressuresystolic"]
    io.print_pretty_table(df[cols_circumstances].head(10))
    io.print_pretty_table(df[cols_patient].head(10))
    io.print_pretty_table(df[cols_pain].head(10))
    io.print_pretty_table(df[cols_measures].head(10))
def create_p_value_dataset():
    from data import R_wrapper_data
    from evaluation import evaluator
    from methods import R_wrapper
    from ml import dataset_creator, pre_processing
    from util import io

    tss = R_wrapper_data.get_noufaily_configuration(25, num_ts=10, num_weeks=624, k=5, random_seed=100)

    baseline = 7  # init: 7 time points
    method_descriptions1 = [
        {"method": R_wrapper.get_EARS_score, "parameters": {"method": "C1", "baseline": baseline, "alpha": 0.005}},
        {"method": R_wrapper.get_EARS_score, "parameters": {"method": "C2", "baseline": baseline, "alpha": 0.005}},
        {"method": R_wrapper.get_EARS_score, "parameters": {"method": "C3", "baseline": baseline, "alpha": 0.005}},
        #{"method": R_wrapper.get_EARS_score, "parameters": {"method": "C4", "baseline": 6, "alpha": 0.05}},
        {"method": R_wrapper.get_Bayes_score, "parameters": {"b": 0, "w": baseline, "actY": True, "alpha": 0.005}},
        {"method": R_wrapper.get_RKI_score, "parameters": {"b": 0, "w": baseline, "actY": True}},
    ]
    method_results = evaluator.evaluate_method_results(tss, method_descriptions1)

    # Build one p-value dataset per time series and concatenate them.
    ds = None
    for ts in tss:
        new_ds = dataset_creator.pValue_dataset(ts, method_results, pre_process=pre_processing.peak)
        if ds is None:
            ds = new_ds
        else:
            ds.df = pd.concat([ds.df, new_ds.df])  # DataFrame.append is removed in pandas 2.x

    ds.df.drop(["ground_truth"], inplace=True, axis=1)
    ds.df.replace({"target": "False"}, {"target": 0}, inplace=True)
    ds.df.replace({"target": "True"}, {"target": 1}, inplace=True)
    io.print_pretty_table(ds.df.head(100))

    from spn.structure.leaves.parametric.Parametric import Categorical, Gaussian
    return ds.df.values, [Gaussian, Gaussian, Gaussian, Gaussian, Gaussian, Categorical]
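# A minimal follow-up sketch, assuming SPFlow's standard learning wrapper: turn the
# data matrix and parametric types returned by create_p_value_dataset into an SPN.
# The project's own spn_handler.create_parametric_spns presumably wraps a similar
# call; min_instances_slice below is illustrative and should be tuned.
def _example_learn_p_value_spn():
    from spn.structure.Base import Context
    from spn.algorithms.LearningWrappers import learn_parametric

    data, parametric_types = create_p_value_dataset()
    context = Context(parametric_types=parametric_types).add_domains(data)
    spn = learn_parametric(data, context, min_instances_slice=200)
    return spn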
def extract_rules(spn, feature_id=1):
    from spn.experiments.AQP.Ranges import NominalRange
    from spn.algorithms import Inference
    from simple_spn.internal.InferenceRange import categorical_likelihood_range
    from spn.structure.Base import Sum, Product
    from spn.algorithms.Inference import sum_likelihood, prod_likelihood
    from spn.structure.leaves.parametric.Parametric import Categorical

    inference_support_ranges = {Categorical: categorical_likelihood_range,
                                Sum: sum_likelihood,
                                Product: prod_likelihood}

    # get_frequent_items is defined elsewhere in this module/project.
    freq_items = get_frequent_items(spn, min_support=0.0)
    freq_items_filtered = freq_items  # filter(lambda x: any(cond[0] == feature_id for cond in x[1]), freq_items)
    freq_items_sorted = sorted(freq_items_filtered, key=lambda x: x[0], reverse=True)

    feature_dict = {0: ("g", ("m ", "w ")),
                    1: ("c", ("no ", "yes")),
                    2: ("s", ("no ", "yes")),
                    3: ("w", ("no ", "yes"))}

    # Compare the counted support (s_support) with the support computed by the SPN (g_support).
    freq_sets = []
    for (sup, conds) in freq_items_sorted:
        str_conds = []
        ranges = [None] * len(spn.scope)
        for cond in conds:
            ranges[cond[0]] = NominalRange([cond[1]])
            str_conds.append(feature_dict[cond[0]][0] + "=" + feature_dict[cond[0]][1][cond[1]])
        ranges = np.array([ranges])
        sup_spn = Inference.likelihood(spn, data=ranges, dtype=np.float64,
                                       node_likelihood=inference_support_ranges)[:, 0][0]
        freq_sets.append(["(" + ", ".join(str_conds) + ")", sup, sup_spn])

    rules = sorted(freq_sets, key=lambda x: x[2], reverse=True)
    rule_df = pd.DataFrame(rules, columns=["frequent set", "s_support", "g_support"])
    io.print_pretty_table(rule_df.head(400))

    exit()  # NOTE: early exit for debugging; the rule generation below is currently unreachable

    rules = []
    for (sup, conds) in freq_items_sorted:
        rule_body = []
        rule_head = []
        conf = np.nan
        ranges = [None] * len(spn.scope)
        for cond in conds:
            if cond[0] == feature_id:
                rule_head.append(feature_dict[cond[0]][0] + "=" + feature_dict[cond[0]][1][cond[1]])
            else:
                rule_body.append(feature_dict[cond[0]][0] + "=" + feature_dict[cond[0]][1][cond[1]])
            ranges[cond[0]] = NominalRange([cond[1]])  # Optimization possible

        ranges = np.array([ranges])
        prob_with_feature = Inference.likelihood(spn, data=ranges, dtype=np.float64,
                                                 node_likelihood=inference_support_ranges)[:, 0][0]
        ranges[0][feature_id] = None
        prob_without_feature = Inference.likelihood(spn, data=ranges, dtype=np.float64,
                                                    node_likelihood=inference_support_ranges)[:, 0][0]

        spn_sup = prob_without_feature
        spn_conf = prob_with_feature / prob_without_feature
        rules.append([" AND ".join(rule_body) + " --> " + " AND ".join(rule_head),
                      sup, conf, spn_sup, spn_conf, spn_sup * spn_conf])

    rules = sorted(rules, key=lambda x: x[5], reverse=True)
    rule_df = pd.DataFrame(rules, columns=["Rule", "c_Support", "c_Confidence",
                                           "spn_Support", "spn_Confidence", "score"])
    #rule_df.drop_duplicates(subset=["Rule"], keep="first", inplace=True)
    io.print_pretty_table(rule_df.head(400))
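# Hedged usage sketch for extract_rules ("some_dataset" is a placeholder name; the
# loaded SPN must cover the four binary features encoded in feature_dict above).
def _example_extract_rules():
    spn, _, _ = spn_handler.load_spn("some_dataset", 0.3, 0.01)
    extract_rules(spn, feature_id=1)  # rules whose head is the "c" feature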
def test_get_lending():
    df, value_dict, param_types = real_data.get_real_data("lending", only_n_rows=10000, seed=5)
    io.print_pretty_table(df.head(10))
    assert len(df.columns) == len(value_dict)
def test_get_Ecommerce():
    df, value_dict, param_types = real_data.get_real_data("Ecommerce")
    io.print_pretty_table(df.head(10))
    assert len(df.columns) == len(value_dict)
def test_get_OnlineRetail():
    df, value_dict, param_types = real_data.get_real_data("OnlineRetail")
    io.print_pretty_table(df.head(10))
dataset_name = "titanic"

# Parameters for the construction
rdc_threshold = 0.3
min_instances_slice = 0.01

if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
    print("Creating SPN ...")

    # Get the data
    df, value_dict, parametric_types = real_data.get_titanic()

    # Print the data (top 5 rows)
    io.print_pretty_table(df.head(5))

    # Print the value dict
    print(value_dict)

    # Create the SPN and save it to a file
    spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name)

# Load the SPN
spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)

# Print some statistics
fn.print_statistics(spn)
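# A hedged follow-up sketch using plain SPFlow inference on the loaded SPN: to my
# understanding, NaN entries are marginalized out by SPFlow's likelihood, so this
# asks for P(feature 0 = 0) alone. The query value is illustrative; consult
# value_dict for the actual encodings.
from spn.algorithms.Inference import likelihood

query = np.array([[0.0] + [np.nan] * (len(spn.scope) - 1)])
print(likelihood(spn, query))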