Example #1
def test_expect():
    spn = example_spns.get_gender_spn()

    rang = [None, None, None]
    expect = fn.expect(spn, feature_id=2, rang=rang)
    print(expect)

    rang = [NominalRange([0]), None, None]
    expect = fn.expect(spn, feature_id=2, rang=rang)
    print(expect)

    rang = [NominalRange([1]), None, None]
    expect = fn.expect(spn, feature_id=2, rang=rang)
    print(expect)

    rang = [None, NominalRange([0]), None]
    expect = fn.expect(spn, feature_id=2, rang=rang)
    print(expect)

    feature_scope = {2}
    data = np.array([[np.nan, np.nan, np.nan]])
    expect = fn.expects_spnflow(spn, feature_scope, data)
    print(expect)

    feature_scope = {2}
    data = np.array([np.nan, np.nan, np.nan])
    expect = fn.expect_spnflow(spn, feature_scope, data)
    print(expect)
Example #2
def test_marg():
    spn = example_spns.get_gender_spn()

    spn1 = fn.marg(spn, [2])
    fn.plot_spn(spn1, "marg1.pdf")

    spn2 = fn.marg(spn, [0])
    fn.plot_spn(spn2, "marg2.pdf")

    spn3 = fn.marg(spn, [1])
    fn.plot_spn(spn3, "marg3.pdf")

    spn4 = fn.marg(spn, [1, 2])
    fn.plot_spn(spn4, "marg4.pdf")

    rang = [None, NominalRange([1]), None]
    prob, spn5 = fn.marg_rang(spn, rang)
    fn.plot_spn(spn5, "marg5.pdf")

    rang = [None, NominalRange([1]), NumericRange([[10, 12]])]
    prob, spn6 = fn.marg_rang(spn, rang)
    fn.plot_spn(spn6, "marg6.pdf")

    rang = [NominalRange([0]), NominalRange([1]), None]
    prob = fn.prob(spn, rang)
    print(prob)
    prob = fn.prob(spn6, rang)
    print(prob)
Example #3
def test_classify():

    from util import io
    from data import real_data

    loc = "_spns"
    ident = "rdc=" + str(0.3) + "_mis=" + str(0.1)
    spn, _ = io.load(ident, "titanic", loc)
    value_dict = real_data.get_titanic_value_dict()
    #spn = fn.marg(spn, keep=[0,1,2,4,5,7])

    ranges = np.array(
        [[None, NominalRange([1]), None, None, None, None, None, None],
         [None, NominalRange([0]), None, None, None, None, None, None],
         [None, NominalRange([0]), None, None, None, None, None, None]])
    res = fn.classifies(spn, target_id=0, ranges=ranges, value_dict=value_dict)
    print(res)

    res = fn.classify(spn, target_id=0)
    print(res)

    df, _ = real_data.get_titanic()
    a = {v[1]: v[2] for _, v in value_dict.items() if v[0] == "discrete"}
    df = df.replace(a)

    preds = fn.classify_dataset(spn,
                                target_id=0,
                                df=df,
                                transform=True,
                                value_dict=value_dict)
    print(preds)
def explore_1():

    dataset_name = "rki_ed_1"
    rdc_threshold = 0.3
    min_instances_slice = 0.01
    if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
        df, value_dict, parametric_types = ed_data.get_rki_ed_1()
        spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name, [rdc_threshold], [min_instances_slice], value_dict)
    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)

    spn = fn.marg(spn, keep=[0, 2, 3, 4, 5])

    fn.print_statistics(spn)

    p = io.get_path("_results/ed_data_explore")

    #vz.visualize_overall_distribution(spn, value_dict)

    from spn.experiments.AQP.Ranges import NominalRange

    target_conds = [{0: NominalRange([5, 6])}, {0: NominalRange([0, 1, 2, 3, 4])}]
    #target_conds = [{0 : NominalRange([5,6]), 1 : NominalRange([0,1,2,3,4,5,6,7,8,9,10,11])}, {0 : NominalRange([0,1,2,3,4]), 1 : NominalRange([0,1,2,3,4,5,6,7,8,9,10,11])}]
    vz.visualize_target_based_conds_overall_distribution_compact(spn, target_conds, value_dict, target_names=["Wochenende", "Unter der Woche"], save_path=p + dataset_name + "_weekend_measures.pdf")
Example #5
def test_sampling():
    spn = example_spns.get_gender_spn()
    # Always same random number generator (fixed random_seed)

    samples = fn.sampling(spn, n_samples=10, random_seed=1)
    print(samples)

    samples = fn.sampling_rang(spn,
                               rang=[None, None, None, None],
                               n_samples=10,
                               random_seed=1)
    print(samples)

    samples = fn.sampling_rang(
        spn,
        rang=[None, None, NumericRange([[10, 11], [29, 30]])],
        n_samples=10,
        random_seed=1)
    print(samples)

    samples = fn.sampling_rang(
        spn,
        rang=[NominalRange([0]), None,
              NumericRange([[14, 15], [29, 30]])],
        n_samples=10,
        random_seed=1)
    print(samples)
def reduce_spn():
    from spn.experiments.AQP.Ranges import NominalRange, NumericRange
    from spn.structure.Base import Sum, Product
    from spn.algorithms.Inference import sum_likelihood, prod_likelihood

    from spn.structure.leaves.parametric.Parametric import Gaussian, Categorical
    from spn.structure.leaves.parametric.InferenceRange import categorical_likelihood_range
    from simple_spn.UpdateRange import categorical_update_range

    evidence = [NominalRange([0]), None, None, None]

    inference_support_ranges = {
        Gaussian: None,
        Categorical: categorical_likelihood_range,
        Sum: sum_likelihood,
        Product: prod_likelihood
    }

    distribution_update_ranges = {
        Gaussian: None,
        Categorical: categorical_update_range
    }

    #spn_util.plot_spn(spn, "old.pdf")

    prob, spn = spn_for_evidence(
        spn,
        evidence,
        node_likelihood=inference_support_ranges,
        distribution_update_ranges=distribution_update_ranges)
    print(prob)
Example #7
def classifies(spn, target_id, ranges, value_dict=None):
    if value_dict is None: value_dict = generate_adhoc_value_dict(spn)
    if ranges is None: ranges = np.array([[None] * (np.max(spn.scope) + 1)])
    assert (not any(ranges[:, target_id]))
    ps = []
    for v in range(len(value_dict[target_id][2])):
        ranges[:, target_id] = NominalRange([v])
        ps.append(probs(spn, ranges))
    return np.argmax(ps, axis=0)
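
A minimal usage sketch for classifies, assuming the gender SPN and the range classes used in the other examples; the target column of ranges must stay empty, as the assert enforces.

# Hypothetical call: predict feature 0 of the gender SPN for two evidence rows.
import numpy as np
from spn.experiments.AQP.Ranges import NominalRange, NumericRange

spn = example_spns.get_gender_spn()   # assumed helper, as in the tests above
ranges = np.array([[None, NominalRange([1]), None],
                   [None, None, NumericRange([[20, 30]])]])
print(classifies(spn, target_id=0, ranges=ranges))   # one predicted value index per row
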
def naive_approach(spn, min_support=0.1, value_dict=None):

    if value_dict is None: value_dict = fn.generate_adhoc_value_dict(spn)
    n_rv = np.max(spn.scope) + 1

    ranges = np.full(shape=(n_rv, n_rv), fill_value=None)
    for i in range(len(ranges)):
        if len(value_dict[i][2]) == 2:
            ranges[i][i] = NominalRange([1])

    freq_sets = []
    new_freq_sets = []
    for i in range(len(spn.scope)):
        print("Iteration: " + str(i))

        if len(ranges) == 0: break
        probs = fn.probs(spn, ranges)

        new_freq_sets = []
        for j, prob in enumerate(probs):
            if prob >= min_support:
                ids = [
                    k for k, cond in enumerate(ranges[j]) if cond is not None
                ]
                new_freq_sets.append([prob, ids])

        freq_sets += new_freq_sets

        ranges = []
        for prob, ids in new_freq_sets:
            for i in range(n_rv):
                if i not in ids:
                    rang = np.array([None] * n_rv)
                    rang[ids] = NominalRange([1])
                    rang[i] = NominalRange([1])
                    ranges.append(rang)
        ranges = np.array(ranges)

    return freq_sets
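
A hedged usage sketch for naive_approach, assuming an SPN over binary (0/1) features such as the flat rule SPNs used elsewhere in these examples:

# Hypothetical call: returns [support, feature_ids] pairs, collected level by level.
freq_sets = naive_approach(spn, min_support=0.1)
for support, feature_ids in sorted(freq_sets, key=lambda x: x[0], reverse=True):
    print(round(support, 4), feature_ids)
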
Example #9
def not_rang(rang, value_dict):
    # Inverts a condition vector: positions marked with np.nan stay unrestricted,
    # every other position is replaced by the complementary NominalRange.
    assert len(rang) == len(value_dict)
    res = [None] * len(rang)
    for i, r in enumerate(rang):
        if r is np.nan:
            continue
        vals = list(value_dict[i][2].keys())
        if isinstance(r, Range):
            excluded = set(r.get_ranges())
        else:
            excluded = {r}
        res[i] = NominalRange([v for v in vals if v not in excluded])
    return res
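
A small sketch for not_rang with a hypothetical two-feature value_dict; positions that should stay unrestricted are marked with np.nan, matching the check inside the function.

import numpy as np
from spn.experiments.AQP.Ranges import NominalRange

# Hypothetical value_dict: feature_id -> (type, name, {value_index: label})
value_dict = {0: ("discrete", "gender", {0: "male", 1: "female"}),
              1: ("discrete", "student", {0: "no", 1: "yes"})}

rang = [NominalRange([0]), np.nan]    # invert "gender=0", leave feature 1 unrestricted
print(not_rang(rang, value_dict))     # expected: [NominalRange([1]), None]
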
Example #10
def test_prob():
    spn = example_spns.get_gender_spn()

    rang = [None, None, None]
    prob = fn.prob(spn, rang)
    print(prob)

    rang = [NominalRange([0]), NominalRange([1]), NumericRange([[20]])]
    prob = fn.prob(spn, rang)
    print(prob)

    ranges = np.array([[None, None, NumericRange([[0, 20]])],
                       [NominalRange([0]), None, None],
                       [None, NominalRange([1]), None]])
    probs = fn.probs(spn, ranges)
    print(probs)

    inst = [0, np.nan, np.nan]
    prob = fn.prob_spflow(spn, inst)
    print(prob)

    data = np.array([[0, np.nan, np.nan], [0, 1, np.nan]])
    probs = fn.probs_spflow(spn, data)
    print(probs)
Example #11
def _generate_conds(target_id, value_dict, numeric_intervals=10):
    conds = []
    labels = []
    if value_dict[target_id][0] == "discrete":
        for val in sorted(value_dict[target_id][2]):
            conds.append(NominalRange([val]))
            labels.append(value_dict[target_id][2][val])
    elif value_dict[target_id][0] == "numeric":
        val_space = np.linspace(value_dict[target_id][2][0],
                                value_dict[target_id][2][1],
                                numeric_intervals + 1)
        for interval in zip(val_space[:-1], val_space[1:]):
            conds.append(NumericRange([list(interval)]))
            labels.append(str(list(interval)))
    else:
        raise Exception(
            "Not implemented for other than discrete or numeric ...: " +
            str(value_dict[target_id][0]))
    return conds, labels
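
A brief sketch of _generate_conds with hypothetical value_dict entries; a discrete feature yields one NominalRange per value, a numeric feature yields numeric_intervals interval conditions.

# Hypothetical value_dict with one discrete and one numeric feature
value_dict = {0: ("discrete", "gender", {0: "male", 1: "female"}),
              1: ("numeric", "age", [0, 100])}

conds, labels = _generate_conds(0, value_dict)
print(labels)                                        # ['male', 'female']

conds, labels = _generate_conds(1, value_dict, numeric_intervals=4)
print(labels)                                        # four interval labels covering [0, 100]
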
Example #12
def classify_dataset(spn,
                     target_id,
                     df,
                     transform=False,
                     value_dict=None,
                     epsilon=0.01):
    if value_dict is None: value_dict = generate_adhoc_value_dict(spn)
    sorted_scope = sorted(spn.scope)

    if transform:
        inv_val_dict = {
            v[1]: {v2: k2
                   for k2, v2 in v[2].items()}
            for _, v in value_dict.items() if v[0] == "discrete"
        }
        for col_name, map_dict in inv_val_dict.items():
            df[col_name] = df[col_name].map(map_dict)

    values = np.array(df.values)
    ranges = np.full(shape=(len(values), np.max(spn.scope) + 1),
                     fill_value=None)
    for i, col in enumerate(values.T):
        f_id = sorted_scope[i]
        if f_id == target_id: continue

        if value_dict[f_id][0] == "discrete":
            for j, v in enumerate(col):
                ranges[j, f_id] = NominalRange([v])
        elif value_dict[f_id][0] == "numeric":
            bound = epsilon * (value_dict[f_id][2][1] - value_dict[f_id][2][0])
            for j, v in enumerate(col):
                ranges[j, f_id] = NumericRange([[v - bound, v + bound]])
        else:
            raise Exception("Unknown attribute-type: " +
                            str(value_dict[f_id][0]))

    return classifies(spn, target_id, ranges, value_dict)
def feature_importance(spn, target_id, rang=None, value_dict=None, numeric_prec=50):
    
    if value_dict is None : value_dict = fn.generate_adhoc_value_dict(spn)
    if rang is not None : assert(rang[target_id] is None)
    if rang is not None: _, spn = fn.marg_rang(spn, rang)
    
    n_vals = len(value_dict[target_id][2])
    
    
    overall_pops = []
    for v in range(n_vals):
        tmp_rang = [None] * (np.max(spn.scope)+1)
        tmp_rang[target_id] = NominalRange([v])
        p, spn1 = fn.marg_rang(spn, tmp_rang)
        overall_pop = fn.get_overall_population(spn1, value_dict=value_dict, numeric_prec=numeric_prec)
        overall_pops.append([p, overall_pop])
    
    fis = []
    for f_id in spn1.scope:
        dists = [[p, overall_pop[f_id]] for p, overall_pop in overall_pops]
        fi = _compare_distributions(dists, value_dict[f_id])
        fis.append(fi)
    
    return fis  
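
A hedged usage sketch for feature_importance, assuming the titanic SPN and value_dict loaded as in the classification example above, and that _compare_distributions is provided by the surrounding module:

# Hypothetical call: one comparison score per feature remaining in the SPN's scope.
fis = feature_importance(spn, target_id=0, value_dict=value_dict)
print(fis)
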
Example #14
def visualize_Density(spn):

    from spn.experiments.AQP.Ranges import NominalRange, NumericRange
    from spn.algorithms import Inference
    from simple_spn.InferenceRange import categorical_likelihood_range, gaussian_likelihood_range
    from spn.structure.Base import Sum, Product
    from spn.algorithms.Inference import sum_likelihood, prod_likelihood
    from spn.structure.leaves.parametric.Parametric import Gaussian, Categorical
    from simple_spn.UpdateRange import categorical_update_range

    inference_support_ranges = {
        Gaussian: gaussian_likelihood_range,   # needed for the NumericRange evaluations below
        Categorical: categorical_likelihood_range,
        Sum: sum_likelihood,
        Product: prod_likelihood
    }

    distribution_update_ranges = {
        Gaussian: None,
        Categorical: categorical_update_range
    }

    import matplotlib.pyplot as plt
    _, axes = plt.subplots(1,
                           5,
                           figsize=(15, 10),
                           squeeze=False,
                           sharey=False,
                           sharex=True)

    space_start = 0.00
    space_end = 1.0
    steps = 100
    max_y = 5

    for i in range(5):
        x_vals = np.linspace(space_start, space_end, num=steps)
        ranges = []
        for x_val in x_vals:
            r = [None] * i + [NumericRange([[x_val]])] + [None] * (5 - i)
            ranges.append(r)

        ranges = np.array(ranges)

        y_vals = Inference.likelihood(
            spn,
            data=ranges,
            dtype=np.float64,
            node_likelihood=inference_support_ranges)[:, 0]

        axes[0][i].plot(x_vals, y_vals)
        axes[0][i].set_title("Method " + str(i) + " All")
        axes[0][i].set_ylim([0, max_y])

    evidence = [None, None, None, None, None, NominalRange([0])]
    prob_no_alarm, spn_no_alarm = spn_for_evidence(
        spn,
        evidence,
        node_likelihood=inference_support_ranges,
        distribution_update_ranges=distribution_update_ranges)
    print(prob_no_alarm)

    for i in range(5):
        x_vals = np.linspace(space_start, space_end, num=steps)
        ranges = []
        for x_val in x_vals:
            r = [None] * i + [NumericRange([[x_val]])] + [None] * (5 - i)
            ranges.append(r)

        ranges = np.array(ranges)

        y_vals = Inference.likelihood(
            spn_no_alarm,
            data=ranges,
            dtype=np.float64,
            node_likelihood=inference_support_ranges)[:, 0]

        axes[0][i].plot(x_vals, y_vals, label="No Alarm", linestyle=":")

    evidence = [None, None, None, None, None, NominalRange([1])]
    prob_alarm, spn_alarm = spn_for_evidence(
        spn,
        evidence,
        node_likelihood=inference_support_ranges,
        distribution_update_ranges=distribution_update_ranges)
    print(prob_alarm)

    for i in range(5):
        x_vals = np.linspace(space_start, space_end, num=steps)
        ranges = []
        for x_val in x_vals:
            r = [None] * i + [NumericRange([[x_val]])] + [None] * (5 - i)
            ranges.append(r)

        ranges = np.array(ranges)

        y_vals = Inference.likelihood(
            spn_alarm,
            data=ranges,
            dtype=np.float64,
            node_likelihood=inference_support_ranges)[:, 0]

        axes[0][i].plot(x_vals, y_vals, label="Alarm")

    plt.legend()
    plt.tight_layout()

    plt.savefig("pdp.pdf")

    plt.show()

    spn_util.plot_spn(spn, "pval.pdf")

    tmp = get_nodes_with_weight(spn, 5)

    for (weight, node) in tmp:
        print(str(round(node.p[1], 2)) + "\t" + str(weight))
Example #15
def visualize_Density_2d(spn):

    from spn.experiments.AQP.Ranges import NominalRange, NumericRange
    from spn.algorithms import Inference
    from simple_spn.InferenceRange import categorical_likelihood_range, gaussian_likelihood_range
    from simple_spn.UpdateRange import categorical_update_range
    from spn.experiments.AQP.Ranges import NominalRange, NumericRange
    from spn.structure.Base import Sum, Product
    from spn.algorithms.Inference import sum_likelihood, prod_likelihood
    from spn.structure.leaves.parametric.Parametric import Gaussian, Categorical

    distribution_update_ranges = {
        Gaussian: None,
        Categorical: categorical_update_range
    }

    inference_support_ranges = {
        Gaussian: gaussian_likelihood_range,
        Categorical: categorical_likelihood_range,
        Sum: sum_likelihood,
        Product: prod_likelihood
    }

    import matplotlib.pyplot as plt
    _, axes = plt.subplots(1,
                           3,
                           figsize=(15, 10),
                           squeeze=False,
                           sharey=False,
                           sharex=True)
    x_vals = np.linspace(0, 1, num=50)
    y_vals = np.linspace(0, 1, num=50)
    X, Y = np.meshgrid(x_vals, y_vals)

    ranges = []
    vals = []
    for y_val in y_vals:
        print(y_val)
        ranges = []
        for x_val in x_vals:
            ranges.append([
                NumericRange([[x_val]]),
                NumericRange([[y_val]]), None, None, None, None
            ])

        ranges = np.array(ranges)
        densities = Inference.likelihood(
            spn,
            data=ranges,
            dtype=np.float64,
            node_likelihood=inference_support_ranges)[:, 0]

        for i, d in enumerate(densities):
            if d > 5:
                densities[i] = 5

        vals.append(densities)

    vals = np.array(vals)
    axes[0][0].contour(X, Y, vals)
    axes[0][0].set_xlabel("Method1")
    axes[0][0].set_ylabel("Method2")
    axes[0][0].set_title("Overall")

    evidence = [None, None, None, None, None, NominalRange([0])]
    prob_no_alarm, spn_no_alarm = spn_for_evidence(
        spn,
        evidence,
        node_likelihood=inference_support_ranges,
        distribution_update_ranges=distribution_update_ranges)
    print(prob_no_alarm)

    ranges = []
    vals = []
    for y_val in y_vals:
        print(y_val)
        ranges = []
        for x_val in x_vals:
            ranges.append([
                NumericRange([[x_val]]),
                NumericRange([[y_val]]), None, None, None, None
            ])

        ranges = np.array(ranges)
        densities = Inference.likelihood(
            spn_no_alarm,
            data=ranges,
            dtype=np.float64,
            node_likelihood=inference_support_ranges)[:, 0]

        for i, d in enumerate(densities):
            if d > 5:
                densities[i] = 5

        vals.append(densities)

    vals = np.array(vals)
    axes[0][1].contour(X, Y, vals)
    axes[0][1].set_xlabel("Method1")
    axes[0][1].set_ylabel("Method2")
    axes[0][1].set_title("Keine Epidemie")

    evidence = [None, None, None, None, None, NominalRange([1])]
    prob_alarm, spn_alarm = spn_for_evidence(
        spn,
        evidence,
        node_likelihood=inference_support_ranges,
        distribution_update_ranges=distribution_update_ranges)
    print(prob_alarm)

    ranges = []
    vals = []
    for y_val in y_vals:
        print(y_val)
        ranges = []
        for x_val in x_vals:
            ranges.append([
                NumericRange([[x_val]]),
                NumericRange([[y_val]]), None, None, None, None
            ])

        ranges = np.array(ranges)
        densities = Inference.likelihood(
            spn_alarm,
            data=ranges,
            dtype=np.float64,
            node_likelihood=inference_support_ranges)[:, 0]

        for i, d in enumerate(densities):
            if d > 5:
                densities[i] = 5

        vals.append(densities)

    vals = np.array(vals)
    axes[0][2].contour(X, Y, vals)
    axes[0][2].set_xlabel("Method1")
    axes[0][2].set_ylabel("Method2")
    axes[0][2].set_title("Epidemie")

    plt.savefig("cdp.pdf")

    plt.show()
Example #16
def visualized_target_based_expected_sub_populations(spn,
                                                     target_id,
                                                     value_dict=None,
                                                     top=None,
                                                     rang=None,
                                                     numeric_prec=10,
                                                     save_path=None):

    if value_dict is None: value_dict = fn.generate_adhoc_value_dict(spn)
    if rang is not None: _, spn = fn.marg_rang(spn, rang)

    n_vals = len(value_dict[target_id][2])

    ps = []
    all_lines = []
    for v in range(n_vals):

        tmp_rang = [None] * (np.max(spn.scope) + 1)
        tmp_rang[target_id] = NominalRange([v])
        p, spn1 = fn.marg_rang(spn, tmp_rang)
        ps.append(p)
        sub_pops = fn.get_sub_populations(spn1, sort=True, top=top)
        sub_pops = [[p * p1, dists] for p1, dists in sub_pops]

        lines = []
        for [p, dists] in sub_pops:
            line = []
            for dist in dists:
                f_id = dist.scope[0]

                if value_dict[f_id][0] == "discrete":
                    rang = [None] * (np.max(spn.scope) + 1)
                    expect = fn.expect(dist, f_id, rang)
                    y_val = np.linspace(0, 1,
                                        len(value_dict[f_id][2]))[int(expect)]
                    line.append(y_val)

                elif value_dict[f_id][0] == "numeric":
                    rang = [None] * (np.max(spn.scope) + 1)
                    expect = fn.expect(dist, f_id, rang)

                    mi = value_dict[f_id][2][0]
                    ma = value_dict[f_id][2][1]
                    y_val = (expect - mi) / (ma - mi)
                    line.append(y_val)
                else:
                    raise Exception("Unknown attribute-type: " +
                                    str(value_dict[dist.scope[0]]))

            lines.append([p, line])
        all_lines.append(lines)

    fig, axes = plt.subplots(n_vals,
                             1,
                             figsize=(16, 6 * n_vals),
                             squeeze=False)
    for i, lines in enumerate(all_lines):

        plot = axes[i][0]
        plot.set_yticklabels([])
        for [p, line] in lines:
            x_vals = []
            y_vals = []
            # Use a separate index so the enumerate index of the outer loop is not shadowed
            for k in range(len(line) - 1):
                y_val = line[k]
                next_y_val = line[k + 1]

                for r in np.linspace(0, 1, numeric_prec):
                    x_vals.append(k + r)
                    y_vals.append(y_val + (next_y_val - y_val) * r +
                                  np.random.normal() * 0.025)

            plot.plot(x_vals, y_vals, linewidth=p * 100)

        x_feature_ids = sorted(list(set(spn.scope) - set([target_id])))
        plot.set_xticks(np.arange(len(x_feature_ids)))
        if value_dict is not None:
            plot.set_xticklabels(
                [value_dict[scope][1] for scope in x_feature_ids])

        for j, feature_id in enumerate(x_feature_ids):

            if value_dict[feature_id][0] == "discrete":
                for i, y_val in enumerate(
                        np.linspace(0, 1, len(value_dict[feature_id][2]))):
                    val_name = value_dict[feature_id][2][i]
                    plot.text(j, y_val, val_name)
            elif value_dict[feature_id][0] == "numeric":
                mi = value_dict[feature_id][2][0]
                ma = value_dict[feature_id][2][1]
                for i, y_val in enumerate(np.linspace(0, 1, 5)):
                    val_name = round(y_val * (ma - mi) + mi, 4)
                    plot.text(j, y_val, val_name)
            else:
                raise Exception(
                    "Not implemented for other than discrete or numeric")

    pad_row = 5
    for i, (ax, p) in enumerate(zip(axes[:, 0], ps)):
        info = value_dict[target_id][1] + "=" + value_dict[target_id][2][
            i] + " " + str(round(p * 100, 4)) + "%\n"
        ax.annotate(info,
                    xy=(0, 0.5),
                    xytext=(-ax.yaxis.labelpad - pad_row, 0),
                    xycoords=ax.yaxis.label,
                    textcoords='offset points',
                    size='large',
                    ha='right',
                    va='center')
    plt.tight_layout()
    fig.subplots_adjust(left=0.15)

    if save_path is None:
        plt.show()
    else:
        plt.savefig(save_path)
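
A usage sketch, assuming the titanic SPN and value_dict from the earlier examples; the file name is hypothetical.

# Hypothetical call: one subplot of expected sub-population lines per target value.
visualized_target_based_expected_sub_populations(spn,
                                                 target_id=0,
                                                 value_dict=value_dict,
                                                 top=10,
                                                 save_path="sub_populations.pdf")
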
Example #17
def demo_visualize_density():
    #data, parametric_types = real_data.get_p_value_dataset()
    #learn_SPN.create_parametric_spns(data, parametric_types, [0.3], [0.01], folder="p_value_test")

    loc = "_spns"
    ident = "rdc=" + str(0.3) + "_mis=" + str(0.01)
    spn, _ = io.load(ident, "p_value_test", loc)
    value_dict = real_data.get_p_value_test_value_dict()

    rang = None
    save_path = os.path.dirname(os.path.realpath(
        __file__)) + "/../../../_plots/interpretability/blackbox/density1.pdf"
    visualize_density(spn,
                      value_dict,
                      rang=rang,
                      max_density=10,
                      save_path=save_path)

    rang = [None] * 5 + [NominalRange([0])]
    save_path = os.path.dirname(os.path.realpath(
        __file__)) + "/../../../_plots/interpretability/blackbox/density2.pdf"
    visualize_density(spn,
                      value_dict,
                      rang=rang,
                      max_density=10,
                      save_path=save_path)

    rang = None
    save_path = os.path.dirname(os.path.realpath(
        __file__)) + "/../../../_plots/interpretability/blackbox/density3.pdf"
    visualize_density_target(spn,
                             5,
                             value_dict,
                             rang=rang,
                             max_density=10,
                             save_path=save_path)

    loc = "_spns"
    ident = "rdc=" + str(0.3) + "_mis=" + str(0.01)
    spn, _ = io.load(ident, "titanic", loc)
    value_dict = real_data.get_titanic_value_dict()

    rang = None
    save_path = os.path.dirname(os.path.realpath(
        __file__)) + "/../../../_plots/interpretability/blackbox/density5.pdf"
    visualize_density(spn, value_dict, max_density=0.1, save_path=save_path)

    rang = None
    save_path = os.path.dirname(os.path.realpath(
        __file__)) + "/../../../_plots/interpretability/blackbox/density6.pdf"
    visualize_density_target(spn,
                             0,
                             value_dict,
                             max_density=0.1,
                             save_path=save_path)

    rang = None
    save_path = os.path.dirname(os.path.realpath(
        __file__)) + "/../../../_plots/interpretability/blackbox/density7.pdf"
    visualize_density_target(spn,
                             2,
                             value_dict,
                             max_density=0.1,
                             save_path=save_path)

    rang = [None] * 2 + [NominalRange([0])] + [None] * 5
    save_path = os.path.dirname(os.path.realpath(
        __file__)) + "/../../../_plots/interpretability/blackbox/density8.pdf"
    visualize_density_target(spn,
                             0,
                             value_dict,
                             max_density=0.1,
                             save_path=save_path)
Example #18
def visualize_target_based_overall_distribution_single(spn,
                                                       target_id,
                                                       value_dict=None,
                                                       rang=None,
                                                       numeric_prec=50,
                                                       save_path=None):

    if value_dict is None: value_dict = fn.generate_adhoc_value_dict(spn)
    if rang is not None: assert (rang[target_id] is None)
    if rang is not None: _, spn = fn.marg_rang(spn, rang)

    n_vals = len(value_dict[target_id][2])

    ncols = len(spn.scope) - 1
    nrows = n_vals
    figsize_x = ncols * 3
    figsize_y = nrows * 2
    fig, axes = plt.subplots(nrows,
                             ncols,
                             figsize=(figsize_x, figsize_y),
                             squeeze=False)

    ps = []
    for v in range(n_vals):
        tmp_rang = [None] * (np.max(spn.scope) + 1)
        tmp_rang[target_id] = NominalRange([v])

        p, spn1 = fn.marg_rang(spn, tmp_rang)
        ps.append(p)
        overall_population = fn.get_overall_population(
            spn1, value_dict=value_dict, numeric_prec=numeric_prec)

        for i, f_id in enumerate(sorted(spn1.scope)):
            dist = overall_population[f_id]

            if dist["feature_type"] == "discrete":
                viz_helper.bar_plot(axes[v][i],
                                    dist["y_means"],
                                    dist["x_labels"],
                                    y_err=np.sqrt(dist["y_vars"]),
                                    y_label="probability",
                                    ylim=[0, 1])

            elif dist["feature_type"] == "numeric":
                viz_helper.line_plot(axes[v][i],
                                     dist["x_vals"],
                                     dist["y_means"],
                                     y_errs=np.sqrt(dist["y_vars"]),
                                     y_label="density")
            else:
                raise Exception("Unknown attribute-type: " +
                                str(dist["feature_type"]))

    pad_col = 5
    feature_names = [value_dict[x][1] for x in sorted(spn1.scope)]
    for ax, col in zip(axes[0], feature_names):
        ax.annotate(col,
                    xy=(0.5, 1),
                    xytext=(0, pad_col),
                    xycoords='axes fraction',
                    textcoords='offset points',
                    size='large',
                    ha='center',
                    va='baseline')

    pad_row = 5
    for i, p in enumerate(ps):
        axes[i][0].annotate(str(round(p * 100, 4)) + "%\n" +
                            value_dict[target_id][1] + "=" +
                            value_dict[target_id][2][i],
                            xy=(0, 0.5),
                            xytext=(-axes[i][0].yaxis.labelpad - pad_row, 0),
                            xycoords=axes[i][0].yaxis.label,
                            textcoords='offset points',
                            size='large',
                            ha='right',
                            va='center')

    plt.tight_layout()
    fig.subplots_adjust(left=0.15, top=0.9)

    if save_path is None:
        plt.show()
    else:
        plt.savefig(save_path)
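
A usage sketch under the same assumptions (titanic SPN and value_dict from the earlier examples; hypothetical file name):

# Hypothetical call: one row of per-feature distributions for each target value.
visualize_target_based_overall_distribution_single(spn,
                                                    target_id=0,
                                                    value_dict=value_dict,
                                                    save_path="target_single.pdf")
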
Example #19
def visualize_target_based_overall_distribution_compact(
        spn,
        target_id,
        value_dict=None,
        rang=None,
        numeric_prec=50,
        save_path=None):

    if value_dict is None: value_dict = fn.generate_adhoc_value_dict(spn)
    if rang is not None: assert (rang[target_id] is None)
    if rang is not None: _, spn = fn.marg_rang(spn, rang)

    n_vals = len(value_dict[target_id][2])

    ncols = len(spn.scope) - 1
    nrows = 1
    figsize_x = ncols * 3
    figsize_y = nrows * 3
    fig, axes = plt.subplots(nrows,
                             ncols,
                             figsize=(figsize_x, figsize_y),
                             squeeze=False)

    ps = []
    plot_data = {f_id: [] for f_id in spn.scope if f_id != target_id}
    for v in range(n_vals):
        tmp_rang = [None] * (np.max(spn.scope) + 1)
        tmp_rang[target_id] = NominalRange([v])

        p, spn1 = fn.marg_rang(spn, tmp_rang)
        ps.append(p)
        overall_population = fn.get_overall_population(
            spn1, value_dict=value_dict, numeric_prec=numeric_prec)
        for f_id in spn1.scope:
            plot_data[f_id].append(overall_population[f_id])

    for i, f_id in enumerate(plot_data):

        if value_dict[f_id][0] == "discrete":
            y_means = []
            y_errs = []
            legend_labels = []
            for j, dist in enumerate(plot_data[f_id]):
                y_means.append(dist["y_means"])
                y_errs.append(dist["y_vars"])
                legend_labels.append(
                    str(value_dict[target_id][1]) + "=" +
                    str(value_dict[target_id][2][j]))
            viz_helper.multiple_bar_plot(axes[0][i],
                                         y_means,
                                         dist["x_labels"],
                                         y_errs=np.sqrt(y_errs),
                                         legend_labels=legend_labels,
                                         y_label="probability",
                                         ylim=[0, 1])

        elif value_dict[f_id][0] == "numeric":
            for j, dist in enumerate(plot_data[f_id]):
                viz_helper.line_plot(axes[0][i],
                                     dist["x_vals"],
                                     dist["y_means"],
                                     y_errs=np.sqrt(dist["y_vars"]),
                                     label=str(value_dict[target_id][1]) +
                                     "=" + str(value_dict[target_id][2][j]),
                                     y_label="density")
        else:
            raise Exception("Unknown attribute-type: " +
                            str(value_dict[f_id][0]))

    pad_col = 5
    feature_names = [value_dict[x][1] for x in sorted(spn1.scope)]
    for ax, col in zip(axes[0], feature_names):
        ax.annotate(col,
                    xy=(0.5, 1),
                    xytext=(0, pad_col),
                    xycoords='axes fraction',
                    textcoords='offset points',
                    size='large',
                    ha='center',
                    va='baseline')

    pad_row = 5
    info = ""
    for i, prob in enumerate(ps):
        info += str(value_dict[target_id][1]) + "=" + str(
            value_dict[target_id][2][i]) + " " + str(round(prob * 100,
                                                           4)) + "%\n"
    axes[0][0].annotate(info,
                        xy=(0, 0.5),
                        xytext=(-axes[0][0].yaxis.labelpad - pad_row, 0),
                        xycoords=axes[0][0].yaxis.label,
                        textcoords='offset points',
                        size='large',
                        ha='right',
                        va='center')
    axes[0][0].legend(loc='upper center', bbox_to_anchor=(0.5, -0.25))
    plt.tight_layout()
    fig.subplots_adjust(left=0.15, top=0.9)

    if save_path is None:
        plt.show()
    else:
        plt.savefig(save_path)
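
A usage sketch for the compact variant under the same assumptions; all target values are overlaid in a single row of plots.

# Hypothetical call, mirroring the single-plot variant above.
visualize_target_based_overall_distribution_compact(spn,
                                                     target_id=0,
                                                     value_dict=value_dict,
                                                     save_path="target_compact.pdf")
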
Example #20
def visualize_density_target(spn,
                             target_id,
                             value_dict,
                             rang=None,
                             n_steps=50,
                             max_density=None,
                             save_path=None):

    #Only select numeric features
    selected_features = []
    for feature_id in spn.scope:
        if value_dict[feature_id][0] == "numeric":
            selected_features.append(feature_id)

    #Create ranges
    if rang is None:
        rang = np.array([None] * (max(spn.scope) + 1))

    results = []
    assert (value_dict[target_id][0] == "discrete")
    for v in value_dict[target_id][2]:
        rang[target_id] = NominalRange([v])

        ranges = []
        for i, feature_id in enumerate(selected_features):
            for x_val in np.linspace(value_dict[feature_id][2][0],
                                     value_dict[feature_id][2][1],
                                     num=n_steps):
                n_rang = rang.copy()
                n_rang[feature_id] = NumericRange([[x_val]])
                ranges.append(n_rang)

        results.append(fn.probs(spn, np.array(ranges)))

    #Visualize
    ncols = len(results)
    nrows = len(selected_features)
    figsize_x = 16
    figsize_y = 6 * len(selected_features)
    _, axes = plt.subplots(nrows,
                           ncols,
                           figsize=(figsize_x, figsize_y),
                           squeeze=False,
                           sharey=True,
                           sharex=False)

    for j, res in enumerate(results):
        for i, feature_id in enumerate(selected_features):
            plot = axes[i][j]

            x_vals = np.linspace(value_dict[feature_id][2][0],
                                 value_dict[feature_id][2][1],
                                 num=n_steps)
            y_vals = res[n_steps * i:n_steps * i + n_steps]
            plot.plot(x_vals, y_vals)

            if max_density is not None:
                plot.set_ylim(0, max_density)
            plot.set_title(value_dict[feature_id][1] + " - " +
                           value_dict[target_id][1] + "=" +
                           value_dict[target_id][2][j])

    plt.tight_layout()
    if save_path is None:
        plt.show()
    else:
        plt.savefig(save_path)
Example #21
def evaluate_discrete_leaf(leaf, f_vals):
    f_id = leaf.scope[0]
    ranges = np.array([f_id * [None] + [NominalRange([x])] for x in f_vals])
    return probs(leaf, ranges)
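
A small sketch for evaluate_discrete_leaf, assuming the Categorical leaf type and the probs helper used throughout this module:

from spn.structure.leaves.parametric.Parametric import Categorical

# Hypothetical leaf over feature 2 with three values; returns one probability per value index.
leaf = Categorical(p=[0.2, 0.5, 0.3], scope=[2])
print(evaluate_discrete_leaf(leaf, f_vals=[0, 1, 2]))   # expected: roughly [0.2, 0.5, 0.3]
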
Example #22
from spn.structure.Base import Sum, Product
from spn.algorithms.Inference import sum_likelihood, prod_likelihood

from spn.structure.leaves.parametric.Parametric import Gaussian, Categorical
from spn.structure.leaves.parametric.InferenceRange import categorical_likelihood_range
from simple_spn.internal.UpdateRange import categorical_update_range


inference_support_ranges = {Gaussian    : None,
                            Categorical : categorical_likelihood_range,
                            Sum         : sum_likelihood,
                            Product     : prod_likelihood}
distribution_update_ranges = {Gaussian    : None,
                              Categorical : categorical_update_range}

evidence = [None, NominalRange([1]), None, None]
prob, pos_spn = spn_for_evidence(flat_spn, evidence, node_likelihood=inference_support_ranges, distribution_update_ranges=distribution_update_ranges)
#spn_util.plot_spn(pos_spn, "positive_flat_rule_spn.pdf")

evidence = [None, NominalRange([0]), None, None]
prob, neg_spn = spn_for_evidence(flat_spn, evidence, node_likelihood=inference_support_ranges, distribution_update_ranges=distribution_update_ranges)
#spn_util.plot_spn(neg_spn, "negative_flat_rule_spn.pdf")
Example #23
def extract_rules(spn, feature_id=1):

    from spn.experiments.AQP.Ranges import NominalRange
    from spn.algorithms import Inference
    from simple_spn.internal.InferenceRange import categorical_likelihood_range
    from spn.structure.Base import Sum, Product
    from spn.algorithms.Inference import sum_likelihood, prod_likelihood
    from spn.structure.leaves.parametric.Parametric import Categorical

    inference_support_ranges = {Categorical : categorical_likelihood_range,
                                Sum         : sum_likelihood,
                                Product     : prod_likelihood}

    freq_items = get_frequent_items(spn, min_support=0.0)
    freq_items_filtered = freq_items  # filter(lambda x : any(cond[0] == feature_id for cond in x[1]), freq_items)
    freq_items_sorted = sorted(freq_items_filtered, key=lambda x: x[0], reverse=True)

    #evidence = numpy.empty((3,3,)

    feature_dict = {0: ("g", ("m  ", "w  ")), 1: ("c", ("no ", "yes")), 2: ("s", ("no ", "yes")), 3: ("w", ("no ", "yes"))}
    freq_sets = []
    for (sup, conds) in freq_items_sorted:

        str_conds = []
        ranges = [None] * len(spn.scope)
        for cond in conds:
            ranges[cond[0]] = NominalRange([cond[1]])
            str_conds.append(feature_dict[cond[0]][0] + "=" + feature_dict[cond[0]][1][cond[1]])

        ranges = np.array([ranges])
        sup_spn = Inference.likelihood(spn, data=ranges, dtype=np.float64, node_likelihood=inference_support_ranges)[:, 0][0]

        freq_sets.append(["(" + ", ".join(str_conds) + ")", sup, sup_spn])

    rules = sorted(freq_sets, key=lambda x: x[2], reverse=True)
    rule_df = pd.DataFrame(rules, columns=["frequent set", "s_support", "g_support"])

    io.print_pretty_table(rule_df.head(400))

    exit()  # everything below is never reached because of this exit()

    rules = []
    for (sup, conds) in freq_items_sorted:

        rule_body = []
        rule_head = []
        conf = np.nan

        ranges = [None] * len(spn.scope)

        for cond in conds:
            if cond[0] == feature_id:
                rule_head.append(feature_dict[cond[0]][0] + "=" + feature_dict[cond[0]][1][cond[1]])
            else:
                rule_body.append(feature_dict[cond[0]][0] + "=" + feature_dict[cond[0]][1][cond[1]])

            ranges[cond[0]] = NominalRange([cond[1]])

        #Optimization possible
        ranges = np.array([ranges])
        prob_with_feature = Inference.likelihood(spn, data=ranges, dtype=np.float64, node_likelihood=inference_support_ranges)[:, 0][0]

        ranges[0][feature_id] = None
        prob_without_feature = Inference.likelihood(spn, data=ranges, dtype=np.float64, node_likelihood=inference_support_ranges)[:, 0][0]

        spn_sup = prob_without_feature
        spn_conf = prob_with_feature / prob_without_feature

        rules.append([" AND ".join(rule_body) + "-->" + " AND ".join(rule_head), sup, conf, spn_sup, spn_conf, spn_sup * spn_conf])

    rules = sorted(rules, key=lambda x: x[5], reverse=True)

    rule_df = pd.DataFrame(rules, columns=["Rule", "c_Support", "c_Confidence", "spn_Support", "spn_Confidence", "score"])

    #rule_df.drop_duplicates(subset=["Rule"], keep = True, inplace = True)

    io.print_pretty_table(rule_df.head(400))
Example #24
from spn.experiments.AQP.Ranges import NominalRange, NumericRange

#Import inference
from spn.algorithms import Inference
from spn.algorithms.Inference import sum_likelihood, prod_likelihood


inference_support_ranges = {PiecewiseLinear : piecewise_likelihood_range,
                            Categorical     : categorical_likelihood_range,
                            IdentityNumeric : identity_likelihood_range,
                            Sum             : sum_likelihood,
                            Product         : prod_likelihood}

#Use None instead of np.nan
ranges = np.array([[None, None, None],                                                          #Without any conditions
                   [NominalRange([0]), None, None],                                             #Only male
                   [NominalRange([0]), NominalRange([1]), None],                                #Only male and student
                   [NominalRange([0]), NominalRange([1]), NumericRange([[21, 100]])],           #Only male and student and older than 21
                   [NominalRange([0]), NominalRange([1]), NumericRange([[10, 15], [25, 100]])]] #Only male and student and age between 10 and 15 or 25 and 100
)
probabilities = Inference.likelihood(root_node, ranges, dtype=np.float64, node_likelihood=inference_support_ranges)

print("Probabilities:")
print(probabilities)
print()

#Sampling for given ranges
from spn.algorithms import SamplingRange
from spn.structure.leaves.piecewise.SamplingRange import sample_piecewise_node
Example #25
    x = [0.,  1.,  2.,  3., 4.]
    y = [0., 0., 0., 10., 0.]
    x, y = np.array(x), np.array(y)
    auc = np.trapz(y, x)
    y = y / auc
    node4 = PiecewiseLinear(x_range=x, y_range=y, bin_repr_points=x[1:-1], scope=[1])
    
    root_node = 0.49 * (node1 * node3) + 0.51 * (node2 * node4)
    
       
    #Set context
    #meta_types = [MetaType.DISCRETE, MetaType.REAL]
    #domains = [[0,1],[0.,4.]]
    #ds_context = Context(meta_types=meta_types, domains=domains)
    
    
    inference_support_ranges = {PiecewiseLinear : piecewise_likelihood_range, 
                                Categorical     : categorical_likelihood_range}
    
    node_sample = {Categorical : sample_categorical_node,
                   PiecewiseLinear : sample_piecewise_node}
    
    ranges = [NominalRange([0]),None]
    samples = SamplingRange.sample_instances(root_node, 2, 30, rand_gen, ranges=ranges, node_sample=node_sample, node_likelihood=inference_support_ranges)#, return_Zs, return_partition, dtype)
    print("Samples: " + str(samples))
    
    ranges = [NominalRange([0]),NumericRange([[3., 3.1], [3.5, 4.]])]
    samples = SamplingRange.sample_instances(root_node, 2, 30, rand_gen, ranges=ranges, node_sample=node_sample, node_likelihood=inference_support_ranges)#, return_Zs, return_partition, dtype)
    print("Samples: " + str(samples))
Example #26
    samples = fn.sampling_rang(spn,
                               rang=[None, None, None, None],
                               n_samples=10,
                               random_seed=1)
    print(samples)

    samples = fn.sampling_rang(
        spn,
        rang=[None, None, NumericRange([[10, 11], [29, 30]])],
        n_samples=10,
        random_seed=1)
    print(samples)

    samples = fn.sampling_rang(
        spn,
        rang=[NominalRange([0]), None,
              NumericRange([[14, 15], [29, 30]])],
        n_samples=10,
        random_seed=1)
    print(samples)

    #Test probabilities
    rang = [None, None, None]
    prob = fn.prob(spn, rang)
    print(prob)

    rang = [NominalRange([0]), NominalRange([1]), NumericRange([[20]])]
    prob = fn.prob(spn, rang)
    print(prob)

    ranges = np.array([[None, None, NumericRange([[0, 20]])],