def interestingness_matrix(spn, value_dict=None, numeric_prec=20):
    """Score every feature of every sub-population of ``spn``.

    For each feature id in ``sorted(spn.scope)`` the leaf distribution at the
    matching position of every sub-population is collected and handed to
    ``_compare_distributions`` together with that feature's ``value_dict``
    entry and ``numeric_prec``.

    :param spn: the SPN whose sub-populations are analysed.
    :param value_dict: per-feature metadata; generated ad hoc via
        ``fn.generate_adhoc_value_dict`` when ``None``.
    :param numeric_prec: forwarded unchanged to ``_compare_distributions``.
    :return: ``(sub_pops, scores)`` where ``sub_pops`` is the result of
        ``fn.get_sub_populations(spn)`` and ``scores`` is the transposed
        array of the per-feature results (presumably one row per
        sub-population and one column per feature -- confirm against
        ``_compare_distributions``).
    """
    if value_dict is None:
        value_dict = fn.generate_adhoc_value_dict(spn)

    sub_pops = fn.get_sub_populations(spn)

    def _scores_for(position, feature_id):
        # Gather the leaf at `position` from each (weight, leaves) pair.
        leaves = np.array([pop_leaves[position] for _, pop_leaves in sub_pops])
        return _compare_distributions(leaves, value_dict[feature_id],
                                      numeric_prec)

    per_feature = [
        _scores_for(position, feature_id)
        for position, feature_id in enumerate(sorted(spn.scope))
    ]
    return sub_pops, np.array(per_feature).T
def topdown_interesting_rules(
        spn,
        value_dict,
        metrics=['sup', 'conf', 'head_sup', 'F', 'cosine_distance'],
        full_value_dict=None,
        beta=1.,
        labeled=True):
    """Extract rules from the interesting leaves of every sub-population.

    For each sub-population of ``spn`` the top 6 leaves returned by
    ``get_interesting_leaves`` are turned into (body, head) rules via
    ``get_leaf_rules``. Rules whose head is not compatible with the body
    (``head_compatible_body``) or whose body/head is empty are dropped. The
    remaining rules are scored with ``rule_stats`` and kept only if their
    ``'F'`` metric exceeds 0.03.

    :param spn: the SPN to mine.
    :param value_dict: passed to ``get_interesting_leaves``, to
        ``head_compatible_body`` (as ``one_hot_vd``) and to
        ``get_labeled_rule``.
    :param metrics: metric names forwarded to ``rule_stats``; they also name
        the metric columns of the result. Must contain ``'F'``. The default
        list is never mutated by this function.
    :param full_value_dict: forwarded to ``head_compatible_body``.
    :param beta: forwarded to ``rule_stats``.
    :param labeled: if true, head and body are converted with
        ``get_labeled_rule``; otherwise the raw rule objects are returned.
    :return: ``pandas.DataFrame`` with columns ``['head', 'body', *metrics]``
        without duplicate (body, head) combinations.
    :raises ValueError: if a rule is scored and ``'F'`` is not in ``metrics``.
    """
    subpops = fn.get_sub_populations(spn)

    candidates = []
    for sub in subpops:
        candidates.extend(get_interesting_leaves(spn, sub, value_dict, top=6))
    # NOTE(review): the original called `sorted(l, key=lambda x: x[2])` here
    # and discarded the result, so the candidates were never reordered. The
    # effective (unsorted) order is kept; decide whether sorting by weight
    # was actually intended, and in which direction.

    # Each candidate is (leaf, diff, weight); only the leaf is needed below.
    compatible = []
    for leaf, _diff, _weight in candidates:
        for r in get_leaf_rules(leaf):
            if head_compatible_body(r[1], r[0], one_hot_vd=value_dict,
                                    full_value_dict=full_value_dict):
                compatible.append(r)

    final_rules = []
    for body, head in compatible:
        if len(body) == 0 or len(head) == 0:
            continue
        stats = rule_stats(spn, body, head, metrics=metrics, beta=beta)
        if stats[metrics.index('F')] > 0.03:
            final_rules.append((head, body, *stats))

    columns = ['head', 'body', *metrics]
    if labeled:
        rows = [(*get_labeled_rule(head, body, value_dict), *stats)
                for head, body, *stats in final_rules]
        rule_df = pd.DataFrame(rows, columns=columns)
        return rule_df.drop_duplicates(['body', 'head'])

    # labeled=False used to fail with an unbound name because the frame was
    # only ever built from the labelled rows. Build it from the raw rules.
    rule_df = pd.DataFrame(final_rules, columns=columns)
    # Raw rule objects may be unhashable (e.g. lists), which
    # `drop_duplicates` cannot handle, so duplicates are detected on their
    # string form instead.
    duplicate_mask = rule_df[['body', 'head']].astype(str).duplicated()
    return rule_df[~duplicate_mask]
def visualized_target_based_expected_sub_populations(spn,
                                                     target_id,
                                                     value_dict=None,
                                                     top=None,
                                                     rang=None,
                                                     numeric_prec=10,
                                                     save_path=None):
    """Plot expected-value profiles of sub-populations per target value.

    For every value index ``v`` of feature ``target_id`` the SPN is
    restricted to ``NominalRange([v])`` with ``fn.marg_rang``. The
    sub-populations of the restricted SPN are each drawn as one jittered
    polyline in the subplot row belonging to ``v``: the polyline connects the
    normalised expectation of each leaf distribution, and its line width is
    proportional to (probability of the target value) * (sub-population
    weight).

    :param spn: the SPN to visualise.
    :param target_id: feature id whose values define the subplot rows.
    :param value_dict: per-feature metadata ``(type, name, values)``; generated
        ad hoc via ``fn.generate_adhoc_value_dict`` when ``None``.
    :param top: forwarded to ``fn.get_sub_populations``.
    :param rang: optional range restriction applied to ``spn`` first.
    :param numeric_prec: number of interpolation points per line segment.
    :param save_path: if given, the figure is saved there instead of shown.
    :raises Exception: if a feature type is neither "discrete" nor "numeric".
    """
    if value_dict is None:
        value_dict = fn.generate_adhoc_value_dict(spn)
    if rang is not None:
        # fn.marg_rang is unpacked as a (probability, spn) pair further down;
        # binding the whole return value to `spn` (as the original did) would
        # break every later `spn.scope` access.
        _, spn = fn.marg_rang(spn, rang)

    n_vals = len(value_dict[target_id][2])
    n_slots = np.max(spn.scope) + 1

    target_probs = []
    all_lines = []
    for v in range(n_vals):
        target_rang = [None] * n_slots
        target_rang[target_id] = NominalRange([v])
        p_target, cond_spn = fn.marg_rang(spn, target_rang)
        target_probs.append(p_target)

        lines = []
        for p_sub, dists in fn.get_sub_populations(cond_spn, sort=True,
                                                   top=top):
            line = []
            for dist in dists:
                f_id = dist.scope[0]
                f_type = value_dict[f_id][0]
                if f_type == "discrete":
                    # A fresh unrestricted range per call, as in the original
                    # (fn.expect might modify it).
                    expect = fn.expect(dist, f_id, [None] * n_slots)
                    # Map the expected value index onto [0, 1].
                    y_val = np.linspace(
                        0, 1, len(value_dict[f_id][2]))[int(expect)]
                elif f_type == "numeric":
                    expect = fn.expect(dist, f_id, [None] * n_slots)
                    mi = value_dict[f_id][2][0]
                    ma = value_dict[f_id][2][1]
                    # Min-max normalisation onto [0, 1].
                    y_val = (expect - mi) / (ma - mi)
                else:
                    raise Exception("Unknown attribute-type: " +
                                    str(value_dict[f_id]))
                line.append(y_val)
            lines.append([p_target * p_sub, line])
        all_lines.append(lines)

    fig, axes = plt.subplots(n_vals, 1, figsize=(16, 6 * n_vals),
                             squeeze=False)
    x_feature_ids = sorted(set(spn.scope) - {target_id})
    steps = np.linspace(0, 1, numeric_prec)

    for row, lines in enumerate(all_lines):
        plot = axes[row][0]
        plot.set_yticklabels([])

        for weight, line in lines:
            x_vals = []
            y_vals = []
            for seg in range(len(line) - 1):
                y_val = line[seg]
                next_y_val = line[seg + 1]
                for r in steps:
                    x_vals.append(seg + r)
                    # Small Gaussian jitter keeps overlapping lines visible.
                    y_vals.append(y_val + (next_y_val - y_val) * r +
                                  np.random.normal() * 0.025)
            plot.plot(x_vals, y_vals, linewidth=weight * 100)

        plot.set_xticks(np.arange(len(x_feature_ids)))
        plot.set_xticklabels(
            [value_dict[scope][1] for scope in x_feature_ids])
        for j, feature_id in enumerate(x_feature_ids):
            f_type = value_dict[feature_id][0]
            if f_type == "discrete":
                for k, y_val in enumerate(
                        np.linspace(0, 1, len(value_dict[feature_id][2]))):
                    plot.text(j, y_val, value_dict[feature_id][2][k])
            elif f_type == "numeric":
                mi = value_dict[feature_id][2][0]
                ma = value_dict[feature_id][2][1]
                for y_val in np.linspace(0, 1, 5):
                    plot.text(j, y_val, round(y_val * (ma - mi) + mi, 4))
            else:
                raise Exception(
                    "Not implemented for other than discrete or numeric")

    # Label every row with "<target name>=<value> <probability>%".
    pad_row = 5
    for k, (ax, p_target) in enumerate(zip(axes[:, 0], target_probs)):
        info = (value_dict[target_id][1] + "=" +
                str(value_dict[target_id][2][k]) + " " +
                str(round(p_target * 100, 4)) + "%\n")
        ax.annotate(info,
                    xy=(0, 0.5),
                    xytext=(-ax.yaxis.labelpad - pad_row, 0),
                    xycoords=ax.yaxis.label,
                    textcoords='offset points',
                    size='large',
                    ha='right',
                    va='center')

    plt.tight_layout()
    fig.subplots_adjust(left=0.15)
    if save_path is None:
        plt.show()
    else:
        plt.savefig(save_path)
def visualize_expected_sub_populations(spn,
                                       value_dict=None,
                                       top=None,
                                       rang=None,
                                       numeric_prec=10,
                                       save_path=None):
    """Plot the expected-value profile of every sub-population of ``spn``.

    Each sub-population is drawn as one jittered polyline in a single plot.
    The polyline connects the normalised expectation of each of its leaf
    distributions, and its line width is proportional to the sub-population's
    weight.

    :param spn: the SPN to visualise.
    :param value_dict: per-feature metadata ``(type, name, values)``; generated
        ad hoc via ``fn.generate_adhoc_value_dict`` when ``None``.
    :param top: forwarded to ``fn.get_sub_populations``.
    :param rang: optional range restriction applied to ``spn`` first.
    :param numeric_prec: number of interpolation points per line segment.
    :param save_path: if given, the figure is saved there instead of shown.
    :raises Exception: if a feature type is neither "discrete" nor "numeric".
    """
    if value_dict is None:
        value_dict = fn.generate_adhoc_value_dict(spn)
    if rang is not None:
        # fn.marg_rang is unpacked as a (probability, spn) pair in
        # visualized_target_based_expected_sub_populations; binding the whole
        # return value to `spn` would break the `spn.scope` accesses below.
        # NOTE(review): confirm against fn.marg_rang.
        _, spn = fn.marg_rang(spn, rang)

    sub_pops = fn.get_sub_populations(spn, sort=True, top=top)
    fig, axes = plt.subplots(1, 1, figsize=(16, 6), squeeze=False)
    n_slots = np.max(spn.scope) + 1

    lines = []
    for prob, dists in sub_pops:
        line = []
        for dist in dists:
            f_id = dist.scope[0]
            f_type = value_dict[f_id][0]
            if f_type == "discrete":
                # A fresh unrestricted range per call, as in the original
                # (fn.expect might modify it).
                expect = fn.expect(dist, f_id, [None] * n_slots)
                # Map the expected value index onto [0, 1].
                y_val = np.linspace(
                    0, 1, len(value_dict[f_id][2]))[int(expect)]
            elif f_type == "numeric":
                expect = fn.expect(dist, f_id, [None] * n_slots)
                mi = value_dict[f_id][2][0]
                ma = value_dict[f_id][2][1]
                # Min-max normalisation onto [0, 1].
                y_val = (expect - mi) / (ma - mi)
            else:
                raise Exception("Unknown attribute-type: " +
                                str(value_dict[f_id]))
            line.append(y_val)
        lines.append([prob, line])

    plot = axes[0][0]
    plot.set_yticklabels([])
    steps = np.linspace(0, 1, numeric_prec)
    for prob, line in lines:
        x_vals = []
        y_vals = []
        for seg in range(len(line) - 1):
            y_val = line[seg]
            next_y_val = line[seg + 1]
            for r in steps:
                x_vals.append(seg + r)
                # Small Gaussian jitter keeps overlapping lines visible.
                y_vals.append(y_val + (next_y_val - y_val) * r +
                              np.random.normal() * 0.025)
        plot.plot(x_vals, y_vals, linewidth=prob * 100)

    # The other functions of this module address the leaves of a
    # sub-population by their position in sorted(spn.scope); label the x axis
    # in that same order so ticks and polyline vertices cannot get out of
    # step when spn.scope itself is unsorted.
    feature_ids = sorted(spn.scope)
    plot.set_xticks(np.arange(len(feature_ids)))
    plot.set_xticklabels([value_dict[scope][1] for scope in feature_ids])
    for j, feature_id in enumerate(feature_ids):
        f_type = value_dict[feature_id][0]
        if f_type == "discrete":
            for k, y_val in enumerate(
                    np.linspace(0, 1, len(value_dict[feature_id][2]))):
                plot.text(j, y_val, value_dict[feature_id][2][k])
        elif f_type == "numeric":
            mi = value_dict[feature_id][2][0]
            ma = value_dict[feature_id][2][1]
            for y_val in np.linspace(0, 1, 5):
                plot.text(j, y_val, round(y_val * (ma - mi) + mi, 4))
        else:
            raise Exception(
                "Not implemented for other than discrete or numeric")

    plt.tight_layout()
    if save_path is None:
        plt.show()
    else:
        plt.savefig(save_path)
def visualize_sub_populations(spn,
                              value_dict=None,
                              top=None,
                              rang=None,
                              numeric_prec=50,
                              save_path=None):
    """Plot every leaf distribution of every sub-population as a grid.

    The figure has one row per sub-population and one column per feature in
    ``spn.scope``. Discrete leaves are drawn as bar plots of
    ``fn.evaluate_discrete_leaf``, numeric leaves as line plots of
    ``fn.evaluate_numeric_density_leaf``. Columns are titled with the feature
    names and rows are labelled with the rounded sub-population weight.

    :param spn: the SPN to visualise.
    :param value_dict: per-feature metadata ``(type, name, values)``; generated
        ad hoc via ``fn.generate_adhoc_value_dict`` when ``None``.
    :param top: forwarded to ``fn.get_sub_populations``.
    :param rang: optional range restriction applied to ``spn`` first.
    :param numeric_prec: number of sample points for numeric densities.
    :param save_path: if given, the figure is saved there instead of shown.
    :raises Exception: if a feature type is neither "discrete" nor "numeric".
    """
    if value_dict is None:
        value_dict = fn.generate_adhoc_value_dict(spn)
    if rang is not None:
        # fn.marg_rang is unpacked as a (probability, spn) pair in
        # visualized_target_based_expected_sub_populations; binding the whole
        # return value to `spn` would break the `spn.scope` accesses below.
        # NOTE(review): confirm against fn.marg_rang.
        _, spn = fn.marg_rang(spn, rang)

    sub_pops = fn.get_sub_populations(spn, sort=True, top=top)

    ncols = len(spn.scope)
    nrows = len(sub_pops)
    fig, axes = plt.subplots(nrows, ncols,
                             figsize=(ncols * 3, nrows * 2),
                             squeeze=False)

    for i, (_, dists) in enumerate(sub_pops):
        for j, dist in enumerate(dists):
            f_id = dist.scope[0]
            f_type = value_dict[f_id][0]
            if f_type == "discrete":
                # (value id, label) pairs ordered by value id.
                val_pairs = sorted(value_dict[f_id][2].items(),
                                   key=lambda x: x[0])
                y_vals = fn.evaluate_discrete_leaf(
                    dist, f_vals=[x[0] for x in val_pairs])
                viz_helper.bar_plot(axes[i][j],
                                    y_vals,
                                    x_tick_labels=[x[1] for x in val_pairs],
                                    y_label="probability",
                                    ylim=[0, 1])
            elif f_type == "numeric":
                x_vals = np.linspace(value_dict[f_id][2][0],
                                     value_dict[f_id][2][1],
                                     num=numeric_prec)
                y_vals = fn.evaluate_numeric_density_leaf(dist, x_vals)
                viz_helper.line_plot(axes[i][j], x_vals, y_vals,
                                     y_label="density")
            else:
                raise Exception("Unknown attribute-type: " +
                                str(value_dict[f_id]))

    # Column titles. value_dict can no longer be None here, so the former
    # "Feature <id>" fallback was unreachable and has been removed.
    pad_col = 5
    feature_names = [value_dict[x][1] for x in sorted(spn.scope)]
    for ax, col in zip(axes[0], feature_names):
        ax.annotate(col,
                    xy=(0.5, 1),
                    xytext=(0, pad_col),
                    xycoords='axes fraction',
                    textcoords='offset points',
                    size='large',
                    ha='center',
                    va='baseline')

    # Row labels: the rounded weight of each sub-population.
    pad_row = 5
    row_labels = [round(weight, 6) for weight, _ in sub_pops]
    for ax, row in zip(axes[:, 0], row_labels):
        ax.annotate(row,
                    xy=(0, 0.5),
                    xytext=(-ax.yaxis.labelpad - pad_row, 0),
                    xycoords=ax.yaxis.label,
                    textcoords='offset points',
                    size='large',
                    ha='right',
                    va='center')

    plt.tight_layout()
    fig.subplots_adjust(left=0.15, top=0.95)
    if save_path is None:
        plt.show()
    else:
        plt.savefig(save_path)
def test_get_subpopulations():
    """Smoke test: print the sub-populations of the example gender SPN."""
    #rang = [NominalRange([0]), NominalRange([1]), None]
    gender_spn = example_spns.get_gender_spn()
    print(fn.get_sub_populations(gender_spn))
# Script fragment: prints feature correlations, builds an SPN for the data set
# and runs the sub-population visualisations on it.
# NOTE(review): `data`, `value_dict`, `num_vars`, `data_types` and
# `dataset_name` are bound outside the visible part of the file, and the
# nesting below was reconstructed from flattened source -- confirm both.
df = pd.DataFrame(data, columns=[value_dict[i][1] for i in range(num_vars)])
print(df.corr())

# parameters for the construction
rdc_threshold = 0.1
min_instances_slice = 0.1

if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
    print("Creating SPN ...")

    # get data
    # df, value_dict, parametric_types = real_data.get_titanic()

    # save=False is passed, so presumably the SPN is not persisted -- verify
    # against spn_handler.create_parametric_spns.
    spn, value_dict, _ = spn_handler.create_parametric_spns(
        data, data_types, dataset_name, [rdc_threshold], [min_instances_slice], value_dict, save=False)

# NOTE(review): with the load below commented out, `spn` is only bound when
# the branch above runs; if an SPN already exists the following lines would
# fail on an undefined `spn`.
# # Load SPN
# spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)

# Print some statistics
fn.print_statistics(spn)

# The third positional argument is `top` (here: 10).
visualize_expected_sub_populations(spn, value_dict, 10)
visualize_sub_populations(spn, value_dict, 10)

subpops = fn.get_sub_populations(spn, )
print(subpops)
print('============')
pprint(subpops)

fn.plot_spn(spn, "icecream_spn.pdf", value_dict)