# Example no. 1 (0)
def test_bayesian_ruleset():
    """Fit a BayesianRuleset over precomputed candidate rules on the toy2 dataset."""
    from common.gen_samples import read_anomaly_dataset

    # Load the toy dataset and coerce labels to integers.
    x, y = read_anomaly_dataset("toy2")
    y = np.asarray(y, dtype=np.int32)

    meta = get_feature_meta_default(x, y)

    # Precomputed candidates; found after 30 feedback iterations with AAD.
    # Each rule corresponds to the anomaly class (label = 1).
    rule_strings = [
        "(F1 > -0.431709) & (F1 <= 2.033541) & (F2 > 3.703597)",
        "(F1 > 4.752354) & (F1 <= 6.210754) & (F2 > 1.581015) & (F2 <= 3.592983)",
        "(F1 > 6.298735) & (F2 > -0.822048) & (F2 <= 3.740281)",
    ]

    rules = convert_strings_to_conjunctive_rules(rule_strings, meta)
    candidate_text = "\n  ".join(str(rule) for rule in rules)
    print("Candidate rules:")
    print("  %s" % candidate_text)

    # Fit the Bayesian ruleset and report which candidates were selected.
    ruleset = BayesianRuleset(
        meta=meta,
        opts=None,
        maxlen=get_max_len_in_rules(rules),
        max_iter=200,
        n_min_support_stop=20,
    )
    ruleset.fit(x, y, rules)

    print("Selected rules:")
    for idx in ruleset.predicted_rules:
        print("  rule %d: %s" % (idx, str(ruleset.rules[idx])))
# Example no. 2 (0)
def test_kl_data_drift_classifier():
    """Stream a dataset window-by-window and log which Random Forest trees
    exceed the alpha-level KL-divergence threshold in each window."""
    logger = logging.getLogger(__name__)

    args = get_command_args(debug=False)
    configure_logger(args)

    # Index 2 of the per-dataset config holds the stream window size.
    cfg = dataset_configs[args.dataset]
    window_size = cfg[2]
    alpha = 0.05
    n_trees = 100

    X_full, y_full = read_anomaly_dataset(args.dataset)
    logger.debug("dataset: %s (%d, %d), stream_window: %d, alpha: %0.3f" %
                 (args.dataset, X_full.shape[0], X_full.shape[1], window_size, alpha))

    stream = DataStream(X_full, y_full, IdServer(initial=0))

    # The first window serves as the training set.
    batch = stream.read_next_from_stream(window_size)
    x, y, ids = batch.x, batch.y, batch.ids
    logger.debug("First window loaded (%s): %d" % (args.dataset, x.shape[0]))

    # Train the classifier on the first window of data.
    clf = RF(n_estimators=n_trees)
    clf.fit(x, y)
    logger.debug("Random Forest classifier created with %d trees" % clf.n_estimators)

    # Wrapper over the classifier which will compute KL-divergences.
    # NOTE: rf.clf is the scikit-learn Random Forest classifier instance
    wrapper = RandomForestAadWrapper(x=x, y=y, clf=clf)
    logger.debug("Wrapper model created with %d nodes" % len(wrapper.w))

    # Baseline KL distribution and alpha-level replacement threshold,
    # computed *without* p; then initialize the reference distribution p.
    ref_kls, kl_q_alpha = wrapper.get_KL_divergence_distribution(x, p=None, alpha=alpha)
    p = wrapper.get_node_sample_distributions(x)

    window = 0
    while not stream.empty():
        window += 1
        # Next window of data to check for drift.
        batch = stream.read_next_from_stream(n=window_size)
        x, y = batch.x, batch.y

        logger.debug("window %d loaded: %d" % (window, x.shape[0]))

        # KL-divergence of the current window's distribution vs reference p.
        comp_kls, _ = wrapper.get_KL_divergence_distribution(x, p=p)

        # Trees whose divergence exceeds the alpha-level threshold.
        exceeding_trees = wrapper.get_trees_to_replace(comp_kls, kl_q_alpha)
        n_threshold = int(2 * alpha * n_trees)

        logger.debug("[%d] #trees_exceeding_kl_q_alpha: %d, threshold number of trees: %d\n%s" %
                     (window, len(exceeding_trees), n_threshold, str(list(exceeding_trees))))
# Example no. 3 (0)
def test_kl_data_drift():
    """Detect concept drift in a streamed dataset with an Isolation Forest.

    Reads the dataset window-by-window, compares each window's per-tree
    KL-divergence against a reference distribution, and replaces drifting
    trees when at least 2*alpha of them exceed the alpha-level threshold.
    If --plot is set, writes a per-window KL summary plot to ./temp/.
    """
    logger = logging.getLogger(__name__)

    args = get_command_args(debug=False,
                            debug_args=[
                                "--debug", "--plot",
                                "--log_file=temp/test_concept_drift.log"
                            ])
    configure_logger(args)

    # Fixed seed for reproducible tree construction / sampling.
    np.random.seed(42)

    # dataset_config[2] holds the stream window size for this dataset.
    dataset_config = dataset_configs[args.dataset]
    stream_window = dataset_config[2]
    alpha = 0.05

    X_full, y_full = read_anomaly_dataset(args.dataset)
    logger.debug(
        "dataset: %s (%d, %d), stream_window: %d, alpha: %0.3f" %
        (args.dataset, X_full.shape[0], X_full.shape[1], stream_window, alpha))

    # First window initializes the isolation-forest model.
    stream = DataStream(X_full, y_full, IdServer(initial=0))
    training_set = stream.read_next_from_stream(stream_window)
    x, y, ids = training_set.x, training_set.y, training_set.ids
    model = get_iforest_model(x)

    # Per-window histories, kept for the final plot.
    all_kl_q_alpha = list()
    all_reference_kls = list()
    all_compare_kls = list()
    trees_replaced = list()

    # compute KL replacement threshold *without* p
    ref_kls, kl_q_alpha = model.get_KL_divergence_distribution(x,
                                                               p=None,
                                                               alpha=alpha)
    # now initialize reference p
    p = model.get_node_sample_distributions(x)

    # Track the largest KL seen so the plot's y-axis can cover all points.
    max_kl = np.max(ref_kls)

    window = 0  # already read the first window
    while True:
        buffer = stream.read_next_from_stream(stream_window)
        if buffer is None:
            break
        window += 1
        x, y, ids = buffer.x, buffer.y, buffer.ids
        # logger.debug("#new: %d" % x.shape[0])

        # Buffer the new samples in the model (used if trees get replaced).
        model.add_samples(X=x)

        all_kl_q_alpha.append(kl_q_alpha)
        all_reference_kls.append(ref_kls)

        # compare KL-divergence of current data dist against reference dist p
        comp_kls, _ = model.get_KL_divergence_distribution(x, p=p)
        all_compare_kls.append(comp_kls)
        max_kl = max(max_kl, np.max(comp_kls))

        # find which trees exceed alpha-level threshold
        replace_trees_by_kl = model.get_trees_to_replace(comp_kls, kl_q_alpha)
        n_trees = model.clf.n_estimators
        n_replace = 0 if replace_trees_by_kl is None else len(
            replace_trees_by_kl)
        n_threshold = int(2 * alpha * n_trees)
        # we will replace if 2*alpha number of trees exceed the alpha-threshold
        do_replace = n_trees > 0 and n_replace >= n_threshold
        logger.debug(
            "window %d: n_replace: %d, threshold num: %d, do_replace: %s" %
            (window, n_replace, n_threshold, str(do_replace)))
        if do_replace:
            if False:  # disabled verbose logging; flip to True when debugging
                logger.debug("window %d: #replace_trees_by_kl: %d\n%s" %
                             (window, len(replace_trees_by_kl),
                              str(list(replace_trees_by_kl))))
            trees_replaced.append(len(replace_trees_by_kl))
            # Rebuild the drifting trees from the buffered stream samples.
            model.update_model_from_stream_buffer(
                replace_trees=replace_trees_by_kl)
            # recompute KL replacement threshold *without* p
            ref_kls, kl_q_alpha = model.get_KL_divergence_distribution(
                x, p=None, alpha=alpha)
            max_kl = max(max_kl, np.max(ref_kls))
            # now recompute reference p
            p = model.get_node_sample_distributions(x)
        else:
            if False:  # disabled verbose logging; flip to True when debugging
                logger.debug(
                    "window %d: model not updated; replace_trees_by_kl: %s" %
                    (window, str(list(replace_trees_by_kl))
                     if replace_trees_by_kl is not None else None))
            trees_replaced.append(0)

    if args.plot:
        # One x-position per window; headroom of +3 leaves room for the
        # "(n)" replaced-tree annotations drawn at mkl + 2.
        xlim = [0, window + 1]
        ylim = [0, max_kl + 3]
        dp = DataPlotter(pdfpath="./temp/test_concept_drift_%s.pdf" %
                         args.dataset,
                         rows=1,
                         cols=1)
        pl = dp.get_next_plot()
        plt.xlim(xlim)
        plt.ylim(ylim)
        plt.xlabel('window')
        plt.ylabel('KL-divergence')
        for i in range(window):
            # Labels are set only on the first window so the legend gets
            # exactly one entry per series.
            ref_label = com_label = threshold_label = replaced_label = None
            ref_kls = all_reference_kls[i]
            com_kls = all_compare_kls[i]
            mkl = max(np.max(ref_kls), np.max(com_kls))
            x_coord = i + 1
            replaced_y_coord = mkl + 2
            if i == 0:
                ref_label = "ref. KL dist"
                com_label = "KL-dist w.r.t ref. dist"
                threshold_label = "%0.2f-alpha KL" % alpha
                replaced_label = "(.) - number of trees replaced"
                # Invisible marker (s=0) exists only to add the annotation
                # explanation to the legend.
                pl.scatter([x_coord], [replaced_y_coord],
                           color="black",
                           marker=".",
                           s=0,
                           label=replaced_label)
            # Reference KL values for this window (one point per tree).
            pl.scatter(np.ones(len(ref_kls), dtype=np.float32) * x_coord,
                       ref_kls,
                       color="orange",
                       marker="*",
                       s=8,
                       label=ref_label)
            # Alpha-level replacement threshold for this window.
            pl.scatter([x_coord], [all_kl_q_alpha[i]],
                       color="red",
                       marker="+",
                       s=30,
                       label=threshold_label)
            # Comparison KLs, offset by 0.1 so they don't overlap the refs.
            pl.scatter(np.ones(len(ref_kls), dtype=np.float32) * x_coord + 0.1,
                       com_kls,
                       color="green",
                       marker="*",
                       s=8,
                       label=com_label)
            # Annotate how many trees were replaced in this window.
            pl.text(x_coord - 0.2,
                    replaced_y_coord,
                    "(%d)" % trees_replaced[i],
                    fontsize=8,
                    label=replaced_label)
        pl.legend(loc='upper left', prop={'size': 6})
        dp.close()