Example #1
def get_closest_indexes(inst, test_set, num=1, dest_set=None):
    """ Returns indexes of the num instances in test_set closest to inst
    (squared euclidean distance); optionally accumulates them in dest_set. """
    n = test_set.shape[0]
    dists = np.zeros(n)
    for i in np.arange(n):
        ts = test_set[i, :]
        if ts.shape[0] > 1:
            # dense matrix
            ts = matrix(ts, nrow=1)
            diff = inst - ts
            dist = np.sum(diff**2)
        else:
            # sparse matrix
            diff = inst - ts
            tmp = diff * diff.T
            if tmp.shape[0] != 1:
                raise ValueError("dot product is %s" % str(tmp.shape))
            dist = tmp[0, 0]
        dists[i] = dist
    ordered = np.argsort(dists)[np.arange(num)]
    if False:
        logger.debug("last ts:\n%s" % str(ts))
        logger.debug("last diff:\n%s" % str(diff))
        logger.debug("ordered indexes: %s" % str(list(ordered)))
        logger.debug("dists: %s" % str(list(dists[ordered])))
        # logger.debug("dists: %s" % str(list(dists)))
        logger.debug("inst:\n%s" % str(inst))
        logger.debug("points:\n%s" % str(test_set[ordered, :]))
        ts = test_set[ordered[1], :]
        ts = matrix(ts, nrow=1)
        logger.debug("dist 2:\n%s" % str(np.sum((inst - ts)**2)))
    if dest_set is not None:
        for indx in ordered:
            dest_set.add(indx)
    return ordered
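
# A hedged usage sketch for get_closest_indexes, assuming dense numpy inputs.
# matrix() below is a minimal stand-in for the repo's R-style reshape helper,
# defined here only so the sketch is self-contained.
import numpy as np

def matrix(a, nrow=None, ncol=None):
    # stand-in (assumption): reshape a into 2-D with nrow rows or ncol columns
    a = np.asarray(a)
    return a.reshape(nrow, -1) if nrow is not None else a.reshape(-1, ncol)

x = np.random.randn(100, 2)
inst = matrix(x[0, :], nrow=1)            # query instance as a 1 x d row
nearest = get_closest_indexes(inst, x, num=5)
print(nearest)  # indexes of the 5 rows of x closest to inst (row 0 first)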
Example #2
    def _transform_to_region_features_with_lookup(self, x, x_new):
        """ Transforms from original feature space to IF node space

        NOTE: This has been deprecated and will be removed in the future.

        Performs the conversion tree-by-tree. Even with batching by trees,
        this requires a lot of intermediate memory; hence this method is no
        longer used.

        :param x:
        :param x_new:
        :return:
        """
        starttime = timer()
        n = x_new.shape[0]
        for i, tree in enumerate(self.clf.estimators_):
            node_regions = self.all_node_regions[i]
            for j in range(n):
                tree_paths = self.get_decision_path(matrix(x[j, :], nrow=1),
                                                    tree)
                k = len(tree_paths[0])
                for node_idx in tree_paths[0]:
                    region_id = node_regions[node_idx]
                    x_new[j, region_id] = \
                        self.get_region_score_for_instance_transform(region_id, k)
                if n >= 100000:  # log progress only for large datasets
                    if j % 20000 == 0:
                        endtime = timer()
                        tdiff = difftime(endtime, starttime, units="secs")
                        logger.debug(
                            "processed %d/%d trees, %d/%d (%f) in %f sec(s)" %
                            (i, len(self.clf.estimators_), j + 1, n,
                             (j + 1) * 1. / n, tdiff))
Example #3
def get_sgd_batch(x, y, i, batch_size, shuffled_idxs=None):
    """ Returns the i-th minibatch of (x, y); if shuffled_idxs is given,
    rows are drawn through that index permutation. """
    s = i * batch_size
    e = min(x.shape[0], (i + 1) * batch_size)
    if shuffled_idxs is None:
        idxs = np.arange(s, e)
    else:
        idxs = shuffled_idxs[np.arange(s, e)]
    return matrix(x[idxs, :], ncol=x.shape[1]), y[idxs]
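
# Usage sketch: one epoch of shuffled minibatches through get_sgd_batch
# (assumes numpy inputs and the repo's matrix() helper, or a stand-in like
# the one sketched under Example #1).
import numpy as np

x = np.random.randn(1000, 10)
y = np.random.randint(0, 2, size=1000)
batch_size = 32
n_batches = int(np.ceil(x.shape[0] * 1.0 / batch_size))
shuffled_idxs = np.random.permutation(x.shape[0])  # reshuffle every epoch
for i in range(n_batches):
    x_batch, y_batch = get_sgd_batch(x, y, i, batch_size, shuffled_idxs)
    # ... perform one SGD step on (x_batch, y_batch) ...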
Example #4
    def move_unlabeled_to_labeled(self, xi, yi):
        # xi indexes the combined labeled+unlabeled set; map it to an
        # offset within the unlabeled buffer
        unlabeled_idx = xi - self.get_num_labeled()

        self.labeled_x = rbind(self.labeled_x,
                               matrix(self.unlabeled_x[unlabeled_idx], nrow=1))
        if self.labeled_y is None:
            self.labeled_y = np.array([yi], dtype=int)
        else:
            self.labeled_y = np.append(self.labeled_y, [yi])
        mask = np.ones(self.unlabeled_x.shape[0], dtype=bool)
        mask[unlabeled_idx] = False
        self.unlabeled_x = self.unlabeled_x[mask]
        self.unlabeled_y = self.unlabeled_y[mask]
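
# The boolean mask above is the usual numpy idiom for dropping one row
# without np.delete. A self-contained sketch of the same move-one-row
# bookkeeping on plain arrays (all names here are hypothetical):
import numpy as np

def move_row(src_x, src_y, dst_x, dst_y, idx, label):
    # append row idx of src to dst with the given label, then drop it from src
    row = src_x[idx].reshape(1, -1)
    dst_x = row if dst_x is None else np.vstack([dst_x, row])
    dst_y = np.array([label], dtype=int) if dst_y is None \
        else np.append(dst_y, [label])
    mask = np.ones(src_x.shape[0], dtype=bool)
    mask[idx] = False
    return src_x[mask], src_y[mask], dst_x, dst_y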
Example #5
def plot_aad_2D(x, y, x_forest, xx, yy, forest, metrics, outputdir, dash_xy,
                dash_wh):
    # use this to plot the AAD feedback

    x_test = np.c_[xx.ravel(), yy.ravel()]
    x_if = forest.transform_to_region_features(x_test, dense=False)

    queried = np.array(metrics.queried)
    for i, q in enumerate(queried):
        pdfpath = "%s/iter_%02d.pdf" % (outputdir, i)
        dp = DataPlotter(pdfpath=pdfpath, rows=1, cols=1)
        pl = dp.get_next_plot()

        w = metrics.all_weights[i, :]
        Z = forest.get_score(x_if, w)
        Z = Z.reshape(xx.shape)

        pl.contourf(xx, yy, Z, 20, cmap=plt.cm.get_cmap('jet'))

        dp.plot_points(x,
                       pl,
                       labels=y,
                       lbl_color_map={
                           0: "grey",
                           1: "red"
                       },
                       s=25)
        # print queried[np.arange(i+1)]
        # print X_train[queried[np.arange(i+1)], :]
        dp.plot_points(matrix(x[queried[np.arange(i + 1)], :], nrow=i + 1),
                       pl,
                       labels=y[queried[np.arange(i + 1)]],
                       defaultcol="red",
                       lbl_color_map={
                           0: "green",
                           1: "red"
                       },
                       edgecolor=None,
                       facecolors=True,
                       marker=matplotlib.markers.MarkerStyle('o',
                                                             fillstyle=None),
                       s=35)

        # plot the sidebar
        anom_scores = forest.get_score(x_forest, w)
        anom_order = np.argsort(-anom_scores)
        anom_idxs = np.where(y[anom_order] == 1)[0]
        dash = 1 - (anom_idxs * 1.0 / x.shape[0])
        plot_sidebar(dash, dash_xy, dash_wh, pl)

        dp.close()
Example #6
    def transform_to_region_features_sparse_bkp(self, x):
        """ Transforms from original feature space to IF node space

        The conversion to sparse vectors seems to take a lot of intermediate
        memory in python. This is why we are converting the vectors in smaller
        batches. The transformation is a one-time task, hence not a concern in
        most cases.

        :param x:
        :return:
        """
        # logger.debug("transforming to IF feature space...")
        n = x.shape[0]
        m = len(self.d)
        batch_size = 10000
        start_batch = 0
        end_batch = min(start_batch + batch_size, n)
        x_new = csr_matrix((0, m), dtype=float)
        while start_batch < end_batch:
            starttime = timer()
            x_tmp = matrix(x[start_batch:end_batch, :], ncol=x.shape[1])
            x_tmp_new = lil_matrix((end_batch - start_batch, m),
                                   dtype=x_new.dtype)
            for i, tree in enumerate(self.clf.estimators_):
                n_tmp = x_tmp.shape[0]
                node_regions = self.all_node_regions[i]
                tree_paths = self.get_decision_path(x_tmp, tree)
                for j in range(n_tmp):
                    k = len(tree_paths[j])
                    for node_idx in tree_paths[j]:
                        region_id = node_regions[node_idx]
                        x_tmp_new[j, region_id] = \
                            self.get_region_score_for_instance_transform(region_id, k)
            if n >= 100000:
                endtime = timer()
                tdiff = difftime(endtime, starttime, units="secs")
                logger.debug("processed %d/%d (%f); batch %d in %f sec(s)" %
                             (end_batch, n,
                              end_batch * 1. / n, batch_size, tdiff))
            x_new = vstack([x_new, x_tmp_new.tocsr()])
            start_batch = end_batch
            end_batch = min(start_batch + batch_size, n)
        return x_new
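
# The batching pattern above -- fill a lil_matrix, convert it to CSR, and
# vstack onto the accumulator -- is broadly useful for building large sparse
# matrices with bounded intermediate memory. A minimal sketch independent of
# the IF-specific helpers; fill_fn is a hypothetical per-row callback.
from scipy.sparse import csr_matrix, lil_matrix, vstack

def build_sparse_in_batches(n, m, fill_fn, batch_size=10000):
    out = csr_matrix((0, m), dtype=float)
    start = 0
    while start < n:
        end = min(start + batch_size, n)
        tmp = lil_matrix((end - start, m), dtype=float)
        for i in range(start, end):
            for col, val in fill_fn(i):  # (column, value) pairs for row i
                tmp[i - start, col] = val
        out = vstack([out, tmp.tocsr()])
        start = end
    return out

mat = build_sparse_in_batches(5, 5, lambda i: [(i, 1.0)], batch_size=2)
print(mat.toarray())  # 5 x 5 identity built in batches of 2 rows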
Example #7
def plot_iforest_baseline_contours_2D(x, y, x_iforest, xx, yy, budget,
                                      if_model, pdfpath_if_contours, dash_xy,
                                      dash_wh):
    # use this to plot baseline query points.

    w = np.ones(len(if_model.d), dtype=float)
    w = w / np.sqrt(w.dot(w))  # unit-norm uniform weights

    baseline_scores = if_model.get_score(x_iforest, w)
    queried = np.argsort(-baseline_scores)

    n_found = np.cumsum(y[queried[np.arange(budget)]])
    print(n_found)

    dp = DataPlotter(pdfpath=pdfpath_if_contours, rows=1, cols=1)
    pl = dp.get_next_plot()

    x_test = np.c_[xx.ravel(), yy.ravel()]
    x_if = if_model.transform_to_region_features(x_test, dense=False)
    y_if = if_model.get_score(x_if, w)
    Z = y_if.reshape(xx.shape)

    pl.contourf(xx, yy, Z, 20)

    dp.plot_points(x, pl, labels=y, lbl_color_map={0: "grey", 1: "red"}, s=25)
    # print queried[np.arange(i+1)]
    # print X_train[queried[np.arange(i+1)], :]
    dp.plot_points(matrix(x[queried[np.arange(budget)], :], nrow=budget),
                   pl,
                   labels=y[queried[np.arange(budget)]],
                   defaultcol="red",
                   lbl_color_map={
                       0: "green",
                       1: "red"
                   },
                   edgecolor="black",
                   marker=matplotlib.markers.MarkerStyle('o', fillstyle=None),
                   s=35)

    # plot the sidebar
    anom_idxs = np.where(y[queried] == 1)[0]
    dash = 1 - (anom_idxs * 1.0 / x.shape[0])
    plot_sidebar(dash, dash_xy, dash_wh, pl)

    dp.close()
Example #8
def plot_forest_contours_2D(x, y, xx, yy, budget, forest, pdfpath_contours,
                            dash_xy, dash_wh):
    # Original detector contours
    baseline_scores = 0.5 - forest.decision_function(x)
    queried = np.argsort(-baseline_scores)
    # logger.debug("baseline scores:%s\n%s" % (str(baseline_scores.shape), str(list(baseline_scores))))

    n_found = np.cumsum(y[queried[np.arange(budget)]])
    print(n_found)

    Z_if = 0.5 - forest.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z_if = Z_if.reshape(xx.shape)

    dp = DataPlotter(pdfpath=pdfpath_contours, rows=1, cols=1)
    pl = dp.get_next_plot()
    pl.contourf(xx, yy, Z_if, 20, cmap=plt.cm.get_cmap('jet'))

    dp.plot_points(x, pl, labels=y, lbl_color_map={0: "grey", 1: "red"})

    dp.plot_points(matrix(x[queried[np.arange(budget)], :], nrow=budget),
                   pl,
                   labels=y[queried[np.arange(budget)]],
                   defaultcol="red",
                   lbl_color_map={
                       0: "green",
                       1: "red"
                   },
                   edgecolor="black",
                   marker=matplotlib.markers.MarkerStyle('o', fillstyle=None),
                   s=35)

    # plot the sidebar
    anom_idxs = np.where(y[queried] == 1)[0]
    dash = 1 - (anom_idxs * 1.0 / x.shape[0])
    plot_sidebar(dash, dash_xy, dash_wh, pl)

    dp.close()
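
# The 0.5 - decision_function(...) flip above converts sklearn's
# IsolationForest output (higher = more normal) into an anomaly score
# (higher = more anomalous). A minimal runnable check, assuming
# scikit-learn is installed:
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)
x = np.vstack([rng.randn(100, 2), rng.randn(5, 2) * 4 + 10])  # inliers + outliers
forest = IsolationForest(random_state=42).fit(x)
scores = 0.5 - forest.decision_function(x)  # higher = more anomalous
queried = np.argsort(-scores)               # most anomalous first
print(queried[:5])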
Example #9
    def get_tau_ranked_instance(self, x, w, tau_rank):
        """ Returns the instance at rank tau_rank when scores are sorted descending """
        s = self.get_score(x, w)
        ps = order(s, decreasing=True)[tau_rank]
        return matrix(x[ps, :], nrow=1)
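
# order(s, decreasing=True) above appears to be the repo's R-style sort
# helper; a plain-numpy equivalent of the same lookup, assuming a 2-D x
# and a 1-D score array s:
import numpy as np

def tau_ranked_instance_np(x, s, tau_rank):
    # instance with the tau_rank-th highest score, as a 1 x d row
    ps = np.argsort(-s)[tau_rank]
    return x[ps, :].reshape(1, -1)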
Example #10
def main():

    if False:
        # DEBUG
        args = prepare_forest_aad_debug_args()
    else:
        # PRODUCTION
        args = get_command_args(debug=False)
    # print "log file: %s" % args.log_file
    configure_logger(args)

    opts = Opts(args)
    # print opts.str_opts()
    logger.debug(opts.str_opts())

    if not opts.streaming:
        raise ValueError("Only streaming supported")

    X_full, y_full = read_data(opts)
    # X_train = X_train[0:10, :]
    # labels = labels[0:10]

    logger.debug("loaded file: (%s) %s" % (str(X_full.shape), opts.datafile))
    logger.debug("results dir: %s" % opts.resultsdir)

    all_num_seen = None
    all_num_seen_baseline = None
    all_window = None
    all_window_baseline = None

    aucs = np.zeros(0, dtype=float)

    opts.fid = 1
    for runidx in opts.get_runidxs():
        tm_run = Timer()
        opts.set_multi_run_options(opts.fid, runidx)

        stream = DataStream(X_full, y_full)
        X_train, y_train = stream.read_next_from_stream(opts.stream_window)

        # logger.debug("X_train:\n%s\nlabels:\n%s" % (str(X_train), str(list(labels))))

        model = prepare_aad_model(X_train, y_train,
                                  opts)  # initial model training
        sad = StreamingAnomalyDetector(stream,
                                       model,
                                       unlabeled_x=X_train,
                                       unlabeled_y=y_train,
                                       max_buffer=opts.stream_window,
                                       opts=opts)
        sad.init_query_state(opts)

        if False:
            # use for DEBUG only
            run_feedback(sad, 0, opts.budget, opts)
            print "This is experimental/demo code for streaming integration and will be application specific." + \
                  " Exiting after reading max %d instances from stream and iterating for %d feedback..." % \
                    (opts.stream_window, opts.budget)
            exit(0)

        all_scores = np.zeros(0)
        all_y = np.zeros(0, dtype=int)

        scores = sad.get_anomaly_scores(X_train)
        # auc = fn_auc(cbind(y_train, -scores))
        all_scores = np.append(all_scores, scores)
        all_y = np.append(all_y, y_train)
        iter = 0
        seen = np.zeros(0, dtype=int)
        seen_baseline = np.zeros(0, dtype=int)
        stream_window_tmp = np.zeros(0, dtype=int)
        stream_window_baseline = np.zeros(0, dtype=int)
        stop_iter = False
        while not stop_iter:
            iter += 1

            tm = Timer()
            seen_, seen_baseline_, queried_, queried_baseline_ = run_feedback(
                sad, opts.min_feedback_per_window,
                opts.max_feedback_per_window, opts)
            seen = append(seen, seen_)
            seen_baseline = append(seen_baseline, seen_baseline_)
            stream_window_tmp = append(stream_window_tmp,
                                       np.ones(len(seen_)) * iter)
            stream_window_baseline = append(
                stream_window_baseline,
                np.ones(len(seen_baseline_)) * iter)
            # queried = append(queried, queried_)
            # queried_baseline = append(queried_baseline, queried_baseline_)
            # logger.debug("seen:\n%s;\nbaseline:\n%s" % (str(list(seen)), str(list(seen_baseline))))

            x_eval, y_eval = sad.get_next_from_stream(sad.max_buffer)
            if x_eval is None or iter >= opts.max_windows:
                if iter >= opts.max_windows:
                    logger.debug("Exceeded %d iters; exiting stream read..." %
                                 opts.max_windows)
                stop_iter = True
            else:
                scores = sad.get_anomaly_scores(
                    x_eval)  # compute scores before updating the model

                all_scores = np.append(all_scores, scores)
                all_y = np.append(all_y, y_eval)

                if opts.allow_stream_update:
                    sad.update_model_from_buffer()

                sad.move_buffer_to_unlabeled()

            logger.debug(
                tm.message(
                    "Stream window [%d]: algo [%d/%d]; baseline [%d/%d]: " %
                    (iter, np.sum(seen), len(seen), np.sum(seen_baseline),
                     len(seen_baseline))))

        auc = fn_auc(cbind(all_y, -all_scores))
        # logger.debug("AUC: %f" % auc)
        aucs = append(aucs, [auc])

        # queried_baseline = order(all_scores, decreasing=True)[0:opts.budget]
        num_seen_tmp = np.cumsum(seen)  # np.cumsum(all_y[queried])
        # logger.debug("\nnum_seen    : %s" % (str(list(num_seen_tmp)),))

        num_seen_baseline = np.cumsum(
            seen_baseline)  # np.cumsum(all_y[queried_baseline])
        # logger.debug("Numseen in %d budget (overall):\n%s" % (opts.budget, str(list(num_seen_baseline))))

        stream_window_baseline = append(
            np.array([opts.fid, opts.runidx],
                     dtype=stream_window_baseline.dtype),
            stream_window_baseline)
        # pad to the same row width as stream_window_baseline, which already
        # includes the two-element [fid, runidx] prefix
        stream_window = np.ones(len(stream_window_baseline),
                                dtype=stream_window_tmp.dtype) * -1
        stream_window[0:2] = [opts.fid, opts.runidx]
        stream_window[2:(2 + len(stream_window_tmp))] = stream_window_tmp

        # queried = append(np.array([opts.fid, opts.runidx], dtype=queried.dtype), queried)
        # queried_baseline = append(np.array([opts.fid, opts.runidx], dtype=queried_baseline.dtype), queried_baseline)

        # num_seen_baseline has the uniformly maximum number of queries.
        # the number of queries in num_seen will vary under the query confidence mode
        num_seen = np.ones(len(num_seen_baseline) + 2,
                           dtype=num_seen_tmp.dtype) * -1
        num_seen[0:2] = [opts.fid, opts.runidx]
        num_seen[2:(2 + len(num_seen_tmp))] = num_seen_tmp

        num_seen_baseline = append(
            np.array([opts.fid, opts.runidx], dtype=num_seen_baseline.dtype),
            num_seen_baseline)

        # all_queried = rbind(all_queried, matrix(queried, nrow=1))
        # all_queried_baseline = rbind(all_queried_baseline, matrix(queried_baseline, nrow=1))

        all_num_seen = rbind(all_num_seen, matrix(num_seen, nrow=1))
        all_num_seen_baseline = rbind(all_num_seen_baseline,
                                      matrix(num_seen_baseline, nrow=1))
        all_window = rbind(all_window, matrix(stream_window, nrow=1))
        all_window_baseline = rbind(all_window_baseline,
                                    matrix(stream_window_baseline, nrow=1))

        logger.debug(tm_run.message("Completed runidx: %d" % runidx))

    results = SequentialResults(
        num_seen=all_num_seen,
        # true_queried_indexes=all_queried,
        num_seen_baseline=all_num_seen_baseline,
        # true_queried_indexes_baseline=all_queried_baseline,
        stream_window=all_window,
        stream_window_baseline=all_window_baseline,
        aucs=aucs)
    write_sequential_results_to_csv(results, opts)
Example #11
def get_gp_predictions_ext(x,
                           y,
                           ranked_indexes,
                           orig_train=None,
                           orig_test=None,
                           queried_indexes=None,
                           test_set=None,
                           n_train=100,
                           n_test=20,
                           n_closest=9):
    s = 0.005  # noise variance.

    n_train = min(n_train, x.shape[0])
    train_indexes_all = SetList(ranked_indexes[np.arange(n_train)])
    if False:
        logger.debug("all train indexes:\n%s" % str(list(train_indexes_all)))

    # if a separate test set has *not* been provided, then it is the
    # leave-one-out case where we compute the variance for each test instance
    # by first training on the other instances and then computing the mean
    # and variance for the left-out test instance.
    leave_one_out = test_set is None

    test_indexes_all = np.array(train_indexes_all)
    if queried_indexes is not None:
        # test instances can only be unlabeled instances
        test_indexes_all = np.array(
            SetList(train_indexes_all) - SetList(queried_indexes))

    L = None
    if leave_one_out:
        pred_indexes = np.arange(n_test)
        test_indexes_all = test_indexes_all[np.arange(n_test)]
        if False:
            logger.debug("all test indexes:%d\n%s" %
                         (n_test, str(list(test_indexes_all))))
        y_pred = None  # np.zeros(len(pred_indexes))
        v_pred = np.ones(len(pred_indexes)) * 0.5
    else:
        # all indexes from test_set that are closest to any unlabeled instance
        closest_indexes = set()
        for i in range(n_test):
            test_index = test_indexes_all[i]
            if orig_train is not None and orig_test is not None:
                get_closest_indexes(matrix(orig_train[test_index, :], nrow=1),
                                    orig_test,
                                    num=n_closest,
                                    dest_set=closest_indexes)
            else:
                get_closest_indexes(x[test_index, :],
                                    test_set,
                                    num=n_closest,
                                    dest_set=closest_indexes)
        pred_indexes = np.array(list(closest_indexes))
        if False: logger.debug("pred indexes:\n%s" % str(list(pred_indexes)))
        y_pred = None  # np.zeros(test_set.shape[0])
        v_pred = np.ones(test_set.shape[0]) * 0.5

    n_pred = len(pred_indexes)
    logger.debug("Leave-one-out: %s, n_pred: %d" %
                 (str(leave_one_out), n_pred))

    tm = Timer()
    for cnt, i in enumerate(pred_indexes):

        # pick one test instance
        if leave_one_out:
            # this is the leave-one-out case
            test_index = test_indexes_all[i]
            x_test = x[test_index, :]
            # exclude the test instance from the training
            train_indexes = np.array(train_indexes_all - SetList([test_index]))
        else:
            x_test = test_set[i, :]
            train_indexes = train_indexes_all

        x_train = x[train_indexes, :]
        # y_train = y[train_indexes] + s*np.random.randn(len(train_indexes))

        if leave_one_out or cnt == 0:
            # the matrix needs to be recomputed in each iteration
            # for the leave-one-out case, else it should be
            # computed only once.
            K = kernel(x_train, x_train)
            L = np.linalg.cholesky(K + s * np.eye(K.shape[0]))

        logger.debug("K:\n%s" % str(K))

        # compute the mean at our test points.
        Lk = np.linalg.solve(L, kernel(x_train, x_test))
        # y_pred[i] = np.dot(Lk.T, np.linalg.solve(L, y_train))

        # compute the variance at our test points.
        K_ = kernel(x_test, x_test)
        if (cnt + 1) % 200 == 0:
            logger.debug("Test Kernel (%d, %d)" % (K_.shape[0], K_.shape[1]))
        s2 = np.diag(K_) - np.sum(Lk**2, axis=0)
        v_pred[i] = np.sqrt(s2)

    tm.end()
    logger.debug(tm.message("Time for GP compuation:"))

    if False:
        if y_pred is not None:
            logger.debug("predicted means:\n%s" % str(list(y_pred)))
        logger.debug("predicted variances:\n%s" % str(list(v_pred)))

    return y_pred, v_pred, train_indexes_all, test_indexes_all[np.arange(
        n_test)]
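
# Both GP helpers implement the standard Cholesky-based predictive
# variance: with K = k(X, X), L = chol(K + s*I) and Lk = solve(L, k(X, x*)),
# the variance at x* is diag(k(x*, x*)) - sum(Lk**2, axis=0). A minimal
# self-contained numpy sketch; the RBF kernel below is an assumption
# standing in for the repo's kernel() helper.
import numpy as np

def rbf_kernel(a, b, length_scale=1.0):
    a, b = np.atleast_2d(a), np.atleast_2d(b)
    sqdist = np.sum(a**2, 1).reshape(-1, 1) + np.sum(b**2, 1) - 2 * a.dot(b.T)
    return np.exp(-0.5 * sqdist / length_scale)

def gp_predictive_stddev(x_train, x_test, s=0.005, length_scale=1.0):
    K = rbf_kernel(x_train, x_train, length_scale)
    L = np.linalg.cholesky(K + s * np.eye(K.shape[0]))
    Lk = np.linalg.solve(L, rbf_kernel(x_train, x_test, length_scale))
    K_ = rbf_kernel(x_test, x_test, length_scale)
    s2 = np.diag(K_) - np.sum(Lk**2, axis=0)
    return np.sqrt(np.maximum(s2, 0.0))  # guard tiny negatives from round-off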
Example #12
def get_gp_predictions(x,
                       y,
                       ordered_indexes,
                       queried_indexes=None,
                       n_train=100,
                       n_test=20,
                       length_scale=20,
                       orig_x=None,
                       eval_set=None,
                       orig_eval_set=None,
                       n_closest=9):
    s = 0.005  # noise variance.

    n_queried = 0 if queried_indexes is None else len(queried_indexes)
    top_ranked_indexes = ordered_indexes[np.arange(
        max(n_train, n_queried) + n_test)]

    train, test = get_gp_train_test(top_ranked_indexes, queried_indexes,
                                    n_train, n_test)

    n_train = len(train)  # this value might be different from input
    n_test = len(test)

    # logger.debug("train indexes:\n%s\ntest indexes:\n%s" % (str(list(train)), str(list(test))))

    y_pred = None  # np.zeros(len(pred_indexes))
    v_pred = np.ones(len(test)) * 0.5

    x_train = x[train, :]
    # y_train = y[train_indexes] + s*np.random.randn(len(train_indexes))

    K = kernel(x_train, x_train, length_scale=length_scale)
    L = np.linalg.cholesky(K + s * np.eye(K.shape[0]))

    # logger.debug("K:\n%s" % str(K))

    tm = Timer()
    for i, idx in enumerate(test):

        x_test = x[idx, :]

        # compute the mean at our test points.
        Lk = np.linalg.solve(
            L, kernel(x_train, x_test, length_scale=length_scale))
        # y_pred[i] = np.dot(Lk.T, np.linalg.solve(L, y_train))

        # compute the variance at our test points.
        K_ = kernel(x_test, x_test, length_scale=length_scale)
        if (i + 1) % 200 == 0:
            logger.debug("Test Kernel (%d, %d)" % (K_.shape[0], K_.shape[1]))
        s2 = np.diag(K_) - np.sum(Lk**2, axis=0)
        v_pred[i] = np.sqrt(s2)

    tm.end()
    logger.debug(tm.message("Time for GP computation on test set:"))

    if False:
        if y_pred is not None:
            logger.debug("predicted means:\n%s" % str(list(y_pred)))
        logger.debug("predicted variances:\n%s" % str(list(v_pred)))

    v_eval = None
    if eval_set is not None:
        tm = Timer()
        # all indexes from eval_set that are closest to any test instance
        closest_indexes = set()
        for i in range(n_test):
            test_index = test[i]
            if orig_x is not None and orig_eval_set is not None:
                get_closest_indexes(matrix(orig_x[test_index, :], nrow=1),
                                    orig_eval_set,
                                    num=n_closest,
                                    dest_set=closest_indexes)
            else:
                get_closest_indexes(x[test_index, :],
                                    eval_set,
                                    num=n_closest,
                                    dest_set=closest_indexes)

        v_eval = np.ones(eval_set.shape[0], dtype=float) * 0.5
        for i, idx in enumerate(closest_indexes):
            x_test = eval_set[idx, :]

            # compute the mean at our test points.
            Lk = np.linalg.solve(
                L, kernel(x_train, x_test, length_scale=length_scale))
            # y_pred[i] = np.dot(Lk.T, np.linalg.solve(L, y_train))

            # compute the variance at our test points.
            K_ = kernel(x_test, x_test, length_scale=length_scale)
            if (i + 1) % 200 == 0:
                logger.debug("Test Kernel (%d, %d)" %
                             (K_.shape[0], K_.shape[1]))
            s2 = np.diag(K_) - np.sum(Lk**2, axis=0)
            v_eval[idx] = np.sqrt(s2)
        logger.debug(tm.message("Time for GP compuation on eval set:"))

    return y_pred, v_pred, train, test, v_eval
Example #13
def plot_aad_gp(x, y, x_forest, xx, yy, forest, metrics, outputdir, dash_xy,
                dash_wh):
    # use this to plot the AAD feedback

    x_test = np.c_[xx.ravel(), yy.ravel()]
    x_test_forest = forest.transform_to_region_features(x_test, dense=False)

    queried = np.array(metrics.queried)
    for i, q in enumerate(queried):
        pdfpath = "%s/gp_iter_%02d.pdf" % (outputdir, i)
        dp = DataPlotter(pdfpath=pdfpath, rows=1, cols=1)
        pl = dp.get_next_plot()

        w = metrics.all_weights[i, :]
        s_train = forest.get_score(x_forest, w)
        ranked_indexes = np.argsort(-s_train)
        # s_test = forest.get_score(x_test_forest, w)

        gp_eval_set = x_test_forest
        gp_score, gp_var, train_indexes, test_indexes, v_eval = \
            get_gp_predictions(x=x_forest, y=s_train,
                               orig_x=x,
                               ordered_indexes=ranked_indexes,
                               queried_indexes=queried,
                               n_train=100, n_test=30, length_scale=40,
                               eval_set=gp_eval_set, orig_eval_set=x_test,
                               n_closest=9)
        logger.debug("gp_var:\n%s\ntest_indexes:\n%s" %
                     (str(list(gp_var)), str(list(test_indexes))))

        if gp_eval_set is not None:
            Z = v_eval.reshape(xx.shape)

            levels = np.linspace(0., 1., 20)
            CS = pl.contourf(xx, yy, Z, levels, cmap=plt.cm.get_cmap('jet'))
            cbar = plt.colorbar(CS)
            cbar.ax.set_ylabel('score variance')

        dp.plot_points(x,
                       pl,
                       labels=y,
                       lbl_color_map={
                           0: "grey",
                           1: "red"
                       },
                       s=25)
        dp.plot_points(x[train_indexes, :],
                       pl,
                       marker='o',
                       defaultcol='blue',
                       s=35,
                       edgecolor='blue',
                       facecolors='none')
        dp.plot_points(x[test_indexes, :],
                       pl,
                       marker='o',
                       defaultcol='magenta',
                       s=60,
                       edgecolor='magenta',
                       facecolors='none')
        # print queried[np.arange(i+1)]
        # print X_train[queried[np.arange(i+1)], :]
        dp.plot_points(matrix(x[queried[np.arange(i + 1)], :], nrow=i + 1),
                       pl,
                       labels=y[queried[np.arange(i + 1)]],
                       defaultcol="red",
                       lbl_color_map={
                           0: "green",
                           1: "red"
                       },
                       edgecolor="black",
                       marker=matplotlib.markers.MarkerStyle('o',
                                                             fillstyle=None),
                       s=35)

        # plot the sidebar
        anom_scores = forest.get_score(x_forest, w)
        anom_order = np.argsort(-anom_scores)
        anom_idxs = np.where(y[anom_order] == 1)[0]
        dash = 1 - (anom_idxs * 1.0 / x.shape[0])
        plot_sidebar(dash, dash_xy, dash_wh, pl)

        dp.close()
Example #14
def plot_aad_score_var(x, y, x_forest, xx, yy, forest, metrics, outputdir,
                       dash_xy, dash_wh):
    # use this to plot the AAD feedback

    x_test = np.c_[xx.ravel(), yy.ravel()]
    x_test_forest = forest.transform_to_region_features(x_test, dense=False)

    queried = np.array(metrics.queried)
    for i, q in enumerate(queried):
        pdfpath = "%s/score_iter_%02d.pdf" % (outputdir, i)
        dp = DataPlotter(pdfpath=pdfpath, rows=1, cols=1)
        pl = dp.get_next_plot()

        w = metrics.all_weights[i, :]
        s_train = forest.get_score(x_forest, w)
        ranked_indexes = np.argsort(-s_train)
        # s_test = forest.get_score(x_test_forest, w)
        test_indexes = metrics.test_indexes[i]
        score_eval_set = x_test_forest
        score_mean, score_var, test_indexes, v_eval, _ = \
            get_score_variances(x=x_forest, w=w,
                                n_test=len(test_indexes) if test_indexes is not None else 10,
                                ordered_indexes=ranked_indexes,
                                queried_indexes=queried,
                                test_indexes=test_indexes,
                                eval_set=score_eval_set,
                                n_closest=9)
        qpos = np.argmax(score_var)
        q = test_indexes[qpos]
        logger.debug("score_var:\n%s\ntest_indexes:\n%s" %
                     (str(list(score_var)), str(list(test_indexes))))
        logger.debug(
            "qpos: %d, query instance: %d, var: %f, queried:%s" %
            (qpos, q, score_var[qpos], str(list(queried[np.arange(i)]))))

        if score_eval_set is not None:
            Z = v_eval.reshape(xx.shape)

            levels = np.linspace(np.min(v_eval), np.max(v_eval), 20)
            CS = pl.contourf(xx, yy, Z, levels, cmap=plt.cm.get_cmap('jet'))
            cbar = plt.colorbar(CS)
            cbar.ax.set_ylabel('score variance')

        dp.plot_points(x,
                       pl,
                       labels=y,
                       lbl_color_map={
                           0: "grey",
                           1: "red"
                       },
                       s=25)
        dp.plot_points(x[test_indexes, :],
                       pl,
                       marker='o',
                       defaultcol='magenta',
                       s=60,
                       edgecolor='magenta',
                       facecolors='none')
        dp.plot_points(matrix(x[queried[np.arange(i + 1)], :], nrow=i + 1),
                       pl,
                       labels=y[queried[np.arange(i + 1)]],
                       defaultcol="red",
                       lbl_color_map={
                           0: "green",
                           1: "red"
                       },
                       edgecolor=None,
                       facecolors=True,
                       marker=matplotlib.markers.MarkerStyle('o',
                                                             fillstyle=None),
                       s=35)

        # plot the sidebar
        anom_scores = forest.get_score(x_forest, w)
        anom_order = np.argsort(-anom_scores)
        anom_idxs = np.where(y[anom_order] == 1)[0]
        dash = 1 - (anom_idxs * 1.0 / x.shape[0])
        plot_sidebar(dash, dash_xy, dash_wh, pl)

        dp.close()