def plotDecisionBoundry(X, y, y_predicted, modelName):
    """Draw the regions predicted for ``X`` on a 2-D t-SNE embedding.

    ``X`` is embedded into two dimensions with ``TSNE``; a 1-nearest-neighbour
    classifier fitted on the embedded points and ``y_predicted`` then labels
    every cell of a ``resolution`` x ``resolution`` grid, which is drawn as a
    filled contour plot.

    NOTE(review): the visible part of this function stops inside the
    ``plt.scatter`` call, so how ``y`` and ``modelName`` are used cannot be
    confirmed from here.
    """

    X_Train_embedded = TSNE(n_components=2).fit_transform(X)

    # create meshgrid
    resolution = 1000  # 1000x1000 background pixels
    X2d_xmin, X2d_xmax = np.min(X_Train_embedded[:, 0]), np.max(
        X_Train_embedded[:, 0])
    X2d_ymin, X2d_ymax = np.min(X_Train_embedded[:, 1]), np.max(
        X_Train_embedded[:, 1])
    xx, yy = np.meshgrid(np.linspace(X2d_xmin, X2d_xmax, resolution),
                         np.linspace(X2d_ymin, X2d_ymax, resolution))

    # approximate Voronoi tessellation on resolution x resolution grid using 1-NN
    background_model = KNeighborsClassifier(n_neighbors=1).fit(
        X_Train_embedded, y_predicted)
    # Label every grid cell with the prediction of its nearest embedded sample.
    voronoiBackground = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
    voronoiBackground = voronoiBackground.reshape((resolution, resolution))

    plt.contourf(xx, yy, voronoiBackground)
    plt.scatter(X_Train_embedded[:, 0],
                X_Train_embedded[:, 1],
 def do(train_data, train_label, test_data, test_label=None, adjust_parameters=True, k=5):
     train_data = np.array(train_data).squeeze()
     train_label = np.array(train_label).squeeze()
     test_data = np.array(test_data).squeeze()
     if test_label is not None:
         test_label = np.array(test_label).squeeze()
     if not adjust_parameters:
         knn = KNeighborsClassifier(n_neighbors=k, n_jobs=8)
         knn.fit(train_data, train_label)
         predicted_label = knn.predict(test_data)
         if test_label is not None:
             acc = accuracy_score(test_label, predicted_label)
             print 'acc is ', acc
         return predicted_label
         max_acc = 0.0
         max_k = 0
         max_predicted = None
         for k in range(1, 11):
             knn = KNeighborsClassifier(n_neighbors=k, n_jobs=8)
             knn.fit(train_data, train_label)
             predicted_label = knn.predict(test_data)
             acc = accuracy_score(test_label, predicted_label)
             if acc > max_acc:
                 max_acc = acc
                 max_k = k
                 max_predicted = predicted_label
             print 'k = ', k, ' acc is ', acc
         print 'max acc is ', max_acc, ' responding to k is ', max_k
         return max_predicted, max_k
    def reduce_data(self, X, y):
        """Drop samples whose mutual nearest neighbour carries another label.

        Stores the retained samples in ``self.X_`` / ``self.y_``, the removed
        fraction in ``self.reduction_``, and returns ``(self.X_, self.y_)``.
        Samples labelled ``self.keep_class`` are always retained when that
        class occurs in ``y``.

        NOTE(review): the ``kneighbors(`` call and the list comprehension
        below are cut off in this copy of the file (missing arguments and
        closing brackets); restore them before relying on this method.
        """
        # Create the classifier lazily and keep its k in sync with the estimator.
        if self.classifier == None:
            self.classifier = KNeighborsClassifier(
                n_neighbors=self.n_neighbors, algorithm='brute')
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes
        self.classifier.fit(X, y)
        nn_idx = self.classifier.kneighbors(X,
        nn_idx = nn_idx.T[1]

        mask = [
            nn_idx[nn_idx[index]] == index and y[index] != y[nn_idx[index]]
            for index in xrange(nn_idx.shape[0])
        mask = ~np.asarray(mask)
        # Samples of the protected class are kept regardless of the test above.
        if self.keep_class != None and self.keep_class in self.classes_:
            mask[y == self.keep_class] = True

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        # Fraction of the input that was removed.
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)

        return self.X_, self.y_
class DCS(object):
    """Neighbourhood helper built on a validation set (presumably the base
    class for dynamic classifier selection strategies — confirm with callers).

    A k-NN model is fitted on ``(Xval, yval)`` at construction time and is
    then used by :meth:`get_neighbors` to look up the validation samples
    closest to a query.
    """

    def select(self, ensemble, x):
        """Choose from ``ensemble`` for the query ``x``.

        The definition in this file had no body, so it is treated as a hook
        for subclasses to override.
        """
        raise NotImplementedError("subclasses must implement select()")

    def __init__(self, Xval, yval, K=5, weighted=False, knn=None):
        """Store the validation data and fit the neighbour model on it.

        Args:
            Xval: validation samples; indexed with neighbour indices later.
            yval: validation targets, aligned with ``Xval``.
            K: number of neighbours used when the default model is built.
            weighted: stored as ``self.weighted`` for use by subclasses.
            knn: optional pre-built neighbour model; it is fitted here.
        """
        self.Xval = Xval
        self.yval = yval
        self.K = K

        # Only build the default model when none was injected; previously the
        # injected value (possibly None) always overwrote the default.
        if knn is None:
            self.knn = KNeighborsClassifier(n_neighbors=K, algorithm='brute')
        else:
            self.knn = knn

        self.knn.fit(Xval, yval)
        self.weighted = weighted

    def get_neighbors(self, x, return_distance=False):
        """Return the validation neighbours of the single query ``x``.

        Args:
            x: one query in the 2-D form accepted by ``knn.kneighbors``; the
                unpacking below requires exactly one query row.
            return_distance: when true, also return the neighbour distances.

        Returns:
            ``(X_nn, y_nn)`` or, with ``return_distance``,
            ``(X_nn, y_nn, dists)``.
        """
        # obtain the K nearest neighbors of test sample in the validation set
        if return_distance:
            [dists], [idx] = self.knn.kneighbors(
                x, return_distance=return_distance)
        else:
            [idx] = self.knn.kneighbors(x, return_distance=return_distance)

        X_nn = self.Xval[idx]  # k neighbors
        y_nn = self.yval[idx]  # k neighbors target

        if return_distance:
            return X_nn, y_nn, dists
        return X_nn, y_nn
class RawModel:
    """k-NN classifier applied to SVD-reduced, flattened examples.

    Each example exposes ``X`` (the raw values), ``WIDTH``, ``HEIGHT`` and,
    for training, a target ``Y``.
    """

    def __init__(self):
        # 2015-05-15 GEL: 20 components gave a good trade-off between speed
        # (substantial improvement), accuracy and memory (25% decrease).
        self.decomposer = TruncatedSVD(n_components=20)

        # 2015-05-15 GEL: 'ball_tree' used less memory on average than
        # 'kd_tree'.
        # 2015-05-15 GEL: accuracy per metric (8000 training examples):
        #   euclidean   0.950025
        #   manhattan   0.933533
        #   chebyshev   0.675662
        #   hamming     0.708646
        #   canberra    0.934033
        #   braycurtis  0.940530
        self.model = KNeighborsClassifier(
            n_neighbors=5, algorithm='ball_tree', metric='euclidean')

    @staticmethod
    def _as_rows(examples):
        """Flatten every example to one row and stack the rows into a matrix."""
        rows = []
        for example in examples:
            width = example.WIDTH * example.HEIGHT
            rows.append(reshape(example.X, (1, width)))
        return vstack(rows)

    def fit(self, trainExamples):
        """Fit the decomposer and the classifier; return ``self``."""
        reduced = self.decomposer.fit_transform(self._as_rows(trainExamples))
        targets = [example.Y for example in trainExamples]

        self.model.fit(reduced, targets)
        return self

    def predict(self, examples):
        """Return the classifier's predictions for ``examples``."""
        reduced = self.decomposer.transform(self._as_rows(examples))
        return self.model.predict(reduced)
    def predict(self, X, n_neighbors=1):
        """Perform classification on an array of test vectors X.

        The predicted class C for each sample in X is returned.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
        n_neighbors : int, optional (default=1)
            Only used to build the classifier when ``self.classifier`` is
            None; an existing classifier keeps its own setting.

        Returns
        -------
        C : array, shape = [n_samples]

        Raises
        ------
        AttributeError
            If the reduced set ``X_`` / ``y_`` has not been computed yet.

        Notes
        -----
        The default prediction uses KNeighborsClassifier. If the instance
        reduction algorithm is to be used with another classifier, this
        method should be explicitly overridden and that explained in the
        documentation.
        """
        # The docstring above was previously left unterminated, which turned
        # the whole method body into string text.
        X = check_array(X)
        if getattr(self, "X_", None) is None:
            raise AttributeError("Model has not been trained yet.")

        if getattr(self, "y_", None) is None:
            raise AttributeError("Model has not been trained yet.")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

        # Always refit on the reduced set so predictions reflect the latest
        # reduce_data() result.
        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict(X)
class DCS(object):
    """Neighbourhood helper built on a validation set (presumably the base
    class for dynamic classifier selection strategies — confirm with callers).

    A k-NN model is fitted on ``(Xval, yval)`` at construction time and is
    then used by :meth:`get_neighbors` to look up the validation samples
    closest to a query.
    """

    def select(self, ensemble, x):
        """Choose from ``ensemble`` for the query ``x``.

        The definition in this file had no body, so it is treated as a hook
        for subclasses to override.
        """
        raise NotImplementedError("subclasses must implement select()")

    def __init__(self, Xval, yval, K=5, weighted=False, knn=None):
        """Store the validation data and fit the neighbour model on it.

        Args:
            Xval: validation samples; indexed with neighbour indices later.
            yval: validation targets, aligned with ``Xval``.
            K: number of neighbours used when the default model is built.
            weighted: stored as ``self.weighted`` for use by subclasses.
            knn: optional pre-built neighbour model; it is fitted here.
        """
        self.Xval = Xval
        self.yval = yval
        self.K = K

        # Only build the default model when none was injected; previously the
        # injected value (possibly None) always overwrote the default.
        if knn is None:
            self.knn = KNeighborsClassifier(n_neighbors=K, algorithm='brute')
        else:
            self.knn = knn

        self.knn.fit(Xval, yval)
        self.weighted = weighted

    def get_neighbors(self, x, return_distance=False):
        """Return the validation neighbours of the single query ``x``.

        Args:
            x: one query in the 2-D form accepted by ``knn.kneighbors``; the
                unpacking below requires exactly one query row.
            return_distance: when true, also return the neighbour distances.

        Returns:
            ``(X_nn, y_nn)`` or, with ``return_distance``,
            ``(X_nn, y_nn, dists)``.
        """
        # obtain the K nearest neighbors of test sample in the validation set
        if return_distance:
            [dists], [idx] = self.knn.kneighbors(
                x, return_distance=return_distance)
        else:
            [idx] = self.knn.kneighbors(x, return_distance=return_distance)

        X_nn = self.Xval[idx]  # k neighbors
        y_nn = self.yval[idx]  # k neighbors target

        if return_distance:
            return X_nn, y_nn, dists
        return X_nn, y_nn
    def reduce_data(self, X, y):
        """Build a reduced training set by adding every misclassified sample.

        One randomly chosen sample per class seeds the prototype set. The
        data is then scanned once in order: a sample that the classifier,
        fitted on the current prototypes, gets wrong is appended and the
        classifier is refitted.

        Stores the prototypes in ``self.X_`` / ``self.y_``, the removed
        fraction in ``self.reduction_``, and returns ``(self.X_, self.y_)``.
        The seed choice uses ``np.random``, so results vary between calls
        unless the global seed is fixed.
        """
        X, y = check_arrays(X, y, sparse_format="csr")

        # The constructor call was cut off in this copy; the arguments follow
        # the other reduce_data implementations in this file.
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(
                n_neighbors=self.n_neighbors)

        prots_s = []
        labels_s = []

        classes = np.unique(y)
        self.classes_ = classes

        # Seed with one random representative of every class.
        for cur_class in classes:
            insts = X[y == cur_class]
            prots_s.append(insts[np.random.randint(0, insts.shape[0])])
            labels_s.append(cur_class)

        self.classifier.fit(prots_s, labels_s)
        for row, label in enumerate(y):
            # Slice instead of index so the query is always 2-D, which newer
            # scikit-learn releases require.
            if self.classifier.predict(X[row:row + 1])[0] != label:
                prots_s.append(X[row])
                labels_s.append(label)
                self.classifier.fit(prots_s, labels_s)

        self.X_ = np.asarray(prots_s)
        self.y_ = np.asarray(labels_s)
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
    def reduce_data(self, X, y):
        """Reduce ``(X, y)`` to one representative per class.

        Every class becomes one ``_Group`` holding all of its samples; the
        reduced set consists of each group's ``rep_x`` and ``label``.

        Stores the groups in ``self.groups``, the reduced set in
        ``self.X_`` / ``self.y_``, the removed fraction in
        ``self.reduction_``, and returns ``(self.X_, self.y_)``.
        """
        X, y = check_X_y(X, y, accept_sparse="csr")

        # Create the classifier lazily and keep its k in sync with the estimator.
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        classes = np.unique(y)
        self.classes_ = classes

        # loading initial groups: one per class, in sorted label order
        self.groups = [_Group(X[y == label], label) for label in classes]

        self.X_ = np.asarray([g.rep_x for g in self.groups])
        self.y_ = np.asarray([g.label for g in self.groups])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
def KNN_method(X, y):
    """Print k-NN validation accuracy for a grid of k values and metrics.

    ``X`` and ``y`` are split into 4 stratified folds. For every fold the
    train/validation indices are printed, then a dict mapping each distance
    metric to the list of accuracies (one per k in ``kvalues``, same order)
    is printed. Nothing is returned.

    ``X`` and ``y`` must support indexing with integer index arrays.
    """
    # No random_state here: without shuffle=True it never influenced the
    # folds, and recent scikit-learn releases raise ValueError for it.
    skf = StratifiedKFold(n_splits=4)

    # how many neighbours to use in the classifier
    kvalues = [1, 3, 5, 7, 9, 11, 13, 15, 19, 24, 30, 40, 50, 60, 70, 90]
    dist = ['manhattan', 'euclidean', 'chebyshev']

    for train_index, test_index in skf.split(X, y):
        print("Train:", train_index, "Validation:", test_index)
        trainX, testX = X[train_index], X[test_index]
        trainY, testY = y[train_index], y[test_index]

        results = {}
        for element in dist:
            accuracy_results = []
            for k in kvalues:
                knn = KNeighborsClassifier(n_neighbors=k, metric=element)
                knn.fit(trainX, trainY)
                predictedY = knn.predict(testX)
                accuracy_results.append(accuracy_score(testY, predictedY))
            results[element] = accuracy_results
        print("Results of model preparation for: " + str(results))

                            'KNN variants',
    def _pruning(self):
        """Remove groups whose members the remaining groups already classify.

        A group is redundant when a 1-NN classifier fitted on the
        representatives of all *other* groups assigns the group's own label
        to every one of its samples. Passes over ``self.groups`` repeat until
        a full pass removes nothing, or only one group is left.

        Returns ``self.groups`` (modified in place).
        """
        if len(self.groups) < 2:
            return self.groups

        knn = KNeighborsClassifier(n_neighbors=1, algorithm='brute')
        pruned = True
        while pruned:
            pruned = False
            index = 0

            while index < len(self.groups):
                group = self.groups[index]

                # Representatives of every group except the one under test.
                mask = np.ones(len(self.groups), dtype=bool)
                mask[index] = False
                reps_x = np.asarray([g.rep_x for g in self.groups])[mask]
                reps_y = np.asarray([g.label for g in self.groups])[mask]
                labels = knn.fit(reps_x, reps_y).predict(group.X)

                if (labels == group.label).all():
                    # Redundant group: drop it and stay on the same index,
                    # which now holds the next group. Without the removal the
                    # loop could never make progress.
                    del self.groups[index]
                    pruned = True
                else:
                    index = index + 1

                # Never prune the last remaining group.
                if len(self.groups) == 1:
                    index = len(self.groups)
                    pruned = False

        return self.groups
    def reduce_data(self, X, y):
        """Keep only the samples that the other samples classify correctly.

        Every sample is held out in turn, the classifier is fitted on all
        remaining samples, and the sample is kept when its label is predicted
        correctly.

        When ``n_neighbors`` is not smaller than the number of samples the
        data is returned unreduced, because the held-out fit would have too
        few samples.

        Stores the result in ``self.X_`` / ``self.y_``, the removed fraction
        in ``self.reduction_``, and returns ``(self.X_, self.y_)``.
        """
        # Create the classifier lazily and keep its k in sync with the estimator.
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes

        # shape[0] rather than len(): len() is undefined for sparse matrices.
        if self.n_neighbors >= X.shape[0]:
            self.X_ = np.array(X)
            self.y_ = np.array(y)
            self.reduction_ = 0.0
            return self.X_, self.y_

        keep = np.zeros(y.size, dtype=bool)
        others = np.ones(y.size, dtype=bool)
        for i in range(y.size):
            others[i] = False  # hold sample i out of the training data
            self.classifier.fit(X[others], y[others])

            # Slice to keep the query 2-D, as newer scikit-learn requires.
            if self.classifier.predict(X[i:i + 1])[0] == y[i]:
                keep[i] = True

            others[i] = True

        self.X_ = np.asarray(X[keep])
        self.y_ = np.asarray(y[keep])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
    def _main_loop(self):
        exit_count = 0
        knn = KNeighborsClassifier(n_neighbors = 1, algorithm='brute')
        while exit_count < len(self.groups):
            index, exit_count = 0, 0
            while index < len(self.groups):

                group = self.groups[index]
                reps_x = np.asarray([g.rep_x for g in self.groups])
                reps_y = np.asarray([g.label for g in self.groups])
                knn.fit(reps_x, reps_y)
                nn_idx = knn.kneighbors(group.X, n_neighbors=1, return_distance=False)
                nn_idx = nn_idx.T[0]
                mask = nn_idx == index
                # if all are correctly classified
                if not (False in mask):
                    exit_count = exit_count + 1
                # if all are misclasified
                elif not (group.label in reps_y[nn_idx]):
                    pca = PCA(n_components=1)
                    # maybe use a 'for' instead of creating array
                    d = pca.transform(reps_x[index])
                    dis = [pca.transform(inst)[0] for inst in group.X]
                    mask_split = (dis < d).flatten()
                    new_X = group.X[mask_split]
                    self.groups.append(_Group(new_X, group.label))
                    group.X = group.X[~mask_split]
                elif (reps_y[nn_idx] == group.label).all() and (nn_idx != index).any():
                    mask_mv = nn_idx != index
                    index_mv = np.asarray(range(len(group)))[mask_mv]
                    X_mv = group.remove_instances(index_mv)
                    G_mv = nn_idx[mask_mv]                        

                    for x, g in zip(X_mv, G_mv):

                elif (reps_y[nn_idx] != group.label).sum()/float(len(group)) > self.r_mis:
                    mask_mv = reps_y[nn_idx] != group.label
                    new_X = group.X[mask_mv]
                    self.groups.append(_Group(new_X, group.label))
                    group.X = group.X[~mask_mv]
                   exit_count = exit_count + 1

                if len(group) == 0:
                    index = index + 1

                for g in self.groups:

        return self.groups                     
def evaluate(Xtra, ytra, Xtst, ytst, k=1, positive_label=1):
    """Score a brute-force k-NN classifier on a held-out set.

    The classifier is fitted on ``(Xtra, ytra)`` with ``k`` neighbours; its
    predictions for ``Xtst`` are passed, together with ``ytst`` and
    ``positive_label``, to ``evaluate_results``, whose result is returned.
    """
    classifier = KNeighborsClassifier(algorithm='brute', n_neighbors=k)
    classifier.fit(Xtra, ytra)
    predictions = classifier.predict(Xtst)
    return evaluate_results(ytst, predictions, positive_label=positive_label)
 def __init__(self, csv_path_train, csv_path_test, k):
     self.csv_path_train = csv_path_train
     self.csv_path_test = csv_path_test
     self.classifier = KNeighborsClassifier(n_neighbors=k,
def knn(X, y, model_path):
    """Fit a default k-NN classifier on ``(X, y)`` and save it.

    The classification report and confusion matrix are printed for the
    model's predictions on the *training* data itself, then the fitted model
    is written to ``model_path`` with ``joblib.dump``.
    """
    classifier = KNeighborsClassifier()
    classifier.fit(X, y)
    fitted = classifier.predict(X)
    for summarize in (metrics.classification_report, metrics.confusion_matrix):
        print(summarize(y, fitted))
    joblib.dump(classifier, model_path)
    def __init__(self, Xval, yval, K=5, weighted=False, knn=None):
        """Store the validation data and fit the neighbour model on it.

        Args:
            Xval: validation samples.
            yval: validation targets, aligned with ``Xval``.
            K: number of neighbours used when the default model is built.
            weighted: stored as ``self.weighted``.
            knn: optional pre-built neighbour model; it is fitted here.
        """
        self.Xval = Xval
        self.yval = yval
        self.K = K

        # Only build the default model when none was injected; previously the
        # injected value (possibly None) always overwrote the default and the
        # fit below failed on None.
        if knn is None:
            self.knn = KNeighborsClassifier(n_neighbors=K, algorithm='brute')
        else:
            self.knn = knn

        self.knn.fit(Xval, yval)
        self.weighted = weighted
def plot_boundaries_decision(X, y, clf, namefile):
    Method to plot the boundaries decision of our data 
    X : A numpy array of the data we want to plot 
    y : A numpy array of the  label corresponding to our data
    clf : the model use to predict the label of our data
    namefile : the name of the file in which we want to save the figure  
    X_train, X_test, y_train, y_test = train_test_split(X,
    #    #The plot of boundary decision in the 2D space of representation of data
    model.fit(X_train, y_train)

    # create meshgrid
    resolution = 100  # 100x100 background pixels
    X2d_xmin, X2d_xmax = np.min(X[:, 0]), np.max(X[:, 0])
    X2d_ymin, X2d_ymax = np.min(X[:, 1]), np.max(X[:, 1])
    xx, yy = np.meshgrid(np.linspace(X2d_xmin, X2d_xmax, resolution),
                         np.linspace(X2d_ymin, X2d_ymax, resolution))

    # approximate Voronoi tesselation on resolution x resolution grid using 1-NN
    background_model = KNeighborsClassifier(n_neighbors=1).fit(X, y)
    voronoiBackground = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
    voronoiBackground = voronoiBackground.reshape((resolution, resolution))

    fig = pyplot.figure()
    fig.set_size_inches(10.5, 8.5)

    ax = fig.add_subplot(211)  #small subplot to show how the legend has moved.
    ax.contourf(xx, yy, voronoiBackground)
        " Boundaries decision in using the dimensionality reduction of Multidimensional scaling"
    ax.scatter(X[:, 0], X[:, 1], c=color[y].tolist())

    label = numpy.array([x for x in ["Apple", "Tomatoes"]])
    # Legend
    for ind, s in enumerate(label):
        ax.scatter([], [], label=s, color=color[ind])

                  bbox_to_anchor=(1.2, .4),
                  loc='center right')

def get_best_k(X, y, max_k=30, keep_best_n=10, weights=None):
    """Rank k-NN settings by accuracy on a random hold-out split.

    Args:
        X: feature table; validated with ``check_pandas_dataframe_nd``.
        y: one-dimensional targets (array, DataFrame or Series); flattened
            before use.
        max_k: upper bound for the number of neighbours. ``None`` means
            ``len(X)``. The bound is capped at the hold-out size and is
            exclusive: k runs from 1 to ``max_k - 1``.
        keep_best_n: number of best settings to return.
        weights: weighting scheme or list of schemes to try; defaults to
            ``['uniform', 'distance']``.

    Returns:
        A list of ``(k, weight, score)`` tuples sorted by descending score,
        at most ``keep_best_n`` long. Ties keep their evaluation order.

    The 67/33 split is drawn without a fixed random state, so results can
    differ between calls.
    """
    # Set default values
    if max_k is None:
        max_k = len(X)

    if weights is None:
        weights = ['uniform', 'distance']
    elif not isinstance(weights, list):
        # A single scheme was passed; wrap it so the loop below can iterate.
        weights = [weights]

    # Check if inputs are valid
    check_pandas_dataframe_nd(X, 'X')

    check_numpy_array_pandas_dataframe_series_1d(y, 'y')

    check_list_of_strings(weights, 'weights')

    check_integer(max_k, 'max_k')
    check_larger(max_k, 'max_k', 1)

    check_integer(keep_best_n, 'keep_best_n')
    check_larger(keep_best_n, 'keep_best_n', 1)

    # Change shape of y if necessary
    y = np.array(y).ravel()

    # Split into train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    # Cap max_k at the size of the hold-out set
    max_k = min(max_k, len(X_test))

    candidates = []
    for k in range(1, max_k):
        for weight in weights:
            model = KNeighborsClassifier(n_neighbors=k,
                                         weights=weight).fit(X_train, y_train)
            candidates.append((k, weight, model.score(X_test, y_test)))

    candidates.sort(key=lambda entry: entry[2], reverse=True)
    return candidates[:keep_best_n]
def __plot_decision_boundaries(X,
                               resolution: int = 100,
    if embedding is None:
        embedding = TSNE(n_components=2, random_state=160290).fit_transform(X)

    x_min, x_max = safe_bounds(embedding[:, 0])
    y_min, y_max = safe_bounds(embedding[:, 1])
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                         np.linspace(y_min, y_max, resolution))

    # approximate Voronoi tesselation on resolution x resolution grid using 1-NN
    background_model = KNeighborsClassifier(n_neighbors=1).fit(
        embedding, y_pred)
    voronoi_bg = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
    voronoi_bg = voronoi_bg.reshape((resolution, resolution))

    mesh = hv.QuadMesh((xx, yy, voronoi_bg)).opts(cmap="viridis")
    points = hv.Scatter(
            "x": embedding[:, 0],
            "y": embedding[:, 1],
            "pred": y_pred,
            "class": y
        kdims=["x", "y"],
        vdims=["pred", "class"],
    errors = y_pred != y
    failed_points = hv.Scatter(
            "x": embedding[errors, 0],
            "y": embedding[errors, 1]
        kdims=["x", "y"]).opts(color="red", size=5, alpha=0.9)

    points = points.opts(color="pred",
    plot = mesh * points * failed_points
    plot = plot.opts(xaxis=None,
                     title="Decision boundaries on TSNE")
    return plot
    def get_result(self):
        """Train four classifiers on a chosen directory and print test results.

        The user picks a training directory and later a test directory via
        ``filedialog``. Labels are hard-coded by position: 1430 training
        items (first 715 labelled 0 = HAM, rest 1 = SPAM) and 600 test items
        (first 300 labelled 0, rest 1). One confusion matrix per classifier
        is printed; the value read from the training directory is returned.
        """
        # file opener
        directory = filedialog.askdirectory()
        result = self.read_emails_from_directory(directory)

        # Items 1-715 are HAM (0), items 716-1430 are SPAM (1).
        train_labels = np.zeros(1430)
        train_labels[715:1430] = 1

        # result[0] is the vocabulary used for the features. If another
        # entry of ``result`` is used here, use the same one for the test
        # matrix below.
        train_matrix = self.extract_features(directory, result[0])

        captions = ("body words:", "subject words:",
                    "body phrases:", "subject phrases:")
        for position, caption in enumerate(captions):
            print(caption, len(result[position]))

        classifiers = [
            MultinomialNB(),
            LinearSVC(),
            RandomForestClassifier(),
            KNeighborsClassifier(),
        ]
        for classifier in classifiers:
            classifier.fit(train_matrix, train_labels)

        test_dir = filedialog.askdirectory()
        # Same vocabulary entry as for the training matrix above.
        test_matrix = self.extract_features(test_dir, result[0])

        # Items 1-300 are HAM (0), items 301-600 are SPAM (1).
        test_labels = np.zeros(600)
        test_labels[300:600] = 1

        predictions = [clf.predict(test_matrix) for clf in classifiers]
        for predicted in predictions:
            print(confusion_matrix(test_labels, predicted))
        return result
def knn_builder():
    pip_knn = Pipeline([("selector",SelectKBest(chi2)),("knn_clf",KNeighborsClassifier())])
    parameters_knn ={'selector__k':[20],
    scorer_knn = make_scorer(accuracy_score)
    searcher_knn = GridSearchCV(pip_knn, parameters_knn, scoring=scorer_knn)
    return searcher_knn
    def reduce_data(self, X, y):
        """Reduce ``(X, y)`` to one representative per class, minority first.

        Every class becomes one ``_Group`` holding all of its samples. The
        groups of the minority class are placed before all others; the
        minority class is ``self.pos_class`` or, when that is None, the least
        frequent label in ``y``.

        Stores the groups in ``self.groups``, the reduced set in
        ``self.X_`` / ``self.y_``, the removed fraction in
        ``self.reduction_``, and returns ``(self.X_, self.y_)``.
        """
        X, y = check_X_y(X, y, accept_sparse="csr")

        # Create the classifier lazily and keep its k in sync with the estimator.
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        classes = np.unique(y)
        self.classes_ = classes

        minority_class = self.pos_class
        if self.pos_class is None:
            minority_class = min(set(y), key=list(y).count)

        # loading initial groups: one per class
        groups = [_Group(X[y == label], label) for label in classes]

        # List comprehensions instead of filter(): on Python 3 filter()
        # returns iterators, which cannot be concatenated with "+".
        min_groups = [g for g in groups if g.label == minority_class]
        max_groups = [g for g in groups if g.label != minority_class]
        self.groups = min_groups + max_groups

        self.X_ = np.asarray([g.rep_x for g in self.groups])
        self.y_ = np.asarray([g.label for g in self.groups])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
    def reduce_data(self, X, y):
        """Keep only the samples that the other samples classify correctly.

        Every sample is held out in turn, the classifier is fitted on all
        remaining samples, and the sample is kept when its label is predicted
        correctly.

        When ``n_neighbors`` is not smaller than the number of samples the
        data is returned unreduced, because the held-out fit would have too
        few samples.

        Stores the result in ``self.X_`` / ``self.y_``, the removed fraction
        in ``self.reduction_``, and returns ``(self.X_, self.y_)``.
        """
        # Create the classifier lazily and keep its k in sync with the estimator.
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes

        # shape[0] rather than len(): len() is undefined for sparse matrices.
        if self.n_neighbors >= X.shape[0]:
            self.X_ = np.array(X)
            self.y_ = np.array(y)
            self.reduction_ = 0.0
            # This return was missing, so execution fell through to the
            # held-out loop and failed with too few training samples.
            return self.X_, self.y_

        keep = np.zeros(y.size, dtype=bool)
        others = np.ones(y.size, dtype=bool)
        for i in range(y.size):
            others[i] = False  # hold sample i out of the training data
            self.classifier.fit(X[others], y[others])

            # Slice to keep the query 2-D, as newer scikit-learn requires.
            if self.classifier.predict(X[i:i + 1])[0] == y[i]:
                keep[i] = True

            others[i] = True

        self.X_ = np.asarray(X[keep])
        self.y_ = np.asarray(y[keep])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
    def __init__(self, estimator=KNeighborsClassifier(n_neighbors=10), dimensionality_reduction=PCA(n_components=2), acceptance_threshold=0.03, n_decision_boundary_keypoints=60, n_connecting_keypoints=None, n_interpolated_keypoints=None, n_generated_testpoints_per_keypoint=15, linear_iteration_budget=100, hypersphere_iteration_budget=300, verbose=True):
        """Validate and store the plotting configuration.

        Args:
            estimator: classifier stored as ``self.classifier``.
            dimensionality_reduction: reducer stored on the instance.
            acceptance_threshold: must be nonzero.
            n_decision_boundary_keypoints: total number of keypoints.
            n_connecting_keypoints: defaults to one third of the total.
            n_interpolated_keypoints: defaults to two thirds of the total.
            n_generated_testpoints_per_keypoint: stored on the instance.
            linear_iteration_budget: must be at least 2.
            hypersphere_iteration_budget: must be at least 2.
            verbose: stored on the instance.

        Raises:
            Warning: if ``acceptance_threshold`` is 0 (raised, not emitted).
            Exception: if an iteration budget is below 2, or if both
                keypoint counts are given and do not sum to
                ``n_decision_boundary_keypoints``.

        NOTE(review): the default ``estimator`` and
        ``dimensionality_reduction`` objects are created once at definition
        time and are shared by every instance that relies on the defaults.
        """
        if acceptance_threshold == 0:
            raise Warning(
                "A nonzero acceptance threshold is strongly recommended so the optimizer can finish in finite time")
        if linear_iteration_budget < 2 or hypersphere_iteration_budget < 2:
            raise Exception("Invalid iteration budget")

        self.classifier = estimator
        self.dimensionality_reduction = dimensionality_reduction
        self.acceptance_threshold = acceptance_threshold

        if n_decision_boundary_keypoints and n_connecting_keypoints and n_interpolated_keypoints and n_connecting_keypoints + n_interpolated_keypoints != n_decision_boundary_keypoints:
            raise Exception(
                "n_connecting_keypoints and n_interpolated_keypoints must sum to n_decision_boundary_keypoints (set them to None to use calculated suggestions)")

        # Floor division keeps the suggested counts integral on Python 3 as
        # well; plain "/" would turn them into floats there.
        if n_connecting_keypoints is not None:
            self.n_connecting_keypoints = n_connecting_keypoints
        else:
            self.n_connecting_keypoints = n_decision_boundary_keypoints // 3
        if n_interpolated_keypoints is not None:
            self.n_interpolated_keypoints = n_interpolated_keypoints
        else:
            self.n_interpolated_keypoints = n_decision_boundary_keypoints * 2 // 3

        self.linear_iteration_budget = linear_iteration_budget
        self.n_generated_testpoints_per_keypoint = n_generated_testpoints_per_keypoint
        self.hypersphere_iteration_budget = hypersphere_iteration_budget
        self.verbose = verbose

        # Result containers, empty until filled by later processing.
        self.decision_boundary_points = []
        self.decision_boundary_points_2d = []
        self.X_testpoints = []
        self.y_testpoints = []
        self.background = []
        self.steps = 3

        self.hypersphere_max_retry_budget = 20
        self.penalties_enabled = True
        self.random_gap_selection = False
 def fit(self, X, y=None):
     """Create a fresh ``SKLModel`` from the stored hyperparameters and fit it.

     The new model always replaces ``self._sklearn_model``; it is fitted only
     when ``y`` is given. Returns ``self``.
     """
     model = SKLModel(**self._hyperparams)
     self._sklearn_model = model
     if y is None:
         return self
     model.fit(X, y)
     return self
    def reduce_data(self, X, y):
        """Build a reduced training set by adding every misclassified sample.

        One randomly chosen sample per class seeds the prototype set. The
        data is then scanned once in order: a sample that the classifier,
        fitted on the current prototypes, gets wrong is appended and the
        classifier is refitted.

        Stores the prototypes in ``self.X_`` / ``self.y_``, the removed
        fraction in ``self.reduction_``, and returns ``(self.X_, self.y_)``.
        The seed choice uses ``np.random``, so results vary between calls
        unless the global seed is fixed.
        """
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)

        prots_s = []
        labels_s = []

        classes = np.unique(y)
        self.classes_ = classes

        # Seed with one random representative of every class.
        for cur_class in classes:
            insts = X[y == cur_class]
            prots_s.append(insts[np.random.randint(0, insts.shape[0])])
            labels_s.append(cur_class)

        self.classifier.fit(prots_s, labels_s)
        for row, label in enumerate(y):
            # Slice instead of index so the query is always 2-D, which newer
            # scikit-learn releases require.
            if self.classifier.predict(X[row:row + 1])[0] != label:
                prots_s.append(X[row])
                labels_s.append(label)
                self.classifier.fit(prots_s, labels_s)

        self.X_ = np.asarray(prots_s)
        self.y_ = np.asarray(labels_s)
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
    def predict(self, X, n_neighbors=1):
        """Perform classification on an array of test vectors X.

        The predicted class C for each sample in X is returned.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
        n_neighbors : int, optional (default=1)
            Only used to build the classifier when ``self.classifier`` is
            None; an existing classifier keeps its own setting.

        Returns
        -------
        C : array, shape = [n_samples]

        Raises
        ------
        AttributeError
            If the reduced set ``X_`` / ``y_`` has not been computed yet.

        Notes
        -----
        The default prediction uses KNeighborsClassifier. If the instance
        reduction algorithm is to be used with another classifier, this
        method should be explicitly overridden and that explained in the
        documentation.
        """
        # The docstring above was previously left unterminated, which turned
        # the whole method body into string text.
        X = atleast2d_or_csr(X)
        if getattr(self, "X_", None) is None:
            raise AttributeError("Model has not been trained yet.")

        if getattr(self, "y_", None) is None:
            raise AttributeError("Model has not been trained yet.")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

        # Always refit on the reduced set so predictions reflect the latest
        # reduce_data() result.
        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict(X)
def get_gating(dss, tsf_name, use_gating=UseGating.TREE, *args, **kwargs):
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.neighbors.classification import KNeighborsClassifier

    component_scale = [1, 0.2]
    # TODO this is specific to coordinate transform to slice just the body frame reaction force
    # input_slice = slice(3, None)
    input_slice = None

    if use_gating is UseGating.MLP:
        gating = gating_function.MLPSelector(dss, *args, **kwargs, name=tsf_name, input_slice=input_slice)
    elif use_gating is UseGating.KDE:
        gating = gating_function.KDESelector(dss, component_scale=component_scale, input_slice=input_slice)
    elif use_gating is UseGating.GMM:
        opts = {'n_components': 10, }
        if kwargs is not None:
        gating = gating_function.GMMSelector(dss, gmm_opts=opts, variational=True, component_scale=component_scale,
    elif use_gating is UseGating.TREE:
        input_slice=input_slice)
    elif use_gating is UseGating.FORCE:
        gating = gating_function.ReactionForceHeuristicSelector(12, slice(3, None))
    elif use_gating is UseGating.MLP_SKLEARN:
        gating = gating_function.SklearnClassifierSelector(dss, MLPClassifier(**kwargs), input_slice=input_slice)
    elif use_gating is UseGating.KNN:
        input_slice=input_slice)
    else:
        raise RuntimeError("Unrecognized selector option")
        raise RuntimeError("Unrecognized selector option")
    return gating
    def index_nearest_neighbor(self, S, X, y):
        """For each sample, return the index of its nearest neighbour in S.

        Args:
            S: selection flags, one per sample; truthy entries form the
                reference subset.
            X: samples, indexable with a boolean mask.
            y: labels aligned with ``X``.

        Returns:
            A list with one entry per sample: the index into ``X`` of the
            selected sample nearest to it. A selected sample is part of the
            reference subset itself, so it will normally map to its own
            index (or to an exact duplicate).
        """
        if len(y) == 0:
            return []

        S_mask = np.array(S, dtype=bool, copy=True)
        # Positions in X of the selected samples, in subset order.
        real_indexes = np.arange(len(y))[S_mask]

        # The subset never changes, so fit once and query all rows in one
        # call instead of refitting for every sample.
        classifier = KNeighborsClassifier(n_neighbors=1)
        classifier.fit(X[S_mask], y[S_mask])
        nearest = classifier.kneighbors(X, return_distance=False)[:, 0]

        return list(real_indexes[nearest])
    def index_nearest_neighbor(self, S, X, y):
        """For each sample, return the index of its nearest neighbour in S.

        Args:
            S: selection flags, one per sample; truthy entries form the
                reference subset.
            X: samples, indexable with a boolean mask.
            y: labels aligned with ``X``.

        Returns:
            A list with one entry per sample: the index into ``X`` of the
            selected sample nearest to it. A selected sample is part of the
            reference subset itself, so it will normally map to its own
            index (or to an exact duplicate).
        """
        if len(y) == 0:
            return []

        S_mask = np.array(S, dtype=bool, copy=True)
        # Positions in X of the selected samples, in subset order.
        real_indexes = np.arange(len(y))[S_mask]

        # The subset never changes, so fit once and query all rows in one
        # call instead of refitting for every sample.
        classifier = KNeighborsClassifier(n_neighbors=1)
        classifier.fit(X[S_mask], y[S_mask])
        nearest = classifier.kneighbors(X, return_distance=False)[:, 0]

        return list(real_indexes[nearest])
    def __init__(self,
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.max_loop = max_loop
        self.threshold = threshold
        self.chromosomes_count = chromosomes_count

        self.evaluations = None
        self.chromosomes = None

        self.best_chromosome_ac = -1
        self.best_chromosome_rd = -1

        self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
 def __init__(self, n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None):
     """Record the hyperparameters and build the wrapped model from them.

     The arguments are stored by name in ``self._hyperparams`` and passed
     unchanged to ``Op`` to create ``self._wrapped_model``.
     """
     self._hyperparams = dict(
         n_neighbors=n_neighbors,
         weights=weights,
         algorithm=algorithm,
         leaf_size=leaf_size,
         p=p,
         metric=metric,
         metric_params=metric_params,
         n_jobs=n_jobs,
     )
     self._wrapped_model = Op(**self._hyperparams)
    def __init__(self, Xval, yval, K=5, weighted=False, knn=None):
        """Store the validation data and fit the neighbour model on it.

        Args:
            Xval: validation samples.
            yval: validation targets, aligned with ``Xval``.
            K: number of neighbours used when the default model is built.
            weighted: stored as ``self.weighted``.
            knn: optional pre-built neighbour model; it is fitted here.
        """
        self.Xval = Xval
        self.yval = yval
        self.K = K

        # Only build the default model when none was injected; previously the
        # injected value (possibly None) always overwrote the default and the
        # fit below failed on None.
        if knn is None:
            self.knn = KNeighborsClassifier(n_neighbors=K, algorithm='brute')
        else:
            self.knn = knn

        self.knn.fit(Xval, yval)
        self.weighted = weighted
def run_knn_multi_level_classifier(train, train_labels):
    """Cross-validate k-NN classifiers for k = 2, 3, 4 and collect one score per k.

    NOTE(review): the ``cross_val_score`` call was truncated in this copy.
    It is completed here with the training data and library-default
    cross-validation settings, and the mean fold score is stored per k --
    confirm the intended ``cv`` / ``scoring`` arguments.

    Parameters
    ----------
    train : array-like
        Training samples.
    train_labels : array-like
        Labels for ``train``.

    Returns
    -------
    list
        One mean cross-validation score per k, in increasing order of k.
    """
    k_scores = []
    for k in range(2, 5):
        knn = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knn, train, train_labels)
        k_scores.append(scores.mean())
    return k_scores
    def setclassifier(self, estimator=None):
        """Assign classifier for which decision boundary should be plotted.

        Parameters
        ----------
        estimator : BaseEstimator instance, optional (default=KNeighborsClassifier(n_neighbors=10)).
            Classifier for which the decision boundary should be plotted. Must have
            probability estimates enabled (i.e. estimator.predict_proba must work).
            Make sure it is possible for probability estimates to get close to 0.5
            (more specifically, as close as specified by acceptance_threshold).
        """
        # Create the default per call: an estimator built in the signature is
        # constructed once at definition time and shared by every caller.
        if estimator is None:
            estimator = KNeighborsClassifier(n_neighbors=10)
        self.classifier = estimator
def compute_cnn(X, y):
  """Condensed nearest neighbor.

  The CNN removes redundant instances, maintaining the samples in the
  decision boundaries. The condensed set starts with one random instance per
  class; every sample the current set misclassifies is then added to it.
  The achieved reduction ratio is printed.

  X : array-like indexable by a boolean mask, one sample per row.
  y : numpy array of labels for X.
  """

  classifier = KNeighborsClassifier(n_neighbors=3)
  # NOTE(review): the classifier votes with 3 neighbours but is first fitted
  # on one prototype per class; with fewer than three classes the first
  # predict call presumably fails -- confirm the intended k.

  prots_s = []
  labels_s = []

  # Seed the condensed set with one randomly chosen instance of each class.
  for cur_class in np.unique(y):
    insts = X[y == cur_class]
    prots_s.append(insts[np.random.randint(0, insts.shape[0])])
    labels_s.append(cur_class)
  classifier.fit(prots_s, labels_s)

  for sample, label in zip(X, y):
    # predict expects a 2-D batch, so wrap the single sample as one row.
    if classifier.predict(np.atleast_2d(sample))[0] != label:
      prots_s.append(sample)
      labels_s.append(label)
      classifier.fit(prots_s, labels_s)

  # Convert before dividing so the ratio is not truncated to an integer.
  reduction_ = 1.0 - float(len(labels_s)) / len(y)
  print(reduction_)
def build_and_test_model(classifier, X, Y, Z, param):
    """Evaluate one classifier type with leave-one-out cross-validation.

    Parameters
    ----------
    classifier : str
        One of "KNN", "RF", "SVM", "NAIVE" or "RANDOM".
    X : array indexable by index arrays
        Samples.
    Y : array indexable by index arrays
        Labels used for training and for the accuracy score.
    Z : array indexable by index arrays
        Reference labels the predictions are compared with for the
        adjusted Rand index.
    param : int
        ``n_neighbors`` for "KNN", ``n_estimators`` for "RF"; unused by the
        other classifier types.

    Returns
    -------
    tuple
        ``(mean accuracy, std accuracy, mean ARI, std ARI)`` over all folds.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported names.
    """
    accuracies = []
    ari = []

    for train, test in LeaveOneOut().split(X):

        X_train, Y_train = X[train], Y[train]
        X_test, Y_test, Z_test = X[test], Y[test], Z[test]

        if classifier == "KNN":
            neigh = KNeighborsClassifier(n_neighbors=param).fit(
                X_train, Y_train)
            predicted = neigh.predict(X_test)

        elif classifier == "RF":
            clf = RandomForestClassifier(n_estimators=param,
                                         random_state=0)  # ,max_depth=2,
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test)

        elif classifier == "SVM":
            clf = svm.SVC(gamma='scale')
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test).astype(int)

        elif classifier == "NAIVE":
            clf = GaussianNB()
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test).astype(int)

        elif classifier == "RANDOM":
            # Baseline: draw each prediction uniformly from the training labels.
            options = list(set(Y_train))
            predicted = [random.choice(options) for _ in range(len(Y_test))]

        else:
            # Fail with a clear message instead of passing None to the metrics.
            raise ValueError("Unknown classifier: %r" % (classifier,))

        accuracies.append(metrics.accuracy_score(Y_test, predicted))
        ari.append(metrics.adjusted_rand_score(Z_test, predicted))

    return np.mean(accuracies), np.std(accuracies), np.mean(ari), np.std(ari)
class PatchedRawModel:
    """Base model whose 4/9 and 3/5 predictions are re-checked by k-NN specialists.

    ``fit`` trains only the two specialists (it does not call ``fit`` on the
    base model); ``predict`` asks the base model first and lets the matching
    specialist overrule a prediction of 4, 9, 3 or 5.
    """

    def __init__(self):
        self.baseModel = RawModel()
        self.model49 = KNeighborsClassifier(n_neighbors=10)
        self.model35 = KNeighborsClassifier(n_neighbors=10)

    @staticmethod
    def _flatten(example):
        """Return the example's pixels as one row of WIDTH * HEIGHT values."""
        return reshape(example.X, (1, example.WIDTH * example.HEIGHT))

    def _fit_specialist(self, model, digits, trainExamples):
        """Fit ``model`` on the training examples whose label is in ``digits``."""
        subset = [x for x in trainExamples if x.Y in digits]
        model.fit(vstack([self._flatten(x) for x in subset]),
                  [x.Y for x in subset])

    def fit(self, trainExamples):
        """Train the 4-vs-9 and the 3-vs-5 specialist on their own examples."""
        self._fit_specialist(self.model49, (4, 9), trainExamples)
        self._fit_specialist(self.model35, (3, 5), trainExamples)

    def predict(self, examples):
        """Return the base predictions, corrected by the specialists.

        The returned object is the one produced by the base model, updated
        in place wherever a specialist disagrees.
        """
        basePredictions = self.baseModel.predict(examples)
        specialists = ((self.model49, (4, 9)), (self.model35, (3, 5)))

        for i, (x, y) in enumerate(zip(examples, basePredictions)):
            for model, digits in specialists:
                if y in digits:
                    # predict returns a one-element batch; store the scalar
                    # label rather than the wrapping array.
                    specializedPrediction = model.predict(self._flatten(x))[0]
                    if specializedPrediction != y:
                        basePredictions[i] = specializedPrediction
                    break

        return basePredictions
def compute_enn(X, y):
  """Edited nearest neighbors.

  The edited nearest neighbors removes the instances in the boundaries,
  maintaining redundant samples: a sample is kept only when a 3-NN
  classifier fitted on the whole data set predicts its own label.
  The achieved reduction ratio is printed.

  X : array-like, one sample per row.
  y : numpy array of labels for X.
  """

  classifier = KNeighborsClassifier(n_neighbors=3)
  classifier.fit(X, y)

  # The classifier is fitted once on all data, so every sample can be
  # checked in one batched prediction instead of one call per sample.
  mask = classifier.predict(X) == y

  y_ = np.asarray(y[mask])
  reduction_ = 1.0 - float(len(y_)) / len(y)
  print(reduction_)
    def __init__(self, n_neighbors=1, alpha=0.6, max_loop=1000, threshold=0, chromosomes_count=10):
        """Keep the search settings and create the k-NN classifier."""
        self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

        # Settings supplied by the caller.
        self.n_neighbors, self.alpha = n_neighbors, alpha
        self.max_loop, self.threshold = max_loop, threshold
        self.chromosomes_count = chromosomes_count

        # No population yet.
        self.chromosomes = self.evaluations = None

        # -1 stands for "not measured yet".
        self.best_chromosome_ac = self.best_chromosome_rd = -1
    def __init__(self):
        """Create the TruncatedSVD decomposer and the k-NN model."""
        # 2015-05-15 GEL n_components=20 was found to give a nice balance of
        # speed (substantial improvement), accuracy, and reduced memory
        # usage (25% decrease).
        n_components = 20
        self.decomposer = TruncatedSVD(n_components=n_components)

        # 2015-05-15 GEL algorithm='ball_tree' uses less memory on average
        # than algorithm='kd_tree'.
        # 2015-05-15 GEL Accuracy per metric (based on 8000 training examples):
        #   euclidean        0.950025
        #   manhattan        0.933533
        #   chebyshev        0.675662
        #   hamming          0.708646
        #   canberra         0.934033
        #   braycurtis       0.940530
        knn_settings = {
            'n_neighbors': 5,
            'algorithm': 'ball_tree',
            'metric': 'euclidean',
        }
        self.model = KNeighborsClassifier(**knn_settings)
    def reduce_data(self, X, y):
        """Remove the samples that form Tomek links and return what is left.

        Two samples form a Tomek link when each is the other's nearest
        neighbour and their labels differ. Samples whose label equals
        ``self.keep_class`` are never removed.

        Parameters
        ----------
        X : array-like, one sample per row
        y : array-like of labels for ``X``

        Returns
        -------
        tuple
            ``(self.X_, self.y_)``, the retained samples and their labels.
            ``self.classes_`` and ``self.reduction_`` are set as well.
        """
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors, algorithm='brute')
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        # NOTE(review): other reducers in this file validate with check_X_y;
        # confirm check_arrays is still available in the installed scikit-learn.
        X, y = check_arrays(X, y, sparse_format="csr")

        self.classes_ = np.unique(y)
        self.classifier.fit(X, y)
        # Column 0 of the 2-NN query is taken to be the sample itself, so
        # column 1 is its nearest other sample.
        nn_idx = self.classifier.kneighbors(X, n_neighbors=2, return_distance=False)[:, 1]

        # A link needs mutual nearest neighbours carrying different labels.
        mutual = nn_idx[nn_idx] == np.arange(nn_idx.shape[0])
        differs = y != y[nn_idx]
        mask = ~(mutual & differs)
        if self.keep_class is not None and self.keep_class in self.classes_:
            mask[y == self.keep_class] = True

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)

        return self.X_, self.y_
 def __init__(self):
     """Create the base model and the two 10-neighbour specialist models."""
     specialist_k = 10
     # One specialist per confusable digit pair (4/9 and 3/5).
     self.model49 = KNeighborsClassifier(n_neighbors=specialist_k)
     self.model35 = KNeighborsClassifier(n_neighbors=specialist_k)
     self.baseModel = RawModel()
class ENN(InstanceReductionMixin):

    """Edited Nearest Neighbors.

    The Edited Nearest Neighbors removes the instances in the
    boundaries, maintaining redundant samples.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.enn import ENN
    >>> import numpy as np
    >>> X = np.array([[-1, 0], [-0.8, 1], [-0.8, -1], [-0.5, 0] , [0.5, 0], [1, 0], [0.8, 1], [0.8, -1]])
    >>> y = np.array([1, 1, 1, 2, 1, 2, 2, 2])
    >>> editednn = ENN()
    >>> editednn.fit(X, y)
    >>> print(editednn.predict([[-0.6, 0.6]]))
    >>> print(editednn.reduction_)

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Ruiqin Chang, Zheng Pei, and Chao Zhang. A modified editing k-nearest
    neighbor rule. JCP, 6(7):1493–1500, 2011.
    """

    def __init__(self, n_neighbors=3):
        self.n_neighbors = n_neighbors
        self.classifier = None

    def reduce_data(self, X, y):
        """Keep the samples that a leave-one-out k-NN vote classifies correctly.

        Parameters
        ----------
        X : array-like, one sample per row
        y : array-like of labels for ``X``

        Returns
        -------
        tuple
            ``(self.X_, self.y_)``, the retained samples and their labels.
            ``self.classes_`` and ``self.reduction_`` are set as well.
        """
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        # NOTE(review): other reducers in this file validate with check_X_y;
        # confirm check_arrays is still available in the installed scikit-learn.
        X, y = check_arrays(X, y, sparse_format="csr")

        self.classes_ = np.unique(y)

        if self.n_neighbors >= len(X):
            # Leaving one sample out would leave fewer than n_neighbors
            # samples to vote, so nothing can be edited: keep everything.
            self.X_ = np.array(X)
            self.y_ = np.array(y)
            self.reduction_ = 0.0
            return self.X_, self.y_

        n_samples = y.size
        mask = np.zeros(n_samples, dtype=bool)
        others = np.ones(n_samples, dtype=bool)
        for i in range(n_samples):
            # Fit on every sample except i, then check i against that model.
            others[i] = False
            self.classifier.fit(X[others], y[others])
            # Slice (not index) so the query stays a 2-D, one-row batch.
            mask[i] = self.classifier.predict(X[i:i + 1])[0] == y[i]
            others[i] = True

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
class SSMA(InstanceReductionMixin):
    """Steady State Memetic Algorithm

    The Steady-State Memetic Algorithm is an evolutionary prototype
    selection algorithm. It uses a memetic algorithm in order to
    perform a local search in the code.

    Parameters
    ----------
    n_neighbors : int, optional (default = 1)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    alpha   : float (default = 0.6)
        Parameter that ponderates the fitness function.

    max_loop    : int (default = 1000)
        Number of maximum loops performed by the algorithm.

    threshold   : int (default = 0)
        Threshold that regulates the substitution condition;

    chromosomes_count: int (default = 10)
        number of chromosomes used to find the optimal solution.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.ssma import SSMA
    >>> import numpy as np
    >>> X = np.array([[i] for i in range(100)])
    >>> y = np.asarray(50 * [0] + 50 * [1])
    >>> ssma = SSMA()
    >>> ssma.fit(X, y)
    SSMA(alpha=0.6, chromosomes_count=10, max_loop=1000, threshold=0)
    >>> print(ssma.predict([[40],[60]]))
    [0 1]
    >>> print(ssma.reduction_)

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Joaquín Derrac, Salvador García, and Francisco Herrera. Stratified prototype
    selection based on a steady-state memetic algorithm: a study of scalability.
    Memetic Computing, 2(3):183–199, 2010.
    """

    def __init__(self, n_neighbors=1, alpha=0.6, max_loop=1000, threshold=0, chromosomes_count=10):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.max_loop = max_loop
        self.threshold = threshold
        self.chromosomes_count = chromosomes_count

        # Population state; filled by generate_population.
        self.evaluations = None
        self.chromosomes = None

        # -1 marks "no best chromosome measured yet".
        self.best_chromosome_ac = -1
        self.best_chromosome_rd = -1

        self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

    def accuracy(self, chromosome, X, y):
        """Return the fraction of ``X`` classified correctly by the subset
        of samples that ``chromosome`` selects."""
        mask = np.asarray(chromosome, dtype=bool)
        cX, cy = X[mask], y[mask]

        self.classifier.fit(cX, cy)
        labels = self.classifier.predict(X)
        accuracy = (labels == y).sum()

        return float(accuracy) / len(y)

    def fitness(self, chromosome, X, y):
        """Return ``alpha * accuracy + (1 - alpha) * reduction`` for a chromosome."""
        # TODO add the possibility of use AUC for factor1
        ac = self.accuracy(chromosome, X, y)
        rd = 1.0 - (float(sum(chromosome)) / len(chromosome))

        return self.alpha * ac + (1.0 - self.alpha) * rd

    def fitness_gain(self, gain, n):
        """Return the fitness increment for a classification ``gain`` with
        ``n`` genes still selected."""
        return self.alpha * (float(gain) / n) + (1 - self.alpha) * (1.0 / n)

    def update_threshold(self, X, y):
        """Adjust ``self.threshold`` from the current best chromosome.

        The threshold is raised when the best accuracy did not improve and
        lowered when the best reduction did not improve since the last call.
        """
        best_index = np.argmax(self.evaluations)
        chromosome = self.chromosomes[best_index]

        best_ac = self.accuracy(chromosome, X, y)
        best_rd = 1.0 - float(sum(chromosome)) / len(y)

        if best_ac <= self.best_chromosome_ac:
            self.threshold = self.threshold + 1
        if best_rd <= self.best_chromosome_rd:
            self.threshold = self.threshold - 1

        self.best_chromosome_ac = best_ac
        self.best_chromosome_rd = best_rd

    def index_nearest_neighbor(self, S, X, y):
        """Return a list ``U`` where ``U[i]`` is the index in ``X`` of the
        sample selected by ``S`` that is nearest to ``X[i]``."""
        S_mask = np.array(S, dtype=bool, copy=True)
        # Maps neighbour positions within the subset back to indices into X.
        real_indexes = np.arange(len(y))[S_mask]

        # The subset is the same for every query: fit once, query in batch.
        classifier = KNeighborsClassifier(n_neighbors=1)
        classifier.fit(X[S_mask], y[S_mask])
        nearest = classifier.kneighbors(X, return_distance=False)

        return list(real_indexes[nearest[:, 0]])

    def memetic_looper(self, S, R):
        """Return True while at least two selected genes of ``S`` are not in ``R``."""
        c = 0
        for i in range(len(S)):
            if S[i] == 1 and i not in R:
                c = c + 1
                if c == 2:
                    return True

        return False

    def memetic_select_j(self, S, R):
        """Return a random index of a selected gene of ``S`` that is not in ``R``.

        Raises an error from ``np.random.choice`` when no such gene exists.
        """
        indexs = [i for i in range(len(S)) if i not in R and S[i] == 1]
        return np.random.choice(indexs)

    def generate_population(self, X, y):
        """Create the random initial population and evaluate every chromosome."""
        self.chromosomes = [[np.random.choice([0, 1]) for i in range(len(y))]
                            for c in range(self.chromosomes_count)]
        self.evaluations = [self.fitness(c, X, y) for c in self.chromosomes]

        self.update_threshold(X, y)

    def select_parents(self, X, y):
        """Pick two parents, each the fitter of two randomly sampled chromosomes."""
        parents = []
        for i in range(2):
            samples = random.sample(self.chromosomes, 2)
            parents = parents + [samples[0] if self.fitness(samples[0], X, y) >
                                    self.fitness(samples[1], X, y) else samples[1]]
        return np.array(parents, copy=True)

    def crossover(self, parent_1, parent_2):
        """Return two offspring built by swapping the halves of the parents."""
        size = len(parent_1)
        # Floor division keeps the repeat counts integral on Python 3 too.
        half = size // 2
        mask = [0] * half + [1] * (size - half)
        mask = np.asarray(mask, dtype=bool)

        off_1 = parent_1 * mask + parent_2 * ~mask
        off_2 = parent_2 * mask + parent_1 * ~mask
        return np.asarray([off_1, off_2])

    def mutation(self, offspring):
        """Flip each gene in place with probability ``1 / len(offspring)``."""
        for i in range(len(offspring)):
            if np.random.uniform(0, 1) < 1.0 / len(offspring):
                offspring[i] = not offspring[i]

        return offspring

    def memetic_search(self, chromosome, X, y, chromosome_fitness=None):
        """Local search: try to drop selected genes one at a time.

        Returns ``(chromosome, fitness)``. An empty chromosome is returned
        unchanged with fitness 0.
        """
        S = np.array(chromosome, copy=True)
        if S.sum() == 0:
            return S, 0

        if chromosome_fitness is None:
            chromosome_fitness = self.fitness(chromosome, X, y)
        fitness_s = chromosome_fitness

        # List of visited genes in S
        R = []
        # let U = {u0, u1, ..., un} list where ui = classifier(si,S)/i
        U = self.index_nearest_neighbor(S, X, y)
        while self.memetic_looper(S, R):
            j = self.memetic_select_j(S, R)
            S[j] = 0
            gain = 0.0
            U_copy = list(U)
            mask = np.asarray(S, dtype=bool)
            X_tra, y_tra = X[mask], y[mask]
            real_idx = np.arange(len(y))[mask]

            # Only samples whose nearest prototype was j need a new neighbour.
            affected = [i for i in range(len(U)) if U[i] == j]
            if len(y_tra) > 0 and affected:
                self.classifier.fit(X_tra, y_tra)
                nearest = self.classifier.kneighbors(X[affected], n_neighbors=1,
                                                     return_distance=False)
                for i, idx in zip(affected, nearest[:, 0]):
                    U[i] = real_idx[idx]
                    if y[i] == y[U_copy[i]] and y[i] != y[U[i]]:
                        gain = gain - 1.0
                    if y[i] != y[U_copy[i]] and y[i] == y[U[i]]:
                        gain = gain + 1.0

            # NOTE(review): the reject branch was missing in this copy, which
            # undid every accepted removal and never marked a gene as
            # visited (so the loop could spin forever). Reconstructed as
            # accept / revert-and-mark-visited -- confirm against upstream.
            if gain >= self.threshold:
                n = S.sum()
                g = self.fitness_gain(gain, n)
                fitness_s = fitness_s + g
                R = []
            else:
                U = U_copy
                S[j] = 1
                R.append(j)

        return list(S), fitness_s

    def main_loop(self, X, y):
        """Run ``max_loop`` steady-state generations on the population."""
        self.generate_population(X, y)
        n, worse_fit_index = 0, -1
        while (n < self.max_loop):
            parents = self.select_parents(X, y)
            offspring = self.crossover(parents[0], parents[1])
            offspring[0] = self.mutation(offspring[0])
            offspring[1] = self.mutation(offspring[1])

            # -1 flags an empty offspring that cannot be evaluated.
            fit_offs = [self.fitness(off, X, y) if sum(off) > 0 else -1 for off in offspring]
            if worse_fit_index == -1:
                worse_fit_index = np.argmin(self.evaluations)

            for i in range(len(offspring)):
                # Probability of running the local search on this offspring.
                p_ls = 1.0

                if fit_offs[i] == -1:
                    # Empty offspring: never searched (uniform(0, 1) >= 0).
                    p_ls = -1
                elif fit_offs[i] <= self.evaluations[worse_fit_index]:
                    p_ls = 0.0625

                if np.random.uniform(0, 1) < p_ls:
                    offspring[i], fit_offs[i] = self.memetic_search(offspring[i], X, y, chromosome_fitness=fit_offs[i])

            for i in range(len(offspring)):
                if fit_offs[i] > self.evaluations[worse_fit_index]:
                    self.chromosomes[worse_fit_index] = offspring[i]
                    self.evaluations[worse_fit_index] = fit_offs[i]

                    worse_fit_index = np.argmin(self.evaluations)

            n = n + 1
            if n % 10 == 0:
                self.update_threshold(X, y)

    def reduce_data(self, X, y):
        """Run the search and return the samples picked by the best chromosome.

        Returns ``(self.X_, self.y_)``; ``self.classes_`` and
        ``self.reduction_`` are set as well.
        """
        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes

        self.main_loop(X, y)

        best_index = np.argmax(self.evaluations)
        mask = np.asarray(self.chromosomes[best_index], dtype=bool)
        self.X_ = X[mask]
        self.y_ = y[mask]
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)

        return self.X_, self.y_
class InstanceReductionMixin(InstanceReductionBase, ClassifierMixin):

    """Mixin class for all instance reduction techniques"""

    def set_classifier(self, classifier=None):
        """Sets the classifier to be used in the instance reduction process
            and classification.

        Parameters
        ----------
        classifier : classifier, following the KNeighborsClassifier style
            (default = KNN)
            When None, :meth:`predict` creates a KNeighborsClassifier the
            first time it is called.
        """
        self.classifier = classifier

    def reduce_data(self, X, y):
        """Perform the instance reduction procedure on the given training data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training set.

        y : array-like, shape = [n_samples]
            Labels for X.

        Returns
        -------
        X_ : array-like, shape = [indeterminated, n_features]
            Resulting training set.

        y_ : array-like, shape = [indeterminated]
            Labels for X_
        """
        # No reduction is implemented here; this version returns None.
        pass

    def get_prototypes(self):
        """Return the selected prototypes and their labels, ``(X_, y_)``."""
        return self.X_, self.y_

    def fit(self, X, y, reduce_data=True):
        """
        Fit the InstanceReduction model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where n_samples in the number of samples and
            n_features is the number of features.
            Note that centroid shrinking cannot be used with sparse matrices.
        y : array, shape = [n_samples]
            Target values (integers)
        reduce_data : bool, flag indicating if the reduction would be performed

        Returns
        -------
        self
        """
        self.X = X
        self.y = y
        self.labels = set(y)
        self.prototypes = None
        self.prototypes_labels = None
        self.reduction_ratio = 0.0

        if reduce_data:
            self.reduce_data(X, y)

        return self

    def predict(self, X, n_neighbors=1):
        """Perform classification on an array of test vectors X.

        The predicted class C for each sample in X is returned.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
        n_neighbors : int, optional (default = 1)
            Used only when no classifier has been set yet.

        Returns
        -------
        C : array, shape = [n_samples]

        Raises
        ------
        AttributeError
            If ``X_`` or ``y_`` has not been set yet.

        Notes
        -----
        The default prediction is using KNeighborsClassifier, if the
        instance reduction algorithm is to be performed with another
        classifier, it should be explicitly overwritten and explained
        in the documentation.
        """
        X = atleast2d_or_csr(X)
        if getattr(self, "X_", None) is None:
            raise AttributeError("Model has not been trained yet.")

        if getattr(self, "y_", None) is None:
            raise AttributeError("Model has not been trained yet.")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

        # The classifier is always refitted on the current prototypes.
        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test data X.
        after a given prototype selection algorithm.

        Parameters
        ----------
        X : array, shape = (n_samples, n_features)
            A 2-D array representing the test points.

        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
        of such arrays if n_outputs > 1.
        The class probabilities of the input samples. Classes are ordered
        by lexicographic order.
        """
        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict_proba(X)
class TomekLinks(InstanceReductionMixin):

    """Tomek Links.

    The Tomek Links algorithm removes a pair instances that
    forms a Tomek Link. This techniques removes instances in
    the decision region.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default in the classification (only).
        The Tomek Links uses only n_neighbors=1 in the reduction.

    keep_class : int, optional (default = None)
        Label of the class to not be removed in the tomek links. If None,
        it removes all nodes of the links.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.tomek_links import TomekLinks
    >>> import numpy as np
    >>> X = np.array([[0],[1],[2.1],[2.9],[4],[5],[6],[7.1],[7.9],[9]])
    >>> y = np.array([1,1,2,1,2,2,2,1,2,2])
    >>> tl = TomekLinks()
    >>> tl.fit(X, y)
    >>> print(tl.predict([[2.5],[7.5]]))
    [1, 2]
    >>> print(tl.reduction_)

    See also
    --------
    protopy.selection.enn.ENN: edited nearest neighbor

    References
    ----------
    I. Tomek, “Two modifications of cnn,” IEEE Transactions on Systems,
    Man and Cybernetics, vol. SMC-6, pp. 769–772, 1976.
    """

    def __init__(self, n_neighbors=3, keep_class=None):
        self.n_neighbors = n_neighbors
        self.classifier = None
        self.keep_class = keep_class

    def reduce_data(self, X, y):
        """Remove the samples that form Tomek links and return what is left.

        Two samples form a Tomek link when each is the other's nearest
        neighbour and their labels differ. Samples whose label equals
        ``keep_class`` are never removed.

        Returns ``(self.X_, self.y_)``; ``self.classes_`` and
        ``self.reduction_`` are set as well.
        """
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors, algorithm='brute')
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        # NOTE(review): other reducers in this file validate with check_X_y;
        # confirm check_arrays is still available in the installed scikit-learn.
        X, y = check_arrays(X, y, sparse_format="csr")

        self.classes_ = np.unique(y)
        self.classifier.fit(X, y)
        # Column 0 of the 2-NN query is taken to be the sample itself, so
        # column 1 is its nearest other sample.
        nn_idx = self.classifier.kneighbors(X, n_neighbors=2, return_distance=False)[:, 1]

        # A link needs mutual nearest neighbours carrying different labels.
        mutual = nn_idx[nn_idx] == np.arange(nn_idx.shape[0])
        differs = y != y[nn_idx]
        mask = ~(mutual & differs)
        if self.keep_class is not None and self.keep_class in self.classes_:
            mask[y == self.keep_class] = True

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)

        return self.X_, self.y_
class SGP2(SGP):
    """Self-Generating Prototypes 2

    The Self-Generating Prototypes 2 is the second version of the
    Self-Generating Prototypes algorithm.
    It has a higher generalization power, including the procedures
    merge and pruning.

    Parameters
    ----------
    r_min: float, optional (default = 0.0)
        Determine the minimum size of a cluster [0.00, 0.20]

    r_mis: float, optional (default = 0.0)
        Determine the error tolerance before split a group

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.generation.sgp import SGP2
    >>> import numpy as np
    >>> X = [np.asarray(range(1,13)) + np.asarray([0.1,0,-0.1,0.1,0,-0.1,0.1,-0.1,0.1,-0.1,0.1,-0.1])]
    >>> X = np.asarray(X).T
    >>> y = np.array([1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1])
    >>> sgp2 = SGP2()
    >>> sgp2.fit(X, y)
    SGP2(r_min=0.0, r_mis=0.0)
    >>> print(sgp2.reduction_)

    See also
    --------
    protopy.generation.SGP: self-generating prototypes
    protopy.generation.sgp.ASGP: adaptive self-generating prototypes

    References
    ----------
    Hatem A. Fayed, Sherif R Hashem, and Amir F Atiya. Self-generating prototypes
    for pattern classification. Pattern Recognition, 40(5):1498–1509, 2007.
    """

    def __init__(self, r_min=0.0, r_mis=0.0):
        self.groups = None
        self.r_min = r_min
        self.r_mis = r_mis
        self.n_neighbors = 1
        self.classifier = None

    def reduce_data(self, X, y):
        """Build one group per class and return the group representatives.

        Returns ``(self.X_, self.y_)``; ``self.classes_``, ``self.groups``
        and ``self.reduction_`` are set as well.
        """
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        classes = np.unique(y)
        self.classes_ = classes

        # Loading initial groups: one group per class.
        self.groups = [_Group(X[y == label], label) for label in classes]

        # NOTE(review): this copy goes straight from the initial groups to
        # the result; the splitting / merge / pruning steps do not appear to
        # be invoked here -- confirm against upstream before relying on it.
        self.X_ = np.asarray([g.rep_x for g in self.groups])
        self.y_ = np.asarray([g.label for g in self.groups])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_

    def _merge(self):
        """Merge pairs of same-label groups that are each other's closest group.

        NOTE(review): the statements that removed the absorbed group and
        repeated the procedure were missing in this copy; they are
        reconstructed here -- confirm against upstream.
        """
        merged = True
        while merged and len(self.groups) >= 2:
            merged = False
            reps_x = np.asarray([g.rep_x for g in self.groups])
            reps_y = np.asarray([g.label for g in self.groups])
            self.classifier.fit(reps_x, reps_y)

            for group in self.groups:
                # Second-nearest representative of every instance in the group.
                nn2_idx = self.classifier.kneighbors(group.X, n_neighbors=2, return_distance=False)
                nn2_idx = nn2_idx.T[1]

                # could use a threshold
                if len(set(nn2_idx)) != 1 or reps_y[nn2_idx[0]] != group.label:
                    continue

                ng_group = self.groups[nn2_idx[0]]
                if ng_group is group:
                    # Never merge a group with itself.
                    continue

                ng2_idx = self.classifier.kneighbors(ng_group.X, n_neighbors=2, return_distance=False)
                ng2_idx = ng2_idx.T[1]
                if len(set(ng2_idx)) == 1 and self.groups[ng2_idx[0]] == group:
                    group.add_instances(ng_group.X, update=True)
                    del self.groups[nn2_idx[0]]
                    merged = True
                    # The group list and representatives changed: refit and rescan.
                    break

        return self.groups

    def _pruning(self):
        """Drop groups whose instances stay correctly classified without them.

        NOTE(review): the removal of a prunable group was missing in this
        copy (the loop could never finish once a group qualified); it is
        reconstructed here -- confirm against upstream.
        """
        if len(self.groups) < 2:
            return self.groups

        pruned, fst = False, True
        knn = KNeighborsClassifier(n_neighbors=1, algorithm='brute')
        while pruned or fst:
            index = 0
            pruned, fst = False, False

            while index < len(self.groups):
                group = self.groups[index]

                # Classify the group's instances using every other representative.
                mask = np.ones(len(self.groups), dtype=bool)
                mask[index] = False
                reps_x = np.asarray([g.rep_x for g in self.groups])[mask]
                reps_y = np.asarray([g.label for g in self.groups])[mask]
                labels = knn.fit(reps_x, reps_y).predict(group.X)

                if (labels == group.label).all():
                    # Deleting shifts the next group into this position,
                    # so the index is not advanced.
                    del self.groups[index]
                    pruned = True
                else:
                    index = index + 1

                if len(self.groups) == 1:
                    # A single remaining group cannot be pruned any further.
                    index = len(self.groups)
                    pruned = False

        return self.groups
class CNN(InstanceReductionMixin):
    """Condensed Nearest Neighbors.

    Each class is represented by a set of prototypes, with test samples
    classified to the class with the nearest prototype.
    The Condensed Nearest Neighbors removes the redundant instances,
    maintaining the samples in the decision boundaries.

    Parameters
    ----------
    n_neighbors : int, optional (default = 1)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.cnn import CNN
    >>> import numpy as np
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([1, 1, 1, 2, 2, 2])
    >>> cnn = CNN()
    >>> cnn.fit(X, y)
    >>> print(cnn.predict([[-0.8, -1]]))

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    Notes
    -----
    The Condensed Nearest Neighbor is one the first prototype selection
    technique in literature.

    References
    ----------
    P. E. Hart, The condensed nearest neighbor rule, IEEE Transactions on
    Information Theory 14 (1968) 515–516.
    """

    def __init__(self, n_neighbors=1):
        self.n_neighbors = n_neighbors
        self.classifier = None

    def reduce_data(self, X, y):
        """Condense ``(X, y)`` to the samples needed to classify it.

        The condensed set starts with one random instance per class; every
        sample the current set misclassifies is then added to it.

        Returns ``(self.X_, self.y_)``; ``self.classes_`` and
        ``self.reduction_`` are set as well.
        """
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)

        prots_s = []
        labels_s = []

        classes = np.unique(y)
        self.classes_ = classes

        # Seed the condensed set with one randomly chosen instance per class.
        for cur_class in classes:
            insts = X[y == cur_class]
            prots_s.append(insts[np.random.randint(0, insts.shape[0])])
            labels_s.append(cur_class)

        self.classifier.fit(prots_s, labels_s)
        for sample, label in zip(X, y):
            # predict expects a 2-D batch, so wrap the single sample as one row.
            if self.classifier.predict(np.atleast_2d(sample))[0] != label:
                prots_s.append(sample)
                labels_s.append(label)
                self.classifier.fit(prots_s, labels_s)

        self.X_ = np.asarray(prots_s)
        self.y_ = np.asarray(labels_s)
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
def nearest_fit(X, y):
    """Return a distance-weighted 7-nearest-neighbours classifier fitted on (X, y)."""
    # Pass the settings by keyword: current scikit-learn accepts only
    # n_neighbors positionally.
    clf = KNeighborsClassifier(n_neighbors=7, weights='distance')
    return clf.fit(X, y)
def knn_score(X, y, neighbors):
    """Fit a k-NN classifier on (X, y) and print its accuracy on the same data.

    X : training samples.
    y : labels for X.
    neighbors : number of neighbours used by the classifier.
    """
    knn = KNeighborsClassifier(n_neighbors=neighbors)
    knn.fit(X, y)
    y_pred = knn.predict(X)
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print("KNN{} accuracy_score: {}".format(neighbors,
                                            metrics.accuracy_score(y, y_pred)))