Example #1
def plotDecisionBoundry(X, y, y_predicted, modelName):

    X_Train_embedded = TSNE(n_components=2).fit_transform(X)
    print(X_Train_embedded.shape)

    # create meshgrid
    resolution = 1000  # 1000x1000 background pixels
    X2d_xmin, X2d_xmax = np.min(X_Train_embedded[:, 0]), np.max(
        X_Train_embedded[:, 0])
    X2d_ymin, X2d_ymax = np.min(X_Train_embedded[:, 1]), np.max(
        X_Train_embedded[:, 1])
    xx, yy = np.meshgrid(np.linspace(X2d_xmin, X2d_xmax, resolution),
                         np.linspace(X2d_ymin, X2d_ymax, resolution))

    # approximate Voronoi tessellation on resolution x resolution grid using 1-NN
    background_model = KNeighborsClassifier(n_neighbors=1).fit(
        X_Train_embedded, y_predicted)
    voronoiBackground = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
    voronoiBackground = voronoiBackground.reshape((resolution, resolution))

    #plot
    plt.contourf(xx, yy, voronoiBackground)
    plt.scatter(X_Train_embedded[:, 0],
                X_Train_embedded[:, 1],
                c=y.values.flatten())
    plt.title(modelName)
    plt.show()
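
A minimal usage sketch for the function above (kept under its original name), assuming the snippet's imports (numpy as np, matplotlib.pyplot as plt, TSNE, KNeighborsClassifier) are in scope; the iris data and the 5-NN model are illustrative, and y is passed as a pandas Series because the function calls y.values:

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
X, y = iris.data, pd.Series(iris.target)  # Series, since the function uses y.values
clf = KNeighborsClassifier(n_neighbors=5).fit(X, y)
plotDecisionBoundry(X, y, clf.predict(X), "5-NN on iris")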
Example #2
def do(train_data, train_label, test_data, test_label=None, adjust_parameters=True, k=5):
    train_data = np.array(train_data).squeeze()
    train_label = np.array(train_label).squeeze()
    test_data = np.array(test_data).squeeze()
    if test_label is not None:
        test_label = np.array(test_label).squeeze()
    if not adjust_parameters:
        knn = KNeighborsClassifier(n_neighbors=k, n_jobs=8)
        knn.fit(train_data, train_label)
        predicted_label = knn.predict(test_data)
        if test_label is not None:
            acc = accuracy_score(test_label, predicted_label)
            print('acc is', acc)
        return predicted_label
    else:
        max_acc = 0.0
        max_k = 0
        max_predicted = None
        for k in range(1, 11):
            knn = KNeighborsClassifier(n_neighbors=k, n_jobs=8)
            knn.fit(train_data, train_label)
            predicted_label = knn.predict(test_data)
            acc = accuracy_score(test_label, predicted_label)
            if acc > max_acc:
                max_acc = acc
                max_k = k
                max_predicted = predicted_label
            print('k =', k, ' acc is', acc)
        print('max acc is', max_acc, ' corresponding k is', max_k)
        return max_predicted, max_k
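
One possible way to call the function above, assuming numpy, KNeighborsClassifier and accuracy_score are imported as the snippet expects; the digits data is only for illustration:

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

X, y = load_digits(return_X_y=True)
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.3, random_state=0)

pred = do(Xtr, ytr, Xte, yte, adjust_parameters=False, k=5)  # single fixed k
pred_best, best_k = do(Xtr, ytr, Xte, yte)                   # search k = 1..10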
Example #3
    def reduce_data(self, X, y):
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(
                n_neighbors=self.n_neighbors, algorithm='brute')
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes
        self.classifier.fit(X, y)
        nn_idx = self.classifier.kneighbors(X,
                                            n_neighbors=2,
                                            return_distance=False)
        nn_idx = nn_idx.T[1]

        # a sample pair forms a Tomek link when the two are mutual 1-NNs
        # with different labels; such samples are dropped below
        mask = [
            nn_idx[nn_idx[index]] == index and y[index] != y[nn_idx[index]]
            for index in range(nn_idx.shape[0])
        ]
        mask = ~np.asarray(mask)
        if self.keep_class is not None and self.keep_class in self.classes_:
            mask[y == self.keep_class] = True

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)

        return self.X_, self.y_
Example #4
class DCS(object):

    @abstractmethod
    def select(self, ensemble, x):
        pass

    def __init__(self, Xval, yval, K=5, weighted=False, knn=None):
        self.Xval = Xval
        self.yval = yval
        self.K = K

        if knn is None:
            self.knn = KNeighborsClassifier(n_neighbors=K, algorithm='brute')
        else:
            self.knn = knn

        self.knn.fit(Xval, yval)
        self.weighted = weighted


    def get_neighbors(self, x, return_distance=False):
        # obtain the K nearest neighbors of test sample in the validation set
        if not return_distance:
            [idx] = self.knn.kneighbors(x, 
                    return_distance=return_distance)
        else:
            [dists], [idx] = self.knn.kneighbors(x, 
                    return_distance=return_distance)
        X_nn = self.Xval[idx] # k neighbors
        y_nn = self.yval[idx] # k neighbors target

        if return_distance:
            return X_nn, y_nn, dists
        else:
            return X_nn, y_nn
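
DCS is meant to be subclassed (select is abstract), but since the class is shown without ABCMeta it can be instantiated to exercise get_neighbors. A sketch on synthetic blobs, assuming numpy, abstractmethod and KNeighborsClassifier are imported as in the snippet; note the query must be a 2-D array (a single row), since kneighbors expects 2-D input:

from sklearn.datasets import make_blobs

Xval, yval = make_blobs(n_samples=100, centers=3, random_state=0)
dcs = DCS(Xval, yval, K=5)
X_nn, y_nn = dcs.get_neighbors(Xval[:1])  # the 5 nearest validation samples
X_nn, y_nn, dists = dcs.get_neighbors(Xval[:1], return_distance=True)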
Example #5
class RawModel:
    def __init__(self):
        # 2015-05-15 GEL Found that n_components=20 gives a nice balance of 
        # speed (substantial improvement), accuracy, and reduced memory usage 
        # (25% decrease).
        self.decomposer = TruncatedSVD(n_components=20)

        # 2015-05-15 GEL algorithm='ball_tree' uses less memory on average than 
        # algorithm='kd_tree'
        
        # 2015-05-15 GEL Evaluation of metrics by accuracy (based on 8000 training examples)
        # euclidean        0.950025
        # manhattan        0.933533
        # chebyshev        0.675662
        # hamming          0.708646
        # canberra         0.934033
        # braycurtis       0.940530
        self.model = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', metric='euclidean')

    def fit(self, trainExamples):       
        X = self.decomposer.fit_transform( vstack( [reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in trainExamples] ) )
        Y = [x.Y for x in trainExamples]

        self.model.fit(X, Y)
        return self

    def predict(self, examples):
        X = self.decomposer.transform( vstack( [reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in examples] ) )
        return self.model.predict( X )
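
RawModel expects example objects exposing X (the pixel array), Y (the label) and WIDTH/HEIGHT. A hypothetical wrapper over the 8x8 digits images, assuming the snippet's module-level imports (from numpy import vstack, reshape; TruncatedSVD; KNeighborsClassifier):

from collections import namedtuple
from sklearn.datasets import load_digits

Example = namedtuple("Example", ["X", "Y", "WIDTH", "HEIGHT"])
digits = load_digits()
examples = [Example(img, label, 8, 8)
            for img, label in zip(digits.images, digits.target)]

model = RawModel().fit(examples[:1500])
print(model.predict(examples[1500:1510]))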
Example #6
    def predict(self, X, n_neighbors=1):
        """Perform classification on an array of test vectors X.

        The predicted class C for each sample in X is returned.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array, shape = [n_samples]

        Notes
        -----
        The default prediction uses KNeighborsClassifier; if the
        instance reduction algorithm is to be performed with another
        classifier, this method should be explicitly overwritten and
        the change explained in the documentation.
        """
        X = check_array(X)
        if not hasattr(self, "X_") or self.X_ is None:
            raise AttributeError("Model has not been trained yet.")

        if not hasattr(self, "y_") or self.y_ is None:
            raise AttributeError("Model has not been trained yet.")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict(X)
Example #7
class DCS(object):
    @abstractmethod
    def select(self, ensemble, x):
        pass

    def __init__(self, Xval, yval, K=5, weighted=False, knn=None):
        self.Xval = Xval
        self.yval = yval
        self.K = K

        if knn is None:
            self.knn = KNeighborsClassifier(n_neighbors=K, algorithm='brute')
        else:
            self.knn = knn

        self.knn.fit(Xval, yval)
        self.weighted = weighted

    def get_neighbors(self, x, return_distance=False):
        # obtain the K nearest neighbors of test sample in the validation set
        if not return_distance:
            [idx] = self.knn.kneighbors(x, return_distance=return_distance)
        else:
            rd = return_distance
            [dists], [idx] = self.knn.kneighbors(x, return_distance=rd)
        X_nn = self.Xval[idx]  # k neighbors
        y_nn = self.yval[idx]  # k neighbors target

        if return_distance:
            return X_nn, y_nn, dists
        else:
            return X_nn, y_nn
Example #8
    def reduce_data(self, X, y):

        X, y = check_arrays(X, y, sparse_format="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(
                n_neighbors=self.n_neighbors)

        prots_s = []
        labels_s = []

        classes = np.unique(y)
        self.classes_ = classes

        for cur_class in classes:
            mask = y == cur_class
            insts = X[mask]
            prots_s = prots_s + [insts[np.random.randint(0, insts.shape[0])]]
            labels_s = labels_s + [cur_class]

        self.classifier.fit(prots_s, labels_s)
        for sample, label in zip(X, y):
            if self.classifier.predict(sample) != [label]:
                prots_s = prots_s + [sample]
                labels_s = labels_s + [label]
                self.classifier.fit(prots_s, labels_s)

        self.X_ = np.asarray(prots_s)
        self.y_ = np.asarray(labels_s)
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
Example #9
    def reduce_data(self, X, y):
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        classes = np.unique(y)
        self.classes_ = classes

        # loading initial groups
        self.groups = []
        for label in classes:
            mask = y == label
            self.groups = self.groups + [_Group(X[mask], label)]

        self._main_loop()
        self._generalization_step()
        self._merge()
        self._pruning()
        self.X_ = np.asarray([g.rep_x for g in self.groups])
        self.y_ = np.asarray([g.label for g in self.groups])
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)
        return self.X_, self.y_
Example #10
def KNN_method(X, y):
    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)  # random_state requires shuffle=True
    skf.get_n_splits(X, y)

    for train_index, test_index in skf.split(X, y):
        print("Train:", train_index, "Validation:", test_index)
        trainX, testX = X[train_index], X[test_index]
        trainY, testY = y[train_index], y[test_index]

        # here starts KNN
        # number of neighbours to try in the KNeighborsClassifier
        kvalues = [1, 3, 5, 7, 9, 11, 13, 15, 19, 24, 30, 40, 50, 60, 70, 90]
        dist = ['manhattan', 'euclidean', 'chebyshev']
        results = {}
        for element in dist:
            accuracy_results = []
            for k in kvalues:
                knn = KNeighborsClassifier(n_neighbors=k, metric=element)
                knn.fit(trainX, trainY)
                predictedY = knn.predict(testX)
                accuracy_results.append(accuracy_score(testY, predictedY))
            results[element] = accuracy_results
        print("Results of model preparation for: " + str(results))

        plt.figure()
        multiple_line_chart(plt.gca(),
                            kvalues,
                            results,
                            'KNN variants',
                            'n',
                            'accuracy',
                            percentage=True)
        plt.show()
Example #11
    def _pruning(self):

        if len(self.groups) < 2:
            return self.groups

        pruned, fst = False, True
        knn = KNeighborsClassifier(n_neighbors=1, algorithm='brute')
        
        while pruned or fst:
            index = 0
            pruned, fst = False, False

            while index < len(self.groups):
                group = self.groups[index]

                mask = np.ones(len(self.groups), dtype=bool)
                mask[index] = False
                reps_x = np.asarray([g.rep_x for g in self.groups])[mask]
                reps_y = np.asarray([g.label for g in self.groups])[mask]
                labels = knn.fit(reps_x, reps_y).predict(group.X)

                if (labels == group.label).all():
                    self.groups.remove(group)
                    pruned = True
                else:
                    index = index + 1

                if len(self.groups) == 1:
                    index = len(self.groups)
                    pruned = False

        return self.groups
Example #12
    def reduce_data(self, X, y):
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes

        if self.n_neighbors >= len(X):
            self.X_ = np.array(X)
            self.y_ = np.array(y)
            self.reduction_ = 0.0
            return self.X_, self.y_

        mask = np.zeros(y.size, dtype=bool)

        tmp_m = np.ones(y.size, dtype=bool)
        for i in range(y.size):
            tmp_m[i] = not tmp_m[i]
            self.classifier.fit(X[tmp_m], y[tmp_m])
            sample, label = X[i], y[i]

            if self.classifier.predict(sample) == [label]:
                mask[i] = not mask[i]

            tmp_m[i] = not tmp_m[i]

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
Example #13
    def _main_loop(self):
        exit_count = 0
        knn = KNeighborsClassifier(n_neighbors=1, algorithm='brute')
        while exit_count < len(self.groups):
            index, exit_count = 0, 0
            while index < len(self.groups):

                group = self.groups[index]
                reps_x = np.asarray([g.rep_x for g in self.groups])
                reps_y = np.asarray([g.label for g in self.groups])
                knn.fit(reps_x, reps_y)
                
                nn_idx = knn.kneighbors(group.X, n_neighbors=1, return_distance=False)
                nn_idx = nn_idx.T[0]
                mask = nn_idx == index
                
                # if all are correctly classified
                if mask.all():
                    exit_count = exit_count + 1
                
                # if all are misclassified
                elif not (group.label in reps_y[nn_idx]):
                    pca = PCA(n_components=1)
                    pca.fit(group.X)
                    # maybe use a 'for' instead of creating array
                    d = pca.transform(reps_x[index])
                    dis = [pca.transform(inst)[0] for inst in group.X]
                    mask_split = (dis < d).flatten()
                    
                    new_X = group.X[mask_split]
                    self.groups.append(_Group(new_X, group.label))
                    group.X = group.X[~mask_split]
                
                elif (reps_y[nn_idx] == group.label).all() and (nn_idx != index).any():
                    mask_mv = nn_idx != index
                    index_mv = np.asarray(range(len(group)))[mask_mv]
                    X_mv = group.remove_instances(index_mv)
                    G_mv = nn_idx[mask_mv]                        

                    for x, g in zip(X_mv, G_mv):
                        self.groups[g].add_instances([x])

                elif (reps_y[nn_idx] != group.label).sum()/float(len(group)) > self.r_mis:
                    mask_mv = reps_y[nn_idx] != group.label
                    new_X = group.X[mask_mv]
                    self.groups.append(_Group(new_X, group.label))
                    group.X = group.X[~mask_mv]
                else:
                    exit_count = exit_count + 1

                if len(group) == 0:
                    self.groups.remove(group)
                else:
                    index = index + 1

                for g in self.groups:
                    g.update_all()

        return self.groups                     
Example #14
def evaluate(Xtra, ytra, Xtst, ytst, k=1, positive_label=1):
    knn = KNeighborsClassifier(n_neighbors=k, algorithm='brute')
    knn.fit(Xtra, ytra)

    y_true = ytst
    y_pred = knn.predict(Xtst)

    return evaluate_results(y_true, y_pred, positive_label=positive_label)
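
A usage sketch; evaluate_results is a project helper not shown here, so this assumes it is importable alongside the snippet:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, random_state=0)
Xtra, Xtst, ytra, ytst = train_test_split(X, y, test_size=0.25, random_state=0)
results = evaluate(Xtra, ytra, Xtst, ytst, k=3)  # forwarded to evaluate_results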
Example #15
    def __init__(self, csv_path_train, csv_path_test, k):
        '''
        Constructor
        '''
        self.csv_path_train = csv_path_train
        self.csv_path_test = csv_path_test
        self.classifier = KNeighborsClassifier(n_neighbors=k,
                                               p=2,
                                               metric='minkowski')
Example #16
def knn(X, y, model_path):
    model = KNeighborsClassifier()
    model.fit(X, y)
    print(model)
    # predict
    expected = y
    predicted = model.predict(X)
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    joblib.dump(model, model_path)
Example #17
    def __init__(self, Xval, yval, K=5, weighted=False, knn=None):
        self.Xval = Xval
        self.yval = yval
        self.K = K

        if knn is None:
            self.knn = KNeighborsClassifier(n_neighbors=K, algorithm='brute')
        else:
            self.knn = knn

        self.knn.fit(Xval, yval)
        self.weighted = weighted
Example #18
def plot_boundaries_decision(X, y, clf, namefile):
    """
    Method to plot the boundaries decision of our data 
    X : A numpy array of the data we want to plot 
    y : A numpy array of the  label corresponding to our data
    clf : the model use to predict the label of our data
    namefile : the name of the file in which we want to save the figure  
    """
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.333,
                                                        random_state=42)
    # plot of the decision boundary in the 2D representation space of the data
    clf.fit(X_train, y_train)

    # create meshgrid
    resolution = 100  # 100x100 background pixels
    X2d_xmin, X2d_xmax = np.min(X[:, 0]), np.max(X[:, 0])
    X2d_ymin, X2d_ymax = np.min(X[:, 1]), np.max(X[:, 1])
    xx, yy = np.meshgrid(np.linspace(X2d_xmin, X2d_xmax, resolution),
                         np.linspace(X2d_ymin, X2d_ymax, resolution))

    # approximate Voronoi tessellation on resolution x resolution grid using 1-NN
    background_model = KNeighborsClassifier(n_neighbors=1).fit(X, y)
    voronoiBackground = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
    voronoiBackground = voronoiBackground.reshape((resolution, resolution))

    fig = pyplot.figure()
    fig.set_size_inches(10.5, 8.5)

    ax = fig.add_subplot(211)  #small subplot to show how the legend has moved.
    #plot
    ax.contourf(xx, yy, voronoiBackground)
    ax.set_title(
        "Decision boundaries using the dimensionality reduction of multidimensional scaling"
    )
    ax.scatter(X[:, 0], X[:, 1], c=color[y].tolist())

    label = numpy.array(["Apple", "Tomatoes"])
    # Legend
    for ind, s in enumerate(label):
        ax.scatter([], [], label=s, color=color[ind])

    pyplot.legend(scatterpoints=1,
                  frameon=True,
                  labelspacing=0.5,
                  bbox_to_anchor=(1.2, .4),
                  loc='center right')

    pyplot.tight_layout()
    pyplot.savefig(namefile)
    pyplot.show()
Example #19
def get_best_k(X, y, max_k=30, keep_best_n=10, weights=None):

    # TODO: check X, y. description

    # Set default values
    if max_k is None:
        max_k = len(X)

    if weights is None:
        weights = ['uniform', 'distance']

    # Make weights into a list if it is not already one
    if type(weights) is not list:
        weights = [weights]

    # Check if inputs are valid
    check_pandas_dataframe_nd(X, 'X')

    check_numpy_array_pandas_dataframe_series_1d(y, 'y')

    check_list_of_strings(weights, 'weights')

    check_integer(max_k, 'max_k')
    check_larger(max_k, 'max_k', 1)

    check_integer(keep_best_n, 'keep_best_n')
    check_larger(keep_best_n, 'keep_best_n', 1)

    # Change shape of y if necessary
    y = np.array(y)
    y = y.ravel()

    # Split into train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    # Get value for max_k
    max_k = min(max_k, len(X_test))

    # Set up results-list
    best_model = []

    for k in range(1, max_k):
        for weight in weights:
            model = KNeighborsClassifier(n_neighbors=k,
                                         weights=weight).fit(X_train, y_train)
            score = model.score(X_test, y_test)
            best_model.append((k, weight, score))

    best_model.sort(key=lambda x: x[2], reverse=True)
    best_model = best_model[0:keep_best_n]
    return best_model
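
A hypothetical call, assuming the check_* validators, numpy, train_test_split and KNeighborsClassifier are importable from the project; X must be a pandas DataFrame to pass check_pandas_dataframe_nd:

import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
top = get_best_k(X, iris.target, max_k=15, keep_best_n=5)
print(top[0])  # best (k, weight, score) triple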
Example #20
def __plot_decision_boundaries(X,
                               y,
                               y_pred,
                               resolution: int = 100,
                               embedding=None):
    if embedding is None:
        embedding = TSNE(n_components=2, random_state=160290).fit_transform(X)

    x_min, x_max = safe_bounds(embedding[:, 0])
    y_min, y_max = safe_bounds(embedding[:, 1])
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                         np.linspace(y_min, y_max, resolution))

    # approximate Voronoi tessellation on resolution x resolution grid using 1-NN
    background_model = KNeighborsClassifier(n_neighbors=1).fit(
        embedding, y_pred)
    voronoi_bg = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
    voronoi_bg = voronoi_bg.reshape((resolution, resolution))

    mesh = hv.QuadMesh((xx, yy, voronoi_bg)).opts(cmap="viridis")
    points = hv.Scatter(
        {
            "x": embedding[:, 0],
            "y": embedding[:, 1],
            "pred": y_pred,
            "class": y
        },
        kdims=["x", "y"],
        vdims=["pred", "class"],
    )
    errors = y_pred != y
    failed_points = hv.Scatter(
        {
            "x": embedding[errors, 0],
            "y": embedding[errors, 1]
        },
        kdims=["x", "y"]).opts(color="red", size=5, alpha=0.9)

    points = points.opts(color="pred",
                         cmap="viridis",
                         line_color="grey",
                         size=10,
                         alpha=0.8,
                         tools=["hover"])
    plot = mesh * points * failed_points
    plot = plot.opts(xaxis=None,
                     yaxis=None,
                     width=500,
                     height=450,
                     title="Decision boundaries on TSNE")
    return plot
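
safe_bounds is a project helper (presumably padded min/max of the embedding axes); assuming it and the snippet's imports (numpy as np, holoviews as hv, TSNE, KNeighborsClassifier) are available, a sketch on a subsample to keep TSNE quick:

import holoviews as hv
hv.extension("bokeh")
from sklearn.datasets import load_digits
from sklearn.neighbors import KNeighborsClassifier

X, y = load_digits(return_X_y=True)
X, y = X[:300], y[:300]
y_pred = KNeighborsClassifier(n_neighbors=5).fit(X, y).predict(X)
plot = __plot_decision_boundaries(X, y, y_pred)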
Example #21
    def get_result(self):
        # file opener
        tkinter.Tk().withdraw()
        directory = filedialog.askdirectory()
        result = self.read_emails_from_directory(directory)

        train_labels = np.zeros(1430)
        train_labels[715:1430] = 1
        # This equates to 1-715 = HAM and 716-1430 = SPAM
        # If you change result[n] to something else, make sure you change
        # the same result below in line 251 (test_matrix)
        train_matrix = self.extract_features(directory, result[0])
        #print(train_matrix)
        # print("body words:", result[0])
        # print("\n\nsubject words:", result[1])
        # print("\n\nbody phrases:", result[2])
        # print("\n\nsubject phrases:", result[3])

        print("body words:", len(result[0]))
        print("subject words:", len(result[1]))
        print("body phrases:", len(result[2]))
        print("subject phrases:", len(result[3]))

        model1 = MultinomialNB()
        model2 = LinearSVC()
        model3 = RandomForestClassifier()
        model4 = KNeighborsClassifier()
        model1.fit(train_matrix, train_labels)
        model2.fit(train_matrix, train_labels)
        model3.fit(train_matrix, train_labels)
        model4.fit(train_matrix, train_labels)

        test_dir = filedialog.askdirectory()
        #                                       Here -----v
        test_matrix = self.extract_features(test_dir, result[0])
        test_labels = np.zeros(600)
        # This equates to 1-300 = HAM and 301-600 = SPAM
        test_labels[300:600] = 1
        result1 = model1.predict(test_matrix)
        result2 = model2.predict(test_matrix)
        result3 = model3.predict(test_matrix)
        result4 = model4.predict(test_matrix)

        print(confusion_matrix(test_labels, result1))
        print(confusion_matrix(test_labels, result2))
        print(confusion_matrix(test_labels, result3))
        print(confusion_matrix(test_labels, result4))
        return result
Example #22
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

def knn_builder():
    pip_knn = Pipeline([("selector", SelectKBest(chi2)), ("knn_clf", KNeighborsClassifier())])
    parameters_knn = {'selector__k': [20],
                      'knn_clf__n_neighbors': [1]}
    scorer_knn = make_scorer(accuracy_score)
    searcher_knn = GridSearchCV(pip_knn, parameters_knn, scoring=scorer_knn)
    return searcher_knn
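
A quick run of the builder; chi2 scoring requires non-negative features, so the digits data works directly:

from sklearn.datasets import load_digits

X, y = load_digits(return_X_y=True)
searcher = knn_builder()
searcher.fit(X, y)
print(searcher.best_score_, searcher.best_params_)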
Example #23
    def reduce_data(self, X, y):
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        classes = np.unique(y)
        self.classes_ = classes

        minority_class = self.pos_class
        if self.pos_class is None:
            minority_class = min(set(y), key=list(y).count)

        # loading initial groups
        self.groups = []
        for label in classes:
            mask = y == label
            self.groups = self.groups + [_Group(X[mask], label)]

        self._main_loop()
        self._generalization_step()
        min_groups = list(filter(lambda g: g.label == minority_class, self.groups))
        self._merge()
        self._pruning()
        max_groups = list(filter(lambda g: g.label != minority_class, self.groups))
        self.groups = min_groups + max_groups
        self.X_ = np.asarray([g.rep_x for g in self.groups])
        self.y_ = np.asarray([g.label for g in self.groups])
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)
        return self.X_, self.y_
Example #24
    def reduce_data(self, X, y):
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes

        if self.n_neighbors >= len(X):
            self.X_ = np.array(X)
            self.y_ = np.array(y)
            self.reduction_ = 0.0
            return self.X_, self.y_

        mask = np.zeros(y.size, dtype=bool)

        tmp_m = np.ones(y.size, dtype=bool)
        for i in range(y.size):
            tmp_m[i] = not tmp_m[i]
            self.classifier.fit(X[tmp_m], y[tmp_m])
            sample, label = X[i], y[i]

            if self.classifier.predict(sample) == [label]:
                mask[i] = not mask[i]

            tmp_m[i] = not tmp_m[i]

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
Example #25
    def __init__(self, estimator=KNeighborsClassifier(n_neighbors=10),
                 dimensionality_reduction=PCA(n_components=2),
                 acceptance_threshold=0.03,
                 n_decision_boundary_keypoints=60,
                 n_connecting_keypoints=None,
                 n_interpolated_keypoints=None,
                 n_generated_testpoints_per_keypoint=15,
                 linear_iteration_budget=100,
                 hypersphere_iteration_budget=300,
                 verbose=True):
        if acceptance_threshold == 0:
            raise Warning(
                "A nonzero acceptance threshold is strongly recommended so the optimizer can finish in finite time")
        if linear_iteration_budget < 2 or hypersphere_iteration_budget < 2:
            raise Exception("Invalid iteration budget")

        self.classifier = estimator
        self.dimensionality_reduction = dimensionality_reduction
        self.acceptance_threshold = acceptance_threshold

        if n_decision_boundary_keypoints and n_connecting_keypoints and n_interpolated_keypoints \
                and n_connecting_keypoints + n_interpolated_keypoints != n_decision_boundary_keypoints:
            raise Exception(
                "n_connecting_keypoints and n_interpolated_keypoints must sum to n_decision_boundary_keypoints (set them to None to use calculated suggestions)")

        self.n_connecting_keypoints = n_connecting_keypoints if n_connecting_keypoints is not None else n_decision_boundary_keypoints // 3
        self.n_interpolated_keypoints = n_interpolated_keypoints if n_interpolated_keypoints is not None else n_decision_boundary_keypoints * 2 // 3

        self.linear_iteration_budget = linear_iteration_budget
        self.n_generated_testpoints_per_keypoint = n_generated_testpoints_per_keypoint
        self.hypersphere_iteration_budget = hypersphere_iteration_budget
        self.verbose = verbose

        self.decision_boundary_points = []
        self.decision_boundary_points_2d = []
        self.X_testpoints = []
        self.y_testpoints = []
        self.background = []
        self.steps = 3

        self.hypersphere_max_retry_budget = 20
        self.penalties_enabled = True
        self.random_gap_selection = False
Example #26
    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self
Example #27
    def reduce_data(self, X, y):
        
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)

        prots_s = []
        labels_s = []

        classes = np.unique(y)
        self.classes_ = classes

        for cur_class in classes:
            mask = y == cur_class
            insts = X[mask]
            prots_s = prots_s + [insts[np.random.randint(0, insts.shape[0])]]
            labels_s = labels_s + [cur_class]


        self.classifier.fit(prots_s, labels_s)
        for sample, label in zip(X, y):
            if self.classifier.predict(sample) != [label]:
                prots_s = prots_s + [sample]
                labels_s = labels_s + [label]
                self.classifier.fit(prots_s, labels_s)
       
        self.X_ = np.asarray(prots_s)
        self.y_ = np.asarray(labels_s)
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)
        return self.X_, self.y_
Example #28
File: baseNew.py  Project: dvro/ml
    def predict(self, X, n_neighbors=1):
        """Perform classification on an array of test vectors X.

        The predicted class C for each sample in X is returned.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array, shape = [n_samples]

        Notes
        -----
        The default prediction uses KNeighborsClassifier; if the
        instance reduction algorithm is to be performed with another
        classifier, this method should be explicitly overwritten and
        the change explained in the documentation.
        """
        X = atleast2d_or_csr(X)
        if not hasattr(self, "X_") or self.X_ is None:
            raise AttributeError("Model has not been trained yet.")

        if not hasattr(self, "y_") or self.y_ is None:
            raise AttributeError("Model has not been trained yet.")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict(X)
Example #29
def get_gating(dss, tsf_name, use_gating=UseGating.TREE, *args, **kwargs):
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.neighbors import KNeighborsClassifier

    component_scale = [1, 0.2]
    # TODO this is specific to coordinate transform to slice just the body frame reaction force
    # input_slice = slice(3, None)
    input_slice = None

    if use_gating is UseGating.MLP:
        gating = gating_function.MLPSelector(dss, *args, **kwargs, name=tsf_name, input_slice=input_slice)
    elif use_gating is UseGating.KDE:
        gating = gating_function.KDESelector(dss, component_scale=component_scale, input_slice=input_slice)
    elif use_gating is UseGating.GMM:
        opts = {'n_components': 10, }
        if kwargs is not None:
            opts.update(kwargs)
        gating = gating_function.GMMSelector(dss, gmm_opts=opts, variational=True, component_scale=component_scale,
                                             input_slice=input_slice)
    elif use_gating is UseGating.TREE:
        gating = gating_function.SklearnClassifierSelector(dss, DecisionTreeClassifier(**kwargs),
                                                           input_slice=input_slice)
    elif use_gating is UseGating.FORCE:
        gating = gating_function.ReactionForceHeuristicSelector(12, slice(3, None))
    elif use_gating is UseGating.MLP_SKLEARN:
        gating = gating_function.SklearnClassifierSelector(dss, MLPClassifier(**kwargs), input_slice=input_slice)
    elif use_gating is UseGating.KNN:
        gating = gating_function.SklearnClassifierSelector(dss, KNeighborsClassifier(n_neighbors=1, **kwargs),
                                                           input_slice=input_slice)
    else:
        raise RuntimeError("Unrecognized selector option")
    return gating
Example #30
    def index_nearest_neighbor(self, S, X, y):
        classifier = KNeighborsClassifier(n_neighbors=1)

        U = []
        S_mask = np.array(S, dtype=bool, copy=True)
        indexs = np.asarray(range(len(y)))[S_mask]
        X_tra, y_tra = X[S_mask], y[S_mask]

        for i in range(len(y)):
            real_indexes = np.asarray(range(len(y)))[S_mask]
            X_tra, y_tra = X[S_mask], y[S_mask]
            #print len(X_tra), len(y_tra)
            classifier.fit(X_tra, y_tra)
            [[index]] = classifier.kneighbors(X[i], return_distance=False)
            U = U + [real_indexes[index]]

        return U
Example #32
    def __init__(self,
                 n_neighbors=1,
                 alpha=0.6,
                 max_loop=1000,
                 threshold=0,
                 chromosomes_count=10):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.max_loop = max_loop
        self.threshold = threshold
        self.chromosomes_count = chromosomes_count

        self.evaluations = None
        self.chromosomes = None

        self.best_chromosome_ac = -1
        self.best_chromosome_rd = -1

        self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
Example #33
    def __init__(self, n_neighbors=5, weights='uniform', algorithm='auto',
                 leaf_size=30, p=2, metric='minkowski', metric_params=None,
                 n_jobs=None):
        self._hyperparams = {
            'n_neighbors': n_neighbors,
            'weights': weights,
            'algorithm': algorithm,
            'leaf_size': leaf_size,
            'p': p,
            'metric': metric,
            'metric_params': metric_params,
            'n_jobs': n_jobs}
        self._wrapped_model = Op(**self._hyperparams)
Example #34
    def __init__(self, Xval, yval, K=5, weighted=False, knn=None):
        self.Xval = Xval
        self.yval = yval
        self.K = K

        if knn is None:
            self.knn = KNeighborsClassifier(n_neighbors=K, algorithm='brute')
        else:
            self.knn = knn

        self.knn.fit(Xval, yval)
        self.weighted = weighted
Example #35
def run_knn_multi_level_classifier(train, train_labels):
    k_range = list(range(2, 5))
    k_scores = []
    for k in k_range:
        knn = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knn,
                                 train,
                                 train_labels,
                                 cv=10,
                                 scoring='accuracy')
        k_scores.append(scores.mean())
    return k_scores
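
A minimal run, assuming KNeighborsClassifier and cross_val_score are imported as in the snippet:

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
scores = run_knn_multi_level_classifier(X, y)
for k, s in zip(range(2, 5), scores):
    print(k, round(s, 3))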
Example #36
    def setclassifier(self, estimator=KNeighborsClassifier(n_neighbors=10)):
        """Assign classifier for which decision boundary should be plotted.

        Parameters
        ----------
        estimator : BaseEstimator instance, optional (default=KNeighborsClassifier(n_neighbors=10)).
            Classifier for which the decision boundary should be plotted. Must have
            probability estimates enabled (i.e. estimator.predict_proba must work).
            Make sure it is possible for probability estimates to get close to 0.5
            (more specifically, as close as specified by acceptance_threshold).
        """
        self.classifier = estimator
Example #37
def compute_cnn(X, y):
  """Condensed nearest neighbor: CNN removes redundant instances,
  keeping the samples on the decision boundaries."""

  classifier = KNeighborsClassifier(n_neighbors=3)

  prots_s = []
  labels_s = []

  classes = np.unique(y)
  classes_ = classes

  for cur_class in classes:
    mask = y == cur_class
    insts = X[mask]
    prots_s = prots_s + [insts[np.random.randint(0, insts.shape[0])]]
    labels_s = labels_s + [cur_class]
    
  classifier.fit(prots_s, labels_s)
  for sample, label in zip(X, y):
    if classifier.predict(sample) != [label]:
      prots_s = prots_s + [sample]
      labels_s = labels_s + [label]
      classifier.fit(prots_s, labels_s)

  X_ = np.asarray(prots_s)
  y_ = np.asarray(labels_s)
  reduction_ = 1.0 - float(len(y_)) / len(y)
  print(reduction_)
Example #38
def build_and_test_model(classifier, X, Y, Z, param):

    accuracies = []
    ari = []

    for train, test in LeaveOneOut().split(X):

        X_train, Y_train = X[train], Y[train]
        X_test, Y_test, Z_test = X[test], Y[test], Z[test]
        predicted = None

        if classifier == "KNN":
            neigh = KNeighborsClassifier(n_neighbors=param).fit(
                X_train, Y_train)
            predicted = neigh.predict(X_test)

        elif classifier == "RF":
            clf = RandomForestClassifier(n_estimators=param,
                                         random_state=0)  # ,max_depth=2,
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test)

        elif classifier == "SVM":
            clf = svm.SVC(gamma='scale')
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test).astype(int)

        elif classifier == "NAIVE":
            clf = GaussianNB()
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test).astype(int)

        elif classifier == "RANDOM":
            options = list(set(Y_train))
            predicted = [random.choice(options) for _ in range(len(Y_test))]

        accuracies.append(metrics.accuracy_score(Y_test, predicted))
        ari.append(metrics.adjusted_rand_score(Z_test, predicted))

    return np.mean(accuracies), np.std(accuracies), np.mean(ari), np.std(ari)
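
A leave-one-out smoke test of the dispatcher above. Z carries the labels compared via adjusted Rand index, so passing y twice is a reasonable stand-in; assumes the snippet's imports (numpy as np, random, sklearn metrics and classifiers, LeaveOneOut) are in scope. A subsample keeps the per-sample refits cheap:

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
X, y = X[::5], y[::5]  # 30 samples -> 30 leave-one-out fits
acc_mean, acc_std, ari_mean, ari_std = build_and_test_model("KNN", X, y, y, param=3)
print(acc_mean, ari_mean)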
Example #39
class PatchedRawModel:
    def __init__(self):
        self.baseModel = RawModel()
        self.model49 = KNeighborsClassifier(n_neighbors=10)
        self.model35 = KNeighborsClassifier(n_neighbors=10)
    
    def fit(self, trainExamples):
        self.baseModel.fit(trainExamples)

        X49 = vstack ( [reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in trainExamples if x.Y in [4, 9]] )
        Y49 = [x.Y for x in trainExamples if x.Y in [4, 9]]
        self.model49.fit(X49, Y49)

        X35 = vstack ( [reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in trainExamples if x.Y in [3, 5]] )
        Y35 = [x.Y for x in trainExamples if x.Y in [3, 5]]
        self.model35.fit(X35, Y35)

    def predict(self, examples):
        basePredictions = self.baseModel.predict(examples)

        for (x, y, i) in zip(examples, basePredictions, range(0, len(examples))):
            if y in [4, 9]:
                specializedPrediction = self.model49.predict(reshape(x.X, (1, x.WIDTH * x.HEIGHT)))
                if specializedPrediction != y:
                    basePredictions[i] = specializedPrediction
            elif y in [3, 5]:
                specializedPrediction = self.model35.predict(reshape(x.X, (1, x.WIDTH * x.HEIGHT)))
                if specializedPrediction != y:
                    basePredictions[i] = specializedPrediction

        return basePredictions
Example #40
def compute_enn(X, y):
  """
  the edited nearest neighbors removes the instances in the boundaries, maintaining reduntant samples
  """

  classifier = KNeighborsClassifier(n_neighbors=3)

  classes = np.unique(y)
  classes_ = classes

  mask = np.zeros(y.size, dtype=bool)
  classifier.fit(X, y)

  for i in range(y.size):
    sample, label = X[i], y[i]
    if classifier.predict(sample) == [label]:
      mask[i] = not mask[i]

  X_ = np.asarray(X[mask])
  y_ = np.asarray(y[mask])
  reduction_ = 1.0 - float(len(y_)) / len(y)
  print(reduction_)
Example #41
    def __init__(self, n_neighbors=1, alpha=0.6, max_loop=1000, threshold=0, chromosomes_count=10):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.max_loop = max_loop
        self.threshold = threshold
        self.chromosomes_count = chromosomes_count

        self.evaluations = None
        self.chromosomes = None

        self.best_chromosome_ac = -1
        self.best_chromosome_rd = -1

        self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
Example #42
    def __init__(self):
        # 2015-05-15 GEL Found that n_components=20 gives a nice balance of 
        # speed (substantial improvement), accuracy, and reduced memory usage 
        # (25% decrease).
        self.decomposer = TruncatedSVD(n_components=20)

        # 2015-05-15 GEL algorithm='ball_tree' uses less memory on average than 
        # algorithm='kd_tree'
        
        # 2015-05-15 GEL Evaluation of metrics by accuracy (based on 8000 training examples)
        # euclidean        0.950025
        # manhattan        0.933533
        # chebyshev        0.675662
        # hamming          0.708646
        # canberra         0.934033
        # braycurtis       0.940530
        self.model = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', metric='euclidean')
Example #43
    def reduce_data(self, X, y):
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors, algorithm='brute')
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes
        self.classifier.fit(X, y)
        nn_idx = self.classifier.kneighbors(X, n_neighbors=2, return_distance=False)
        nn_idx = nn_idx.T[1]

        # mutual 1-NN pairs with different labels form Tomek links; drop them
        mask = [nn_idx[nn_idx[index]] == index and y[index] != y[nn_idx[index]]
                for index in range(nn_idx.shape[0])]
        mask = ~np.asarray(mask)
        if self.keep_class is not None and self.keep_class in self.classes_:
            mask[y == self.keep_class] = True

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)

        return self.X_, self.y_
Example #44
    def __init__(self):
        self.baseModel = RawModel()
        self.model49 = KNeighborsClassifier(n_neighbors=10)
        self.model35 = KNeighborsClassifier(n_neighbors=10)
Example #45
class ENN(InstanceReductionMixin):

    """Edited Nearest Neighbors.

    The Edited Nearest Neighbors  removes the instances in de 
    boundaries, maintaining redudant samples.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentage of reduction.

    Examples
    --------
    >>> from protopy.selection.enn import ENN
    >>> import numpy as np
    >>> X = np.array([[-1, 0], [-0.8, 1], [-0.8, -1], [-0.5, 0] , [0.5, 0], [1, 0], [0.8, 1], [0.8, -1]])
    >>> y = np.array([1, 1, 1, 2, 1, 2, 2, 2])
    >>> editednn = ENN()
    >>> editednn.fit(X, y)
    ENN(n_neighbors=3)
    >>> print(editednn.predict([[-0.6, 0.6]]))
    [1]
    >>> print(editednn.reduction_)
    0.75

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Ruiqin Chang, Zheng Pei, and Chao Zhang. A modified editing k-nearest
    neighbor rule. JCP, 6(7):1493–1500, 2011.

    """

    def __init__(self, n_neighbors=3):
        self.n_neighbors = n_neighbors
        self.classifier = None


    def reduce_data(self, X, y):
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes

        if self.n_neighbors >= len(X):
            self.X_ = np.array(X)
            self.y_ = np.array(y)
            self.reduction_ = 0.0
            return self.X_, self.y_

        mask = np.zeros(y.size, dtype=bool)

        tmp_m = np.ones(y.size, dtype=bool)
        for i in range(y.size):
            tmp_m[i] = not tmp_m[i]
            self.classifier.fit(X[tmp_m], y[tmp_m])
            sample, label = X[i], y[i]

            if self.classifier.predict(sample) == [label]:
                mask[i] = not mask[i]

            tmp_m[i] = not tmp_m[i]

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
Example #46
class SSMA(InstanceReductionMixin):
    """Steady State Memetic Algorithm

    The Steady-State Memetic Algorithm is an evolutionary prototype
    selection algorithm. It uses a memetic algorithm in order to 
    perform a local search in the code.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    alpha   : float (default = 0.6)
        Parameter that weights the fitness function.

    max_loop    : int (default = 1000)
        Number of maximum loops performed by the algorithm.

    threshold   : int (default = 0)
        Threshold that regulates the substitution condition.

    chromosomes_count: int (default = 10)
        number of chromosomes used to find the optimal solution.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentage of reduction.

    Examples
    --------
    >>> from protopy.selection.ssma import SSMA
    >>> import numpy as np
    >>> X = np.array([[i] for i in range(100)])
    >>> y = np.asarray(50 * [0] + 50 * [1])
    >>> ssma = SSMA()
    >>> ssma.fit(X, y)
    SSMA(alpha=0.6, chromosomes_count=10, max_loop=1000, threshold=0)
    >>> print(ssma.predict([[40],[60]]))
    [0 1]
    >>> print(ssma.reduction_)
    0.98

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Joaquín Derrac, Salvador García, and Francisco Herrera. Stratified prototype
    selection based on a steady-state memetic algorithm: a study of scalability.
    Memetic Computing, 2(3):183–199, 2010.

    """
    def __init__(self, n_neighbors=1, alpha=0.6, max_loop=1000, threshold=0, chromosomes_count=10):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.max_loop = max_loop
        self.threshold = threshold
        self.chromosomes_count = chromosomes_count

        self.evaluations = None
        self.chromosomes = None

        self.best_chromosome_ac = -1
        self.best_chromosome_rd = -1

        self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)


    def accuracy(self, chromosome, X, y):
        mask = np.asarray(chromosome, dtype=bool)
        cX, cy = X[mask], y[mask]
        #print len(cX), len(cy), sum(chromosome)

        self.classifier.fit(cX, cy)
        labels = self.classifier.predict(X)
        accuracy = (labels == y).sum()

        return float(accuracy)/len(y)


    def fitness(self, chromosome, X, y):
        #TODO add the possibility of use AUC for factor1
        ac = self.accuracy(chromosome, X, y)
        rd = 1.0 - (float(sum(chromosome))/len(chromosome))

        return self.alpha * ac + (1.0 - self.alpha) * rd


    def fitness_gain(self, gain, n):
        return self.alpha * (float(gain)/n) + (1 - self.alpha) * (1.0 / n)


    def update_threshold(self, X, y):
        best_index = np.argmax(self.evaluations)
        chromosome = self.chromosomes[best_index]

        best_ac = self.accuracy(chromosome, X, y)
        best_rd = 1.0 - float(sum(chromosome))/len(y)

        if best_ac <= self.best_chromosome_ac:
            self.threshold = self.threshold + 1
        if best_rd <= self.best_chromosome_rd:
            self.threshold = self.threshold - 1

        self.best_chromosome_ac = best_ac
        self.best_chromosome_rd = best_rd


    def index_nearest_neighbor(self, S, X, y):
        classifier = KNeighborsClassifier(n_neighbors=1)

        U = []
        S_mask = np.array(S, dtype=bool, copy=True)
        indexs = np.asarray(range(len(y)))[S_mask]
        X_tra, y_tra = X[S_mask], y[S_mask]

        for i in range(len(y)):
            real_indexes = np.asarray(range(len(y)))[S_mask]
            X_tra, y_tra = X[S_mask], y[S_mask]
            #print len(X_tra), len(y_tra)
            classifier.fit(X_tra, y_tra)
            [[index]] = classifier.kneighbors(X[i], return_distance=False)
            U = U + [real_indexes[index]]

        return U
            

    def memetic_looper(self, S, R):
        c = 0
        for i in range(len(S)):
            if S[i] == 1 and i not in R:
                c = c + 1
                if c == 2:
                    return True

        return False

    def memetic_select_j(self, S, R):
        indexs = []
        for i in range(len(S)):
            if i not in R and S[i] == 1:
                indexs.append(i)
        # if the list is empty this will raise an error
        return np.random.choice(indexs)


    def generate_population(self, X, y):
        self.chromosomes = [[np.random.choice([0,1]) for i in range(len(y))]
                            for c in range(self.chromosomes_count)]
        self.evaluations = [self.fitness(c, X, y) for c in self.chromosomes]

        self.update_threshold(X, y)
        

    def select_parents(self, X, y):
        parents = []
        for i in range(2):
            samples = random.sample(self.chromosomes, 2)
            parents = parents + [samples[0] if self.fitness(samples[0], X, y) >
                                    self.fitness(samples[1], X, y) else samples[1]]
        return np.array(parents, copy=True)

    def crossover(self, parent_1, parent_2):
        size = len(parent_1)
        mask = [0] * (size // 2) + [1] * (size - size // 2)
        mask = np.asarray(mask, dtype=bool)
        np.random.shuffle(mask)

        off_1 = parent_1 * mask + parent_2 * ~mask
        off_2 = parent_2 * mask + parent_1 * ~mask
        
        return np.asarray([off_1, off_2])
        

    def mutation(self, offspring):
        for i in range(len(offspring)):
            if np.random.uniform(0,1) < 1.0/len(offspring):
                offspring[i] = not offspring[i]

        return offspring

    def memetic_search(self, chromosome, X, y, chromosome_fitness=None):
        S = np.array(chromosome, copy=True)
        if S.sum() == 0:
            return S, 0

        if chromosome_fitness is None:
            chromosome_fitness = self.fitness(chromosome, X, y)
        fitness_s = chromosome_fitness

        # List of visited genes in S 
        R = []
        # let U = {u0, u1, ..., un} list where ui = classifier(si,S)/i
        U = self.index_nearest_neighbor(S, X, y)
        
        while self.memetic_looper(S, R):
            j = self.memetic_select_j(S, R) 
            S[j] = 0
            gain = 0.0
            U_copy = list(U)
            mask = np.asarray(S, dtype=bool)
            X_tra, y_tra = X[mask], y[mask]
            real_idx = np.asarray(range(len(y)))[mask]

            if len(y_tra) > 0:
                for i in range(len(U)):
                    if U[i] == j:
                        self.classifier.fit(X_tra, y_tra)
                        [[idx]] = self.classifier.kneighbors(X[i], n_neighbors=1,
                                return_distance=False)
                        U[i] = real_idx[idx]
                        
                        if y[i] == y[U_copy[i]] and y[i] != y[U[i]]:
                            gain = gain - 1.0
                        if y[i] != y[U_copy[i]] and y[i] == y[U[i]]:
                            gain = gain + 1.0
                
            if gain >= self.threshold:
                n = S.sum()
                g = self.fitness_gain(gain, n)
                fitness_s = fitness_s + g
                R = []
            else:
                U = U_copy
                S[j] = 1
                R.append(j)

        return list(S), fitness_s

                    


    def main_loop(self, X, y):
        self.generate_population(X, y)
        n, worse_fit_index = 0, -1
        while (n < self.max_loop):
            parents = self.select_parents(X, y)
            offspring = self.crossover(parents[0], parents[1])
            offspring[0] = self.mutation(offspring[0])
            offspring[1] = self.mutation(offspring[1])

            fit_offs = [self.fitness(off, X, y) if sum(off) > 0 else -1 for off in offspring]
            
            if worse_fit_index == -1:
                worse_fit_index = np.argmin(self.evaluations)

            
            for i in range(len(offspring)):
                p_ls = 1.0 

                if fit_offs[i] == -1:
                    p_ls = -1

                if fit_offs[i] <= self.evaluations[worse_fit_index]:
                    p_ls = 0.0625

                if np.random.uniform(0,1) < p_ls:

                    offspring[i], fit_offs[i] = self.memetic_search(offspring[i], X, y, chromosome_fitness = fit_offs[i])

            for i in range(len(offspring)):
                if fit_offs[i] > self.evaluations[worse_fit_index]:
                    self.chromosomes[worse_fit_index] = offspring[i]
                    self.evaluations[worse_fit_index] = fit_offs[i]

                    worse_fit_index = np.argmin(self.evaluations)

            n = n + 1
            if n % 10 == 0:
                self.update_threshold(X, y)


    def reduce_data(self, X, y):
        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes

        self.main_loop(X, y)

        best_index = np.argmax(self.evaluations)
        mask = np.asarray(self.chromosomes[best_index], dtype=bool)
        self.X_ = X[mask]
        self.y_ = y[mask]
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)

        return self.X_, self.y_
Example #47
File: baseNew.py  Project: dvro/ml
class InstanceReductionMixin(InstanceReductionBase, ClassifierMixin):

    """Mixin class for all instance reduction techniques"""


    def set_classifier(self, classifier):
        """Sets the classifier to be used in the instance reduction process
            and classification.

        Parameters
        ----------
        classifier : classifier, following the KNeighborsClassifier style
            (default = KNN)
        """

        self.classifier = classifier


    def reduce_data(self, X, y):
        """Perform the instance reduction procedure on the given training data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training set.

        y : array-like, shape = [n_samples]
            Labels for X.

        Returns
        -------
        X_ : array-like, shape = [indeterminate, n_features]
            Resulting training set.

        y_ : array-like, shape = [indeterminate]
            Labels for X_
        """
        pass
    
    def get_prototypes(self):
        return self.X_, self.y_

    def fit(self, X, y, reduce_data=True):
        """
        Fit the InstanceReduction model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
            Note that centroid shrinking cannot be used with sparse matrices.
        y : array, shape = [n_samples]
            Target values (integers)
        reduce_data : bool, flag indicating whether the reduction should be performed
        """
        self.X = X
        self.y = y
        self.labels = set(y)
        self.prototypes = None
        self.prototypes_labels = None
        self.reduction_ratio = 0.0

        if reduce_data:
            self.reduce_data(X, y)

        return self

    def predict(self, X, n_neighbors=1):
        """Perform classification on an array of test vectors X.

        The predicted class C for each sample in X is returned.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array, shape = [n_samples]

        Notes
        -----
        The default prediction uses KNeighborsClassifier; if the
        instance reduction algorithm is to be used with another
        classifier, this method should be explicitly overridden
        and documented.
        """
        X = check_array(X, accept_sparse="csr")
        if not hasattr(self, "X_") or self.X_ is None:
            raise AttributeError("Model has not been trained yet.")

        if not hasattr(self, "y_") or self.y_ is None:
            raise AttributeError("Model has not been trained yet.")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict(X)


    def predict_proba(self, X):
        """Return probability estimates for the test data X.
        after a given prototype selection algorithm.
    
        Parameters
        ----------
        X : array, shape = (n_samples, n_features)
            A 2-D array representing the test points.
        
        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
        of such arrays if n_outputs > 1.
        The class probabilities of the input samples. Classes are ordered
        by lexicographic order.
        """
        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict_proba(X)
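
# For orientation, a minimal sketch of how a concrete technique plugs into the
# mixin above: a hypothetical RandomSelector (illustration only, not part of
# the project) that keeps a random half of the data and inherits fit/predict.
# It assumes InstanceReductionBase requires no constructor arguments.
import numpy as np

class RandomSelector(InstanceReductionMixin):

    def __init__(self, n_neighbors=1):
        self.n_neighbors = n_neighbors
        self.classifier = None

    def reduce_data(self, X, y):
        mask = np.random.rand(len(y)) < 0.5
        mask[0] = True  # guard against an empty prototype set
        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
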
class TomekLinks(InstanceReductionMixin):

    """Tomek Links.

    The Tomek Links algorithm removes pairs of instances that
    form a Tomek Link. This technique removes instances that
    lie near the decision boundary.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default for classification only.
        The reduction step always uses n_neighbors=1.

    keep_class : int, optional (default = None)
        Label of the class that must not be removed in the Tomek links.
        If None, instances on both sides of each link are removed.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminate, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminate]
        Labels of the selected prototypes.

    `reduction_` : float, percentage of reduction.

    Examples
    --------

    >>> from protopy.selection.tomek_links import TomekLinks
    >>> import numpy as np
    >>> X = np.array([[0],[1],[2.1],[2.9],[4],[5],[6],[7.1],[7.9],[9]])
    >>> y = np.array([1,1,2,1,2,2,2,1,2,2])
    >>> tl = TomekLinks()
    >>> tl.fit(X, y)
    TomekLinks(keep_class=None)
    >>> print(tl.predict([[2.5],[7.5]]))
    [1, 2]
    >>> print(tl.reduction_)
    0.4

    See also
    --------
    protopy.selection.enn.ENN: edited nearest neighbor

    References
    ----------
    I. Tomek, “Two modifications of cnn,” IEEE Transactions on Systems,
    Man and Cybernetics, vol. SMC-6, pp. 769–772, 1976.

    """

    def __init__(self, n_neighbors=3, keep_class=None):
        self.n_neighbors = n_neighbors
        self.classifier = None
        self.keep_class = keep_class


    def reduce_data(self, X, y):
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors, algorithm='brute')
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes
        self.classifier.fit(X, y)
        nn_idx = self.classifier.kneighbors(X, n_neighbors=2, return_distance=False)
        nn_idx = nn_idx.T[1]  # each sample's nearest neighbor, excluding itself

        # a Tomek link: two samples that are each other's nearest neighbor
        # but carry different labels
        mask = [nn_idx[nn_idx[index]] == index and y[index] != y[nn_idx[index]] for index in range(nn_idx.shape[0])]
        mask = ~np.asarray(mask)
        if self.keep_class is not None and self.keep_class in self.classes_:
            mask[y == self.keep_class] = True

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)

        return self.X_, self.y_
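# Usage note (hedged): with the docstring data above, the two Tomek links are
# (2.1, 2.9) and (7.1, 7.9). Passing keep_class=2 keeps the class-2 ends
# (2.1 and 7.9), so only the class-1 ends are dropped and the reduction
# should fall from 0.4 to 0.2.
import numpy as np

X = np.array([[0], [1], [2.1], [2.9], [4], [5], [6], [7.1], [7.9], [9]])
y = np.array([1, 1, 2, 1, 2, 2, 2, 1, 2, 2])
tl = TomekLinks(keep_class=2)
tl.fit(X, y)
print(tl.reduction_)  # expected: 0.2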
Example #49
0
class SGP2(SGP):
    """Self-Generating Prototypes 2

    The Self-Generating Prototypes 2 is the second version of the
    Self-Generating Prototypes algorithm.
    It has a higher generalization power, including the procedures
    merge and pruning.

    Parameters
    ----------
    r_min: float, optional (default = 0.0)
        Determines the minimum size of a cluster, in [0.00, 0.20]

    r_mis: float, optional (default = 0.0)
        Determines the error tolerance before splitting a group

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminate, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminate]
        Labels of the selected prototypes.

    `reduction_` : float, percentage of reduction.

    Examples
    --------
    >>> from protopy.generation.sgp import SGP2
    >>> import numpy as np
    >>> X = [np.asarray(range(1,13)) + np.asarray([0.1,0,-0.1,0.1,0,-0.1,0.1,-0.1,0.1,-0.1,0.1,-0.1])]
    >>> X = np.asarray(X).T
    >>> y = np.array([1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1])
    >>> sgp2 = SGP2()
    >>> sgp2.fit(X, y)
    SGP2(r_min=0.0, r_mis=0.0)
    >>> print(sgp2.reduction_)
    0.5

    See also
    --------
    protopy.generation.SGP: self-generating prototypes
    protopy.generation.sgp.ASGP: adaptive self-generating prototypes

    References
    ----------
    Hatem A. Fayed, Sherif R Hashem, and Amir F Atiya. Self-generating prototypes
    for pattern classification. Pattern Recognition, 40(5):1498–1509, 2007.
    """
    def __init__(self, r_min=0.0, r_mis=0.0):
        self.r_min = r_min
        self.r_mis = r_mis
        self.n_neighbors = 1
        self.classifier = None
        self.groups = None


    def reduce_data(self, X, y):
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        classes = np.unique(y)
        self.classes_ = classes

        # load the initial groups, one per class
        self.groups = []
        for label in classes:
            mask = y == label
            self.groups = self.groups + [_Group(X[mask], label)]

        self._main_loop()
        self._generalization_step()
        self._merge()
        self._pruning()
        self.X_ = np.asarray([g.rep_x for g in self.groups])
        self.y_ = np.asarray([g.label for g in self.groups])
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)
        return self.X_, self.y_


    def _merge(self):

        if len(self.groups) < 2:
            return self.groups

        merged = False
        for group in self.groups:
            reps_x = np.asarray([g.rep_x for g in self.groups])
            reps_y = np.asarray([g.label for g in self.groups])
            self.classifier.fit(reps_x, reps_y)

            nn2_idx = self.classifier.kneighbors(group.X, n_neighbors=2, return_distance=False)
            nn2_idx = nn2_idx.T[1]

            # could use a threshold
            if len(set(nn2_idx)) == 1 and reps_y[nn2_idx[0]] == group.label:
                ng_group = self.groups[nn2_idx[0]]
                ng2_idx = self.classifier.kneighbors(ng_group.X, n_neighbors=2, return_distance=False)
                ng2_idx = ng2_idx.T[1]
                if len(set(ng2_idx)) == 1 and self.groups[ng2_idx[0]] == group:
                    group.add_instances(ng_group.X, update=True)
                    self.groups.remove(ng_group)
                    merged = True
                
        if merged:
            self._merge()

        return self.groups


    def _pruning(self):

        if len(self.groups) < 2:
            return self.groups

        pruned, fst = False, True
        knn = KNeighborsClassifier(n_neighbors=1, algorithm='brute')
        
        while pruned or fst:
            index = 0
            pruned, fst = False, False

            while index < len(self.groups):
                group = self.groups[index]

                mask = np.ones(len(self.groups), dtype=bool)
                mask[index] = False
                reps_x = np.asarray([g.rep_x for g in self.groups])[mask]
                reps_y = np.asarray([g.label for g in self.groups])[mask]
                labels = knn.fit(reps_x, reps_y).predict(group.X)

                if (labels == group.label).all():
                    self.groups.remove(group)
                    pruned = True
                else:
                    index = index + 1

                if len(self.groups) == 1:
                    index = len(self.groups)
                    pruned = False

        return self.groups
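# The _Group objects above come from elsewhere in the project; as a hedged
# stand-in for readers, a minimal sketch assuming a centroid representative
# (rep_x) per group, which matches how SGP2 consumes it above. The centroid
# choice is an assumption, not the project's verified definition.
import numpy as np

class _GroupSketch(object):

    def __init__(self, X, label):
        self.X = np.asarray(X)
        self.label = label
        self.rep_x = self.X.mean(axis=0)  # prototype = group centroid (assumption)

    def add_instances(self, X_new, update=False):
        self.X = np.vstack([self.X, X_new])
        if update:
            self.rep_x = self.X.mean(axis=0)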
Example #50
0
class CNN(InstanceReductionMixin):
    """Condensed Nearest Neighbors.

    Each class is represented by a set of prototypes, with test samples
    classified to the class with the nearest prototype.
    The Condensed Nearest Neighbors removes the redundant instances,
    maintaining the samples in the decision boundaries.

    Parameters
    ----------
    n_neighbors : int, optional (default = 1)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminate, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminate]
        Labels of the selected prototypes.

    `reduction_` : float, percentage of reduction.

    Examples
    --------
    >>> from protopy.selection.cnn import CNN
    >>> import numpy as np
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([1, 1, 1, 2, 2, 2])
    >>> cnn = CNN()
    >>> cnn.fit(X, y)
    CNN(n_neighbors=1)
    >>> print(cnn.predict([[-0.8, -1]]))
    [1]

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    Notes
    -----
    The Condensed Nearest Neighbor is one of the first prototype selection
    techniques in the literature.

    References
    ----------
    P. E. Hart, The condensed nearest neighbor rule, IEEE Transactions on 
    Information Theory 14 (1968) 515–516.

    """

    def __init__(self, n_neighbors=1):
        self.n_neighbors = n_neighbors
        self.classifier = None

    def reduce_data(self, X, y):
        
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)

        prots_s = []
        labels_s = []

        classes = np.unique(y)
        self.classes_ = classes

        # seed the prototype set with one random instance per class
        for cur_class in classes:
            mask = y == cur_class
            insts = X[mask]
            prots_s = prots_s + [insts[np.random.randint(0, insts.shape[0])]]
            labels_s = labels_s + [cur_class]

        # absorb every sample that the current prototypes misclassify
        self.classifier.fit(prots_s, labels_s)
        for sample, label in zip(X, y):
            if self.classifier.predict([sample])[0] != label:
                prots_s = prots_s + [sample]
                labels_s = labels_s + [label]
                self.classifier.fit(prots_s, labels_s)

        self.X_ = np.asarray(prots_s)
        self.y_ = np.asarray(labels_s)
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)
        return self.X_, self.y_
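# Note: reduce_data above makes a single absorption pass; Hart's original rule
# iterates until no instance is added. A hedged standalone sketch of the
# iterated variant (the function name is ours, not part of the project):
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

def cnn_multi_pass(X, y):
    # seed with the first instance of each class, then absorb until stable
    prots = [X[y == c][0] for c in np.unique(y)]
    labels = [c for c in np.unique(y)]
    clf = KNeighborsClassifier(n_neighbors=1)
    changed = True
    while changed:
        changed = False
        clf.fit(prots, labels)
        for sample, label in zip(X, y):
            if clf.predict([sample])[0] != label:
                prots.append(sample)
                labels.append(label)
                clf.fit(prots, labels)
                changed = True
    return np.asarray(prots), np.asarray(labels)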
def nearest_fit(X, y):
    # weights must be passed by keyword in current scikit-learn
    clf = KNeighborsClassifier(n_neighbors=7, weights='distance')
    return clf.fit(X, y)
Example #52
0
def knn_score(X, y, neighbors):
    knn = KNeighborsClassifier(n_neighbors=neighbors)
    knn.fit(X, y)
    y_pred = knn.predict(X)
    print("KNN{} accuracy_score: {}".format(neighbors,
                                            metrics.accuracy_score(y, y_pred)))
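# Caveat: knn_score evaluates on the training data itself, so small k is
# optimistically biased (k=1 scores near 1.0 by construction). A hedged
# cross-validated variant (the function name is ours):
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

def knn_cv_score(X, y, neighbors, folds=5):
    knn = KNeighborsClassifier(n_neighbors=neighbors)
    scores = cross_val_score(knn, X, y, cv=folds)  # k-fold CV estimate
    print("KNN{} cv accuracy: {:.3f} +/- {:.3f}".format(
        neighbors, scores.mean(), scores.std()))
    return scores.mean()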