Example #1
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


def do(train_data, train_label, test_data, test_label=None, adjust_parameters=True, k=5):
    train_data = np.array(train_data).squeeze()
    train_label = np.array(train_label).squeeze()
    test_data = np.array(test_data).squeeze()
    if test_label is not None:
        test_label = np.array(test_label).squeeze()
    if not adjust_parameters:
        knn = KNeighborsClassifier(n_neighbors=k, n_jobs=8)
        knn.fit(train_data, train_label)
        predicted_label = knn.predict(test_data)
        if test_label is not None:
            acc = accuracy_score(test_label, predicted_label)
            print('acc is', acc)
        return predicted_label
    else:
        # grid-search k in [1, 10]; this branch requires test_label for scoring
        max_acc = 0.0
        max_k = 0
        max_predicted = None
        for k in range(1, 11):
            knn = KNeighborsClassifier(n_neighbors=k, n_jobs=8)
            knn.fit(train_data, train_label)
            predicted_label = knn.predict(test_data)
            acc = accuracy_score(test_label, predicted_label)
            if acc > max_acc:
                max_acc = acc
                max_k = k
                max_predicted = predicted_label
            print('k =', k, 'acc is', acc)
        print('max acc is', max_acc, 'corresponding k is', max_k)
        return max_predicted, max_k
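A minimal usage sketch for do() on synthetic data (the dataset and split below are illustrative assumptions, not part of the original example):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

pred = do(X_tr, y_tr, X_te, y_te, adjust_parameters=False, k=3)  # fixed k
pred, best_k = do(X_tr, y_tr, X_te, y_te)  # searches k = 1..10, needs test labels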
Example #2
class PatchedRawModel:
    def __init__(self):
        self.baseModel = RawModel()
        self.model49 = KNeighborsClassifier(n_neighbors=10)
        self.model35 = KNeighborsClassifier(n_neighbors=10)
    
    def fit(self, trainExamples):
        self.baseModel.fit(trainExamples)

        X49 = vstack([reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in trainExamples if x.Y in [4, 9]])
        Y49 = [x.Y for x in trainExamples if x.Y in [4, 9]]
        self.model49.fit(X49, Y49)

        X35 = vstack([reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in trainExamples if x.Y in [3, 5]])
        Y35 = [x.Y for x in trainExamples if x.Y in [3, 5]]
        self.model35.fit(X35, Y35)

    def predict(self, examples):
        basePredictions = self.baseModel.predict(examples)

        for i, (x, y) in enumerate(zip(examples, basePredictions)):
            # re-check the commonly confused digit pairs with the specialized models
            if y in [4, 9]:
                specializedPrediction = self.model49.predict(reshape(x.X, (1, x.WIDTH * x.HEIGHT)))[0]
                if specializedPrediction != y:
                    basePredictions[i] = specializedPrediction
            elif y in [3, 5]:
                specializedPrediction = self.model35.predict(reshape(x.X, (1, x.WIDTH * x.HEIGHT)))[0]
                if specializedPrediction != y:
                    basePredictions[i] = specializedPrediction

        return basePredictions
Example #3
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier


def plotDecisionBoundry(X, y, y_predicted, modelName):

    X_Train_embedded = TSNE(n_components=2).fit_transform(X)
    print(X_Train_embedded.shape)

    # create meshgrid
    resolution = 1000  # 1000x1000 background pixels
    X2d_xmin, X2d_xmax = np.min(X_Train_embedded[:, 0]), np.max(
        X_Train_embedded[:, 0])
    X2d_ymin, X2d_ymax = np.min(X_Train_embedded[:, 1]), np.max(
        X_Train_embedded[:, 1])
    xx, yy = np.meshgrid(np.linspace(X2d_xmin, X2d_xmax, resolution),
                         np.linspace(X2d_ymin, X2d_ymax, resolution))

    # approximate Voronoi tesselation on resolution x resolution grid using 1-NN
    background_model = KNeighborsClassifier(n_neighbors=1).fit(
        X_Train_embedded, y_predicted)
    voronoiBackground = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
    voronoiBackground = voronoiBackground.reshape((resolution, resolution))

    #plot
    plt.contourf(xx, yy, voronoiBackground)
    plt.scatter(X_Train_embedded[:, 0],
                X_Train_embedded[:, 1],
                c=y.values.flatten())  # y is expected to be a pandas Series here
    plt.title(modelName)
    plt.show()
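A sketch of how this helper might be driven; the digits dataset and the k=5 classifier are assumptions for illustration (the helper itself expects y to be a pandas Series):

import pandas as pd
from sklearn.datasets import load_digits

digits = load_digits()
X, y = digits.data, pd.Series(digits.target)
knn = KNeighborsClassifier(n_neighbors=5).fit(X, y)
plotDecisionBoundry(X, y, knn.predict(X), "KNN (k=5) on digits")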
Example #4
# SKLModel is assumed to be an alias for sklearn.neighbors.KNeighborsClassifier
class KNeighborsClassifierImpl():

    def __init__(self, n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None):
        self._hyperparams = {
            'n_neighbors': n_neighbors,
            'weights': weights,
            'algorithm': algorithm,
            'leaf_size': leaf_size,
            'p': p,
            'metric': metric,
            'metric_params': metric_params,
            'n_jobs': n_jobs}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        return self._sklearn_model.predict_proba(X)
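A short sketch of the wrapper in use; the SKLModel alias is an assumption, since the original snippet does not show its imports:

from sklearn.neighbors import KNeighborsClassifier as SKLModel  # assumed alias
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
clf = KNeighborsClassifierImpl(n_neighbors=3).fit(X, y)
print(clf.predict(X[:5]))
print(clf.predict_proba(X[:5]))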
Example #5
import numpy as np
from sklearn.neighbors import KNeighborsClassifier


def compute_cnn(X, y):
  """Condensed nearest neighbor. CNN removes redundant instances,
  keeping the samples near the decision boundaries."""

  classifier = KNeighborsClassifier(n_neighbors=3)

  prots_s = []
  labels_s = []

  classes = np.unique(y)
  classes_ = classes

  # seed the prototype set with one random instance per class
  for cur_class in classes:
    mask = y == cur_class
    insts = X[mask]
    prots_s = prots_s + [insts[np.random.randint(0, insts.shape[0])]]
    labels_s = labels_s + [cur_class]

  classifier.fit(prots_s, labels_s)
  # add every misclassified sample to the prototype set and refit
  for sample, label in zip(X, y):
    if classifier.predict([sample]) != [label]:
      prots_s = prots_s + [sample]
      labels_s = labels_s + [label]
      classifier.fit(prots_s, labels_s)

  X_ = np.asarray(prots_s)
  y_ = np.asarray(labels_s)
  reduction_ = 1.0 - float(len(y_)) / len(y)  # integer-division bug fixed
  print(reduction_)
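A hedged demo of the condensation step; the blob dataset is an assumption for illustration:

from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=300, centers=2, cluster_std=1.5, random_state=0)
compute_cnn(X, y)  # prints the fraction of training samples removed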
Example #6
def KNN_method(X, y):
    # assumes StratifiedKFold, KNeighborsClassifier, accuracy_score, plt,
    # and a multiple_line_chart helper are imported/defined elsewhere;
    # shuffle=True is required when passing random_state to StratifiedKFold
    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    skf.get_n_splits(X, y)

    for train_index, test_index in skf.split(X, y):
        print("Train:", train_index, "Validation:", test_index)
        trainX, testX = X[train_index], X[test_index]
        trainY, testY = y[train_index], y[test_index]

        #here starts KNN
        #how many neighbours want to use in the KNC
        kvalues = [1, 3, 5, 7, 9, 11, 13, 15, 19, 24, 30, 40, 50, 60, 70, 90]
        dist = ['manhattan', 'euclidean', 'chebyshev']
        results = {}
        for element in dist:
            accuracy_results = []
            for k in kvalues:
                knn = KNeighborsClassifier(n_neighbors=k, metric=element)
                knn.fit(trainX, trainY)
                predictedY = knn.predict(testX)
                accuracy_results.append(accuracy_score(testY, predictedY))
            results[element] = accuracy_results
        print("Results of model preparation for: " + str(results))

        plt.figure()
        multiple_line_chart(plt.gca(),
                            kvalues,
                            results,
                            'KNN variants',
                            'n',
                            'accuracy',
                            percentage=True)
        plt.show()
Example #7
class RawModel:
    def __init__(self):
        # 2015-05-15 GEL Found that n_components=20 gives a nice balance of 
        # speed (substantial improvement), accuracy, and reduced memory usage 
        # (25% decrease).
        self.decomposer = TruncatedSVD(n_components=20)

        # 2015-05-15 GEL algorithm='ball_tree' uses less memory on average than 
        # algorithm='kd_tree'
        
        # 2015-05-15 GEL Evaluation of metrics by accuracy (based on 8000 training examples)
        # euclidean        0.950025
        # manhattan        0.933533
        # chebyshev        0.675662
        # hamming          0.708646
        # canberra         0.934033
        # braycurtis       0.940530
        self.model = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', metric='euclidean')

    def fit(self, trainExamples):       
        X = self.decomposer.fit_transform( vstack( [reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in trainExamples] ) )
        Y = [x.Y for x in trainExamples]

        self.model.fit(X, Y)
        return self

    def predict(self, examples):
        X = self.decomposer.transform( vstack( [reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in examples] ) )
        return self.model.predict( X )
Example #8
def evaluate(Xtra, ytra, Xtst, ytst, k=1, positive_label=1):
    knn = KNeighborsClassifier(n_neighbors=k, algorithm='brute')
    knn.fit(Xtra, ytra)

    y_true = ytst
    y_pred = knn.predict(Xtst)

    return evaluate_results(y_true, y_pred, positive_label=positive_label)
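evaluate_results is not shown in the original snippet; a hypothetical stand-in built on sklearn.metrics, for illustration only:

from sklearn.metrics import precision_recall_fscore_support

def evaluate_results(y_true, y_pred, positive_label=1):
    # hypothetical helper, standing in for the one the snippet assumes
    p, r, f, _ = precision_recall_fscore_support(
        y_true, y_pred, pos_label=positive_label, average='binary')
    return {'precision': p, 'recall': r, 'f1': f}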
Example #9
import joblib
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier


def knn(X, y, model_path):
    model = KNeighborsClassifier()
    model.fit(X, y)
    print(model)
    # predict (in-sample evaluation)
    expected = y
    predicted = model.predict(X)
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    joblib.dump(model, model_path)
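A quick call on a toy dataset (the file name is illustrative):

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
knn(X, y, "knn_model.pkl")
model = joblib.load("knn_model.pkl")  # reload the persisted model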
Example #10
def plot_boundaries_decision(X, y, clf, namefile):
    """
    Method to plot the boundaries decision of our data 
    X : A numpy array of the data we want to plot 
    y : A numpy array of the  label corresponding to our data
    clf : the model use to predict the label of our data
    namefile : the name of the file in which we want to save the figure  
    """
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.333,
                                                        random_state=42)
    # plot of the decision boundary in the 2D representation of the data
    clf.fit(X_train, y_train)

    # create meshgrid
    resolution = 100  # 100x100 background pixels
    X2d_xmin, X2d_xmax = np.min(X[:, 0]), np.max(X[:, 0])
    X2d_ymin, X2d_ymax = np.min(X[:, 1]), np.max(X[:, 1])
    xx, yy = np.meshgrid(np.linspace(X2d_xmin, X2d_xmax, resolution),
                         np.linspace(X2d_ymin, X2d_ymax, resolution))

    # approximate Voronoi tesselation on resolution x resolution grid using 1-NN
    background_model = KNeighborsClassifier(n_neighbors=1).fit(X, y)
    voronoiBackground = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
    voronoiBackground = voronoiBackground.reshape((resolution, resolution))

    fig = pyplot.figure()
    fig.set_size_inches(10.5, 8.5)

    ax = fig.add_subplot(211)  #small subplot to show how the legend has moved.
    #plot
    ax.contourf(xx, yy, voronoiBackground)
    ax.set_title(
        " Boundaries decision in using the dimensionality reduction of Multidimensional scaling"
    )
    ax.scatter(X[:, 0], X[:, 1], c=color[y].tolist())

    label = numpy.array(["Apple", "Tomatoes"])
    # Legend
    for ind, s in enumerate(label):
        ax.scatter([], [], label=s, color=color[ind])

    pyplot.legend(scatterpoints=1,
                  frameon=True,
                  labelspacing=0.5,
                  bbox_to_anchor=(1.2, .4),
                  loc='center right')

    pyplot.tight_layout()
    pyplot.savefig(namefile)
    pyplot.show()
Example #11
def __plot_decision_boundaries(X,
                               y,
                               y_pred,
                               resolution: int = 100,
                               embedding=None):
    if embedding is None:
        embedding = TSNE(n_components=2, random_state=160290).fit_transform(X)

    x_min, x_max = safe_bounds(embedding[:, 0])
    y_min, y_max = safe_bounds(embedding[:, 1])
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                         np.linspace(y_min, y_max, resolution))

    # approximate Voronoi tesselation on resolution x resolution grid using 1-NN
    background_model = KNeighborsClassifier(n_neighbors=1).fit(
        embedding, y_pred)
    voronoi_bg = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
    voronoi_bg = voronoi_bg.reshape((resolution, resolution))

    mesh = hv.QuadMesh((xx, yy, voronoi_bg)).opts(cmap="viridis")
    points = hv.Scatter(
        {
            "x": embedding[:, 0],
            "y": embedding[:, 1],
            "pred": y_pred,
            "class": y
        },
        kdims=["x", "y"],
        vdims=["pred", "class"],
    )
    errors = y_pred != y
    failed_points = hv.Scatter(
        {
            "x": embedding[errors, 0],
            "y": embedding[errors, 1]
        },
        kdims=["x", "y"]).opts(color="red", size=5, alpha=0.9)

    points = points.opts(color="pred",
                         cmap="viridis",
                         line_color="grey",
                         size=10,
                         alpha=0.8,
                         tools=["hover"])
    plot = mesh * points * failed_points
    plot = plot.opts(xaxis=None,
                     yaxis=None,
                     width=500,
                     height=450,
                     title="Decision boundaries on TSNE")
    return plot
Example #12
    def get_result(self):
        # file opener
        tkinter.Tk().withdraw()
        directory = filedialog.askdirectory()
        result = self.read_emails_from_directory(directory)

        train_labels = np.zeros(1430)
        train_labels[715:1430] = 1
        # This equates to 1-715 = HAM and 716-1430 = SPAM.
        # If you change result[n] to something else, make sure you change
        # the same result index below for test_matrix (line 251 in the original file).
        train_matrix = self.extract_features(directory, result[0])
        #print(train_matrix)
        # print("body words:", result[0])
        # print("\n\nsubject words:", result[1])
        # print("\n\nbody phrases:", result[2])
        # print("\n\nsubject phrases:", result[3])

        print("body words:", len(result[0]))
        print("subject words:", len(result[1]))
        print("body phrases:", len(result[2]))
        print("subject phrases:", len(result[3]))

        model1 = MultinomialNB()
        model2 = LinearSVC()
        model3 = RandomForestClassifier()
        model4 = KNeighborsClassifier()
        model1.fit(train_matrix, train_labels)
        model2.fit(train_matrix, train_labels)
        model3.fit(train_matrix, train_labels)
        model4.fit(train_matrix, train_labels)

        test_dir = filedialog.askdirectory()
        #                                       Here -----v
        test_matrix = self.extract_features(test_dir, result[0])
        test_labels = np.zeros(600)
        # This equates to 1-300 = HAM and 301-600 = SPAM
        test_labels[300:600] = 1
        result1 = model1.predict(test_matrix)
        result2 = model2.predict(test_matrix)
        result3 = model3.predict(test_matrix)
        result4 = model4.predict(test_matrix)

        print(confusion_matrix(test_labels, result1))
        print(confusion_matrix(test_labels, result2))
        print(confusion_matrix(test_labels, result3))
        print(confusion_matrix(test_labels, result4))
        return result
Example #13
def build_and_test_model(classifier, X, Y, Z, param):
    # assumes LeaveOneOut, KNeighborsClassifier, RandomForestClassifier, svm,
    # GaussianNB, metrics (all sklearn), random, and numpy as np are imported
    accuracies = []
    ari = []

    for train, test in LeaveOneOut().split(X):

        X_train, Y_train = X[train], Y[train]
        X_test, Y_test, Z_test = X[test], Y[test], Z[test]
        predicted = None

        if classifier == "KNN":
            neigh = KNeighborsClassifier(n_neighbors=param).fit(
                X_train, Y_train)
            predicted = neigh.predict(X_test)

        elif classifier == "RF":
            clf = RandomForestClassifier(n_estimators=param,
                                         random_state=0)  # ,max_depth=2,
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test)

        elif classifier == "SVM":
            clf = svm.SVC(gamma='scale')
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test).astype(int)

        elif classifier == "NAIVE":
            clf = GaussianNB()
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test).astype(int)

        elif classifier == "RANDOM":
            options = list(set(Y_train))
            predicted = [random.choice(options) for _ in range(len(Y_test))]

        accuracies.append(metrics.accuracy_score(Y_test, predicted))
        ari.append(metrics.adjusted_rand_score(Z_test, predicted))

    return np.mean(accuracies), np.std(accuracies), np.mean(ari), np.std(ari)
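A sketch of driving this harness; Z is a second labeling used only for the ARI score, so for illustration it is simply a copy of Y (an assumption, not the original setup):

from sklearn.datasets import load_iris

X, Y = load_iris(return_X_y=True)
Z = Y.copy()
acc_mean, acc_std, ari_mean, ari_std = build_and_test_model("KNN", X, Y, Z, param=3)
print(acc_mean, ari_mean)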
Example #14
import numpy as np
from sklearn.neighbors import KNeighborsClassifier


def compute_enn(X, y):
  """
  The edited nearest neighbors removes the instances on the decision
  boundaries, keeping redundant samples.
  """

  classifier = KNeighborsClassifier(n_neighbors=3)

  classes = np.unique(y)
  classes_ = classes

  mask = np.zeros(y.size, dtype=bool)
  classifier.fit(X, y)

  # keep only the samples that agree with their neighborhood
  for i in range(y.size):
    sample, label = X[i], y[i]
    if classifier.predict([sample]) == [label]:
      mask[i] = not mask[i]

  X_ = np.asarray(X[mask])
  y_ = np.asarray(y[mask])
  reduction_ = 1.0 - float(len(y_)) / len(y)
  print(reduction_)
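The same kind of hedged demo as for compute_cnn above, on noisier blobs (assumed dataset):

from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=300, centers=2, cluster_std=3.0, random_state=0)
compute_enn(X, y)  # prints the achieved reduction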
Example #15
def nd_boundary_plot(X_tst, y_predicted, model, ax, resolution=256):
    if len(X_tst.shape) != 2:
        raise ValueError("X must be ndarray of the form [nsamples, nfeatures]")
    if X_tst.shape[1] < 2:
        raise ValueError("Must have at least 2 features")
    if not hasattr(model, "classes_"):
        raise ValueError("Model has to be trained first")
    if len(model.classes_) < 2:
        raise ValueError("Classification must be at least binary")
    #done with sanity checks

    if X_tst.shape[1] == 2:  #2 dimensions
        X = X_tst
        xmin, xmax = np.min(X[:, 0]), np.max(X[:, 0])
        ymin, ymax = np.min(X[:, 1]), np.max(X[:, 1])
        xx, yy = np.meshgrid(np.linspace(xmin, xmax, resolution),
                             np.linspace(ymin, ymax, resolution))

        if hasattr(model, "decision_function") or len(
                model.classes_
        ) != 2:  #model does not comute posterior or hard to graph
            Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    else:  #lots of dimensions
        X = TSNE(n_components=2).fit_transform(X_tst)
        background_model = KNeighborsClassifier(n_neighbors=1).fit(
            X, y_predicted)
        xmin, xmax = np.min(X[:, 0]), np.max(X[:, 0])
        ymin, ymax = np.min(X[:, 1]), np.max(X[:, 1])
        xx, yy = np.meshgrid(np.linspace(xmin, xmax, resolution),
                             np.linspace(ymin, ymax, resolution))
        Z = background_model.predict(np.c_[xx.ravel(), yy.ravel()])

    Z = Z.reshape((resolution, resolution))
    ax.contourf(xx, yy, Z, alpha=.3)
    ax.scatter(X[:, 0], X[:, 1], c=y_predicted)
Example #16
    def check_accuracy_f1(self, path):
        # `feature`, `label`, and `select_model` are globals in the original source
        data_after_feature_selected = []
        for i in path:
            data_after_feature_selected.append(feature[:, i])
        data_after_feature_selected = np.array(data_after_feature_selected)
        data_after_feature_selected = data_after_feature_selected.transpose()  # transpose the matrix
        X_train2, X_test2, y_train2, y_test2 = train_test_split(
            data_after_feature_selected, label, test_size=0.3)

        if select_model == "SVM":
            model_svm = svm.SVC(kernel='poly', gamma=0.125, C=20)
            model_svm.fit(X_train2, y_train2)
            model_svm.get_params(deep=True)
            prediction2 = model_svm.predict(X_test2)
        elif select_model == "KNN":
            model_knn = KNeighborsClassifier(n_neighbors=1)
            model_knn.fit(X_train2, y_train2)
            model_knn.get_params(deep=True)
            prediction2 = model_knn.predict(X_test2)
        elif select_model == "RF":
            model_rf2 = RandomForestClassifier()
            model_rf2.fit(X_train2, y_train2)
            model_rf2.get_params(deep=True)
            prediction2 = model_rf2.predict(X_test2)
        elif select_model == "LR":
            model_lr2 = LogisticRegression()
            model_lr2.fit(X_train2, y_train2)
            prediction2 = model_lr2.predict(X_test2)
        elif select_model == "DT":
            model_dt = DecisionTreeClassifier()
            model_dt.fit(X_train2, y_train2)
            prediction2 = model_dt.predict(X_test2)

        return accuracy_score(y_test2, prediction2), f1_score(y_test2,
                                                              prediction2,
                                                              average='macro')
Example #17
        # model.fit(feature[train_index,:][:,top], label[train_index])
        # prediction = model.predict(feature[test_index,:][:,top])
        # acc, f = get_result(label[test_index], prediction)
        # accuracy['DT'].append(acc)
        # f1['DT'].append(f)

        # model = LogisticRegression()
        # model.fit(feature[train_index,:][:,top], label[train_index])
        # prediction = model.predict(feature[test_index,:][:,top])
        # acc, f = get_result(label[test_index], prediction)
        # accuracy['LR'].append(acc)
        # f1['LR'].append(f)

        model = KNeighborsClassifier(n_neighbors=1)
        model.fit(feature[train_index, :][:, top], label[train_index])
        prediction = model.predict(feature[test_index, :][:, top])
        acc, f = get_result(label[test_index], prediction)
        accuracy['KNN'].append(acc)
        f1['KNN'].append(f)

        model = RandomForestClassifier(n_estimators=250)
        model.fit(feature[train_index, :][:, top], label[train_index])
        prediction = model.predict(feature[test_index, :][:, top])
        acc, f = get_result(label[test_index], prediction)
        accuracy['RF'].append(acc)
        f1['RF'].append(f)

        # model = MLPClassifier()
        # model.fit(feature[train_index,:][:,top], label[train_index])
        # prediction = model.predict(feature[test_index,:][:,top])
        # acc, f = get_result(label[test_index], prediction)
Example #18
class InstanceReductionMixin(InstanceReductionBase, ClassifierMixin):
    """Mixin class for all instance reduction techniques"""
    def set_classifier(self, classifier):
        """Sets the classifier to be used in the instance reduction process
            and classification.

        Parameters
        ----------
        classifier : classifier, following the KNeighborsClassifier style
            (default = KNN)
        """

        self.classifier = classifier

    def reduce_data(self, X, y):
        """Perform the instance reduction procedure on the given training data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training set.

        y : array-like, shape = [n_samples]
            Labels for X.

        Returns
        -------
        X_ : array-like, shape = [indeterminate, n_features]
            Resulting training set.

        y_ : array-like, shape = [indeterminate]
            Labels for X_
        """
        pass

    def fit(self, X, y, reduce_data=True):
        """
        Fit the InstanceReduction model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
            Note that centroid shrinking cannot be used with sparse matrices.
        y : array, shape = [n_samples]
            Target values (integers)
        reduce_data : bool, flag indicating if the reduction would be performed
        """
        self.X = X
        self.y = y

        if reduce_data:
            self.reduce_data(X, y)

        return self

    def predict(self, X, n_neighbors=1):
        """Perform classification on an array of test vectors X.

        The predicted class C for each sample in X is returned.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array, shape = [n_samples]

        Notes
        -----
        The default prediction uses KNeighborsClassifier. If the instance
        reduction algorithm is to be used with another classifier, this
        method should be explicitly overridden and documented.
        """
        X = check_array(X)
        if not hasattr(self, "X_") or self.X_ is None:
            raise AttributeError("Model has not been trained yet.")

        if not hasattr(self, "y_") or self.y_ is None:
            raise AttributeError("Model has not been trained yet.")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test data X,
        after a given prototype selection algorithm.

        Parameters
        ----------
        X : array, shape = (n_samples, n_features)
            A 2-D array representing the test points.

        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
            of such arrays if n_outputs > 1.
            The class probabilities of the input samples. Classes are ordered
            by lexicographic order.
        """
        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict_proba(X)
Example #19
class KNNClassifier():
    '''
    classdocs
    '''
    def __init__(self, csv_path_train, csv_path_test, k):
        '''
        Constructor
        '''
        self.csv_path_train = csv_path_train
        self.csv_path_test = csv_path_test
        self.classifier = KNeighborsClassifier(n_neighbors=k,
                                               p=2,
                                               metric='minkowski')

    def create_arrays(self):
        arr_train = np.genfromtxt(self.csv_path_train,
                                  delimiter=',',
                                  skip_header=1)
        self.X_train = np.delete(arr_train, [arr_train.shape[1] - 1], axis=1)
        self.y_train = np.delete(arr_train,
                                 list(range(arr_train.shape[1] - 1)),
                                 axis=1)

        arr_test = np.genfromtxt(self.csv_path_test,
                                 delimiter=',',
                                 skip_header=1)
        self.X_test = np.delete(arr_test, [arr_test.shape[1] - 1], axis=1)
        self.y_test = np.delete(arr_test,
                                list(range(arr_test.shape[1] - 1)),
                                axis=1)

    def preprocess(self):
        #         X_train, X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.3, random_state=0)
        sc = StandardScaler()
        sc.fit(self.X_train)
        self.X_train_std = sc.transform(self.X_train)
        self.X_test_std = sc.transform(self.X_test)

    def train(self):
        self.create_arrays()
        self.preprocess()
        self.classifier.fit(self.X_train_std, self.y_train.ravel())

    def test(self, f, patient_num, total_fpr, total_tpr):
        y_pred = self.classifier.predict(self.X_test_std)
        accuracy = accuracy_score(self.y_test, y_pred)
        # was accuracy_score: a copy-paste bug; requires sklearn.metrics.precision_score
        precision = precision_score(self.y_test, y_pred)
        recall = recall_score(self.y_test, y_pred)
        print("Accuracy: %.2f" % accuracy)
        print("Precision: %.2f" % precision)
        print("Recall: %.2f" % recall)
        line = str(accuracy) + "," + str(precision) + "," + str(recall)
        f.write(line)
        f.write("\n")
        confmat = confusion_matrix(self.y_test, y_pred)
        fig, ax = plt.subplots(figsize=(2.5, 2.5))
        ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
        for i in range(confmat.shape[0]):
            for j in range(confmat.shape[1]):
                ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
        plt.xlabel('predicted label')
        plt.ylabel('true label')
        plt.savefig("D:\\Documents\\KNN\\FFT\\chb" + patient_num +
                    "_confmat.png")
        plt.close()
        fpr, tpr, thresholds = roc_curve(self.y_test, y_pred)
        print("fpr", fpr)
        print("tpr", tpr)
        total_fpr[1] += fpr[len(fpr) - 2]
        total_tpr[1] += tpr[len(tpr) - 2]
        print(total_fpr)
        print(total_tpr)
        roc_auc = auc(fpr, tpr)
        plt.title('ROC Curve')
        plt.plot(fpr, tpr, 'b', label='AUC = %.2F' % roc_auc)
        plt.legend(loc='lower right')
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([-0.1, 1.2])
        plt.ylim([-0.1, 1.2])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.savefig("D:\\Documents\\KNN\\FFT\\chb" + patient_num + "roc.png")
        plt.close()
        return total_fpr, total_tpr
Example #20
class CNN(InstanceReductionMixin):
    """Condensed Nearest Neighbors.

    Each class is represented by a set of prototypes, with test samples
    classified to the class with the nearest prototype.
    The Condensed Nearest Neighbors removes the redundant instances,
    maintaining the samples in the decision boundaries.

    Parameters
    ----------
    n_neighbors : int, optional (default = 1)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `prototypes_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `labels_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.cnn import CNN
    >>> import numpy as np
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([1, 1, 1, 2, 2, 2])
    >>> cnn = CNN()
    >>> cnn.fit(X, y)
    CNN(n_neighbors=1)
    >>> print(cnn.predict([[-0.8, -1]]))
    [1]

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    Notes
    -----
    The Condensed Nearest Neighbor is one of the first prototype selection
    techniques in the literature.

    References
    ----------
    P. E. Hart, The condensed nearest neighbor rule, IEEE Transactions on 
    Information Theory 14 (1968) 515–516.

    """

    def __init__(self, n_neighbors=1):
        self.n_neighbors = n_neighbors
        self.classifier = None

    def reduce_data(self, X, y):
        
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)

        prots_s = []
        labels_s = []

        classes = np.unique(y)
        self.classes_ = classes

        for cur_class in classes:
            mask = y == cur_class
            insts = X[mask]
            prots_s = prots_s + [insts[np.random.randint(0, insts.shape[0])]]
            labels_s = labels_s + [cur_class]


        self.classifier.fit(prots_s, labels_s)
        for sample, label in zip(X, y):
            if self.classifier.predict([sample]) != [label]:
                prots_s = prots_s + [sample]
                labels_s = labels_s + [label]
                self.classifier.fit(prots_s, labels_s)

        self.X_ = np.asarray(prots_s)
        self.y_ = np.asarray(labels_s)
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)
        return self.X_, self.y_
Example #21
class SSMA(InstanceReductionMixin):
    """Steady State Memetic Algorithm

    The Steady-State Memetic Algorithm is an evolutionary prototype
    selection algorithm. It uses a memetic algorithm in order to 
    perform a local search in the code.

    Parameters
    ----------
    n_neighbors : int, optional (default = 1)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    alpha   : float (default = 0.6)
        Parameter that weights the fitness function.

    max_loop    : int (default = 1000)
        Number of maximum loops performed by the algorithm.

    threshold   : int (default = 0)
        Threshold that regulates the substitution condition;

    chromosomes_count: int (default = 10)
        number of chromosomes used to find the optimal solution.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.ssma import SSMA
    >>> import numpy as np
    >>> X = np.array([[i] for i in range(100)])
    >>> y = np.asarray(50 * [0] + 50 * [1])
    >>> ssma = SSMA()
    >>> ssma.fit(X, y)
    SSMA(alpha=0.6, chromosomes_count=10, max_loop=1000, threshold=0)
    >>> print ssma.predict([[40],[60]])
    [0 1]
    >>> print ssma.reduction_
    0.98

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Joaquín Derrac, Salvador García, and Francisco Herrera. Stratified prototype
    selection based on a steady-state memetic algorithm: a study of scalability.
    Memetic Computing, 2(3):183–199, 2010.

    """
    def __init__(self,
                 n_neighbors=1,
                 alpha=0.6,
                 max_loop=1000,
                 threshold=0,
                 chromosomes_count=10):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.max_loop = max_loop
        self.threshold = threshold
        self.chromosomes_count = chromosomes_count

        self.evaluations = None
        self.chromosomes = None

        self.best_chromosome_ac = -1
        self.best_chromosome_rd = -1

        self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

    def accuracy(self, chromosome, X, y):
        mask = np.asarray(chromosome, dtype=bool)
        cX, cy = X[mask], y[mask]
        #print len(cX), len(cy), sum(chromosome)

        self.classifier.fit(cX, cy)
        labels = self.classifier.predict(X)
        accuracy = (labels == y).sum()

        return float(accuracy) / len(y)

    def fitness(self, chromosome, X, y):
        #TODO add the possibility of use AUC for factor1
        ac = self.accuracy(chromosome, X, y)
        rd = 1.0 - (float(sum(chromosome)) / len(chromosome))

        return self.alpha * ac + (1.0 - self.alpha) * rd

    def fitness_gain(self, gain, n):
        return self.alpha * (float(gain) / n) + (1 - self.alpha) * (1.0 / n)

    def update_threshold(self, X, y):
        best_index = np.argmax(self.evaluations)
        chromosome = self.chromosomes[best_index]

        best_ac = self.accuracy(chromosome, X, y)
        best_rd = 1.0 - float(sum(chromosome)) / len(y)

        if best_ac <= self.best_chromosome_ac:
            self.threshold = self.threshold + 1
        if best_rd <= self.best_chromosome_rd:
            self.threshold = self.threshold - 1

        self.best_chromosome_ac = best_ac
        self.best_chromosome_rd = best_rd

    def index_nearest_neighbor(self, S, X, y):
        classifier = KNeighborsClassifier(n_neighbors=1)

        U = []
        S_mask = np.array(S, dtype=bool, copy=True)
        indexs = np.asarray(range(len(y)))[S_mask]
        X_tra, y_tra = X[S_mask], y[S_mask]

        for i in range(len(y)):
            real_indexes = np.asarray(range(len(y)))[S_mask]
            X_tra, y_tra = X[S_mask], y[S_mask]
            #print len(X_tra), len(y_tra)
            classifier.fit(X_tra, y_tra)
            [[index]] = classifier.kneighbors([X[i]], return_distance=False)
            U = U + [real_indexes[index]]

        return U

    def memetic_looper(self, S, R):
        c = 0
        for i in range(len(S)):
            if S[i] == 1 and i not in R:
                c = c + 1
                if c == 2:
                    return True

        return False

    def memetic_select_j(self, S, R):
        indexs = []
        for i in range(len(S)):
            if i not in R and S[i] == 1:
                indexs.append(i)
        # if the list is empty this will raise an error
        return np.random.choice(indexs)

    def generate_population(self, X, y):
        self.chromosomes = [[np.random.choice([0, 1]) for i in range(len(y))]
                            for c in range(self.chromosomes_count)]
        self.evaluations = [self.fitness(c, X, y) for c in self.chromosomes]

        self.update_threshold(X, y)

    def select_parents(self, X, y):
        parents = []
        for i in range(2):
            samples = random.sample(self.chromosomes, 2)
            parents = parents + [
                samples[0] if self.fitness(samples[0], X, y) > self.fitness(
                    samples[1], X, y) else samples[1]
            ]
        return np.array(parents, copy=True)

    def crossover(self, parent_1, parent_2):
        size = len(parent_1)
        mask = [0] * (size // 2) + [1] * (size - size // 2)
        mask = np.asarray(mask, dtype=bool)
        np.random.shuffle(mask)

        off_1 = parent_1 * mask + parent_2 * ~mask
        off_2 = parent_2 * mask + parent_1 * ~mask

        return np.asarray([off_1, off_2])

    def mutation(self, offspring):
        for i in range(len(offspring)):
            if np.random.uniform(0, 1) < 1.0 / len(offspring):
                offspring[i] = not offspring[i]

        return offspring

    def memetic_search(self, chromosome, X, y, chromosome_fitness=None):
        S = np.array(chromosome, copy=True)
        if S.sum() == 0:
            return S, 0

        if chromosome_fitness is None:
            chromosome_fitness = self.fitness(chromosome, X, y)
        fitness_s = chromosome_fitness

        # List of visited genes in S
        R = []
        # let U = {u0, u1, ..., un} list where ui = classifier(si,S)/i
        U = self.index_nearest_neighbor(S, X, y)

        while self.memetic_looper(S, R):
            j = self.memetic_select_j(S, R)
            S[j] = 0
            gain = 0.0
            U_copy = list(U)
            mask = np.asarray(S, dtype=bool)
            X_tra, y_tra = X[mask], y[mask]
            real_idx = np.asarray(range(len(y)))[mask]

            if len(y_tra) > 0:
                for i in range(len(U)):
                    if U[i] == j:
                        self.classifier.fit(X_tra, y_tra)
                        [[idx]] = self.classifier.kneighbors(
                            [X[i]], n_neighbors=1, return_distance=False)
                        U[i] = real_idx[idx]

                        if y[i] == y[U_copy[i]] and y[i] != y[U[i]]:
                            gain = gain - 1.0
                        if y[i] != y[U_copy[i]] and y[i] == y[U[i]]:
                            gain = gain + 1.0

            if gain >= self.threshold:
                n = S.sum()
                g = self.fitness_gain(gain, n)
                fitness_s = fitness_s + g
                R = []
            else:
                U = U_copy
                S[j] = 1
                R.append(j)

        return list(S), fitness_s

    def main_loop(self, X, y):
        self.generate_population(X, y)
        n, worse_fit_index = 0, -1
        while (n < self.max_loop):
            parents = self.select_parents(X, y)
            offspring = self.crossover(parents[0], parents[1])
            offspring[0] = self.mutation(offspring[0])
            offspring[1] = self.mutation(offspring[1])

            fit_offs = [
                self.fitness(off, X, y) if sum(off) > 0 else -1
                for off in offspring
            ]

            if worse_fit_index == -1:
                worse_fit_index = np.argmin(self.evaluations)

            for i in range(len(offspring)):
                p_ls = 1.0

                if fit_offs[i] == -1:
                    p_ls = -1

                if fit_offs[i] <= self.evaluations[worse_fit_index]:
                    p_ls = 0.0625

                if np.random.uniform(0, 1) < p_ls:

                    offspring[i], fit_offs[i] = self.memetic_search(
                        offspring[i], X, y, chromosome_fitness=fit_offs[i])

            for i in range(len(offspring)):
                if fit_offs[i] > self.evaluations[worse_fit_index]:
                    self.chromosomes[worse_fit_index] = offspring[i]
                    self.evaluations[worse_fit_index] = fit_offs[i]

                    worse_fit_index = np.argmin(self.evaluations)

            n = n + 1
            if n % 10 == 0:
                self.update_threshold(X, y)

    def reduce_data(self, X, y):
        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes

        self.main_loop(X, y)

        best_index = np.argmax(self.evaluations)
        mask = np.asarray(self.chromosomes[best_index], dtype=bool)
        self.X_ = X[mask]
        self.y_ = y[mask]
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)

        return self.X_, self.y_
Example #22
class ENN(InstanceReductionMixin):

    """Edited Nearest Neighbors.

    The Edited Nearest Neighbors removes the instances on the decision
    boundaries, keeping redundant samples.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.enn import ENN
    >>> import numpy as np
    >>> X = np.array([[-1, 0], [-0.8, 1], [-0.8, -1], [-0.5, 0] , [0.5, 0], [1, 0], [0.8, 1], [0.8, -1]])
    >>> y = np.array([1, 1, 1, 2, 1, 2, 2, 2])
    >>> editednn = ENN()
    >>> editednn.fit(X, y)
    ENN(n_neighbors=3)
    >>> print(editednn.predict([[-0.6, 0.6]]))
    [1]
    >>> print editednn.reduction_
    0.75

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Ruiqin Chang, Zheng Pei, and Chao Zhang. A modified editing k-nearest
    neighbor rule. JCP, 6(7):1493–1500, 2011.

    """

    def __init__(self, n_neighbors=3):
        self.n_neighbors = n_neighbors
        self.classifier = None


    def reduce_data(self, X, y):
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        # check_arrays was removed from scikit-learn; check_X_y is the modern equivalent
        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes

        if self.n_neighbors >= len(X):
            self.X_ = np.array(X)
            self.y_ = np.array(y)
            self.reduction_ = 0.0

        mask = np.zeros(y.size, dtype=bool)

        tmp_m = np.ones(y.size, dtype=bool)
        for i in range(y.size):
            tmp_m[i] = not tmp_m[i]
            self.classifier.fit(X[tmp_m], y[tmp_m])
            sample, label = X[i], y[i]

            if self.classifier.predict([sample]) == [label]:
                mask[i] = not mask[i]

            tmp_m[i] = not tmp_m[i]

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
Example #23
class CNN(InstanceReductionMixin):
    """Condensed Nearest Neighbors.

    Each class is represented by a set of prototypes, with test samples
    classified to the class with the nearest prototype.
    The Condensed Nearest Neighbors removes the redundant instances,
    maintaining the samples in the decision boundaries.

    Parameters
    ----------
    n_neighbors : int, optional (default = 1)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `prototypes_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `labels_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.cnn import CNN
    >>> import numpy as np
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([1, 1, 1, 2, 2, 2])
    >>> cnn = CNN()
    >>> cnn.fit(X, y)
    CNN(n_neighbors=1)
    >>> print(cnn.predict([[-0.8, -1]]))
    [1]

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    Notes
    -----
    The Condensed Nearest Neighbor is one of the first prototype selection
    techniques in the literature.

    References
    ----------
    P. E. Hart, The condensed nearest neighbor rule, IEEE Transactions on 
    Information Theory 14 (1968) 515–516.

    """
    def __init__(self, n_neighbors=1):
        self.n_neighbors = n_neighbors
        self.classifier = None

    def reduce_data(self, X, y):

        X, y = check_arrays(X, y, sparse_format="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(
                n_neighbors=self.n_neighbors)

        prots_s = []
        labels_s = []

        classes = np.unique(y)
        self.classes_ = classes

        for cur_class in classes:
            mask = y == cur_class
            insts = X[mask]
            prots_s = prots_s + [insts[np.random.randint(0, insts.shape[0])]]
            labels_s = labels_s + [cur_class]

        self.classifier.fit(prots_s, labels_s)
        for sample, label in zip(X, y):
            if self.classifier.predict([sample]) != [label]:
                prots_s = prots_s + [sample]
                labels_s = labels_s + [label]
                self.classifier.fit(prots_s, labels_s)

        self.X_ = np.asarray(prots_s)
        self.y_ = np.asarray(labels_s)
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
Example #24
############################################################
# Visualize the model comparison results
_Fig = plt.figure()
_Fig.suptitle(t="ALGORITHM COMPARISON")
_Ax = _Fig.add_subplot(111)
plt.boxplot(x=_ALGORITHM_CMP_RESULT_LIST)
_Ax.set_xticklabels(labels=list(_MODELS.keys()))
plt.show()
################################################################################
# Prediction starts...
from sklearn import metrics
############################################################
# Prediction with the k-nearest neighbors algorithm
_KNC_MODEL = KNC()
_KNC_MODEL.fit(X=_X_TRAIN, y=_Y_TRAIN)
_KNC_PREDICTIONS = _KNC_MODEL.predict(X=_X_VAL)
print(
    "KNC (k-nearest neighbors) prediction results:\n",
    #
    " " * 4,
    "ACCURACY_SCORE:\n",
    " " * 8,
    metrics.accuracy_score(y_true=_Y_VAL, y_pred=_KNC_PREDICTIONS),
    "\n",
    #
    " " * 4,
    "CONFUSION_MATRIX:\n",
    metrics.confusion_matrix(y_true=_Y_VAL, y_pred=_KNC_PREDICTIONS),
    "\n",
    #
    " " * 4,
Example #25
File: ssma.py Project: dvro/scikit-protopy
class SSMA(InstanceReductionMixin):
    """Steady State Memetic Algorithm

    The Steady-State Memetic Algorithm is an evolutionary prototype
    selection algorithm. It uses a memetic algorithm in order to 
    perform a local search in the code.

    Parameters
    ----------
    n_neighbors : int, optional (default = 1)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    alpha   : float (default = 0.6)
        Parameter that weights the fitness function.

    max_loop    : int (default = 1000)
        Number of maximum loops performed by the algorithm.

    threshold   : int (default = 0)
        Threshold that regulates the substitution condition;

    chromosomes_count: int (default = 10)
        number of chromosomes used to find the optimal solution.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.ssma import SSMA
    >>> import numpy as np
    >>> X = np.array([[i] for i in range(100)])
    >>> y = np.asarray(50 * [0] + 50 * [1])
    >>> ssma = SSMA()
    >>> ssma.fit(X, y)
    SSMA(alpha=0.6, chromosomes_count=10, max_loop=1000, threshold=0)
    >>> print ssma.predict([[40],[60]])
    [0 1]
    >>> print ssma.reduction_
    0.98

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Joaquín Derrac, Salvador García, and Francisco Herrera. Stratified prototype
    selection based on a steady-state memetic algorithm: a study of scalability.
    Memetic Computing, 2(3):183–199, 2010.

    """
    def __init__(self, n_neighbors=1, alpha=0.6, max_loop=1000, threshold=0, chromosomes_count=10):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.max_loop = max_loop
        self.threshold = threshold
        self.chromosomes_count = chromosomes_count

        self.evaluations = None
        self.chromosomes = None

        self.best_chromosome_ac = -1
        self.best_chromosome_rd = -1

        self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)


    def accuracy(self, chromosome, X, y):
        mask = np.asarray(chromosome, dtype=bool)
        cX, cy = X[mask], y[mask]
        #print len(cX), len(cy), sum(chromosome)

        self.classifier.fit(cX, cy)
        labels = self.classifier.predict(X)
        accuracy = (labels == y).sum()

        return float(accuracy)/len(y)


    def fitness(self, chromosome, X, y):
        #TODO add the possibility of use AUC for factor1
        ac = self.accuracy(chromosome, X, y)
        rd = 1.0 - (float(sum(chromosome))/len(chromosome))

        return self.alpha * ac + (1.0 - self.alpha) * rd


    def fitness_gain(self, gain, n):
        return self.alpha * (float(gain)/n) + (1 - self.alpha) * (1.0 / n)


    def update_threshold(self, X, y):
        best_index = np.argmax(self.evaluations)
        chromosome = self.chromosomes[best_index]

        best_ac = self.accuracy(chromosome, X, y)
        best_rd = 1.0 - float(sum(chromosome))/len(y)

        if best_ac <= self.best_chromosome_ac:
            self.threshold = self.threshold + 1
        if best_rd <= self.best_chromosome_rd:
            self.threshold = self.threshold - 1

        self.best_chromosome_ac = best_ac
        self.best_chromosome_rd = best_rd


    def index_nearest_neighbor(self, S, X, y):
        classifier = KNeighborsClassifier(n_neighbors=1)

        U = []
        S_mask = np.array(S, dtype=bool, copy=True)
        indexs = np.asarray(range(len(y)))[S_mask]
        X_tra, y_tra = X[S_mask], y[S_mask]

        for i in range(len(y)):
            real_indexes = np.asarray(range(len(y)))[S_mask]
            X_tra, y_tra = X[S_mask], y[S_mask]
            #print len(X_tra), len(y_tra)
            classifier.fit(X_tra, y_tra)
            [[index]] = classifier.kneighbors([X[i]], return_distance=False)
            U = U + [real_indexes[index]]

        return U
            

    def memetic_looper(self, S, R):
        c = 0
        for i in range(len(S)):
            if S[i] == 1 and i not in R:
                c = c + 1
                if c == 2:
                    return True

        return False

    def memetic_select_j(self, S, R):
        indexs = []
        for i in range(len(S)):
            if i not in R and S[i] == 1:
                indexs.append(i)
        # if the list is empty this will raise an error
        return np.random.choice(indexs)


    def generate_population(self, X, y):
        self.chromosomes = [[np.random.choice([0,1]) for i in range(len(y))]
                            for c in range(self.chromosomes_count)]
        self.evaluations = [self.fitness(c, X, y) for c in self.chromosomes]

        self.update_threshold(X, y)
        

    def select_parents(self, X, y):
        parents = []
        for i in range(2):
            samples = random.sample(self.chromosomes, 2)
            parents = parents + [samples[0] if self.fitness(samples[0], X, y) >
                                    self.fitness(samples[1], X, y) else samples[1]]
        return np.array(parents, copy=True)

    def crossover(self, parent_1, parent_2):
        size = len(parent_1)
        mask = [0] * (size // 2) + [1] * (size - size // 2)
        mask = np.asarray(mask, dtype=bool)
        np.random.shuffle(mask)

        off_1 = parent_1 * mask + parent_2 * ~mask
        off_2 = parent_2 * mask + parent_1 * ~mask
        
        return np.asarray([off_1, off_2])
        

    def mutation(self, offspring):
        for i in range(len(offspring)):
            if np.random.uniform(0,1) < 1.0/len(offspring):
                offspring[i] = not offspring[i]

        return offspring

    def memetic_search(self, chromosome, X, y, chromosome_fitness=None):
        S = np.array(chromosome, copy=True)
        if S.sum() == 0:
            return S, 0

        if chromosome_fitness is None:
            chromosome_fitness = self.fitness(chromosome, X, y)
        fitness_s = chromosome_fitness

        # List of visited genes in S 
        R = []
        # let U = {u0, u1, ..., un} list where ui = classifier(si,S)/i
        U = self.index_nearest_neighbor(S, X, y)
        
        while self.memetic_looper(S, R):
            j = self.memetic_select_j(S, R) 
            S[j] = 0
            gain = 0.0
            U_copy = list(U)
            mask = np.asarray(S, dtype=bool)
            X_tra, y_tra = X[mask], y[mask]
            real_idx = np.asarray(range(len(y)))[mask]

            if len(y_tra) > 0:
                for i in range(len(U)):
                    if U[i] == j:
                        self.classifier.fit(X_tra, y_tra)
                        [[idx]] = self.classifier.kneighbors([X[i]], n_neighbors=1,
                                return_distance=False)
                        U[i] = real_idx[idx]
                        
                        if y[i] == y[U_copy[i]] and y[i] != y[U[i]]:
                            gain = gain - 1.0
                        if y[i] != y[U_copy[i]] and y[i] == y[U[i]]:
                            gain = gain + 1.0
                
            if gain >= self.threshold:
                n = S.sum()
                g = self.fitness_gain(gain, n)
                fitness_s = fitness_s + g
                R = []
            else:
                U = U_copy
                S[j] = 1
                R.append(j)

        return list(S), fitness_s

                    


    def main_loop(self, X, y):
        self.generate_population(X, y)
        n, worse_fit_index = 0, -1
        while (n < self.max_loop):
            parents = self.select_parents(X, y)
            offspring = self.crossover(parents[0], parents[1])
            offspring[0] = self.mutation(offspring[0])
            offspring[1] = self.mutation(offspring[1])

            fit_offs = [self.fitness(off, X, y) if sum(off) > 0 else -1 for off in offspring]
            
            if worse_fit_index == -1:
                worse_fit_index = np.argmin(self.evaluations)

            
            for i in range(len(offspring)):
                p_ls = 1.0 

                if fit_offs[i] == -1:
                    p_ls = -1

                if fit_offs[i] <= self.evaluations[worse_fit_index]:
                    p_ls = 0.0625

                if np.random.uniform(0,1) < p_ls:

                    offspring[i], fit_offs[i] = self.memetic_search(offspring[i], X, y, chromosome_fitness = fit_offs[i])

            for i in range(len(offspring)):
                if fit_offs[i] > self.evaluations[worse_fit_index]:
                    self.chromosomes[worse_fit_index] = offspring[i]
                    self.evaluations[worse_fit_index] = fit_offs[i]

                    worse_fit_index = np.argmin(self.evaluations)

            n = n + 1
            if n % 10 == 0:
                self.update_threshold(X, y)


    def reduce_data(self, X, y):
        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes

        self.main_loop(X, y)

        best_index = np.argmax(self.evaluations)
        mask = np.asarray(self.chromosomes[best_index], dtype=bool)
        self.X_ = X[mask]
        self.y_ = y[mask]
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)

        return self.X_, self.y_
Example #26
def test_view_func_NN(model_classifier, model_rpn, model_inner, C):
    test_cls = 'aeroplane'
    input_train_file = 'pickle_data/train_data_Wflip_all.pickle'

    ## read the training data from pickle file or from annotations
    test_pickle = 'pickle_data/test_data_{}.pickle'.format(test_cls)
    if os.path.exists(test_pickle):
        with open(test_pickle, 'rb') as f:
            all_imgs, classes_count, _ = pickle.load(f)

    class_mapping = C.class_mapping
    inv_class_mapping = {v: k for k, v in class_mapping.items()}
    backend = K.image_dim_ordering()
    gt_cls_num = class_mapping[test_cls]
    print('work on class {}'.format(test_cls))
    base_path = os.getcwd()

    # turn off any data augmentation at test time
    C.use_horizontal_flips = False
    C.use_vertical_flips = False
    C.rot_90 = False
    count = 0
    good_img = 0
    not_good = 0

    def format_img_size(img, C):
        """ formats the image size based on config """
        img_min_side = float(C.im_size)
        (height, width, _) = img.shape

        if width <= height:
            ratio = img_min_side / width
            new_height = int(ratio * height)
            new_width = int(img_min_side)
        else:
            ratio = img_min_side / height
            new_width = int(ratio * width)
            new_height = int(img_min_side)
        img = cv2.resize(img, (new_width, new_height),
                         interpolation=cv2.INTER_CUBIC)
        return img, ratio

    def format_img_channels(img, C):
        """ formats the image channels based on config """
        img = img[:, :, (2, 1, 0)]
        img = img.astype(np.float32)
        img[:, :, 0] -= C.img_channel_mean[0]
        img[:, :, 1] -= C.img_channel_mean[1]
        img[:, :, 2] -= C.img_channel_mean[2]
        img /= C.img_scaling_factor
        img = np.transpose(img, (2, 0, 1))
        img = np.expand_dims(img, axis=0)
        return img

    def format_img(img, C):
        """ formats an image for model prediction based on config """
        img, ratio = format_img_size(img, C)
        img = format_img_channels(img, C)
        return img, ratio

    def display_image(img):
        img1 = img[:, :, (2, 1, 0)]
        # img1=img
        im = Image.fromarray(img1.astype('uint8'), 'RGB')
        im.show()

    # Method to transform the coordinates of the bounding box to its original size
    def get_real_coordinates(ratio, x1, y1, x2, y2):
        real_x1 = int(round(x1 / ratio))
        real_y1 = int(round(y1 / ratio))
        real_x2 = int(round(x2 / ratio))
        real_y2 = int(round(y2 / ratio))
        return (real_x1, real_y1, real_x2, real_y2)

    vnum_test = 24
    azimuth_vec = np.concatenate(
        ([0],
         np.linspace((360. / (vnum_test * 2)), 360. -
                     (360. / (vnum_test * 2)), vnum_test)),
        axis=0)

    def find_interval(azimuth, azimuth_vec):
        for i in range(len(azimuth_vec)):
            if azimuth < azimuth_vec[i]:
                break
        ind = i
        if azimuth > azimuth_vec[-1]:
            ind = 1
        return ind

    class_mapping = C.class_mapping

    if 'bg' not in class_mapping:
        class_mapping['bg'] = len(class_mapping)

    class_mapping = {v: k for k, v in class_mapping.items()}
    # print(class_mapping)
    class_to_color = {
        class_mapping[v]: np.random.randint(0, 255, 3)
        for v in class_mapping
    }
    C.num_rois = 32

    obj_num = 0
    bbox_threshold_orig = 0.6
    th_bbox = 0.4

    ## get GT for all az for single cls
    feature_az = []
    sorted_path = input_train_file
    tmp_ind = sorted_path.index('.pickle')
    sorted_path = sorted_path[:tmp_ind] + "_sorted_Angles" + sorted_path[tmp_ind:]
    if os.path.exists(sorted_path):
        print("loading sorted data")
        with open(sorted_path, 'rb') as f:
            trip_data = pickle.load(f)
    im_file = []
    ind = []
    for ii in range(360):
        for jj in range(3):
            try:
                im_file.append(trip_data[test_cls][ii][jj])
                ind.append(ii)
            except (KeyError, IndexError):
                if jj == 0:
                    print('no azimuth {}'.format(ii))
    data_gen_train = data_generators.get_anchor_gt(im_file, [],
                                                   C,
                                                   K.image_dim_ordering(),
                                                   mode='test')
    azimuth_dict = []
    inner_NN = []
    azimuths = []
    for tt in range(len(ind)):
        try:
            if tt % 100 == 0:
                print('worked on {}/{}'.format(tt, len(ind)))
            # print ('im num {}'.format(good_img))
            X, Y, img_data = next(data_gen_train)

            P_rpn = model_rpn.predict_on_batch(X)

            R = roi_helpers.rpn_to_roi(P_rpn[0],
                                       P_rpn[1],
                                       C,
                                       K.image_dim_ordering(),
                                       use_regr=True,
                                       overlap_thresh=0.7,
                                       max_boxes=300)

            X2, Y1, Y2, Y_view = roi_helpers.calc_iou_new(
                R, img_data, C, C.class_mapping)

            pos_samples = np.where(Y1[0, :, -1] == 0)
            sel_samples = pos_samples[0].tolist()
            R = X2[0, sel_samples, :]
            for jk in range(R.shape[0] // C.num_rois + 1):
                ROIs = np.expand_dims(R[C.num_rois * jk:C.num_rois *
                                        (jk + 1), :],
                                      axis=0)
                if ROIs.shape[1] == 0:
                    break

                if jk == R.shape[0] // C.num_rois:
                    # pad R
                    curr_shape = ROIs.shape
                    target_shape = (curr_shape[0], C.num_rois, curr_shape[2])
                    ROIs_padded = np.zeros(target_shape).astype(ROIs.dtype)
                    ROIs_padded[:, :curr_shape[1], :] = ROIs
                    ROIs_padded[0, curr_shape[1]:, :] = ROIs[0, 0, :]
                    ROIs = ROIs_padded

                [P_cls, P_regr, P_view] = model_classifier.predict([X, ROIs])
                iner_f = model_inner.predict([X, ROIs])
                # oo = model_classifier_only.predict([F, ROIs])

                for ii in range(len(sel_samples)):

                    if np.max(P_cls[0, ii, :]) < bbox_threshold_orig or \
                            np.argmax(P_cls[0, ii, :]) == (P_cls.shape[2] - 1):
                        continue

                    ## get class from the net
                    # cls_num = np.argmax(P_cls[0, ii, :])

                    ## use gt class
                    cls_num = gt_cls_num

                    cls_name = inv_class_mapping[cls_num]
                    cls_view = P_view[0, ii, 360 * cls_num:360 * (cls_num + 1)]

                    # azimuths[cls_name].append(np.argmax(cls_view, axis=0))
                    inner_NN.append(iner_f[0, ii, :])
                    azimuth_dict.append(img_data['bboxes'][0]['azimuth'])
        except Exception:
            print('failed on az {}'.format(img_data['bboxes'][0]['azimuth']))
    ## calculating some mean feature map for every az
    with open('pickle_data/{}_NN.pickle'.format(C.weight_name), 'wb') as f:
        pickle.dump([inner_NN, azimuth_dict], f)
        print('saved PICKLE')

    with open('pickle_data/{}_NN.pickle'.format(C.weight_name), 'rb') as f:
        inner_NN, azimuth_dict = pickle.load(f)
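    # A 1-NN classifier over the inner feature vectors acts as a lookup table
    # from feature space to azimuth: each query returns the azimuth of its
    # closest training feature.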
    neigh = KNeighborsClassifier(n_neighbors=1)
    neigh.fit(inner_NN, azimuth_dict)

    jj = 0
    for im_file in all_imgs:
        jj += 1
        if jj % 50 == 0:
            print(jj)
        filepath = im_file['filepath']
        img = cv2.imread(filepath)
        img_gt = np.copy(img)
        if img is None:
            not_good += 1
            continue
        else:
            good_img += 1
            # print ('im num {}'.format(good_img))
        X, ratio = format_img(img, C)

        if backend == 'tf':
            X = np.transpose(X, (0, 2, 3, 1))

        # get the feature maps and output from the RPN
        Y1, Y2 = model_rpn.predict(X)
        R = roi_helpers.rpn_to_roi(Y1,
                                   Y2,
                                   C,
                                   K.image_dim_ordering(),
                                   overlap_thresh=0.7)
        # convert from (x1,y1,x2,y2) to (x,y,w,h)
        R[:, 2] -= R[:, 0]
        R[:, 3] -= R[:, 1]

        width, height = int(im_file["width"]), int(im_file["height"])
        resized_width, resized_height = data_generators.get_new_img_size(
            width, height, C.im_size)
        # [_,_, F] = model_rpn.predict(X)
        ROIs = []
        ## pass on all the labels in the image, some of them are not equal to test_cls
        for bbox_gt in im_file['bboxes']:
            no_bbox_flag = 1
            bbox_threshold = bbox_threshold_orig
            if not bbox_gt['class'] == test_cls:
                continue
            if bbox_gt['class'] == test_cls and bbox_threshold == bbox_threshold_orig:
                obj_num += 1
            while no_bbox_flag and bbox_threshold > th_bbox:
                cls_gt = bbox_gt['class']
                az_gt = bbox_gt['azimuth']
                el_gt = bbox_gt['elevation']
                t_gt = bbox_gt['tilt']
                if len(ROIs) == 0:
                    # apply the spatial pyramid pooling to the proposed regions
                    bboxes = {}
                    probs = {}
                    azimuths = {}
                    inner_res = {}
                    # print ('obj num {}'.format(obj_num))

                    for jk in range(R.shape[0] // C.num_rois + 1):
                        ROIs = np.expand_dims(R[C.num_rois * jk:C.num_rois *
                                                (jk + 1), :],
                                              axis=0)
                        if ROIs.shape[1] == 0:
                            break

                        if jk == R.shape[0] // C.num_rois:
                            #pad R
                            curr_shape = ROIs.shape
                            target_shape = (curr_shape[0], C.num_rois,
                                            curr_shape[2])
                            ROIs_padded = np.zeros(target_shape).astype(
                                ROIs.dtype)
                            ROIs_padded[:, :curr_shape[1], :] = ROIs
                            ROIs_padded[0, curr_shape[1]:, :] = ROIs[0, 0, :]
                            ROIs = ROIs_padded

                        [P_cls, P_regr,
                         P_view] = model_classifier.predict([X, ROIs])
                        inner_out = model_inner.predict([X, ROIs])
                        # oo = model_classifier_only.predict([F, ROIs])

                        for ii in range(P_cls.shape[1]):

                            if np.max(P_cls[0, ii, :]) < bbox_threshold or \
                                    np.argmax(P_cls[0, ii, :]) == (P_cls.shape[2] - 1):
                                continue

                            ## get class from the net
                            # cls_num = np.argmax(P_cls[0, ii, :])

                            ## use gt class
                            cls_num = gt_cls_num

                            cls_name = inv_class_mapping[cls_num]
                            cls_view = P_view[0, ii, 360 * cls_num:360 *
                                              (cls_num + 1)]

                            if cls_name not in bboxes:
                                bboxes[cls_name] = []
                                probs[cls_name] = []
                                azimuths[cls_name] = []
                                inner_res[cls_name] = []

                            (x, y, w, h) = ROIs[0, ii, :]

                            try:
                                (tx, ty, tw,
                                 th) = P_regr[0, ii,
                                              4 * cls_num:4 * (cls_num + 1)]
                                tx /= C.classifier_regr_std[0]
                                ty /= C.classifier_regr_std[1]
                                tw /= C.classifier_regr_std[2]
                                th /= C.classifier_regr_std[3]
                                x, y, w, h = roi_helpers.apply_regr(
                                    x, y, w, h, tx, ty, tw, th)
                            except Exception:
                                pass
                            bboxes[cls_name].append([
                                C.rpn_stride * x, C.rpn_stride * y,
                                C.rpn_stride * (x + w), C.rpn_stride * (y + h)
                            ])
                            probs[cls_name].append(np.max(P_cls[0, ii, :]))
                            azimuths[cls_name].append(
                                np.argmax(cls_view, axis=0))
                            inner_res[cls_name].append(inner_out[0, ii, :])

                # cv2.rectangle(img_gt, (bbox_gt['x1'], bbox_gt['y1']), (bbox_gt['x2'], bbox_gt['y2']), (int(class_to_color[test_cls][0]), int(class_to_color[test_cls][1]), int(class_to_color[test_cls][2])), 2)
                for key in bboxes:
                    # if 1:
                    if key == test_cls and bbox_gt['class'] == test_cls:
                        bbox = np.array(bboxes[key])
                        prob = np.array(probs[key])
                        azimuth = np.array(azimuths[key])
                        inner_result = np.array(inner_res[key])
                        # img = draw_bbox(img,bbox, prob, azimuth, ratio)
                        azimuth = neigh.predict(inner_result)
                        ## get the azimuth from bbox that have more than 'overlap_thresh' overlap with gt_bbox
                        az = []
                        overlap_thresh = 0.5
                        try:
                            while np.size(az) == 0 and overlap_thresh > 0:
                                _, prob_bbox, az = roi_helpers.overlap_with_gt(
                                    bbox,
                                    prob,
                                    azimuth,
                                    bbox_gt,
                                    ratio=ratio,
                                    overlap_thresh=overlap_thresh,
                                    max_boxes=300,
                                    use_az=True)
                                overlap_thresh -= 0.1
                            if np.size(az) == 0:
                                # the exact `overlap_thresh == 0` test never
                                # fired because of float drift; check az instead
                                print("No good Bbox was found")
                            counts = np.bincount(az)
                        except Exception:
                            az = []
                            counts = []
                        try:
                            az_fin = np.argmax(counts)
                            true_bin = find_interval(az_gt, azimuth_vec)
                            prob_bin = find_interval(az_fin, azimuth_vec)
                            no_bbox_flag = 0
                            if true_bin == prob_bin:
                                count += 1
                                break
                        except ValueError:
                            # np.argmax raises ValueError when `counts` is empty
                            no_bbox_flag = 1
                            bbox_threshold -= 0.1

                    ## azimuth calculations

                    ## display

                bbox_threshold -= 0.1

    succ = float(count) / float(obj_num) * 100.
    print('for class {}: true count is {} out of {} from {} images. {}% success'.format(
        test_cls, count, obj_num, good_img, succ))
    return succ
예제 #27
0
                    azimuths[cls_name].append(np.argmax(cls_view, axis=0))
                    inner_res[cls_name].append(inner_out[0, ii, :])

            all_dets = []
            if len(bboxes) == 0:
                bbox_threshold -= 0.1
            # cv2.rectangle(img_gt, (bbox_gt['x1'], bbox_gt['y1']), (bbox_gt['x2'], bbox_gt['y2']), (int(class_to_color[test_cls][0]), int(class_to_color[test_cls][1]), int(class_to_color[test_cls][2])), 2)
            for key in bboxes:
                # if 1:
                if key == test_cls and bbox_gt['class'] == test_cls:
                    bbox = np.array(bboxes[key])
                    prob = np.array(probs[key])
                    azimuth = np.array(azimuths[key])
                    inner_result = np.array(inner_res[key])
                    # img = draw_bbox(img,bbox, prob, azimuth, ratio)
                    azimuth = neigh.predict(inner_result)
                    ## get the azimuth from bbox that have more than 'overlap_thresh' overlap with gt_bbox
                    az = []
                    overlap_thresh = 0.5
                    try:
                        while np.size(az) == 0 and overlap_thresh > 0:
                            _, prob_bbox, az = roi_helpers.overlap_with_gt(
                                bbox,
                                prob,
                                azimuth,
                                bbox_gt,
                                ratio=ratio,
                                overlap_thresh=overlap_thresh,
                                max_boxes=300,
                                use_az=True)
                            overlap_thresh -= 0.1
예제 #28
0
def KFoldCrossValidation(train_and_test_indexes,
                         X_data_frame,
                         y_data_frame,
                         k_value=3,
                         kcv_value=9,
                         smote=True,
                         debug=False):
    train_indexes = train_and_test_indexes[0]
    #print('Train Indexes:',train_indexes)
    test_indexes = train_and_test_indexes[1]
    #print('Test Indexes:',test_indexes)

    knn = KNeighborsClassifier(n_neighbors=k_value)

    #if debug:
    #print("Train Index: ", train_index, "\n")
    #print("Test Index: ", test_index, "\n")

    # STEP 1: split data between test and train sets
    if debug:
        print('* Starting train and test sets splitting... ', end='')

    y_data = np.ravel(y_data_frame)  # Added to solve column-vector issue

    X_train, X_test = X_data_frame[train_indexes], X_data_frame[test_indexes]
    y_train, y_test = y_data[train_indexes], y_data[test_indexes]
    #print('y_data[test_indexes]:',y_data[test_indexes])
    if debug:
        print('Done!')

    # print the shapes of the new X objects
    if debug:
        print('* Display X and y objects\'s shape:')
        print('\t X_train.shape: ', X_train.shape)
        print('\t X_test.shape: ', X_test.shape)
        print('\t y_train.shape: ', y_train.shape)
        print('\t y_test.shape: ', y_test.shape)

    # SMOTE HERE

    if smote:
        # Oversampling training data using SMOTE
        if debug:
            print('* Starting to oversample training data using SMOTE...')
            print(
                '\t -Number of instances inside TRAIN set from each class BEFORE applying SMOTE=',
                (sum(y_train == 0), sum(y_train == 1), sum(y_train == 2)))
            print(
                '\t -Number of instances inside TEST set from each class BEFORE applying SMOTE=',
                (sum(y_test == 0), sum(y_test == 1), sum(y_test == 2)))

        from imblearn.over_sampling import SMOTE
        smt = SMOTE()
        # fit_resample replaced the older fit_sample API in recent imblearn
        X_train, y_train = smt.fit_resample(X_train, y_train)

        if debug:
            print('\t -Number of instances from each class AFTER applying SMOTE=',
                  (sum(y_train == 0), sum(y_train == 1), sum(y_train == 2)))

    #print('y_train:',y_train)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    #print('y_test:',y_data[test_indexes])
    #print('y_pred=',y_pred)

    # comparing actual response values (y_test) with predicted response values (y_pred)
    this_accuracy = metrics.accuracy_score(y_test, y_pred)
    this_confusion_matrix = metrics.confusion_matrix(y_test,
                                                     y_pred,
                                                     labels=None,
                                                     sample_weight=None)

    return this_accuracy, this_confusion_matrix
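
A hedged usage sketch for the helper above. It assumes the module already imports numpy as np, metrics from sklearn, and KNeighborsClassifier, as the function body requires; the dataset, fold count, and k are illustrative:

# Hedged usage sketch: builds the (train_idx, test_idx) pairs that
# KFoldCrossValidation expects, one per fold.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold

X, y = load_iris(return_X_y=True)

accs = []
for split in KFold(n_splits=9, shuffle=True, random_state=0).split(X):
    acc, cm = KFoldCrossValidation(split, X, y, k_value=3, smote=False)
    accs.append(acc)
print('mean accuracy:', np.mean(accs))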
예제 #29
0
            out = Trainer.model.semantics(data, 1, 1, 1.0)

            n = data.shape[0]
            Xtrain[counter:counter +
                   n] = out[0].detach().cpu().numpy().reshape(n, -1)
            Ytrain[counter:counter + n] = labels.numpy()
            counter += n

        counter = 0
        for i, (data, labels) in enumerate(testloader):
            # Feed forward data
            data = data.reshape(-1, *Trainer.input_shape).to(torch.float32).to(
                Trainer.device)
            out = Trainer.model.semantics(data, 1, 1, 1.0)

            n = data.shape[0]
            Xtest[counter:counter + n] = out[0].detach().cpu().numpy().reshape(
                n, -1)
            Ytest[counter:counter + n] = labels.numpy()
            counter += n

    else:
        raise ValueError('Wrong dataset')

    #%%
    from sklearn.neighbors import KNeighborsClassifier
    classifier = KNeighborsClassifier()
    classifier.fit(Xtrain, Ytrain)
    Ypred = classifier.predict(Xtest)
    print(np.mean(Ypred == Ytest))
예제 #30
0
def plot_decision_boundaries(
    X_train,
    y_train,
    y_pred_train,
    X_test,
    y_test,
    y_pred_test,
    resolution: int = 100,
    embedding=None,
):
    X = np.concatenate([X_train, X_test])
    y = np.concatenate([y_train, y_test])
    y_pred = np.concatenate([y_pred_train, y_pred_test])

    if embedding is None:
        try:
            # import lazily; fall back to t-SNE when umap is unavailable
            import umap

            embedding = umap.UMAP(n_components=2,
                                  random_state=160290).fit_transform(X)
        except ImportError:
            from sklearn.manifold import TSNE

            embedding = TSNE(n_components=2,
                             random_state=160290).fit_transform(X)
    x_min, x_max = safe_bounds(embedding[:, 0])
    y_min, y_max = safe_bounds(embedding[:, 1])
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                         np.linspace(y_min, y_max, resolution))

    # approximate Voronoi tesselation on resolution x resolution grid using 1-NN
    background_model = KNeighborsClassifier(n_neighbors=1).fit(
        embedding, y_pred)
    voronoi_bg = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
    voronoi_bg = voronoi_bg.reshape((resolution, resolution))

    mesh = hv.QuadMesh((xx, yy, voronoi_bg)).opts(cmap="viridis", alpha=0.6)
    points_train = hv.Scatter(
        {
            "x": embedding[:len(y_train), 0],
            "y": embedding[:len(y_train), 1],
            "pred": y_pred_train,
            "class": y_train,
        },
        kdims=["x", "y"],
        vdims=["pred", "class"],
    )
    points_test = hv.Scatter(
        {
            "x": embedding[len(y_train):, 0],
            "y": embedding[len(y_train):, 1],
            "pred": y_pred_test,
            "class": y_test,
        },
        kdims=["x", "y"],
        vdims=["pred", "class"],
    )
    errors = y_pred != y
    failed_points = hv.Scatter(
        {
            "x": embedding[errors, 0],
            "y": embedding[errors, 1]
        },
        kdims=["x", "y"]).opts(color="red", size=2, alpha=0.9)

    points_train = points_train.opts(color="class",
                                     cmap="viridis",
                                     line_color="grey",
                                     size=10,
                                     alpha=0.8,
                                     tools=["hover"])
    points_test = points_test.opts(
        color="class",
        cmap="viridis",
        line_color="grey",
        size=10,
        alpha=0.8,
        tools=["hover"],
        marker="square",
    )
    plot = mesh * points_train * points_test * failed_points
    plot = plot.opts(xaxis=None,
                     yaxis=None,
                     width=500,
                     height=450,
                     title="Decision boundaries")
    return plot
예제 #31
0
class ENN(InstanceReductionMixin):

    """Edited Nearest Neighbors.

    The Edited Nearest Neighbors removes the instances on the decision
    boundaries, maintaining redundant samples.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminate, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminate]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.enn import ENN
    >>> import numpy as np
    >>> X = np.array([[-1, 0], [-0.8, 1], [-0.8, -1], [-0.5, 0] , [0.5, 0], [1, 0], [0.8, 1], [0.8, -1]])
    >>> y = np.array([1, 1, 1, 2, 1, 2, 2, 2])
    >>> editednn = ENN()
    >>> editednn.fit(X, y)
    ENN(n_neighbors=3)
    >>> print(editednn.predict([[-0.6, 0.6]]))
    [1]
    >>> print(editednn.reduction_)
    0.75

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Ruiqin Chang, Zheng Pei, and Chao Zhang. A modified editing k-nearest
    neighbor rule. JCP, 6(7):1493–1500, 2011.

    """

    def __init__(self, n_neighbors=3):
        self.n_neighbors = n_neighbors
        self.classifier = None


    def reduce_data(self, X, y):
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        # check_X_y replaces the long-removed sklearn check_arrays helper
        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes

        if self.n_neighbors >= len(X):
            self.X_ = np.array(X)
            self.y_ = np.array(y)
            self.reduction_ = 0.0
            return self.X_, self.y_

        mask = np.zeros(y.size, dtype=bool)

        tmp_m = np.ones(y.size, dtype=bool)
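        # Leave-one-out editing: temporarily drop sample i, fit k-NN on the
        # rest, and keep i only if its neighbors agree with its label.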
        for i in range(y.size):
            tmp_m[i] = not tmp_m[i]
            self.classifier.fit(X[tmp_m], y[tmp_m])
            sample, label = X[i], y[i]

            # predict expects a 2-D array, so reshape the single sample
            if self.classifier.predict(sample.reshape(1, -1)) == [label]:
                mask[i] = not mask[i]

            tmp_m[i] = not tmp_m[i]

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
예제 #32
0
print(label_train.shape)
print(label_test.shape)

# Create the learning model and fit it right away.
clf = KNeighborsClassifier(n_neighbors=3).fit(data_train, label_train)  # improved variant
print(clf)

# Validate the model
cross_vali = model_selection.cross_val_score(clf,
                                             data_train,
                                             label_train,
                                             cv=5)
print('Accuracy of each fold: ', cross_vali)
print('Mean validation accuracy: ', cross_vali.mean())

pred = clf.predict(data_test)
print(data_test)  # 28650    0.91   0.835 ... 45834    0.37   0.975
print(pred)  # ['fat' 'fat' 'fat' ... 'fat' 'thin' 'thin']

ac_score = metrics.accuracy_score(label_test, pred)  # test labels vs. model output
print('Accuracy: ', ac_score)
cl_report = metrics.classification_report(label_test, pred)
print('Report: ', cl_report)

# Visualization
tbl2 = pd.read_csv("bmi.csv", index_col=2)
print(tbl2.tail(3))  # only the last 3 rows

fig = plt.figure()  # start building the figure to save
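
The snippet cuts off right after creating the figure; a hedged sketch of the per-class scatter plot it presumably continues with follows (the "height"/"weight" column names and the label values are assumptions about bmi.csv):

# Hedged sketch of the visualization this example presumably continues with.
# Assumes bmi.csv has "height" and "weight" columns and that column 2 (the
# index set above) holds the labels 'thin' / 'normal' / 'fat'.
def scatter(lbl, color):
    rows = tbl2.loc[lbl]
    plt.scatter(rows["weight"], rows["height"], c=color, label=lbl, s=10)

scatter("fat", "red")
scatter("normal", "yellow")
scatter("thin", "purple")
plt.legend()
fig.savefig("bmi_scatter.png")  # hypothetical output filename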

예제 #33
0
File: baseNew.py Project: dvro/ml
class InstanceReductionMixin(InstanceReductionBase, ClassifierMixin):

    """Mixin class for all instance reduction techniques"""


    def set_classifier(self, classifier):
        """Sets the classifier to be used in the instance reduction
        process and in classification.

        Parameters
        ----------
        classifier : classifier, following the KNeighborsClassifier style
            (default = KNN)
        """

        self.classifier = classifier


    def reduce_data(self, X, y):
        """Perform the instance reduction procedure on the given training data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training set.

        y : array-like, shape = [n_samples]
            Labels for X.

        Returns
        -------
        X_ : array-like, shape = [indeterminate, n_features]
            Resulting training set.

        y_ : array-like, shape = [indeterminate]
            Labels for X_
        """
        pass
    
    def get_prototypes(self):
        return self.X_, self.y_

    def fit(self, X, y, reduce_data=True):
        """
        Fit the InstanceReduction model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
            Note that centroid shrinking cannot be used with sparse matrices.
        y : array, shape = [n_samples]
            Target values (integers)
        reduce_data : bool, flag indicating if the reduction should be performed
        """
        self.X = X
        self.y = y
        self.labels = set(y)
        self.prototypes = None
        self.prototypes_labels = None
        self.reduction_ratio = 0.0

        if reduce_data:
            self.reduce_data(X, y)

        return self

    def predict(self, X, n_neighbors=1):
        """Perform classification on an array of test vectors X.

        The predicted class C for each sample in X is returned.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array, shape = [n_samples]

        Notes
        -----
        The default prediction uses KNeighborsClassifier; if the
        instance reduction algorithm is to be used with another
        classifier, this method should be explicitly overridden and
        documented accordingly.
        """
        X = check_array(X, accept_sparse="csr")
        if not hasattr(self, "X_") or self.X_ is None:
            raise AttributeError("Model has not been trained yet.")

        if not hasattr(self, "y_") or self.y_ is None:
            raise AttributeError("Model has not been trained yet.")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict(X)


    def predict_proba(self, X):
        """Return probability estimates for the test data X.
        after a given prototype selection algorithm.
    
        Parameters
        ----------
        X : array, shape = (n_samples, n_features)
            A 2-D array representing the test points.
        
        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
        of such arrays if n_outputs > 1.
        The class probabilities of the input samples. Classes are ordered
        by lexicographic order.
        """
        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict_proba(X)
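
To make the mixin's contract concrete, here is a hedged sketch of a toy subclass; KeepEveryOther and its selection rule are hypothetical, and it assumes InstanceReductionBase.__init__ needs no arguments. Only the reduce_data/fit/predict contract comes from the snippet itself:

# Hypothetical toy reducer illustrating the mixin contract above.
class KeepEveryOther(InstanceReductionMixin):
    """Keeps every other training sample (illustration only)."""

    def __init__(self):
        self.classifier = None

    def reduce_data(self, X, y):
        self.X_, self.y_ = X[::2], y[::2]
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_

# Usage: fit() triggers reduce_data(), predict() runs 1-NN on the prototypes.
# KeepEveryOther().fit(X, y).predict(X_new)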
예제 #34
0
def demo():
    """ _test_knn_adwin

    This demo tests the KNNAdwin classifier on a file stream, which gives 
    instances coming from a SEA generator. 
    
    The test computes the performance of the KNNAdwin classifier as well as 
    the time to create the structure and classify max_samples (10000 by 
    default) instances.
    
    """
    start = timer()
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    # warnings.filterwarnings("ignore", ".*Passing 1d.*")
    stream = FileStream('../data/datasets/sea_big.csv', -1, 1)
    # stream = RandomRBFGeneratorDrift(change_speed=41.00, n_centroids=50, model_random_state=32523423,
    #                                  sample_seed=5435, n_classes=2, num_att=10, num_drift_centroids=50)
    stream.prepare_for_use()
    t = OneHotToCategorical([[10, 11, 12, 13],
                             [
                                 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                                 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
                                 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
                                 47, 48, 49, 50, 51, 52, 53
                             ]])
    t2 = OneHotToCategorical([[10, 11, 12, 13],
                              [
                                  14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                                  25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
                                  36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
                                  47, 48, 49, 50, 51, 52, 53
                              ]])

    # knn = KNN(n_neighbors=8, max_window_size=2000, leaf_size=40)
    knn = KNNAdwin(n_neighbors=8, leaf_size=40, max_window_size=2000)
    # pipe = Pipeline([('one_hot_to_categorical', t), ('KNN', knn)])

    compare = KNeighborsClassifier(n_neighbors=8,
                                   algorithm='kd_tree',
                                   leaf_size=40,
                                   metric='euclidean')
    # pipe2 = Pipeline([('one_hot_to_categorical', t2), ('KNN', compare)])
    first = True
    train = 200
    if train > 0:
        X, y = stream.next_sample(train)
        # pipe.partial_fit(X, y, classes=stream.target_values)
        # pipe.partial_fit(X, y, classes=stream.target_values)
        # pipe2.fit(X, y)

        knn.partial_fit(X, y, classes=stream.target_values)
        compare.fit(X, y)
        first = False
    n_samples = 0
    max_samples = 10000
    my_corrects = 0
    compare_corrects = 0

    while n_samples < max_samples:
        if n_samples % (max_samples // 20) == 0:
            logging.info('%s%%', str(n_samples // (max_samples // 20) * 5))
        X, y = stream.next_sample()
        # my_pred = pipe.predict(X)
        my_pred = knn.predict(X)
        # my_pred = [1]
        if first:
            # pipe.partial_fit(X, y, classes=stream.target_values)
            # pipe.partial_fit(X, y, classes=stream.target_values)
            knn.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            # pipe.partial_fit(X, y)
            knn.partial_fit(X, y)
        # compare_pred = pipe2.predict(X)
        compare_pred = compare.predict(X)
        if y[0] == my_pred[0]:
            my_corrects += 1
        if y[0] == compare_pred[0]:
            compare_corrects += 1
        n_samples += 1

    end = timer()

    print('Evaluation time: ' + str(end - start))
    print(str(n_samples) + ' samples analyzed.')
    print('My performance: ' + str(my_corrects / n_samples))
    print('Compare performance: ' + str(compare_corrects / n_samples))
예제 #35
0
def plot_decision_boundaries(
    X_train,
    y_train,
    y_pred_train,
    X_test,
    y_test,
    y_pred_test,
    resolution: int = 100,
    embedding=None,
    figsize=(9, 8),
    cmap="viridis",
    title: str = "Decision boundaries",
    s=200,
):
    X = np.concatenate([X_train, X_test])
    y = np.concatenate([y_train, y_test])
    y_pred = np.concatenate([y_pred_train, y_pred_test])

    if embedding is None:
        try:
            # import here so a missing umap package falls back to the t-SNE branch
            import umap

            embedding = umap.UMAP(n_components=2,
                                  random_state=160290).fit_transform(X)
        except ImportError:
            from sklearn.manifold import TSNE

            embedding = TSNE(n_components=2,
                             random_state=160290).fit_transform(X)
    x_min, x_max = safe_bounds(embedding[:, 0])
    y_min, y_max = safe_bounds(embedding[:, 1])
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                         np.linspace(y_min, y_max, resolution))

    # approximate Voronoi tesselation on resolution x resolution grid using 1-NN
    background_model = KNeighborsClassifier(n_neighbors=1).fit(
        embedding, y_pred)
    voronoi_bg = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
    voronoi_bg = voronoi_bg.reshape((resolution, resolution))
    fig, ax = plt.subplots(figsize=figsize)
    ax.pcolormesh(xx, yy, voronoi_bg, cmap=cmap, alpha=0.1)
    emb_train = embedding[:len(y_train)]
    data = pd.DataFrame({
        "x": emb_train[:, 0],
        "y": emb_train[:, 1],
        "target": y_train
    })
    data.plot.scatter(x="x",
                      y="y",
                      c="target",
                      cmap=cmap,
                      s=s,
                      colorbar=False,
                      ax=ax,
                      alpha=0.7,
                      label="train set")
    emb_test = embedding[len(y_train):]
    data = pd.DataFrame({
        "x": emb_test[:, 0],
        "y": emb_test[:, 1],
        "target": y_test
    })
    data.plot.scatter(x="x",
                      y="y",
                      c="target",
                      cmap=cmap,
                      s=s,
                      colorbar=False,
                      ax=ax,
                      alpha=0.7,
                      marker="s",
                      label="test set")
    errors = y_pred != y
    failed_points = ax.scatter(embedding[errors, 0],
                               embedding[errors, 1],
                               c="red",
                               s=50,
                               alpha=0.9,
                               label="errors")
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xlabel(None)
    plt.ylabel(None)
    plt.legend()
    if title is not None:
        plt.title(title, fontsize=22)
    return fig, ax
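
A hedged usage sketch for the plotting helper above. safe_bounds is not shown in the snippet, so a plausible padded min/max helper is assumed here (it would need to live in the same module as plot_decision_boundaries, which also assumes numpy, pandas, and matplotlib are imported as np, pd, and plt):

# Hedged usage sketch; dataset, classifier, and safe_bounds are illustrative.
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

def safe_bounds(values, margin=0.05):
    # assumed helper: slightly padded (min, max) of a 1-D array
    lo, hi = values.min(), values.max()
    pad = (hi - lo) * margin
    return lo - pad, hi + pad

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
knn = KNeighborsClassifier(n_neighbors=5).fit(X_tr, y_tr)
fig, ax = plot_decision_boundaries(X_tr, y_tr, knn.predict(X_tr),
                                   X_te, y_te, knn.predict(X_te))
plt.show()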
예제 #36
0
File: main.py Project: laurogama/mlpython
def knn_score(X, y, neighbors):
    knn5 = KNeighborsClassifier(n_neighbors=neighbors)
    knn5.fit(X, y)
    y_pred = knn5.predict(X)
    print("KNN{} accuracy_score: {}".format(neighbors,
                                            metrics.accuracy_score(y, y_pred)))
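
Note that knn_score fits and scores on the same data, so the reported accuracy is optimistic. A hedged cross-validated variant could look like this (knn_cv_score is a new name, not part of the original project):

# Hedged variant: cross-validated accuracy instead of training-set accuracy.
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

def knn_cv_score(X, y, neighbors, cv=5):
    knn = KNeighborsClassifier(n_neighbors=neighbors)
    scores = cross_val_score(knn, X, y, cv=cv)
    print("KNN{} cross-val accuracy: {:.3f} +/- {:.3f}".format(
        neighbors, scores.mean(), scores.std()))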
예제 #37
0
from sklearn.neighbors import KNeighborsClassifier
iris = datasets.load_iris()
iris_X = iris.data
iris_y = iris.target
print(np.unique(iris_y))

np.random.seed(0)
indices = np.random.permutation(len(iris_X))
# hold out the last 10 shuffled samples for testing
iris_X_train = iris_X[indices[:-10]]
iris_y_train = iris_y[indices[:-10]]
iris_X_test = iris_X[indices[-10:]]
iris_y_test = iris_y[indices[-10:]]
knn = KNeighborsClassifier()
knn.fit(iris_X_train, iris_y_train)
print("Prediction:")
print(knn.predict(iris_X_test))
print("Actual:")
print(iris_y_test)
i = knn.predict(iris_X_test) == iris_y_test
k = 0
for j in i:
    if not j:
        k += 1
print(len(i))
print("Number of wrong predictions:")
print(k)

diabetes = datasets.load_diabetes()  # diabetes dataset
diabetes_X_train = diabetes.data[:-20]
diabetes_X_test = diabetes.data[-20:]
diabetes_y_train = diabetes.target[:-20]
예제 #38
0
reader = csv.reader(open("reduced_features.csv", "r"), delimiter=",")
X = list(reader)
X = np.array(X)
X = X.astype(np.float)

#create result vector
reader = csv.reader(open("target_output.csv", "r"), delimiter=",")
y = list(reader)
y = np.array(y)
y = y.astype(np.int)
y = y.ravel()

X_Train_embedded = TSNE(n_components=2).fit_transform(X)
print X_Train_embedded.shape
model = KNeighborsClassifier(n_neighbors=1).fit(X, y)
y_predicted = model.predict(X)
# replace the above by your data and model

# create meshgrid
resolution = 1024  # 1024x1024 background pixels
X2d_xmin, X2d_xmax = np.min(X_Train_embedded[:, 0]), np.max(X_Train_embedded[:, 0])
X2d_ymin, X2d_ymax = np.min(X_Train_embedded[:, 1]), np.max(X_Train_embedded[:, 1])
xx, yy = np.meshgrid(np.linspace(X2d_xmin, X2d_xmax, resolution),
                     np.linspace(X2d_ymin, X2d_ymax, resolution))

# approximate Voronoi tesselation on resolution x resolution grid using 1-NN
background_model = KNeighborsClassifier(n_neighbors=1).fit(X_Train_embedded, y_predicted)
예제 #39
0
    # Load data.
    path_weights = "resources/knn_weights.bin"
    # path_train = "resources/crimes_training_ones.bin"
    path_train = "resources/crimes_samples_training.bin"
    # path_tests = "resources/crimes_testing_ones.bin"
    path_tests = "resources/crimes_samples_testing.bin"

    print "Normalizing train"
    crime_train = CrimeData(path_train)
    crime_train.data[:, 22:24], mean_x_y, std_x_y = z_norm_by_feature(crime_train.data[:, 22:24])
    crime_train.data[:, 1:5], mean_time, std_time = z_norm_by_feature(crime_train.data[:, 1:5])
    crime_train.data = np.hstack((crime_train.data[:, 0:24], crime_train.data[:, 141:241]))

    print "Normalizing test"
    crime_test = CrimeData(path_tests)
    crime_test.data[:, 22:24] = z_norm_by_feature(crime_test.data[:, 22:24], mean_x_y, std_x_y)
    crime_test.data[:, 1:5] = z_norm_by_feature(crime_test.data[:, 1:5], mean_time, std_time)
    crime_test.data = np.hstack((crime_test.data[:, 0:24], crime_test.data[:, 141:241]))

    n = 0.1
    for i in range(1, 10):
        n *= 10
        # n_neighbors must be an int; n is a float after the repeated *= 10
        clf = KNeighborsClassifier(n_neighbors=int(n))
        print("Fitting")
        clf.fit(crime_train.data, crime_train.y)
        print("Testing")
        preds = clf.predict(crime_test.data[0:10000])
        print(int(n), np.mean(crime_test.y[0:10000] == preds))
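
The snippet relies on a z_norm_by_feature helper that is not shown. A minimal sketch consistent with both call sites above (statistics computed and returned when omitted, reused when supplied) might be:

# Hedged sketch of the missing helper; the real z_norm_by_feature may differ.
import numpy as np

def z_norm_by_feature(data, mean=None, std=None):
    if mean is None or std is None:
        mean = data.mean(axis=0)
        std = data.std(axis=0)
        std[std == 0] = 1.0  # guard against constant features
        return (data - mean) / std, mean, std
    return (data - mean) / std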