import numpy as np
from sklearn.neighbors import NearestNeighbors, RadiusNeighborsClassifier


def SequentialRadiusNeighborsClassifier(epsilon, X_train, X_test, Y_train):
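    """Sequentially label X_test by radius-epsilon neighbor voting.

    Test points are predicted one at a time, closest to the current training
    set first; each prediction is appended to the training set, and a point
    with no neighbors within epsilon is assigned a brand-new class label.
    """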
    X_train_temp = np.copy(X_train)
    Y_train_temp = np.copy(Y_train)
    Reps = RadiusNeighborsClassifier(radius=epsilon)
    test_size = len(X_test)
    Y_predict = [-1 for x in range(test_size)]
    Y_current = list(set(Y_train))
    test_index = [x for x in range(test_size)]
    for test_time in range(test_size):
        Knn_temp = NearestNeighbors(n_neighbors=1)
        Knn_temp.fit(X_train_temp)
        min_distances = Knn_temp.kneighbors(X_test[test_index])[0]
        min_distances = [np.mean(x) for x in min_distances]
        optimal_indice = min_distances.index(min(min_distances))
        optimal_test = test_index[optimal_indice]
        test_index.remove(optimal_test)
        Reps.fit(X_train_temp, Y_train_temp)
        predict_set = Reps.radius_neighbors(X_test[optimal_test].reshape(
            1, -1))[1]
        predict_set = predict_set[0]
        if predict_set.size > 0:
            y_predict = Reps.predict(X_test[optimal_test].reshape(1, -1))
            y_predict = y_predict[0]
        else:
            y_predict = max(Y_current) + 1
            Y_current.append(y_predict)
        Y_predict[optimal_test] = y_predict
        X_train_temp = np.append(X_train_temp, [X_test[optimal_test]], axis=0)
        Y_train_temp = np.append(Y_train_temp, [y_predict], axis=0)
    return Y_predict
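
A minimal usage sketch on synthetic data (make_blobs and epsilon=1.5 are
illustrative assumptions, not part of the original snippet):

from sklearn.datasets import make_blobs

X_demo, y_demo = make_blobs(n_samples=60, centers=3, random_state=0)
predictions = SequentialRadiusNeighborsClassifier(1.5, X_demo[:40],
                                                  X_demo[40:], y_demo[:40])
print(predictions)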
Example #2
    def clusterFacetSamplesRNN(self, reduceRadius=3):
        """
        cluster the samples of each facet using radius nearest neighbours
        the cluster center and their correspondent normals will be saved
        in self.objsamplepnts_refcls and self.objsamplenrmals_refcls

        :param: reduceRadius: the neighbors that fall inside the reduceradius will be removed
        :return: None

        author: weiwei
        date: 20161130, osaka
        """

        self.objsamplepnts_refcls = np.ndarray(shape=(self.facets.shape[0], ),
                                               dtype=object)
        self.objsamplenrmls_refcls = np.ndarray(shape=(self.facets.shape[0], ),
                                                dtype=object)
        for i, facet in enumerate(self.facets):
            # print "cluster"
            # print i,len(self.facets)
            self.objsamplepnts_refcls[i] = []
            self.objsamplenrmls_refcls[i] = []
            X = self.objsamplepnts_ref[i]
            nX = X.shape[0]
            if nX > 0:
                # the classifier is only used here as a radius-neighbor searcher
                neigh = RadiusNeighborsClassifier(radius=1.0)
                neigh.fit(X, range(nX))
                neigharrays = neigh.radius_neighbors(X,
                                                     radius=reduceRadius,
                                                     return_distance=False)
                delset = set([])
                for j in range(nX):
                    if j not in delset:
                        self.objsamplepnts_refcls[i].append(np.array(X[j]))
                        self.objsamplenrmls_refcls[i].append(
                            np.array(self.objsamplenrmls_ref[i][j]))
                        delset.update(neigharrays[j].tolist())
            if self.objsamplepnts_refcls[i]:
                self.objsamplepnts_refcls[i] = np.vstack(
                    self.objsamplepnts_refcls[i])
                self.objsamplenrmls_refcls[i] = np.vstack(
                    self.objsamplenrmls_refcls[i])
            else:
                self.objsamplepnts_refcls[i] = np.empty(shape=(0, 0))
                self.objsamplenrmls_refcls[i] = np.empty(shape=(0, 0))
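
The greedy radius reduction at the heart of this method can be sketched
standalone. A hedged illustration using NearestNeighbors on synthetic points
(not the class's own API):

import numpy as np
from sklearn.neighbors import NearestNeighbors


def reduce_by_radius(points, radius=3.0):
    # keep a point, then discard every not-yet-kept point within `radius` of it
    neigh = NearestNeighbors().fit(points).radius_neighbors(
        points, radius=radius, return_distance=False)
    deleted, kept = set(), []
    for j in range(len(points)):
        if j not in deleted:
            kept.append(j)
            deleted.update(neigh[j].tolist())
    return points[kept]


pts = np.random.RandomState(0).rand(100, 3) * 10.0
print(reduce_by_radius(pts).shape)  # kept points are pairwise more than radius apart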
Example #3
import numpy as np
from sklearn.neighbors import RadiusNeighborsClassifier


def nncut_proc(distance, dt, dr, cut_type):
    """Filter rows of dt/dr by their distance from the origin: 'inner' drops
    the rows within `distance`, 'outer' keeps only those rows."""
    if dt.shape[0] == 0:
        return [dt, dr]
    # the classifier is fitted with dummy all-zero labels; it is only used
    # here as a radius-neighbor searcher around the origin
    nbrs = RadiusNeighborsClassifier().fit(
        dt,
        np.zeros_like(dr).reshape(dt.shape[0], ))
    colcnt = dt.shape[1]
    middle = nbrs.radius_neighbors(np.zeros(colcnt).reshape(1, colcnt),
                                   distance,
                                   return_distance=False)
    if cut_type == 'inner':
        dt = dt.drop(dt.index[np.asarray(middle[0])])
        dr = dr.drop(dr.index[np.asarray(middle[0])])
    if cut_type == 'outer':
        dt = dt[dt.index.isin(dt.index[np.asarray(middle[0])])]
        dr = dr[dr.index.isin(dr.index[np.asarray(middle[0])])]
    return [dt, dr]
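
A hedged usage sketch with toy DataFrames (the data and the distance
threshold are assumptions for illustration):

import pandas as pd

dt_demo = pd.DataFrame([[0.1, 0.2], [3.0, 4.0], [0.5, -0.3]])
dr_demo = pd.DataFrame([[1.0], [2.0], [3.0]])
dt_out, dr_out = nncut_proc(1.0, dt_demo, dr_demo, 'inner')
print(dt_out)  # only the row far from the origin survives the 'inner' cut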
Example #4
from sklearn import linear_model, svm
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier


def Classifier(train_size, new_classes, optimal_test, epsilon_choice,
               X_train_temp, X_test, Y_train_temp, alg):
    clf = RadiusNeighborsClassifier(radius=epsilon_choice,
                                    weights='distance').fit(
                                        X_train_temp, Y_train_temp)
    predict_set = clf.radius_neighbors(X_test[optimal_test].reshape(1, -1))[1]
    predict_set = list(predict_set[0])
    if len(predict_set) > 0:
        if min(Y_train_temp[predict_set]) == max(Y_train_temp[predict_set]):
            return [min(Y_train_temp[predict_set]), predict_set]
        else:
            if alg == "srnc":
                y_predict = clf.predict(X_test[optimal_test].reshape(1, -1))
            else:
                if alg == "svm":
                    clf = svm.SVC().fit(X_train_temp[predict_set],
                                        Y_train_temp[predict_set])
                if alg == "LinearSVC":
                    clf = LinearSVC(max_iter=500000).fit(
                        X_train_temp[predict_set], Y_train_temp[predict_set])
                if alg == "sgd":
                    clf = linear_model.SGDClassifier().fit(
                        X_train_temp[predict_set], Y_train_temp[predict_set])
                if alg == "dt":
                    clf = DecisionTreeClassifier().fit(
                        X_train_temp[predict_set], Y_train_temp[predict_set])
                if alg == "rf":
                    clf = RandomForestClassifier(n_estimators=10).fit(
                        X_train_temp[predict_set], Y_train_temp[predict_set])
                if alg == "gb":
                    clf = GradientBoostingClassifier(n_estimators=10).fit(
                        X_train_temp[predict_set], Y_train_temp[predict_set])
                if alg == "lr":
                    clf = LogisticRegression(max_iter=1000).fit(
                        X_train_temp[predict_set], Y_train_temp[predict_set])
                if alg == "mlp":
                    clf = MLPClassifier().fit(X_train_temp[predict_set],
                                              Y_train_temp[predict_set])
                y_predict = clf.predict(X_test[optimal_test].reshape(1, -1))
            return [y_predict[0], predict_set]
    else:
        return [new_classes, predict_set]
Example #5
    def _transductive_classifier(self, X_train, y_train, test_instance):
        """Predict one test instance from its radius-epsilon neighborhood,
        rejecting to self.new_classes when the neighborhood is empty or the
        local model is not confident enough."""
        clf = RadiusNeighborsClassifier(radius=self.epsilon,
                                        weights='distance').fit(
                                            X_train, y_train)
        predict_set = clf.radius_neighbors(test_instance.reshape(1, -1))[1]
        predict_set = list(predict_set[0])
        if len(predict_set) > 0:
            X_train_local, y_train_local = X_train[predict_set], y_train[
                predict_set]
            if np.min(y_train_local) == np.max(y_train_local):
                prediction = y_train_local[0]
            else:
                clf = self._fit(X_train_local, y_train_local)
                if np.max(clf.predict_proba(test_instance.reshape(
                        1, -1))) < self.threshold_rejection:
                    prediction = self.new_classes
                else:
                    prediction = clf.predict(test_instance.reshape(1, -1))[0]
        else:
            prediction = self.new_classes
        return prediction
Example #6
    def nnradiussmooth(self,
                       columns=None,
                       rescolumn=None,
                       distance=0.2,
                       cycles=1):
        if columns is None:
            columns = range(0, self.dataset_width)
        colcnt = len(columns)
        dt = self.insample_data
        dataset = pd.DataFrame(dt.iloc[:, columns])
        # the classifier is fitted with dummy all-zero labels; it is only
        # used as a radius-neighbor searcher
        nbrs = RadiusNeighborsClassifier().fit(
            dt,
            np.zeros_like(self.insample_res).reshape(
                self.insample_res.shape[0], ))
        nb = nbrs.radius_neighbors(dt, distance, return_distance=False)

        for i in range(0, cycles):
            dr = self.insample_res
            for x in nb:
                # blend each response 80/20 with its neighborhood mean
                mn = self.insample_res.iloc[x, 0].mean()
                dr.iloc[x[0], 0] = dr.iloc[x[0], 0] * 0.8 + mn * 0.2
                self.insample_res = dr
print("Accuracy radius classifier")
print(confusion_matrix(y_test, y_pred_radius))
print(classification_report(y_test, y_pred_radius))

y_pred_radius_for_one = classifier_radius.predict(new_X)

print("radius prediction for one")
print(y_pred_radius_for_one)

print("Accuracy radius classifier for one")
print(confusion_matrix(new_y, y_pred_radius_for_one))
print(classification_report(new_y, y_pred_radius_for_one))

radius_neighbors = classifier_radius.radius_neighbors(X=new_X,
                                                      return_distance=True,
                                                      sort_results=True)

print("radius neighbors")
print("The closest neighbors are ([distance, row_index])")
print(radius_neighbors)

for i in range(0, nr_of_neighbors):
    # the ids of the neighbors: radius_neighbors[1][0][i]
    print(data_df.iloc[radius_neighbors[1][0][i], :])

# graph = classifier.kneighbors_graph(
#   X=new_X, n_neighbors=nr_of_neighbors, mode='distance')
# How to plot the graph?
#plt.figure(figsize=(12, 6))
#plt.plot(graph.toarray(), new_X, color='red', linestyle='dashed', marker='o',)
Example #8

class RadiusNeighborsModel(Classifier):

    """Classifier implementing a vote among neighbors within a given radius
       The radius
       Classifier predicting the labels by counting occurrences among the
       neighbors within a given radius r from a query example.

       In cases where the data is not uniformly sampled,
       radius-based neighbors classifier can be a better choice compared to
       k-nearest neighbors classifier. Points in sparser neighborhoods use fewer
       nearest neighbors for the classification
       For high-dimensional parameter spaces, this method becomes less effective
       due to the so-called “curse of dimensionality”.

       The choice of the radius is highly data-dependent, similarly to k in the
       k-nearest neighbors classifier.
    """
    def __init__(self, radius=1.0, weights='uniform', p=2, metric='minkowski', ranking_size=30):
        """
           :param radius:
           Range of parameter space to use by default for query example
           :param weights:
           The weight function used in prediction.
           Possible values:
            - 'uniform' : uniform weights. All points in each neighborhood are
               weighted equally.
           :param p:
           Power parameter for the Minkowski metric
           :param metric:
           The distance metric to use for the tree. The default metric is
           Minkowski, and with p=2 is equivalent to the standard Euclidean
           metric. Choices are:
            - 'euclidean' for standard Euclidean distance
            - 'manhattan': for the Manhattan distance
            - 'haversine' for distances between (latitude,longitude) points only
            - 'cosine': for cosinus similarity
            - 'minkowski': the Minkowski distance (euclidean if p=2)

           :param how_outliers:
           The way outlier samples (samples with no neighbors on given radius)
           are predicted. Possible values:
           - 'most_common' : return the most common labels in the training set
           - 'random' : return a random label ranking from the training set
           - [callable] : a user-defined function which accepts an example and
              returns a label ranking.
        """
        self.radius = radius
        if weights != 'uniform':
            raise ValueError("Only 'uniform' is supported for the weights parameter")
        self.weights = weights
        self.p = p
        self.metric = metric
        self.ranking_size = ranking_size
        # Scikit-learn Radius neighbors classifier
        self.clf = RadiusNeighborsClassifier(radius=radius,
                                             weights=weights,
                                             p=p,
                                             metric=metric,
                                             n_jobs=-1)
    def fit(self, X, y):

        super().fit(X, y)

        # The way outlier samples (samples with no neighbors on given radius)
        # are predicted is the following: predict only one label, the most
        # common one in the training set
        y_unique, counts = np.unique(y, return_counts=True)
        outlier_label = y_unique[np.argmax(counts)]
        self.outlier_label_ = outlier_label
        self.outlier_proba_ = np.max(counts) / len(y)

    def predict(self, X, return_proba=False):

        # check is fit had been called
        check_is_fitted(self, ['X_', 'y_'])
        # input validation
        X = check_array(X)

        # Compute neighbors indexes and distances for every test example
        # The result points are not necessarily sorted by distance to their
        # query point.

        neigh_distances, neigh_indexes = self.clf.radius_neighbors(X, return_distance=True)
        # neigh_argsorts = [np.argsort(ngh_dist) for ngh_dist in distances]

        y_predicted = list()
        y_predicted_probas = list()
        for indexes, distances in zip(neigh_indexes, neigh_distances):

            if len(indexes) == 0:
                # Outlier: no neighbors within the radius, so fall back to
                # the most common training label computed in fit().
                y_predicted.append([self.outlier_label_])
                y_predicted_probas.append(
                    [self.outlier_proba_] +
                    [0. for k in range(self.ranking_size - 1)])
                continue
            y_neigh = self.y_[indexes]
            y_unique, counts = np.unique(y_neigh, return_counts=True)

            # Get the most frequent labels from the neighbors
            # probability estimate
            probas = counts / len(y_neigh)
            # get the indexes of the sorted probabilities, in decreasing order
            top_predictions = np.flip(np.argsort(probas)[-self.ranking_size:], axis=0)
            y_pred = y_unique[top_predictions]
            y_pred_probas = probas[top_predictions]
            if len(y_unique) < self.ranking_size:
                rank_probas = np.zeros(self.ranking_size)
                rank_probas[:len(y_unique)] = y_pred_probas
                y_pred_probas = rank_probas

            y_predicted.append(y_pred)
            y_predicted_probas.append(y_pred_probas)

        if return_proba:
            return np.array(y_predicted), np.array(y_predicted_probas)

        return np.array(y_predicted)
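
For reference, scikit-learn's RadiusNeighborsClassifier supports the same
most-frequent-label outlier strategy natively through its outlier_label
parameter; a minimal sketch with made-up data:

import numpy as np
from sklearn.neighbors import RadiusNeighborsClassifier

X_toy = np.array([[0.0], [0.1], [5.0]])
y_toy = np.array([0, 0, 1])
clf_toy = RadiusNeighborsClassifier(radius=0.5,
                                    outlier_label='most_frequent').fit(X_toy, y_toy)
# [100.0] has no neighbors within the radius, so it receives the most
# frequent training label instead of raising an error
print(clf_toy.predict(np.array([[0.05], [100.0]])))  # -> [0 0]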
Example #9

import numpy as np
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors, RadiusNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier


def SequentialRadiusNeighborsClassifier(epsilon, X_train, X_test, Y_train, add,
                                        alg):
    #    size_train = len(Y_train)
    X_train_temp = np.copy(X_train)
    Y_train_temp = np.copy(Y_train)
    test_size = len(X_test)
    Y_predict = [-1 for x in range(test_size)]
    Y_current = list(set(Y_train))
    test_index = [x for x in range(test_size)]
    new_indices = []
    epsilon_update = epsilon
    #    epsilon_update = updateEpsilon(distances, test_index, choice)
    for test_time in range(test_size):
        Knn_temp = NearestNeighbors(n_neighbors=1)
        Knn_temp.fit(X_train_temp)
        min_distances = Knn_temp.kneighbors(X_test[test_index])[0]
        min_distances = [np.mean(x) for x in min_distances]
        optimal_indice = min_distances.index(min(min_distances))
        optimal_test = test_index[optimal_indice]
        clf = RadiusNeighborsClassifier(radius=epsilon_update,
                                        weights='distance').fit(
                                            X_train_temp, Y_train_temp)
        predict_set = clf.radius_neighbors(X_test[optimal_test].reshape(1,
                                                                        -1))[1]
        predict_set = list(predict_set[0])
        if len(predict_set) > 0:
            if min(Y_train_temp[predict_set]) == max(Y_train_temp[predict_set]):
                y_predict = min(Y_train_temp[predict_set])
            else:
                if alg == "srnc":
                    y_predict = clf.predict(X_test[optimal_test].reshape(
                        1, -1))
                    y_predict = y_predict[0]
                else:
                    if alg == "svm":
                        clf = svm.SVC().fit(X[predict_set], Y[predict_set])
                    if alg == "LinearSVC":
                        clf = LinearSVC(max_iter=10000).fit(
                            X[predict_set], Y[predict_set])
                    if alg == "dt":
                        clf = DecisionTreeClassifier().fit(
                            X[predict_set], Y[predict_set])
                    if alg == "rf":
                        clf = RandomForestClassifier(n_estimators=10).fit(
                            X[predict_set], Y[predict_set])
                    if alg == "gb":
                        clf = GradientBoostingClassifier(n_estimators=10).fit(
                            X[predict_set], Y[predict_set])
                    if alg == "lr":
                        clf = LogisticRegression(max_iter=10000).fit(
                            X[predict_set], Y[predict_set])
                    if alg == "mlp":
                        clf = MLPClassifier().fit(X[predict_set],
                                                  Y[predict_set])
                    y_predict = clf.predict(X_test[optimal_test].reshape(
                        1, -1))
                    y_predict = y_predict[0]
            if add == 1:
                X_train_temp = np.append(X_train_temp, [X_test[optimal_test]],
                                         axis=0)
                Y_train_temp = np.append(Y_train_temp, [y_predict], axis=0)
        else:
            y_predict = max(Y_current) + 1
            Y_current.append(y_predict)
            X_train_temp = np.append(X_train_temp, [X_test[optimal_test]],
                                     axis=0)
            Y_train_temp = np.append(Y_train_temp, [y_predict], axis=0)
            new_indices.append(optimal_test)
#            epsilon_update = updateEpsilon(distances, test_index, choice)
        Y_predict[optimal_test] = y_predict
        test_index.remove(optimal_test)
    return Y_predict
Example #10

## Build the KNN model
knn_model = RadiusNeighborsClassifier(radius=2)
## Train the model: fitting builds the KNN model for this dataset;
## fit takes the training data as a matrix and the labels as an array
## Use the training set to build the KNN model; ravel() flattens the
## multi-dimensional label matrix to one dimension
knn_model.fit(X_train, y_train.values.ravel())

## Step 4: radius_neighbors and radius_neighbors_graph in practice
## a. radius_neighbors: finds the neighbors of one or more points within a
##    given radius; it returns the indices of and distances to the matching
##    points in the dataset
## b. radius_neighbors_graph: computes the weighted graph of the points in X
##    and their neighbors within the given radius
## c. Steps: pick one or more data points, set a radius, then inspect
##    radius_neighbors and radius_neighbors_graph
## Specify one or more query points; an arbitrary one is used here. It must be
## converted from a list to an array and reshaped with .reshape(1, -1) before use
X = [5.8, 2.8, 3.8, 6]
X = np.array(X).reshape(1, -1)
RN = knn_model.radius_neighbors(X, radius=10)
print(RN)
print(np.asarray(RN[0][0]))
# print(np.asarray(RN[1][2]))
## Compute the weighted graph of the points in X and their neighbors within
## the given radius (radius neighbors graph)
RNG = knn_model.radius_neighbors_graph(X, radius=10)
print(RNG)
print(RNG.toarray())
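
## By default radius_neighbors_graph returns a 0/1 connectivity matrix; passing
## mode='distance' fills the entries with the actual distances instead
RNG_dist = knn_model.radius_neighbors_graph(X, radius=10, mode='distance')
print(RNG_dist.toarray())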

# ## Use the X in the test data to predict y
# print(knn_model.predict(X_test))
# ## Inspect the actual y
# print(y_test.values.ravel())
# ## Probability of each predicted class for the test data X
# print(knn_model.predict_proba(X_test))