Example #1
class DCS(object):

    @abstractmethod
    def select(self, ensemble, x):
        pass

    def __init__(self, Xval, yval, K=5, weighted=False, knn=None):
        self.Xval = Xval
        self.yval = yval
        self.K = K

        if knn is None:
            self.knn = KNeighborsClassifier(n_neighbors=K, algorithm='brute')
        else:
            self.knn = knn

        self.knn.fit(Xval, yval)
        self.weighted = weighted


    def get_neighbors(self, x, return_distance=False):
        # obtain the K nearest neighbors of test sample in the validation set
        if not return_distance:
            [idx] = self.knn.kneighbors(x, 
                    return_distance=return_distance)
        else:
            [dists], [idx] = self.knn.kneighbors(x, 
                    return_distance=return_distance)
        X_nn = self.Xval[idx] # k neighbors
        y_nn = self.yval[idx] # k neighbors target

        if return_distance:
            return X_nn, y_nn, dists
        else:
            return X_nn, y_nn
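A minimal usage sketch for the DCS helper above, on hypothetical data; it assumes the numpy, abstractmethod and KNeighborsClassifier imports the snippet relies on, and that queries are passed as 2-D arrays:

import numpy as np

Xval = np.array([[0.0], [1.0], [2.0], [3.0], [4.0], [5.0]])
yval = np.array([0, 0, 0, 1, 1, 1])

dcs = DCS(Xval, yval, K=3)
# one query sample with one feature; kneighbors expects 2-D input
X_nn, y_nn = dcs.get_neighbors([[2.2]])
X_nn, y_nn, dists = dcs.get_neighbors([[2.2]], return_distance=True)
print(y_nn, dists)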
Example #2
class KNeighborsClassifierImpl():

    def __init__(self, n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None):
        self._hyperparams = {
            'n_neighbors': n_neighbors,
            'weights': weights,
            'algorithm': algorithm,
            'leaf_size': leaf_size,
            'p': p,
            'metric': metric,
            'metric_params': metric_params,
            'n_jobs': n_jobs}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        return self._sklearn_model.predict_proba(X)
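Hypothetical usage of the wrapper, assuming SKLModel aliases sklearn's estimator (e.g. from sklearn.neighbors import KNeighborsClassifier as SKLModel):

import numpy as np

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])

clf = KNeighborsClassifierImpl(n_neighbors=3).fit(X, y)
print(clf.predict([[1.4]]))        # majority label among the 3 nearest
print(clf.predict_proba([[1.4]]))  # neighbor-vote fractions per class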
Example #3
def do(train_data, train_label, test_data, test_label=None, adjust_parameters=True, k=5):
    train_data = np.array(train_data).squeeze()
    train_label = np.array(train_label).squeeze()
    test_data = np.array(test_data).squeeze()
    if test_label is not None:
        test_label = np.array(test_label).squeeze()
    if not adjust_parameters:
        knn = KNeighborsClassifier(n_neighbors=k, n_jobs=8)
        knn.fit(train_data, train_label)
        predicted_label = knn.predict(test_data)
        if test_label is not None:
            acc = accuracy_score(test_label, predicted_label)
            print('acc is', acc)
        return predicted_label
    else:
        max_acc = 0.0
        max_k = 0
        max_predicted = None
        for k in range(1, 11):
            knn = KNeighborsClassifier(n_neighbors=k, n_jobs=8)
            knn.fit(train_data, train_label)
            predicted_label = knn.predict(test_data)
            acc = accuracy_score(test_label, predicted_label)
            if acc > max_acc:
                max_acc = acc
                max_k = k
                max_predicted = predicted_label
            print('k =', k, 'acc is', acc)
        print('max acc is', max_acc, 'corresponding k is', max_k)
        return max_predicted, max_k
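A quick hypothetical call on synthetic data (it assumes the numpy, KNeighborsClassifier and accuracy_score imports the snippet uses); the test set here is just the training set, so the printed accuracies are only a smoke test:

import numpy as np

rng = np.random.RandomState(0)
Xtr = rng.rand(60, 3)
ytr = (Xtr[:, 0] > 0.5).astype(int)

pred, best_k = do(Xtr, ytr, Xtr, ytr, adjust_parameters=True)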
Example #4
class PatchedRawModel:
    def __init__(self):
        self.baseModel = RawModel()
        self.model49 = KNeighborsClassifier(n_neighbors=10)
        self.model35 = KNeighborsClassifier(n_neighbors=10)
    
    def fit(self, trainExamples):
        self.baseModel.fit(trainExamples)

        X49 = vstack([reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in trainExamples if x.Y in [4, 9]])
        Y49 = [x.Y for x in trainExamples if x.Y in [4, 9]]
        self.model49.fit(X49, Y49)

        X35 = vstack([reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in trainExamples if x.Y in [3, 5]])
        Y35 = [x.Y for x in trainExamples if x.Y in [3, 5]]
        self.model35.fit(X35, Y35)

    def predict(self, examples):
        basePredictions = self.baseModel.predict(examples)

        for (x, y, i) in zip(examples, basePredictions, range(0, len(examples))):
            if y in [4, 9]:
                specializedPrediction = self.model49.predict(reshape(x.X, (1, x.WIDTH * x.HEIGHT)))[0]
                if specializedPrediction != y:
                    basePredictions[i] = specializedPrediction
            elif y in [3, 5]:
                specializedPrediction = self.model35.predict(reshape(x.X, (1, x.WIDTH * x.HEIGHT)))[0]
                if specializedPrediction != y:
                    basePredictions[i] = specializedPrediction

        return basePredictions
Example #5
def compute_cnn(X, y):

  """Condensed nearest neighbor: the CNN removes redundant instances, maintaining the samples in the decision boundaries."""

  classifier = KNeighborsClassifier(n_neighbors=3)

  prots_s = []
  labels_s = []

  classes = np.unique(y)
  classes_ = classes

  for cur_class in classes:
    mask = y == cur_class
    insts = X[mask]
    prots_s = prots_s + [insts[np.random.randint(0, insts.shape[0])]]
    labels_s = labels_s + [cur_class]

  classifier.fit(prots_s, labels_s)
  for sample, label in zip(X, y):
    if classifier.predict([sample]) != [label]:
      prots_s = prots_s + [sample]
      labels_s = labels_s + [label]
      classifier.fit(prots_s, labels_s)

  X_ = np.asarray(prots_s)
  y_ = np.asarray(labels_s)
  reduction_ = 1.0 - float(len(y_)) / len(y)
  print(reduction_)
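A small hypothetical check of the condensation on two well-separated clusters (numpy and KNeighborsClassifier imported as above); most interior points should be discarded:

import numpy as np

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(5, 1, (50, 2))])
y = np.array([0] * 50 + [1] * 50)
compute_cnn(X, y)  # prints the achieved reduction, typically close to 1.0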
Example #6
class RawModel:
    def __init__(self):
        # 2015-05-15 GEL Found that n_components=20 gives a nice balance of 
        # speed (substantial improvement), accuracy, and reduced memory usage 
        # (25% decrease).
        self.decomposer = TruncatedSVD(n_components=20)

        # 2015-05-15 GEL algorithm='ball_tree' uses less memory on average than 
        # algorithm='kd_tree'
        
        # 2015-05-15 GEL Evaluation of metrics by accuracy (based on 8000 training examples)
        # euclidean        0.950025
        # manhattan        0.933533
        # chebyshev        0.675662
        # hamming          0.708646
        # canberra         0.934033
        # braycurtis       0.940530
        self.model = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', metric='euclidean')

    def fit(self, trainExamples):       
        X = self.decomposer.fit_transform( vstack( [reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in trainExamples] ) )
        Y = [x.Y for x in trainExamples]

        self.model.fit(X, Y)
        return self

    def predict(self, examples):
        X = self.decomposer.transform( vstack( [reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in examples] ) )
        return self.model.predict( X )
Example #7
def run_main():

    file_df = pd.read_csv('../dataset/voice.csv')
    #   print file_df
    insect_dataset(file_df)
    # fill in empty data
    drop_na(file_df)
    # check label counts, displayed by group
    #   print file_df['label'].value_counts()
    # visualize feature distributions
    fea_name1 = 'meanfun'
    fea_name2 = 'centroid'

    # feature plot of two attributes
    # visaulize_two_feature(file_df,fea_name1,fea_name2)

    # feature plot of a single attribute
    # visaulize_single_feature(file_df,fea_name1)

    # multiple features
    fea_name = ['meanfreq', 'Q25', 'Q75', 'skew', 'centroid', 'label']
    # visaulize_muilt_feature(file_df,fea_name)

    X = file_df.iloc[:, :-1].values
    file_df['label'].replace('male', 0, inplace=True)
    file_df['label'].replace('female', 1, inplace=True)
    y = file_df['label'].values

    # feature normalization
    X = preprocessing.scale(X)

    # split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=1 / 3.,
                                                        random_state=5)

    # model selection with cross-validation
    cv_scores = []
    k_range = range(1, 31)
    for k in k_range:
        knn = KNeighborsClassifier(k)
        #  print 'knn:',knn
        scores = cross_val_score(knn,
                                 X_train,
                                 y_train,
                                 cv=10,
                                 scoring='accuracy')
        score_mean = scores.mean()
        cv_scores.append(score_mean)
        print('%i:%.4f' % (k, score_mean))

    best_k = np.argmax(cv_scores) + 1

    # train the model
    knn_model = KNeighborsClassifier(best_k)
    knn_model.fit(X_train, y_train)
    print('Model test accuracy:', knn_model.score(X_test, y_test))

    return ''
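The manual selection loop above can also be expressed with GridSearchCV; a sketch over the same X_train/y_train, assuming sklearn is available:

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

search = GridSearchCV(KNeighborsClassifier(),
                      {'n_neighbors': list(range(1, 31))},
                      cv=10,
                      scoring='accuracy')
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)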
Example #8
class DCS(object):
    @abstractmethod
    def select(self, ensemble, x):
        pass

    def __init__(self, Xval, yval, K=5, weighted=False, knn=None):
        self.Xval = Xval
        self.yval = yval
        self.K = K

        if knn is None:
            self.knn = KNeighborsClassifier(n_neighbors=K, algorithm='brute')
        else:
            self.knn = knn

        self.knn.fit(Xval, yval)
        self.weighted = weighted

    def get_neighbors(self, x, return_distance=False):
        # obtain the K nearest neighbors of test sample in the validation set
        if not return_distance:
            [idx] = self.knn.kneighbors(x, return_distance=return_distance)
        else:
            rd = return_distance
            [dists], [idx] = self.knn.kneighbors(x, return_distance=rd)
        X_nn = self.Xval[idx]  # k neighbors
        y_nn = self.yval[idx]  # k neighbors target

        if return_distance:
            return X_nn, y_nn, dists
        else:
            return X_nn, y_nn
Example #9
def KNN_method(X, y):
    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    skf.get_n_splits(X, y)

    for train_index, test_index in skf.split(X, y):
        print("Train:", train_index, "Validation:", test_index)
        trainX, testX = X[train_index], X[test_index]
        trainY, testY = y[train_index], y[test_index]

        #here starts KNN
        #how many neighbours want to use in the KNC
        kvalues = [1, 3, 5, 7, 9, 11, 13, 15, 19, 24, 30, 40, 50, 60, 70, 90]
        dist = ['manhattan', 'euclidean', 'chebyshev']
        results = {}
        for element in dist:
            accuracy_results = []
            for k in kvalues:
                knn = KNeighborsClassifier(n_neighbors=k, metric=element)
                knn.fit(trainX, trainY)
                predictedY = knn.predict(testX)
                accuracy_results.append(accuracy_score(testY, predictedY))
            results[element] = accuracy_results
        print("Results of model preparation for: " + str(results))

        plt.figure()
        multiple_line_chart(plt.gca(),
                            kvalues,
                            results,
                            'KNN variants',
                            'n',
                            'accuracy',
                            percentage=True)
        plt.show()
Example #10
    def _main_loop(self):
        exit_count = 0
        knn = KNeighborsClassifier(n_neighbors = 1, algorithm='brute')
        while exit_count < len(self.groups):
            index, exit_count = 0, 0
            while index < len(self.groups):

                group = self.groups[index]
                reps_x = np.asarray([g.rep_x for g in self.groups])
                reps_y = np.asarray([g.label for g in self.groups])
                knn.fit(reps_x, reps_y)
                
                nn_idx = knn.kneighbors(group.X, n_neighbors=1, return_distance=False)
                nn_idx = nn_idx.T[0]
                mask = nn_idx == index
                
                # if all are correctly classified
                if mask.all():
                    exit_count = exit_count + 1
                
                # if all are misclassified
                elif not (group.label in reps_y[nn_idx]):
                    pca = PCA(n_components=1)
                    pca.fit(group.X)
                    # maybe use a 'for' instead of creating array
                    d = pca.transform(reps_x[index].reshape(1, -1))
                    dis = [pca.transform(inst.reshape(1, -1))[0] for inst in group.X]
                    mask_split = (dis < d).flatten()
                    
                    new_X = group.X[mask_split]
                    self.groups.append(_Group(new_X, group.label))
                    group.X = group.X[~mask_split]
                
                elif (reps_y[nn_idx] == group.label).all() and (nn_idx != index).any():
                    mask_mv = nn_idx != index
                    index_mv = np.asarray(range(len(group)))[mask_mv]
                    X_mv = group.remove_instances(index_mv)
                    G_mv = nn_idx[mask_mv]                        

                    for x, g in zip(X_mv, G_mv):
                        self.groups[g].add_instances([x])

                elif (reps_y[nn_idx] != group.label).sum()/float(len(group)) > self.r_mis:
                    mask_mv = reps_y[nn_idx] != group.label
                    new_X = group.X[mask_mv]
                    self.groups.append(_Group(new_X, group.label))
                    group.X = group.X[~mask_mv]
                else:
                    exit_count = exit_count + 1

                if len(group) == 0:
                    self.groups.remove(group)
                else:
                    index = index + 1

                for g in self.groups:
                    g.update_all()

        return self.groups                     
Example #11
def evaluate(Xtra, ytra, Xtst, ytst, k=1, positive_label=1):
    knn = KNeighborsClassifier(n_neighbors=k, algorithm='brute')
    knn.fit(Xtra, ytra)

    y_true = ytst
    y_pred = knn.predict(Xtst)

    return evaluate_results(y_true, y_pred, positive_label=positive_label)
Example #12
def knn(X, y, model_path):
    model = KNeighborsClassifier()
    model.fit(X, y)
    print(model)
    # predict
    expected = y
    predicted = model.predict(X)
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    joblib.dump(model, model_path)
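Restoring the persisted model later is the mirror image; a sketch, with model_path as passed to knn above:

loaded = joblib.load(model_path)
print(loaded.predict(X))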
Example #13
    def get_result(self):
        # file opener
        tkinter.Tk().withdraw()
        directory = filedialog.askdirectory()
        result = self.read_emails_from_directory(directory)

        train_labels = np.zeros(1430)
        train_labels[715:1430] = 1
        # This equates to 1-715 = HAM and 716-1430 = SPAM
        #                              If you change result[n] to something else,
        #                              make sure you change the same result
        #                              down in line 251 (test_matrix)
        train_matrix = self.extract_features(directory, result[0])
        #print(train_matrix)
        # print("body words:", result[0])
        # print("\n\nsubject words:", result[1])
        # print("\n\nbody phrases:", result[2])
        # print("\n\nsubject phrases:", result[3])

        print("body words:", len(result[0]))
        print("subject words:", len(result[1]))
        print("body phrases:", len(result[2]))
        print("subject phrases:", len(result[3]))

        model1 = MultinomialNB()
        model2 = LinearSVC()
        model3 = RandomForestClassifier()
        model4 = KNeighborsClassifier()
        model1.fit(train_matrix, train_labels)
        model2.fit(train_matrix, train_labels)
        model3.fit(train_matrix, train_labels)
        model4.fit(train_matrix, train_labels)

        test_dir = filedialog.askdirectory()
        #                                       Here -----v
        test_matrix = self.extract_features(test_dir, result[0])
        test_labels = np.zeros(600)
        # This equates to 1-300 = HAM and 301-600 = SPAM
        test_labels[300:600] = 1
        result1 = model1.predict(test_matrix)
        result2 = model2.predict(test_matrix)
        result3 = model3.predict(test_matrix)
        result4 = model4.predict(test_matrix)

        print(confusion_matrix(test_labels, result1))
        print(confusion_matrix(test_labels, result2))
        print(confusion_matrix(test_labels, result3))
        print(confusion_matrix(test_labels, result4))
        return result
Example #14
    def _pruning(self):

        if len(self.groups) < 2:
            return self.groups

        pruned, fst = False, True
        knn = KNeighborsClassifier(n_neighbors = 1, algorithm='brute')
        
        while pruned or fst:
            index = 0
            pruned, fst = False, False

            while index < len(self.groups):
                group = self.groups[index]

                mask = np.ones(len(self.groups), dtype=bool)
                mask[index] = False
                reps_x = np.asarray([g.rep_x for g in self.groups])[mask]
                reps_y = np.asarray([g.label for g in self.groups])[mask]
                labels = knn.fit(reps_x, reps_y).predict(group.X)

                if (labels == group.label).all():
                    self.groups.remove(group)
                    pruned = True
                else:
                    index = index + 1

                if len(self.groups) == 1:
                    index = len(self.groups)
                    pruned = False

        return self.groups
Example #15
    def index_nearest_neighbor(self, S, X, y):
        classifier = KNeighborsClassifier(n_neighbors=1)

        U = []
        S_mask = np.array(S, dtype=bool, copy=True)
        indexs = np.asarray(range(len(y)))[S_mask]
        X_tra, y_tra = X[S_mask], y[S_mask]

        for i in range(len(y)):
            real_indexes = np.asarray(range(len(y)))[S_mask]
            X_tra, y_tra = X[S_mask], y[S_mask]
            #print len(X_tra), len(y_tra)
            classifier.fit(X_tra, y_tra)
            [[index]] = classifier.kneighbors([X[i]], return_distance=False)
            U = U + [real_indexes[index]]

        return U
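Since S_mask never changes inside the loop, the classifier can be fitted once and all queries batched; an equivalent sketch under the same imports:

    def index_nearest_neighbor(self, S, X, y):
        classifier = KNeighborsClassifier(n_neighbors=1)
        S_mask = np.asarray(S, dtype=bool)
        real_indexes = np.arange(len(y))[S_mask]
        classifier.fit(X[S_mask], y[S_mask])
        # one batched query instead of refitting per sample
        idx = classifier.kneighbors(X, return_distance=False)[:, 0]
        return list(real_indexes[idx])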
Example #17
def compute_enn(X, y):
  """
  The edited nearest neighbors removes the instances in the boundaries, maintaining redundant samples.
  """

  classifier = KNeighborsClassifier(n_neighbors=3)

  classes = np.unique(y)
  classes_ = classes

  mask = np.zeros(y.size, dtype=bool)
  classifier.fit(X, y)

  for i in range(y.size):
    sample, label = X[i], y[i]
    if classifier.predict([sample]) == [label]:
      mask[i] = not mask[i]

  X_ = np.asarray(X[mask])
  y_ = np.asarray(y[mask])
  reduction_ = 1.0 - float(len(y_)) / len(y)
  print(reduction_)
Example #18
    def check_accuracy_f1(self, path):
        data_after_feature_selected = []
        for i in path:
            data_after_feature_selected.append(feature[:, i])
        data_after_feature_selected = np.array(data_after_feature_selected)
        data_after_feature_selected = data_after_feature_selected.transpose()  # transpose the matrix
        X_train2, X_test2, y_train2, y_test2 = train_test_split(
            data_after_feature_selected, label, test_size=0.3)

        if select_model == "SVM":
            model_svm = svm.SVC(kernel='poly', gamma=0.125, C=20)
            model_svm.fit(X_train2, y_train2)
            model_svm.get_params(deep=True)
            prediction2 = model_svm.predict(X_test2)
        elif select_model == "KNN":
            model_knn = KNeighborsClassifier(n_neighbors=1)
            model_knn.fit(X_train2, y_train2)
            model_knn.get_params(deep=True)
            prediction2 = model_knn.predict(X_test2)
        elif select_model == "RF":
            model_rf2 = RandomForestClassifier()
            model_rf2.fit(X_train2, y_train2)
            model_rf2.get_params(deep=True)
            prediction2 = model_rf2.predict(X_test2)
        elif select_model == "LR":
            model_lr2 = LogisticRegression()
            model_lr2.fit(X_train2, y_train2)
            prediction2 = model_lr2.predict(X_test2)
        elif select_model == "DT":
            model_dt = DecisionTreeClassifier()
            model_dt.fit(X_train2, y_train2)
            prediction2 = model_dt.predict(X_test2)

        return accuracy_score(y_test2, prediction2), f1_score(y_test2,
                                                              prediction2,
                                                              average='macro')
Example #19
        # model = DecisionTreeClassifier()
        # model.fit(feature[train_index,:][:,top], label[train_index])
        # prediction = model.predict(feature[test_index,:][:,top])
        # acc, f = get_result(label[test_index], prediction)
        # accuracy['DT'].append(acc)
        # f1['DT'].append(f)

        # model = LogisticRegression()
        # model.fit(feature[train_index,:][:,top], label[train_index])
        # prediction = model.predict(feature[test_index,:][:,top])
        # acc, f = get_result(label[test_index], prediction)
        # accuracy['LR'].append(acc)
        # f1['LR'].append(f)

        model = KNeighborsClassifier(n_neighbors=1)
        model.fit(feature[train_index, :][:, top], label[train_index])
        prediction = model.predict(feature[test_index, :][:, top])
        acc, f = get_result(label[test_index], prediction)
        accuracy['KNN'].append(acc)
        f1['KNN'].append(f)

        model = RandomForestClassifier(n_estimators=250)
        model.fit(feature[train_index, :][:, top], label[train_index])
        prediction = model.predict(feature[test_index, :][:, top])
        acc, f = get_result(label[test_index], prediction)
        accuracy['RF'].append(acc)
        f1['RF'].append(f)

        # model = MLPClassifier()
        # model.fit(feature[train_index,:][:,top], label[train_index])
        # prediction = model.predict(feature[test_index,:][:,top])
Example #20
def nearest_fit(X, y):
    clf = KNeighborsClassifier(n_neighbors=7, weights='distance')
    return clf.fit(X, y)
Example #21

    # Load data.
    path_weights = "resources/knn_weights.bin"
    # path_train = "resources/crimes_training_ones.bin"
    path_train = "resources/crimes_samples_training.bin"
    # path_tests = "resources/crimes_testing_ones.bin"
    path_tests = "resources/crimes_samples_testing.bin"

    print "Normalizing train"
    crime_train = CrimeData(path_train)
    crime_train.data[:, 22:24], mean_x_y, std_x_y = z_norm_by_feature(crime_train.data[:, 22:24])
    crime_train.data[:, 1:5], mean_time, std_time = z_norm_by_feature(crime_train.data[:, 1:5])
    crime_train.data = np.hstack((crime_train.data[:, 0:24], crime_train.data[:, 141:241]))

    print "Normalizing test"
    crime_test = CrimeData(path_tests)
    crime_test.data[:, 22:24] = z_norm_by_feature(crime_test.data[:, 22:24], mean_x_y, std_x_y)
    crime_test.data[:, 1:5] = z_norm_by_feature(crime_test.data[:, 1:5], mean_time, std_time)
    crime_test.data = np.hstack((crime_test.data[:, 0:24], crime_test.data[:, 141:241]))

    n = 0.1
    for i in range(1, 10):
        n *= 10
        clf = KNeighborsClassifier(n_neighbors=int(n))
        print("Fitting")
        clf.fit(crime_train.data, crime_train.y)
        print("Testing")
        preds = clf.predict(crime_test.data[0:10000])
        print(n, np.mean(crime_test.y[0:10000] == preds[0:10000]))
Example #22
class CNN(InstanceReductionMixin):
    """Condensed Nearest Neighbors.

    Each class is represented by a set of prototypes, with test samples
    classified to the class with the nearest prototype.
    The Condensed Nearest Neighbors removes the redundant instances,
    maintaining the samples in the decision boundaries.

    Parameters
    ----------
    n_neighbors : int, optional (default = 1)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `prototypes_` : array-like, shape = [indeterminate, n_features]
        Selected prototypes.

    `labels_` : array-like, shape = [indeterminate]
        Labels of the selected prototypes.

    `reduction_` : float, percentage of reduction.

    Examples
    --------
    >>> from protopy.selection.cnn import CNN
    >>> import numpy as np
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([1, 1, 1, 2, 2, 2])
    >>> cnn = CNN()
    >>> cnn.fit(X, y)
    CNN(n_neighbors=1)
    >>> print(cnn.predict([[-0.8, -1]]))
    [1]

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    Notes
    -----
    The Condensed Nearest Neighbor is one of the first prototype
    selection techniques in the literature.

    References
    ----------
    P. E. Hart, The condensed nearest neighbor rule, IEEE Transactions on 
    Information Theory 14 (1968) 515–516.

    """

    def __init__(self, n_neighbors=1):
        self.n_neighbors = n_neighbors
        self.classifier = None

    def reduce_data(self, X, y):
        
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)

        prots_s = []
        labels_s = []

        classes = np.unique(y)
        self.classes_ = classes

        for cur_class in classes:
            mask = y == cur_class
            insts = X[mask]
            prots_s = prots_s + [insts[np.random.randint(0, insts.shape[0])]]
            labels_s = labels_s + [cur_class]


        self.classifier.fit(prots_s, labels_s)
        for sample, label in zip(X, y):
            if self.classifier.predict([sample]) != [label]:
                prots_s = prots_s + [sample]
                labels_s = labels_s + [label]
                self.classifier.fit(prots_s, labels_s)
       
        self.X_ = np.asarray(prots_s)
        self.y_ = np.asarray(labels_s)
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)
        return self.X_, self.y_
Example #23
def knn_score(X, y, neighbors):
    knn5 = KNeighborsClassifier(n_neighbors=neighbors)
    knn5.fit(X, y)
    y_pred = knn5.predict(X)
    print("KNN{} accuracy_score: {}".format(neighbors,
                                            metrics.accuracy_score(y, y_pred)))
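Scoring on the training data, as above, is optimistic (k=1 always reaches 1.0); a held-out variant using cross-validation, assuming sklearn is available:

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

def knn_cv_score(X, y, neighbors):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=neighbors),
                             X, y, cv=5, scoring='accuracy')
    print("KNN{} cv accuracy: {:.3f}".format(neighbors, scores.mean()))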
Example #24
# Current best: KNC (k-nearest neighbors), SVC (support vector machine)
############################################################
# visualize the evaluation results
_Fig = plt.figure()
_Fig.suptitle(t="ALGORITHM COMPARISION")
_Ax = _Fig.add_subplot(111)
plt.boxplot(x=_ALGORITHM_CMP_RESULT_LIST)
_Ax.set_xticklabels(labels=_MODELS.keys())
plt.show()
################################################################################
# prediction run starting...
from sklearn import metrics
############################################################
# k-nearest neighbors prediction
_KNC_MODEL = KNC()
_KNC_MODEL.fit(X=_X_TRAIN, y=_Y_TRAIN)
_KNC_PREDICTIONS = _KNC_MODEL.predict(X=_X_VAL)
print(
    "KNC-K近邻算法预测结果:\n",
    #
    " " * 4,
    "ACCURACY_SCORE:\n",
    " " * 8,
    metrics.accuracy_score(y_true=_Y_VAL, y_pred=_KNC_PREDICTIONS),
    "\n",
    #
    " " * 4,
    "CONFUSION_MATRIX:\n",
    metrics.confusion_matrix(y_true=_Y_VAL, y_pred=_KNC_PREDICTIONS),
    "\n",
    #
Example #25
class ENN(InstanceReductionMixin):

    """Edited Nearest Neighbors.

    The Edited Nearest Neighbors removes the instances in the
    boundaries, maintaining redundant samples.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminate, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminate]
        Labels of the selected prototypes.

    `reduction_` : float, percentage of reduction.

    Examples
    --------
    >>> from protopy.selection.enn import ENN
    >>> import numpy as np
    >>> X = np.array([[-1, 0], [-0.8, 1], [-0.8, -1], [-0.5, 0] , [0.5, 0], [1, 0], [0.8, 1], [0.8, -1]])
    >>> y = np.array([1, 1, 1, 2, 1, 2, 2, 2])
    >>> editednn = ENN()
    >>> editednn.fit(X, y)
    ENN(n_neighbors=3)
    >>> print(editednn.predict([[-0.6, 0.6]]))
    [1]
    >>> print(editednn.reduction_)
    0.75

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Ruiqin Chang, Zheng Pei, and Chao Zhang. A modified editing k-nearest
    neighbor rule. JCP, 6(7):1493–1500, 2011.

    """

    def __init__(self, n_neighbors=3):
        self.n_neighbors = n_neighbors
        self.classifier = None


    def reduce_data(self, X, y):
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes

        if self.n_neighbors >= len(X):
            self.X_ = np.array(X)
            self.y_ = np.array(y)
            self.reduction_ = 0.0
            return self.X_, self.y_

        mask = np.zeros(y.size, dtype=bool)

        tmp_m = np.ones(y.size, dtype=bool)
        for i in range(y.size):
            tmp_m[i] = not tmp_m[i]
            self.classifier.fit(X[tmp_m], y[tmp_m])
            sample, label = X[i], y[i]

            if self.classifier.predict([sample]) == [label]:
                mask[i] = not mask[i]

            tmp_m[i] = not tmp_m[i]

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
Example #26
import matplotlib.pyplot as plt
import mglearn
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

cancer = load_breast_cancer()
#print(cancer.data)            # X : data
#print(cancer.target_names)    # y : label
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target,
                                                    stratify = cancer.target, random_state=66)

training_accuracy = []
test_accuracy = []

# apply n_neighbors from 1 to 10
neighbors_setting = range(1,11)

for n_neighbors in neighbors_setting:
    # create the model
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(X_train, y_train)
    # record training-set accuracy
    training_accuracy.append(clf.score(X_train, y_train))
    # record generalization (test-set) accuracy
    test_accuracy.append(clf.score(X_test, y_test))

plt.plot(neighbors_setting, training_accuracy, label='training accuracy')
plt.plot(neighbors_setting, test_accuracy, label='test accuracy')
plt.ylabel('Accuracy')
plt.xlabel('n_neighbors')
plt.legend()
plt.show()
Example #27
class SSMA(InstanceReductionMixin):
    """Steady State Memetic Algorithm

    The Steady-State Memetic Algorithm is an evolutionary prototype
    selection algorithm. It uses a memetic algorithm in order to
    perform a local search over the candidate prototype subsets.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    alpha : float (default = 0.6)
        Parameter that weights the fitness function.

    max_loop : int (default = 1000)
        Maximum number of loops performed by the algorithm.

    threshold : int (default = 0)
        Threshold that regulates the substitution condition.

    chromosomes_count : int (default = 10)
        Number of chromosomes used to find the optimal solution.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminate, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminate]
        Labels of the selected prototypes.

    `reduction_` : float, percentage of reduction.

    Examples
    --------
    >>> from protopy.selection.ssma import SSMA
    >>> import numpy as np
    >>> X = np.array([[i] for i in range(100)])
    >>> y = np.asarray(50 * [0] + 50 * [1])
    >>> ssma = SSMA()
    >>> ssma.fit(X, y)
    SSMA(alpha=0.6, chromosomes_count=10, max_loop=1000, threshold=0)
    >>> print(ssma.predict([[40],[60]]))
    [0 1]
    >>> print(ssma.reduction_)
    0.98

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Joaquín Derrac, Salvador García, and Francisco Herrera. Stratified prototype
    selection based on a steady-state memetic algorithm: a study of scalability.
    Memetic Computing, 2(3):183–199, 2010.

    """
    def __init__(self, n_neighbors=1, alpha=0.6, max_loop=1000, threshold=0, chromosomes_count=10):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.max_loop = max_loop
        self.threshold = threshold
        self.chromosomes_count = chromosomes_count

        self.evaluations = None
        self.chromosomes = None

        self.best_chromosome_ac = -1
        self.best_chromosome_rd = -1

        self.classifier = KNeighborsClassifier(n_neighbors = n_neighbors)


    def accuracy(self, chromosome, X, y):
        mask = np.asarray(chromosome, dtype=bool)
        cX, cy = X[mask], y[mask]
        #print len(cX), len(cy), sum(chromosome)

        self.classifier.fit(cX, cy)
        labels = self.classifier.predict(X)
        accuracy = (labels == y).sum()

        return float(accuracy)/len(y)


    def fitness(self, chromosome, X, y):
        #TODO add the possibility of use AUC for factor1
        ac = self.accuracy(chromosome, X, y)
        rd = 1.0 - (float(sum(chromosome))/len(chromosome))

        return self.alpha * ac + (1.0 - self.alpha) * rd


    def fitness_gain(self, gain, n):
        return self.alpha * (float(gain)/n) + (1 - self.alpha) * (1.0 / n)


    def update_threshold(self, X, y):
        best_index = np.argmax(self.evaluations)
        chromosome = self.chromosomes[best_index]

        best_ac = self.accuracy(chromosome, X, y)
        best_rd = 1.0 - float(sum(chromosome))/len(y)

        if best_ac <= self.best_chromosome_ac:
            self.threshold = self.threshold + 1
        if best_rd <= self.best_chromosome_rd:
            self.threshold = self.threshold - 1

        self.best_chromosome_ac = best_ac
        self.best_chromosome_rd = best_rd


    def index_nearest_neighbor(self, S, X, y):
        classifier = KNeighborsClassifier(n_neighbors=1)

        U = []
        S_mask = np.array(S, dtype=bool, copy=True)
        indexs = np.asarray(range(len(y)))[S_mask]
        X_tra, y_tra = X[S_mask], y[S_mask]

        for i in range(len(y)):
            real_indexes = np.asarray(range(len(y)))[S_mask]
            X_tra, y_tra = X[S_mask], y[S_mask]
            #print len(X_tra), len(y_tra)
            classifier.fit(X_tra, y_tra)
            [[index]] = classifier.kneighbors([X[i]], return_distance=False)
            U = U + [real_indexes[index]]

        return U
            

    def memetic_looper(self, S, R):
        c = 0
        for i in range(len(S)):
            if S[i] == 1 and i not in R:
                c = c + 1
                if c == 2:
                    return True

        return False

    def memetic_select_j(self, S, R):
        indexs = []
        for i in range(len(S)):
            if i not in R and S[i] == 1:
                indexs.append(i)
        # if the list is empty this will raise an error
        return np.random.choice(indexs)


    def generate_population(self, X, y):
        self.chromosomes = [[np.random.choice([0,1]) for i in range(len(y))]
                            for c in range(self.chromosomes_count)]
        self.evaluations = [self.fitness(c, X, y) for c in self.chromosomes]

        self.update_threshold(X, y)
        

    def select_parents(self, X, y):
        parents = []
        for i in range(2):
            samples = random.sample(self.chromosomes, 2)
            parents = parents + [samples[0] if self.fitness(samples[0], X, y) >
                                    self.fitness(samples[1], X, y) else samples[1]]
        return np.array(parents, copy=True)

    def crossover(self, parent_1, parent_2):
        size = len(parent_1)
        mask = [0] * (size // 2) + [1] * (size - size // 2)
        mask = np.asarray(mask, dtype=bool)
        np.random.shuffle(mask)

        off_1 = parent_1 * mask + parent_2 * ~mask
        off_2 = parent_2 * mask + parent_1 * ~mask
        
        return np.asarray([off_1, off_2])
        

    def mutation(self, offspring):
        for i in range(len(offspring)):
            if np.random.uniform(0,1) < 1.0/len(offspring):
                offspring[i] = not offspring[i]

        return offspring

    def memetic_search(self, chromosome, X, y, chromosome_fitness = None):
        S = np.array(chromosome, copy=True)
        if S.sum() == 0:
            return S, 0

        if chromosome_fitness is None:
            chromosome_fitness = self.fitness(chromosome, X, y)
        fitness_s = chromosome_fitness

        # List of visited genes in S 
        R = []
        # let U = {u0, u1, ..., un} list where ui = classifier(si,S)/i
        U = self.index_nearest_neighbor(S, X, y)
        
        while self.memetic_looper(S, R):
            j = self.memetic_select_j(S, R) 
            S[j] = 0
            gain = 0.0
            U_copy = list(U)
            mask = np.asarray(S, dtype=bool)
            X_tra, y_tra = X[mask], y[mask]
            real_idx = np.asarray(range(len(y)))[mask]

            if len(y_tra) > 0:
                for i in range(len(U)):
                    if U[i] == j:
                        self.classifier.fit(X_tra, y_tra)
                        [[idx]] = self.classifier.kneighbors(
                                [X[i]], n_neighbors=1, return_distance=False)
                        U[i] = real_idx[idx]
                        
                        if y[i] == y[U_copy[i]] and y[i] != y[U[i]]:
                            gain = gain - 1.0
                        if y[i] != y[U_copy[i]] and y[i] == y[U[i]]:
                            gain = gain + 1.0
                
            if gain >= self.threshold:
                n = S.sum()
                g = self.fitness_gain(gain, n)
                fitness_s = fitness_s + g
                R = []
            else:
                U = U_copy
                S[j] = 1
                R.append(j)

        return list(S), fitness_s

                    


    def main_loop(self, X, y):
        self.generate_population(X, y)
        n, worse_fit_index = 0, -1
        while (n < self.max_loop):
            parents = self.select_parents(X, y)
            offspring = self.crossover(parents[0], parents[1])
            offspring[0] = self.mutation(offspring[0])
            offspring[1] = self.mutation(offspring[1])

            fit_offs = [self.fitness(off, X, y) if sum(off) > 0 else -1 for off in offspring]
            
            if worse_fit_index == -1:
                worse_fit_index = np.argmin(self.evaluations)

            
            for i in range(len(offspring)):
                p_ls = 1.0 

                if fit_offs[i] == -1:
                    p_ls = -1

                if fit_offs[i] <= self.evaluations[worse_fit_index]:
                    p_ls = 0.0625

                if np.random.uniform(0,1) < p_ls:

                    offspring[i], fit_offs[i] = self.memetic_search(offspring[i], X, y, chromosome_fitness = fit_offs[i])

            for i in range(len(offspring)):
                if fit_offs[i] > self.evaluations[worse_fit_index]:
                    self.chromosomes[worse_fit_index] = offspring[i]
                    self.evaluations[worse_fit_index] = fit_offs[i]

                    worse_fit_index = np.argmin(self.evaluations)

            n = n + 1
            if n % 10 == 0:
                self.update_threshold(X, y)


    def reduce_data(self, X, y):
        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes

        self.main_loop(X, y)

        best_index = np.argmax(self.evaluations)
        mask = np.asarray(self.chromosomes[best_index], dtype=bool)
        self.X_ = X[mask]
        self.y_ = y[mask]
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)

        return self.X_, self.y_
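A small worked check of the accuracy/reduction trade-off that alpha encodes (hypothetical data; numpy and the SSMA class above): keeping one central prototype per class classifies every point correctly while discarding 80% of the data.

import numpy as np

X = np.array([[i] for i in range(10)], dtype=float)
y = np.asarray(5 * [0] + 5 * [1])

ssma = SSMA(alpha=0.6)
chromosome = [0, 0, 1, 0, 0, 0, 0, 1, 0, 0]  # keep X[2] (class 0) and X[7] (class 1)
print(ssma.accuracy(chromosome, X, y))  # 1.0
print(ssma.fitness(chromosome, X, y))   # 0.6 * 1.0 + 0.4 * 0.8 = 0.92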
Example #28
def test_view_func_NN(model_classifier, model_rpn, model_inner, C):
    test_cls = 'aeroplane'
    input_train_file = 'pickle_data/train_data_Wflip_all.pickle'

    ## read the training data from pickle file or from annotations
    test_pickle = 'pickle_data/test_data_{}.pickle'.format(test_cls)
    if os.path.exists(test_pickle):
        with open(test_pickle) as f:
            all_imgs, classes_count, _ = pickle.load(f)

    class_mapping = C.class_mapping
    inv_class_mapping = {v: k for k, v in class_mapping.items()}
    backend = K.image_dim_ordering()
    gt_cls_num = class_mapping[test_cls]
    print('work on class {}'.format(test_cls))
    base_path = os.getcwd()

    # turn off any data augmentation at test time
    C.use_horizontal_flips = False
    C.use_vertical_flips = False
    C.rot_90 = False
    count = 0
    good_img = 0
    not_good = 0

    def format_img_size(img, C):
        """ formats the image size based on config """
        img_min_side = float(C.im_size)
        (height, width, _) = img.shape

        if width <= height:
            ratio = img_min_side / width
            new_height = int(ratio * height)
            new_width = int(img_min_side)
        else:
            ratio = img_min_side / height
            new_width = int(ratio * width)
            new_height = int(img_min_side)
        img = cv2.resize(img, (new_width, new_height),
                         interpolation=cv2.INTER_CUBIC)
        return img, ratio

    def format_img_channels(img, C):
        """ formats the image channels based on config """
        img = img[:, :, (2, 1, 0)]
        img = img.astype(np.float32)
        img[:, :, 0] -= C.img_channel_mean[0]
        img[:, :, 1] -= C.img_channel_mean[1]
        img[:, :, 2] -= C.img_channel_mean[2]
        img /= C.img_scaling_factor
        img = np.transpose(img, (2, 0, 1))
        img = np.expand_dims(img, axis=0)
        return img

    def format_img(img, C):
        """ formats an image for model prediction based on config """
        img, ratio = format_img_size(img, C)
        img = format_img_channels(img, C)
        return img, ratio

    def display_image(img):
        img1 = img[:, :, (2, 1, 0)]
        # img1=img
        im = Image.fromarray(img1.astype('uint8'), 'RGB')
        im.show()

    # Method to transform the coordinates of the bounding box to its original size
    def get_real_coordinates(ratio, x1, y1, x2, y2):
        ## read the training data from pickle file or from annotations
        real_x1 = int(round(x1 // ratio))
        real_y1 = int(round(y1 // ratio))
        real_x2 = int(round(x2 // ratio))
        real_y2 = int(round(y2 // ratio))
        return (real_x1, real_y1, real_x2, real_y2)

    vnum_test = 24
    azimuth_vec = np.concatenate(
        ([0],
         np.linspace((360. / (vnum_test * 2)), 360. -
                     (360. / (vnum_test * 2)), vnum_test)),
        axis=0)

    def find_interval(azimuth, azimuth_vec):
        for i in range(len(azimuth_vec)):
            if azimuth < azimuth_vec[i]:
                break
        ind = i
        if azimuth > azimuth_vec[-1]:
            ind = 1
        return ind

    class_mapping = C.class_mapping

    if 'bg' not in class_mapping:
        class_mapping['bg'] = len(class_mapping)

    class_mapping = {v: k for k, v in class_mapping.items()}
    # print(class_mapping)
    class_to_color = {
        class_mapping[v]: np.random.randint(0, 255, 3)
        for v in class_mapping
    }
    C.num_rois = 32

    obj_num = 0
    bbox_threshold_orig = 0.6
    th_bbox = 0.4

    ## get GT for all az for single cls
    feature_az = []
    sorted_path = input_train_file
    tmp_ind = sorted_path.index('.pickle')
    sorted_path = sorted_path[:tmp_ind] + "_sorted_Angles" + sorted_path[
        tmp_ind:]
    if os.path.exists(sorted_path):
        print("loading sorted data")
        with open(sorted_path) as f:
            trip_data = pickle.load(f)
    im_file = []
    ind = []
    for ii in range(360):
        for jj in range(3):
            try:
                im_file.append(trip_data[test_cls][ii][jj])
                ind.append(ii)
            except:
                if jj == 0:
                    print('no azimuth {}'.format(ii))
    data_gen_train = data_generators.get_anchor_gt(im_file, [],
                                                   C,
                                                   K.image_dim_ordering(),
                                                   mode='test')
    azimuth_dict = []
    inner_NN = []
    azimuths = []
    for tt in range(len(ind)):
        try:
            if tt % 100 == 0:
                print('worked on {}/{}'.format(tt, len(ind)))
            # print ('im num {}'.format(good_img))
            X, Y, img_data = next(data_gen_train)

            P_rpn = model_rpn.predict_on_batch(X)

            R = roi_helpers.rpn_to_roi(P_rpn[0],
                                       P_rpn[1],
                                       C,
                                       K.image_dim_ordering(),
                                       use_regr=True,
                                       overlap_thresh=0.7,
                                       max_boxes=300)

            X2, Y1, Y2, Y_view = roi_helpers.calc_iou_new(
                R, img_data, C, C.class_mapping)

            pos_samples = np.where(Y1[0, :, -1] == 0)
            sel_samples = pos_samples[0].tolist()
            R = X2[0, sel_samples, :]
            for jk in range(R.shape[0] // C.num_rois + 1):
                ROIs = np.expand_dims(R[C.num_rois * jk:C.num_rois *
                                        (jk + 1), :],
                                      axis=0)
                if ROIs.shape[1] == 0:
                    break

                if jk == R.shape[0] // C.num_rois:
                    # pad R
                    curr_shape = ROIs.shape
                    target_shape = (curr_shape[0], C.num_rois, curr_shape[2])
                    ROIs_padded = np.zeros(target_shape).astype(ROIs.dtype)
                    ROIs_padded[:, :curr_shape[1], :] = ROIs
                    ROIs_padded[0, curr_shape[1]:, :] = ROIs[0, 0, :]
                    ROIs = ROIs_padded

                [P_cls, P_regr, P_view] = model_classifier.predict([X, ROIs])
                iner_f = model_inner.predict([X, ROIs])
                # oo = model_classifier_only.predict([F, ROIs])

                for ii in range(len(sel_samples)):

                    if np.max(P_cls[0,
                                    ii, :]) < bbox_threshold_orig or np.argmax(
                                        P_cls[0,
                                              ii, :]) == (P_cls.shape[2] - 1):
                        continue

                    ## get class from the net
                    # cls_num = np.argmax(P_cls[0, ii, :])

                    ## use gt class
                    cls_num = gt_cls_num

                    cls_name = inv_class_mapping[cls_num]
                    cls_view = P_view[0, ii, 360 * cls_num:360 * (cls_num + 1)]

                    # azimuths[cls_name].append(np.argmax(cls_view, axis=0))
                    inner_NN.append(iner_f[0, ii, :])
                    azimuth_dict.append(img_data['bboxes'][0]['azimuth'])
        except:
            print('failed on az {}'.format(img_data['bboxes'][0]['azimuth']))
    ## calculating some mean feature map for every az
    with open('pickle_data/{}_NN.pickle'.format(C.weight_name), 'w') as f:
        pickle.dump([inner_NN, azimuth_dict], f)
        print('saved PICKLE')

    with open('pickle_data/{}_NN.pickle'.format(C.weight_name)) as f:
        inner_NN, azimuth_dict = pickle.load(f)
    neigh = KNeighborsClassifier(n_neighbors=1)
    neigh.fit(inner_NN, azimuth_dict)

    jj = 0
    for im_file in all_imgs:
        jj += 1
        if jj % 50 == 0:
            print(jj)
        filepath = im_file['filepath']
        img = cv2.imread(filepath)
        img_gt = np.copy(img)
        if img is None:
            not_good += 1
            continue
        else:
            good_img += 1
            # print ('im num {}'.format(good_img))
        X, ratio = format_img(img, C)

        if backend == 'tf':
            X = np.transpose(X, (0, 2, 3, 1))

        # get the feature maps and output from the RPN
        Y1, Y2 = model_rpn.predict(X)
        R = roi_helpers.rpn_to_roi(Y1,
                                   Y2,
                                   C,
                                   K.image_dim_ordering(),
                                   overlap_thresh=0.7)
        # # convert from (x1,y1,x2,y2) to (x,y,w,h)
        R[:, 2] -= R[:, 0]
        R[:, 3] -= R[:, 1]

        width, height = int(im_file["width"]), int(im_file["height"])
        resized_width, resized_height = data_generators.get_new_img_size(
            width, height, C.im_size)
        # [_,_, F] = model_rpn.predict(X)
        ROIs = []
        ## pass on all the labels in the image, some of them are not equal to test_cls
        for bbox_gt in im_file['bboxes']:
            no_bbox_flag = 1
            bbox_threshold = bbox_threshold_orig
            if not bbox_gt['class'] == test_cls:
                continue
            if bbox_gt[
                    'class'] == test_cls and bbox_threshold == bbox_threshold_orig:
                obj_num += 1
            while no_bbox_flag and bbox_threshold > th_bbox:
                cls_gt = bbox_gt['class']
                az_gt = bbox_gt['azimuth']
                el_gt = bbox_gt['elevation']
                t_gt = bbox_gt['tilt']
                if len(ROIs) == 0:
                    # apply the spatial pyramid pooling to the proposed regions
                    bboxes = {}
                    probs = {}
                    azimuths = {}
                    inner_res = {}
                    # print ('obj num {}'.format(obj_num))

                    for jk in range(R.shape[0] // C.num_rois + 1):
                        ROIs = np.expand_dims(R[C.num_rois * jk:C.num_rois *
                                                (jk + 1), :],
                                              axis=0)
                        if ROIs.shape[1] == 0:
                            break

                        if jk == R.shape[0] // C.num_rois:
                            #pad R
                            curr_shape = ROIs.shape
                            target_shape = (curr_shape[0], C.num_rois,
                                            curr_shape[2])
                            ROIs_padded = np.zeros(target_shape).astype(
                                ROIs.dtype)
                            ROIs_padded[:, :curr_shape[1], :] = ROIs
                            ROIs_padded[0, curr_shape[1]:, :] = ROIs[0, 0, :]
                            ROIs = ROIs_padded

                        [P_cls, P_regr,
                         P_view] = model_classifier.predict([X, ROIs])
                        inner_out = model_inner.predict([X, ROIs])
                        # oo = model_classifier_only.predict([F, ROIs])

                        for ii in range(P_cls.shape[1]):

                            if np.max(P_cls[
                                    0, ii, :]) < bbox_threshold or np.argmax(
                                        P_cls[0,
                                              ii, :]) == (P_cls.shape[2] - 1):
                                continue

                            ## get class from the net
                            # cls_num = np.argmax(P_cls[0, ii, :])

                            ## use gt class
                            cls_num = gt_cls_num

                            cls_name = inv_class_mapping[cls_num]
                            cls_view = P_view[0, ii, 360 * cls_num:360 *
                                              (cls_num + 1)]

                            if cls_name not in bboxes:
                                bboxes[cls_name] = []
                                probs[cls_name] = []
                                azimuths[cls_name] = []
                                inner_res[cls_name] = []

                            (x, y, w, h) = ROIs[0, ii, :]

                            try:
                                (tx, ty, tw,
                                 th) = P_regr[0, ii,
                                              4 * cls_num:4 * (cls_num + 1)]
                                tx /= C.classifier_regr_std[0]
                                ty /= C.classifier_regr_std[1]
                                tw /= C.classifier_regr_std[2]
                                th /= C.classifier_regr_std[3]
                                x, y, w, h = roi_helpers.apply_regr(
                                    x, y, w, h, tx, ty, tw, th)
                            except:
                                pass
                            bboxes[cls_name].append([
                                C.rpn_stride * x, C.rpn_stride * y,
                                C.rpn_stride * (x + w), C.rpn_stride * (y + h)
                            ])
                            probs[cls_name].append(np.max(P_cls[0, ii, :]))
                            azimuths[cls_name].append(
                                np.argmax(cls_view, axis=0))
                            inner_res[cls_name].append(inner_out[0, ii, :])

                # cv2.rectangle(img_gt, (bbox_gt['x1'], bbox_gt['y1']), (bbox_gt['x2'], bbox_gt['y2']), (int(class_to_color[test_cls][0]), int(class_to_color[test_cls][1]), int(class_to_color[test_cls][2])), 2)
                for key in bboxes:
                    # if 1:
                    if key == test_cls and bbox_gt['class'] == test_cls:
                        bbox = np.array(bboxes[key])
                        prob = np.array(probs[key])
                        azimuth = np.array(azimuths[key])
                        inner_result = np.array(inner_res[key])
                        # img = draw_bbox(img,bbox, prob, azimuth, ratio)
                        azimuth = neigh.predict(inner_result)
                        ## get the azimuth from bbox that have more than 'overlap_thresh' overlap with gt_bbox
                        az = []
                        overlap_thresh = 0.5
                        try:
                            while np.size(az) == 0 and overlap_thresh > 0:
                                _, prob_bbox, az = roi_helpers.overlap_with_gt(
                                    bbox,
                                    prob,
                                    azimuth,
                                    bbox_gt,
                                    ratio=ratio,
                                    overlap_thresh=overlap_thresh,
                                    max_boxes=300,
                                    use_az=True)
                                overlap_thresh -= 0.1
                            if overlap_thresh == 0:
                                print("No good Bbox was found")
                            counts = np.bincount(az)
                        except:
                            az = []
                            counts = []
                        try:
                            az_fin = np.argmax(counts)
                            true_bin = find_interval(az_gt, azimuth_vec)
                            prob_bin = find_interval(az_fin, azimuth_vec)
                            no_bbox_flag = 0
                            if true_bin == prob_bin:
                                count += 1
                                break
                        except:
                            # print('here')
                            no_bbox_flag = 1
                            bbox_threshold -= 0.1

                    ## azimuth calculations

                    ## display

                bbox_threshold -= 0.1

    succ = float(count) / float(obj_num) * 100.
    print(
        'for class {} -true count is {} out of {} from {} images . {} success'.
        format(test_cls, count, obj_num, good_img, succ))
    return succ
Example #29
#
#
# 				# azimuths[cls_name].append(np.argmax(cls_view, axis=0))
# 				inner_NN.append(iner_f[0,ii,:])
# 				azimuth_dict.append(img_data['bboxes'][0]['azimuth'])
# 	except:
# 		print('failed on az {}'.format(img_data['bboxes'][0]['azimuth']))
# ## calculating some mean feature map for every az
# with open('NN.pickle','w') as f:
# 	pickle.dump([inner_NN,azimuth_dict],f)
# 	print('saved PICKLE')

with open('NN.pickle') as f:
    inner_NN, azimuth_dict = pickle.load(f)
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(inner_NN, azimuth_dict)
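# Illustrative aside (an assumption, not part of the original script): a
# minimal sketch of the 1-NN azimuth lookup built above, with synthetic
# feature vectors standing in for the real inner-layer outputs. It is
# wrapped in a function so it does not disturb the script flow.
def _azimuth_lookup_sketch():
    rng = np.random.RandomState(0)
    fake_features = rng.rand(100, 512)        # stand-in for inner_NN
    fake_azimuths = rng.randint(0, 360, 100)  # stand-in for azimuth_dict
    lookup = KNeighborsClassifier(n_neighbors=1)
    lookup.fit(fake_features, fake_azimuths)
    query = rng.rand(3, 512)                  # three "detection" features
    return lookup.predict(query)              # azimuths of nearest stored boxes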

jj = 0
for im_file in all_imgs:
    jj += 1
    if jj % 50 == 0:
        print(jj)
    filepath = im_file['filepath']
    img = cv2.imread(filepath)
    img_gt = np.copy(img)
    if img is None:
        not_good += 1
        continue
    else:
        good_img += 1
        # print ('im num {}'.format(good_img))
Example #30
0
            out = Trainer.model.semantics(data, 1, 1, 1.0)

            n = data.shape[0]
            Xtrain[counter:counter +
                   n] = out[0].detach().cpu().numpy().reshape(n, -1)
            Ytrain[counter:counter + n] = labels.numpy()
            counter += n

        counter = 0
        for i, (data, labels) in enumerate(testloader):
            # Feed forward data
            data = data.reshape(-1, *Trainer.input_shape).to(torch.float32).to(
                Trainer.device)
            out = Trainer.model.semantics(data, 1, 1, 1.0)

            n = data.shape[0]
            Xtest[counter:counter + n] = out[0].detach().cpu().numpy().reshape(
                n, -1)
            Ytest[counter:counter + n] = labels.numpy()
            counter += n

    else:
        raise ValueError('Wrong dataset')

    #%%
    from sklearn.neighbors import KNeighborsClassifier
    classifier = KNeighborsClassifier()
    classifier.fit(Xtrain, Ytrain)
    Ypred = classifier.predict(Xtest)
    print(np.mean(Ypred == Ytest))
Example #31
0
class KNNClassifier():
    '''
    k-NN classifier that trains and tests on CSV files whose last
    column holds the class label.
    '''
    def __init__(self, csv_path_train, csv_path_test, k):
        '''
        Constructor
        '''
        self.csv_path_train = csv_path_train
        self.csv_path_test = csv_path_test
        self.classifier = KNeighborsClassifier(n_neighbors=k,
                                               p=2,
                                               metric='minkowski')

    def create_arrays(self):
        arr_train = np.genfromtxt(self.csv_path_train,
                                  delimiter=',',
                                  skip_header=1)
        self.X_train = np.delete(arr_train, [arr_train.shape[1] - 1], axis=1)
        self.y_train = np.delete(arr_train,
                                 list(range(arr_train.shape[1] - 1)),
                                 axis=1)

        arr_test = np.genfromtxt(self.csv_path_test,
                                 delimiter=',',
                                 skip_header=1)
        self.X_test = np.delete(arr_test, [arr_test.shape[1] - 1], axis=1)
        self.y_test = np.delete(arr_test,
                                list(range(arr_test.shape[1] - 1)),
                                axis=1)

    def preprocess(self):
        #         X_train, X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.3, random_state=0)
        sc = StandardScaler()
        sc.fit(self.X_train)
        self.X_train_std = sc.transform(self.X_train)
        self.X_test_std = sc.transform(self.X_test)

    def train(self):
        self.create_arrays()
        self.preprocess()
        self.classifier.fit(self.X_train_std, self.y_train.ravel())

    def test(self, f, patient_num, total_fpr, total_tpr):
        y_pred = self.classifier.predict(self.X_test_std)
        accuracy = accuracy_score(self.y_test, y_pred)
        precision = precision_score(self.y_test, y_pred)
        recall = recall_score(self.y_test, y_pred)
        print("Accuracy: %.2f" % accuracy)
        print("Precision: %.2f" % precision)
        print("Recall: %.2f" % recall)
        line = str(accuracy) + "," + str(precision) + "," + str(recall)
        f.write(line)
        f.write("\n")
        confmat = confusion_matrix(self.y_test, y_pred)
        fig, ax = plt.subplots(figsize=(2.5, 2.5))
        ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
        for i in range(confmat.shape[0]):
            for j in range(confmat.shape[1]):
                ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
        plt.xlabel('predicted label')
        plt.ylabel('true label')
        plt.savefig("D:\\Documents\\KNN\\FFT\\chb" + patient_num +
                    "_confmat.png")
        plt.close()
        fpr, tpr, thresholds = roc_curve(self.y_test, y_pred)
        print("fpr", fpr)
        print("tpr", tpr)
        total_fpr[1] += fpr[len(fpr) - 2]
        total_tpr[1] += tpr[len(tpr) - 2]
        print(total_fpr)
        print(total_tpr)
        roc_auc = auc(fpr, tpr)
        plt.title('ROC Curve')
        plt.plot(fpr, tpr, 'b', label='AUC = %.2F' % roc_auc)
        plt.legend(loc='lower right')
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([-0.1, 1.2])
        plt.ylim([-0.1, 1.2])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.savefig("D:\\Documents\\KNN\\FFT\\chb" + patient_num + "roc.png")
        plt.close()
        return total_fpr, total_tpr
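# Hedged usage sketch for the KNNClassifier above, run on synthetic CSVs
# written to disk first; the file names and data are placeholder
# assumptions, not files from the original project. Each CSV holds a
# header row, feature columns, and a final label column, matching what
# create_arrays() expects.
def _knn_csv_sketch():
    rng = np.random.RandomState(0)
    for name, n in [('toy_train.csv', 80), ('toy_test.csv', 20)]:
        feats = rng.rand(n, 4)
        labels = (feats[:, 0] > 0.5).astype(float).reshape(-1, 1)
        np.savetxt(name, np.hstack([feats, labels]), delimiter=',',
                   header='f1,f2,f3,f4,label', comments='')
    knn = KNNClassifier('toy_train.csv', 'toy_test.csv', k=3)
    knn.train()  # test() also writes plots to hard-coded Windows paths,
                 # so only train() is exercised here
    return knn.classifier.score(knn.X_test_std, knn.y_test.ravel())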
Example #32
0
class InstanceReductionMixin(InstanceReductionBase, ClassifierMixin):
    """Mixin class for all instance reduction techniques"""
    def set_classifier(self, classifier):
        """Sets the classifier to be used in the instance reduction process
            and classification.

        Parameters
        ----------
        classifier : classifier, following the KNeighborsClassifier style
            (default = KNN)
        """

        self.classifier = classifier

    def reduce_data(self, X, y):
        """Perform the instance reduction procedure on the given training data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training set.

        y : array-like, shape = [n_samples]
            Labels for X.

        Returns
        -------
        X_ : array-like, shape = [indeterminated, n_features]
            Resulting training set.

        y_ : array-like, shape = [indeterminated]
            Labels for X_
        """
        pass

    def fit(self, X, y, reduce_data=True):
        """
        Fit the InstanceReduction model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
            Note that centroid shrinking cannot be used with sparse matrices.
        y : array, shape = [n_samples]
            Target values (integers)
        reduce_data : bool, flag indicating if the reduction should be performed
        """
        self.X = X
        self.y = y

        if reduce_data:
            self.reduce_data(X, y)

        return self

    def predict(self, X, n_neighbors=1):
        """Perform classification on an array of test vectors X.

        The predicted class C for each sample in X is returned.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array, shape = [n_samples]

        Notes
        -----
        The default prediction uses KNeighborsClassifier; if the
        instance reduction algorithm is to be performed with another
        classifier, this method should be explicitly overridden and
        documented.
        """
        X = check_array(X)
        if not hasattr(self, "X_") or self.X_ is None:
            raise AttributeError("Model has not been trained yet.")

        if not hasattr(self, "y_") or self.y_ is None:
            raise AttributeError("Model has not been trained yet.")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test data X.
        after a given prototype selection algorithm.
    
        Parameters
        ----------
        X : array, shape = (n_samples, n_features)
            A 2-D array representing the test points.
        
        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
        of such arrays if n_outputs > 1.
        The class probabilities of the input samples. Classes are ordered
        by lexicographic order.
        """
        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict_proba(X)
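# Hedged sketch (an assumption, not part of the library) of how a concrete
# reducer plugs into the mixin above: subclass it, implement reduce_data
# to fill X_ and y_, and the inherited fit/predict/predict_proba work.
class RandomReducer(InstanceReductionMixin):

    def __init__(self, keep_ratio=0.5):
        self.classifier = None
        self.keep_ratio = keep_ratio

    def reduce_data(self, X, y):
        # keep a random subset of the training set as prototypes
        n_keep = max(1, int(self.keep_ratio * len(y)))
        idx = np.random.choice(len(y), n_keep, replace=False)
        self.X_ = np.asarray(X)[idx]
        self.y_ = np.asarray(y)[idx]
        return self.X_, self.y_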
Example #33
0
File: baseNew.py Project: dvro/ml
class InstanceReductionMixin(InstanceReductionBase, ClassifierMixin):

    """Mixin class for all instance reduction techniques"""


    def set_classifier(self, classifier):
        """Sets the classifier to be used in the instance reduction process
            and classification.

        Parameters
        ----------
        classifier : classifier, following the KNeighborsClassifier style
            (default = KNN)
        """

        self.classifier = classifier


    def reduce_data(self, X, y):
        """Perform the instance reduction procedure on the given training data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training set.

        y : array-like, shape = [n_samples]
            Labels for X.

        Returns
        -------
        X_ : array-like, shape = [indeterminated, n_features]
            Resulting training set.

        y_ : array-like, shape = [indeterminated]
            Labels for X_
        """
        pass
    
    def get_prototypes(self):
        return self.X_, self.y_

    def fit(self, X, y, reduce_data=True):
        """
        Fit the InstanceReduction model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
            Note that centroid shrinking cannot be used with sparse matrices.
        y : array, shape = [n_samples]
            Target values (integers)
        reduce_data : bool, flag indicating if the reduction should be performed
        """
        self.X = X
        self.y = y
        self.labels = set(y)
        self.prototypes = None
        self.prototypes_labels = None
        self.reduction_ratio = 0.0

        if reduce_data:
            self.reduce_data(X, y)

        return self

    def predict(self, X, n_neighbors=1):
        """Perform classification on an array of test vectors X.

        The predicted class C for each sample in X is returned.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array, shape = [n_samples]

        Notes
        -----
        The default prediction uses KNeighborsClassifier; if the
        instance reduction algorithm is to be performed with another
        classifier, this method should be explicitly overridden and
        documented.
        """
        X = check_array(X, accept_sparse="csr")
        if not hasattr(self, "X_") or self.X_ is None:
            raise AttributeError("Model has not been trained yet.")

        if not hasattr(self, "y_") or self.y_ is None:
            raise AttributeError("Model has not been trained yet.")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict(X)


    def predict_proba(self, X):
        """Return probability estimates for the test data X.
        after a given prototype selection algorithm.
    
        Parameters
        ----------
        X : array, shape = (n_samples, n_features)
            A 2-D array representing the test points.
        
        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
        of such arrays if n_outputs > 1.
        The class probabilities of the input samples. Classes are ordered
        by lexicographic order.
        """
        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict_proba(X)
Example #34
0
def demo():
    """ _test_knn_adwin

    This demo tests the KNNAdwin classifier on a file stream, which gives 
    instances coming from a SEA generator. 
    
    The test computes the performance of the KNNAdwin classifier as well as 
    the time to create the structure and classify max_samples (10000 by 
    default) instances.
    
    """
    start = timer()
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    # warnings.filterwarnings("ignore", ".*Passing 1d.*")
    stream = FileStream('../data/datasets/sea_big.csv', -1, 1)
    # stream = RandomRBFGeneratorDrift(change_speed=41.00, n_centroids=50, model_random_state=32523423,
    #                                  sample_seed=5435, n_classes=2, num_att=10, num_drift_centroids=50)
    stream.prepare_for_use()
    t = OneHotToCategorical([[10, 11, 12, 13],
                             [
                                 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                                 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
                                 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
                                 47, 48, 49, 50, 51, 52, 53
                             ]])
    t2 = OneHotToCategorical([[10, 11, 12, 13],
                              [
                                  14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                                  25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
                                  36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
                                  47, 48, 49, 50, 51, 52, 53
                              ]])

    # knn = KNN(n_neighbors=8, max_window_size=2000, leaf_size=40)
    knn = KNNAdwin(n_neighbors=8, leaf_size=40, max_window_size=2000)
    # pipe = Pipeline([('one_hot_to_categorical', t), ('KNN', knn)])

    compare = KNeighborsClassifier(n_neighbors=8,
                                   algorithm='kd_tree',
                                   leaf_size=40,
                                   metric='euclidean')
    # pipe2 = Pipeline([('one_hot_to_categorical', t2), ('KNN', compare)])
    first = True
    train = 200
    if train > 0:
        X, y = stream.next_sample(train)
        # pipe.partial_fit(X, y, classes=stream.target_values)
        # pipe.partial_fit(X, y, classes=stream.target_values)
        # pipe2.fit(X, y)

        knn.partial_fit(X, y, classes=stream.target_values)
        compare.fit(X, y)
        first = False
    n_samples = 0
    max_samples = 10000
    my_corrects = 0
    compare_corrects = 0

    while n_samples < max_samples:
        if n_samples % (max_samples // 20) == 0:
            logging.info('%s%%', str(n_samples // (max_samples // 20) * 5))
        X, y = stream.next_sample()
        # my_pred = pipe.predict(X)
        my_pred = knn.predict(X)
        # my_pred = [1]
        if first:
            # pipe.partial_fit(X, y, classes=stream.target_values)
            # pipe.partial_fit(X, y, classes=stream.target_values)
            knn.partial_fit(X, y, classes=stream.target_values)
            first = False
        else:
            # pipe.partial_fit(X, y)
            knn.partial_fit(X, y)
        # compare_pred = pipe2.predict(X)
        compare_pred = compare.predict(X)
        if y[0] == my_pred[0]:
            my_corrects += 1
        if y[0] == compare_pred[0]:
            compare_corrects += 1
        n_samples += 1

    end = timer()

    print('Evaluation time: ' + str(end - start))
    print(str(n_samples) + ' samples analyzed.')
    print('My performance: ' + str(my_corrects / n_samples))
    print('Compare performance: ' + str(compare_corrects / n_samples))
Example #35
0
class TomekLinks(InstanceReductionMixin):

    """Tomek Links.

    The Tomek Links algorithm removes pairs of instances that
    form a Tomek Link. This technique removes instances in
    the decision region.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default in the classification (only).
        The Tomek Links uses only n_neighbors=1 in the reduction.

    keep_class : int, optional (default = None)
        Label of the class to not be removed in the tomek links. If None,
        it removes all nodes of the links.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------

    >>> from protopy.selection.tomek_links import TomekLinks
    >>> import numpy as np
    >>> X = np.array([[0],[1],[2.1],[2.9],[4],[5],[6],[7.1],[7.9],[9]])
    >>> y = np.array([1,1,2,1,2,2,2,1,2,2])
    >>> tl = TomekLinks()
    >>> tl.fit(X, y)
    TomekLinks(keep_class=None)
    >>> print(tl.predict([[2.5],[7.5]]))
    [1, 2]
    >>> print(tl.reduction_)
    0.4

    See also
    --------
    protopy.selection.enn.ENN: edited nearest neighbor

    References
    ----------
    I. Tomek, “Two modifications of cnn,” IEEE Transactions on Systems,
    Man and Cybernetics, vol. SMC-6, pp. 769–772, 1976.

    """

    def __init__(self, n_neighbors=3, keep_class=None):
        self.n_neighbors = n_neighbors
        self.classifier = None
        self.keep_class = keep_class


    def reduce_data(self, X, y):
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors, algorithm='brute')
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes
        self.classifier.fit(X, y)
        nn_idx = self.classifier.kneighbors(X, n_neighbors=2, return_distance=False)
        nn_idx = nn_idx.T[1]

        mask = [
            nn_idx[nn_idx[index]] == index and y[index] != y[nn_idx[index]]
            for index in range(nn_idx.shape[0])
        ]
        mask = ~np.asarray(mask) 
        if self.keep_class is not None and self.keep_class in self.classes_:
            mask[y == self.keep_class] = True

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)

        return self.X_, self.y_
Example #36
0
class SSMA(InstanceReductionMixin):
    """Steady State Memetic Algorithm

    The Steady-State Memetic Algorithm is an evolutionary prototype
    selection algorithm. It uses a memetic algorithm in order to 
    perform a local search in the code.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    alpha   : float (default = 0.6)
        Parameter that ponderates the fitness function.

    max_loop    : int (default = 1000)
        Number of maximum loops performed by the algorithm.

    threshold   : int (default = 0)
        Threshold that regulates the substitution condition;

    chromosomes_count: int (default = 10)
        number of chromosomes used to find the optimal solution.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.ssma import SSMA
    >>> import numpy as np
    >>> X = np.array([[i] for i in range(100)])
    >>> y = np.asarray(50 * [0] + 50 * [1])
    >>> ssma = SSMA()
    >>> ssma.fit(X, y)
    SSMA(alpha=0.6, chromosomes_count=10, max_loop=1000, threshold=0)
    >>> print(ssma.predict([[40],[60]]))
    [0 1]
    >>> print(ssma.reduction_)
    0.98

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Joaquín Derrac, Salvador García, and Francisco Herrera. Stratified prototype
    selection based on a steady-state memetic algorithm: a study of scalability.
    Memetic Computing, 2(3):183–199, 2010.

    """
    def __init__(self,
                 n_neighbors=1,
                 alpha=0.6,
                 max_loop=1000,
                 threshold=0,
                 chromosomes_count=10):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.max_loop = max_loop
        self.threshold = threshold
        self.chromosomes_count = chromosomes_count

        self.evaluations = None
        self.chromosomes = None

        self.best_chromosome_ac = -1
        self.best_chromosome_rd = -1

        self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

    def accuracy(self, chromosome, X, y):
        mask = np.asarray(chromosome, dtype=bool)
        cX, cy = X[mask], y[mask]
        #print len(cX), len(cy), sum(chromosome)

        self.classifier.fit(cX, cy)
        labels = self.classifier.predict(X)
        accuracy = (labels == y).sum()

        return float(accuracy) / len(y)

    def fitness(self, chromosome, X, y):
        #TODO add the possibility of use AUC for factor1
        ac = self.accuracy(chromosome, X, y)
        rd = 1.0 - (float(sum(chromosome)) / len(chromosome))

        return self.alpha * ac + (1.0 - self.alpha) * rd

    def fitness_gain(self, gain, n):
        return self.alpha * (float(gain) / n) + (1 - self.alpha) * (1.0 / n)

    def update_threshold(self, X, y):
        best_index = np.argmax(self.evaluations)
        chromosome = self.chromosomes[best_index]

        best_ac = self.accuracy(chromosome, X, y)
        best_rd = 1.0 - float(sum(chromosome)) / len(y)

        if best_ac <= self.best_chromosome_ac:
            self.threshold = self.threshold + 1
        if best_rd <= self.best_chromosome_rd:
            self.threshold = self.threshold - 1

        self.best_chromosome_ac = best_ac
        self.best_chromosome_rd = best_rd

    def index_nearest_neighbor(self, S, X, y):
        classifier = KNeighborsClassifier(n_neighbors=1)

        U = []
        S_mask = np.array(S, dtype=bool, copy=True)
        indexs = np.asarray(range(len(y)))[S_mask]
        X_tra, y_tra = X[S_mask], y[S_mask]

        for i in range(len(y)):
            real_indexes = np.asarray(range(len(y)))[S_mask]
            X_tra, y_tra = X[S_mask], y[S_mask]
            #print len(X_tra), len(y_tra)
            classifier.fit(X_tra, y_tra)
            # reshape to a single-sample 2-D query, as kneighbors expects
            [[index]] = classifier.kneighbors(X[i].reshape(1, -1),
                                              return_distance=False)
            U = U + [real_indexes[index]]

        return U

    def memetic_looper(self, S, R):
        c = 0
        for i in range(len(S)):
            if S[i] == 1 and i not in R:
                c = c + 1
                if c == 2:
                    return True

        return False

    def memetic_select_j(self, S, R):
        indexs = []
        for i in range(len(S)):
            if i not in R and S[i] == 1:
                indexs.append(i)
        # if the list is empty this will raise an error
        return np.random.choice(indexs)

    def generate_population(self, X, y):
        self.chromosomes = [[np.random.choice([0, 1]) for i in range(len(y))]
                            for c in range(self.chromosomes_count)]
        self.evaluations = [self.fitness(c, X, y) for c in self.chromosomes]

        self.update_threshold(X, y)

    def select_parents(self, X, y):
        parents = []
        for i in range(2):
            samples = random.sample(self.chromosomes, 2)
            parents = parents + [
                samples[0] if self.fitness(samples[0], X, y) > self.fitness(
                    samples[1], X, y) else samples[1]
            ]
        return np.array(parents, copy=True)

    def crossover(self, parent_1, parent_2):
        size = len(parent_1)
        mask = [0] * (size // 2) + [1] * (size - size // 2)
        mask = np.asarray(mask, dtype=bool)
        np.random.shuffle(mask)

        off_1 = parent_1 * mask + parent_2 * ~mask
        off_2 = parent_2 * mask + parent_1 * ~mask

        return np.asarray([off_1, off_2])

    def mutation(self, offspring):
        for i in range(len(offspring)):
            if np.random.uniform(0, 1) < 1.0 / len(offspring):
                offspring[i] = not offspring[i]

        return offspring

    def memetic_search(self, chromosome, X, y, chromosome_fitness=None):
        S = np.array(chromosome, copy=True)
        if S.sum() == 0:
            return S, 0

        if chromosome_fitness is None:
            chromosome_fitness = self.fitness(chromosome, X, y)
        fitness_s = chromosome_fitness

        # List of visited genes in S
        R = []
        # let U = {u0, u1, ..., un} list where ui = classifier(si,S)/i
        U = self.index_nearest_neighbor(S, X, y)

        while self.memetic_looper(S, R):
            j = self.memetic_select_j(S, R)
            S[j] = 0
            gain = 0.0
            U_copy = list(U)
            mask = np.asarray(S, dtype=bool)
            X_tra, y_tra = X[mask], y[mask]
            real_idx = np.asarray(range(len(y)))[mask]

            if len(y_tra) > 0:
                for i in range(len(U)):
                    if U[i] == j:
                        self.classifier.fit(X_tra, y_tra)
                        [[idx]] = self.classifier.kneighbors(
                            X[i].reshape(1, -1),
                            n_neighbors=1,
                            return_distance=False)
                        U[i] = real_idx[idx]

                        if y[i] == y[U_copy[i]] and y[i] != y[U[i]]:
                            gain = gain - 1.0
                        if y[i] != y[U_copy[i]] and y[i] == y[U[i]]:
                            gain = gain + 1.0

            if gain >= self.threshold:
                n = S.sum()
                g = self.fitness_gain(gain, n)
                fitness_s = fitness_s + g
                R = []
            else:
                U = U_copy
                S[j] = 1
                R.append(j)

        return list(S), fitness_s

    def main_loop(self, X, y):
        self.generate_population(X, y)
        n, worse_fit_index = 0, -1
        while (n < self.max_loop):
            parents = self.select_parents(X, y)
            offspring = self.crossover(parents[0], parents[1])
            offspring[0] = self.mutation(offspring[0])
            offspring[1] = self.mutation(offspring[1])

            fit_offs = [
                self.fitness(off, X, y) if sum(off) > 0 else -1
                for off in offspring
            ]

            if worse_fit_index == -1:
                worse_fit_index = np.argmin(self.evaluations)

            for i in range(len(offspring)):
                p_ls = 1.0

                if fit_offs[i] == -1:
                    p_ls = -1

                if fit_offs[i] <= self.evaluations[worse_fit_index]:
                    p_ls = 0.0625

                if np.random.uniform(0, 1) < p_ls:

                    offspring[i], fit_offs[i] = self.memetic_search(
                        offspring[i], X, y, chromosome_fitness=fit_offs[i])

            for i in range(len(offspring)):
                if fit_offs[i] > self.evaluations[worse_fit_index]:
                    self.chromosomes[worse_fit_index] = offspring[i]
                    self.evaluations[worse_fit_index] = fit_offs[i]

                    worse_fit_index = np.argmin(self.evaluations)

            n = n + 1
            if n % 10 == 0:
                self.update_threshold(X, y)

    def reduce_data(self, X, y):
        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes

        self.main_loop(X, y)

        best_index = np.argmax(self.evaluations)
        mask = np.asarray(self.chromosomes[best_index], dtype=bool)
        self.X_ = X[mask]
        self.y_ = y[mask]
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)

        return self.X_, self.y_
Example #37
0
class CNN(InstanceReductionMixin):
    """Condensed Nearest Neighbors.

    Each class is represented by a set of prototypes, with test samples
    classified to the class with the nearest prototype.
    The Condensed Nearest Neighbors removes the redundant instances,
    maintaining the samples in the decision boundaries.

    Parameters
    ----------
    n_neighbors : int, optional (default = 1)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `prototypes_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `labels_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.cnn import CNN
    >>> import numpy as np
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([1, 1, 1, 2, 2, 2])
    >>> cnn = CNN()
    >>> cnn.fit(X, y)
    CNN(n_neighbors=1)
    >>> print(cnn.predict([[-0.8, -1]]))
    [1]

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    Notes
    -----
    The Condensed Nearest Neighbor is one of the first prototype selection
    techniques in the literature.

    References
    ----------
    P. E. Hart, The condensed nearest neighbor rule, IEEE Transactions on 
    Information Theory 14 (1968) 515–516.

    """
    def __init__(self, n_neighbors=1):
        self.n_neighbors = n_neighbors
        self.classifier = None

    def reduce_data(self, X, y):

        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(
                n_neighbors=self.n_neighbors)

        prots_s = []
        labels_s = []

        classes = np.unique(y)
        self.classes_ = classes

        for cur_class in classes:
            mask = y == cur_class
            insts = X[mask]
            prots_s = prots_s + [insts[np.random.randint(0, insts.shape[0])]]
            labels_s = labels_s + [cur_class]

        self.classifier.fit(prots_s, labels_s)
        for sample, label in zip(X, y):
            if self.classifier.predict(sample.reshape(1, -1)) != [label]:
                prots_s = prots_s + [sample]
                labels_s = labels_s + [label]
                self.classifier.fit(prots_s, labels_s)

        self.X_ = np.asarray(prots_s)
        self.y_ = np.asarray(labels_s)
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
Example #38
0
def run_knn_probabilistic_classifier(train, train_labels, validate,
                                     validate_labels):
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(train, train_labels)
    predicted_labels = knn.predict_proba(validate)
    return metrics.log_loss(validate_labels, predicted_labels)
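# Hedged usage sketch for run_knn_probabilistic_classifier above, on a
# simple holdout split of the iris data (the split itself is an
# assumption, not part of the original snippet).
def _log_loss_sketch():
    from sklearn import datasets
    from sklearn.model_selection import train_test_split
    iris = datasets.load_iris()
    X_tr, X_va, y_tr, y_va = train_test_split(
        iris.data, iris.target, test_size=0.3, random_state=0)
    return run_knn_probabilistic_classifier(X_tr, y_tr, X_va, y_va)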
Example #39
0
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
iris = datasets.load_iris()
iris_X = iris.data
iris_y = iris.target
print(np.unique(iris_y))

np.random.seed(0)
indices = np.random.permutation(len(iris_X))
# hold out the last 10 shuffled samples for testing
# (the original double indexing made train and test identical)
iris_X_train = iris_X[indices[:-10]]
iris_y_train = iris_y[indices[:-10]]
iris_X_test = iris_X[indices[-10:]]
iris_y_test = iris_y[indices[-10:]]
knn = KNeighborsClassifier()
knn.fit(iris_X_train, iris_y_train)
print("Predicted:")
print(knn.predict(iris_X_test))
print("Actual:")
print(iris_y_test)
i = knn.predict(iris_X_test) == iris_y_test
k = 0
for j in i:
    if not j:
        k += 1
print(len(i))
print("Number of wrong predictions:")
print(k)

diabetes = datasets.load_diabetes()  # diabetes dataset
diabetes_X_train = diabetes.data[:-20]
Example #40
0
class SGP2(SGP):
    """Self-Generating Prototypes 2

    The Self-Generating Prototypes 2 is the second version of the
    Self-Generating Prototypes algorithm.
    It has a higher generalization power, including the procedures
    merge and pruning.

    Parameters
    ----------
    r_min: float, optional (default = 0.0)
        Determine the minimum size of a cluster [0.00, 0.20]

    r_mis: float, optional (default = 0.0)
        Determine the error tolerance before split a group

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.generation.sgp import SGP2
    >>> import numpy as np
    >>> X = [np.asarray(range(1,13)) + np.asarray([0.1,0,-0.1,0.1,0,-0.1,0.1,-0.1,0.1,-0.1,0.1,-0.1])]
    >>> X = np.asarray(X).T
    >>> y = np.array([1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1])
    >>> sgp2 = SGP2()
    >>> sgp2.fit(X, y)
    SGP2(r_min=0.0, r_mis=0.0)
    >>> print(sgp2.reduction_)
    0.5

    See also
    --------
    protopy.generation.SGP: self-generating prototypes
    protopy.generation.sgp.ASGP: adaptive self-generating prototypes

    References
    ----------
    Hatem A. Fayed, Sherif R Hashem, and Amir F Atiya. Self-generating prototypes
    for pattern classification. Pattern Recognition, 40(5):1498–1509, 2007.
    """
    def __init__(self, r_min=0.0, r_mis=0.0):
        self.r_min = r_min
        self.r_mis = r_mis
        self.n_neighbors = 1
        self.classifier = None
        self.groups = None


    def reduce_data(self, X, y):
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        classes = np.unique(y)
        self.classes_ = classes

        # loading initial groups
        self.groups = []
        for label in classes:
            mask = y == label
            self.groups = self.groups + [_Group(X[mask], label)]

        self._main_loop()
        self._generalization_step()
        self._merge()
        self._pruning()
        self.X_ = np.asarray([g.rep_x for g in self.groups])
        self.y_ = np.asarray([g.label for g in self.groups])
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)
        return self.X_, self.y_


    def _merge(self):

        if len(self.groups) < 2:
            return self.groups

        merged = False
        for group in self.groups:
            reps_x = np.asarray([g.rep_x for g in self.groups])
            reps_y = np.asarray([g.label for g in self.groups])
            self.classifier.fit(reps_x, reps_y)

            nn2_idx = self.classifier.kneighbors(group.X, n_neighbors=2, return_distance=False)
            nn2_idx = nn2_idx.T[1]

            # could use a threshold
            if len(set(nn2_idx)) == 1 and reps_y[nn2_idx[0]] == group.label:
                ng_group = self.groups[nn2_idx[0]]
                ng2_idx = self.classifier.kneighbors(ng_group.X, n_neighbors=2, return_distance=False)
                ng2_idx = ng2_idx.T[1]
                if len(set(ng2_idx)) == 1 and self.groups[ng2_idx[0]] == group:
                    group.add_instances(ng_group.X, update=True)
                    self.groups.remove(ng_group)
                    merged = True
                
        if merged:
            self._merge()

        return self.groups


    def _pruning(self):

        if len(self.groups) < 2:
            return self.groups

        pruned, fst = False, True
        knn = KNeighborsClassifier(n_neighbors=1, algorithm='brute')
        
        while pruned or fst:
            index = 0
            pruned, fst = False, False

            while index < len(self.groups):
                group = self.groups[index]

                mask = np.ones(len(self.groups), dtype=bool)
                mask[index] = False
                reps_x = np.asarray([g.rep_x for g in self.groups])[mask]
                reps_y = np.asarray([g.label for g in self.groups])[mask]
                labels = knn.fit(reps_x, reps_y).predict(group.X)

                if (labels == group.label).all():
                    self.groups.remove(group)
                    pruned = True
                else:
                    index = index + 1

                if len(self.groups) == 1:
                    index = len(self.groups)
                    pruned = False

        return self.groups
Example #42
0
class ENN(InstanceReductionMixin):

    """Edited Nearest Neighbors.

    The Edited Nearest Neighbors removes the instances in the
    boundaries, maintaining redundant samples.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.enn import ENN
    >>> import numpy as np
    >>> X = np.array([[-1, 0], [-0.8, 1], [-0.8, -1], [-0.5, 0] , [0.5, 0], [1, 0], [0.8, 1], [0.8, -1]])
    >>> y = np.array([1, 1, 1, 2, 1, 2, 2, 2])
    >>> editednn = ENN()
    >>> editednn.fit(X, y)
    ENN(n_neighbors=3)
    >>> print(editednn.predict([[-0.6, 0.6]]))
    [1]
    >>> print(editednn.reduction_)
    0.75

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Ruiqin Chang, Zheng Pei, and Chao Zhang. A modified editing k-nearest
    neighbor rule. JCP, 6(7):1493–1500, 2011.

    """

    def __init__(self, n_neighbors=3):
        self.n_neighbors = n_neighbors
        self.classifier = None


    def reduce_data(self, X, y):
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes

        if self.n_neighbors >= len(X):
            self.X_ = np.array(X)
            self.y_ = np.array(y)
            self.reduction_ = 0.0
            return self.X_, self.y_

        mask = np.zeros(y.size, dtype=bool)

        tmp_m = np.ones(y.size, dtype=bool)
        for i in range(y.size):
            tmp_m[i] = not tmp_m[i]
            self.classifier.fit(X[tmp_m], y[tmp_m])
            sample, label = X[i], y[i]

            if self.classifier.predict(sample.reshape(1, -1)) == [label]:
                mask[i] = not mask[i]

            tmp_m[i] = not tmp_m[i]

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
def nearest_fit(X, y):
    clf = KNeighborsClassifier(n_neighbors=7, weights='distance')
    return clf.fit(X, y)
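# Hedged usage sketch for nearest_fit above: with weights='distance',
# closer neighbors cast larger votes. The two-blob data is synthetic.
import numpy as np

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.randn(50, 2), rng.randn(50, 2) + 3])
y_demo = np.array([0] * 50 + [1] * 50)
clf = nearest_fit(X_demo, y_demo)
print(clf.predict([[0, 0], [3, 3]]))  # expected: [0 1]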