Пример #1
class DCS(object):
    """Base class for dynamic classifier selection over a validation set.

    Stores the validation samples and a nearest-neighbour model fitted on
    them. Subclasses implement :meth:`select`.
    """

    def select(self, ensemble, x):
        """Select from ``ensemble`` for the query sample ``x``.

        NOTE(review): the body of this method was missing from the text
        this block was recovered from; it is treated as an abstract hook.
        Confirm against the original project.
        """
        raise NotImplementedError("subclasses must implement select()")

    def __init__(self, Xval, yval, K=5, weighted=False, knn=None):
        """Store the validation set and fit the neighbour model on it.

        Parameters
        ----------
        Xval, yval : validation samples and their targets; indexed with the
            neighbour indices in :meth:`get_neighbors`.
        K : number of neighbours used when the default model is built.
        weighted : flag stored on the instance for subclasses to read.
        knn : optional pre-built neighbour model exposing ``fit`` and
            ``kneighbors``; when ``None`` a brute-force
            ``KNeighborsClassifier(n_neighbors=K)`` is created.
        """
        self.Xval = Xval
        self.yval = yval
        self.K = K

        # Only build the default model when the caller did not supply one.
        if knn is None:
            knn = KNeighborsClassifier(n_neighbors=K, algorithm='brute')
        self.knn = knn

        self.knn.fit(Xval, yval)
        self.weighted = weighted

    def get_neighbors(self, x, return_distance=False):
        """Return the validation neighbours of a single query sample.

        ``x`` must hold exactly one sample: the result of ``kneighbors`` is
        unpacked as a one-row array.

        Returns ``(X_nn, y_nn)`` or, when ``return_distance`` is true,
        ``(X_nn, y_nn, dists)``.
        """
        # obtain the K nearest neighbors of test sample in the validation set
        if return_distance:
            [dists], [idx] = self.knn.kneighbors(x, return_distance=True)
        else:
            [idx] = self.knn.kneighbors(x, return_distance=False)

        X_nn = self.Xval[idx]  # k neighbors
        y_nn = self.yval[idx]  # k neighbors target

        if return_distance:
            return X_nn, y_nn, dists
        return X_nn, y_nn
Пример #2
class KNeighborsClassifierImpl():
    """Wrapper that records hyperparameters and builds ``SKLModel`` on fit."""

    def __init__(self, n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None):
        # Keep the constructor arguments verbatim; they are forwarded to
        # SKLModel as keyword arguments every time fit() is called.
        self._hyperparams = dict(
            n_neighbors=n_neighbors,
            weights=weights,
            algorithm=algorithm,
            leaf_size=leaf_size,
            p=p,
            metric=metric,
            metric_params=metric_params,
            n_jobs=n_jobs,
        )

    def fit(self, X, y=None):
        """Create a fresh ``SKLModel`` and fit it when targets are given.

        Without ``y`` the new model is stored unfitted. Returns ``self``.
        """
        estimator = SKLModel(**self._hyperparams)
        self._sklearn_model = estimator
        if y is None:
            return self
        estimator.fit(X, y)
        return self

    def predict(self, X):
        """Delegate to the wrapped model's ``predict``."""
        wrapped = self._sklearn_model
        return wrapped.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped model's ``predict_proba``."""
        wrapped = self._sklearn_model
        return wrapped.predict_proba(X)
Пример #3
 def do(train_data, train_label, test_data, test_label=None, adjust_parameters=True, k=5):
     """Classify ``test_data`` with k-NN, optionally searching k in 1..10.

     All inputs are converted to NumPy arrays and squeezed.

     With ``adjust_parameters=False`` a single model with ``n_neighbors=k``
     is fitted; the accuracy is printed when ``test_label`` is given and
     the predicted labels are returned.

     With ``adjust_parameters=True`` every k from 1 to 10 is evaluated on
     ``test_label`` and ``(best_predictions, best_k)`` is returned; the
     first k reaching the highest accuracy wins.

     Raises:
         ValueError: if a parameter search is requested without
             ``test_label`` (accuracy cannot be computed).
     """
     train_data = np.array(train_data).squeeze()
     train_label = np.array(train_label).squeeze()
     test_data = np.array(test_data).squeeze()
     if test_label is not None:
         test_label = np.array(test_label).squeeze()

     if not adjust_parameters:
         knn = KNeighborsClassifier(n_neighbors=k, n_jobs=8)
         knn.fit(train_data, train_label)
         predicted_label = knn.predict(test_data)
         if test_label is not None:
             acc = accuracy_score(test_label, predicted_label)
             # Single pre-formatted string: same output on Python 2 and 3.
             print('acc is  %s' % (acc,))
         return predicted_label

     if test_label is None:
         raise ValueError('test_label is required when adjust_parameters is True')

     max_acc = 0.0
     max_k = 0
     max_predicted = None
     # Separate loop name so the ``k`` argument is not shadowed.
     for candidate_k in range(1, 11):
         knn = KNeighborsClassifier(n_neighbors=candidate_k, n_jobs=8)
         knn.fit(train_data, train_label)
         predicted_label = knn.predict(test_data)
         acc = accuracy_score(test_label, predicted_label)
         if acc > max_acc:
             max_acc = acc
             max_k = candidate_k
             max_predicted = predicted_label
         print('k =  %s  acc is  %s' % (candidate_k, acc))
     print('max acc is  %s  responding to k is  %s' % (max_acc, max_k))
     return max_predicted, max_k
Пример #4
class PatchedRawModel:
    """``RawModel`` with second-opinion k-NN models for two label pairs.

    Predictions of the base model that fall in {4, 9} or {3, 5} are
    re-checked by a k-NN model trained only on examples of that pair; when
    the specialist disagrees, its answer replaces the base prediction.
    """

    def __init__(self):
        self.baseModel = RawModel()
        self.model49 = KNeighborsClassifier(n_neighbors=10)
        self.model35 = KNeighborsClassifier(n_neighbors=10)

    @staticmethod
    def _flatten(example):
        """Return the example's ``X`` as a single ``1 x (WIDTH*HEIGHT)`` row."""
        return reshape(example.X, (1, example.WIDTH * example.HEIGHT))

    def _fit_pair(self, model, trainExamples, labels):
        """Fit ``model`` on the training examples whose ``Y`` is in ``labels``."""
        subset = [x for x in trainExamples if x.Y in labels]
        model.fit(vstack([self._flatten(x) for x in subset]),
                  [x.Y for x in subset])

    def fit(self, trainExamples):
        """Fit the base model and both pair-specific models; returns ``self``.

        NOTE(review): the text this was recovered from never fitted
        ``baseModel`` (a line appears to have been lost), which would make
        ``predict`` fail on an unfitted model. Confirm against the original.
        """
        self.baseModel.fit(trainExamples)
        self._fit_pair(self.model49, trainExamples, (4, 9))
        self._fit_pair(self.model35, trainExamples, (3, 5))
        return self

    def predict(self, examples):
        """Return base predictions, overridden where a specialist disagrees."""
        basePredictions = self.baseModel.predict(examples)

        for i, (x, y) in enumerate(zip(examples, basePredictions)):
            if y in (4, 9):
                specialist = self.model49
            elif y in (3, 5):
                specialist = self.model35
            else:
                continue
            # predict() returns a length-1 array for the single row; take
            # the scalar so the comparison and assignment are unambiguous.
            specializedPrediction = specialist.predict(self._flatten(x))[0]
            if specializedPrediction != y:
                basePredictions[i] = specializedPrediction

        return basePredictions
Пример #5
def compute_cnn(X, y):
  """Condensed nearest neighbor (CNN) reduction; prints the reduction ratio.

  CNN removes redundant instances, keeping the samples on the decision
  boundaries. One random instance per class seeds the prototype set; every
  sample the current 3-NN model misclassifies is then added and the model
  is refitted.

  The function prints ``1 - len(prototypes) / len(y)`` and returns nothing.

  NOTE(review): with fewer than three classes the seed set is smaller than
  ``n_neighbors=3``; confirm scikit-learn accepts that on the first
  ``predict`` call.
  """
  classifier = KNeighborsClassifier(n_neighbors=3)

  prots_s = []
  labels_s = []

  # Seed with one randomly chosen instance of each class.
  for cur_class in np.unique(y):
    insts = X[y == cur_class]
    prots_s.append(insts[np.random.randint(0, insts.shape[0])])
    labels_s.append(cur_class)
  classifier.fit(prots_s, labels_s)

  for sample, label in zip(X, y):
    # predict() expects a 2-D batch, so wrap the single sample in a list.
    if classifier.predict([sample])[0] != label:
      prots_s.append(sample)
      labels_s.append(label)
      classifier.fit(prots_s, labels_s)

  y_ = np.asarray(labels_s)
  # Convert before dividing: int/int truncates to 0 on Python 2.
  reduction_ = 1.0 - float(len(y_)) / len(y)
  print(reduction_)
Пример #6
class RawModel:
    """Truncated-SVD projection followed by a 5-nearest-neighbour classifier."""

    def __init__(self):
        # 2015-05-15 GEL Found that n_components=20 gives a nice balance of
        # speed (substantial improvement), accuracy, and reduced memory usage
        # (25% decrease).
        self.decomposer = TruncatedSVD(n_components=20)

        # 2015-05-15 GEL algorithm='ball_tree' uses less memory on average than
        # algorithm='kd_tree'
        # 2015-05-15 GEL Evaluation of metrics by accuracy (based on 8000 training examples)
        # euclidean        0.950025
        # manhattan        0.933533
        # chebyshev        0.675662
        # hamming          0.708646
        # canberra         0.934033
        # braycurtis       0.940530
        self.model = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', metric='euclidean')

    def fit(self, trainExamples):
        """Fit the decomposer and the classifier on the examples; returns ``self``."""
        # One 1 x (WIDTH*HEIGHT) row per example, stacked into a matrix.
        rows = []
        for example in trainExamples:
            rows.append(reshape(example.X, (1, example.WIDTH * example.HEIGHT)))
        reduced = self.decomposer.fit_transform(vstack(rows))

        targets = [example.Y for example in trainExamples]
        self.model.fit(reduced, targets)
        return self

    def predict(self, examples):
        """Project the examples with the fitted decomposer and classify them."""
        rows = []
        for example in examples:
            rows.append(reshape(example.X, (1, example.WIDTH * example.HEIGHT)))
        reduced = self.decomposer.transform(vstack(rows))
        return self.model.predict(reduced)
Пример #7
def run_main():
    """Choose k for a k-NN classifier on the voice dataset and print its test accuracy.

    Reads ``../dataset/voice.csv``, uses every column except the last as
    features and the ``label`` column (``male`` -> 0, ``female`` -> 1) as
    the target, standardises the features, holds out one third for
    testing, cross-validates k = 1..30 on the training part and evaluates
    the best k on the held-out part.

    NOTE(review): the ``train_test_split`` and ``cross_val_score`` calls
    were truncated in the text this was recovered from. Only the arguments
    that are certain were restored; any ``random_state`` / ``cv`` /
    ``scoring`` arguments of the original are unknown.

    Returns:
        An empty string.
    """
    file_df = pd.read_csv('../dataset/voice.csv')

    # Optional exploratory plots used during development (helpers are not
    # part of this block):
    #   visaulize_two_feature(file_df, 'meanfun', 'centroid')
    #   visaulize_single_feature(file_df, 'meanfun')
    #   visaulize_muilt_feature(
    #       file_df, ['meanfreq', 'Q25', 'Q75', 'skew', 'centroid', 'label'])

    X = file_df.iloc[:, :-1].values
    # Encode the target without mutating the frame through a chained
    # in-place replace.
    y = file_df['label'].replace({'male': 0, 'female': 1}).values

    X = preprocessing.scale(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=1 / 3.)

    # Model selection by cross-validation.
    cv_scores = []
    k_range = range(1, 31)
    for k in k_range:
        knn = KNeighborsClassifier(k)
        scores = cross_val_score(knn, X_train, y_train)
        score_mean = scores.mean()
        # Record the score; without this the argmax below has no data.
        cv_scores.append(score_mean)
        print('%i:%.4f' % (k, score_mean))

    best_k = k_range[int(np.argmax(cv_scores))]

    knn_model = KNeighborsClassifier(best_k)
    knn_model.fit(X_train, y_train)
    print('测试模型,准确率: %s' % (knn_model.score(X_test, y_test),))

    return ''
Пример #8
Пример #9
def KNN_method(X, y):
    """Print k-NN validation accuracy per fold for several k values and metrics.

    Uses 4-fold stratified cross-validation. For every fold a dict mapping
    each distance metric to the list of accuracies (one per entry of the
    k grid, in order) is printed. Nothing is returned.
    """
    # ``random_state`` only has an effect together with ``shuffle=True``
    # and current scikit-learn rejects it otherwise; it is omitted so the
    # deterministic, unshuffled folds are kept.
    skf = StratifiedKFold(n_splits=4)

    # how many neighbours want to use in the KNC
    kvalues = [1, 3, 5, 7, 9, 11, 13, 15, 19, 24, 30, 40, 50, 60, 70, 90]
    dist = ['manhattan', 'euclidean', 'chebyshev']

    for train_index, test_index in skf.split(X, y):
        print("Train:", train_index, "Validation:", test_index)
        trainX, testX = X[train_index], X[test_index]
        trainY, testY = y[train_index], y[test_index]

        results = {}
        for element in dist:
            accuracy_results = []
            for k in kvalues:
                knn = KNeighborsClassifier(n_neighbors=k, metric=element)
                knn.fit(trainX, trainY)
                predictedY = knn.predict(testX)
                accuracy_results.append(accuracy_score(testY, predictedY))
            results[element] = accuracy_results
        print("Results of model preparation for: " + str(results))

        # NOTE(review): the original continued with a call of which only
        # the argument fragment 'KNN variants' survived (probably a
        # plotting helper); restore it from the original project if needed.
Пример #10
    def _main_loop(self):
        """Repeatedly split or rearrange ``self.groups`` using a 1-NN over group representatives.

        Each pass visits every group, fits a brute-force 1-NN on the
        representatives (``rep_x`` / ``label``) of all groups and looks up
        the nearest representative of each instance in the current group.
        The outer loop stops once ``exit_count`` reaches the number of
        groups. Returns ``self.groups``.

        NOTE(review): several lines of this method are missing from the
        text it was recovered from (two ``for`` loops have no body, at
        least one ``else:`` is gone and one line is mis-indented), so it
        does not parse as written. The notes below mark the gaps.
        """
        exit_count = 0
        knn = KNeighborsClassifier(n_neighbors = 1, algorithm='brute')
        while exit_count < len(self.groups):
            index, exit_count = 0, 0
            while index < len(self.groups):

                group = self.groups[index]
                reps_x = np.asarray([g.rep_x for g in self.groups])
                reps_y = np.asarray([g.label for g in self.groups])
                knn.fit(reps_x, reps_y)
                # Index of the nearest group representative for every
                # instance of the current group.
                nn_idx = knn.kneighbors(group.X, n_neighbors=1, return_distance=False)
                nn_idx = nn_idx.T[0]
                # True where an instance's nearest representative is its own group's.
                mask = nn_idx == index
                # if all are correctly classified
                if not (False in mask):
                    exit_count = exit_count + 1
                # if all are misclassified
                elif not (group.label in reps_y[nn_idx]):
                    # NOTE(review): no pca.fit(...) call is visible before
                    # the transform calls below; presumably a line was lost.
                    pca = PCA(n_components=1)
                    # maybe use a 'for' instead of creating array
                    d = pca.transform(reps_x[index])
                    dis = [pca.transform(inst)[0] for inst in group.X]
                    # Split the group at the representative's projection.
                    mask_split = (dis < d).flatten()
                    new_X = group.X[mask_split]
                    self.groups.append(_Group(new_X, group.label))
                    group.X = group.X[~mask_split]
                # All nearest representatives carry the right label but
                # some belong to another group: move those instances.
                elif (reps_y[nn_idx] == group.label).all() and (nn_idx != index).any():
                    mask_mv = nn_idx != index
                    index_mv = np.asarray(range(len(group)))[mask_mv]
                    X_mv = group.remove_instances(index_mv)
                    G_mv = nn_idx[mask_mv]                        

                    # NOTE(review): loop body missing; presumably each x is
                    # added to group g. Confirm against the original.
                    for x, g in zip(X_mv, G_mv):

                # Fraction of instances whose nearest representative has a
                # different label exceeds self.r_mis: split them off.
                elif (reps_y[nn_idx] != group.label).sum()/float(len(group)) > self.r_mis:
                    mask_mv = reps_y[nn_idx] != group.label
                    new_X = group.X[mask_mv]
                    self.groups.append(_Group(new_X, group.label))
                    group.X = group.X[~mask_mv]
                   # NOTE(review): mis-indented; this line looks like the
                   # body of a lost ``else:`` branch.
                   exit_count = exit_count + 1

                # NOTE(review): advancing only when the group is empty looks
                # wrong; a removal of the empty group and an ``else:`` were
                # probably lost here.
                if len(group) == 0:
                    index = index + 1

                # NOTE(review): loop body missing; presumably refreshes each
                # group's representative. Confirm against the original.
                for g in self.groups:

        return self.groups                     
Пример #11
def evaluate(Xtra, ytra, Xtst, ytst, k=1, positive_label=1):
    """Fit a brute-force k-NN on the training split and score its test predictions.

    Returns whatever ``evaluate_results`` returns for the true test labels,
    the predicted labels and ``positive_label``.
    """
    model = KNeighborsClassifier(algorithm='brute', n_neighbors=k)
    model.fit(Xtra, ytra)
    predictions = model.predict(Xtst)
    return evaluate_results(ytst, predictions, positive_label=positive_label)
Пример #12
def knn(X, y, model_path):
    """Fit a default k-NN on (X, y), print training-set reports and save the model.

    The classification report and the confusion matrix are computed on the
    same data the model was fitted on; the fitted model is then dumped to
    ``model_path`` with joblib.
    """
    classifier = KNeighborsClassifier()
    classifier.fit(X, y)
    fitted_on = y
    y_hat = classifier.predict(X)
    for report in (metrics.classification_report, metrics.confusion_matrix):
        print(report(fitted_on, y_hat))
    joblib.dump(classifier, model_path)
Пример #13
    def get_result(self):
        """Train four classifiers on a chosen e-mail folder and print test confusion matrices.

        The user picks the training directory and then the test directory
        through file dialogs. Labels are hard-coded: 1430 training rows
        (first 715 labelled 0, rest 1) and 600 test rows (first 300
        labelled 0, rest 1). Returns the value of
        ``read_emails_from_directory`` for the training directory.
        """
        # file opener
        directory = filedialog.askdirectory()
        result = self.read_emails_from_directory(directory)

        # Rows 1-715 = HAM (0) and 716-1430 = SPAM (1).
        train_labels = np.zeros(1430)
        train_labels[715:1430] = 1
        # The features come from result[0]; if another entry of ``result``
        # is used here, use the same entry for the test matrix below.
        train_matrix = self.extract_features(directory, result[0])

        titles = ("body words:", "subject words:", "body phrases:", "subject phrases:")
        for position, title in enumerate(titles):
            print(title, len(result[position]))

        models = [
            MultinomialNB(),
            LinearSVC(),
            RandomForestClassifier(),
            KNeighborsClassifier(),
        ]
        for model in models:
            model.fit(train_matrix, train_labels)

        test_dir = filedialog.askdirectory()
        # Same ``result`` entry as for the training matrix.
        test_matrix = self.extract_features(test_dir, result[0])
        # Rows 1-300 = HAM (0) and 301-600 = SPAM (1).
        test_labels = np.zeros(600)
        test_labels[300:600] = 1

        predictions = [model.predict(test_matrix) for model in models]
        for predicted in predictions:
            print(confusion_matrix(test_labels, predicted))
        return result
Пример #14
    def _pruning(self):
        """Remove groups that the remaining groups' representatives already cover.

        A group is dropped when a 1-NN fitted on the representatives
        (``rep_x`` / ``label``) of all *other* groups assigns the group's
        own label to every one of its instances. Sweeps repeat until a
        full pass removes nothing or a single group is left.

        NOTE(review): in the text this was recovered from the removal and
        the ``else`` branch were missing: nothing was ever pruned and the
        inner loop could not advance past a group that is not fully
        covered. The logic below is the evident intent; confirm against
        the original project.

        Returns ``self.groups`` (modified in place).
        """
        if len(self.groups) < 2:
            return self.groups

        knn = KNeighborsClassifier(n_neighbors = 1, algorithm='brute')
        pruned = True  # forces the first sweep
        while pruned:
            pruned = False
            index = 0

            while index < len(self.groups):
                group = self.groups[index]

                # Representatives of every group except the current one.
                mask = np.ones(len(self.groups), dtype=bool)
                mask[index] = False
                reps_x = np.asarray([g.rep_x for g in self.groups])[mask]
                reps_y = np.asarray([g.label for g in self.groups])[mask]
                labels = knn.fit(reps_x, reps_y).predict(group.X)

                if (labels == group.label).all():
                    # Delete by position; the next group slides into
                    # ``index``, so the index must not advance.
                    del self.groups[index]
                    pruned = True
                else:
                    index = index + 1

                # A single remaining group cannot be compared to anything.
                if len(self.groups) == 1:
                    index = len(self.groups)
                    pruned = False

        return self.groups
Пример #15
    def index_nearest_neighbor(self, S, X, y):
        """Return, for every sample, the index of its nearest selected sample.

        Parameters
        ----------
        S : sequence convertible to a boolean mask over the samples; true
            entries mark the selected subset.
        X, y : the samples and their labels (indexable with a boolean mask).

        Returns
        -------
        list of length ``len(y)`` whose i-th entry is the index in ``X`` of
        the selected sample closest to ``X[i]``.
        """
        if len(y) == 0:
            return []

        S_mask = np.array(S, dtype=bool, copy=True)
        # Maps positions inside the selected subset back to indices in X.
        real_indexes = np.arange(len(y))[S_mask]

        # The subset is the same for every query, so fit once and query
        # all samples in one call instead of refitting per sample.
        classifier = KNeighborsClassifier(n_neighbors=1)
        classifier.fit(X[S_mask], y[S_mask])
        nearest = classifier.kneighbors(X, n_neighbors=1, return_distance=False)

        return list(real_indexes[nearest[:, 0]])
Пример #17
def compute_enn(X, y):
  """Edited nearest neighbors (ENN) reduction; prints the reduction ratio.

  ENN removes the instances in the boundaries, maintaining redundant
  samples: a 3-NN model is fitted on the whole data set and only the
  samples it classifies correctly are kept.

  The function prints ``1 - kept / len(y)`` and returns nothing.
  """
  classifier = KNeighborsClassifier(n_neighbors=3)
  classifier.fit(X, y)

  # The model is fitted once on all data, so one batch prediction gives
  # the same per-sample answers as predicting one sample at a time.
  mask = classifier.predict(X) == y

  y_ = np.asarray(y[mask])
  reduction_ = 1.0 - float(len(y_)) / len(y)
  print(reduction_)
    def check_accuracy_f1(self, path):
        """Score the feature subset ``path`` with the globally selected model.

        The columns of the module-level ``feature`` matrix listed in
        ``path`` are split 70/30 together with the module-level ``label``;
        the classifier named by the module-level ``select_model`` ("SVM",
        "KNN", "RF", "LR" or "DT") is fitted on the training part.

        Returns:
            ``(accuracy, f1)`` measured on the held-out part.

        Raises:
            ValueError: if ``select_model`` is not one of the names above.

        NOTE(review): the ``f1_score`` call was truncated in the text this
        was recovered from; any extra arguments (e.g. ``average=``) of the
        original are unknown.
        """
        # Stack the selected columns, then transpose so that rows are
        # samples and columns are the chosen features.
        data_after_feature_selected = np.array(
            [feature[:, i] for i in path]).transpose()
        X_train2, X_test2, y_train2, y_test2 = train_test_split(
            data_after_feature_selected, label, test_size=0.3)

        if select_model == "SVM":
            model = svm.SVC(kernel='poly', gamma=0.125, C=20)
        elif select_model == "KNN":
            model = KNeighborsClassifier(n_neighbors=1)
        elif select_model == "RF":
            model = RandomForestClassifier()
        elif select_model == "LR":
            model = LogisticRegression()
        elif select_model == "DT":
            model = DecisionTreeClassifier()
        else:
            # Fail with a clear message instead of an unbound variable.
            raise ValueError("unknown select_model: %r" % (select_model,))

        model.fit(X_train2, y_train2)
        prediction2 = model.predict(X_test2)

        return accuracy_score(y_test2, prediction2), f1_score(y_test2,
                                                              prediction2)
        model = KNeighborsClassifier(n_neighbors=1)
        model.fit(feature[train_index, :][:, top], label[train_index])
        prediction = model.predict(feature[test_index, :][:, top])
        acc, f = get_result(label[test_index], prediction)

        model = RandomForestClassifier(n_estimators=250)
        model.fit(feature[train_index, :][:, top], label[train_index])
        prediction = model.predict(feature[test_index, :][:, top])
        acc, f = get_result(label[test_index], prediction)

def nearest_fit(X, y):
    """Fit a distance-weighted 7-nearest-neighbour classifier on (X, y) and return it."""
    # Keyword arguments: current scikit-learn accepts everything after
    # ``n_neighbors`` by keyword only.
    clf = KNeighborsClassifier(n_neighbors=7, weights='distance')
    return clf.fit(X, y)

Пример #22
class CNN(InstanceReductionMixin):
    """Condensed Nearest Neighbors.

    Each class is represented by a set of prototypes, with test samples
    classified to the class with the nearest prototype.
    The Condensed Nearest Neighbors removes the redundant instances,
    maintaining the samples in the decision boundaries.

    Parameters
    ----------
    n_neighbors : int, optional (default = 1)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.cnn import CNN
    >>> import numpy as np
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([1, 1, 1, 2, 2, 2])
    >>> cnn = CNN()
    >>> _ = cnn.fit(X, y)
    >>> print(cnn.predict([[-0.8, -1]]))
    [1]

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    Notes
    -----
    The Condensed Nearest Neighbor is one the first prototype selection
    technique in literature.

    References
    ----------
    P. E. Hart, The condensed nearest neighbor rule, IEEE Transactions on
    Information Theory 14 (1968) 515–516.
    """

    def __init__(self, n_neighbors=1):
        self.n_neighbors = n_neighbors
        self.classifier = None

    def reduce_data(self, X, y):
        """Select the condensed prototype set of (X, y).

        One random instance per class seeds the set; every sample the
        current model misclassifies is added and the model is refitted.
        Sets ``classes_``, ``X_``, ``y_`` and ``reduction_`` and returns
        ``(X_, y_)``.
        """
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        # Honour a changed ``n_neighbors`` on an already created classifier.
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        prots_s = []
        labels_s = []

        classes = np.unique(y)
        self.classes_ = classes

        # Seed with one randomly chosen instance of each class.
        for cur_class in classes:
            insts = X[y == cur_class]
            prots_s.append(insts[np.random.randint(0, insts.shape[0])])
            labels_s.append(cur_class)

        self.classifier.fit(prots_s, labels_s)
        for sample, label in zip(X, y):
            # predict() expects a 2-D batch; present the sample as one row.
            if self.classifier.predict(sample.reshape(1, -1))[0] != label:
                prots_s.append(sample)
                labels_s.append(label)
                self.classifier.fit(prots_s, labels_s)

        self.X_ = np.asarray(prots_s)
        self.y_ = np.asarray(labels_s)
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)
        return self.X_, self.y_
Пример #23
def knn_score(X, y, neighbors):
    """Fit a k-NN with ``neighbors`` neighbours on (X, y) and print its training accuracy.

    The accuracy is measured on the same data the model was fitted on.
    Nothing is returned.
    """
    model = KNeighborsClassifier(n_neighbors=neighbors)
    model.fit(X, y)
    y_pred = model.predict(X)
    # Parenthesised single argument: valid on both Python 2 and Python 3.
    print("KNN{} accuracy_score: {}".format(neighbors,
                                            metrics.accuracy_score(y, y_pred)))
Пример #24
# Current best: KNC (k-nearest neighbours), SVC (support vector machine)
# Visualise the evaluation results
_Fig = plt.figure()
_Ax = _Fig.add_subplot(111)
# Start of the prediction step...
from sklearn import metrics
# k-nearest-neighbours prediction
# NOTE(review): the lines below are the surviving arguments of a call whose
# opening (and the code producing _KNC_PREDICTIONS) is missing from this
# text; presumably a formatted print of accuracy and confusion matrix.
    " " * 4,
    " " * 8,
    metrics.accuracy_score(y_true=_Y_VAL, y_pred=_KNC_PREDICTIONS),
    " " * 4,
    metrics.confusion_matrix(y_true=_Y_VAL, y_pred=_KNC_PREDICTIONS),
Пример #25
class ENN(InstanceReductionMixin):

    """Edited Nearest Neighbors.

    The Edited Nearest Neighbors removes the instances in the
    boundaries, maintaining redundant samples.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.enn import ENN
    >>> import numpy as np
    >>> X = np.array([[-1, 0], [-0.8, 1], [-0.8, -1], [-0.5, 0] , [0.5, 0], [1, 0], [0.8, 1], [0.8, -1]])
    >>> y = np.array([1, 1, 1, 2, 1, 2, 2, 2])
    >>> editednn = ENN()
    >>> _ = editednn.fit(X, y)
    >>> print(editednn.predict([[-0.6, 0.6]]))  # doctest: +SKIP
    >>> print(editednn.reduction_)  # doctest: +SKIP

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Ruiqin Chang, Zheng Pei, and Chao Zhang. A modified editing k-nearest
    neighbor rule. JCP, 6(7):1493–1500, 2011.
    """

    def __init__(self, n_neighbors=3):
        self.n_neighbors = n_neighbors
        self.classifier = None

    def reduce_data(self, X, y):
        """Keep the samples a leave-one-out k-NN classifies correctly.

        Every sample is classified by a model fitted on all the other
        samples and kept only when the prediction equals its label. When
        ``n_neighbors`` is not smaller than the number of samples the data
        is returned unchanged. Sets ``classes_``, ``X_``, ``y_`` and
        ``reduction_`` and returns ``(X_, y_)``.
        """
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        # NOTE(review): ``check_arrays`` exists only in old scikit-learn
        # releases; newer ones provide ``check_X_y`` instead. Confirm the
        # targeted version.
        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes

        # shape[0] rather than len(): also defined for sparse matrices.
        if self.n_neighbors >= X.shape[0]:
            self.X_ = np.array(X)
            self.y_ = np.array(y)
            self.reduction_ = 0.0
            return self.X_, self.y_

        mask = np.zeros(y.size, dtype=bool)

        tmp_m = np.ones(y.size, dtype=bool)
        for i in range(y.size):
            # Leave sample i out, fit on the rest, then classify it.
            tmp_m[i] = False
            self.classifier.fit(X[tmp_m], y[tmp_m])

            # predict() expects a 2-D batch; present the sample as one row.
            if self.classifier.predict(X[i].reshape(1, -1))[0] == y[i]:
                mask[i] = True

            tmp_m[i] = True

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
Пример #26
import matplotlib.pyplot as plt
import mglearn

# Compare training and test accuracy of k-NN on the breast-cancer data set
# for n_neighbors = 1..10 and plot both curves.
cancer = load_breast_cancer()
# print(cancer.data)            # X : data
# print(cancer.target_names)    # y : label
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target,
                                                    stratify = cancer.target, random_state=66)

training_accuracy = []
test_accuracy = []

# apply n_neighbors from 1 to 10
neighbors_setting = range(1,11)

for n_neighbors in neighbors_setting:
    # build the model
    clf = KNeighborsClassifier(n_neighbors= n_neighbors)
    clf.fit(X_train, y_train)
    # store the training-set accuracy
    training_accuracy.append(clf.score(X_train, y_train))
    # store the generalisation (test-set) accuracy
    test_accuracy.append(clf.score(X_test, y_test))

# Legend label typo fixed ('traing' -> 'training').
plt.plot(neighbors_setting, training_accuracy, label='training accuracy')
plt.plot(neighbors_setting, test_accuracy, label='test accuracy')
Пример #27
class SSMA(InstanceReductionMixin):
    """Steady State Memetic Algorithm

    The Steady-State Memetic Algorithm is an evolutionary prototype
    selection algorithm. It uses a memetic algorithm in order to
    perform a local search in the code.

    Parameters
    ----------
    n_neighbors : int, optional (default = 1)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    alpha   : float (default = 0.6)
        Parameter that ponderates the fitness function.

    max_loop    : int (default = 1000)
        Number of maximum loops performed by the algorithm.

    threshold   : int (default = 0)
        Threshold that regulates the substitution condition;

    chromosomes_count: int (default = 10)
        number of chromosomes used to find the optimal solution.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.ssma import SSMA
    >>> import numpy as np
    >>> X = np.array([[i] for i in range(100)])
    >>> y = np.asarray(50 * [0] + 50 * [1])
    >>> ssma = SSMA()
    >>> ssma.fit(X, y)
    SSMA(alpha=0.6, chromosomes_count=10, max_loop=1000, threshold=0)
    >>> print(ssma.predict([[40],[60]]))
    [0 1]
    >>> print(ssma.reduction_)  # doctest: +SKIP

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Joaquín Derrac, Salvador García, and Francisco Herrera. Stratified prototype
    selection based on a steady-state memetic algorithm: a study of scalability.
    Memetic Computing, 2(3):183–199, 2010.

    NOTE(review): several lines of this class were missing from the text
    it was recovered from (the closing of this docstring, the body of the
    loop in ``memetic_select_j``, the end of a ``kneighbors`` call and an
    ``else`` branch in ``memetic_search``). They were restored from the
    evident intent; confirm against the original project.
    """

    def __init__(self, n_neighbors=1, alpha=0.6, max_loop=1000, threshold=0, chromosomes_count=10):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.max_loop = max_loop
        self.threshold = threshold
        self.chromosomes_count = chromosomes_count

        # Population state, filled by generate_population().
        self.evaluations = None
        self.chromosomes = None

        # Accuracy / reduction of the best chromosome at the last
        # threshold update.
        self.best_chromosome_ac = -1
        self.best_chromosome_rd = -1

        self.classifier = KNeighborsClassifier(n_neighbors = n_neighbors)

    def accuracy(self, chromosome, X, y):
        """Return the accuracy on (X, y) of the k-NN fitted on the selected samples."""
        mask = np.asarray(chromosome, dtype=bool)
        cX, cy = X[mask], y[mask]

        self.classifier.fit(cX, cy)
        labels = self.classifier.predict(X)
        accuracy = (labels == y).sum()

        return float(accuracy)/len(y)

    def fitness(self, chromosome, X, y):
        """Return ``alpha * accuracy + (1 - alpha) * reduction`` of a chromosome."""
        #TODO add the possibility of use AUC for factor1
        ac = self.accuracy(chromosome, X, y)
        rd = 1.0 - (float(sum(chromosome))/len(chromosome))

        return self.alpha * ac + (1.0 - self.alpha) * rd

    def fitness_gain(self, gain, n):
        """Return the fitness change used after an accepted gene removal."""
        return self.alpha * (float(gain)/n) + (1 - self.alpha) * (1.0 / n)

    def update_threshold(self, X, y):
        """Adjust ``threshold`` from the best chromosome's accuracy and reduction.

        The threshold is raised by one when the accuracy did not improve
        and lowered by one when the reduction did not improve since the
        previous call.
        """
        best_index = np.argmax(self.evaluations)
        chromosome = self.chromosomes[best_index]

        best_ac = self.accuracy(chromosome, X, y)
        best_rd = 1.0 - float(sum(chromosome))/len(y)

        if best_ac <= self.best_chromosome_ac:
            self.threshold = self.threshold + 1
        if best_rd <= self.best_chromosome_rd:
            self.threshold = self.threshold - 1

        self.best_chromosome_ac = best_ac
        self.best_chromosome_rd = best_rd

    def index_nearest_neighbor(self, S, X, y):
        """Return, for every sample, the index in ``X`` of its nearest selected sample.

        ``S`` is the chromosome (convertible to a boolean mask). The
        result is a list of length ``len(y)``.
        """
        if len(y) == 0:
            return []

        S_mask = np.array(S, dtype=bool, copy=True)
        # Maps positions inside the selected subset back to indices in X.
        real_indexes = np.arange(len(y))[S_mask]

        # The subset is the same for every query: fit once, query in bulk.
        classifier = KNeighborsClassifier(n_neighbors=1)
        classifier.fit(X[S_mask], y[S_mask])
        nearest = classifier.kneighbors(X, n_neighbors=1, return_distance=False)

        return list(real_indexes[nearest[:, 0]])

    def memetic_looper(self, S, R):
        """Return True while at least two selected genes of ``S`` are not in ``R``."""
        c = 0
        for i, gene in enumerate(S):
            if gene == 1 and i not in R:
                c = c + 1
                if c == 2:
                    return True

        return False

    def memetic_select_j(self, S, R):
        """Return a random index of a selected gene of ``S`` that is not in ``R``.

        ``np.random.choice`` raises ``ValueError`` when no such gene exists.
        """
        indexs = [i for i, gene in enumerate(S) if i not in R and gene == 1]
        return np.random.choice(indexs)

    def generate_population(self, X, y):
        """Create the random initial population and evaluate it."""
        self.chromosomes = [[np.random.choice([0,1]) for i in range(len(y))]
                            for c in range(self.chromosomes_count)]
        self.evaluations = [self.fitness(c, X, y) for c in self.chromosomes]

        self.update_threshold(X, y)

    def select_parents(self, X, y):
        """Pick two parents, each the fitter of two randomly sampled chromosomes."""
        parents = []
        for i in range(2):
            samples = random.sample(self.chromosomes, 2)
            fitter = (samples[0] if self.fitness(samples[0], X, y) >
                      self.fitness(samples[1], X, y) else samples[1])
            parents.append(fitter)
        return np.array(parents, copy=True)

    def crossover(self, parent_1, parent_2):
        """Return two offspring built by swapping the parents' halves."""
        size = len(parent_1)
        # Floor division: ``size / 2`` is a float on Python 3 and cannot
        # repeat a list.
        half = size // 2
        mask = np.asarray([0] * half + [1] * (size - half), dtype=bool)

        off_1 = parent_1 * mask + parent_2 * ~mask
        off_2 = parent_2 * mask + parent_1 * ~mask
        return np.asarray([off_1, off_2])

    def mutation(self, offspring):
        """Flip each gene in place with probability ``1 / len(offspring)``."""
        for i in range(len(offspring)):
            if np.random.uniform(0,1) < 1.0/len(offspring):
                offspring[i] = not offspring[i]

        return offspring

    def memetic_search(self, chromosome, X, y, chromosome_fitness = None):
        """Local search: try to drop selected genes without losing accuracy.

        Returns ``(chromosome, fitness)``. An empty chromosome is returned
        unchanged with fitness 0.
        """
        S = np.array(chromosome, copy=True)
        if S.sum() == 0:
            return S, 0

        if chromosome_fitness is None:
            chromosome_fitness = self.fitness(chromosome, X, y)
        fitness_s = chromosome_fitness

        # List of visited genes in S
        R = []
        # let U = {u0, u1, ..., un} list where ui = classifier(si,S)/i
        U = self.index_nearest_neighbor(S, X, y)
        while self.memetic_looper(S, R):
            j = self.memetic_select_j(S, R)
            S[j] = 0
            gain = 0.0
            # Snapshot so the neighbour table can be restored on rejection.
            U_copy = list(U)
            mask = np.asarray(S, dtype=bool)
            X_tra, y_tra = X[mask], y[mask]
            real_idx = np.asarray(range(len(y)))[mask]

            if len(y_tra) > 0:
                # The reduced subset is the same for every i: fit once.
                self.classifier.fit(X_tra, y_tra)
                for i in range(len(U)):
                    if U[i] == j:
                        # Sample i lost its nearest neighbour; find the new one.
                        [[idx]] = self.classifier.kneighbors(
                            X[i].reshape(1, -1), n_neighbors=1,
                            return_distance=False)
                        U[i] = real_idx[idx]
                        if y[i] == y[U_copy[i]] and y[i] != y[U[i]]:
                            gain = gain - 1.0
                        if y[i] != y[U_copy[i]] and y[i] == y[U[i]]:
                            gain = gain + 1.0

            if gain >= self.threshold:
                # Accept the removal and restart the visited list.
                n = S.sum()
                g = self.fitness_gain(gain, n)
                fitness_s = fitness_s + g
                R = []
            else:
                # Reject: restore the gene and neighbours, mark j visited.
                U = U_copy
                S[j] = 1
                R.append(j)

        return list(S), fitness_s

    def main_loop(self, X, y):
        """Run the steady-state evolution for ``max_loop`` generations."""
        self.generate_population(X, y)
        n, worse_fit_index = 0, -1
        while (n < self.max_loop):
            parents = self.select_parents(X, y)
            offspring = self.crossover(parents[0], parents[1])
            offspring[0] = self.mutation(offspring[0])
            offspring[1] = self.mutation(offspring[1])

            # -1 marks an empty offspring, which cannot be evaluated.
            fit_offs = [self.fitness(off, X, y) if sum(off) > 0 else -1 for off in offspring]
            if worse_fit_index == -1:
                worse_fit_index = np.argmin(self.evaluations)

            for i in range(len(offspring)):
                # Probability of running the local search on this offspring.
                p_ls = 1.0

                if fit_offs[i] == -1:
                    # Never local-search an empty offspring. ``elif`` keeps
                    # this sentinel from being overwritten below.
                    p_ls = -1
                elif fit_offs[i] <= self.evaluations[worse_fit_index]:
                    p_ls = 0.0625

                if np.random.uniform(0,1) < p_ls:
                    offspring[i], fit_offs[i] = self.memetic_search(offspring[i], X, y, chromosome_fitness = fit_offs[i])

            # Replace the worst chromosome by any better offspring.
            for i in range(len(offspring)):
                if fit_offs[i] > self.evaluations[worse_fit_index]:
                    self.chromosomes[worse_fit_index] = offspring[i]
                    self.evaluations[worse_fit_index] = fit_offs[i]

                    worse_fit_index = np.argmin(self.evaluations)

            n = n + 1
            if n % 10 == 0:
                self.update_threshold(X, y)

    def reduce_data(self, X, y):
        """Run the algorithm and return the samples of the best chromosome.

        Sets ``classes_``, ``X_``, ``y_`` and ``reduction_`` and returns
        ``(X_, y_)``.
        """
        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes

        self.main_loop(X, y)

        best_index = np.argmax(self.evaluations)
        mask = np.asarray(self.chromosomes[best_index], dtype=bool)
        self.X_ = X[mask]
        self.y_ = y[mask]
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)

        return self.X_, self.y_
Пример #28
def test_view_func_NN(model_classifier, model_rpn, model_inner, C):
    test_cls = 'aeroplane'
    input_train_file = 'pickle_data/train_data_Wflip_all.pickle'

    ## read the training data from pickle file or from annotations
    test_pickle = 'pickle_data/test_data_{}.pickle'.format(test_cls)
    if os.path.exists(test_pickle):
        with open(test_pickle) as f:
            all_imgs, classes_count, _ = pickle.load(f)

    class_mapping = C.class_mapping
    inv_class_mapping = {v: k for k, v in class_mapping.iteritems()}
    backend = K.image_dim_ordering()
    gt_cls_num = class_mapping[test_cls]
    print('work on class {}'.format(test_cls))
    base_path = os.getcwd()

    # turn off any data augmentation at test time
    C.use_horizontal_flips = False
    C.use_vertical_flips = False
    C.rot_90 = False
    count = 0
    good_img = 0
    not_good = 0

    def format_img_size(img, C):
        """ formats the image size based on config """
        # Target side length, read from the config.
        img_min_side = float(C.im_size)
        (height, width, _) = img.shape

        # NOTE(review): the last three assignments in this branch overwrite the
        # first three, so an "else:" line (the width > height case) appears to
        # have been lost between them -- confirm against the upstream source.
        if width <= height:
            ratio = img_min_side / width
            new_height = int(ratio * height)
            new_width = int(img_min_side)
            ratio = img_min_side / height
            new_width = int(ratio * width)
            new_height = int(img_min_side)
        # NOTE(review): this call is never closed; its trailing argument(s)
        # are missing from this copy of the file.
        img = cv2.resize(img, (new_width, new_height),
        return img, ratio

    def format_img_channels(img, C):
        """ reorders, normalises and reshapes the image channels based on config """
        # Reverse the first three channels and switch to float32 for the maths.
        pixels = img[:, :, (2, 1, 0)].astype(np.float32)
        # Subtract the configured per-channel mean, then apply the global scale.
        for channel in range(3):
            pixels[:, :, channel] -= C.img_channel_mean[channel]
        pixels /= C.img_scaling_factor
        # Move the channel axis to the front and add a leading axis of length 1.
        return np.expand_dims(pixels.transpose((2, 0, 1)), axis=0)

    def format_img(img, C):
        """ prepares an image for model prediction based on config

        Returns the formatted image together with the resize ratio reported
        by format_img_size.
        """
        resized, ratio = format_img_size(img, C)
        return format_img_channels(resized, C), ratio

    def display_image(img):
        """ builds an RGB ``Image`` from the channel-reversed array

        The image object is created but neither shown nor returned.
        """
        reordered = img[:, :, (2, 1, 0)]
        im = Image.fromarray(reordered.astype('uint8'), 'RGB')

    # Method to transform the coordinates of the bounding box to its original size
    def get_real_coordinates(ratio, x1, y1, x2, y2):
        """ maps box corners from the resized image back to the original image

        ratio is the factor by which the original image was resized; every
        coordinate is divided by it and rounded to the nearest integer.
        Returns the tuple (real_x1, real_y1, real_x2, real_y2).
        """
        # True division is required here: floor division truncates before
        # round() runs, which turns the rounding into a no-op and shifts
        # boxes by up to one pixel towards the origin.
        scale = float(ratio)
        real_x1, real_y1, real_x2, real_y2 = [
            int(round(value / scale)) for value in (x1, y1, x2, y2)]
        return (real_x1, real_y1, real_x2, real_y2)

    vnum_test = 24
    azimuth_vec = np.concatenate(
         np.linspace((360. / (vnum_test * 2)), 360. -
                     (360. / (vnum_test * 2)), vnum_test)),

    def find_interval(azimuth, azimuth_vec):
        # Walks azimuth_vec in order, comparing azimuth against each entry,
        # and returns an index into azimuth_vec.
        for i in range(len(azimuth_vec)):
            # NOTE(review): the body of this "if" is missing in this copy
            # (a "break" is the likely candidate -- confirm upstream).
            if azimuth < azimuth_vec[i]:
        ind = i
        # Values beyond the last entry of azimuth_vec are mapped to index 1.
        if azimuth > azimuth_vec[-1]:
            ind = 1
        return ind

    class_mapping = C.class_mapping

    if 'bg' not in class_mapping:
        class_mapping['bg'] = len(class_mapping)

    class_mapping = {v: k for k, v in class_mapping.items()}
    # print(class_mapping)
    class_to_color = {
        class_mapping[v]: np.random.randint(0, 255, 3)
        for v in class_mapping
    C.num_rois = 32

    obj_num = 0
    bbox_threshold_orig = 0.6
    th_bbox = 0.4

    ## get GT for all az for single cls
    feature_az = []
    sorted_path = input_train_file
    tmp_ind = sorted_path.index('.pickle')
    sorted_path = sorted_path[:tmp_ind] + "_sorted_Angles" + sorted_path[
    if os.path.exists(sorted_path):
        print("loading sorted data")
        with open(sorted_path) as f:
            trip_data = pickle.load(f)
    im_file = []
    ind = []
    for ii in range(360):
        for jj in range(3):
                if jj == 0:
                    print('no azimuth {}'.format(ii))
    data_gen_train = data_generators.get_anchor_gt(im_file, [],
    azimuth_dict = []
    inner_NN = []
    azimuths = []
    for tt in range(len(ind)):
            if tt % 100 == 0:
                print('worked on {}/{}'.format(tt, len(ind)))
            # print ('im num {}'.format(good_img))
            X, Y, img_data = next(data_gen_train)

            P_rpn = model_rpn.predict_on_batch(X)

            R = roi_helpers.rpn_to_roi(P_rpn[0],

            X2, Y1, Y2, Y_view = roi_helpers.calc_iou_new(
                R, img_data, C, C.class_mapping)

            pos_samples = np.where(Y1[0, :, -1] == 0)
            sel_samples = pos_samples[0].tolist()
            R = X2[0, sel_samples, :]
            for jk in range(R.shape[0] // C.num_rois + 1):
                ROIs = np.expand_dims(R[C.num_rois * jk:C.num_rois *
                                        (jk + 1), :],
                if ROIs.shape[1] == 0:

                if jk == R.shape[0] // C.num_rois:
                    # pad R
                    curr_shape = ROIs.shape
                    target_shape = (curr_shape[0], C.num_rois, curr_shape[2])
                    ROIs_padded = np.zeros(target_shape).astype(ROIs.dtype)
                    ROIs_padded[:, :curr_shape[1], :] = ROIs
                    ROIs_padded[0, curr_shape[1]:, :] = ROIs[0, 0, :]
                    ROIs = ROIs_padded

                [P_cls, P_regr, P_view] = model_classifier.predict([X, ROIs])
                iner_f = model_inner.predict([X, ROIs])
                # oo = model_classifier_only.predict([F, ROIs])

                for ii in range(len(sel_samples)):

                    if np.max(P_cls[0,
                                    ii, :]) < bbox_threshold_orig or np.argmax(
                                              ii, :]) == (P_cls.shape[2] - 1):

                    ## get class from the net
                    # cls_num = np.argmax(P_cls[0, ii, :])

                    ## use gt class
                    cls_num = gt_cls_num

                    cls_name = inv_class_mapping[cls_num]
                    cls_view = P_view[0, ii, 360 * cls_num:360 * (cls_num + 1)]

                    # azimuths[cls_name].append(np.argmax(cls_view, axis=0))
                    inner_NN.append(iner_f[0, ii, :])
            print('failed on az {}'.format(img_data['bboxes'][0]['azimuth']))
    ## calculating some mean feature map for every az
    with open('pickle_data/{}_NN.pickle'.format(C.weight_name), 'w') as f:
        pickle.dump([inner_NN, azimuth_dict], f)
        print('saved PICKLE')

    with open('pickle_data/{}_NN.pickle'.format(C.weight_name)) as f:
        inner_NN, azimuth_dict = pickle.load(f)
    neigh = KNeighborsClassifier(n_neighbors=1)
    neigh.fit(inner_NN, azimuth_dict)

    jj = 0
    for im_file in all_imgs:
        jj += 1
        if jj % 50 == 0:
        filepath = im_file['filepath']
        img = cv2.imread(filepath)
        img_gt = np.copy(img)
        if img is None:
            not_good += 1
            good_img += 1
            # print ('im num {}'.format(good_img))
        X, ratio = format_img(img, C)

        if backend == 'tf':
            X = np.transpose(X, (0, 2, 3, 1))

        # get the feature maps and output from the RPN
        Y1, Y2 = model_rpn.predict(X)
        R = roi_helpers.rpn_to_roi(Y1,
        # # convert from (x1,y1,x2,y2) to (x,y,w,h)
        R[:, 2] -= R[:, 0]
        R[:, 3] -= R[:, 1]

        width, height = int(im_file["width"]), int(im_file["height"])
        resized_width, resized_height = data_generators.get_new_img_size(
            width, height, C.im_size)
        # [_,_, F] = model_rpn.predict(X)
        ROIs = []
        ## pass on all the labels in the image, some of them are not equal to test_cls
        for bbox_gt in im_file['bboxes']:
            no_bbox_flag = 1
            bbox_threshold = bbox_threshold_orig
            if not bbox_gt['class'] == test_cls:
            if bbox_gt[
                    'class'] == test_cls and bbox_threshold == bbox_threshold_orig:
                obj_num += 1
            while no_bbox_flag and bbox_threshold > th_bbox:
                cls_gt = bbox_gt['class']
                az_gt = bbox_gt['azimuth']
                el_gt = bbox_gt['elevation']
                t_gt = bbox_gt['tilt']
                if len(ROIs) == 0:
                    # apply the spatial pyramid pooling to the proposed regions
                    bboxes = {}
                    probs = {}
                    azimuths = {}
                    inner_res = {}
                    # print ('obj num {}'.format(obj_num))

                    for jk in range(R.shape[0] // C.num_rois + 1):
                        ROIs = np.expand_dims(R[C.num_rois * jk:C.num_rois *
                                                (jk + 1), :],
                        if ROIs.shape[1] == 0:

                        if jk == R.shape[0] // C.num_rois:
                            #pad R
                            curr_shape = ROIs.shape
                            target_shape = (curr_shape[0], C.num_rois,
                            ROIs_padded = np.zeros(target_shape).astype(
                            ROIs_padded[:, :curr_shape[1], :] = ROIs
                            ROIs_padded[0, curr_shape[1]:, :] = ROIs[0, 0, :]
                            ROIs = ROIs_padded

                        [P_cls, P_regr,
                         P_view] = model_classifier.predict([X, ROIs])
                        inner_out = model_inner.predict([X, ROIs])
                        # oo = model_classifier_only.predict([F, ROIs])

                        for ii in range(P_cls.shape[1]):

                            if np.max(P_cls[
                                    0, ii, :]) < bbox_threshold or np.argmax(
                                              ii, :]) == (P_cls.shape[2] - 1):

                            ## get class from the net
                            # cls_num = np.argmax(P_cls[0, ii, :])

                            ## use gt class
                            cls_num = gt_cls_num

                            cls_name = inv_class_mapping[cls_num]
                            cls_view = P_view[0, ii, 360 * cls_num:360 *
                                              (cls_num + 1)]

                            if cls_name not in bboxes:
                                bboxes[cls_name] = []
                                probs[cls_name] = []
                                azimuths[cls_name] = []
                                inner_res[cls_name] = []

                            (x, y, w, h) = ROIs[0, ii, :]

                                (tx, ty, tw,
                                 th) = P_regr[0, ii,
                                              4 * cls_num:4 * (cls_num + 1)]
                                tx /= C.classifier_regr_std[0]
                                ty /= C.classifier_regr_std[1]
                                tw /= C.classifier_regr_std[2]
                                th /= C.classifier_regr_std[3]
                                x, y, w, h = roi_helpers.apply_regr(
                                    x, y, w, h, tx, ty, tw, th)
                                C.rpn_stride * x, C.rpn_stride * y,
                                C.rpn_stride * (x + w), C.rpn_stride * (y + h)
                            probs[cls_name].append(np.max(P_cls[0, ii, :]))
                                np.argmax(cls_view, axis=0))
                            inner_res[cls_name].append(inner_out[0, ii, :])

                # cv2.rectangle(img_gt, (bbox_gt['x1'], bbox_gt['y1']), (bbox_gt['x2'], bbox_gt['y2']), (int(class_to_color[test_cls][0]), int(class_to_color[test_cls][1]), int(class_to_color[test_cls][2])), 2)
                for key in bboxes:
                    # if 1:
                    if key == test_cls and bbox_gt['class'] == test_cls:
                        bbox = np.array(bboxes[key])
                        prob = np.array(probs[key])
                        azimuth = np.array(azimuths[key])
                        inner_result = np.array(inner_res[key])
                        # img = draw_bbox(img,bbox, prob, azimuth, ratio)
                        azimuth = neigh.predict(inner_result)
                        ## get the azimuth from bbox that have more than 'overlap_thresh' overlap with gt_bbox
                        az = []
                        overlap_thresh = 0.5
                            while np.size(az) == 0 and overlap_thresh > 0:
                                _, prob_bbox, az = roi_helpers.overlap_with_gt(
                                overlap_thresh -= 0.1
                            if overlap_thresh == 0:
                                print("No good Bbox was found")
                            counts = np.bincount(az)
                            az = []
                            counts = []
                            az_fin = np.argmax(counts)
                            true_bin = find_interval(az_gt, azimuth_vec)
                            prob_bin = find_interval(az_fin, azimuth_vec)
                            no_bbox_flag = 0
                            if true_bin == prob_bin:
                                count += 1
                            # print('here')
                            no_bbox_flag = 1
                            bbox_threshold -= 0.1

                    ## azimuth calculations

                    ## display

                bbox_threshold -= 0.1

    succ = float(count) / float(obj_num) * 100.
        'for class {} -true count is {} out of {} from {} images . {} success'.
        format(test_cls, count, obj_num, good_img, succ))
    return succ
Пример #30
            out = Trainer.model.semantics(data, 1, 1, 1.0)

            n = data.shape[0]
            Xtrain[counter:counter +
                   n] = out[0].detach().cpu().numpy().reshape(n, -1)
            Ytrain[counter:counter + n] = labels.numpy()
            counter += n

        counter = 0
        for i, (data, labels) in enumerate(testloader):
            # Feed forward data
            data = data.reshape(-1, *Trainer.input_shape).to(torch.float32).to(
            out = Trainer.model.semantics(data, 1, 1, 1.0)

            n = data.shape[0]
            Xtest[counter:counter + n] = out[0].detach().cpu().numpy().reshape(
                n, -1)
            Ytest[counter:counter + n] = labels.numpy()
            counter += n

        raise ValueError('Wrong dataset')

    from sklearn.neighbors.classification import KNeighborsClassifier
    classifier = KNeighborsClassifier()
    classifier.fit(Xtrain, Ytrain)
    Ypred = classifier.predict(Xtest)
    print(np.mean(Ypred == Ytest))
Пример #31
class KNNClassifier():
    def __init__(self, csv_path_train, csv_path_test, k):
        # Paths of the CSV files later read by create_arrays().
        self.csv_path_train = csv_path_train
        self.csv_path_test = csv_path_test
        # k becomes the number of neighbours of the underlying classifier.
        # NOTE(review): the remaining constructor arguments and the closing
        # parenthesis are missing from this copy of the file.
        self.classifier = KNeighborsClassifier(n_neighbors=k,

    def create_arrays(self):
        # Loads the training and test CSV files and splits each into a feature
        # matrix (X_*) and a target array (y_*).
        # NOTE(review): every np.genfromtxt(...) call and both y_* np.delete(...)
        # calls are truncated here (trailing arguments / closing parenthesis
        # missing), so delimiter, header handling and axis cannot be confirmed.
        arr_train = np.genfromtxt(self.csv_path_train,
        # Features: every column except the last one.
        self.X_train = np.delete(arr_train, [arr_train.shape[1] - 1], axis=1)
        # Target: all columns but the last are listed for deletion
        # (presumably along axis=1, leaving only the last column).
        self.y_train = np.delete(arr_train,
                                 list(range(arr_train.shape[1] - 1)),

        # Same split for the test file.
        arr_test = np.genfromtxt(self.csv_path_test,
        self.X_test = np.delete(arr_test, [arr_test.shape[1] - 1], axis=1)
        self.y_test = np.delete(arr_test,
                                list(range(arr_test.shape[1] - 1)),

    def preprocess(self):
        """Standardise the feature matrices.

        The scaler is fitted on ``self.X_train`` only and then applied to
        both ``self.X_train`` and ``self.X_test``; the results are stored in
        ``self.X_train_std`` and ``self.X_test_std``.
        """
        sc = StandardScaler()
        # transform() on an unfitted scaler raises NotFittedError, so the
        # statistics have to be learned first -- from the training split only,
        # which keeps test data out of the fit.
        sc.fit(self.X_train)
        self.X_train_std = sc.transform(self.X_train)
        self.X_test_std = sc.transform(self.X_test)

    def train(self):
        """Fit the classifier on the standardised training features and the flattened labels."""
        labels = self.y_train.ravel()
        self.classifier.fit(self.X_train_std, labels)

    def test(self, f, patient_num, total_fpr, total_tpr):
        # Evaluates the fitted classifier on the standardised test set: prints
        # metrics, draws a confusion matrix and a ROC plot, and adds one ROC
        # point to total_fpr[1] / total_tpr[1], which are then returned.
        # NOTE(review): parameter `f` is not used anywhere in this method.
        y_pred = self.classifier.predict(self.X_test_std)
        accuracy = accuracy_score(self.y_test, y_pred)
        # NOTE(review): "precision" is computed with accuracy_score, so it always
        # equals `accuracy`; precision_score was presumably intended -- confirm.
        precision = accuracy_score(self.y_test, y_pred)
        recall = recall_score(self.y_test, y_pred)
        print("Accuracy: %.2f" % accuracy)
        print("Precision: %.2f" % precision)
        print("Recall: %.2f" % recall)
        # NOTE(review): `line` is built but never written or returned here.
        line = str(accuracy) + "," + str(precision) + "," + str(recall)
        confmat = confusion_matrix(self.y_test, y_pred)
        fig, ax = plt.subplots(figsize=(2.5, 2.5))
        ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
        # Write each count into its cell of the matrix plot.
        for i in range(confmat.shape[0]):
            for j in range(confmat.shape[1]):
                ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
        plt.xlabel('predicted label')
        plt.ylabel('true label')
        # NOTE(review): this savefig call is truncated (file-name suffix and
        # closing parenthesis missing from this copy of the file).
        plt.savefig("D:\\Documents\\KNN\\FFT\\chb" + patient_num +
        # roc_curve is fed the hard class predictions, not scores.
        fpr, tpr, thresholds = roc_curve(self.y_test, y_pred)
        print("fpr", fpr)
        print("tpr", tpr)
        # Accumulate the second-to-last ROC point into slot 1 of the totals.
        total_fpr[1] += fpr[len(fpr) - 2]
        total_tpr[1] += tpr[len(tpr) - 2]
        roc_auc = auc(fpr, tpr)
        plt.title('ROC Curve')
        plt.plot(fpr, tpr, 'b', label='AUC = %.2F' % roc_auc)
        plt.legend(loc='lower right')
        # Dashed diagonal drawn from (0, 0) to (1, 1).
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([-0.1, 1.2])
        plt.ylim([-0.1, 1.2])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.savefig("D:\\Documents\\KNN\\FFT\\chb" + patient_num + "roc.png")
        return total_fpr, total_tpr
Пример #32
class InstanceReductionMixin(InstanceReductionBase, ClassifierMixin):
    """Mixin class for all instance reduction techniques"""

    def set_classifier(self, classifier=None):
        """Set the classifier used in the instance reduction process
        and for classification.

        Parameters
        ----------
        classifier : classifier, following the KNeighborsClassifier style
            (default = None; ``predict`` then falls back to a
            KNeighborsClassifier)
        """
        # The parameter was missing from the signature, so the assignment
        # below used to fail with a NameError on every call.
        self.classifier = classifier

    def reduce_data(self, X, y):
        """Perform the instance reduction procedure on the given training data.

        The base implementation does nothing; concrete techniques override it
        and store the selected prototypes in ``X_`` and ``y_``.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training set.

        y : array-like, shape = [n_samples]
            Labels for X.

        Returns
        -------
        X_ : array-like, shape = [indeterminate, n_features]
            Resulting training set.

        y_ : array-like, shape = [indeterminate]
            Labels for X_
        """

    def fit(self, X, y, reduce_data=True):
        """Fit the InstanceReduction model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where n_samples in the number of samples and
            n_features is the number of features.
        y : array, shape = [n_samples]
            Target values (integers)
        reduce_data : bool, flag indicating if the reduction would be performed

        Returns
        -------
        self : object
            Returns self.
        """
        self.X = X
        self.y = y

        if reduce_data:
            self.reduce_data(X, y)

        return self

    def predict(self, X, n_neighbors=1):
        """Perform classification on an array of test vectors X.

        The predicted class C for each sample in X is returned.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
        n_neighbors : int, neighbours used by the fallback classifier that is
            created when no classifier has been set (default = 1)

        Returns
        -------
        C : array, shape = [n_samples]

        Raises
        ------
        AttributeError
            If no prototypes (``X_`` / ``y_``) are available yet.

        Notes
        -----
        The default prediction is using KNeighborsClassifier, if the
        instance reduction algorithm is to be performed with another
        classifier, it should be explicitly overwritten and explained
        in the documentation.
        """
        X = check_array(X)
        if getattr(self, "X_", None) is None:
            raise AttributeError("Model has not been trained yet.")

        if getattr(self, "y_", None) is None:
            raise AttributeError("Model has not been trained yet.")

        # getattr: the mixin defines no __init__, so a subclass may never
        # have created the attribute at all.
        if getattr(self, "classifier", None) is None:
            self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

        # The classifier is (re)fitted on the reduced set on every call.
        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test data X
        after a given prototype selection algorithm.

        Parameters
        ----------
        X : array, shape = (n_samples, n_features)
            A 2-D array representing the test points.

        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
            of such arrays if n_outputs > 1.
            The class probabilities of the input samples. Classes are ordered
            by lexicographic order.

        Raises
        ------
        AttributeError
            If no prototypes (``X_`` / ``y_``) are available yet.
        """
        # Same guard (and same exception type) as predict().
        if getattr(self, "X_", None) is None or getattr(self, "y_", None) is None:
            raise AttributeError("Model has not been trained yet.")

        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict_proba(X)
Пример #33
Файл: baseNew.py Проект: dvro/ml
class InstanceReductionMixin(InstanceReductionBase, ClassifierMixin):

    """Mixin class for all instance reduction techniques"""

    def set_classifier(self, classifier=None):
        """Set the classifier used in the instance reduction process
        and for classification.

        Parameters
        ----------
        classifier : classifier, following the KNeighborsClassifier style
            (default = None; ``predict`` then falls back to a
            KNeighborsClassifier)
        """
        # The parameter was missing from the signature, so the assignment
        # below used to fail with a NameError on every call.
        self.classifier = classifier

    def reduce_data(self, X, y):
        """Perform the instance reduction procedure on the given training data.

        The base implementation does nothing; concrete techniques override it
        and store the selected prototypes in ``X_`` and ``y_``.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training set.

        y : array-like, shape = [n_samples]
            Labels for X.

        Returns
        -------
        X_ : array-like, shape = [indeterminate, n_features]
            Resulting training set.

        y_ : array-like, shape = [indeterminate]
            Labels for X_
        """

    def get_prototypes(self):
        """Return the selected prototypes and their labels as ``(X_, y_)``."""
        return self.X_, self.y_

    def fit(self, X, y, reduce_data=True):
        """Fit the InstanceReduction model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where n_samples in the number of samples and
            n_features is the number of features.
        y : array, shape = [n_samples]
            Target values (integers)
        reduce_data : bool, flag indicating if the reduction would be performed

        Returns
        -------
        self : object
            Returns self.
        """
        self.X = X
        self.y = y
        self.labels = set(y)
        # Bookkeeping attributes, reset on every fit.
        self.prototypes = None
        self.prototypes_labels = None
        self.reduction_ratio = 0.0

        if reduce_data:
            self.reduce_data(X, y)

        return self

    def predict(self, X, n_neighbors=1):
        """Perform classification on an array of test vectors X.

        The predicted class C for each sample in X is returned.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
        n_neighbors : int, neighbours used by the fallback classifier that is
            created when no classifier has been set (default = 1)

        Returns
        -------
        C : array, shape = [n_samples]

        Raises
        ------
        AttributeError
            If no prototypes (``X_`` / ``y_``) are available yet.

        Notes
        -----
        The default prediction is using KNeighborsClassifier, if the
        instance reduction algorithm is to be performed with another
        classifier, it should be explicitly overwritten and explained
        in the documentation.
        """
        X = atleast2d_or_csr(X)
        if getattr(self, "X_", None) is None:
            raise AttributeError("Model has not been trained yet.")

        if getattr(self, "y_", None) is None:
            raise AttributeError("Model has not been trained yet.")

        # getattr: the mixin defines no __init__, so a subclass may never
        # have created the attribute at all.
        if getattr(self, "classifier", None) is None:
            self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

        # The classifier is (re)fitted on the reduced set on every call.
        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test data X
        after a given prototype selection algorithm.

        Parameters
        ----------
        X : array, shape = (n_samples, n_features)
            A 2-D array representing the test points.

        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
            of such arrays if n_outputs > 1.
            The class probabilities of the input samples. Classes are ordered
            by lexicographic order.

        Raises
        ------
        AttributeError
            If no prototypes (``X_`` / ``y_``) are available yet.
        """
        # Same guard (and same exception type) as predict().
        if getattr(self, "X_", None) is None or getattr(self, "y_", None) is None:
            raise AttributeError("Model has not been trained yet.")

        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict_proba(X)
def demo():
    """ _test_knn_adwin

    This demo tests the KNNAdwin classifier on a file stream, which gives 
    instances coming from a SEA generator. 
    The test computes the performance of the KNNAdwin classifier as well as 
    the time to create the structure and classify max_samples (10000 by 
    default) instances.
    # NOTE(review): the closing quotes of the docstring above are missing in
    # this copy of the file; the statements below are the function body.
    start = timer()
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    # warnings.filterwarnings("ignore", ".*Passing 1d.*")
    stream = FileStream('../data/datasets/sea_big.csv', -1, 1)
    # stream = RandomRBFGeneratorDrift(change_speed=41.00, n_centroids=50, model_random_state=32523423,
    #                                  sample_seed=5435, n_classes=2, num_att=10, num_drift_centroids=50)
    # NOTE(review): both OneHotToCategorical(...) calls below are truncated
    # (their brackets are never closed); t and t2 are only referenced by the
    # commented-out pipelines.
    t = OneHotToCategorical([[10, 11, 12, 13],
                                 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                                 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
                                 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
                                 47, 48, 49, 50, 51, 52, 53
    t2 = OneHotToCategorical([[10, 11, 12, 13],
                                  14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                                  25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
                                  36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
                                  47, 48, 49, 50, 51, 52, 53

    # knn = KNN(n_neighbors=8, max_window_size=2000, leaf_size=40)
    knn = KNNAdwin(n_neighbors=8, leaf_size=40, max_window_size=2000)
    # pipe = Pipeline([('one_hot_to_categorical', t), ('KNN', knn)])

    # Reference model the KNNAdwin results are compared against.
    # NOTE(review): this call is truncated (remaining arguments missing).
    compare = KNeighborsClassifier(n_neighbors=8,
    # pipe2 = Pipeline([('one_hot_to_categorical', t2), ('KNN', compare)])
    first = True
    train = 200
    # Warm-up: both models are trained on the first `train` samples.
    if train > 0:
        X, y = stream.next_sample(train)
        # pipe.partial_fit(X, y, classes=stream.target_values)
        # pipe.partial_fit(X, y, classes=stream.target_values)
        # pipe2.fit(X, y)

        knn.partial_fit(X, y, classes=stream.target_values)
        compare.fit(X, y)
        first = False
    n_samples = 0
    max_samples = 10000
    my_corrects = 0
    compare_corrects = 0

    while n_samples < max_samples:
        # Log progress every max_samples / 20 samples, i.e. in 5% steps.
        if n_samples % (max_samples / 20) == 0:
            logging.info('%s%%', str((n_samples // (max_samples / 20) * 5)))
        X, y = stream.next_sample()
        # my_pred = pipe.predict(X)
        my_pred = knn.predict(X)
        # my_pred = [1]
        # NOTE(review): an "else:" appears to be missing between
        # `first = False` and the plain partial_fit below; as written both
        # partial_fit calls only run while `first` is true, so after the
        # warm-up above knn is never updated inside this loop.
        if first:
            # pipe.partial_fit(X, y, classes=stream.target_values)
            # pipe.partial_fit(X, y, classes=stream.target_values)
            knn.partial_fit(X, y, classes=stream.target_values)
            first = False
            # pipe.partial_fit(X, y)
            knn.partial_fit(X, y)
        # compare_pred = pipe2.predict(X)
        # `compare` is only fitted during the warm-up; here it just predicts.
        compare_pred = compare.predict(X)
        if y[0] == my_pred[0]:
            my_corrects += 1
        if y[0] == compare_pred[0]:
            compare_corrects += 1
        n_samples += 1

    end = timer()

    # Report elapsed time and the fraction of correct predictions per model.
    print('Evaluation time: ' + str(end - start))
    print(str(n_samples) + ' samples analyzed.')
    print('My performance: ' + str(my_corrects / n_samples))
    print('Compare performance: ' + str(compare_corrects / n_samples))
Пример #35
class TomekLinks(InstanceReductionMixin):

    """Tomek Links.

    The Tomek Links algorithm removes a pair instances that
    forms a Tomek Link. This techniques removes instances in
    the decision region.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default in the classification (only).
        The Tomek Links uses only n_neighbors=1 in the reduction.

    keep_class : int, optional (default = None)
        Label of the class to not be removed in the tomek links. If None,
        it removes all nodes of the links.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.tomek_links import TomekLinks
    >>> import numpy as np
    >>> X = np.array([[0],[1],[2.1],[2.9],[4],[5],[6],[7.1],[7.9],[9]])
    >>> y = np.array([1,1,2,1,2,2,2,1,2,2])
    >>> tl = TomekLinks()
    >>> _ = tl.fit(X, y)
    >>> print(tl.predict([[2.5],[7.5]]))
    [1 2]
    >>> print(tl.reduction_)
    0.4

    See also
    --------
    protopy.selection.enn.ENN: edited nearest neighbor

    References
    ----------
    I. Tomek, “Two modifications of cnn,” IEEE Transactions on Systems,
    Man and Cybernetics, vol. SMC-6, pp. 769–772, 1976.

    """

    def __init__(self, n_neighbors=3, keep_class=None):
        self.n_neighbors = n_neighbors
        self.classifier = None
        self.keep_class = keep_class

    def reduce_data(self, X, y):
        """Remove every Tomek link from ``(X, y)``.

        Two samples form a link when each is the other's nearest neighbour
        and their labels differ.  Both members of a link are dropped, except
        samples labelled ``keep_class``, which are always retained.

        Returns
        -------
        (X_, y_) : the retained samples and labels; also stored on the
            instance together with ``reduction_``, the fraction removed.
        """
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(
                n_neighbors=self.n_neighbors, algorithm='brute')
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_arrays(X, y, sparse_format="csr")

        self.classes_ = np.unique(y)
        self.classifier.fit(X, y)

        # Two neighbours are requested and the second column is used as each
        # sample's nearest neighbour; the first is assumed to be the sample
        # itself, since the query points are the training points.
        neighbours = self.classifier.kneighbors(
            X, n_neighbors=2, return_distance=False)
        nn_idx = neighbours[:, 1]

        # Vectorised link test (replaces a per-sample loop that relied on the
        # Python-2-only ``xrange``): mutual nearest neighbours, unequal labels.
        positions = np.arange(nn_idx.shape[0])
        in_link = (nn_idx[nn_idx] == positions) & (y != y[nn_idx])
        mask = ~in_link
        if self.keep_class is not None and self.keep_class in self.classes_:
            mask[y == self.keep_class] = True

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)

        return self.X_, self.y_
Пример #36
class SSMA(InstanceReductionMixin):
    """Steady State Memetic Algorithm

    The Steady-State Memetic Algorithm is an evolutionary prototype
    selection algorithm. It uses a memetic algorithm in order to
    perform a local search in the code.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    alpha : float (default = 0.6)
        Weight of the accuracy term in the fitness function; the reduction
        term is weighted by ``1 - alpha``.

    max_loop : int (default = 1000)
        Maximum number of loops performed by the algorithm.

    threshold : int (default = 0)
        Threshold that regulates the substitution condition.

    chromosomes_count : int (default = 10)
        Number of chromosomes used to find the optimal solution.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.ssma import SSMA
    >>> import numpy as np
    >>> X = np.array([[i] for i in range(100)])
    >>> y = np.asarray(50 * [0] + 50 * [1])
    >>> ssma = SSMA()
    >>> ssma.fit(X, y)  # doctest: +SKIP
    SSMA(alpha=0.6, chromosomes_count=10, max_loop=1000, threshold=0)
    >>> print(ssma.predict([[40],[60]]))  # doctest: +SKIP
    [0 1]
    >>> print(ssma.reduction_)  # doctest: +SKIP

    See Also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Joaquin Derrac, Salvador Garcia, and Francisco Herrera. Stratified prototype
    selection based on a steady-state memetic algorithm: a study of scalability.
    Memetic Computing, 2(3):183-199, 2010.
    """

    # NOTE(review): the signature was truncated in this copy of the file. The
    # parameter list and defaults below are rebuilt from the class docstring;
    # confirm them (n_neighbors in particular) against the upstream source.
    def __init__(self, n_neighbors=3, alpha=0.6, max_loop=1000, threshold=0,
                 chromosomes_count=10):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.max_loop = max_loop
        self.threshold = threshold
        self.chromosomes_count = chromosomes_count

        # Population and its fitness values, filled by generate_population().
        self.evaluations = None
        self.chromosomes = None

        # Accuracy / reduction of the best chromosome seen at the last
        # threshold update (-1 means "no update yet").
        self.best_chromosome_ac = -1
        self.best_chromosome_rd = -1

        self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

    def accuracy(self, chromosome, X, y):
        """Return the fraction of ``X`` classified correctly when the
        classifier is trained only on the samples selected by ``chromosome``.
        """
        mask = np.asarray(chromosome, dtype=bool)
        cX, cy = X[mask], y[mask]

        self.classifier.fit(cX, cy)
        labels = self.classifier.predict(X)
        hits = (labels == y).sum()

        return float(hits) / len(y)

    def fitness(self, chromosome, X, y):
        """Return ``alpha * accuracy + (1 - alpha) * reduction``."""
        # TODO add the possibility of use AUC for factor1
        ac = self.accuracy(chromosome, X, y)
        rd = 1.0 - (float(sum(chromosome)) / len(chromosome))

        return self.alpha * ac + (1.0 - self.alpha) * rd

    def fitness_gain(self, gain, n):
        """Return the fitness increment credited for an accepted removal.

        ``gain`` is the net classification gain and ``n`` the divisor applied
        to both terms (the caller passes the number of selected samples).
        """
        return self.alpha * (float(gain) / n) + (1 - self.alpha) * (1.0 / n)

    def update_threshold(self, X, y):
        """Adapt ``self.threshold`` from the progress of the best chromosome.

        The threshold grows when accuracy did not improve since the previous
        update and shrinks when the reduction rate did not improve.
        """
        best_index = np.argmax(self.evaluations)
        chromosome = self.chromosomes[best_index]

        best_ac = self.accuracy(chromosome, X, y)
        best_rd = 1.0 - float(sum(chromosome)) / len(y)

        if best_ac <= self.best_chromosome_ac:
            self.threshold = self.threshold + 1
        if best_rd <= self.best_chromosome_rd:
            self.threshold = self.threshold - 1

        self.best_chromosome_ac = best_ac
        self.best_chromosome_rd = best_rd

    def index_nearest_neighbor(self, S, X, y):
        """Return, for every sample of ``X``, the index (into ``X``) of its
        nearest neighbor among the samples selected by ``S``.
        """
        classifier = KNeighborsClassifier(n_neighbors=1)

        S_mask = np.array(S, dtype=bool, copy=True)
        real_indexes = np.arange(len(y))[S_mask]

        # The selection does not change while answering the queries, so one
        # fit and one batched query replace the per-sample refit.
        classifier.fit(X[S_mask], y[S_mask])
        nearest = classifier.kneighbors(X, return_distance=False)[:, 0]

        return list(real_indexes[nearest])

    def memetic_looper(self, S, R):
        """Return True while at least two selected genes of ``S`` have not
        been visited yet (``R`` lists the visited gene indexes).
        """
        visited = set(R)
        pending = 0
        for i, gene in enumerate(S):
            if gene == 1 and i not in visited:
                pending += 1
                if pending == 2:
                    return True

        return False

    def memetic_select_j(self, S, R):
        """Pick at random one selected gene of ``S`` that is not in ``R``.

        ``np.random.choice`` raises ``ValueError`` when no such gene exists.
        """
        visited = set(R)
        candidates = [i for i, gene in enumerate(S)
                      if gene == 1 and i not in visited]
        return np.random.choice(candidates)

    def generate_population(self, X, y):
        """Create a random population, evaluate it and seed the threshold."""
        self.chromosomes = [[np.random.choice([0, 1]) for i in range(len(y))]
                            for c in range(self.chromosomes_count)]
        self.evaluations = [self.fitness(c, X, y) for c in self.chromosomes]

        self.update_threshold(X, y)

    def select_parents(self, X, y):
        """Return two parents, each the winner of a binary tournament."""
        parents = []
        for _ in range(2):
            first, second = random.sample(self.chromosomes, 2)
            if self.fitness(first, X, y) > self.fitness(second, X, y):
                parents.append(first)
            else:
                parents.append(second)
        return np.array(parents, copy=True)

    def crossover(self, parent_1, parent_2):
        """Return two offspring obtained by swapping the parents' halves."""
        size = len(parent_1)
        # Floor division: ``size / 2`` is a float on Python 3.
        mask = np.arange(size) >= size // 2

        off_1 = np.where(mask, parent_1, parent_2)
        off_2 = np.where(mask, parent_2, parent_1)

        return np.asarray([off_1, off_2])

    def mutation(self, offspring):
        """Flip each gene in place with probability ``1 / len(offspring)``."""
        rate = 1.0 / len(offspring)
        for i in range(len(offspring)):
            if np.random.uniform(0, 1) < rate:
                offspring[i] = not offspring[i]

        return offspring

    def memetic_search(self, chromosome, X, y, chromosome_fitness=None):
        """Local search that tries to drop selected samples one at a time.

        Returns
        -------
        S : the improved chromosome (a list, or the untouched array copy when
            ``chromosome`` selects nothing).
        fitness_s : its estimated fitness (0 for an empty chromosome).
        """
        S = np.array(chromosome, copy=True)
        if S.sum() == 0:
            return S, 0

        if chromosome_fitness is None:
            chromosome_fitness = self.fitness(chromosome, X, y)
        fitness_s = chromosome_fitness

        # List of visited genes in S
        R = []
        # let U = {u0, u1, ..., un} list where ui = classifier(si,S)/i
        U = self.index_nearest_neighbor(S, X, y)
        all_indexes = np.arange(len(y))

        while self.memetic_looper(S, R):
            j = self.memetic_select_j(S, R)
            S[j] = 0
            gain = 0.0
            U_copy = list(U)
            mask = np.asarray(S, dtype=bool)
            X_tra, y_tra = X[mask], y[mask]
            real_idx = all_indexes[mask]

            if len(y_tra) > 0:
                # The training set is fixed for this candidate removal.
                self.classifier.fit(X_tra, y_tra)
                for i in range(len(U)):
                    if U[i] != j:
                        continue
                    # Sample i lost its nearest neighbor: look for a new one.
                    [[idx]] = self.classifier.kneighbors(
                        X[i:i + 1], n_neighbors=1, return_distance=False)
                    U[i] = real_idx[idx]

                    if y[i] == y[U_copy[i]] and y[i] != y[U[i]]:
                        gain = gain - 1.0
                    if y[i] != y[U_copy[i]] and y[i] == y[U[i]]:
                        gain = gain + 1.0

            if gain >= self.threshold:
                # Removal accepted: credit the gain and restart the visit list.
                n = S.sum()
                g = self.fitness_gain(gain, n)
                fitness_s = fitness_s + g
                R = []
            else:
                # NOTE(review): this branch was lost in this copy (without it
                # every removal was undone and the loop never ended). Rebuilt
                # as: undo the removal and mark gene j as visited.
                U = U_copy
                S[j] = 1
                R.append(j)

        return list(S), fitness_s

    def main_loop(self, X, y):
        """Run ``max_loop`` steady-state generations over the population."""
        self.generate_population(X, y)
        n, worse_fit_index = 0, -1
        while n < self.max_loop:
            parents = self.select_parents(X, y)
            offspring = self.crossover(parents[0], parents[1])
            offspring[0] = self.mutation(offspring[0])
            offspring[1] = self.mutation(offspring[1])

            # -1 flags an empty offspring, which cannot be evaluated.
            fit_offs = [
                self.fitness(off, X, y) if sum(off) > 0 else -1
                for off in offspring
            ]

            if worse_fit_index == -1:
                worse_fit_index = np.argmin(self.evaluations)

            for i in range(len(offspring)):
                # Probability of refining this offspring with local search.
                p_ls = 1.0

                if fit_offs[i] == -1:
                    p_ls = -1
                elif fit_offs[i] <= self.evaluations[worse_fit_index]:
                    # ``elif`` keeps the -1 sentinel from being overwritten.
                    p_ls = 0.0625

                if np.random.uniform(0, 1) < p_ls:
                    offspring[i], fit_offs[i] = self.memetic_search(
                        offspring[i], X, y, chromosome_fitness=fit_offs[i])

            for i in range(len(offspring)):
                if fit_offs[i] > self.evaluations[worse_fit_index]:
                    self.chromosomes[worse_fit_index] = offspring[i]
                    self.evaluations[worse_fit_index] = fit_offs[i]

                    worse_fit_index = np.argmin(self.evaluations)

            n = n + 1
            if n % 10 == 0:
                self.update_threshold(X, y)

    def reduce_data(self, X, y):
        """Select prototypes from ``(X, y)`` and return ``(X_, y_)``."""
        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes

        self.main_loop(X, y)

        best_index = np.argmax(self.evaluations)
        mask = np.asarray(self.chromosomes[best_index], dtype=bool)
        self.X_ = X[mask]
        self.y_ = y[mask]
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)

        return self.X_, self.y_
Пример #37
class CNN(InstanceReductionMixin):
    """Condensed Nearest Neighbors.

    Each class is represented by a set of prototypes, with test samples
    classified to the class with the nearest prototype.
    The Condensed Nearest Neighbors removes the redundant instances,
    maintaining the samples in the decision boundaries.

    Parameters
    ----------
    n_neighbors : int, optional (default = 1)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.cnn import CNN
    >>> import numpy as np
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([1, 1, 1, 2, 2, 2])
    >>> cnn = CNN()
    >>> cnn.fit(X, y)  # doctest: +SKIP
    >>> print(cnn.predict([[-0.8, -1]]))  # doctest: +SKIP

    See Also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    Notes
    -----
    The Condensed Nearest Neighbor is one the first prototype selection
    technique in literature.

    References
    ----------
    P. E. Hart, The condensed nearest neighbor rule, IEEE Transactions on
    Information Theory 14 (1968) 515-516.
    """

    def __init__(self, n_neighbors=1):
        self.n_neighbors = n_neighbors
        # Created on the first reduce_data() call.
        self.classifier = None

    def reduce_data(self, X, y):
        """Build the condensed set and return ``(X_, y_)``.

        The set starts with one random sample per class; every sample the
        current set misclassifies is then added to it.
        """
        # Local import: ``check_arrays`` no longer exists in scikit-learn;
        # ``check_X_y`` is the validator the sibling reducers already use.
        from sklearn.utils import check_X_y

        X, y = check_X_y(X, y, accept_sparse="csr")

        # NOTE(review): the constructor call was truncated in this copy; it
        # and the n_neighbors re-sync are rebuilt to match the sibling
        # reducers in this file.
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(
                n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        prots_s = []
        labels_s = []

        classes = np.unique(y)
        self.classes_ = classes

        # Seed the condensed set with one random sample of every class.
        for cur_class in classes:
            insts = X[y == cur_class]
            prots_s.append(insts[np.random.randint(0, insts.shape[0])])
            labels_s.append(cur_class)

        self.classifier.fit(prots_s, labels_s)
        for sample, label in zip(X, y):
            # predict() is given a one-row batch rather than a bare 1-D sample.
            if self.classifier.predict([sample])[0] != label:
                prots_s.append(sample)
                labels_s.append(label)
                self.classifier.fit(prots_s, labels_s)

        self.X_ = np.asarray(prots_s)
        self.y_ = np.asarray(labels_s)
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
Пример #38
def run_knn_probabilistic_classifier(train, train_labels, validate,
                                     validate_labels):
    """Fit a 3-NN classifier and return its log loss on the validation set.

    Parameters
    ----------
    train, train_labels : training samples and their labels.
    validate, validate_labels : validation samples and their true labels.

    Returns
    -------
    The value of ``metrics.log_loss`` computed from the true validation
    labels and the class probabilities predicted for ``validate``.
    """
    # The last parameter was cut off in this copy; the body's use of
    # ``validate_labels`` fixes its name.
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(train, train_labels)
    predicted_probabilities = knn.predict_proba(validate)
    return metrics.log_loss(validate_labels, predicted_probabilities)
Пример #39
import numpy as np
from sklearn import datasets
# Public import path; ``sklearn.neighbors.classification`` is a private module.
from sklearn.neighbors import KNeighborsClassifier

# Iris: train a k-NN classifier and count its errors on held-out samples.
iris = datasets.load_iris()
iris_X = iris.data
iris_y = iris.target
print(np.unique(iris_y))

# Shuffle once, keep the last 10 samples for testing and train on the rest.
# (The original indexed every split with ``indices[indices[:-10]]``, so the
# "test" set was exactly the training set.)
indices = np.random.permutation(len(iris_X))
iris_X_train = iris_X[indices[:-10]]
iris_y_train = iris_y[indices[:-10]]
iris_X_test = iris_X[indices[-10:]]
iris_y_test = iris_y[indices[-10:]]

knn = KNeighborsClassifier()
knn.fit(iris_X_train, iris_y_train)
predictions = knn.predict(iris_X_test)
print("预测:")
print(predictions)
print("实际:")
print(iris_y_test)

# i[n] is True where the n-th prediction is correct; k counts the misses.
i = predictions == iris_y_test
k = int(np.count_nonzero(~i))
print(len(i))
print("预测错误数:")
print(k)

diabetes = datasets.load_diabetes()  # diabetes dataset
diabetes_X_train = diabetes.data[:-20]
Пример #40
class SGP2(SGP):
    """Self-Generating Prototypes 2

    The Self-Generating Prototypes 2 is the second version of the
    Self-Generating Prototypes algorithm.
    It has a higher generalization power, including the procedures
    merge and pruning.

    Parameters
    ----------
    r_min: float, optional (default = 0.0)
        Determine the minimum size of a cluster [0.00, 0.20]

    r_mis: float, optional (default = 0.0)
        Determine the error tolerance before split a group

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.generation.sgp import SGP2
    >>> import numpy as np
    >>> X = [np.asarray(range(1,13)) + np.asarray([0.1,0,-0.1,0.1,0,-0.1,0.1,-0.1,0.1,-0.1,0.1,-0.1])]
    >>> X = np.asarray(X).T
    >>> y = np.array([1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1])
    >>> sgp2 = SGP2()
    >>> sgp2.fit(X, y)  # doctest: +SKIP
    SGP2(r_min=0.0, r_mis=0.0)
    >>> print(sgp2.reduction_)  # doctest: +SKIP

    See Also
    --------
    protopy.generation.SGP: self-generating prototypes
    protopy.generation.sgp.ASGP: adaptive self-generating prototypes

    References
    ----------
    Hatem A. Fayed, Sherif R Hashem, and Amir F Atiya. Self-generating prototypes
    for pattern classification. Pattern Recognition, 40(5):1498-1509, 2007.
    """

    def __init__(self, r_min=0.0, r_mis=0.0):
        self.groups = None
        self.r_min = r_min
        self.r_mis = r_mis
        self.n_neighbors = 1
        # Created on the first reduce_data() call.
        self.classifier = None

    def reduce_data(self, X, y):
        """Return the group representatives and their labels as
        ``(X_, y_)``.
        """
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        classes = np.unique(y)
        self.classes_ = classes

        # Load the initial groups: one per class.
        self.groups = [_Group(X[y == label], label) for label in classes]

        # NOTE(review): in this copy nothing happens between building the
        # initial one-group-per-class partition and collecting the
        # representatives; _merge() and _pruning() are defined below but are
        # never invoked here. Presumably those calls (and any inherited
        # group-splitting step) were lost -- confirm against the upstream
        # source before relying on this method.

        self.X_ = np.asarray([g.rep_x for g in self.groups])
        self.y_ = np.asarray([g.label for g in self.groups])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_

    def _merge(self):
        """Merge pairs of same-label groups that are each other's second
        nearest representative for all of their instances; return the groups.

        NOTE(review): the statements that followed ``merged = True`` and
        ``if merged:`` were lost in this copy. They are rebuilt here as
        "drop the absorbed group and scan again until nothing merges";
        confirm against the upstream source.
        """
        merged = True
        while merged and len(self.groups) >= 2:
            merged = False
            reps_x = np.asarray([g.rep_x for g in self.groups])
            reps_y = np.asarray([g.label for g in self.groups])
            self.classifier.fit(reps_x, reps_y)

            for group in self.groups:
                nn2_idx = self.classifier.kneighbors(
                    group.X, n_neighbors=2, return_distance=False).T[1]

                # could use a threshold
                if len(set(nn2_idx)) != 1 or reps_y[nn2_idx[0]] != group.label:
                    continue

                ng_group = self.groups[nn2_idx[0]]
                if ng_group is group:
                    # A group cannot absorb itself.
                    continue

                ng2_idx = self.classifier.kneighbors(
                    ng_group.X, n_neighbors=2, return_distance=False).T[1]
                if len(set(ng2_idx)) == 1 and self.groups[ng2_idx[0]] == group:
                    group.add_instances(ng_group.X, update=True)
                    self.groups.remove(ng_group)
                    merged = True
                    # The representatives changed: refit before continuing.
                    break

        return self.groups

    def _pruning(self):
        """Remove every group whose instances are all still classified with
        the group's label by the remaining representatives; return the groups.

        NOTE(review): the removal statement and the ``else:`` guarding the
        index increment were lost in this copy (as written the scan could not
        terminate); they are rebuilt here -- confirm against upstream.
        """
        if len(self.groups) < 2:
            return self.groups

        knn = KNeighborsClassifier(n_neighbors=1, algorithm='brute')
        pruned = True
        while pruned:
            pruned = False
            index = 0

            # At least one group is always kept.
            while index < len(self.groups) and len(self.groups) > 1:
                group = self.groups[index]

                # Representatives of every group except the candidate.
                mask = np.ones(len(self.groups), dtype=bool)
                mask[index] = False
                reps_x = np.asarray([g.rep_x for g in self.groups])[mask]
                reps_y = np.asarray([g.label for g in self.groups])[mask]
                labels = knn.fit(reps_x, reps_y).predict(group.X)

                if (labels == group.label).all():
                    del self.groups[index]
                    pruned = True
                else:
                    index = index + 1

        return self.groups
Пример #41
class TomekLinks(InstanceReductionMixin):
    """Tomek Links.

    The Tomek Links algorithm removes a pair instances that
    forms a Tomek Link. This techniques removes instances in
    the decision region.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default in the classification (only).
        The Tomek Links uses only n_neighbors=1 in the reduction.

    keep_class : int, optional (default = None)
        Label of the class to not be removed in the tomek links. If None,
        it removes all nodes of the links.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.tomek_links import TomekLinks
    >>> import numpy as np
    >>> X = np.array([[0],[1],[2.1],[2.9],[4],[5],[6],[7.1],[7.9],[9]])
    >>> y = np.array([1,1,2,1,2,2,2,1,2,2])
    >>> tl = TomekLinks()
    >>> tl.fit(X, y)  # doctest: +SKIP
    >>> print(tl.predict([[2.5],[7.5]]))  # doctest: +SKIP
    [1, 2]
    >>> print(tl.reduction_)  # doctest: +SKIP

    See Also
    --------
    protopy.selection.enn.ENN: edited nearest neighbor

    References
    ----------
    I. Tomek, "Two modifications of cnn," IEEE Transactions on Systems,
    Man and Cybernetics, vol. SMC-6, pp. 769-772, 1976.
    """

    def __init__(self, n_neighbors=3, keep_class=None):
        self.n_neighbors = n_neighbors
        # Created on the first reduce_data() call.
        self.classifier = None
        self.keep_class = keep_class

    def reduce_data(self, X, y):
        """Drop the samples that take part in a Tomek link.

        Two samples form a link when each is the other's nearest neighbor
        and their labels differ. Samples labelled ``keep_class`` are always
        retained.

        Returns
        -------
        X_, y_ : the retained samples and their labels (also stored on the
            instance together with ``reduction_``).
        """
        # Local import: ``check_arrays`` no longer exists in scikit-learn;
        # ``check_X_y`` is the validator the sibling reducers already use.
        from sklearn.utils import check_X_y

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(
                n_neighbors=self.n_neighbors, algorithm='brute')
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes
        self.classifier.fit(X, y)

        # Column 0 of the 2-NN query is taken to be the query sample itself
        # (as the original code did), so column 1 is its nearest *other*
        # sample.
        nn_idx = self.classifier.kneighbors(
            X, n_neighbors=2, return_distance=False)[:, 1]

        # A link is a mutual nearest-neighbor pair with different labels.
        samples = np.arange(nn_idx.shape[0])
        in_link = (nn_idx[nn_idx] == samples) & (y != y[nn_idx])

        mask = ~in_link
        if self.keep_class is not None and self.keep_class in self.classes_:
            mask[y == self.keep_class] = True

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)

        return self.X_, self.y_
Пример #42
class ENN(InstanceReductionMixin):
    """Edited Nearest Neighbors.

    The Edited Nearest Neighbors removes the instances in the
    boundaries, maintaining redundant samples.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.enn import ENN
    >>> import numpy as np
    >>> X = np.array([[-1, 0], [-0.8, 1], [-0.8, -1], [-0.5, 0] , [0.5, 0], [1, 0], [0.8, 1], [0.8, -1]])
    >>> y = np.array([1, 1, 1, 2, 1, 2, 2, 2])
    >>> editednn = ENN()
    >>> editednn.fit(X, y)  # doctest: +SKIP
    >>> print(editednn.predict([[-0.6, 0.6]]))  # doctest: +SKIP
    >>> print(editednn.reduction_)  # doctest: +SKIP

    See Also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Ruiqin Chang, Zheng Pei, and Chao Zhang. A modified editing k-nearest
    neighbor rule. JCP, 6(7):1493-1500, 2011.
    """

    def __init__(self, n_neighbors=3):
        self.n_neighbors = n_neighbors
        # Created on the first reduce_data() call.
        self.classifier = None

    def reduce_data(self, X, y):
        """Keep the samples that a k-NN classifier trained on all the *other*
        samples labels correctly; return ``(X_, y_)``.
        """
        # Local import: ``check_arrays`` no longer exists in scikit-learn;
        # ``check_X_y`` is the validator the sibling reducers already use.
        from sklearn.utils import check_X_y

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes

        n_samples = X.shape[0]
        if self.n_neighbors >= n_samples:
            # Too few samples to run the leave-one-out query: keep them all.
            # The early return was missing, so execution fell through to the
            # loop below.
            self.X_ = np.array(X)
            self.y_ = np.array(y)
            self.reduction_ = 0.0
            return self.X_, self.y_

        keep = np.zeros(n_samples, dtype=bool)
        others = np.ones(n_samples, dtype=bool)
        for i in range(n_samples):
            # Leave sample i out, fit on the rest, then classify sample i.
            others[i] = False
            self.classifier.fit(X[others], y[others])
            # The slice keeps the query two-dimensional (one row).
            keep[i] = self.classifier.predict(X[i:i + 1])[0] == y[i]
            others[i] = True

        self.X_ = np.asarray(X[keep])
        self.y_ = np.asarray(y[keep])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
def nearest_fit(X, y):
    """Fit and return a distance-weighted 7-nearest-neighbors classifier.

    Parameters
    ----------
    X : training samples.
    y : labels of ``X``.

    Returns
    -------
    The fitted ``KNeighborsClassifier``.
    """
    # ``weights`` is passed by keyword: recent scikit-learn releases accept
    # only ``n_neighbors`` positionally.
    clf = KNeighborsClassifier(n_neighbors=7, weights='distance')
    return clf.fit(X, y)