def radiusNeighborClassifier():
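    # Sweep radius values 100..1000 with both weighting schemes, keeping
    # whichever combination scores best on the validation set.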
    maximumValue = 0
    returnParameters = ['0', '0']
    for neighbor in range(100, 1001, 100):
        neighAutoRadius = RadiusNeighborsClassifier(radius=neighbor,
                                                    weights='uniform',
                                                    algorithm='auto',
                                                    p=2,
                                                    metric='minkowski')
        neighAutoRadius.fit(trainData, trainLabel)
        neighDistanceRadius = RadiusNeighborsClassifier(radius=neighbor,
                                                        weights='distance',
                                                        algorithm='auto',
                                                        p=2,
                                                        metric='minkowski')
        neighDistanceRadius.fit(trainData, trainLabel)
        scoreAuto = neighAutoRadius.score(validationData, validationLabel)
        scoreDistance = neighDistanceRadius.score(validationData,
                                                  validationLabel)
        if max(scoreAuto, scoreDistance) > maximumValue:
            maximumValue = max(scoreAuto, scoreDistance)
            returnParameters[0] = str(neighbor)
            returnParameters[1] = ('distance' if scoreDistance > scoreAuto
                                   else 'uniform')

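    # Refit with the best (radius, weights) pair and score it on the test set.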
    neighTest = RadiusNeighborsClassifier(radius=int(returnParameters[0]),
                                          weights=returnParameters[1],
                                          algorithm='auto',
                                          p=2,
                                          metric='minkowski')
    neighTest.fit(trainData, trainLabel)
    scoreTest = neighTest.score(testData, testLabel)
    guideToGraph['Radius Neighbor'] = scoreTest
Example #2
def rnn_model(train_input, train_target, test_input, test_target):
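    # Fit a fixed-radius neighbors classifier and report accuracy on both splits.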
    r_neigh = RadiusNeighborsClassifier(radius=3.0)
    r_neigh.fit(train_input, train_target)
    print("R-NN (r=3.0) accuracy for training set: %s" %
          (r_neigh.score(train_input, train_target)))
    print("R-NN (r=3.0) accuracy for testing set: %s" %
          (r_neigh.score(test_input, test_target)))
Example #5
def knnClassifier():
    trainData, trainLabel = featureArray(conf['train']['feature_vector'])
    testData, testLabel = featureArray(conf['test']['feature_vector'])

    neigh = KNeighborsClassifier(n_neighbors=1, algorithm='auto', p=2)
    neigh.fit(trainData, trainLabel)
    print(neigh.score(testData, testLabel))

    neighRadius = RadiusNeighborsClassifier(radius=500,
                                            weights='distance',
                                            algorithm='auto',
                                            p=2,
                                            metric='minkowski')
    neighRadius.fit(trainData, trainLabel)
    print(neighRadius.score(testData, testLabel))
Example #6
def KNN_diabetes_demo():
    import numpy as np
    import pandas as pd
    # Load the data
    data = pd.read_csv("D:/PythonProjects/pima-indians-diabetes.csv")
    print(data.shape)
    print("data.head(): ", data.head())  # first 5 rows
    # Split into input features and outcome
    X = data.iloc[:, 0:8]
    Y = data.iloc[:, 8]

    # Split into training and test data
    from sklearn.model_selection import train_test_split
    # random_state seeds the split so that it is reproducible
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        random_state=22)

    # Fit and compare three neighbor models
    from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier

    model1 = KNeighborsClassifier(n_neighbors=2)
    model1.fit(X_train, Y_train)
    score1 = model1.score(X_test, Y_test)

    model2 = KNeighborsClassifier(n_neighbors=2, weights='distance')
    model2.fit(X_train, Y_train)
    score2 = model2.score(X_test, Y_test)

    # RadiusNeighborsClassifier is radius-based; it takes no n_neighbors argument
    model3 = RadiusNeighborsClassifier(radius=500.0)
    model3.fit(X_train, Y_train)
    score3 = model3.score(X_test, Y_test)
    #compare the results of the three models
    print(score1, score2, score3)

    #cross validation
    from sklearn.model_selection import cross_val_score

    result1 = cross_val_score(model1, X, Y, cv=10)
    result2 = cross_val_score(model2, X, Y, cv=10)
    result3 = cross_val_score(model3, X, Y, cv=10)

    print(result1.mean(), result2.mean(), result3.mean())
Example #7
def main():
    X_train_all, t_train_all, train_all_ids = create_data_matrix(0, 3086, TRAIN_DIR)
    X_train, X_valid, t_train, t_valid = train_test_split(X_train_all, t_train_all, test_size=0.20, random_state=37)
    X_test_all, t_test_all, test_all_ids = create_data_matrix(0, 3724, TEST_DIR)

    sv = svm.SVC(kernel='poly')
    sv.fit(X_train, t_train)
    print("SVM Score was: %f" % sv.score(X_valid, t_valid))  # was `clf`, which is undefined here

    # min_samples_split must be >= 2 in current scikit-learn
    rf = RandomForestClassifier(n_estimators=30, min_samples_split=2, random_state=37)
    rf.fit(X_train, t_train)
    print("RandomForest Score was: %f" % rf.score(X_valid, t_valid))

    lr = LogisticRegression(penalty='l2', solver='newton-cg', max_iter=500)
    lr.fit(X_train, t_train)
    print("LogisticRegression Score was: %f" % lr.score(X_valid, t_valid))

    clf = GaussianNB()
    clf.fit(X_train, t_train)
    print "GaussianNB Score was: %f" % (clf.score(X_valid, t_valid))

    nn = KNeighborsClassifier(n_neighbors=6, weights='uniform')
    nn.fit(X_train, t_train)
    score = nn.score(X_valid, t_valid)
    print "KNeighbors Score was: %f" % (score)

    rnc = RadiusNeighborsClassifier(radius=6, outlier_label=8, p=2)
    rnc.fit(X_train, t_train)
    print("RadiusNeighbors Score was: %f" % rnc.score(X_valid, t_valid))

    # Get predictions
    rf = RandomForestClassifier(n_estimators=30, min_samples_split=2)
    rf.fit(X_train_all, t_train_all)
    test_predictions = rf.predict(X_test_all)

    write_to_file("prediction.csv", test_all_ids, test_predictions)
# The head of this snippet was truncated; rlf1 is reconstructed here as a plain
# RadiusNeighborsClassifier with the defaults implied by the calls below.
rlf1 = RadiusNeighborsClassifier(radius=1.0,
                                 weights='uniform',
                                 algorithm='auto',
                                 leaf_size=30,
                                 p=2,
                                 metric='minkowski',
                                 metric_params=None,
                                 n_jobs=1)
rlf2 = RadiusNeighborsClassifier(radius=1.0,
                                 weights='uniform',
                                 algorithm='auto',
                                 leaf_size=30,
                                 p=2,
                                 metric='minkowski',
                                 outlier_label=None,
                                 metric_params=None)
rlf1.fit(trainX, trainY)
rlf2.fit(trainX, trainY)
rlf1.score(testX, testY)
rlf2.score(testX, testY)
'''
    n_neighbors     number of neighbors (k-NN only)
    radius          neighborhood radius
    weights         weighting scheme ('uniform' or 'distance')
    algorithm       algorithm used to find the nearest neighbors
        ball_tree       ball tree
        kd_tree         KD tree
        brute           brute-force search
        auto            chosen automatically based on X
    leaf_size       leaf size when using the ball tree or KD tree
    p               power parameter of the Minkowski metric (p=2 is Euclidean)
    metric          distance metric used for the tree
    outlier_label   label assigned to samples with no neighbors within the radius
    metric_params   extra keyword arguments for the metric function
    n_jobs          number of parallel jobs for the neighbor search
'''
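A minimal sketch of how outlier_label and weights behave, on tiny synthetic data
(the arrays and the 0.5 radius below are illustrative, not from the snippets above):

import numpy as np
from sklearn.neighbors import RadiusNeighborsClassifier

X = np.array([[0.0], [0.1], [0.2], [5.0], [5.1]])
y = np.array([0, 0, 0, 1, 1])

# Without outlier_label, predicting a point that has no neighbors inside the
# radius raises an error; with outlier_label=-1 such points are labeled -1.
clf = RadiusNeighborsClassifier(radius=0.5, weights='distance', outlier_label=-1)
clf.fit(X, y)
print(clf.predict([[0.05], [9.0]]))  # -> [ 0 -1]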
plt.show()

plate_img_lr, neg_img_lr = plate_img.T, negative_img.T

for i in np.random.randint(0, len(plate_img_lr), 5):
    show_image_prediction(plate_img_lr, i, clf)

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

knn.fit(plate_img_lr, neg_img_lr)

print("Model accuracy: {:.2f}%".format(knn.score(plate_img_lr, neg_img_lr)*100))

plate_img_lr, neg_img_lr = plate_img.T, negative_img.T

for i in np.random.randint(0, len(plate_img_lr), 5):
    show_image_prediction(plate_img_lr, i, knn)

from sklearn.neighbors import RadiusNeighborsClassifier

rnc = RadiusNeighborsClassifier()
rnc.fit(plate_img_lr, neg_img_lr)

print("Model accuracy: {:.2f}%".format(rnc.score(plate_img_lr, neg_img_lr)*100))

plate_img_lr, neg_img_lr = plate_img.T, negative_img.T
for i in np.random.randint(0, len(plate_img_lr), 5):
    show_image_prediction(plate_img_lr, i, rnc)
Example #10
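# Drop intermediate objects from the global namespace to free memory before training.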
del globals()['unqLikesUIDs']
del globals()['unqLikesLIDs']
del globals()['profilesDF']
del globals()['profiles']
del globals()['profilesLSo']
del globals()['profilesLS']
del globals()['row']
del globals()['tmpLS']
del globals()['tmpAGE']
del globals()['profsTOlikes']
del globals()['i']
del globals()['tmpIND']

seed = 7
np.random.seed(seed)  # np.random.seed returns None, so there is nothing to assign
X_train, X_test, y_train, y_test = train_test_split(likesMAT,
                                                    sexsARR,
                                                    test_size=1500)

myRAD = float(sys.argv[1])
radNN = RadiusNeighborsClassifier(radius=myRAD)

#radNN.fit(likesMAT, sexsARR)
radNN.fit(X_train, y_train)

print("sexs, Radius neighbors:  ", str(myRAD), " ",
      radNN.score(X_test, y_test))

# joblib.dump(radNN, "/Users/jamster/radNN-A-sexs.xz", compress=9)

# impRadNN = joblib.load("/Users/jamster/radNN-A-sexs.xz")
Example #11
    standard_BCE = StandardScaler()
    standard_TCP = StandardScaler()
    standard_UIE = StandardScaler()

    # Fit each scaler once on its training data, then reuse plain transform().
    X_train_BCE_std = standard_BCE.fit_transform(X_train_BCE)
    X_train_TCP_std = standard_TCP.fit_transform(X_train_TCP)

    PSE_BCE_t_Unit_neigh.fit(X_train_BCE_std, y_train_BCE_Unit)
    PSE_BCE_t_Upgrade_neigh.fit(X_train_BCE_std, y_train_BCE_Upgrade)
    PSE_TCP_t_Build_neigh.fit(X_train_TCP_std, y_train_TCP_Build)
    PSE_TCP_t_Attack_neigh.fit(X_train_TCP_std, y_train_TCP_Attack)

    # The original scored the *training* inputs against the test labels, which
    # mismatches rows; score the transformed test inputs instead (X_test_BCE
    # and X_test_TCP are assumed to exist alongside the y_test_* arrays).
    print('neigh - Unit Accuracy: ',
          PSE_BCE_t_Unit_neigh.score(standard_BCE.transform(X_test_BCE),
                                     y_test_BCE_Unit))
    print('neigh - Upgrade Accuracy: ',
          PSE_BCE_t_Upgrade_neigh.score(standard_BCE.transform(X_test_BCE),
                                        y_test_BCE_Upgrade))
    print('neigh - Build Accuracy: ',
          PSE_TCP_t_Build_neigh.score(standard_TCP.transform(X_test_TCP),
                                      y_test_TCP_Build))
    print('neigh - Attack Accuracy: ',
          PSE_TCP_t_Attack_neigh.score(standard_TCP.transform(X_test_TCP),
                                       y_test_TCP_Attack))

#|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
#|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||KNeighborsClassifier
Example #12
import pandas as pd
import numpy as np
from sklearn.neighbors import RadiusNeighborsClassifier
import matplotlib.pyplot as plt
data = pd.read_csv('ice.csv')
x = data[['temp', 'street']]
y = data['ice']
clf = RadiusNeighborsClassifier()
clf.fit(x, y)
print(clf.score(x, y))
t = np.arange(0.0, 31.0)  # assumes the CSV has 31 rows
plt.plot(t, y, '--', t, clf.predict(x), '-')  # actual (dashed) vs. predicted (solid)
plt.show()
        # Standardize the features and (optionally) save the scaler
        scale = StandardScaler()
        x4 = scale.fit_transform(x3)
        y4 = y3
        way_tempt_scale = ('C:/Users/Administrator/Desktop/ali/data/4_knn/scale_2/scale_2_'
                           + mall_list[i] + '.model')
        #joblib.dump(scale,way_tempt_scale)

        # Split
        x5, x6, y5, y6 = train_test_split(x4, y4, test_size=0.5)

        # Retrain
        #knn=KNeighborsClassifier(n_neighbors=len(y3.unique())/3)
        knn = RadiusNeighborsClassifier(radius=j)
        knn.fit(x5, y5)
        score = knn.score(x6, y6)
        time_now = time.strftime('%H:%M:%S', time.localtime(time.time()))
        print(
            str(i) + '....mall_id=' + mall_list[i] + '....' + str(score) +
            '....' + str(len(x4)))
        way_tempt_knn = ('C:/Users/Administrator/Desktop/ali/data/4_knn/scale_2/knn_'
                         + mall_list[i] + '.model')
        #joblib.dump(knn,way_tempt_knn)

        # Save the results (DataFrame.append was removed in pandas 2.0, so use pd.concat)
        parameters_append = pd.DataFrame(
            [[i, mall_list[i], score, len(y6)]],
            columns=['loop_number', 'mall_id', 'correct_rate', 'member'])
        parameters = pd.concat([parameters, parameters_append], ignore_index=True)
        #parameters.to_csv(way_write+'parameters.csv')
Example #14
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Indefinido',
              'estilo_de_aprendizagem'] = 0
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Ativo',
              'estilo_de_aprendizagem'] = 1
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Teorico',
              'estilo_de_aprendizagem'] = 2
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Reflexivo',
              'estilo_de_aprendizagem'] = 3
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Pragmatico',
              'estilo_de_aprendizagem'] = 4

datatrain = datatrain.apply(pd.to_numeric)
datatrain_array = datatrain.to_numpy()  # DataFrame.as_matrix() was removed in pandas

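# Columns 0-13 are the input features; column 14 is the learning-style label.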
X = datatrain_array[:, :14]
y = datatrain_array[:, 14]  # 1-D label vector avoids a DataConversionWarning
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

neigh = RadiusNeighborsClassifier(radius=3)
neigh.fit(X_train, y_train)
precisao = neigh.score(X_test, y_test)
print("------Acurácia-------: %f" % (precisao))
print(np.asarray(RN[0][0]))
# print(np.asarray(RN[1][2]))
## Compute the weighted graph of neighbors within the given radius for points in X
## radius neighbors graph
RNG = knn_model.radius_neighbors_graph(X, radius=10)
print(RNG)
print(RNG.toarray())

# ## Predict y from the X in the test data
# print(knn_model.predict(X_test))
# ## Inspect the actual y
# print(y_test.values.ravel())
# ## Predicted class probabilities for the test-data X
# print(knn_model.predict_proba(X_test))
# ## Model accuracy on the test data
# print("radius = 2 , score :",knn_model.score(X_test, y_test)) ## 0.8333333333333334
# print("Accuracy: ",knn_model.score(X_test, y_test)*100,"%")

## Tuning
## RadiusNeighborsClassifier parameter settings
knn_model = RadiusNeighborsClassifier(radius=1,
                                      weights='uniform',
                                      algorithm='auto',
                                      leaf_size=30,
                                      p=2,
                                      metric='minkowski',
                                      metric_params=None,
                                      n_jobs=None)
knn_model.fit(X_train, y_train.values.ravel())
print("radius = 1, score :", knn_model.score(X_test, y_test))  # 1.0
print("Accuracy: ", knn_model.score(X_test, y_test) * 100, "%")
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed


total_score = 0
stop = 1000
for x in range(stop):
    clf = RadiusNeighborsClassifier(radius=100.0)
    data = win.getStudents()
    data_train, data_test = train_test_split(data, test_size=0.2)
    data_train_labels = [s.spec for s in data_train]
    data_test_labels = [s.spec for s in data_test]
    data_train = [s.grades for s in data_train]
    data_test = [s.grades for s in data_test]
    clf.fit(data_train, data_train_labels)
    total_score += clf.score(data_test, data_test_labels)
total_score = total_score / stop
print('all')
print(total_score)

specs = ['FK', 'FM', 'MN', 'OE']
for sp in specs:
    total_score = 0
    for x in range(stop):
        clf = RadiusNeighborsClassifier(radius=100.0)
        data = win.getStudents()
        data_train, data_test = train_test_split(data, test_size=0.2)
        data_train_labels = [s.spec if s.spec == sp else 'NOT ' + sp for s in data_train]
        data_test_labels = [s.spec if s.spec == sp else 'NOT ' + sp for s in data_test]
        data_train = [s.grades for s in data_train]
        data_test = [s.grades for s in data_test]
        clf.fit(data_train, data_train_labels)
        total_score += clf.score(data_test, data_test_labels)
    total_score = total_score / stop
    print(sp)
    print(total_score)
Example #17

import numpy as np
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
import matplotlib.pyplot as plt


X, y = make_classification(n_samples=1000, n_features=2, n_redundant=0, n_clusters_per_class=1, n_classes=3, random_state=0)
# plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
# plt.show()

clf = RadiusNeighborsClassifier(radius=1, weights='distance', outlier_label=1)
# If a sample has no neighbors within the given radius, outlier_label is used;
# without it, predicting such an outlier raises an error
clf.fit(X, y)
print('score: {}'.format(clf.score(X, y)))

# Inspect a target sample's neighbors within the radius (distances + indices)
# print(clf.radius_neighbors(X[0,:].reshape(1, -1), return_distance=True))
# Inspect the target sample's radius-neighbors graph (sparse matrix: distances or connectivity)
# print(clf.radius_neighbors_graph(X[0].reshape(1, -1), mode='distance'))


# Visualize the predictions (decision boundary)
from matplotlib.colors import ListedColormap
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# Determine the bounds of the training set
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
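The snippet cuts off here; the usual remaining steps are a mesh grid over these
bounds, prediction on the grid, and a filled plot. A sketch, with the 0.02 step
size assumed:

h = 0.02  # assumed mesh step size
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.show()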
Example #18
def train():
    objects = environment["objects"]
    workspace = environment["workspace"]

    for o in objects:
        log.vprint(
            f"{bcolors.HEADER}{bcolors.UNDERLINE}objects/{o}.xml{bcolors.ENDC}\n"
        )
        model_data = pd.read_csv(f"{workspace}/datasets/{o}_dataset_mut.csv")

        all_inputs = model_data[[
            "HIF-ScaleX", "HIF-ScaleY", "HIF-ScaleZ", "HIF-PositionX",
            "HIF-PositionZ", "HIS-ScaleX", "HIS-ScaleY", "HIS-ScaleZ",
            "HIS-PositionX", "HIS-PositionY", "HIS-PositionZ",
            "HIS-OrientationW", "HIS-OrientationZ"
        ]].values
        all_classes = model_data["Name"].values
        log.vprint(f"{bcolors.OKBLUE}Exploratory analysis{bcolors.ENDC}")
        log.vprint(all_inputs[:5], end='\n\n')
        log.vprint(f"Classes: {all_classes}", end='\n\n')

        (training_inputs, testing_inputs, training_classes,
         testing_classes) = train_test_split(all_inputs,
                                             all_classes,
                                             train_size=0.80,
                                             random_state=456)

        log.vprint(f"Training set: {training_inputs.shape}")
        log.vprint(f"Testing set: {testing_inputs.shape}", end='\n\n')

        ### DecisionTreeClassifier ###
        # Create the classifier.
        decision_tree_classifier = DecisionTreeClassifier()
        # Train the classifier on the training set.
        decision_tree_classifier.fit(training_inputs, training_classes)
        # Validate the classifier on the testing set using classification accuracy.
        log.vprint(f"{bcolors.OKBLUE}DecisionTreeClassifier{bcolors.ENDC}")
        log.vprint(
            decision_tree_classifier.score(testing_inputs, testing_classes))
        log.vprint(decision_tree_classifier.predict(testing_inputs[:1, :]),
                   end='\n\n')

        ### RadiusNeighborsClassifier ###
        # Create the classifier.
        neigh = RadiusNeighborsClassifier(radius=2.0)
        # Train the classifier on the training set.
        neigh.fit(training_inputs, training_classes)
        # Validate the classifier on the testing set using classification accuracy.
        log.vprint(neigh.score(testing_inputs, testing_classes))
        log.vprint(neigh.predict(testing_inputs[:1, :]), end='\n\n')

        ### Model accuracies ###
        plt.title("Model accuracies")
        # DecisionTreeClassifier
        model_accuracies = []
        for repetition in range(1000):
            (training_inputs, testing_inputs, training_classes,
             testing_classes) = train_test_split(all_inputs,
                                                 all_classes,
                                                 train_size=0.75)

            decision_tree_classifier = DecisionTreeClassifier()
            decision_tree_classifier.fit(training_inputs, training_classes)
            classifier_accuracy = decision_tree_classifier.score(
                testing_inputs, testing_classes)
            model_accuracies.append(classifier_accuracy)
        sb.distplot(model_accuracies, label="DecisionTreeClassifier")
        # RadiusNeighborsClassifier
        model_accuracies = []
        for repetition in range(1000):
            (training_inputs, testing_inputs, training_classes,
             testing_classes) = train_test_split(all_inputs,
                                                 all_classes,
                                                 train_size=0.75)

            neigh = RadiusNeighborsClassifier(radius=2.0)
            neigh.fit(training_inputs, training_classes)
            classifier_accuracy = neigh.score(testing_inputs, testing_classes)
            model_accuracies.append(classifier_accuracy)
        sb.distplot(model_accuracies, label="RadiusNeighborsClassifier")
        plt.legend()
        # plt.show()
        plt.savefig(f"{workspace}/plots/{o}_model_accuracies.pdf")
        plt.close("all")
        '''
        The model achieves 97% classification accuracy without much effort.

        It's obviously a problem that our model performs quite differently depending on
        the subset of the data it's trained on. This phenomenon is known as overfitting:
        The model is learning to classify the training set so well that it doesn't
        generalize and perform well on data it hasn't seen before.

        This problem is the main reason that most data scientists perform k-fold cross-validation
        on their models: Split the original data set into k subsets, use one of the subsets as the
        testing set, and the rest of the subsets are used as the training set. This process is then
        repeated k times such that each subset is used as the testing set exactly once.

        10-fold cross-validation is the most common choice.
        '''

        # DecisionTreeClassifier(max_depth=4)
        decision_tree_classifier = DecisionTreeClassifier()

        # cross_val_score returns a list of the scores, which we can visualize
        # to get a reasonable estimate of our classifier's performance
        cv_scores = cross_val_score(decision_tree_classifier,
                                    all_inputs,
                                    all_classes,
                                    cv=10)
        sb.distplot(cv_scores)
        plt.title(f"Average score: {np.mean(cv_scores)}")
        # plt.show()
        plt.savefig(f"{workspace}/plots/{o}_model_cv_scores.pdf")
        plt.close("all")

        ### Grid Search ###
        '''
        Explore a range of parameters and find the best-performing parameter com-
        bination. Focus your search on the best range of parameters, then repeat
        this process several times until the best parameters are discovered.
        '''
        decision_tree_classifier = DecisionTreeClassifier()

        parameter_grid = {
            'max_depth': [1, 2, 3, 4, 5],
            'max_features': [1, 2, 3, 4]
        }

        # It may not work correctly if the least populated class in y has
        # fewer members than n_splits.
        cross_validation = StratifiedKFold(n_splits=50)

        grid_search = GridSearchCV(decision_tree_classifier,
                                   param_grid=parameter_grid,
                                   cv=cross_validation)

        grid_search.fit(all_inputs, all_classes)
        log.vprint(f"{bcolors.OKBLUE}GridSearch{bcolors.ENDC}")
        log.vprint(f"  Best score: {grid_search.best_score_}")
        log.vprint(f"  Best parameters: {grid_search.best_params_}",
                   end='\n\n')

        decision_tree_classifier = grid_search.best_estimator_
        log.vprint(f"{decision_tree_classifier} (before parameter tuning)",
                   end='\n\n')

        # Visualize the grid search to see how the parameters interact.
        grid_visualization = []
        grid_visualization.append(grid_search.cv_results_['mean_test_score'])
        grid_visualization = np.array(grid_visualization)
        grid_visualization.shape = (5, 4)
        sb.heatmap(grid_visualization, cmap='Blues')
        plt.xticks(np.arange(4) + 0.5, grid_search.param_grid['max_features'])
        plt.yticks(
            np.arange(5) + 0.5, grid_search.param_grid['max_depth'][::-1])
        plt.xlabel('max_features')
        plt.ylabel('max_depth')
        # plt.show()
        plt.savefig(f"{workspace}/plots/{o}_model_grid_search.pdf")
        plt.close("all")

        ### Parameter tuning ###
        decision_tree_classifier = DecisionTreeClassifier()
        '''
        Criterion <https://quantdare.com/decision-trees-gini-vs-entropy/>
        It is used to evaluate the feature importance.
            The default one is `gini` but you can also use `entropy`. Based on this,
            the model will define the importance of each feature for the classification.

        Splitter
        It is used to decide which feature and which threshold is used.
            Using `best`, the model takes the feature with the highest importance.
            Using `random`, the model takes a feature at random, but with the same distribution.
        '''
        parameter_grid = {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'max_depth': [1, 2, 3, 4, 5],
            'max_features': [1, 2, 3, 4]
        }

        cross_validation = StratifiedKFold(n_splits=10)

        grid_search = GridSearchCV(decision_tree_classifier,
                                   param_grid=parameter_grid,
                                   cv=cross_validation)

        grid_search.fit(all_inputs, all_classes)
        log.vprint(f"{bcolors.OKBLUE}Parameter Tuning{bcolors.ENDC}")
        log.vprint(f"  Best score: {grid_search.best_score_}")
        log.vprint(f"  Best parameters: {grid_search.best_params_}",
                   end='\n\n')

        # Then, the best classifier is taken
        decision_tree_classifier = grid_search.best_estimator_
        log.vprint(f"{decision_tree_classifier} (after parameter tuning)",
                   end='\n\n')

        with open(f"{workspace}/{o}_model_dtc.dot", 'w') as out_file:
            out_file = tree.export_graphviz(decision_tree_classifier,
                                            out_file=out_file)
        log.vprint(
            f"{bcolors.OKGREEN}Done! Check generated graph: {bcolors.ENDC}'./{workspace}/{o}_model_dtc.dot'.\n"
        )

        ### RandomForestClassifier ###
        random_forest_classifier = RandomForestClassifier()

        parameter_grid = {
            'n_estimators': [5, 10, 25, 50],
            'criterion': ['gini', 'entropy'],
            'max_features': [1, 2, 3, 4],
            'warm_start': [True, False]
        }

        cross_validation = StratifiedKFold(n_splits=10)

        grid_search = GridSearchCV(random_forest_classifier,
                                   param_grid=parameter_grid,
                                   cv=cross_validation)

        grid_search.fit(all_inputs, all_classes)
        log.vprint(f"{bcolors.OKBLUE}RandomForestClassifier{bcolors.ENDC}")
        log.vprint(f"  Best score: {grid_search.best_score_}")
        log.vprint(f"  Best parameters: {grid_search.best_params_}",
                   end='\n\n')

        random_forest_classifier = grid_search.best_estimator_
        log.vprint(random_forest_classifier, end='\n\n')

        ### Performance visuals ###
        rf_df = pd.DataFrame({
            'accuracy':
            cross_val_score(random_forest_classifier,
                            all_inputs,
                            all_classes,
                            cv=10),
            'classifier': ['Random Forest'] * 10
        })
        dt_df = pd.DataFrame({
            'accuracy':
            cross_val_score(decision_tree_classifier,
                            all_inputs,
                            all_classes,
                            cv=10),
            'classifier': ['Decision Tree'] * 10
        })
        both_df = pd.concat([rf_df, dt_df])  # DataFrame.append was removed in pandas 2.0

        sb.boxplot(x='classifier', y='accuracy', data=both_df)
        sb.stripplot(x='classifier',
                     y='accuracy',
                     data=both_df,
                     jitter=True,
                     color='white')
        # plt.show()
        plt.savefig(f"{workspace}/plots/{o}_model_classifiers_performance.pdf")
        plt.close("all")
Example #19
# The head of this snippet was truncated; the split call is reconstructed from
# the variables used below.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=22)
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier

model_1 = KNeighborsClassifier(n_neighbors=2, weights="uniform")
model_1.fit(x_train, y_train)
score_1 = model_1.score(x_test, y_test)

model_2 = KNeighborsClassifier(n_neighbors=2, weights="distance")
model_2.fit(x_train, y_train)
score_2 = model_2.score(x_test, y_test)

model_3 = RadiusNeighborsClassifier(radius=500.0)  # radius-based; no n_neighbors argument
model_3.fit(x_train, y_train)
score_3 = model_3.score(x_test, y_test)

print(score_1, score_2, score_3)

from sklearn.model_selection import cross_val_score

result1 = cross_val_score(model_1, X, y, cv=10)
result2 = cross_val_score(model_2, X, y, cv=10)
result3 = cross_val_score(model_3, X, y, cv=10)
print(result1.mean(), result2.mean(), result3.mean())

predict1 = model_1.predict(x_test)
print(predict1)
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, predict1)
Example #20
print("K-Nearest Neighbour(KNN) ALGORITMASI ILE  ")
from sklearn.neighbors import KNeighborsClassifier
score_list = []
each_list = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
for each in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors=each)
    knn.fit(x_train, y_train)
    score_list = (knn.score(x_test, y_test))
    print("KNN de {} komsu alinirsa accuracy : {} ".format(each, score_list))
#%%Radius Neighbors Classifier
print("o-o--o-o-o-o-o-o-o-o-o-o--o-o-o-o-o-o-o-o-o-o-o-o-o-o-")
from sklearn.neighbors import RadiusNeighborsClassifier
rnn = RadiusNeighborsClassifier()
rnn.fit(x_train, y_train)
print("RNN Algoritmasi kullanilirrsa accuracy :{} ".format(
    rnn.score(x_test, y_test)))
#%%Logistic Regression
print("o-o--o-o-o-o-o-o-o-o-o-o--o-o-o-o-o-o-o-o-o-o-o-o-o-o-")
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)
print("Logistic Regression parametreleri default ise accuracy : ",
      lr.score(x_test, y_test))
#%%Logistic Regression (max_iter=500,sover=saga,penalty=elasticnet)
lr = LogisticRegression(max_iter=500, solver='saga', penalty='elasticnet')
lr.fit(x_train, y_train)
print("Logistic Regression parametreleri degistirip bakalım accuracy : ",
      lr.score(x_test, y_test))
#%%Logistic Regression CV(Logistic Regression Cross-Validation)
from sklearn.linear_model import LogisticRegressionCV
lrcv = LogisticRegressionCV()
Example #21
all_data = sel.fit_transform(all_data)
train_data = all_data[:tr_samples_size]
test_data = all_data[tr_samples_size:]

tr_samples_size, feature_size = train_data.shape
te_samples_size, _ = test_data.shape
print('Train Data Samples:', tr_samples_size, ', Test Data Samples',
      te_samples_size, ', Feature Size(after feature-selection):',
      feature_size)

#radius=2.9####################################################################
neigh = RadiusNeighborsClassifier(radius=2.9)
t1 = time.perf_counter()  # time.clock() was removed in Python 3.8
neigh.fit(train_data, train_labels)
t2 = time.perf_counter()
CRR = neigh.score(test_data, test_labels)
t3 = time.perf_counter()
print('CRR(radius=2.9):', CRR)
print('Training Time:', t2 - t1)
print('Testing Time:', t3 - t2)
predictions = neigh.predict(test_data)
ok = 0
confusion = np.zeros((10, 10))
confidence = np.zeros((10, 10))
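# Accumulate a 10x10 confusion matrix: row = true digit, column = predicted digit.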
for i in range(test_data.shape[0]):
    confusion[test_labels[i], predictions[i]] += 1
dig = np.arange(10)
plot_confusion_matrix(confusion, dig, title='Confusion matrix radius=2.9')
#-------------------------------------------------------------------------------------------#

import numpy as np
Example #22
File: KNN.py  Project: matheo6/DataSCience
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target)
x_train
y_train
x_test
y_test
## using a number of neighbors
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train, y_train)
knn.score(x_test, y_test)
knn.predict(x_test)
y_test
knn.predict(x_test[2:3][:4])
y_test[2:3][:4]

## using a radius

from sklearn.neighbors import RadiusNeighborsClassifier

knn_r = RadiusNeighborsClassifier(radius=1)
knn_r.fit(x_train, y_train)
knn_r.score(x_test, y_test)

knn_r.predict(x_test)
y_test

knn_r.predict(x_test[2:3][:4])
y_test[2:3][:4]

Example #23
# The head of this snippet was truncated; this is the tail of an append onto
# the no_selection_performance list, reconstructed from the parallel calls below.
no_selection_performance.append(
    ('Gradient Boosting Classifier', score, matrix))

print('K Nearest Neighbors')
kNeigh = KNeighborsClassifier(n_neighbors=3)
kNeigh.fit(X_train, y_train)
y_test_pred = kNeigh.predict(X_test)
matrix = confusion_matrix(y_test, y_test_pred)
score = kNeigh.score(X_test, y_test)
no_selection_performance.append(('K Nearest Neighbours', score, matrix))

print('Radius Nearest Neighbors')
rNeigh = RadiusNeighborsClassifier(radius=42.0)
rNeigh.fit(X_train, y_train)
y_test_pred = rNeigh.predict(X_test)
matrix = confusion_matrix(y_test, y_test_pred)
score = rNeigh.score(X_test, y_test)
no_selection_performance.append(('Radius Nearest Neighbours', score, matrix))

print('Decision Tree Classifier')
dTree = DecisionTreeClassifier(random_state=0)
dTree.fit(X_train, y_train)
y_test_pred = dTree.predict(X_test)
matrix = confusion_matrix(y_test, y_test_pred)
score = dTree.score(X_test, y_test)
no_selection_performance.append(('Decision Tree Classifier', score, matrix))

print('Bagging (with K Nearest Neighbors)')
bagging = BaggingClassifier(KNeighborsClassifier(),
                            max_samples=0.5,
                            max_features=0.5)
bagging.fit(X_train, y_train)
Example #24
def run(data, classifications, scoring_data, scoring_classifications, radius):
    classifier = RadiusNeighborsClassifier(radius=radius)
    classifier.fit(data, classifications)
    accuracy = classifier.score(scoring_data, scoring_classifications)
    return accuracy
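A quick usage sketch for run(); the synthetic data and the 5.0 radius are
illustrative assumptions, with the radius chosen loosely enough that every test
point has neighbors within it:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=4, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print(run(X_train, y_train, X_test, y_test, radius=5.0))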