Example #1
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


def KNN():
    print("--------------------KNeighbors-------------------------")
    neigh = KNeighborsRegressor(n_neighbors=10, weights='distance', n_jobs=-1)
    valiN = cross_val_score(neigh, Xvalid, Yvalid, cv=kf, verbose=1, n_jobs=-1)
    print(valiN.mean())
    neigh.fit(Xtrain, Ytrain)
    print(neigh.get_params())
    print('Score:', neigh.score(Xvalid, Yvalid))
    Y = neigh.predict(Xvalid)
    print("MSE:", mean_squared_error(Yvalid, Y, squared=False))
Example #2
def surrogateKNN(Xarchive,
                 Farchive,
                 X,
                 file_loc,
                 file_loc_general,
                 toUpdate,
                 first_iter=False,
                 problem='LABS'):
    Xnew = Xarchive.T
    X_pred = X.T
    SMAC = False

    if SMAC:
        with open("/home/naamah/Documents/CatES/result_All/X1.p", "wb") as fp:
            pickle.dump(Xnew, fp)
        with open("/home/naamah/Documents/CatES/result_All/F1.p", "wb") as fp:
            pickle.dump(Farchive, fp)

        anf = smac_KNN.main_loop(problem)

        print("SMAC {}".format(anf))
        sys.exit("Error message")

    neigh = KNeighborsRegressor(n_neighbors=9,
                                algorithm="ball_tree",
                                p=1,
                                weights="distance",
                                leaf_size=60)  # KNN_LABS_444

    # if problem=="LABS":
    #     neigh = KNeighborsRegressor(n_neighbors=9,  algorithm="ball_tree", p=1, weights="distance",leaf_size=60) # KNN_LABS_444
    #
    #
    #
    # elif problem=="NKL":
    #     neigh = KNeighborsRegressor(n_neighbors=10, algorithm="brute", p=1, weights="distance",leaf_size=76) # KNN_NKL_4442
    #
    #
    #
    # else: #problem=="QAP"
    #     neigh = KNeighborsRegressor(n_neighbors=8,  algorithm="auto", p=1, weights="distance",leaf_size=19) # KNN_QAP_4446

    if not os.path.exists(file_loc_general + "/surrogate_configuration"):
        with open(file_loc_general + "/surrogate_configuration", 'a') as file:
            file.write("clf:\n{}\n\nTuning Algorithm: {} ".format(
                neigh.get_params(), "smac"))

    neigh.fit(Xnew, Farchive)
    F_pred = neigh.predict(X_pred)

    return F_pred
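A hedged usage sketch with toy data (the array names are hypothetical; surrogateKNN expects one solution per column, since it transposes its inputs, writes a small configuration file under file_loc_general, and assumes the os/pickle/sklearn imports from its source file are in scope):

import numpy as np

rng = np.random.default_rng(1)
Xarchive = rng.integers(0, 2, size=(30, 20))  # 20 archived solutions, column-wise
Farchive = rng.normal(size=20)                # one fitness value per solution
Xquery = rng.integers(0, 2, size=(30, 5))     # 5 candidate solutions to score

F_pred = surrogateKNN(Xarchive, Farchive, Xquery,
                      file_loc=".", file_loc_general=".", toUpdate=False)
print(F_pred.shape)  # (5,)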
Example #3
def knn_regression(df, significant_cols, target, cat_cols, num_cols):
    ss = StandardScaler()
    ohe = OneHotEncoder(drop='first', sparse=False)  # use sparse_output=False on scikit-learn >= 1.2
    X = df[significant_cols]
    y = df[target]
    estimator = KNeighborsRegressor(n_jobs=-1)
    params = {
        'n_neighbors': np.arange(5, int(X.shape[0] * 0.1)),
        'weights': ['uniform', 'distance'],
        'p': [1, 2]
    }
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=0)
    X_train_cat = ohe.fit_transform(X_train[cat_cols])
    X_train_num = ss.fit_transform(X_train[num_cols])
    X_test_cat = ohe.transform(X_test[cat_cols])
    X_test_num = ss.transform(X_test[num_cols])
    train_data = np.c_[X_train_cat, X_train_num]
    test_data = np.c_[X_test_cat, X_test_num]
    gs = GridSearchCV(estimator, params, scoring='r2', cv=3)
    gs.fit(train_data, y_train)
    estimator = gs.best_estimator_
    r2_cv_scores = cross_val_score(estimator,
                                   train_data,
                                   y_train,
                                   scoring='r2',
                                   cv=3,
                                   n_jobs=-1)
    rmse_cv_scores = cross_val_score(estimator,
                                     train_data,
                                     y_train,
                                     scoring='neg_root_mean_squared_error',
                                     cv=3,
                                     n_jobs=-1)
    params = estimator.get_params()
    r2 = np.mean(r2_cv_scores)
    rmse = -np.mean(rmse_cv_scores)  # scores are negated RMSE, so flip the sign
    r2_variance = np.var(r2_cv_scores, ddof=1)
    rmse_variance = np.var(rmse_cv_scores, ddof=1)  # variance is already non-negative
    estimator.fit(train_data, y_train)
    y_pred = estimator.predict(test_data)
    r2_validation = r2_score(y_test, y_pred)
    rmse_validation = np.sqrt(mean_squared_error(y_test, y_pred))
    return r2, rmse, r2_variance, rmse_variance, r2_validation, rmse_validation, params
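A hedged call sketch for knn_regression; the DataFrame and every column name below are hypothetical, and the sklearn imports the function uses are assumed in scope:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({
    'area': rng.uniform(30, 150, 200),
    'rooms': rng.integers(1, 6, 200),
    'city': rng.choice(['A', 'B', 'C'], 200),
    'price': rng.uniform(1e5, 5e5, 200),
})
(r2, rmse, r2_var, rmse_var,
 r2_val, rmse_val, params) = knn_regression(df,
                                            significant_cols=['area', 'rooms', 'city'],
                                            target='price',
                                            cat_cols=['city'],
                                            num_cols=['area', 'rooms'])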
Example #4
x_train_with_names, x_test_with_names, y_train, y_test = train_test_split(
    data, target, test_size=0.10, random_state=42)

# drop column 'name'
x_train = x_train_with_names.drop('name', axis=1)
x_test = x_test_with_names.drop('name', axis=1)

# parameters
n_neighbors = 5
# building the model
neigh = KNeighborsRegressor(n_neighbors=n_neighbors)

# training the model
neigh.fit(x_train, y_train)

# looking at the methods
get_params = neigh.get_params()  # returns the parameters of the model
kneighbours = neigh.kneighbors(
    x_test.head(1), n_neighbors=n_neighbors
)  # the first array gives the distance between the new data point and the k neighbours, and the second array gives the sample number of the k neighbours
kneighbours_graph = neigh.kneighbors_graph(
    x_test.head(1), n_neighbors=n_neighbors, mode='distance'
)  # returns a sparse matrix of distances to the k neighbours of the new data point
prediction_array = neigh.predict(x_test)  # predicted test values
train_score = neigh.score(x_train,
                          y_train)  # the R^2 score on the training dataset
test_score = neigh.score(x_test,
                         y_test)  # the R^2 score on the test dataset

print(
    'The R^2 score of the train dataset is: %.3f and the R^2 score of the test dataset is: %.3f'
    % (train_score, test_score))
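Since kneighbours above is a (distances, indices) pair, it can be unpacked like this (a small illustrative addition):

distances, indices = kneighbours
print('nearest neighbour: training row %d at distance %.3f'
      % (indices[0][0], distances[0][0]))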
Example #5
class KNearestNeighbor:
    model = KNeighborsRegressor()

    def __init__(self,
                 X,
                 Y,
                 neighbors=[1, 5, 20],
                 weights='uniform',
                 algorithm='auto',
                 n_jobs=1):
        """
        X,Y:       traindata
        neighbors: number of neighbors, if list than 5-fold cross-validation is used to get optimal value
        weights:   must be one of ['uniform', 'distance'], if list than 5-fold cross-validation is used to get optimal value
        algorithm: must be one of [‘auto', ‘ball_tree', ‘kd_tree', ‘brute']
        n_jobs:    The number of parallel jobs to run for neighbors search. If -1, then the number of jobs is set to the number of CPU cores. Doesn't affect fit method.

        """
        if isinstance(neighbors, list) or isinstance(weights, list):
            neighbors, weights, optimal_err = self.cross_validation(
                X, Y, neighbors, weights, algorithm, n_jobs)

        self.model = KNeighborsRegressor(n_neighbors=neighbors,
                                         weights=weights,
                                         algorithm=algorithm,
                                         n_jobs=n_jobs)
        self.model.fit(X, Y)

    def evaluate(self, X, Y):
        Y_pred = self.predict(X)
        mse = mean_squared_error(Y, Y_pred)
        Y_pred[Y_pred > 0.5] = 1
        Y_pred[Y_pred <= 0.5] = 0
        acs = accuracy_score(Y, Y_pred)
        print("\n%s: %.4f" % ("MSE", mse))
        print("%s: %.2f%%" % ("Accuracy", acs * 100))
        return mse, acs

    def predict(self, X):
        return self.model.predict(X)

    def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
        return self.model.kneighbors(X=X,
                                     n_neighbors=n_neighbors,
                                     return_distance=return_distance)

    def cross_validation(self,
                         X,
                         Y,
                         neighbors,
                         weights,
                         algorithm,
                         n_jobs,
                         cv=5):
        if not isinstance(neighbors, list):
            neighbors = [neighbors]
        if not isinstance(weights, list):
            weights = [weights]

        cv_scores = []
        param = []
        for k in neighbors:
            for w in weights:
                self.model = KNeighborsRegressor(n_neighbors=k,
                                                 weights=w,
                                                 algorithm=algorithm,
                                                 n_jobs=n_jobs)
                scores = cross_val_score(self.model,
                                         X,
                                         Y,
                                         cv=cv,
                                         scoring='neg_mean_squared_error')
                cv_scores.append(-1 * scores.mean())
                param.append([k, w])
        #print(cv_scores)
        optimal_p = param[cv_scores.index(min(cv_scores))]
        optimal_k = optimal_p[0]
        optimal_weights = optimal_p[1]
        print("Optimal value for neighbors is: %i" % (optimal_k))
        print("Optimal weight method is: %s" % (optimal_weights))
        return optimal_k, optimal_weights, min(cv_scores)

    def get_params(self, deep=True):
        return self.model.get_params(deep=deep)
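A usage sketch for the class (synthetic data; the 0/1 targets match what evaluate thresholds at 0.5, and the sklearn imports the class relies on are assumed in scope):

import numpy as np
# assumed imports for the class itself:
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import mean_squared_error, accuracy_score

rng = np.random.default_rng(0)
X = rng.normal(size=(300, 5))
Y = (X[:, 0] + X[:, 1] > 0).astype(float)

# Passing lists for neighbors/weights triggers the built-in 5-fold cross-validation.
knn = KNearestNeighbor(X, Y, neighbors=[1, 5, 20],
                       weights=['uniform', 'distance'])
mse, acc = knn.evaluate(X, Y)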
Example #6
print('CATB Best params: ', catb_cv_model.best_params_)

# Final model
print('CATB start time: ', datetime.now())
catb_tuned = CatBoostRegressor(**catb_cv_model.best_params_).fit(X_train, y_train)  # ???
print('CATB end time: ', datetime.now())
y_pred = catb_tuned.predict(X_test)
print('CATB tuned: ', np.sqrt(mean_squared_error(y_test, y_pred)))


# KNN Modelling and Prediction

RMSE = []

knn_model = KNeighborsRegressor().fit(X_train, y_train)
knn_model.get_params()
y_pred = knn_model.predict(X_test)
print('KNN Base: ', np.sqrt(mean_squared_error(y_test, y_pred)))

for k in range(2, 22):
    knn_model = KNeighborsRegressor(n_neighbors = k).fit(X_train, y_train)
    y_pred = knn_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    RMSE.append(rmse)
    print('KNN for k = ', k, ' RMSE value: ', rmse)

# Find optimum k value with GridSearch

knn_params = {'n_neighbors': np.arange(2, 30, 1)}
knn_model = KNeighborsRegressor()
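The excerpt stops before the grid search is run; a hedged continuation mirroring the CatBoost pattern above (the name knn_cv_model and the cv=10 choice are assumptions, and GridSearchCV is assumed imported):

knn_cv_model = GridSearchCV(knn_model, knn_params, cv=10).fit(X_train, y_train)
print('KNN Best params: ', knn_cv_model.best_params_)

knn_tuned = KNeighborsRegressor(**knn_cv_model.best_params_).fit(X_train, y_train)
y_pred = knn_tuned.predict(X_test)
print('KNN tuned: ', np.sqrt(mean_squared_error(y_test, y_pred)))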
Example #7
# contains the linear models, while the neighbors module holds the methods
# based on nearest neighbors.
from sklearn.neighbors import KNeighborsRegressor
# Import the knn algorithm from sklearn. Working with machine-learning
# algorithms in this library consists of three steps:

# Create the object that implements the algorithm.
# Call fit: train the model on the training subset.
# Call predict: get predictions on the test set.
knn = KNeighborsRegressor(n_neighbors=5, weights='uniform', p=2)  # p selects
# the distance metric: p=2 is the Euclidean distance, p=1 is the Manhattan one
# KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
#                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
#                     weights='uniform')
knn.fit(X_train, y_train)  # train the model
print(knn.get_params())
predictions = knn.predict(X_test)  # get the predictions
# Let's compute the metric; scikit-learn has the corresponding function! We
# compute the mean squared error, since we are solving a regression task.
from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_test, predictions))
# Let's try to do better! Our algorithm has many hyperparameters: the number
# of neighbors, the metric parameters, and the weights. Let's run a grid
# search over the hyperparameters: the algorithm will try every combination,
# compute the metric for each set, and return the best set.
from sklearn.model_selection import GridSearchCV
grid_searcher = GridSearchCV(KNeighborsRegressor(),
                             param_grid={'n_neighbors': range(1, 40, 2),
                                         'weights': ['uniform', 'distance'],
                                         'p': [1, 2, 3]},
                             cv=5)
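The searcher is defined but never fitted in this excerpt; a hedged continuation, assuming the X_train/y_train/X_test/y_test from earlier in the example:

grid_searcher.fit(X_train, y_train)
print(grid_searcher.best_params_)
best_predictions = grid_searcher.best_estimator_.predict(X_test)
print(mean_squared_error(y_test, best_predictions))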
Example #8
def surrogateKNN(Xarchive,
                 Farchive,
                 X,
                 file_loc,
                 file_loc_general,
                 toUpdate,
                 first_iter=False,
                 problem='Pump',
                 knn_parm=None):
    Xnew = Xarchive.T
    X_pred = X.T
    SMAC = False
    if SMAC:
        with open("/home/naamah/Documents/CatES/result_All/X1.p", "wb") as fp:
            pickle.dump(Xnew, fp)
        with open("/home/naamah/Documents/CatES/result_All/F1.p", "wb") as fp:
            pickle.dump(Farchive, fp)

        anf = smac_KNN.main_loop(problem)

        print("SMAC {}".format(anf))
        sys.exit("Error message")

    if knn_parm is None:

        if problem == "Pump":
            #neigh = KNeighborsRegressor(n_neighbors=10,  algorithm="ball_tree", p=1, weights="distance",leaf_size=1)
            neigh = KNeighborsRegressor(n_neighbors=10,
                                        algorithm="ball_tree",
                                        p=3,
                                        weights="distance",
                                        leaf_size=10)  # R

        elif problem == "NKL":
            neigh = KNeighborsRegressor(n_neighbors=9,
                                        algorithm="auto",
                                        p=1,
                                        weights="distance")

        else:  #problem=="QAP"
            # neigh = KNeighborsRegressor(n_neighbors=10,  algorithm="ball_tree", p=1, weights="distance",leaf_size=1)
            neigh = KNeighborsRegressor(n_neighbors=10,
                                        algorithm="auto",
                                        p=3,
                                        weights="uniform",
                                        leaf_size=98)  # R

    else:
        neigh = KNeighborsRegressor(n_neighbors=knn_parm.get("n_neighbors"),
                                    algorithm=knn_parm.get("algorithm"),
                                    p=knn_parm.get("p"),
                                    weights=knn_parm.get("weights"),
                                    leaf_size=knn_parm.get("leaf_size"))  # R

    if not os.path.exists(file_loc_general + "/surrogate_configuration"):
        with open(file_loc_general + "/surrogate_configuration", 'a') as file:
            file.write("clf:\n{}\n\nTuning Algorithm: {} ".format(
                neigh.get_params(), "irace"))

    neigh.fit(Xnew, Farchive)
    F_pred = neigh.predict(X_pred)

    return F_pred
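A hedged sketch of the knn_parm path (the dict keys mirror the .get() calls above; the toy arrays are hypothetical, as in the Example #2 sketch):

import numpy as np

rng = np.random.default_rng(2)
Xarchive = rng.integers(0, 2, size=(30, 20))
Farchive = rng.normal(size=20)
Xquery = rng.integers(0, 2, size=(30, 5))

params = {"n_neighbors": 7, "algorithm": "kd_tree", "p": 2,
          "weights": "distance", "leaf_size": 30}
F_pred = surrogateKNN(Xarchive, Farchive, Xquery,
                      file_loc=".", file_loc_general=".",
                      toUpdate=False, knn_parm=params)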
Example #9
plt.plot(list(evres['validation_0']['mae']))
plt.plot(list(evres['validation_1']['mae']))
plt.title('Model mae')
plt.ylabel('mae')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
#plt.savefig("Keras_NN_Accuracy.png")
plt.show()
plt.clf()

#print(trainbst.evals_result())
print("####################KNN")
neigh = KNeighborsRegressor(n_neighbors=10, weights='distance', n_jobs=-1)
valiN = cross_val_score(neigh, Xvalid, Yvalid, cv=kf, verbose=1, n_jobs=-1)
print(valiN.mean())
neigh.fit(Xtrain, Ytrain)
print(neigh.get_params())
print(neigh.score(Xvalid, Yvalid))
# plt.plot(list(evres['validation_0']['rmse']))
# plt.plot(list(evres['validation_1']['rmse']))
# plt.title('Model rmse')
# plt.ylabel('rmse')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Test'], loc='upper left')
# #plt.savefig("Keras_NN_Accuracy.png")
# plt.show()
# plt.clf()
"""
print("##################SVM")
s=SVR(verbose=True)
s.fit(Xtrain,Ytrain)
print(s.score(Xvalid,Yvalid))
Example #10
class KNN(object):
    def __init__(self, task_type="cla", module_type="performance", **params):

        assert task_type in ["cla", "reg"]  # two task types
        assert module_type in ["balance", "debug", "performance",
                               None]  # three performance modes
        self.module_type = module_type

        if self.module_type == "debug":
            params["n_jobs"] = 1
        elif self.module_type == "performance":  # 性能模型
            params["n_jobs"] = cpu_count()  # cpu核心数
        elif self.module_type == "balance":  # 均衡模型
            params["n_jobs"] = cpu_count() // 2
        else:
            params["n_jobs"] = None

        self.task_type = task_type
        # weights: one of {"uniform", "distance", None}; default is "uniform".
        #   "uniform" gives all neighbors equal weight; "distance" weights
        #   each neighbor by the inverse of its distance.
        # algorithm: one of {"auto", "ball_tree", "kd_tree", "brute", None}.
        #   "ball_tree" uses the BallTree algorithm, "kd_tree" uses the
        #   KDTree algorithm, and "brute" uses brute-force search.

        # p: int, default 2; the Minkowski power parameter.
        # p=1 corresponds to manhattan_distance (l1), p=2 to euclidean_distance (l2).
        # For arbitrary p, minkowski_distance (l_p) is used.

        if self.task_type == "cla":
            self.model = KNeighborsClassifier(
                n_neighbors=params.get("n_neighbors", 5),
                weights=params.get("weights", 'uniform'),
                algorithm=params.get("algorithm", 'auto'),
                leaf_size=params.get("leaf_size", 30),  # 叶子大小
                p=params.get("p", 2),
                metric=params.get("metric", 'minkowski'),
                metric_params=params.get("metric_params", None),
                n_jobs=params.get("n_jobs", None)  # 并行数
            )

        else:
            self.model = KNeighborsRegressor(
                n_neighbors=params.get("n_neighbors", 5),
                weights=params.get("weights", 'uniform'),
                algorithm=params.get("algorithm", 'auto'),
                leaf_size=params.get("leaf_size", 30),
                p=params.get("p", 2),
                metric=params.get("metric", 'minkowski'),
                metric_params=params.get("metric_params", None),
                n_jobs=params.get("n_jobs", None))

    def fit(self, x, y=None):
        self.model.fit(X=x, y=y)

    def get_params(self):
        return self.model.get_params(deep=True)

    def set_params(self, params):
        self.model.set_params(**params)

    def predict(self, x):
        return self.model.predict(X=x)

    def predict_proba(self, x):
        if self.task_type == "cla":
            return self.model.predict_proba(X=x)
        else:
            raise ValueError("predict_proba is not available for regression tasks")

    def get_score(self, x, y, sample_weight):
        return self.model.score(X=x, y=y, sample_weight=sample_weight)

    def search_kneighbors(self,
                          x=None,
                          n_neighbors=None,
                          return_distance=True):  # find the K nearest neighbors
        return self.model.kneighbors(X=x,
                                     n_neighbors=n_neighbors,
                                     return_distance=return_distance)

    def get_kneighbors_graph(self,
                             x=None,
                             n_neighbors=None,
                             mode='connectivity'):  # get the nearest-neighbor graph
        """
        :param x:
        :param n_neighbors:
        :param mode: "distance","connectivity"
        :return:
        """
        return self.model.kneighbors_graph(X=x,
                                           n_neighbors=n_neighbors,
                                           mode=mode)
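A usage sketch for the wrapper; based on the names it uses, it presumably needs KNeighborsClassifier/KNeighborsRegressor from sklearn.neighbors and cpu_count from multiprocessing (an assumption):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 4))
y = X.sum(axis=1)

model = KNN(task_type="reg", module_type="debug", n_neighbors=7)
model.fit(X, y)
print(model.predict(X[:5]))
print(model.get_params()["n_neighbors"])  # 7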