Example #1
    def training_model(self, code, data, features):
        X = data[features]
        y = data['next_direction']

        # normalization
        X = preprocessing.scale(X)

        # apply the saved PCA transform
        pca = PCAModel(self.module_name).load(code)
        X = pca.transform(X)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=.3,
                                                            shuffle=False)

        parameters_grid = [{
            'learning_rate': [0.05, 0.1, 0.3],
            'max_depth': range(2, 8, 2),
            'subsample': [0.7],
            'min_child_weight': range(1, 6, 2)
        }]

        gs_search = GridSearchCV(estimator=xgb.XGBClassifier(n_estimators=100,
                                                             random_state=10),
                                 param_grid=parameters_grid,
                                 n_jobs=-1)

        gs_result = gs_search.fit(X_train, y_train)

        logger.debug(gs_search.best_params_)
        logger.debug("XGBoost classifier's best score: %.4f" %
                     gs_result.best_score_)  # best cross-validation score

        xgb_classifier = gs_search.best_estimator_
        # retrain on the training split
        xgb_classifier.fit(X_train, y_train)

        # score the model on the held-out test data
        y_test_pred = xgb_classifier.predict(X_test)

        # accuracy on the test set
        test_score = accuracy_score(y_test, y_test_pred)
        logger.debug('test score: %.4f' % test_score)

        # retrain on the full dataset
        xgb_classifier.fit(X, y)

        # log the training run
        k_data_model_log_dao.insert(code=code,
                                    name=self.model_name,
                                    best_estimator=gs_search.best_estimator_,
                                    train_score=gs_search.best_score_,
                                    test_score=test_score)
        # persist the model
        joblib.dump(
            xgb_classifier,
            self.get_model_path(code, self.module_name, self.model_name))
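
A minimal, self-contained sketch of the grid-search pattern above, with synthetic data from make_classification standing in for the real feature pipeline (PCAModel, logger, and k_data_model_log_dao are project-specific and omitted):

import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split

X, y = make_classification(n_samples=500, n_features=10, random_state=10)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                    shuffle=False)

param_grid = {'learning_rate': [0.05, 0.1, 0.3], 'max_depth': [2, 4, 6]}
gs = GridSearchCV(xgb.XGBClassifier(n_estimators=100, random_state=10),
                  param_grid, n_jobs=-1)
gs.fit(X_train, y_train)

# best_estimator_ is already refit on the whole training split
# (refit=True is the GridSearchCV default)
print(gs.best_params_, '%.4f' % gs.best_score_)
print('test accuracy: %.4f' % accuracy_score(
    y_test, gs.best_estimator_.predict(X_test)))

Because GridSearchCV refits best_estimator_ on the full training split by default, the explicit xgb_classifier.fit(X_train, y_train) call in the example above is redundant.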
Example #2
    def training_model(self, code, data, features):
        X = data[features]
        y = data['next_direction']

        # normalization
        X = preprocessing.scale(X)

        # apply the saved PCA transform
        pca = PCAModel(self.module_name).load(code)
        X = pca.transform(X)

        # X is already scaled and PCA-transformed above, so the splits need no
        # further normalization (re-scaling train and test separately would
        # leak different statistics into each)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

        # feature count after the PCA transform (may differ from len(features))
        input_dim_len = X_train.shape[1]

        sequential_model = Sequential()

        sequential_model.add(
            Dense(512, input_dim=input_dim_len, activation='relu'))
        sequential_model.add(Dropout(0.5))
        sequential_model.add(Dense(128, activation='relu'))
        sequential_model.add(Dropout(0.5))

        # sigmoid keeps the output in [0, 1], as binary_crossentropy expects
        sequential_model.add(Dense(1, activation='sigmoid'))
        sequential_model.compile(optimizer='sgd',
                                 loss='binary_crossentropy',
                                 metrics=['accuracy'])

        # training performance
        sequential_model.fit(X_train, y_train, epochs=10, batch_size=128)
        train_model_score = sequential_model.evaluate(X_train,
                                                      y_train,
                                                      batch_size=128)

        # test performance
        test_model_score = sequential_model.evaluate(X_test,
                                                     y_test,
                                                     batch_size=128)
        logger.debug('test model score: %s' % test_model_score)

        # evaluate on the scaled, PCA-transformed full dataset, matching the
        # representation the network was trained on
        full_model_score = sequential_model.evaluate(X, y)
        logger.debug('full model score: %s' % full_model_score)

        # log the training run
        k_data_60m_model_log_dao.insert(code=code,
                                        name=self.model_name,
                                        best_estimator=None,
                                        train_score=train_model_score[1],
                                        test_score=test_model_score[1],
                                        desc="full_model_score:%s" %
                                        full_model_score[1])
        # persist the model in HDF5 (.h5) format
        sequential_model.save(
            self.get_model_path(code, self.module_name, self.model_name, 'h5'))
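
A minimal sketch of the same Sequential architecture on random data, assuming the Keras 2 API used in the example. Note the sigmoid head: binary_crossentropy expects outputs in [0, 1], which is why the original tanh activation was replaced above.

import numpy as np
from keras.layers import Dense, Dropout
from keras.models import Sequential

X = np.random.rand(256, 10)
y = np.random.randint(0, 2, size=(256,))

model = Sequential([
    Dense(512, input_dim=10, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),   # probability of the positive class
])
model.compile(optimizer='sgd', loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(X, y, epochs=2, batch_size=128, verbose=0)
loss, acc = model.evaluate(X, y, batch_size=128, verbose=0)
print('loss %.4f, accuracy %.4f' % (loss, acc))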
Example #3
    def training_model(self, code, data, features):
        X = data[features]
        y = data['next_direction']

        # normalization
        X = preprocessing.scale(X)

        # apply the saved PCA transform
        pca = PCAModel(self.module_name).load(code)
        X = pca.transform(X)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=.3,
                                                            shuffle=False)

        tuned_parameters = [{
            'kernel': ['rbf'],
            'gamma': [1e-3, 1e-4],
            'C': [1, 10, 100, 1000]
        }]

        # # t-SNE embedding (disabled alternative)
        # X_train = TSNE(n_components=2, learning_rate=100).fit_transform(X_train)
        # X_test = TSNE(n_components=2, learning_rate=100).fit_transform(X_test)

        # grid search for the best hyperparameters
        grid = GridSearchCV(svm.SVC(), tuned_parameters, cv=None, n_jobs=-1)
        grid.fit(X_train, y_train)

        logger.debug(grid.best_estimator_)  # best estimator found
        logger.debug("Support Vector Classifier's best score: %.4f" %
                     grid.best_score_)  # best cross-validation score

        support_vector_classifier = grid.best_estimator_
        # retrain on the training split
        support_vector_classifier.fit(X_train, y_train)

        # score the model on the held-out test data
        y_test_pred = support_vector_classifier.predict(X_test)

        # accuracy on the test set
        test_score = accuracy_score(y_test, y_test_pred)
        logger.debug('test score: %.4f' % test_score)

        # retrain on the full dataset
        support_vector_classifier.fit(X, y)

        # log the training run
        k_data_model_log_dao.insert(code=code,
                                    name=self.model_name,
                                    best_estimator=grid.best_estimator_,
                                    train_score=grid.best_score_,
                                    test_score=test_score)

        # persist the model
        joblib.dump(
            support_vector_classifier,
            self.get_model_path(code, self.module_name, self.model_name))
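
The same pattern, self-contained on synthetic data (cv=None falls back to the library default, 5-fold in current scikit-learn):

from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=300, n_features=8, random_state=10)
grid = GridSearchCV(svm.SVC(),
                    [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                      'C': [1, 10, 100, 1000]}],
                    cv=None, n_jobs=-1)
grid.fit(X, y)
print(grid.best_estimator_, '%.4f' % grid.best_score_)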
Example #4
    def training_model(self, code, data, features):

        X = data[features]
        y = data['next_direction']

        X = preprocessing.scale(X)

        pca = PCAModel(self.module_name).load(code)
        X = pca.transform(X)

        # split the data 70% train / 30% test
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=.3,
                                                            shuffle=False)

        # cross-validate to find suitable hyperparameters: penalty and C
        tuned_parameters = {
            'penalty': ['l1', 'l2'],
            'C': [0.001, 0.01, 0.1, 1, 10, 100]
        }

        # grid-search training
        grid = GridSearchCV(LogisticRegression(), tuned_parameters, cv=None)
        grid.fit(X_train, y_train)
        logger.debug(grid.best_estimator_)  # best estimator found
        logger.debug("logistic regression's best score: %.4f" %
                     grid.best_score_)  # best cross-validation score

        logistic_regression = grid.best_estimator_
        # retrain on the training split
        logistic_regression.fit(X_train, y_train)

        # score the model on the held-out test data
        y_test_pred = logistic_regression.predict(X_test)

        # test_score = logistic_regression.score(X_test, y_test)

        # accuracy on the test set
        test_score = accuracy_score(y_test, y_test_pred)
        logger.debug('test score: %.4f' % test_score)

        # retrain on the full dataset
        logistic_regression.fit(X, y)

        # log the training run
        k_data_model_log_dao.insert(code=code,
                                    name=self.model_name,
                                    best_estimator=grid.best_estimator_,
                                    train_score=grid.best_score_,
                                    test_score=test_score)
        # persist the model
        joblib.dump(
            logistic_regression,
            self.get_model_path(code, self.module_name, self.model_name))
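
A standalone sketch of the penalty/C grid above, on synthetic data. One caveat worth knowing: recent scikit-learn defaults to the lbfgs solver, which rejects penalty='l1', so liblinear (or saga) is required for this grid to run as written:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=300, n_features=8, random_state=10)
grid = GridSearchCV(LogisticRegression(solver='liblinear'),
                    {'penalty': ['l1', 'l2'],
                     'C': [0.001, 0.01, 0.1, 1, 10, 100]})
grid.fit(X, y)
print(grid.best_params_, '%.4f' % grid.best_score_)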
Example #5
    def predict(self, code, data):
        model_path = self.get_model_path(code, self.module_name, self.model_name)

        if not os.path.exists(model_path):
            logger.error('model not found, code: %s' % code)
            return

        # caveat: preprocessing.scale standardizes whatever batch it receives;
        # persisting the scaler fitted at training time would be safer,
        # especially for single-row inputs
        X = preprocessing.scale(data)
        pca = PCAModel(self.module_name).load(code)
        X = pca.transform(X)

        ridge_regression_model = joblib.load(model_path)

        y_pred = ridge_regression_model.predict(X)

        return int(y_pred[0])
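
A sketch of the joblib persist/load round trip this predict() relies on (the path and the Ridge model here are illustrative). The key invariant is that prediction inputs must pass through the same scale + PCA transform fitted at training time:

import joblib
import numpy as np
from sklearn.linear_model import Ridge

# training side: fit and persist
model = Ridge().fit(np.random.rand(100, 5), np.random.rand(100))
joblib.dump(model, '/tmp/ridge_demo.pkl')

# predict side: load and score a new row
loaded = joblib.load('/tmp/ridge_demo.pkl')
print(loaded.predict(np.random.rand(1, 5)))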
Example #6
    def training_model(self, code, data, features, *args):
        X = data[features]
        # if not args:
        y = data['close']

        # normalization
        X = preprocessing.scale(X)

        # apply the saved PCA transform
        pca = PCAModel(self.module_name).load(code)
        X = pca.transform(X)

        # random_state has no effect here because shuffle=False
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=.3,
                                                            shuffle=False,
                                                            random_state=10)

        LR_model = linear_model.LinearRegression()

        LR_model.fit(X_train, y_train)

        test_score = LR_model.score(X_test, y_test)

        y_pred = LR_model.predict(X_test)

        mse = metrics.mean_squared_error(y_test, y_pred)

        mse = '%.4e' % mse

        logger.debug('mse: %s' % mse)

        # full data training
        LR_model.fit(X, y)

        # log the training run; note the test-set R^2 is stored as train_score
        # and the formatted MSE as test_score
        k_data_model_log_dao.insert(code=code,
                                    name=self.model_name,
                                    best_estimator=LR_model,
                                    train_score=test_score,
                                    test_score=mse)

        # persist the model
        joblib.dump(
            LR_model,
            self.get_model_path(code, self.module_name, self.model_name))
Example #7
    def predict(self, code, data):

        model_path = self.get_model_path(code, self.module_name,
                                         self.model_name, 'h5')

        if not os.path.exists(model_path):
            logger.error('model not found, code: %s' % code)
            return

        X = preprocessing.scale(data)

        pca = PCAModel(self.module_name).load(code)
        X = pca.transform(X)

        sequential_model = load_model(model_path)

        y_pred = sequential_model.predict(X)

        # the network outputs a probability; round at 0.5 rather than
        # truncating with int(), which maps everything below 1.0 to 0
        return int(round(float(y_pred[0][0])))
Example #8
    def training_model(self, code, data, features, *args):
        X = data[features]
        y = data['close']

        # normalization
        X = preprocessing.scale(X)

        # apply the saved PCA transform
        pca = PCAModel(self.module_name).load(code)
        X = pca.transform(X)

        # random_state has no effect here because shuffle=False
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=.3,
                                                            shuffle=False,
                                                            random_state=10)

        tuned_parameters = [
            {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}
        ]

        svr_model = GridSearchCV(svm.SVR(), tuned_parameters, n_jobs=-1)

        svr_model.fit(X_train, y_train)

        test_score = svr_model.score(X_test, y_test)

        y_pred = svr_model.predict(X_test)

        mse = metrics.mean_squared_error(y_test, y_pred)

        mse = '%.4e' % mse

        # full-dataset training; note this reruns the whole grid search, since
        # svr_model is the GridSearchCV wrapper rather than best_estimator_
        svr_model.fit(X, y)

        # log the training run
        k_data_model_log_dao.insert(code=code,
                                    name=self.model_name,
                                    best_estimator=svr_model,
                                    train_score=test_score,
                                    test_score=mse)

        # persist the model
        joblib.dump(
            svr_model,
            self.get_model_path(code, self.module_name, self.model_name))
Example #9
    def training_model(self, code, data, features, *args):
        X = data[features]
        y = data['close']

        # normalization
        X = preprocessing.scale(X)

        # apply the saved PCA transform
        pca = PCAModel(self.module_name).load(code)
        X = pca.transform(X)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, shuffle=False)

        ridge_model = linear_model.RidgeCV(alphas=[10, 1, 0.5, 0.25, 0.1, 0.005, 0.0025, 0.001])

        ridge_model.fit(X_train, y_train)

        test_score = ridge_model.score(X_test, y_test)

        y_pred = ridge_model.predict(X_test)

        mse = metrics.mean_squared_error(y_test, y_pred)

        mse = '%.4e' % mse

        # full data set training

        ridge_model.fit(X, y)

        # log the training run
        k_data_model_log_dao.insert(code=code,
                                    name=self.model_name,
                                    best_estimator=ridge_model,
                                    train_score=test_score,
                                    test_score=mse)

        # persist the model
        joblib.dump(
            ridge_model,
            self.get_model_path(code, self.module_name, self.model_name))
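
A minimal standalone sketch of the RidgeCV pattern above: the alpha grid is searched internally by efficient leave-one-out cross-validation, and alpha_ exposes the winner. The synthetic data here is illustrative only:

import numpy as np
from sklearn import linear_model

X = np.random.rand(200, 5)
y = X @ np.array([1.0, -2.0, 0.5, 0.0, 3.0]) + 0.1 * np.random.randn(200)

ridge = linear_model.RidgeCV(
    alphas=[10, 1, 0.5, 0.25, 0.1, 0.005, 0.0025, 0.001])
ridge.fit(X, y)
print('chosen alpha:', ridge.alpha_)   # selected regularization strength
print('R^2: %.4f' % ridge.score(X, y))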
Example #10
    def training_model(self, code, data, features):
        X = data[features]
        y = data['next_direction']

        # normalization
        X = preprocessing.scale(X)

        # apply the saved PCA transform
        pca = PCAModel(self.module_name).load(code)
        X = pca.transform(X)

        # split the already scaled and PCA-transformed X/y, consistent with
        # the full-data refit below
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=.3,
                                                            shuffle=False)

        rfc_model = RandomForestClassifier(max_features='sqrt',
                                           max_depth=14,
                                           oob_score=True)

        tuned_parameter = {
            'n_estimators': [50],
            'min_samples_leaf': range(10, 60, 10),
            'min_samples_split': range(20, 100, 20)
        }

        gs_result = GridSearchCV(estimator=rfc_model,
                                 param_grid=tuned_parameter,
                                 scoring='roc_auc',
                                 cv=None,
                                 n_jobs=-1)

        gs_result.fit(X_train, y_train)

        logger.debug('auc: %s' % gs_result.best_score_)

        min_samples_leaf = gs_result.best_params_['min_samples_leaf']
        min_samples_split = gs_result.best_params_['min_samples_split']

        rf1 = RandomForestClassifier(n_estimators=50,
                                     min_samples_leaf=min_samples_leaf,
                                     min_samples_split=min_samples_split,
                                     max_features='sqrt',
                                     max_depth=3,
                                     oob_score=True,
                                     n_jobs=-1,
                                     random_state=10)

        rf1.fit(X_train, y_train)

        logger.debug('oob: %s' % rf1.oob_score_)

        # accuracy on the test set
        test_score = rf1.score(X_test, y_test)
        logger.debug('test score: %.4f' % test_score)

        # retrain on the full dataset
        rf1.fit(X, y)

        rf1_str = "RandomForestClassifier(n_estimators=50, min_samples_leaf=%s" \
                  ",min_samples_split=%s, max_features='sqrt',max_depth=3, " \
                  "oob_score=True, n_jobs=-1, random_state=10)" % (min_samples_leaf, min_samples_split)

        # log the training run
        k_data_model_log_dao.insert(code=code,
                                    name=self.model_name,
                                    best_estimator=rf1_str,
                                    train_score=gs_result.best_score_,
                                    test_score=test_score,
                                    desc="oob_score_:%s" % rf1.oob_score_)

        # persist the model
        joblib.dump(
            rf1, self.get_model_path(code, self.module_name, self.model_name))
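
A small standalone sketch of the out-of-bag idea used above: with oob_score=True, each tree is scored on the bootstrap samples it never saw, so oob_score_ gives a held-out accuracy estimate without a separate validation split. Synthetic data stands in for the real features:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=500, n_features=10, random_state=10)
rf = RandomForestClassifier(n_estimators=50, max_features='sqrt',
                            oob_score=True, random_state=10)
rf.fit(X, y)
print('OOB accuracy: %.4f' % rf.oob_score_)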