def training_k_data():
    df = ts.get_hs300s()
    for code in df['code'].values:
        try:
            logger.debug('begin training mode, code:%s' % code)
            data, features = k_data_60m_dao.get_k_data_with_features(
                code, '2015-01-01', datetime.now().strftime("%Y-%m-%d"))

            pca = PCAModel(MODULE_NAME)
            lr = LogisticRegressionClassifier()
            svc = SupportVectorClassifier()
            rf = RandomForestClassifierModel()
            xgb = XGBoostClassier()
            ann = SequantialNeuralClassifier()

            pca.training_model(code, data, features)
            lr.training_model(code, data, features)
            svc.training_model(code, data, features)
            rf.training_model(code, data, features)
            xgb.training_model(code, data, features)
            ann.training_model(code, data, features)

            logger.debug('training mode end, code:%s' % code)
        except Exception as e:
            logger.error("training k data error, code:%s, error:%s" %
                         (code, repr(e)))

def training_k_data(start, end):
    df = stock_pool_dao.get_list()
    codes = df['code'].values[start:end]
    for code in codes:
        try:
            logger.debug('begin training mode, code:%s' % code)
            data, features = k_data_dao.get_k_data_with_features(
                code, '2015-01-01', datetime.now().strftime("%Y-%m-%d"))

            pca = PCAModel('k_data')
            lr = LogisticRegressionClassifier()
            svc = SupportVectorClassifier()
            rf = RandomForestClassifierModel()
            xgb = XGBoostClassier()
            # ann = SequantialNeuralClassifier()

            pca.training_model(code, data, features)
            lr.training_model(code, data, features)
            svc.training_model(code, data, features)
            rf.training_model(code, data, features)
            xgb.training_model(code, data, features)
            # ann.training_model(code, data, features)

            logger.debug('training mode end, code:%s' % code)
        except Exception as e:
            logger.error("training k data error, code:%s, error:%s" %
                         (code, repr(e)))

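# A minimal sketch of how the batched driver above might be invoked, splitting
# the stock pool into fixed-size chunks so an interrupted run can be resumed
# part-way through the pool. The chunk size and the __main__ call site are
# illustrative assumptions, not part of the original code.
if __name__ == '__main__':
    CHUNK_SIZE = 50  # assumed batch size
    pool_size = len(stock_pool_dao.get_list())
    for start in range(0, pool_size, CHUNK_SIZE):
        # slicing past the end of the code list is safe here
        training_k_data(start, start + CHUNK_SIZE)
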
def training_model(self, code, data, features):
    X = data[features]
    y = data['next_direction']

    # normalization
    X = preprocessing.scale(X)
    # PCA dimensionality reduction
    pca = PCAModel(self.module_name).load(code)
    X = pca.transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                        shuffle=False)

    parameters_grid = [{
        'learning_rate': [0.05, 0.1, 0.3],
        'max_depth': range(2, 8, 2),
        'subsample': [0.7],
        'min_child_weight': range(1, 6, 2)
    }]
    gs_search = GridSearchCV(estimator=xgb.XGBClassifier(n_estimators=100,
                                                         random_state=10),
                             param_grid=parameters_grid,
                             n_jobs=-1)
    gs_result = gs_search.fit(X_train, y_train)
    logger.debug(gs_search.best_params_)
    # best cross-validation score from the grid search
    logger.debug("XGBoost Classifier's best score: %.4f" % gs_result.best_score_)

    gs_search_best = gs_search.best_estimator_
    xgb_classifier = gs_search_best
    # retrain on the training data
    xgb_classifier.fit(X_train, y_train)
    # evaluate the model on the held-out test data
    y_test_pred = xgb_classifier.predict(X_test)
    test_score = accuracy_score(y_test, y_test_pred)
    logger.debug('test score: %.4f' % test_score)

    # retrain on the full data set before persisting
    xgb_classifier.fit(X, y)

    # log the training run
    k_data_model_log_dao.insert(code=code,
                                name=self.model_name,
                                best_estimator=gs_search.best_estimator_,
                                train_score=gs_search.best_score_,
                                test_score=test_score)
    # persist the model
    joblib.dump(xgb_classifier,
                self.get_model_path(code, self.module_name, self.model_name))

def training_model(self, code, data, features):
    X = data[features]
    y = data['next_direction']

    # normalization
    X = preprocessing.scale(X)
    # PCA dimensionality reduction
    pca = PCAModel(self.module_name).load(code)
    X = pca.transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                        shuffle=False)

    tuned_parameters = [{
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000]
    }]

    # # t-SNE projection (disabled)
    # X_train = TSNE(n_components=2, learning_rate=100).fit_transform(X_train)
    # X_test = TSNE(n_components=2, learning_rate=100).fit_transform(X_test)

    # grid search for the best hyper-parameters
    grid = GridSearchCV(svm.SVC(), tuned_parameters, cv=None, n_jobs=-1)
    grid.fit(X_train, y_train)
    logger.debug(grid.best_estimator_)
    # best cross-validation score from the grid search
    logger.debug("Support Vector Classifier's best score: %.4f" %
                 grid.best_score_)

    support_vector_classifier = grid.best_estimator_
    # retrain on the training data
    support_vector_classifier.fit(X_train, y_train)
    # evaluate the model on the held-out test data
    y_test_pred = support_vector_classifier.predict(X_test)
    test_score = accuracy_score(y_test, y_test_pred)
    logger.debug('test score: %.4f' % test_score)

    # retrain on the full data set before persisting
    support_vector_classifier.fit(X, y)

    # log the training run
    k_data_model_log_dao.insert(code=code,
                                name=self.model_name,
                                best_estimator=grid.best_estimator_,
                                train_score=grid.best_score_,
                                test_score=test_score)
    # persist the model
    joblib.dump(support_vector_classifier,
                self.get_model_path(code, self.module_name, self.model_name))

def training_model(self, code, data, features):
    X = data[features]
    y = data['next_direction']

    # normalization
    X = preprocessing.scale(X)
    # PCA dimensionality reduction
    pca = PCAModel(self.module_name).load(code)
    X = pca.transform(X)

    # X is already scaled above, so the split halves need no second pass
    # through preprocessing.scale()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

    # take the input width after the PCA transform, so it matches X_train
    # (len(features) would be wrong once PCA drops components)
    input_dim_len = X_train.shape[1]

    sequantial_model = Sequential()
    sequantial_model.add(Dense(512, input_dim=input_dim_len, activation='relu'))
    sequantial_model.add(Dropout(0.5))
    sequantial_model.add(Dense(128, activation='relu'))
    sequantial_model.add(Dropout(0.5))
    # sigmoid keeps the output in [0, 1], as binary_crossentropy expects
    # (the original tanh output would fall outside that range)
    sequantial_model.add(Dense(1, activation='sigmoid'))

    sequantial_model.compile(optimizer='sgd',
                             loss='binary_crossentropy',
                             metrics=['accuracy'])

    # training performance
    sequantial_model.fit(X_train, y_train, epochs=10, batch_size=128)
    train_model_score = sequantial_model.evaluate(X_train, y_train,
                                                  batch_size=128)

    # test performance
    test_model_score = sequantial_model.evaluate(X_test, y_test, batch_size=128)
    logger.debug('test model score: %s' % test_model_score)

    # evaluate on the full (scaled, PCA-transformed) data set
    full_model_score = sequantial_model.evaluate(X, y)
    logger.debug('full model score: %s' % full_model_score)

    # log the training run
    k_data_60m_model_log_dao.insert(code=code,
                                    name=self.model_name,
                                    best_estimator=None,
                                    train_score=train_model_score[1],
                                    test_score=test_model_score[1],
                                    desc="full_model_score:%s" %
                                         full_model_score[1])
    # persist the model in h5 format
    sequantial_model.save(
        self.get_model_path(code, self.module_name, self.model_name, 'h5'))

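# The original tanh output layer hints that next_direction may be encoded as
# {-1, 1}; binary_crossentropy with a sigmoid output expects {0, 1} labels
# instead. A hedged sketch of the remap that would then be needed before
# fitting -- whether the DAO actually emits {-1, 1} is an assumption here.
def to_binary_labels(y):
    # map {-1, 1} (or already-binary {0, 1}) direction labels onto {0, 1}
    return (y > 0).astype(int)

# e.g. y = to_binary_labels(data['next_direction'])
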
def test_training(self):
    data, features = k_data_dao.get_k_data_with_features(
        '600196', '2015-01-01', datetime.now().strftime("%Y-%m-%d"))
    pca_model = PCAModel('k_data')
    pca_model.training_model(code='600196', data=data, features=features)

def training_model(self, code, data, features):
    X = data[features]
    y = data['next_direction']

    X = preprocessing.scale(X)
    pca = PCAModel(self.module_name).load(code)
    X = pca.transform(X)

    # split the data: 30% for testing, 70% for training
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                        shuffle=False)

    # cross-validate to find suitable hyper-parameters: penalty, C
    tuned_parameters = {
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100]
    }
    # grid search; liblinear supports both the l1 and l2 penalties
    grid = GridSearchCV(LogisticRegression(solver='liblinear'),
                        tuned_parameters, cv=None)
    grid.fit(X_train, y_train)
    logger.debug(grid.best_estimator_)
    # best cross-validation score from the grid search
    logger.debug("logistic regression's best score: %.4f" % grid.best_score_)

    logistic_regression = grid.best_estimator_
    # retrain on the training data
    logistic_regression.fit(X_train, y_train)
    # evaluate the model on the held-out test data
    y_test_pred = logistic_regression.predict(X_test)
    # test_score = logistic_regression.score(X_test, y_test)
    test_score = accuracy_score(y_test, y_test_pred)
    logger.debug('test score: %.4f' % test_score)

    # retrain on the full data set before persisting
    logistic_regression.fit(X, y)

    # log the training run
    k_data_model_log_dao.insert(code=code,
                                name=self.model_name,
                                best_estimator=grid.best_estimator_,
                                train_score=grid.best_score_,
                                test_score=test_score)
    # persist the model
    joblib.dump(logistic_regression,
                self.get_model_path(code, self.module_name, self.model_name))

def test_training(self):
    code = '600276'
    data, features = k_data_dao.get_k_data_with_features(
        code, '2015-01-01', datetime_utils.get_current_date())
    pca = PCAModel('k_data')
    pca.training_model(code=code, data=data, features=features)

    model = SequantialNeuralClassifier()
    model.training_model(code, data, features)

def test_training(self):
    code = '600196'
    data, features = k_data_60m_dao.get_k_data_with_features(
        code, '2015-01-01', datetime.now().strftime("%Y-%m-%d"))
    logger.debug("features:%s, length:%s" % (features, len(features)))

    pca = PCAModel('k_data')
    pca.training_model(code=code, data=data, features=features)

    model = RidgeRegressionModel()
    model.training_model(code, data, features)

def test_training(self):
    code = '600196'
    data, features = k_data_dao.get_k_data_with_features(
        code, '2015-01-01', datetime.now().strftime("%Y-%m-%d"))
    pca = PCAModel('k_data')
    pca.training_model(code=code, data=data, features=features)

    model = RandomForestClassifierModel()
    model.training_model(code, data, features)

def test_training(self):
    code = '600276'
    # fetch all data from 2015-01-01 through today from the database
    data, features = k_data_dao.get_k_data_with_features(
        code, '2015-01-01', datetime.now().strftime("%Y-%m-%d"))
    logger.debug("features:%s" % features)

    pca = PCAModel('k_data')
    pca.training_model(code=code, data=data, features=features)

    model = SupportVectorClassifier()
    model.training_model(code, data, features)

def test_training(self):
    code = '600276'
    # fetch all data from 2015-01-01 through today from the database
    data, features = k_data_60m_dao.get_k_data_with_features(
        code, '2015-01-01', datetime.now().strftime("%Y-%m-%d"))
    logger.debug("features:%s" % features)

    pca = PCAModel(MODULE_NAME)
    pca.training_model(code=code, data=data, features=features)

    model = XGBoostClassier()
    model.training_model(code, data, features)

def test_training(self):
    code = '600196'
    # fetch all data from 2012-01-01 through today from the database
    data, features = k_data_dao.get_k_training_data(
        code, '2012-01-01', datetime.now().strftime("%Y-%m-%d"),
        self.futu_quote_ctx)
    data.to_csv("result.csv")
    logger.debug("features:%s, length:%s" % (features, len(features)))

    pca = PCAModel('k_data')
    pca.training_model(code=code, data=data, features=features)

    model = LogisticRegressionClassifier()
    model.training_model(code, data, features)

def test_training(self):
    code = '600196'
    # fetch all data from 2015-01-01 through today from the database
    data, features = k_data_60m_dao.get_k_data_with_features(
        code, '2015-01-01', datetime.now().strftime("%Y-%m-%d"))
    logger.debug("features:%s, length:%s" % (features, len(features)))
    data.to_csv("result.csv")

    pca = PCAModel(MODULE_NAME)
    pca.training_model(code=code, data=data, features=features)

    model = LogisticRegressionClassifier()
    model.training_model(code, data, features)

def predict(self, code, data):
    model_path = self.get_model_path(code, self.module_name, self.model_name)

    if not os.path.exists(model_path):
        logger.error('model not found, code is %s' % code)
        return

    # apply the same preprocessing as at training time; note that scale()
    # standardizes the incoming rows on their own statistics, not on the
    # statistics seen during training
    X = preprocessing.scale(data)
    pca = PCAModel(self.module_name).load(code)
    X = pca.transform(X)

    ridge_regression_model = joblib.load(model_path)
    y_pred = ridge_regression_model.predict(X)
    return int(y_pred[0])

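# Hypothetical usage of the predict() method above: score the most recent
# feature row for one stock. The DAO call mirrors the training snippets; the
# variable names and the single-row slicing are illustrative, not from the
# original code.
data, features = k_data_dao.get_k_data_with_features(
    '600196', '2015-01-01', datetime.now().strftime("%Y-%m-%d"))
latest_row = data[features].tail(1)
prediction = RidgeRegressionModel().predict('600196', latest_row)
logger.debug('predicted value: %s' % prediction)
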
def training_model(self, code, data, features, *args):
    X = data[features]
    y = data['close']

    # normalization
    X = preprocessing.scale(X)
    # PCA dimensionality reduction
    pca = PCAModel(self.module_name).load(code)
    X = pca.transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                        shuffle=False,
                                                        random_state=10)

    LR_model = linear_model.LinearRegression()
    LR_model.fit(X_train, y_train)
    test_score = LR_model.score(X_test, y_test)

    y_pred = LR_model.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_pred)
    mse = '%.4e' % mse
    logger.debug('mse: %s' % mse)

    # retrain on the full data set before persisting
    LR_model.fit(X, y)

    # log the training run
    k_data_model_log_dao.insert(code=code,
                                name=self.model_name,
                                best_estimator=LR_model,
                                train_score=test_score,
                                test_score=mse)
    # persist the model
    joblib.dump(LR_model,
                self.get_model_path(code, self.module_name, self.model_name))

def predict(self, code, data):
    model_path = self.get_model_path(code, self.module_name,
                                     self.model_name, 'h5')

    if not os.path.exists(model_path):
        logger.error('model not found, code is %s' % code)
        return

    # apply the same preprocessing as at training time
    X = preprocessing.scale(data)
    pca = PCAModel(self.module_name).load(code)
    X = pca.transform(X)

    sequantial_model = load_model(model_path)
    y_pred = sequantial_model.predict(X)
    return int(y_pred[0][0])

def training_model(self, code, data, features, *args):
    X = data[features]
    y = data['close']

    # normalization
    X = preprocessing.scale(X)
    # PCA dimensionality reduction
    pca = PCAModel(self.module_name).load(code)
    X = pca.transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                        shuffle=False,
                                                        random_state=10)

    tuned_parameters = [{
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000]
    }]
    svr_model = GridSearchCV(svm.SVR(), tuned_parameters, n_jobs=-1)
    svr_model.fit(X_train, y_train)
    test_score = svr_model.score(X_test, y_test)

    y_pred = svr_model.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_pred)
    mse = '%.4e' % mse

    # retrain on the full data set before persisting
    svr_model.fit(X, y)

    # log the training run
    k_data_model_log_dao.insert(code=code,
                                name=self.model_name,
                                best_estimator=svr_model,
                                train_score=test_score,
                                test_score=mse)
    # persist the model
    joblib.dump(svr_model,
                self.get_model_path(code, self.module_name, self.model_name))

def training_model(self, code, data, features, *args):
    X = data[features]
    y = data['close']

    # normalization
    X = preprocessing.scale(X)
    # PCA dimensionality reduction
    pca = PCAModel(self.module_name).load(code)
    X = pca.transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                        shuffle=False)

    # RidgeCV picks the best regularization strength by cross-validation
    ridge_model = linear_model.RidgeCV(
        alphas=[10, 1, 0.5, 0.25, 0.1, 0.005, 0.0025, 0.001])
    ridge_model.fit(X_train, y_train)
    test_score = ridge_model.score(X_test, y_test)

    y_pred = ridge_model.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_pred)
    mse = '%.4e' % mse

    # retrain on the full data set before persisting
    ridge_model.fit(X, y)

    # log the training run
    k_data_model_log_dao.insert(code=code,
                                name=self.model_name,
                                best_estimator=ridge_model,
                                train_score=test_score,
                                test_score=mse)
    # persist the model
    joblib.dump(ridge_model,
                self.get_model_path(code, self.module_name, self.model_name))

def training_model(self, code, data, features):
    X = data[features]
    y = data['next_direction']

    # normalization
    X = preprocessing.scale(X)
    # PCA dimensionality reduction
    pca = PCAModel(self.module_name).load(code)
    X = pca.transform(X)

    # split the scaled, PCA-transformed data; the original split used the raw
    # frame, which was inconsistent with the final fit on X, y below
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                        shuffle=False)

    rfc_model = RandomForestClassifier(max_features='sqrt',
                                       max_depth=14,
                                       oob_score=True)
    tuned_parameter = {
        'n_estimators': [50],
        'min_samples_leaf': range(10, 60, 10),
        'min_samples_split': range(20, 100, 20)
    }
    gs_result = GridSearchCV(estimator=rfc_model,
                             param_grid=tuned_parameter,
                             scoring='roc_auc',
                             cv=None,
                             n_jobs=-1)
    gs_result.fit(X_train, y_train)
    logger.debug('auc: %s' % gs_result.best_score_)

    min_samples_leaf = gs_result.best_params_['min_samples_leaf']
    min_samples_split = gs_result.best_params_['min_samples_split']

    rf1 = RandomForestClassifier(n_estimators=50,
                                 min_samples_leaf=min_samples_leaf,
                                 min_samples_split=min_samples_split,
                                 max_features='sqrt',
                                 max_depth=3,
                                 oob_score=True,
                                 n_jobs=-1,
                                 random_state=10)
    rf1.fit(X_train, y_train)
    logger.debug('oob: %s' % rf1.oob_score_)

    # score on the held-out test set
    test_score = rf1.score(X_test, y_test)
    logger.debug('test score: %.4f' % test_score)

    # retrain on the full data set before persisting
    rf1.fit(X, y)

    rf1_str = "RandomForestClassifier(n_estimators=50, min_samples_leaf=%s" \
              ", min_samples_split=%s, max_features='sqrt', max_depth=3, " \
              "oob_score=True, n_jobs=-1, random_state=10)" % \
              (min_samples_leaf, min_samples_split)
    # log the training run
    k_data_model_log_dao.insert(code=code,
                                name=self.model_name,
                                best_estimator=rf1_str,
                                train_score=gs_result.best_score_,
                                test_score=test_score,
                                desc="oob_score_:%s" % rf1.oob_score_)
    # persist the model
    joblib.dump(rf1,
                self.get_model_path(code, self.module_name, self.model_name))

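# Every snippet above calls a get_model_path() helper inherited from a shared
# base model class that is not shown in this section. A minimal sketch of what
# such a helper could look like, assuming a MODEL_DIR root and a
# "<code>_<model_name>.<ext>" naming scheme; the project's real layout may
# differ.
import os

MODEL_DIR = 'models'  # assumed root directory for persisted models

def get_model_path(code, module_name, model_name, ext='m'):
    # e.g. models/k_data/600196_logistic_regression.m
    target_dir = os.path.join(MODEL_DIR, module_name)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    return os.path.join(target_dir, '%s_%s.%s' % (code, model_name, ext))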