def training_k_data(start, end):
    """Train the daily-K models for the stock-pool slice [start:end).

    For each code, daily K data with engineered features is loaded and
    every model is fitted in turn — PCA first, since the classifiers
    load its output. A failure on one code is logged and skipped so it
    does not abort the whole run.
    """
    pool_codes = stock_pool_dao.get_list()['code'].values[start:end]
    for code in pool_codes:
        try:
            logger.debug('begin training mode, code:%s' % code)
            end_date = datetime.now().strftime("%Y-%m-%d")
            data, features = k_data_dao.get_k_data_with_features(
                code, '2015-01-01', end_date)
            # SequantialNeuralClassifier is intentionally not trained here.
            models = [
                PCAModel('k_data'),
                LogisticRegressionClassifier(),
                SupportVectorClassifier(),
                RandomForestClassifierModel(),
                XGBoostClassier(),
            ]
            for model in models:
                model.training_model(code, data, features)
            logger.debug('training mode end, code:%s' % code)
        except Exception as e:
            logger.error("training k data error, code:%s, error:%s" %
                         (code, repr(e)))
def predict_k_data():
    """Run every trained 60-minute classifier over the HS300 constituents
    and persist each model's prediction for later comparison."""
    hs300 = ts.get_hs300s()
    df_index = index_k_data_60m_dao.get_rel_price()
    for code in hs300['code'].values:
        try:
            logger.debug('begin predict, code:%s' % code)
            data, features = k_data_60m_dao.get_k_predict_data_with_features(
                code, df_index)
            # One prediction per model; keyword names match the log table's
            # columns.
            predictions = {
                'logistic_regression':
                    LogisticRegressionClassifier().predict(code, data),
                'support_vector_classifier':
                    SupportVectorClassifier().predict(code, data),
                'random_forest_classifier':
                    RandomForestClassifierModel().predict(code, data),
                'xgb_classifier':
                    XGBoostClassier().predict(code, data),
                'sequantial_neural':
                    SequantialNeuralClassifier().predict(code, data),
            }
            k_data_60m_predict_log_dao.insert(code, **predictions)
            logger.debug('predict end, code:%s' % code)
        except Exception as e:
            logger.error("predict k data error, code:%s, error:%s" %
                         (code, repr(e)))
def back_test():
    """Replay the KDJ strategy over a fixed 2017-2018 window and log the
    resulting context as JSON."""
    context = Context(start='2017-01-01', end='2018-07-14',
                      base_capital=50000)
    strategy = KDJStrategy()
    strategy.init(context)
    try:
        trading_days = list(k_data_dao.get_trading_days(
            start=context.start,
            end=context.end,
            futu_quote_ctx=strategy.futu_quote_ctx))
        strategy.before_trade()
        for day in trading_days:
            context.current_date = day
            strategy.before_handle_data()
            strategy.handle_data()
    finally:
        # Always release the FUTU quote connection, even on failure.
        strategy.futu_quote_ctx.close()
    logger.debug("context:" + json.dumps(context, default=obj_dict))
def get_index_k_data_test(self):
    """Smoke test: fetching HSI daily K data for early 2018 returns a frame."""
    frame = index_k_data_dao.get_k_data("^HSI",
                                        start="2018-01-01",
                                        end="2018-05-21")
    logger.debug(frame.head())
    self.assertIsNotNone(frame)
def training_k_data():
    """Train the 60-minute models for every HS300 constituent.

    Loads 60m K data with features from 2015-01-01 to today and fits
    PCA first (the classifiers depend on its output), then every
    classifier including the sequential neural network. Per-code
    failures are logged and skipped.
    """
    hs300 = ts.get_hs300s()
    for code in hs300['code'].values:
        try:
            logger.debug('begin training mode, code:%s' % code)
            today = datetime.now().strftime("%Y-%m-%d")
            data, features = k_data_60m_dao.get_k_data_with_features(
                code, '2015-01-01', today)
            models = [
                PCAModel(MODULE_NAME),
                LogisticRegressionClassifier(),
                SupportVectorClassifier(),
                RandomForestClassifierModel(),
                XGBoostClassier(),
                SequantialNeuralClassifier(),
            ]
            for model in models:
                model.training_model(code, data, features)
            logger.debug('training mode end, code:%s' % code)
        except Exception as e:
            logger.error("training k data error, code:%s, error:%s" %
                         (code, repr(e)))
def get_k_data(self, code, start_date, end_date):
    """Download daily K-line history for *code* from Yahoo Finance.

    Args:
        code: Yahoo ticker symbol.
        start_date, end_date: 'YYYY-MM-DD' strings; converted to epoch
            timestamps via self.string2ts.

    Returns:
        DataFrame with date/open/high/low/close/volume plus `code` and
        `pre_close` (previous close). 'Adj Close' is dropped and the
        first row is removed by the shift+dropna.

    Raises:
        Re-raises any download/parse exception after logging it.
    """
    try:
        start_ts = self.string2ts(start_date)
        end_ts = self.string2ts(end_date)
        # Yahoo requires a session cookie plus a crumb token per symbol.
        cookie, crumb = self.get_cookie_crumb(code)
        url = "https://query1.finance.yahoo.com/v7/finance/download/%s?period1=%s&period2=%s&interval=1d&events" \
              "=history&crumb=%s" % (code, start_ts, end_ts, crumb)
        logger.debug(url)
        response = requests.get(url, cookies=cookie)
        df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
        df["code"] = code
        df = df.drop(columns=['Adj Close'])
        df = df.rename(columns={'Date': 'date',
                                'Open': 'open',
                                'High': 'high',
                                'Low': 'low',
                                'Close': 'close',
                                'Volume': 'volume'})
        df['pre_close'] = df['close'].shift(1)
        df = df.dropna()
        return df
    except Exception as e:
        logger.error(repr(e))
        # Bare `raise` re-raises the active exception in place, instead of
        # the original `raise e` which re-raises a bound copy.
        raise
def get_stock_performance(self, code, name):
    """Scrape quarterly performance indicators for *code* from 10jqka.

    Fetches the finance page, parses the embedded JSON from the
    ``#main`` element, and returns a DataFrame with one row per report
    period (percent signs and the '亿' unit suffix stripped from the
    raw strings). Returns None when the page cannot be fetched/parsed.

    Fix: parse failures were logged at DEBUG with no context; they are
    now logged at ERROR with the offending code so they are visible.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    }
    url = "http://basic.10jqka.com.cn/%s/finance.html#stockpage" % (code)
    r = requests.get(url, headers=headers)
    try:
        # Page is GBK-encoded; the data lives as JSON inside #main.
        selector = etree.HTML(r.content.decode('gbk'))
        content = selector.xpath('//*[@id="main"]/text()')[0]
        data = json.loads(content)
        df1 = pd.DataFrame(data['simple'], index=None)
        value_columns = [
            'report_date', 'esp', 'net_profits', 'profits_yoy',
            'not_net_profits', 'not_profits_yoy', 'business_income',
            'business_income_yoy', 'bvps', 'roe', 'roe_tanbo',
            'net_debt_ratio', 'reservedPerShare',
            'undistributed_profit_per_share', 'cash_flow_per_share',
            'sales_gross_margin', 'inventory_turnover', 'sales_margin'
        ]
        df = pd.DataFrame(df1.T.values, columns=value_columns)
        df = df.reindex(columns=['code', 'name'] + value_columns)
        # Strip the '%' suffix from all ratio columns in one pass.
        percent_columns = [
            'profits_yoy', 'not_profits_yoy', 'business_income_yoy',
            'roe', 'roe_tanbo', 'net_debt_ratio', 'sales_gross_margin',
            'sales_margin'
        ]
        for col in percent_columns:
            df[col] = df[col].str.strip('%')
        # Strip the '亿' (hundred-million) unit suffix from amounts.
        for col in ('net_profits', 'not_net_profits', 'business_income'):
            df[col] = df[col].str.strip('亿')
        df['code'] = code
        df['name'] = name
        return df
    except Exception as e:
        logger.error("get stock performance error, code:%s, error:%s" %
                     (code, repr(e)))
        return None
def training_model(self, code, data, features):
    """Train a small dense network to classify `next_direction` for *code*.

    The feature matrix is scaled, projected through the stock's saved
    PCA, split 70/30, fitted for 10 epochs, scored, logged to the 60m
    model-log table, and saved in h5 format.
    """
    X = data[features]
    y = data['next_direction']
    # normalization
    X = preprocessing.scale(X)
    # PCA projection fitted earlier for this stock
    pca = PCAModel(self.module_name).load(code)
    X = pca.transform(X)
    X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.3)
    # normalization
    # NOTE(review): X was already scaled above, so both splits are scaled a
    # second time here, independently of each other — confirm intended.
    X_train = preprocessing.scale(X_train)
    x_test = preprocessing.scale(x_test)
    # NOTE(review): input_dim is the raw feature count; this matches
    # X_train's width only if pca.transform preserves all components —
    # verify against PCAModel's configuration.
    input_dim_len = len(features)
    sequantial_model = Sequential()
    sequantial_model.add(
        Dense(512, input_dim=input_dim_len, activation='relu'))
    sequantial_model.add(Dropout(0.5))
    sequantial_model.add(Dense(128, activation='relu'))
    sequantial_model.add(Dropout(0.5))
    # NOTE(review): 'tanh' outputs in [-1, 1] while binary_crossentropy
    # expects probabilities in [0, 1] — presumably labels are -1/1; verify.
    sequantial_model.add(Dense(1, activation='tanh'))
    sequantial_model.compile(optimizer='sgd',
                             loss='binary_crossentropy',
                             metrics=['accuracy'])
    # training performance
    sequantial_model.fit(X_train, y_train, epochs=10, batch_size=128)
    train_model_score = sequantial_model.evaluate(X_train, y_train,
                                                  batch_size=128)
    # test performance
    test_model_score = sequantial_model.evaluate(x_test, y_test,
                                                 batch_size=128)
    logger.debug('test model score: %s' % test_model_score)
    # NOTE(review): evaluated on the raw (unscaled, un-PCA'd) features,
    # unlike the training inputs — confirm this score is meaningful.
    full_model_score = sequantial_model.evaluate(data[features],
                                                 data['next_direction'])
    logger.debug('full model score: %s' % full_model_score)
    # Log the run; index [1] is the accuracy from `metrics=['accuracy']`.
    k_data_60m_model_log_dao.insert(code=code,
                                    name=self.model_name,
                                    best_estimator=None,
                                    train_score=train_model_score[1],
                                    test_score=test_model_score[1],
                                    desc="full_model_score:%s" %
                                         full_model_score[1])
    # Save the trained model in h5 format.
    sequantial_model.save(
        self.get_model_path(code, self.module_name, self.model_name, 'h5'))
def test_predict(self):
    """Predict with the saved SVC model for 600196 using daily data."""
    rel_price = index_k_data_dao.get_rel_price()
    df, features = k_data_dao.get_k_predict_data_with_features("600196",
                                                               rel_price)
    logger.debug("features:%s, length:%s" % (features, len(features)))
    df.to_csv("result.csv")
    classifier = SupportVectorClassifier()
    prediction = classifier.predict("600196", df[features])
    logger.debug("predict:%s" % prediction)
def exists(self, code):
    """Return True when a model-log row for *code* dated today exists."""
    # Reflect the table from the bound engine's metadata.
    log_table = Table('k_data_model_log',
                      dataSource.mysql_quant_metadata,
                      autoload=True)
    query = select([log_table.c.code, log_table.c.date]).where(
        and_(log_table.c.date == datetime_utils.get_current_date(),
             log_table.c.code == code))
    result = dataSource.mysql_quant_conn.execute(query)
    logger.debug("row count:%s" % result.rowcount)
    return result.rowcount > 0
def test_training(self):
    """Fit PCA then the ridge-regression model on 60m data for 600196."""
    code = '600196'
    today = datetime.now().strftime("%Y-%m-%d")
    data, features = k_data_60m_dao.get_k_data_with_features(
        code, '2015-01-01', today)
    logger.debug("features:%s, length:%s" % (features, len(features)))
    pca = PCAModel('k_data')
    pca.training_model(code=code, data=data, features=features)
    regressor = RidgeRegressionModel()
    regressor.training_model(code, data, features)
def test_training(self):
    """Fit PCA then the support-vector classifier on daily data for 600276."""
    code = '600276'
    # Everything from 2015-01-01 up to today.
    today = datetime.now().strftime("%Y-%m-%d")
    data, features = k_data_dao.get_k_data_with_features(code,
                                                         '2015-01-01',
                                                         today)
    logger.debug("features:%s" % features)
    pca = PCAModel('k_data')
    pca.training_model(code=code, data=data, features=features)
    classifier = SupportVectorClassifier()
    classifier.training_model(code, data, features)
def test_predict(self):
    """Predict with the saved XGBoost model for 600276 on 60m data.

    Fix: the two DAOs were swapped — the relative price comes from the
    index DAO and the predict features from the stock 60m DAO, matching
    the other 60m predict tests in this suite.
    """
    code = '600276'
    df_index = index_k_data_60m_dao.get_rel_price()
    df, features = k_data_60m_dao.get_k_predict_data_with_features(
        code, df_index)
    logger.debug("features:%s, length:%s" % (features, len(features)))
    df.to_csv("result.csv")
    model = XGBoostClassier()
    y_predict = model.predict(code, df[features])
    print(y_predict)
def test_predict(self):
    """Predict with the saved random-forest model on 60m data.

    Fix: the predict call used the hard-coded code "600196" while the
    features were built for 600276 — use the same `code` for both.
    """
    code = "600276"
    df_index = index_k_data_60m_dao.get_rel_price()
    df, features = k_data_60m_dao.get_k_predict_data_with_features(
        code, df_index)
    logger.debug("features:%s, length:%s" % (features, len(features)))
    df.to_csv("result.csv")
    model = RandomForestClassifierModel()
    y_predict = model.predict(code, df[features])
    logger.debug("predict:%s" % y_predict)
def test_predict(self):
    """Predict with the saved sequential-neural model for 600704 (daily)."""
    code = '600704'
    rel_price = index_k_data_dao.get_rel_price()
    df, features = k_data_dao.get_k_predict_data_with_features(code,
                                                               rel_price)
    logger.debug("features:%s, length:%s" % (features, len(features)))
    df.to_csv("result.csv")
    classifier = SequantialNeuralClassifier()
    prediction = classifier.predict(code, df[features])
    logger.debug("predict:%s" % prediction)
def test_predict(self):
    """Predict with the saved logistic-regression model for 600196 (60m)."""
    code = '600196'
    rel_price = index_k_data_60m_dao.get_rel_price()
    df, features = k_data_60m_dao.get_k_predict_data_with_features(
        code, rel_price)
    logger.debug("features:%s, length:%s" % (features, len(features)))
    df.to_csv("result.csv")
    classifier = LogisticRegressionClassifier()
    prediction = classifier.predict(code, df[features])
    print(prediction)
def fn(*args, **kv):
    """Timing wrapper: calls the decorated func and logs elapsed wall
    time in milliseconds (including the kwargs when any were given)."""
    started = time.time()
    result = func(*args, **kv)
    elapsed_ms = (time.time() - started) * 1000
    if kv:
        logger.debug("%s executed, kv:%s, elapsed time: %.2f ms" %
                     (func.__name__, str(kv), elapsed_ms))
    else:
        logger.debug("%s executed, elapsed time: %.2f ms" %
                     (func.__name__, elapsed_ms))
    return result
def test_training(self):
    """Fit PCA then the XGBoost classifier on 60m data for 600276."""
    code = '600276'
    # Everything from 2015-01-01 up to today.
    today = datetime.now().strftime("%Y-%m-%d")
    data, features = k_data_60m_dao.get_k_data_with_features(
        code, '2015-01-01', today)
    logger.debug("features:%s" % features)
    pca = PCAModel(MODULE_NAME)
    pca.training_model(code=code, data=data, features=features)
    classifier = XGBoostClassier()
    classifier.training_model(code, data, features)
def test_training(self):
    """Fit PCA then logistic regression on 60m data for 600196."""
    code = '600196'
    # Everything from 2015-01-01 up to today.
    today = datetime.now().strftime("%Y-%m-%d")
    data, features = k_data_60m_dao.get_k_data_with_features(
        code, '2015-01-01', today)
    logger.debug("features:%s, length:%s" % (features, len(features)))
    data.to_csv("result.csv")
    pca = PCAModel(MODULE_NAME)
    pca.training_model(code=code, data=data, features=features)
    classifier = LogisticRegressionClassifier()
    classifier.training_model(code, data, features)
def test_training(self):
    """Fit PCA then logistic regression on daily training data for 600196."""
    code = '600196'
    # Everything from 2012-01-01 up to today, via the FUTU quote context.
    today = datetime.now().strftime("%Y-%m-%d")
    data, features = k_data_dao.get_k_training_data(code,
                                                    '2012-01-01',
                                                    today,
                                                    self.futu_quote_ctx)
    data.to_csv("result.csv")
    logger.debug("features:%s, length:%s" % (features, len(features)))
    pca = PCAModel('k_data')
    pca.training_model(code=code, data=data, features=features)
    classifier = LogisticRegressionClassifier()
    classifier.training_model(code, data, features)
def training_model(self, code, data, features, *args):
    """Train a linear-regression close-price model for *code*.

    Features are scaled, projected through the stock's saved PCA, and
    split 70/30 without shuffling (time-ordered). The model is scored
    on the held-out tail (R^2 and MSE), refitted on all data, logged,
    and dumped to disk.

    *args is unused but kept for signature compatibility with the other
    model classes.

    Fixes: the MSE was computed twice (once for the log, once for the
    formatted value); a dead commented-out `if not args:` was removed.
    """
    X = data[features]
    y = data['close']
    # normalization
    X = preprocessing.scale(X)
    # PCA projection fitted earlier for this stock
    pca = PCAModel(self.module_name).load(code)
    X = pca.transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=.3,
                                                        shuffle=False,
                                                        random_state=10)
    LR_model = linear_model.LinearRegression()
    LR_model.fit(X_train, y_train)
    test_score = LR_model.score(X_test, y_test)
    y_pred = LR_model.predict(X_test)
    # Compute the MSE once; log the raw value, store it formatted.
    mse_value = metrics.mean_squared_error(y_test, y_pred)
    logger.debug('mse: %s' % mse_value)
    mse = '%.4e' % mse_value
    # full data training
    LR_model.fit(X, y)
    # Persist the run metrics.
    k_data_model_log_dao.insert(code=code,
                                name=self.model_name,
                                best_estimator=LR_model,
                                train_score=test_score,
                                test_score=mse)
    # Dump the final model.
    joblib.dump(
        LR_model,
        self.get_model_path(code, self.module_name, self.model_name))
def init_pool(self): self.truncate() stocks = stock_dao.query_all() k_data_list = k_data_dao.get_multiple_k_data(start=get_next_date(-720), end=get_current_date()) df = pd.DataFrame(columns=['code', 'name']) for stock in stocks: try: k_data = k_data_list.loc[k_data_list['code'] == fill_market(stock.code)] k_data = k_data.join(cal_macd(k_data)) k_data['turnover7'] = cal_mavol7(k_data, column='turnover') k_turnover7 = k_data['turnover7'].values[-1] if len(k_data['code'].values) == 0: continue stock_basic = stock_basic_dao.get_by_code(stock.code) eps_value = stock_basic['eps'].values[0] profits_yoy_value = stock_basic['profits_yoy'].values[0] if eps_value < 0: continue if profits_yoy_value < 0: continue if k_turnover7 < 65000000: continue dict = {"code": stock.code, "name": stock.name} df = df.append(dict, ignore_index=True) logger.debug("append code:%s" % stock.code) except Exception as e: logger.debug("code:%s, error:%s" % (stock.code, traceback.format_exc())) df.to_sql('stock_pool', dataSource.mysql_quant_engine, if_exists='append', index=False) '''
def cal_bk_vol():
    """Aggregate each industry board's 3-day average volume across its
    member stocks and dump the descending ranking to bk_csv.csv.

    Fix: the bare `except:` around the board-name lookup is narrowed to
    `except Exception:` so KeyboardInterrupt/SystemExit still propagate.
    """
    bk_vol_frame = pd.DataFrame(columns=['bkcode', 'bk_name',
                                         'total_mavol_3'])
    # Boards excluded from the ranking.
    filter_list = [
        'BK0743', 'BK0804', 'BK0568', 'BK0707', 'BK0701', 'BK0611',
        'BK0705', 'BK0612', 'BK0500'
    ]
    org_bk_code_list = list(stock_industry_dao.get_bkcode_list().values)
    bk_code_list = [c for c in org_bk_code_list if c not in filter_list]
    for bk in bk_code_list:
        bk_stocks = stock_industry_dao.get_by_bkcode(bk[0])
        bk_vol3 = 0
        try:
            bk_name = bk_stocks['bk_name'][0]
        except Exception:
            # Board has no member rows (or no label 0) — keep a placeholder.
            bk_name = 'N/A'
        for code in bk_stocks['code'].values:
            stock_df = k_data_dao.get_k_data(code=code,
                                             start=get_next_date(-30),
                                             end=get_current_date())
            if len(stock_df) == 0:
                continue
            stock_df['mavol3'] = cal_mavol3(stock_df)
            try:
                bk_vol3 += stock_df['mavol3'].values[-1:][0]
            except Exception as e:
                logger.debug("code:%s, error:%s" % (code, repr(e)))
        # NOTE(review): 'bkcode' stores the whole `bk` row while the DAO
        # lookup uses bk[0] — confirm which value the CSV should carry.
        bk_vol_frame.loc[bk_vol_frame.shape[0] + 1] = {
            'bkcode': bk,
            'bk_name': bk_name,
            'total_mavol_3': bk_vol3
        }
    bk_vol_frame = bk_vol_frame.sort_values('total_mavol_3',
                                            ascending=False)
    bk_vol_frame.to_csv('bk_csv.csv', encoding='utf_8_sig')
def training_model(self, code, data, features):
    """Grid-search and train an XGBoost direction classifier for *code*."""
    X = data[features]
    y = data['next_direction']
    # normalization
    X = preprocessing.scale(X)
    # project through the stock's saved PCA
    pca = PCAModel(self.module_name).load(code)
    X = pca.transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=.3,
                                                        shuffle=False)
    param_grid = [{
        'learning_rate': [0.05, 0.1, 0.3],
        'max_depth': range(2, 8, 2),
        'subsample': [0.7, ],
        'min_child_weight': range(1, 6, 2)
    }]
    grid_search = GridSearchCV(
        estimator=xgb.XGBClassifier(n_estimators=100, random_state=10),
        param_grid=param_grid,
        n_jobs=-1)
    search_result = grid_search.fit(X_train, y_train)
    logger.debug(grid_search.best_params_)
    logger.debug("XGBoost Classier's best score: %.4f" %
                 search_result.best_score_)
    best_classifier = grid_search.best_estimator_
    # refit on the training split only
    best_classifier.fit(X_train, y_train)
    # hold-out accuracy
    test_score = accuracy_score(y_test, best_classifier.predict(X_test))
    logger.debug('test score: %.4f' % test_score)
    # final fit on the complete data set
    best_classifier.fit(X, y)
    # persist the run metrics
    k_data_model_log_dao.insert(code=code,
                                name=self.model_name,
                                best_estimator=grid_search.best_estimator_,
                                train_score=grid_search.best_score_,
                                test_score=test_score)
    # dump the final model
    joblib.dump(
        best_classifier,
        self.get_model_path(code, self.module_name, self.model_name))
def training_model(self, code, data, features):
    """Grid-search and train an RBF SVC direction classifier for *code*."""
    X = data[features]
    y = data['next_direction']
    # normalization
    X = preprocessing.scale(X)
    # project through the stock's saved PCA
    pca = PCAModel(self.module_name).load(code)
    X = pca.transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=.3,
                                                        shuffle=False)
    search_space = [{
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000]
    }]
    # grid-search the best hyper-parameters
    grid = GridSearchCV(svm.SVC(), search_space, cv=None, n_jobs=-1)
    grid.fit(X_train, y_train)
    logger.debug(grid.best_estimator_)
    logger.debug("Support Vector Classifier's best score: %.4f" %
                 grid.best_score_)
    best_svc = grid.best_estimator_
    # refit on the training split only
    best_svc.fit(X_train, y_train)
    # hold-out accuracy
    test_score = accuracy_score(y_test, best_svc.predict(X_test))
    logger.debug('test score: %.4f' % test_score)
    # final fit on the complete data set
    best_svc.fit(X, y)
    # persist the run metrics
    k_data_model_log_dao.insert(code=code,
                                name=self.model_name,
                                best_estimator=grid.best_estimator_,
                                train_score=grid.best_score_,
                                test_score=test_score)
    # dump the final model
    joblib.dump(
        best_svc,
        self.get_model_path(code, self.module_name, self.model_name))
def training_model(self, code, data, features):
    """Grid-search and train a logistic-regression direction classifier."""
    X = data[features]
    y = data['next_direction']
    X = preprocessing.scale(X)
    pca = PCAModel(self.module_name).load(code)
    X = pca.transform(X)
    # 70/30 time-ordered split (no shuffling)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=.3,
                                                        shuffle=False)
    # cross-validate penalty and C
    search_space = {
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100]
    }
    grid = GridSearchCV(LogisticRegression(), search_space, cv=None)
    grid.fit(X_train, y_train)
    logger.debug(grid.best_estimator_)
    logger.debug("logistic regression's best score: %.4f" %
                 grid.best_score_)
    best_lr = grid.best_estimator_
    # refit on the training split only
    best_lr.fit(X_train, y_train)
    # hold-out accuracy
    test_score = accuracy_score(y_test, best_lr.predict(X_test))
    logger.debug('test score: %.4f' % test_score)
    # final fit on the complete data set
    best_lr.fit(X, y)
    # persist the run metrics
    k_data_model_log_dao.insert(code=code,
                                name=self.model_name,
                                best_estimator=grid.best_estimator_,
                                train_score=grid.best_score_,
                                test_score=test_score)
    # dump the final model
    joblib.dump(
        best_lr,
        self.get_model_path(code, self.module_name, self.model_name))
def training_model(self, code, data, features):
    """Grid-search and train a random-forest direction classifier.

    Fix: the train/test split previously used the raw `data[features]`
    while the final `fit` used the scaled + PCA-projected `X`, so the
    grid search and the saved model saw differently-shaped inputs and
    the preprocessing above was unused. The split now uses X/y,
    consistent with the other classifiers in this module.
    """
    X = data[features]
    y = data['next_direction']
    # normalization
    X = preprocessing.scale(X)
    # project through the stock's saved PCA
    pca = PCAModel(self.module_name).load(code)
    X = pca.transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.3, shuffle=False)
    rfc_model = RandomForestClassifier(max_features='sqrt',
                                       max_depth=14,
                                       oob_score=True)
    tuned_parameter = {
        'n_estimators': [50, ],
        'min_samples_leaf': range(10, 60, 10),
        'min_samples_split': range(20, 100, 20)
    }
    gs_result = GridSearchCV(estimator=rfc_model,
                             param_grid=tuned_parameter,
                             scoring='roc_auc',
                             cv=None,
                             n_jobs=-1)
    gs_result.fit(X_train, y_train)
    logger.debug('auc: %s' % gs_result.best_score_)
    min_samples_leaf = gs_result.best_params_['min_samples_leaf']
    min_samples_split = gs_result.best_params_['min_samples_split']
    rf1 = RandomForestClassifier(n_estimators=50,
                                 min_samples_leaf=min_samples_leaf,
                                 min_samples_split=min_samples_split,
                                 max_features='sqrt',
                                 max_depth=3,
                                 oob_score=True,
                                 n_jobs=-1,
                                 random_state=10)
    rf1.fit(X_train, y_train)
    logger.debug('oob: %s' % rf1.oob_score_)
    # hold-out accuracy
    test_score = rf1.score(X_test, y_test)
    logger.debug('test score: %.4f' % test_score)
    # final fit on the complete data set
    rf1.fit(X, y)
    rf1_str = "RandomForestClassifier(n_estimators=50, min_samples_leaf=%s" \
              ",min_samples_split=%s, max_features='sqrt',max_depth=3, " \
              "oob_score=True, n_jobs=-1, random_state=10)" % (
                  min_samples_leaf, min_samples_split)
    # persist the run metrics
    k_data_model_log_dao.insert(code=code,
                                name=self.model_name,
                                best_estimator=rf1_str,
                                train_score=gs_result.best_score_,
                                test_score=test_score,
                                desc="oob_score_:%s" % rf1.oob_score_)
    # dump the final model
    joblib.dump(
        rf1, self.get_model_path(code, self.module_name, self.model_name))
# 死叉 # if pre_k > pre_d and ((k_value <= d_value) or (abs(k_value - d_value) <= 10)): # # shares = self.context.portfolio.positions[code].shares # # 清仓 # if shares > 0: # self.sell_value(code, shares) if __name__ == '__main__': context = Context(start='2018-07-01', end='2018-07-14', base_capital=50000) kdj = KDJStrategy() kdj.init(context) context.current_date = get_current_date() kdj.handle_data() logger.debug("base_capital:%s" % context.base_capital) logger.debug("blance:%s" % context.blance) # context.current_date = convert_to_datetime('2018-07-04') # kdj.handle_data() # logger.debug(context.order_book[1]) logger.debug("blance:%s" % context.blance) logger.debug("base_capital:%s" % context.base_capital) kdj.futu_quote_ctx.close()
return label, k, d if __name__ == '__main__': df_pool = stock_pool_dao.get_list() data = pd.DataFrame(columns=[ 'code', 'date', 'name', 'bk_code', 'bk_name', 'k', 'd', 'label' ]) list = [] for index, row in df_pool.iterrows(): code = row['code'] logger.debug("execute code:%s" % code) try: label, k, d = cal_single_stock(code) if label is None: continue if label == 'up': df_stock_industry = stock_industry_dao.get_by_code(code[3:]) name = df_stock_industry['name'].values[0] if name.find('ST') > -1: continue bk_code = df_stock_industry['bk_code'].values[0] bk_name = df_stock_industry['bk_name'].values[0]
async def register(websocket):
    """Track a newly connected websocket client and broadcast the
    updated user set to everyone."""
    USERS.add(websocket)
    logger.debug(USERS)
    await notify_users()