# Assumed imports for this module (hedged): the project-local modules
# (nr, oc, ms, PCA_mars) and helpers (get_other_indicators, deal_data*,
# dataX_*/dataY_*, random_forest, getScore, fit_randomForest_rep,
# deal_dataFrame) are defined elsewhere in this repo.
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor


def compare_s_no(dataPath=""):
    data = pd.read_csv(dataPath)
    # Add the other technical indicators
    result = get_other_indicators(data)
    deal_result = result.dropna(axis=0)[-100:]
    # Reprocess the raw data with LOF before deciding
    final_data = deal_data_from_dataFrame(deal_result)
    data_y = final_data[1]
    data_x = final_data[0]
    final_data_x = nr.standardized_mars(data_x)
    print(final_data_x.shape)
    # Use the PCA data directly: outlier-process 100% of it, then train the random forest
    # Run PCA on 100% of the data
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=54)
    # Outlier handling
    oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
    # random_forest((lof_data_x, new_all_y))
    lof_pred = oc.get_pred_test()
    error_index = oc.get_delete_index()
    lof_data_y = oc.replace_Singular(data_y, lof_pred)

    # Left panel: raw targets with outliers marked; right panel: LOF-replaced targets
    fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True)
    ax1.scatter(range(len(data_y)), data_y, label='data_y')
    error_close = data_y[error_index]
    ax1.scatter(error_index, error_close, label='error_y', c='r', alpha=0.2)
    ax1.legend()
    error_lof_y = lof_data_y[error_index]
    ax2.scatter(range(len(lof_data_y)), lof_data_y, label='lof_data_y')
    ax2.scatter(error_index, error_lof_y, label='error_lof_y', c='r', alpha=0.2)
    ax2.legend()
    # Adjust the canvas spacing
    print(len(data_y))
    print(len(lof_data_y))
    plt.tight_layout()
    plt.show()
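# Usage sketch (hedged): the CSV path follows the per-stock layout used by
# test() below and is an assumption, not a path confirmed for this function.
# compare_s_no(dataPath='/home/mars/Data/finialData/electronic_infomation/000021.csv')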
def test():
    data = pd.read_csv(
        '/home/mars/Data/finialData/electronic_infomation/000021.csv')
    data = data[::-1]
    result = get_other_indicators(data)
    # result = data[['price_change', 'p_change']]
    deal_result = result.dropna(axis=0)
    close = deal_result['close']
    print(close.shape)
    s_deal_data = deal_data_from_dataFrame(deal_result)
    data_x = s_deal_data[0]
    data_y = s_deal_data[1]
    # Feature selection
    # t_deal_data_x = Filter(use=False).Variance_selection(threshold=3, data=s_deal_data)[0]
    # Normalization
    final_data_x = nr.standardized_mars(data_x)
    pca_x = oc.LOF_PCA_for_Clustering(final_data_x)
    final_data_x_LOF = oc.replace_Singular(final_data_x, oc.get_pred_test())
    print('final_data_x_LOF', final_data_x_LOF[:16])
    print(final_data_x_LOF.shape)
    # Dimensionality reduction
    # pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=0.9)

    # #########################################################################
    # Compute clustering with MeanShift
    x_train = final_data_x_LOF[:int(len(data_x) * 0.7)]
    print('x_train', x_train.shape)
    x_test = final_data_x_LOF[int(len(data_x) * 0.7):]
    # The bandwidth can be estimated automatically from the training split
    bandwidth = estimate_bandwidth(x_train, quantile=0.2, random_state=1)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=False)
    ms.fit(final_data_x_LOF)
    labels = ms.labels_
    print('error size', labels[labels != 0].size)
    print('index of not 0 *******')
    print([i for i, x in enumerate(labels) if x != 0])
    print('*******')
    print(labels)
    print(labels.shape)
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    # score = metrics.silhouette_score(pca_x, labels, metric='euclidean')
    # score1 = metrics.calinski_harabaz_score(pca_x, labels)
    print("number of estimated clusters : %d" % n_clusters_)
    plt.plot(range(len(close)), close)
    plt.plot(range(len(labels)), labels)
    plt.show()
    # #########################################################################
    # Plot result
def get_today_data(data_x, today_data, n_components=20):
    add_data_x = np.vstack((data_x, today_data))
    final_data_x = nr.standardized_mars(add_data_x)
    print('final_data_x:', final_data_x.shape)
    # Use the PCA data directly: outlier-process the 70% split, recombine
    # afterwards, then train the random forest
    # Run PCA on 100% of the data
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=n_components)
    today_x = pca_x[-1].reshape(1, n_components)
    print('today_x:', today_x)
    print(today_x.shape)
    return today_x
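# Usage sketch (hedged): `history_x` and `today_row` are hypothetical arrays
# with identical feature columns. Stacking them before scaling and PCA means
# today's row is projected with the same statistics as the history.
# today_x = get_today_data(history_x, today_row, n_components=20)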
def analyze_lof(dataPath=""):
    data = pd.read_csv(dataPath)
    data = data[::-1]
    # Add the other technical indicators
    result = get_other_indicators(data)
    deal_result = result.dropna(axis=0)
    # Reprocess the raw data with LOF before deciding
    # final_data = deal_data_from_dataFrame(deal_result)
    final_data = deal_data(deal_result)
    final_data_x = nr.standardized_mars(final_data[0])
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=62)
    print(pca_x.shape)
    # Outlier handling on x
    lof_data_x = oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
    error_index = oc.get_delete_index()
    print(error_index)
    result = deal_result.index.tolist()
    # Write out every date, flagging the rows where an outlier was detected with 1
    with open('300113_data.csv', 'w+') as f:
        f.write('date,300113_Sigular\n')
        for index, date in enumerate(result):
            f.write(date)
            f.write(',')
            f.write('1' if index in error_index else '0')
            f.write('\n')
def analyze_lof_sql(code=""):
    # Fetch the sector data (NASDAQ index, "纳斯达克", here)
    NDX_sql = 'SELECT open,close,low,high,volume,other,change_rate, DATE_ADD(date,INTERVAL 1 DAY) as date from global_info where industry_name = "纳斯达克" order by date asc;'
    # NDX_delete_list = ['id', 'category_name', 'industry_name', 'industry_key', 'total_money']
    # NaN values are forward-filled
    NDXData = deal_dataFrame(NDX_sql, [])
    final_data = deal_data(NDXData)
    final_data_x = nr.standardized_mars(final_data[0])
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=62)
    print(pca_x.shape)
    # Outlier handling on x
    lof_data_x = oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
    error_index = oc.get_delete_index()
    print(error_index)
    result = NDXData.index.tolist()
    # Write out every date, flagging the rows where an outlier was detected with 1
    with open('NDX_data.csv', 'w+') as f:
        f.write('date,NDX_Sigular\n')
        for index, date in enumerate(result):
            f.write(date.strftime('%Y-%m-%d'))
            f.write(',')
            f.write('1' if index in error_index else '0')
            f.write('\n')
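# Hedged sketch: the two flag files written by analyze_lof()/analyze_lof_sql()
# above could be aligned by date with a pandas join (everything here is an
# assumption, not code from this repo):
# flags = pd.read_csv('NDX_data.csv', index_col='date').join(
#     pd.read_csv('300113_data.csv', index_col='date'), how='inner')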
def fit_randomForest_del(daySpan=0, dataPath=""):
    data = pd.read_csv(dataPath)
    # Add the other technical indicators
    result = get_other_indicators(data)
    deal_result = result.dropna(axis=0)
    # Reprocess the raw data with LOF before deciding
    # final_data = deal_data_from_dataFrame(deal_result)
    data_x = dataX_from_dataFrame(deal_result)
    if daySpan == 0:
        # Process X; binarize the Y values
        data_y = dataY_from_dataFrame(deal_result)
    else:
        data_y = dataY_for_Nmean(deal_result, N=daySpan)
        data_x = data_x[:len(data_y)]
    s_deal_data = (data_x, data_y)
    final_data_x = nr.standardized_mars(s_deal_data[0])
    print(final_data_x.shape)
    # Use the PCA data directly: outlier-process the 70% split, recombine
    # afterwards, then train the random forest
    # Run PCA on 100% of the data
    all_y = data_y
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=2)
    print(pca_x.shape)
    # Outlier handling: drop the rows LOF flags, in both x and y
    lof_data_x = oc.LOF_PCA_for_Clustering_del(pca_x, isUsePCA=False)
    dele_index = oc.get_delete_index()
    lof_data_y = np.delete(all_y, dele_index, axis=0)
    print(lof_data_x.shape)
    print('y', data_y.shape)
    feature_importances = np.copy(random_forest((lof_data_x, lof_data_y)))
def fit_randomForest_repXY(daySpan=0, dataPath=""):
    data = pd.read_csv(dataPath)
    # Add the other technical indicators
    result = get_other_indicators(data)
    deal_result = result.dropna(axis=0)
    data_x = dataX_from_dataFrame(deal_result)
    if daySpan == 0:
        data_y = dataY_from_dataFrame(deal_result)
    else:
        data_y = dataY_for_Nmean(deal_result, N=daySpan)
        data_x = data_x[:len(data_y)]
    s_deal_data = (data_x, data_y)
    final_data_x = nr.standardized_mars(s_deal_data[0])
    print(final_data_x.shape)
    # Use the PCA data directly: outlier-process 100% of it, then train the random forest
    # Run PCA on 100% of the data
    all_y = data_y
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=2)
    print(pca_x.shape)
    # Outlier handling on x
    lof_data_x = oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
    # Outlier handling on y: replace the flagged targets
    print('all_y', all_y[-10:])
    lof_data_y = oc.replace_Singular(all_y, oc.get_pred_test())
    print('lof_y', lof_data_y[-10:])
    print(pca_x.shape)
    random_forest((lof_data_x, lof_data_y))
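# Note the contrast between the two variants above: fit_randomForest_del()
# drops the LOF-flagged rows outright (LOF_PCA_for_Clustering_del plus
# np.delete on y), while fit_randomForest_repXY() keeps every row and has the
# flagged x rows handled by LOF_PCA_for_Clustering and the matching targets
# overwritten via replace_Singular(); both then train the same random_forest().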
# The original header of this function was lost in extraction; the name below
# is a placeholder (assumption), following the pattern of test() above.
def meanshift_main():
    data = pd.read_csv(
        '/home/mars/Data/finialData/electronic_infomation/002544.csv')
    data = data[::-1]
    result = get_other_indicators(data)
    # result = data[['price_change', 'p_change']]
    deal_result = result.dropna(axis=0)
    # close = deal_result['close']
    s_deal_data = deal_data_from_dataFrame(deal_result)
    data_x = s_deal_data[0]
    data_y = s_deal_data[1]
    print('data_x', data_x.shape)
    # Feature selection
    # t_deal_data_x = Filter(use=False).Variance_selection(threshold=3, data=s_deal_data)[0]
    # Normalization
    final_data_x = nr.standardized_mars(data_x)
    # pca_x = oc.LOF_PCA_for_Clustering(final_data_x)
    # final_data_x_LOF = oc.replace_Singular(final_data_x, oc.get_pred_test())
    # print('final_data_x_LOF', final_data_x_LOF[:16])
    # print(final_data_x_LOF.shape)
    # Dimensionality reduction
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=0.9)

    # #########################################################################
    # Compute clustering with MeanShift
    x_train, x_test, y_train, y_test = train_test_split(pca_x, data_y,
                                                        test_size=0.3,
                                                        random_state=0,
                                                        shuffle=False)
    return pca_x


if __name__ == "__main__":
    data = pd.read_csv('/home/mars/Data/finialData/electronic_infomation/000948.csv')
    data = data[::-1]
    result = get_other_indicators(data)
    # result = result[['open', 'close', 'low', 'high', 'volume', 'price_change']]
    deal_result = result.dropna(axis=0)
    s_deal_data = deal_data_from_dataFrame(deal_result)
    # Split the data: 70% into the training set, 30% into the test set
    # train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.3, random_state=1)
    # Normalization
    final_data_x = nr.standardized_mars(s_deal_data[0])
    ratio = 0.7
    data_x = final_data_x
    x_train = data_x[:int(len(data_x) * ratio)]
    print('x_train', x_train.shape)
    x_test = data_x[int(len(data_x) * ratio):]
    m, n = np.shape(x_train)
    # Dimensionality reduction: keep the components whose cumulative share of
    # the variance reaches the given threshold
    pca = PCA(n_components=0.9, random_state=1)
    pca.fit(x_train)
    print('Variance of each component: ', pca.explained_variance_)
    print('Share of the total variance per component: ', pca.explained_variance_ratio_)
    print('Number of components covering 90% of the variance: ', pca.n_components_, '\n')
    # Apply the PCA fitted on the training split to all of the data
    # (fit_transform here would silently refit on the full set)
    data_x = pca.transform(final_data_x)
def other_main():
    np.random.seed(42)
    data = pd.read_csv(
        '/home/mars/Data/finialData/electronic_infomation/300297.csv')
    data = data[::-1]
    result = get_other_indicators(data)
    delete_feature = []
    deal_result = result.dropna(axis=0)
    print('***')
    final_data = deal_data_from_dataFrame(deal_result)
    data_y = final_data[1]
    final_data_x = nr.standardized_mars(final_data[0])
    print(final_data_x.shape)
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=0.9)

    # Fit the model for novelty detection (novelty=True)
    print('pca_x', pca_x.shape)
    clf = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.1)
    clf.fit(pca_x)
    # With novelty=True, predict/decision_function/score_samples are intended
    # for new, unseen data; calling them on the training set, as done here,
    # gives biased results.
    y_pred_test = clf.predict(pca_x)
    print(y_pred_test)
    error_index = [i for i, x in enumerate(y_pred_test) if x == -1]
    print('error size', y_pred_test[y_pred_test == -1].size)
    print('index of which is -1 *******')
    print(error_index)
    print('*******')
def fit_randomForest_MS(daySpan=0, dataPath="", stock_code=''):
    # Look up the correlation data for stock_code in the correlation matrix
    # stock_code = '\'300017'
    collection = pd.read_csv('/home/mars/Data/finialData/code_correlation.csv',
                             index_col=0)
    collection = collection.sort_values(stock_code, ascending=False)
    # Take the most correlated stocks (the list is named top_10, but only the
    # first 5 are used)
    top_10 = collection.index.tolist()
    top_10 = top_10[:5]
    # Load the data for each of them
    dataList = []
    code_name = []
    for code in top_10:
        code_name.append(code)
        path = '/home/mars/Data/finialData/electronic_infomation/' + code[1:] + '.csv'
        code_data = pd.read_csv(path, index_col='date')
        result = get_other_indicators(code_data)
        # Collect the per-stock frames
        dataList.append(result)
    # Join on date and drop the NaN rows; pandas realigns the frames on their
    # index when concatenating
    df = pd.concat(dataList, axis=1)
    new_df = df.sort_index()
    new_df.dropna(axis=0, inplace=True)
    print('all shape:', new_df.shape)
    # new_df.to_csv('300017_conbine.csv')
    deal_result = new_df
    data_x = dataX_from_dataFrame(deal_result)
    if daySpan == 0:
        data_y = dataY_from_dataFrame(deal_result)
    else:
        data_y = dataY_for_Nmean(deal_result, N=daySpan)
        data_x = data_x[:len(data_y)]
    s_deal_data = (data_x, data_y)
    final_data_x = nr.standardized_mars(s_deal_data[0])
    print(final_data_x.shape)
    all_y = s_deal_data[1]
    MSx_train, MSx_test, MSy_train, MSy_test = ms.getMS_repx_data(
        final_data_x, all_y)
    all_x = np.vstack((MSx_train, MSx_test))
    all_y = np.concatenate((MSy_train, MSy_test), axis=0)
    max_score = fit_randomForest_rep(data=(all_x, all_y))
    print('Best score overall:', max_score)
def get_predict_data(daySpan=0, stock_code='', date='now', dataFrom='csv',
                     n_components=20, classfic=2):
    collection = pd.read_csv('/home/mars/Data/finialData/code_correlation.csv',
                             index_col=0)
    collection = collection.sort_values(stock_code, ascending=False)
    # Take the most correlated stocks (first 5)
    top_10 = collection.index.tolist()
    top_10 = top_10[:5]
    # Load the data for each of them
    dataList = []
    code_name = []
    for code in top_10:
        code_name.append(code)
        if dataFrom == 'csv':
            path = '/home/mars/Data/finialData/electronic_infomation/' + code[1:] + '.csv'
            code_data = pd.read_csv(path, index_col='date')
            code_data = code_data[::-1]
            result = get_other_indicators(code_data)
        elif dataFrom == 'db':
            code_sql = 'SELECT * from stock_info where stock_code=' + code[1:] + ' order by date asc;'
            code_delete_list = ['id', 'stock_code', 'stock_name', 'turnover']
            result = deal_dataFrame(code_sql, code_delete_list)
        else:
            pass
        # Collect the per-stock frames
        dataList.append(result)
    # Join on date and drop the NaN rows; pandas realigns the frames on their
    # index when concatenating
    df = pd.concat(dataList, axis=1)
    new_df = df.sort_index()
    new_df.dropna(axis=0, inplace=True)

    global now_df
    # Extract the target row, then delete it (and everything after it) from
    # the source data
    date_index = new_df.index.tolist()
    if isinstance(date_index[0], str):
        try:
            now_index = date_index.index('2018-12-17')
            delete_index_list = date_index[now_index:]
            now_df = pd.DataFrame(new_df, index=[date])
            new_df.drop(index=delete_index_list, inplace=True)
        except Exception as e:
            print(str(e))
    else:
        try:
            now_index = date_index.index(datetime.date(2018, 12, 17))
            delete_index_list = date_index[now_index:]
            date = datetime.datetime.strptime(date, '%Y-%m-%d')
            now_df = pd.DataFrame(new_df, index=[date])
            new_df.drop(index=delete_index_list, inplace=True)
        except Exception as e:
            print(str(e))
    print(date_index)
    try:
        pass
        # If the requested date is earlier than '2018-12-14', only that day's
        # row is needed; otherwise drop everything from '2018-12-17' onward
        # before building now_df.
    except Exception:
        return 'The stock is suspended on this day'
    print('new_df:', new_df)
    print('now_df:', now_df)
    deal_result = new_df
    data_x = dataX_from_dataFrame(deal_result)
    if classfic == 2:
        if daySpan == 0:
            data_y = dataY_from_dataFrame(deal_result)
        else:
            data_y = dataY_for_Nmean(deal_result, N=daySpan)
            data_x = data_x[:len(data_y)]
    elif classfic == 5:
        if daySpan == 0:
            data_y = dataY_from_dataFrame_5(deal_result)
        else:
            data_y = dataY_for_Nmean(deal_result, N=daySpan)
            data_x = data_x[:len(data_y)]
    else:
        pass
    final_data_x = nr.standardized_mars(data_x)
    print(final_data_x.shape)
    # Run PCA on 100% of the data
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=n_components)
    predict_y = random_forest((pca_x, data_y))
    print('Model score: ', getScore())
    return predict_y
def get_fit_model(daySpan=0, stock_code='', today='now', dataFrom='csv',
                  n_components=20, classfic=2):
    collection = pd.read_csv('/home/mars/Data/finialData/code_correlation.csv',
                             index_col=0)
    collection = collection.sort_values(stock_code, ascending=False)
    # Take the most correlated stocks (first 5)
    top_10 = collection.index.tolist()
    top_10 = top_10[:5]
    # Load the data for each of them
    dataList = []
    code_name = []
    for code in top_10:
        # Skip stocks whose correlation coefficient is below 0.6
        if collection.loc[code, stock_code] < 0.6:
            continue
        code_name.append(code)
        if dataFrom == 'csv':
            path = '/home/mars/Data/finialData/electronic_infomation/' + code[1:] + '.csv'
            code_data = pd.read_csv(path, index_col='date')
            code_data = code_data[::-1]
            result = get_other_indicators(code_data)
        elif dataFrom == 'db':
            code_sql = 'SELECT * from stock_fill where stock_code=' + code[1:] + ' order by date asc;'
            code_delete_list = ['id', 'stock_code', 'stock_name', 'modify']
            result = deal_dataFrame(code_sql, code_delete_list)
        else:
            pass
        # Collect the per-stock frames
        dataList.append(result)
    # Join on date and drop the NaN rows; pandas realigns the frames on their
    # index when concatenating
    df = pd.concat(dataList, axis=1)
    new_df = df.sort_index()
    print('new_df:', new_df.shape)
    # print('new_df data:', new_df[:5])
    new_df.dropna(axis=0, inplace=True)

    global now_df
    # Extract the row for `today`; for date-typed indices also delete it (and
    # everything after it) from the training data
    date_index = new_df.index.tolist()
    if isinstance(date_index[0], str):
        try:
            # Samples trained from the csv files used this date as the split point
            # now_index = date_index.index('2018-12-14')
            # delete_index_list = date_index[now_index:]
            now_df = pd.DataFrame(new_df, index=[today])
            # new_df.drop(index=delete_index_list, inplace=True)
        except Exception as e:
            print(str(e))
    else:
        try:
            now_index = date_index.index(datetime.date(2018, 12, 17))
            delete_index_list = date_index[now_index:]
            date = datetime.datetime.strptime(today, '%Y-%m-%d')
            now_df = pd.DataFrame(new_df, index=[date])
            new_df.drop(index=delete_index_list, inplace=True)
        except Exception as e:
            print(str(e))

    deal_result = new_df
    today_x = dataX_today(now_df)
    data_x = dataX_from_dataFrame(deal_result)
    today_x2 = get_today_data(data_x, today_x, n_components=n_components)
    if classfic == 2:
        if daySpan == 0:
            if data_x.shape[1] > 80:
                data_y = dataY_from_dataFrame(deal_result)
            else:
                # No correlated stocks were joined in
                data_y = dataY_no_correlation(deal_result)
        else:
            data_y = dataY_for_Nmean(deal_result, N=daySpan)
            data_x = data_x[:len(data_y)]
    elif classfic == 5:
        if daySpan == 0:
            data_y = dataY_from_dataFrame_5(deal_result)
        else:
            data_y = dataY_for_Nmean(deal_result, N=daySpan)
            data_x = data_x[:len(data_y)]
    else:
        pass
    final_data_x = nr.standardized_mars(data_x)
    print('final_data_x:', final_data_x.shape)
    # Run PCA on 100% of the data
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=n_components)
    predict_y, future_y = random_forest((pca_x, data_y), another_data_x=today_x2)
    print('Model score: ', getScore())
    print('Predicted trend for today: ', future_y)
    del pca_x
    del final_data_x
    del today_x
    del deal_result
    del date_index
    del dataList
    return (getScore(), future_y)
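# Usage sketch (hedged): stock codes in code_correlation.csv carry a leading
# quote character (see fit_SVM below), so a call could look like:
# score, trend = get_fit_model(daySpan=0, stock_code="'000021",
#                              today='2018-12-17', dataFrom='db')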
def singular(daySpan=0, stock_code=''):
    # stock_code = '\'600775'
    collection = pd.read_csv('/home/mars/Data/finialData/code_correlation.csv',
                             index_col=0)
    collection = collection.sort_values(stock_code, ascending=False)
    # Take the most correlated stocks (first 5)
    top_10 = collection.index.tolist()
    top_10 = top_10[:5]
    # Load the data for each of them
    dataList = []
    code_name = []
    for code in top_10:
        # Skip stocks whose correlation coefficient is below 0.6
        if collection.loc[code, stock_code] < 0.6:
            continue
        code_sql = ('SELECT * from stock_fill where stock_code=' + code[1:] +
                    ' and date < "2018-12-15" order by date asc;')
        code_delete_list = ['id', 'stock_code', 'stock_name', 'modify']
        codeData = deal_dataFrame(code_sql, code_delete_list)
        # Collect the per-stock frames
        dataList.append(codeData)
    # Join on date and drop the NaN rows; pandas realigns the frames on their
    # index when concatenating
    df = pd.concat(dataList, axis=1)
    new_df = df.sort_index()
    print('new_df:', new_df.shape)
    print('new_df data:', new_df[:5])
    new_df.dropna(axis=0, inplace=True)
    # Index of dates
    global date_index
    date_index = new_df.index.tolist()
    deal_result = new_df
    print('shape:', deal_result.shape)
    data_x = dataX_from_dataFrame(deal_result)
    if daySpan == 0:
        if data_x.shape[1] > 80:
            data_y = dataY_from_dataFrame_5(deal_result)
        else:
            # No correlated stocks were joined in
            data_y = dataY_5_no_correlation(deal_result)
    else:
        data_y = dataY_for_Nmean(deal_result, N=daySpan)
        data_x = data_x[:len(data_y)]
    final_data_x = nr.standardized_mars(data_x)
    print(final_data_x.shape)
    # Use the PCA data directly: outlier-process it, then train the random forest
    all_y = data_y
    number = len(all_y)
    scoreInfoList = []
    # Sweep the number of PCA components, recording (score, outlier ratio, i)
    for i in range(6, final_data_x.shape[1] - 10, 1):
        pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=i)
        print(pca_x.shape)
        # Outlier handling
        lof_data_x = oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
        print(pca_x.shape)
        predict_y = random_forest((pca_x, all_y))
        ratio_ss = len(oc.get_delete_index()) / number
        scoreInfoList.append((getScore(), ratio_ss, i))
    scoreList = []
    compent = []
    for one in scoreInfoList:
        score = one[0]
        scoreList.append(score)
        compent.append(one[2])
    '''
    plt.title(stock_code + ' --- score of component')
    plt.xlabel('component')
    plt.ylabel('score')
    plt.plot(compent, scoreList, 'r-o')
    max_indx = np.argmax(scoreList)  # index of the max value
    suit_compent = compent[max_indx]
    plt.plot(suit_compent, scoreList[max_indx], 'ks')
    show_max = '[' + str(suit_compent) + ' ' + str(round(scoreList[max_indx], 4)) + ']'
    plt.annotate(show_max, xytext=(suit_compent, scoreList[max_indx]),
                 xy=(suit_compent, scoreList[max_indx]))
    plt.show()
    '''
    max_score = max(scoreList)
    max_index = scoreList.index(max_score)
    error_ratio = scoreInfoList[max_index][1]
    component = scoreInfoList[max_index][2]
    del scoreInfoList
    del scoreList
    print(max_score, error_ratio, component)
    return (max_score, error_ratio, component)
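# Usage sketch (hedged; the code value mirrors the example commented at the
# top of the function body):
# max_score, error_ratio, component = singular(daySpan=0, stock_code="'600775")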
def fit_SVM(daySpan=0, code=None):
    stock_code = '\'000021'
    collection = pd.read_csv('/home/mars/Data/finialData/code_correlation.csv',
                             index_col=0)
    collection = collection.sort_values(stock_code, ascending=False)
    print(collection)
    # Take the most correlated stocks (first 5)
    top_10 = collection.index.tolist()
    top_10 = top_10[:5]
    # Load the data for each of them
    dataList = []
    code_name = []
    for code in top_10:
        code_name.append(code)
        print('code:', code[1:])
        path = '/home/mars/Data/finialData/electronic_infomation/' + code[1:] + '.csv'
        code_data = pd.read_csv(path, index_col='date')
        result = get_other_indicators(code_data)
        # Collect the per-stock frames
        dataList.append(result)
    # Join on date and drop the NaN rows; pandas realigns the frames on their
    # index when concatenating
    df = pd.concat(dataList, axis=1)
    new_df = df.sort_index()
    print('new_df:', new_df[:5])
    new_df.dropna(axis=0, inplace=True)
    print('new_df2:', new_df.get('price_change'))
    print('all shape:', new_df.shape)
    deal_result = new_df
    data_x = dataX_from_dataFrame(deal_result)
    if daySpan == 0:
        data_y = dataY_from_dataFrame(deal_result)
    else:
        data_y = dataY_for_Nmean(deal_result, N=daySpan)
        data_x = data_x[:len(data_y)]
    s_deal_data = (data_x, data_y)
    final_data_x = nr.standardized_mars(s_deal_data[0])
    print(final_data_x.shape)
    all_y = s_deal_data[1]
    scoreListInfo = []
    # Sweep the number of PCA components and the RBF gamma, recording each score
    for i in range(6, 40, 1):
        pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=i)
        print(pca_x.shape)
        # Outlier handling
        lof_data_x = oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
        print(pca_x.shape)
        x_train, x_test, y_train, y_test = train_test_split(
            lof_data_x, all_y, test_size=0.3, random_state=0, shuffle=False)
        # Fit the model; here `c` is used as the RBF gamma
        # for fig_num, kernel in enumerate(('linear', 'rbf', 'poly', 'sigmoid')):
        for c in np.arange(0.1, 1, 0.1):
            clf = svm.SVC(gamma=c, kernel='rbf')
            clf.fit(x_train, y_train)
            score = clf.score(x_test, y_test)
            print(score)
            scoreListInfo.append((score, i, c))
    scoreList = []
    for one in scoreListInfo:
        score = one[0]
        scoreList.append(score)
    max_score = max(scoreList)
    max_index = scoreList.index(max_score)
    components = scoreListInfo[max_index][1]
    c = scoreListInfo[max_index][2]
    del scoreListInfo
    del scoreList
    print('best parameters:')
    print(max_score, c, components)
    return (max_score, c, components)