def test():
    """Cluster the indicator data of stock 000021 with MeanShift and plot it.

    The CSV path is hard-coded.  The rows are read, reversed, extended with
    the extra indicators and stripped of rows containing NaN.  The feature
    matrix is standardized, passed through the LOF step, and the samples
    reported by ``oc.get_pred_test()`` are replaced via
    ``oc.replace_Singular``.  MeanShift is then fitted on the full matrix,
    using a bandwidth estimated from the first 70 % of the rows.

    Diagnostics are printed and the close price is plotted together with the
    cluster labels.  Nothing is returned.
    """
    data = pd.read_csv(
        '/home/mars/Data/finialData/electronic_infomation/000021.csv')
    data = data[::-1]
    deal_result = get_other_indicators(data).dropna(axis=0)
    close = deal_result['close']
    print(close.shape)

    # Only the feature part (element 0) of the prepared data is used here.
    data_x = deal_data_from_dataFrame(deal_result)[0]

    # Standardization.
    final_data_x = nr.standardized_mars(data_x)
    # The return value is not needed; the call is kept because
    # oc.get_pred_test() is read right after it (presumably populated by this
    # call -- confirm in oc).
    oc.LOF_PCA_for_Clustering(final_data_x)
    final_data_x_LOF = oc.replace_Singular(final_data_x, oc.get_pred_test())
    print('final_data_x_LOF', final_data_x_LOF[:16])
    print(final_data_x_LOF.shape)

    # Compute clustering with MeanShift.  The bandwidth is estimated on the
    # leading 70 % of the rows only, the fit itself uses every row.
    split_at = int(len(data_x) * 0.7)
    x_train = final_data_x_LOF[:split_at]
    print('x_train', x_train.shape)
    bandwidth = estimate_bandwidth(x_train, quantile=0.2, random_state=1)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=False)
    ms.fit(final_data_x_LOF)
    labels = ms.labels_

    # Everything outside cluster 0 is reported as an "error" sample.
    print('error size', labels[labels != 0].size)
    print('index of not 0 *******')
    print([i for i, x in enumerate(labels) if x != 0])
    print('*******')
    print(labels)
    print(labels.shape)

    n_clusters_ = len(np.unique(labels))
    print("number of estimated clusters : %d" % n_clusters_)

    plt.plot(range(len(close)), close)
    plt.plot(range(len(labels)), labels)
    plt.show()
def compare_s_no(dataPath=""):
    """Plot the target values before and after the LOF-based replacement.

    Args:
        dataPath: CSV file handed to ``pd.read_csv``.

    Only the last 100 NaN-free rows (after the extra indicators are added)
    are used.  The features are standardized, reduced to 54 PCA components
    and passed to the LOF step.  Two scatter plots are shown side by side:
    the original targets on the left and the targets returned by
    ``oc.replace_Singular`` on the right, each with the samples listed by
    ``oc.get_delete_index()`` drawn in red.
    """
    raw = pd.read_csv(dataPath)
    # Add the extra indicators, then keep the last 100 complete rows.
    recent = get_other_indicators(raw).dropna(axis=0)[-100:]

    prepared = deal_data_from_dataFrame(recent)
    scaled_x = nr.standardized_mars(prepared[0])
    print(scaled_x.shape)

    data_x, data_y = prepared[0], prepared[1]
    # The features are standardized a second time before PCA.
    scaled_x = nr.standardized_mars(data_x)
    # PCA on 100 % of the data.
    reduced_x = PCA_mars.getPcaComponent(scaled_x, n_components=54)

    # Singular-value handling: the call is made for the state that the two
    # getters below read back.
    oc.LOF_PCA_for_Clustering(reduced_x, isUsePCA=False)
    lof_pred = oc.get_pred_test()
    error_index = oc.get_delete_index()
    lof_data_y = oc.replace_Singular(data_y, lof_pred)

    _fig, (left, right) = plt.subplots(1, 2, sharex=True)

    left.scatter(range(len(data_y)), data_y, label='data_y')
    left.scatter(error_index, data_y[error_index],
                 label='error_y', c='r', alpha=0.2)
    left.legend()

    flagged_after = lof_data_y[error_index]
    right.scatter(range(len(lof_data_y)), lof_data_y, label='lof_data_y')
    right.scatter(error_index, flagged_after,
                  label='error_lof_y', c='r', alpha=0.2)
    right.legend()

    print(len(data_y))
    print(len(lof_data_y))
    # Adjust the spacing of the canvas.
    plt.tight_layout()
    plt.show()
def analyze_lof(dataPath=""):
    """Write a per-row singular-value flag file for one stock CSV.

    Args:
        dataPath: CSV file handed to ``pd.read_csv``.

    The rows are reversed, extended with the extra indicators and stripped of
    rows containing NaN.  The features are standardized, reduced to 62 PCA
    components and passed to the LOF step.  Every index label of the cleaned
    frame is then written to ``300113_data.csv`` (the file name is fixed and
    does not depend on ``dataPath``) followed by ``1`` when the row position
    is listed by ``oc.get_delete_index()`` and ``0`` otherwise.

    Index labels are converted with ``str`` so that non-string labels (for
    example integers or timestamps) no longer make ``write`` raise
    ``TypeError``; string labels are written unchanged.
    """
    data = pd.read_csv(dataPath)
    data = data[::-1]
    # Add the extra indicators.
    deal_result = get_other_indicators(data).dropna(axis=0)

    final_data = deal_data(deal_result)
    final_data_x = nr.standardized_mars(final_data[0])
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=62)
    print(pca_x.shape)

    # Singular-value handling on x; the return value is not needed, the
    # flagged positions are read back through get_delete_index().
    oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
    error_index = oc.get_delete_index()
    print(error_index)

    row_labels = deal_result.index.tolist()
    # Build the lookup once instead of scanning error_index for every row.
    flagged = set(error_index)

    # Write every date; rows with a singular value are marked with 1.
    with open('300113_data.csv', 'w+') as out:
        out.write('date,300113_Sigular\n')
        for position, label in enumerate(row_labels):
            flag = '1' if position in flagged else '0'
            out.write(str(label) + ',' + flag + '\n')
def analyze_lof_sql(code=""):
    """Write a per-date singular-value flag file for the NASDAQ rows.

    Args:
        code: Accepted but not read by this function.

    The rows are loaded through ``deal_dataFrame`` with a fixed SQL query,
    standardized, reduced to 62 PCA components and passed to the LOF step.
    Each index entry is written to ``NDX_data.csv`` as ``YYYY-MM-DD``
    followed by ``1`` when its position is listed by
    ``oc.get_delete_index()`` and ``0`` otherwise.
    """
    # Rows of the NASDAQ index; the query shifts every date by one day.
    NDX_sql = 'SELECT open,close,low,high,volume,other,change_rate, DATE_ADD(date,INTERVAL 1 DAY) as date from global_info where industry_name = "纳斯达克" order by date asc;'
    NDXData = deal_dataFrame(NDX_sql, [])

    prepared = deal_data(NDXData)
    scaled_x = nr.standardized_mars(prepared[0])
    pca_x = PCA_mars.getPcaComponent(scaled_x, n_components=62)
    print(pca_x.shape)

    # Singular-value handling on x; the flagged positions are read back
    # through get_delete_index().
    oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
    error_index = oc.get_delete_index()
    print(error_index)

    dates = NDXData.index.tolist()
    # Write every date; rows with a singular value are marked with 1.
    with open('NDX_data.csv', 'w+') as out:
        out.write('date' + ',' + 'NDX_Sigular' + '\n')
        for position, day in enumerate(dates):
            flag = '1' if position in error_index else '0'
            out.write(day.strftime('%Y-%m-%d') + ',' + flag + '\n')
def fit_randomForest_repXY(daySpan=0, dataPath=""):
    """Run ``random_forest`` on LOF-processed features and targets.

    Args:
        daySpan: ``0`` takes the targets from ``dataY_from_dataFrame``; any
            other value is passed as ``N`` to ``dataY_for_Nmean`` and the
            features are cut to the length of those targets.
        dataPath: CSV file handed to ``pd.read_csv``.

    The features are standardized and reduced to 2 PCA components before the
    LOF step.  The targets go through ``oc.replace_Singular`` with the values
    from ``oc.get_pred_test()``.  Nothing is returned.
    """
    # Add the extra indicators and drop incomplete rows.
    frame = get_other_indicators(pd.read_csv(dataPath)).dropna(axis=0)

    data_x = dataX_from_dataFrame(frame)
    if daySpan != 0:
        data_y = dataY_for_Nmean(frame, N=daySpan)
        data_x = data_x[:len(data_y)]
    else:
        data_y = dataY_from_dataFrame(frame)

    scaled_x = nr.standardized_mars(data_x)
    print(scaled_x.shape)

    # PCA on 100 % of the data.
    pca_x = PCA_mars.getPcaComponent(scaled_x, n_components=2)
    print(pca_x.shape)

    # Singular-value handling on x ...
    lof_data_x = oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
    # ... and on y.
    print('all_y', data_y[-10:])
    lof_data_y = oc.replace_Singular(data_y, oc.get_pred_test())
    print('lof_y', lof_data_y[-10:])
    print(pca_x.shape)

    random_forest((lof_data_x, lof_data_y))
print('MSy_train,MSy_test:', MSy_train.shape, MSy_test.shape) all_x = np.vstack((MSx_train, MSx_test)) all_y = np.concatenate((MSy_train, MSy_test), axis=0) pca_x = PCA_mars.getPcaComponent(all_x, n_components=35) print('all_y:', all_y.shape) MS_predict_y = random_forest((pca_x, all_y)) print('MS_predict_y', MS_predict_y.shape) ms_score = round(getScore(), 4) del pca_x del all_y del all_x # 获得singular的train_Y print('***********开始测试 singular ********************') pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=50) lof_data_x = oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False) singular_predict_y = random_forest((lof_data_x, data_y)) print('singular_predict_y', singular_predict_y.shape) singular_score = round(getScore(), 4) del lof_data_x del pca_x # 获得original_RF的train_Y print('***********开始测试 original ********************') pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=53) original_predict_y = random_forest((pca_x, data_y)) print('original_predict_y', original_predict_y.shape) original_score = round(getScore(), 4) del pca_x # 获得model4的train_Y
def singular(daySpan=0, stock_code=''):
    """Sweep the PCA component count for one stock and return the best run.

    The correlation table is sorted by the ``stock_code`` column and its
    first five rows are taken; rows whose correlation is below 0.6 are
    skipped.  The data of the remaining codes is loaded through
    ``deal_dataFrame``, concatenated column-wise, sorted by index and
    stripped of rows containing NaN.  For every component count from 6 up to
    (but excluding) ``n_features - 10`` the standardized features are
    reduced with PCA, passed to the LOF step, and ``random_forest`` is run on
    the PCA output; ``getScore()`` is recorded together with the share of
    samples listed by ``oc.get_delete_index()``.

    Side effect: the module-level ``date_index`` is set to the index labels
    of the cleaned frame.

    Args:
        daySpan: ``0`` selects ``dataY_from_dataFrame_5`` (more than 80
            feature columns) or ``dataY_5_no_correlation``; any other value
            is passed as ``N`` to ``dataY_for_Nmean`` and the features are
            cut to the length of those targets.
        stock_code: Column of the correlation table to rank by.

    Returns:
        ``(max_score, error_ratio, component)`` of the first run with the
        highest score.

    Raises:
        ValueError: if there are 16 or fewer feature columns, so that the
            sweep would contain no run at all.
    """
    global date_index

    collection = pd.read_csv('/home/mars/Data/finialData/code_correlation.csv',
                             index_col=0)
    collection = collection.sort_values(stock_code, ascending=False)
    top_codes = collection.index.tolist()[:5]

    # Fetch the data of every sufficiently correlated stock.
    dataList = []
    for code in top_codes:
        # Drop stocks whose correlation coefficient is below 0.6.
        if collection.loc[code, stock_code] < 0.6:
            continue
        # NOTE: the query is assembled by concatenation because
        # deal_dataFrame is handed a finished SQL string; the code comes from
        # the correlation table's index, with its first character removed.
        code_sql = 'SELECT * from stock_fill where stock_code=' + code[
            1:] + ' and date < "2018-12-15" order by date asc;'
        code_delete_list = ['id', 'stock_code', 'stock_name', 'modify']
        dataList.append(deal_dataFrame(code_sql, code_delete_list))

    # pandas aligns the frames on their index; rows with NaN are dropped.
    new_df = pd.concat(dataList, axis=1).sort_index()
    print('new_df:', new_df.shape)
    print('new_df data:', new_df[:5])
    new_df.dropna(axis=0, inplace=True)

    # Index labels of the cleaned frame, published for other code.
    date_index = new_df.index.tolist()

    deal_result = new_df
    print('shape:', deal_result.shape)

    data_x = dataX_from_dataFrame(deal_result)
    if daySpan == 0:
        if data_x.shape[1] > 80:
            data_y = dataY_from_dataFrame_5(deal_result)
        else:
            # No correlated stocks took part.
            data_y = dataY_5_no_correlation(deal_result)
    else:
        data_y = dataY_for_Nmean(deal_result, N=daySpan)
        data_x = data_x[:len(data_y)]

    final_data_x = nr.standardized_mars(data_x)
    print(final_data_x.shape)

    number = len(data_y)
    n_features = final_data_x.shape[1]
    if n_features - 10 <= 6:
        # range(6, n_features - 10) would be empty and max() below would
        # fail with an uninformative message.
        raise ValueError(
            'singular: need more than 16 feature columns to sweep PCA '
            'components, got %d' % n_features)

    scoreInfoList = []
    for i in range(6, n_features - 10):
        pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=i)
        print(pca_x.shape)
        # Singular-value handling; only its effect on get_delete_index() is
        # used, the forest itself is run on the PCA output.
        oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
        print(pca_x.shape)
        random_forest((pca_x, data_y))
        ratio_ss = len(oc.get_delete_index()) / number
        scoreInfoList.append((getScore(), ratio_ss, i))

    # max() keeps the first of several equal scores, i.e. the smallest
    # component count among the best runs.
    max_score, error_ratio, component = max(scoreInfoList,
                                            key=lambda run: run[0])
    print(max_score, error_ratio, component)
    return (max_score, error_ratio, component)
def fit_SVM(daySpan=0, code=None):
    """Grid-search PCA component count and RBF gamma for an SVM classifier.

    The stock is fixed to column ``'000021`` of the correlation table.  The
    table is sorted by that column, the first five codes are taken, their
    CSV files are read and extended with the extra indicators, and the
    frames are concatenated column-wise, sorted by index and stripped of
    rows containing NaN.  For every component count in ``range(6, 40)`` the
    standardized features are reduced with PCA and passed to the LOF step;
    the result is split 70/30 without shuffling and an ``svm.SVC`` with RBF
    kernel is fitted for every gamma in ``np.arange(0.1, 1, 0.1)``.

    Args:
        daySpan: ``0`` takes the targets from ``dataY_from_dataFrame``; any
            other value is passed as ``N`` to ``dataY_for_Nmean`` and the
            features are cut to the length of those targets.
        code: Accepted for interface compatibility; it is not read.

    Returns:
        ``(max_score, gamma, components)`` of the first run with the highest
        test score.
    """
    stock_code = '\'000021'
    collection = pd.read_csv('/home/mars/Data/finialData/code_correlation.csv',
                             index_col=0)
    collection = collection.sort_values(stock_code, ascending=False)
    print(collection)
    top_codes = collection.index.tolist()[:5]

    # Load the data of every selected stock.  The loop variable is named so
    # that it no longer overwrites the ``code`` parameter.
    dataList = []
    for related_code in top_codes:
        print('code:', related_code[1:])
        path = '/home/mars/Data/finialData/electronic_infomation/' + related_code[
            1:] + '.csv'
        code_data = pd.read_csv(path, index_col='date')
        dataList.append(get_other_indicators(code_data))

    # pandas aligns the frames on their index; rows with NaN are dropped.
    new_df = pd.concat(dataList, axis=1).sort_index()
    print('new_df:', new_df[:5])
    new_df.dropna(axis=0, inplace=True)
    print('new_df2:', new_df.get('price_change'))
    print('all shape:', new_df.shape)
    deal_result = new_df

    data_x = dataX_from_dataFrame(deal_result)
    if daySpan == 0:
        data_y = dataY_from_dataFrame(deal_result)
    else:
        data_y = dataY_for_Nmean(deal_result, N=daySpan)
        data_x = data_x[:len(data_y)]

    final_data_x = nr.standardized_mars(data_x)
    print(final_data_x.shape)

    # One (score, components, gamma) entry per fitted model.
    scoreListInfo = []
    for i in range(6, 40):
        pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=i)
        print(pca_x.shape)
        # Singular-value handling.
        lof_data_x = oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
        print(pca_x.shape)
        x_train, x_test, y_train, y_test = train_test_split(
            lof_data_x, data_y, test_size=0.3, random_state=0, shuffle=False)

        # Fit one RBF model per gamma value.
        for gamma in np.arange(0.1, 1, 0.1):
            clf = svm.SVC(gamma=gamma, kernel='rbf')
            clf.fit(x_train, y_train)
            score = clf.score(x_test, y_test)
            print(score)
            scoreListInfo.append((score, i, gamma))

    # max() keeps the first of several equal scores.
    max_score, components, best_gamma = max(scoreListInfo,
                                            key=lambda run: run[0])
    print('best paramers:')
    print(max_score, best_gamma, components)
    return (max_score, best_gamma, components)