def compare_s_no(dataPath=""):
    """Plot the target series before and after LOF-based replacement.

    The CSV at ``dataPath`` is loaded, extended by ``get_other_indicators``,
    stripped of rows containing NaN and cut down to its last 100 rows.
    ``deal_data_from_dataFrame`` splits those rows into features (element 0)
    and targets (element 1). The standardised features are reduced to 54
    components and handed to ``oc.LOF_PCA_for_Clustering``; the positions
    returned by ``oc.get_delete_index()`` are highlighted in red on both
    sub-plots.

    Args:
        dataPath: Path of the CSV file passed to ``pandas.read_csv``.

    Returns:
        None. Prints the standardised feature shape and both series lengths,
        then opens a matplotlib window.
    """
    raw_frame = pd.read_csv(dataPath)
    # Add the additional indicators.
    enriched = get_other_indicators(raw_frame)
    recent_rows = enriched.dropna(axis=0)[-100:]

    # Split the cleaned rows into (features, targets).
    split = deal_data_from_dataFrame(recent_rows)
    print(nr.standardized_mars(split[0]).shape)

    data_y = split[1]
    data_x = split[0]
    # NOTE(review): the same features are standardised a second time here;
    # presumably the earlier result could be reused -- confirm that
    # standardized_mars has no side effects before deduplicating.
    scaled_x = nr.standardized_mars(data_x)

    # PCA over 100% of the data.
    reduced_x = PCA_mars.getPcaComponent(scaled_x, n_components=54)

    # Outlier handling; the detector's results are read back through the
    # oc accessor functions below.
    oc.LOF_PCA_for_Clustering(reduced_x, isUsePCA=False)
    lof_pred = oc.get_pred_test()
    error_index = oc.get_delete_index()
    lof_data_y = oc.replace_Singular(data_y, lof_pred)

    # Shared styling for the highlighted (flagged) points.
    highlight = dict(c='r', alpha=0.2)
    _figure, (before_ax, after_ax) = plt.subplots(1, 2, sharex=True)

    # Left panel: original targets with the flagged positions overlaid.
    before_ax.scatter(range(len(data_y)), data_y, label='data_y')
    flagged_before = data_y[error_index]
    before_ax.scatter(error_index, flagged_before, label='error_y', **highlight)
    before_ax.legend()

    # Right panel: targets after replacement, same positions overlaid.
    flagged_after = lof_data_y[error_index]
    after_ax.scatter(range(len(lof_data_y)), lof_data_y, label='lof_data_y')
    after_ax.scatter(error_index, flagged_after, label='error_lof_y', **highlight)
    after_ax.legend()

    print(len(data_y))
    print(len(lof_data_y))
    # Adjust the spacing between the sub-plots.
    plt.tight_layout()
    plt.show()
def analyze_lof(dataPath=""):
    """Write a per-date outlier flag file for the CSV at ``dataPath``.

    The CSV is read, its row order reversed, extended by
    ``get_other_indicators`` and stripped of rows containing NaN. The first
    element returned by ``deal_data`` is standardised, reduced to 62 PCA
    components and passed to ``oc.LOF_PCA_for_Clustering``. Every index
    label of the cleaned frame is then written to ``300113_data.csv`` with a
    ``1`` when its position appears in ``oc.get_delete_index()`` and a ``0``
    otherwise.

    Args:
        dataPath: Path of the CSV file passed to ``pandas.read_csv``.

    Returns:
        None. Prints the PCA output shape and the flagged positions, and
        writes ``300113_data.csv`` in the current working directory.
    """
    frame = pd.read_csv(dataPath)[::-1]
    # Add the additional indicators.
    clean = get_other_indicators(frame).dropna(axis=0)

    features = deal_data(clean)[0]
    scaled = nr.standardized_mars(features)
    reduced = PCA_mars.getPcaComponent(scaled, n_components=62)
    print(reduced.shape)

    # Outlier handling on the features; the return value is not needed here,
    # the flagged positions are read back from oc.
    oc.LOF_PCA_for_Clustering(reduced, isUsePCA=False)
    error_index = oc.get_delete_index()
    print(error_index)

    dates = clean.index.tolist()
    # Write every date; the flag is 1 where an outlier was detected.
    with open('300113_data.csv', 'w+') as out:
        out.write(','.join(('date', '300113_Sigular')) + '\n')
        for position, date in enumerate(dates):
            # Membership is tested on the row position, not the index label.
            flag = '1' if position in error_index else '0'
            # The label is written as-is, so it must already be a str.
            out.write(date)
            out.write(',' + flag + '\n')
def fit_randomForest_del(daySpan=0, dataPath=""):
    """Fit the random forest on data with the LOF-flagged rows removed.

    The CSV at ``dataPath`` is loaded, extended by ``get_other_indicators``
    and stripped of rows containing NaN. Features come from
    ``dataX_from_dataFrame``; labels come from ``dataY_from_dataFrame`` when
    ``daySpan`` is 0 and from ``dataY_for_Nmean`` otherwise. The standardised
    features are reduced to 2 PCA components and passed to
    ``oc.LOF_PCA_for_Clustering_del``. The rows listed by
    ``oc.get_delete_index()`` are deleted from the labels before
    ``random_forest`` is called.

    Args:
        daySpan: 0 selects ``dataY_from_dataFrame``; any other value is
            forwarded as ``N`` to ``dataY_for_Nmean``.
        dataPath: Path of the CSV file passed to ``pandas.read_csv``.

    Returns:
        None. The copied result of ``random_forest`` is discarded; several
        array shapes are printed along the way.
    """
    frame = pd.read_csv(dataPath)
    # Add the additional indicators.
    clean = get_other_indicators(frame).dropna(axis=0)

    data_x = dataX_from_dataFrame(clean)
    if daySpan != 0:
        data_y = dataY_for_Nmean(clean, N=daySpan)
        # NOTE(review): truncation is applied only on the N-day path --
        # confirm that dataY_from_dataFrame already yields labels aligned
        # with data_x.
        data_x = data_x[:len(data_y)]
    else:
        # Original note: process X, binarise the Y values.
        data_y = dataY_from_dataFrame(clean)

    scaled_x = nr.standardized_mars(data_x)
    print(scaled_x.shape)

    # PCA over 100% of the data.
    reduced_x = PCA_mars.getPcaComponent(scaled_x, n_components=2)
    print(reduced_x.shape)

    # Outlier handling: the helper returns the features to train on, and the
    # matching label rows are removed using the indices it recorded.
    kept_x = oc.LOF_PCA_for_Clustering_del(reduced_x, isUsePCA=False)
    removed_rows = oc.get_delete_index()
    kept_y = np.delete(data_y, removed_rows, axis=0)

    print(kept_x.shape)
    print('y', data_y.shape)

    # The copy is not returned or stored anywhere beyond this local.
    _ = np.copy(random_forest((kept_x, kept_y)))
def analyze_lof_sql(code=""):
    """Write a per-date outlier flag file for the rows loaded via SQL.

    Rows are fetched with ``deal_dataFrame`` using a fixed query against
    ``global_info``. The first element returned by ``deal_data`` is
    standardised, reduced to 62 PCA components and passed to
    ``oc.LOF_PCA_for_Clustering``. Every index label of the loaded frame is
    then written to ``NDX_data.csv``, formatted as ``YYYY-MM-DD``, with a
    ``1`` when its position appears in ``oc.get_delete_index()`` and a ``0``
    otherwise.

    Args:
        code: Accepted but not used by this function.

    Returns:
        None. Prints the PCA output shape and the flagged positions, and
        writes ``NDX_data.csv`` in the current working directory.
    """
    NDX_sql = 'SELECT open,close,low,high,volume,other,change_rate, DATE_ADD(date,INTERVAL 1 DAY) as date from global_info where industry_name = "纳斯达克" order by date asc;'
    # No columns are dropped (empty delete list).
    # Original note: NaN values are forward-filled -- presumably inside
    # deal_dataFrame; verify against that helper.
    NDXData = deal_dataFrame(NDX_sql, [])

    features = deal_data(NDXData)[0]
    scaled = nr.standardized_mars(features)
    reduced = PCA_mars.getPcaComponent(scaled, n_components=62)
    print(reduced.shape)

    # Outlier handling on the features; the return value is not needed here,
    # the flagged positions are read back from oc.
    oc.LOF_PCA_for_Clustering(reduced, isUsePCA=False)
    error_index = oc.get_delete_index()
    print(error_index)

    dates = NDXData.index.tolist()
    # Write every date; the flag is 1 where an outlier was detected.
    with open('NDX_data.csv', 'w+') as out:
        out.write(','.join(('date', 'NDX_Sigular')) + '\n')
        for position, date in enumerate(dates):
            # Membership is tested on the row position, not the index label.
            flag = '1' if position in error_index else '0'
            out.write(date.strftime('%Y-%m-%d') + ',' + flag + '\n')
def singular(daySpan=0, stock_code=''):
    """Sweep PCA component counts and report the best-scoring setting.

    The correlation table is sorted by the ``stock_code`` column in
    descending order and its first five index entries are taken. Entries
    whose correlation with ``stock_code`` is below 0.6 are skipped. For the
    rest, rows are loaded through ``deal_dataFrame`` and concatenated
    column-wise on their index, and rows containing NaN are dropped. The
    surviving index labels are published in the module-level ``date_index``.

    Features come from ``dataX_from_dataFrame``. With ``daySpan == 0`` the
    labels come from ``dataY_from_dataFrame_5`` when there are more than 80
    feature columns and from ``dataY_5_no_correlation`` otherwise (the
    original comment reads: "no correlated stocks participate"). For any
    other ``daySpan`` the labels come from ``dataY_for_Nmean``.

    For every component count ``i`` in ``range(6, n_columns - 10)`` the
    standardised features are reduced with PCA, passed to
    ``oc.LOF_PCA_for_Clustering`` and fitted with ``random_forest``.
    ``getScore()`` and the fraction of flagged rows are recorded per count.

    Args:
        daySpan: 0 selects the ``*_5`` label helpers; any other value is
            forwarded as ``N`` to ``dataY_for_Nmean``.
        stock_code: Column name in the correlation CSV to rank by.

    Returns:
        Tuple ``(max_score, error_ratio, component)`` for the first
        component count that reached the highest score.

    Raises:
        ValueError: If the standardised feature matrix has too few columns
            for the sweep to run at all (previously surfaced as an opaque
            "max() arg is an empty sequence").
    """
    global date_index

    collection = pd.read_csv('/home/mars/Data/finialData/code_correlation.csv', index_col=0)
    collection = collection.sort_values(stock_code, ascending=False)
    # Only the five most correlated entries are considered.
    candidate_codes = collection.index.tolist()[:5]

    # Load the data of every sufficiently correlated stock.
    dataList = []
    for code in candidate_codes:
        # Skip stocks whose correlation coefficient is below 0.6.
        if collection.loc[code, stock_code] < 0.6:
            continue
        # The first character of the index entry is stripped before it is
        # placed in the query.
        # NOTE(review): the query is built by string concatenation; confirm
        # the correlation CSV is a trusted source.
        code_sql = 'SELECT * from stock_fill where stock_code=' + code[1:] + ' and date < "2018-12-15" order by date asc;'
        code_delete_list = ['id', 'stock_code', 'stock_name', 'modify']
        dataList.append(deal_dataFrame(code_sql, code_delete_list))

    # Join on the (time) index; pandas aligns the frames on their index.
    new_df = pd.concat(dataList, axis=1).sort_index()
    print('new_df:', new_df.shape)
    print('new_df data:', new_df[:5])
    new_df.dropna(axis=0, inplace=True)

    # Time index, exposed at module level.
    date_index = new_df.index.tolist()

    deal_result = new_df
    print('shape:', deal_result.shape)
    data_x = dataX_from_dataFrame(deal_result)

    if daySpan == 0:
        if data_x.shape[1] > 80:
            data_y = dataY_from_dataFrame_5(deal_result)
        else:
            # Original note: no correlated stocks participate.
            data_y = dataY_5_no_correlation(deal_result)
    else:
        data_y = dataY_for_Nmean(deal_result, N=daySpan)
        # NOTE(review): truncation is applied only on the N-day path --
        # confirm the other label helpers already align with data_x.
        data_x = data_x[:len(data_y)]

    final_data_x = nr.standardized_mars(data_x)
    print(final_data_x.shape)

    all_y = data_y
    number = len(all_y)

    # One (score, flagged_ratio, n_components) entry per component count.
    scoreInfoList = []
    for i in range(6, final_data_x.shape[1] - 10, 1):
        pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=i)
        print(pca_x.shape)
        # Outlier handling; only the recorded indices are used afterwards.
        oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
        print(pca_x.shape)
        # The fit's return value is unused; the score is read via getScore().
        random_forest((pca_x, all_y))
        ratio_ss = len(oc.get_delete_index()) / number
        scoreInfoList.append((getScore(), ratio_ss, i))

    if not scoreInfoList:
        raise ValueError(
            'singular: no PCA component counts to evaluate; the feature '
            'matrix has {} columns but more than 16 are required'.format(
                final_data_x.shape[1]))

    # max() keeps the first entry that attains the highest score, matching
    # a max-then-index lookup on the score list.
    max_score, error_ratio, component = max(
        scoreInfoList, key=lambda entry: entry[0])

    print(max_score, error_ratio, component)
    return (max_score, error_ratio, component)