def load_data(code, type, period, index_name):
    """
    Load three kinds of data for the given period: fund/stock prices,
    a (market) index, and treasury bonds (the risk-free rate).
    :param code: fund or stock code
    :param type: stock|fund
    :param period: one of day, week, month, year
    :param index_name: index name, in Chinese
    :return:
    """
    # Load the fund/stock data
    if type == const.FUND:
        data = data_utils.load_fund_data(code)
        if data is None:
            logger.warning("[%s] data is broken, skipping it...", code)
            return -999, None, None
        data_rate = calculate_rate(data, const.COL_ACCUMULATIVE_NET, period, 'price')
    elif type == const.STOCK:
        data = data_utils.load_stock_data(code)
        data_rate = calculate_rate(data, 'close', period, 'price')
    else:
        raise ValueError("invalid type: " + type)

    # Load the index data
    index_data = data_utils.load_index_data_by_name(index_name, period)
    index_rate = data_utils.calculate_rate(index_data, 'close', period)

    # Load the risk-free rate (annual rate / 365 = daily rate)
    bond_rate = data_utils.load_bond_interest_data() / PERIOD_NUM[period]
    bond_rate = calculate_rate(bond_rate, '收盘', period, 'rate')

    return data_rate, index_rate, bond_rate
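# calculate_rate(data, column, period, kind) is used throughout but defined
# elsewhere. A minimal sketch of the idea it is assumed to implement --
# resample a price column to the period end and take the period-over-period
# change. This is hypothetical: the real helper also honors the
# 'price'/'rate' kind flag, and _PERIOD_RULES_SKETCH is an assumed mapping.
import pandas as pd

_PERIOD_RULES_SKETCH = {'day': 'D', 'week': 'W', 'month': 'M', 'year': 'Y'}  # assumed

def calculate_rate_sketch(data: pd.DataFrame, column: str, period: str) -> pd.Series:
    """Resample the price column (DatetimeIndex assumed) and compute returns."""
    prices = data[column].resample(_PERIOD_RULES_SKETCH[period]).last().dropna()
    return prices.pct_change().dropna()  # one return per period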
def calculate_one_fund_by_period(fund, period):
    # # Skip funds that only started recently
    # if fund.start_date > datetime.strptime('2020-1-1', DATE_FORMAT).date():
    #     logger.debug("Fund start date [%r] is too recent to be worth analyzing", fund.start_date)
    #     return None

    start_year = fund.start_date.year
    end_year = datetime.now().date().year
    periods = []
    for year in range(start_year, end_year + 1):
        periods += date_utils.get_peroid(year, period)

    trade_data = data_utils.load_fund_data(fund.code)
    if trade_data is None:
        return None
    data = filter_trade_by_period(trade_data, periods)
    logger.debug("Filtered %d fund NAV records, %r~%r", len(data), data.index[0], data.index[-1])

    bond_interests = data_utils.load_bond_interest_data(data.index)
    bond_interests = calculate_rate(bond_interests, '收盘', period)
    logger.debug("Filtered %d benchmark-rate records", len(bond_interests))
    # assert len(data) == len(bond_interests), "fund NAV and benchmark-rate record counts differ"

    sharpe_ratio = calculate_sharpe(data, bond_interests, period)
    return sharpe_ratio
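# calculate_sharpe above is defined elsewhere; it is assumed to compute the
# classic Sharpe ratio: mean excess return over the risk-free rate, divided
# by the standard deviation of returns, annualized. A hedged sketch, with an
# assumed periods-per-year mapping (names are illustrative, not the project's):
import numpy as np
import pandas as pd

_PERIODS_PER_YEAR_SKETCH = {'day': 252, 'week': 52, 'month': 12, 'year': 1}  # assumed

def calculate_sharpe_sketch(returns: pd.Series, risk_free: pd.Series, period: str) -> float:
    # align the risk-free series to the return dates; missing dates treated as 0
    excess = returns - risk_free.reindex(returns.index).fillna(0)
    return float(excess.mean() / excess.std() * np.sqrt(_PERIODS_PER_YEAR_SKETCH[period]))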
def test_KS_test():
    """
    https://www.cnblogs.com/eat-drink-breathe-hard/p/13798547.html

    As Stijn pointed out, the k-s test returns a D statistic and a p-value
    corresponding to the D statistic. The D statistic is the absolute max
    distance (supremum) between the CDFs of the two samples. The closer this
    number is to 0 the more likely it is that the two samples were drawn from
    the same distribution. Check out the Wikipedia page for the k-s test. It
    provides a good explanation:
    https://en.m.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test

    The p-value returned by the k-s test has the same interpretation as other
    p-values. You reject the null hypothesis that the two samples were drawn
    from the same distribution if the p-value is less than your significance
    level. You can find tables online for the conversion of the D statistic
    into a p-value if you are interested in the procedure.
    """
    data = data_utils.load_fund_data('519778')
    data = data[const.COL_DAILY_RATE]  # kstest expects a 1-D sample, so use the Series, not a one-column DataFrame
    data = data.dropna()
    test_stat = stats.kstest(data, 'norm', args=(data.mean(), data.std()))
    logger.debug("KS test result: %r", test_stat)
    if test_stat.pvalue < 0.05:
        logger.debug("KS test on %d records, p=%.2f < 0.05: reject the null hypothesis (normality), data is not normally distributed",
                     len(data), test_stat.pvalue)
    else:
        logger.debug("KS test on %d records, p=%.2f > 0.05: cannot reject the null hypothesis (normality), data is probably normally distributed",
                     len(data), test_stat.pvalue)
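# Self-contained illustration of the same kstest call on synthetic data, so
# the D-statistic / p-value interpretation can be verified without fund data:
import numpy as np
from scipy import stats

def demo_kstest_on_normal_sample():
    rng = np.random.default_rng(0)
    sample = rng.normal(loc=0.0, scale=1.0, size=500)
    d_stat, p = stats.kstest(sample, 'norm', args=(sample.mean(), sample.std()))
    # For a truly normal sample, p is usually well above 0.05, so the null
    # hypothesis (the sample is normal) is not rejected.
    print(d_stat, p)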
def main(args):
    codes = args.codes.split(",")
    data_list = [data_utils.load_fund_data(code) for code in codes]
    data = data_utils.merge_by_date(data_list, [COL_DAILY_RATE] * len(data_list), codes)
    calculated_data = calculate(data, args.sample)
    show(calculated_data)
def crawle_one(self, code, force=False, period=None):
    total_data = load_fund_data(code)
    if force:
        logger.info("Force re-crawling fund [%s]", code)
        total_data = None

    start_date, end_date = self.get_start_end_date(code, total_data)
    if start_date is None and end_date is None:
        logger.info("Crawling [%s] failed: cannot determine the start/end dates", code)
        return
    if start_date == end_date:
        logger.info("No need to crawl [%s]: start and end dates are the same [%r]", code, start_date)
        return

    logger.info("About to crawl data from [%s] to [%s]", start_date, end_date)
    page_num = self.get_page_num(code, start_date, end_date)
    for i in range(1, page_num + 1):
        html = self.get_content(code, i, NUM_PER_PAGE, start_date, end_date)
        data = self.parse_html(html)
        if data is None:
            continue

        # Fix the column dtypes
        data[COL_DATE] = pd.to_datetime(data[COL_DATE], format=const.DATE_FORMAT)
        data.set_index([COL_DATE], inplace=True)  # set the index up front, to stay compatible with the old data
        data[COL_UNIT_NET] = data[COL_UNIT_NET].astype(float)
        data[COL_ACCUMULATIVE_NET] = data[COL_ACCUMULATIVE_NET].astype(float)
        data[COL_DAILY_RATE] = data[COL_DAILY_RATE].str.strip('%').astype(float)

        if total_data is None:
            total_data = data
            logger.debug("Fund [%s] did not exist yet, created it with [%d] rows", code, len(data))
        else:
            total_data = pd.concat([total_data, data])  # DataFrame.append is deprecated
            logger.debug("Appended [%d] rows to fund [%s], [%d] rows in total", len(data), code, len(total_data))
        time.sleep(random.random() * 1)
        logger.info("Finished crawling page %d, moving on to page %d", i, i + 1)

    if total_data is None:
        logger.error("Crawling code [%s] failed!!!", code)  # the code argument was missing here
        return
    data_path = save_fund_data(code, total_data)
    logger.info("Saved all %d rows to [%s]", len(total_data), data_path)
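# The two recurring patterns in crawle_one -- stripping '%' off rate strings
# and accumulating per-page frames -- shown on toy data. The column name is
# illustrative; pd.concat replaces the deprecated DataFrame.append:
import pandas as pd

def demo_parse_and_accumulate():
    page = pd.DataFrame({'rate': ['1.20%', '-0.35%']})
    page['rate'] = page['rate'].str.strip('%').astype(float)  # '1.20%' -> 1.2
    total = None
    for chunk in [page]:
        total = chunk if total is None else pd.concat([total, chunk])
    print(total)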
def random_caculate(args):
    files = os.listdir(const.FUND_DATA_DIR)
    random.shuffle(files)
    if args.code:
        num = 1
        files = [args.code + ".csv"]
    else:
        num = args.num

    result = None
    counter = 0
    for f in files:
        code, _ = os.path.splitext(f)
        data = data_utils.load_fund_data(code)
        if data is None:
            continue
        if data.index[0] > date_utils.str2date(args.start) or \
                data.index[-1] < date_utils.str2date(args.end):
            continue
        # logger.debug("start:%r/%r", data.index[0], date_utils.str2date(args.start))
        # logger.debug("end:%r/%r", data.index[-1], date_utils.str2date(args.end))
        if counter >= num:  # was "counter > num", which processed one fund too many
            break
        data = data[[const.COL_DAILY_RATE]]  # only keep the daily-rate column
        data.columns = [code]  # name the column after the fund code
        if result is None:
            result = data
        else:
            result = pd.concat([data, result], axis=1)
        result = result.dropna(how="any", axis=0)
        # logger.debug("result so far: %r", result)
        counter += 1

    # logger.debug("final result:\n%r", result)
    logger.debug("=============================================")
    logger.debug("describe:\n%r", result.describe())
    logger.debug("=============================================")
    result.info()  # DataFrame.info() prints directly and returns None, so don't pass it to the logger
    logger.debug("=============================================")
    logger.debug("covariance:\n%r", result.cov())
    logger.debug("=============================================")
    logger.debug("correlation:\n%r", result.corr())
    # plot(result[const.COL_DAILY_RATE])
    logger.debug("=============================================")
    logger.debug("Out of [%d] funds selected [%d], spanning [%d] days, stacked skewness:\n%r",
                 len(files), counter, len(result), result.skew())
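# What the describe/cov/corr/skew dump above produces, on synthetic
# daily-rate columns (purely illustrative data, not fund results):
import numpy as np
import pandas as pd

def demo_cross_fund_stats():
    rng = np.random.default_rng(42)
    demo = pd.DataFrame(rng.normal(0, 1, size=(250, 3)), columns=['f1', 'f2', 'f3'])
    print(demo.cov())    # pairwise covariance of the daily rates
    print(demo.corr())   # pairwise correlation
    print(demo.skew())   # per-column skewness of the return distribution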
def load_data(self, args):
    data = data_utils.load_fund_data(args.code)
    if data is None:
        raise ValueError("data does not exist, code: " + args.code)
    index_data = data_utils.load_index_data_by_name('上证指数')
    index_rate = data_utils.calculate_rate(index_data, 'close')
    self.load_info(args.code)
    return data, index_data, index_rate
def test_shapiro_test():
    """
    https://zhuanlan.zhihu.com/p/26539771
    https://www.jianshu.com/p/e202069489a6
    Test whether the data follows a normal distribution
    """
    data = data_utils.load_fund_data('519778').iloc[0:50]
    W, p_value = stats.shapiro(data[const.COL_DAILY_RATE])
    if p_value < 0.05:
        logger.debug("W=%.2f, p=%.2f: reject the null hypothesis (normality), data is not normally distributed", W, p_value)
    else:
        logger.debug("W=%.2f, p=%.2f: accept the null hypothesis (normality), data is normally distributed", W, p_value)
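# stats.shapiro on synthetic data, for reference. The Shapiro-Wilk test is
# most reliable on small samples, which is why the function above only takes
# the first 50 rows:
import numpy as np
from scipy import stats

def demo_shapiro_on_normal_sample():
    rng = np.random.default_rng(1)
    w, p = stats.shapiro(rng.normal(size=50))
    print(w, p)  # p > 0.05 here: no evidence against normality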
def main(args):
    data = data_utils.load_fund_data(args.code)
    if data is None:
        return
    data = data.loc[args.start:args.end]

    start_time = time.time()
    invest_data = filter_invest_by(data, args.period, args.day)
    # print(invest_data.info())
    # print(invest_data.describe())
    # print(invest_data)

    price_of_last_day = data[COL_ACCUMULATIVE_NET].iloc[-1]  # scalar, so %.2f below formats correctly
    logger.debug("Price on the last day [%s]: %.2f", data.index[-1], price_of_last_day)

    profit_percentage = invest(invest_data, price_of_last_day)
    logger.info("Code [%s], fixed [%s] investing %d times, [%s] -> [%s], return: %.3f%%, elapsed: %.2f",
                args.code, PERIOD_NAMES[args.period], len(invest_data), args.start, args.end,
                profit_percentage * 100 - 100, time.time() - start_time)
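# invest() above is defined elsewhere. Judging from how its result is used
# (profit_percentage * 100 - 100), it is assumed to return the final value
# relative to total cost for a fixed-amount investing plan: buy a fixed sum
# at each scheduled date, then value all accumulated shares at the last
# price. A hedged sketch of that arithmetic (names are illustrative):
import pandas as pd

def invest_sketch(prices: pd.Series, final_price: float, amount: float = 100.0) -> float:
    shares = (amount / prices).sum()   # shares bought on each invest day
    cost = amount * len(prices)        # total cash paid in
    return shares * final_price / cost # final value / cost; 1.05 means +5%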
def main(code, threshold):
    data = data_utils.load_fund_data(code)
    data = data[[const.COL_ACCUMULATIVE_NET]]
    # data_mean = resample('1W', how='mean').fillna(0)

    exp_smooth_data = exponential_smoothing(data, alpha=0.1)
    show_plot(x_data=exp_smooth_data.index, y_data=exp_smooth_data, color='y')

    data_diff1 = exp_smooth_data.diff(1)  # first difference of the smoothed curve
    data_diff2 = exp_smooth_data.diff(2)  # data_diff1.diff(1)
    show_plot(x_data=data.index, y_data=data, color='b')
    show_plot(x_data=data_diff1.index, y_data=data_diff1, color='r')
    show_plot(x_data=data_diff2.index, y_data=data_diff2, color='g')

    # mark the points where the second difference crosses the threshold
    up = data_diff2[data_diff2['value'] > threshold]
    down = data_diff2[data_diff2['value'] < -threshold]
    ax1 = plt.gca()
    ax1.scatter(data.loc[up.index].index, data.loc[up.index], color='g')
    ax1.scatter(data.loc[down.index].index, data.loc[down.index], color='r')
    plt.show()
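# exponential_smoothing above is defined elsewhere; it is assumed to be
# simple exponential smoothing, s[t] = alpha * x[t] + (1 - alpha) * s[t-1].
# pandas' ewm with adjust=False implements exactly that recursion, so a
# minimal sketch (hypothetical; the project helper may differ) is:
import pandas as pd

def exponential_smoothing_sketch(data: pd.DataFrame, alpha: float) -> pd.DataFrame:
    return data.ewm(alpha=alpha, adjust=False).mean()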
def main(args):
    codes = args.codes.split(",")
    fund_list = [data_utils.load_fund(code) for code in codes]
    fund_data_list = [data_utils.load_fund_data(code) for code in codes]
    calculate(fund_data_list, fund_list)
def main(fund_list, start=None):
    funds = [data_utils.load_fund_data(f) for f in fund_list]
    calculate(funds, start)