示例#1
0
def get_revenue_ttm(test_report_date):
    """
    Revenue TTM: the sum of the four preceding single-quarter revenues.

    The sum is scaled by ``quarter / 4``, where ``quarter`` is the calendar
    quarter of ``test_report_date`` (e.g. for a second quarter the result is
    ``ttm * 2 / 4``).

    :param test_report_date: report date string in ``'%Y-%m-%d'`` format
    :return: DataFrame with ``ticker``, one ``quarter_revenue_<date>`` column
        per quarter used, ``ttm``, ``industry`` and ``market_value``; rows
        containing NA are dropped
    """
    # The four report dates preceding the current one.
    prev_report_dates = [get_report_date(test_report_date, -i) for i in range(1, 5)]

    # ``None`` marks "nothing accumulated yet". Testing ``ret.empty`` instead
    # would restart the accumulation whenever an inner merge leaves no rows.
    ret = None
    for report_date in prev_report_dates:
        quarterly_income = get_quarter_cumul_income(report_date, cumul=False)[['ticker', 'quarter_revenue']]
        if ret is None:
            # Copy so the column assignment below never writes into a view.
            ret = quarterly_income.copy()
            ret['ttm'] = ret['quarter_revenue']
        else:
            # Default inner merge: only tickers present in every quarter survive.
            ret = pd.merge(ret, quarterly_income, on='ticker')
            ret['ttm'] += ret['quarter_revenue']
        ret.rename(columns={'quarter_revenue': 'quarter_revenue_' + report_date}, inplace=True)

    report_date_dt = datetime.strptime(test_report_date, '%Y-%m-%d')
    # Calendar quarter 1..4 for any month (month // 4 + 1 is wrong for 7, 10, 11).
    quarter = (report_date_dt.month - 1) // 3 + 1
    ret['ttm'] = ret['ttm'] * quarter / 4.0

    # Attach industry and market value as of the previous report date.
    ticker_property = get_ticker_property(ret['ticker'].tolist(),
                                          get_report_date(test_report_date, -1), ['industry', 'market_value'])
    industry = ticker_property[['ticker', 'industry']]
    market_value = ticker_property[['ticker', 'market_value']]
    ret = pd.merge(ret, industry, on='ticker')
    ret = pd.merge(ret, market_value, on='ticker')

    # Drop every row that contains an NA.
    ret.dropna(inplace=True)
    return ret
示例#2
0
def get_revenue_ttm_ratio(test_report_date, year):
    """
    Mean yearly log growth rate of single-quarter revenue.

    Two single-quarter revenues are compared per ticker: the one four
    quarters before ``test_report_date`` and the one ``4 * year`` quarters
    before it. The log difference between them covers ``year - 1`` years, so
    the mean yearly rate is ``(log(recent) - log(oldest)) / (year - 1)``.

    :param test_report_date: report date string
    :param year: how many years back the older comparison quarter lies
    :return: DataFrame with ``ticker``, ``quarter_revenue_x`` (recent),
        ``quarter_revenue_y`` (oldest), ``ttm_ratio``, ``industry`` and
        ``market_value``; rows containing NA are dropped
    """
    # Same quarter one year before the report date.
    prev_report_date = get_report_date(test_report_date, -4)
    ret = get_quarter_cumul_income(prev_report_date,
                                   cumul=False)[['ticker', 'quarter_revenue']]

    # Same quarter ``year`` years before the report date.
    prev_report_date = get_report_date(test_report_date, -4 * year)
    temp = get_quarter_cumul_income(prev_report_date,
                                    cumul=False)[['ticker', 'quarter_revenue']]
    ret = pd.merge(ret, temp, on='ticker')
    ret.dropna(inplace=True)

    log_growth = (np.log(ret['quarter_revenue_x']) -
                  np.log(ret['quarter_revenue_y']))
    span_years = year - 1
    if span_years == 0:
        # Both endpoints are the same quarter: keep the zero (or NaN) growth
        # instead of dividing by zero.
        ret['ttm_ratio'] = log_growth * span_years
    else:
        # Averaging means dividing by the number of years spanned; the
        # previous multiplication inflated the rate for year >= 3.
        ret['ttm_ratio'] = log_growth / span_years

    # Attach industry and market value.
    ticker_property = get_ticker_property(ret['ticker'].tolist(),
                                          test_report_date,
                                          ['industry', 'market_value'])
    industry = ticker_property[['ticker', 'industry']]
    market_value = ticker_property[['ticker', 'market_value']]
    ret = pd.merge(ret, industry, on='ticker')
    ret = pd.merge(ret, market_value, on='ticker')

    # Drop every row that contains an NA.
    ret.dropna(inplace=True)
    return ret
示例#3
0
def get_adjusted_ttm(test_report_date, year):
    """
    Industry-weighted revenue estimate built from the last eight quarters.

    Each single-quarter revenue is multiplied by the ratio that
    ``cal_industry_ratio`` reports for the ticker's industry in that calendar
    quarter. The weighted values are summed per ticker into ``ttm`` and then
    scaled by ``quarter / 2`` (eight quarters cover two years).

    :param test_report_date: report date string in ``'%Y-%m-%d'`` format
    :param year: number of years passed on to ``cal_industry_ratio``
    :return: DataFrame with ``ticker``, one ``quarter_revenue_<date>`` column
        per quarter used, ``ttm``, ``industry`` and ``market_value``. NA rows
        are NOT dropped here.
    """
    ind_ratio_map = cal_industry_ratio(test_report_date, year)
    property_date = get_report_date(test_report_date, -1)

    prev_report_dates = [
        get_report_date(test_report_date, -i) for i in range(1, 9)
    ]

    # ``None`` marks "nothing accumulated yet"; ``ret.empty`` would restart
    # the accumulation after an inner merge that leaves no rows.
    ret = None
    for report_date in prev_report_dates:
        report_date_dt = datetime.strptime(report_date, '%Y-%m-%d')
        quarter = (report_date_dt.month - 1) // 3 + 1
        industry_weights = ind_ratio_map[quarter - 1]['ratio'].to_dict()

        # Single-quarter revenue for this report date.
        quarterly_income = get_quarter_cumul_income(
            report_date, cumul=False)[['ticker', 'quarter_revenue']]
        # Industry is needed only to look up the weight for this quarter.
        ticker_property = get_ticker_property(
            quarterly_income['ticker'].tolist(), property_date,
            ['industry', 'market_value'])
        quarterly_income = pd.merge(quarterly_income,
                                    ticker_property[['ticker', 'industry']],
                                    on='ticker')
        quarterly_income['ratio_based_weight'] = quarterly_income[
            'industry'].map(industry_weights)
        # Drop the helper column before merging. Carrying industry and
        # market_value through every merge piled up duplicate ``_x``/``_y``
        # suffix columns, which newer pandas versions reject.
        quarterly_income = quarterly_income.drop('industry', axis=1)

        if ret is None:
            ret = quarterly_income
            ret['ttm'] = ret['quarter_revenue'] * ret['ratio_based_weight']
        else:
            ret = pd.merge(ret, quarterly_income, on='ticker')
            ret['ttm'] += ret['quarter_revenue'] * ret['ratio_based_weight']
        ret.drop('ratio_based_weight', axis=1, inplace=True)
        ret.rename(
            columns={'quarter_revenue': 'quarter_revenue_' + report_date},
            inplace=True)

    report_date_dt = datetime.strptime(test_report_date, '%Y-%m-%d')
    quarter = (report_date_dt.month - 1) // 3 + 1
    years_covered = len(prev_report_dates) / 4.0
    ret['ttm'] = ret['ttm'] * quarter / years_covered

    # Attach industry and market value once, for the final ticker set.
    ticker_property = get_ticker_property(
        ret['ticker'].tolist(), property_date,
        ['industry', 'market_value'])
    industry = ticker_property[['ticker', 'industry']]
    market_value = ticker_property[['ticker', 'market_value']]
    ret = pd.merge(ret, industry, on='ticker')
    ret = pd.merge(ret, market_value, on='ticker')

    # NOTE(review): NA rows (e.g. industries without a weight) are kept, as
    # before; callers that need clean data must drop them.
    return ret
示例#4
0
def cal_industry_ratio(test_report_date, year):
    """
    Average share of yearly revenue earned in each calendar quarter, per industry.

    For each of the ``year`` four-quarter windows preceding
    ``test_report_date``, every ticker's single-quarter revenue is divided by
    the sum of its four quarters in that window. The ratios are grouped by
    calendar quarter and averaged per industry.

    :param test_report_date: report date string in ``'%Y-%m-%d'`` format
    :param year: number of four-quarter windows to look back
    :return: list of four DataFrames (index 0 = Q1 ... index 3 = Q4), each
        indexed by ``industry`` with a single ``ratio`` column
    """

    def get_quarter(date):
        report_date_dt = datetime.strptime(date, '%Y-%m-%d')
        # Calendar quarter 1..4 for any month.
        return (report_date_dt.month - 1) // 3 + 1

    # One bucket of per-window ratio frames for each calendar quarter.
    data_classified_by_quarter = [[] for _ in range(4)]
    for y in range(year):
        # Fetch each quarter of this window once and reuse it for both the
        # total and the ratio.
        window = []
        for i in range(1, 5):
            prev_report_date = get_report_date(test_report_date, -(y * 4 + i))
            revenue_data = get_quarter_cumul_income(prev_report_date, cumul=False)[['ticker', 'quarter_revenue']]
            window.append((prev_report_date, revenue_data))

        # Per-ticker total over the four quarters (inner merge on ticker).
        sum_data = None
        for _, revenue_data in window:
            if sum_data is None:
                sum_data = revenue_data.rename(columns={'quarter_revenue': 'sum'})
            else:
                sum_data = pd.merge(sum_data, revenue_data, on='ticker')
                sum_data['sum'] = sum_data['sum'] + sum_data['quarter_revenue']
                sum_data = sum_data.drop('quarter_revenue', axis=1)

        # Share of each quarter in the window total.
        for prev_report_date, revenue_data in window:
            quarter = get_quarter(prev_report_date)
            revenue_data = pd.merge(revenue_data, sum_data, on='ticker')
            revenue_data['ratio'] = revenue_data['quarter_revenue'] / revenue_data['sum']

            # Attach the industry of each ticker.
            industry = get_ticker_property(revenue_data['ticker'].tolist(),
                                           prev_report_date, ['industry'])
            revenue_data = pd.merge(revenue_data, industry, on='ticker').drop(['quarter_revenue', 'sum'], axis=1)
            # File under the calendar quarter of the data. The earlier code
            # concatenated with bucket ``i - 1``, mixing quarters and
            # discarding previously collected years.
            data_classified_by_quarter[quarter - 1].append(revenue_data)

    industry_ratio_map = []
    for frames in data_classified_by_quarter:
        combined = pd.concat(frames, axis=0) if frames else pd.DataFrame()
        # Average only the numeric ``ratio`` column; ``ticker`` holds labels.
        industry_ratio_map.append(combined.groupby('industry')[['ratio']].mean())

    return industry_ratio_map
示例#5
0
def get_ewma_revenue(test_report_date, ttm_term=4, alpha=0.55, industry=None):
    """
    Exponentially weighted moving average of past single-quarter revenue.

    The ``ttm_term + 1`` report dates preceding ``test_report_date`` are
    used. The oldest quarter seeds the average; every newer quarter is then
    folded in as ``alpha * revenue + (1 - alpha) * previous_average``. The
    result is multiplied by the calendar quarter of ``test_report_date``.

    :param test_report_date: report date string in ``'%Y-%m-%d'`` format
    :param ttm_term: number of past periods folded into the EWMA; named this
        way only to keep model parameters uniform, it is not a real TTM
    :param alpha: the higher the value, the less weight past values receive
    :param industry: optional iterable of industries; when given, only rows
        whose industry is in it are returned
    :return: DataFrame with ``ticker``, ``ewma``, ``industry`` and
        ``market_value``; rows containing NA are dropped
    :raises ValueError: if ``ttm_term`` is smaller than 1
    """
    if ttm_term < 1:
        raise ValueError('ttm_term must be at least 1')

    prev_report_dates = [
        get_report_date(test_report_date, -i) for i in range(1, ttm_term + 2)
    ]

    # ``None`` marks "nothing collected yet"; ``ret.empty`` would restart the
    # collection after an inner merge that leaves no rows, losing columns.
    ret = None
    for report_date in prev_report_dates:
        quarterly_income = get_quarter_cumul_income(
            report_date, cumul=False)[['ticker', 'quarter_revenue']]
        if ret is None:
            ret = quarterly_income.copy()
        else:
            ret = pd.merge(ret, quarterly_income, on='ticker')
        ret.rename(columns={'quarter_revenue': report_date}, inplace=True)

    # Fold from the oldest quarter to the newest. Iterating the known column
    # labels avoids depending on get_report_date reproducing them exactly.
    ewma = ret[prev_report_dates[-1]]
    for report_date in reversed(prev_report_dates[:-1]):
        ewma = alpha * ret[report_date] + (1 - alpha) * ewma

    ewma_data = pd.concat([ret['ticker'], ewma.rename('ewma')], axis=1)

    report_date_dt = datetime.strptime(test_report_date, '%Y-%m-%d')
    quarter = (report_date_dt.month - 1) // 3 + 1
    ewma_data['ewma'] = ewma_data['ewma'] * quarter

    # Attach industry and market value as of the previous report date.
    ticker_property = get_ticker_property(
        ret['ticker'].tolist(), get_report_date(test_report_date, -1),
        ['industry', 'market_value'])
    industry_info = ticker_property[['ticker', 'industry']]
    market_value = ticker_property[['ticker', 'market_value']]
    ewma_data = pd.merge(ewma_data, industry_info, on='ticker')
    ewma_data = pd.merge(ewma_data, market_value, on='ticker')

    # Drop every row that contains an NA.
    ewma_data.dropna(inplace=True)

    if industry is not None:
        ewma_data = ewma_data[ewma_data['industry'].isin(list(industry))]
    return ewma_data
示例#6
0
 def cal_ewma(test_date):
     """Recursively compute the EWMA ending at ``test_date``.

     Relies on ``alpha``, ``ret`` and ``prev_report_dates`` from the
     enclosing scope. The recursion stops at the second-to-last entry of
     ``prev_report_dates``, where the last entry serves as the seed value.
     """
     weighted_current = alpha * ret[test_date]
     if test_date != prev_report_dates[-2]:
         # Not at the base case yet: blend with the EWMA one quarter earlier.
         older = cal_ewma(get_report_date(test_date, -1))
     else:
         older = ret[prev_report_dates[-1]]
     return weighted_current + (1 - alpha) * older
示例#7
0
def revenue_predict(pred_report_date, tickers, alpha, year):
    """
    Predict the current-period revenue for ``tickers``.

    Predictions come from ``get_adjusted_ttm``. A requested ticker that is
    absent from that result is substituted by the row of the ticker in the
    same industry whose market value is closest; tickers whose industry has
    no candidate are left out.

    :param pred_report_date: report date to predict
    :param tickers: tickers to predict
    :param alpha: unused by this implementation; kept for signature
        compatibility
    :param year: passed on to ``get_adjusted_ttm``
    :return: DataFrame returned by ``get_adjusted_ttm`` restricted to
        ``tickers``, plus one substituted row per resolvable missing ticker
    """
    ret = get_adjusted_ttm(pred_report_date, year)
    ret = ret[ret['ticker'].isin(tickers)]
    if len(tickers) > len(ret['ticker']):
        # Build the lookup once instead of rebuilding a list per ticker.
        known_tickers = set(ret['ticker'].tolist())
        missing_tickers = [ticker for ticker in tickers if ticker not in known_tickers]
        missing_tickers_property = get_ticker_property(missing_tickers,
                                                       get_report_date(pred_report_date, -1), ['industry', 'market_value'])

        for ticker in missing_tickers_property['ticker'].tolist():
            ticker_rows = missing_tickers_property[missing_tickers_property['ticker'] == ticker]
            market_value = ticker_rows['market_value'].values[0]
            industry = ticker_rows['industry'].values[0]
            # Candidates include rows substituted earlier in this loop.
            temp = ret[ret['industry'] == industry]

            if temp.empty:
                continue

            # Row with the smallest absolute market-value distance.
            closest_df = temp.iloc[(temp['market_value'] - market_value).abs().argsort()[:1]].copy()
            # Relabel by column name instead of assuming ticker is column 0.
            closest_df['ticker'] = ticker
            ret = pd.concat([ret, closest_df])
    return ret
示例#8
0
def ttm_ratio_model(test_report_date, year):
    """
    Evaluate the growth-ratio baseline for one report date.

    The prediction is ``(1 + ttm_ratio) * prev_quarter_revenue_cumul``,
    where the cumulative revenue comes from the report four quarters
    earlier. It is compared with the actual cumulative revenue of
    ``test_report_date`` through ``weighted_error``; the error is printed
    and returned.

    :param test_report_date: report date string to evaluate
    :param year: forwarded to ``revenue_predict`` as its ``year`` argument
    :return: the value returned by ``weighted_error``
    """
    # Ground truth: actual cumulative revenue of the tested report.
    quarter_income_cumul = get_quarter_cumul_income(test_report_date)[[
        'ticker', 'revenue'
    ]]
    tickers = quarter_income_cumul['ticker'].tolist()
    # Prediction inputs. ``year`` is passed by keyword: positionally it
    # would land in the ``alpha`` slot of ``revenue_predict``.
    # NOTE(review): the merged frame must provide a ``ttm_ratio`` column;
    # confirm that the ``revenue_predict`` in use supplies it.
    pred_quarter_income = revenue_predict(test_report_date, tickers, year=year)

    # Cumulative revenue of the same quarter one year earlier.
    prev_report_date = get_report_date(test_report_date, num_quarter=-4)
    prev_quarter_cumul_income = get_quarter_cumul_income(
        prev_report_date, cumul=True)[['ticker', 'revenue']]
    prev_quarter_cumul_income = prev_quarter_cumul_income.rename(
        columns={'revenue': 'prev_quarter_revenue_cumul'})

    # Single-quarter revenue of the same quarter one year earlier.
    prev_quarter_income = get_quarter_cumul_income(
        prev_report_date, cumul=False)[['ticker', 'quarter_revenue']]
    prev_quarter_income = prev_quarter_income.rename(
        columns={'quarter_revenue': 'prev_quarter_revenue'})

    # Gather everything into one DataFrame.
    temp = pd.merge(prev_quarter_cumul_income,
                    prev_quarter_income,
                    on='ticker')
    temp2 = pd.merge(temp, pred_quarter_income, on='ticker')
    ret = pd.merge(temp2, quarter_income_cumul, on='ticker')

    # A few tickers still carry inf/NaN values; drop them for now.
    ret.replace([np.inf, -np.inf], np.nan, inplace=True)
    ret.dropna(inplace=True)

    ret['ttm'] = (1 + ret['ttm_ratio']) * ret['prev_quarter_revenue_cumul']
    w_err = weighted_error(pred_y=ret['ttm'].values,
                           test_y=ret['revenue'].values,
                           market_values=ret['market_value'].values)
    print('Baseline model weighted error: {0}'.format(w_err))

    return w_err
示例#9
0
def get_adjusted_again_ttm(test_report_date, year, ttm_term=4):
    """
    Ticker-weighted revenue estimate built from the last ``ttm_term`` quarters.

    Each single-quarter revenue is multiplied by the ticker's own ratio for
    that calendar quarter as reported by ``cal_ticker_ratio``. Tickers with
    no ratio get a flat weight of 0.25 (an even split over four quarters).
    The weighted values are summed per ticker into ``ttm`` and scaled by
    ``quarter / (ttm_term / 4)``.

    :param test_report_date: report date string in ``'%Y-%m-%d'`` format
    :param year: number of years passed on to ``cal_ticker_ratio``
    :param ttm_term: number of past quarters to accumulate (at least 1)
    :return: DataFrame with ``ticker``, one ``quarter_revenue_<date>`` column
        per quarter used, ``ttm``, ``industry`` and ``market_value``. NA rows
        are NOT dropped here.
    :raises ValueError: if ``ttm_term`` is smaller than 1
    """
    if ttm_term < 1:
        raise ValueError('ttm_term must be at least 1')

    # Computed once; the earlier code issued the identical call twice and
    # never used one of the results.
    every_ticker_map = cal_ticker_ratio(test_report_date, year)
    prev_report_dates = [
        get_report_date(test_report_date, -i) for i in range(1, ttm_term + 1)
    ]

    # Accumulates per-quarter data; ``None`` marks "nothing yet".
    # ``ret.empty`` would restart after an inner merge that leaves no rows.
    ret = None
    for report_date in prev_report_dates:
        quarter = get_quarter(report_date)
        ticker_map = every_ticker_map[quarter - 1]

        # Single-quarter revenue for this report date.
        quarterly_income = get_quarter_cumul_income(
            report_date, cumul=False)[['ticker', 'quarter_revenue']]

        # Split rows by whether the ticker has a ratio for this quarter.
        has_ratio = quarterly_income['ticker'].isin(ticker_map.index)
        in_data = quarterly_income[has_ratio].copy()
        not_in_data = quarterly_income[~has_ratio].copy()

        in_data['ratio_based_weight'] = in_data['ticker'].map(
            ticker_map['ratio'].to_dict())

        if not_in_data.empty:
            processed_data = in_data
        else:
            # No ratio available for these tickers: use the flat weight.
            not_in_data['ratio_based_weight'] = 0.25
            # Stack both groups vertically.
            processed_data = pd.concat([in_data, not_in_data])

        if ret is None:
            ret = processed_data
            ret['ttm'] = ret['quarter_revenue'] * ret['ratio_based_weight']
        else:
            ret = pd.merge(ret, processed_data, on='ticker')
            ret['ttm'] += ret['quarter_revenue'] * ret['ratio_based_weight']
        ret.drop('ratio_based_weight', axis=1, inplace=True)
        ret.rename(
            columns={'quarter_revenue': 'quarter_revenue_' + report_date},
            inplace=True)

    report_date_dt = datetime.strptime(test_report_date, '%Y-%m-%d')
    quarter = (report_date_dt.month - 1) // 3 + 1
    ret['ttm'] = ret['ttm'] * quarter / (ttm_term / 4.0)

    # Attach industry and market value as of the previous report date.
    ticker_property = get_ticker_property(
        ret['ticker'].tolist(), get_report_date(test_report_date, -1),
        ['industry', 'market_value'])
    industry = ticker_property[['ticker', 'industry']]
    market_value = ticker_property[['ticker', 'market_value']]
    ret = pd.merge(ret, industry, on='ticker')
    ret = pd.merge(ret, market_value, on='ticker')

    # NOTE(review): NA rows are kept, as before; callers that need clean
    # data must drop them.
    return ret
示例#10
0
def cal_ticker_ratio(test_report_date, year):
    """
    Average share of yearly revenue earned in each calendar quarter, per ticker.

    For each of the ``year`` four-quarter windows preceding
    ``test_report_date``, every ticker's single-quarter revenue is divided by
    the sum of its four quarters in that window. The rows are grouped by
    calendar quarter and averaged per ticker.

    :param test_report_date: report date string
    :param year: number of four-quarter windows to look back
    :return: list of four DataFrames (index 0 = Q1 ... index 3 = Q4), each
        indexed by ``ticker`` with the mean ``quarter_revenue``, ``sum`` and
        ``ratio``
    """
    # One bucket of per-window frames for each calendar quarter.
    data_classified_by_quarter = [[] for _ in range(4)]

    for y in range(year):
        # Fetch each quarter of this window once and reuse it for both the
        # total and the ratio.
        window = []
        for i in range(1, 5):
            prev_report_date = get_report_date(test_report_date,
                                               -(y * 4 + i))
            revenue_data = get_quarter_cumul_income(
                prev_report_date, cumul=False)[['ticker', 'quarter_revenue']]
            window.append((prev_report_date, revenue_data))

        # Per-ticker total over the four quarters (inner merge on ticker).
        sum_data = None
        for _, revenue_data in window:
            if sum_data is None:
                sum_data = revenue_data.rename(
                    columns={'quarter_revenue': 'sum'})
            else:
                sum_data = pd.merge(sum_data, revenue_data, on='ticker')
                sum_data['sum'] = sum_data['sum'] + sum_data[
                    'quarter_revenue']
                sum_data = sum_data.drop('quarter_revenue', axis=1)

        # Share of each quarter in the window total.
        for prev_report_date, revenue_data in window:
            quarter = get_quarter(prev_report_date)
            revenue_data = pd.merge(revenue_data, sum_data, on='ticker')
            revenue_data['ratio'] = revenue_data[
                'quarter_revenue'] / revenue_data['sum']
            # File under the calendar quarter of the data. The earlier code
            # concatenated with bucket ``i - 1``, mixing quarters and
            # discarding previously collected years.
            data_classified_by_quarter[quarter - 1].append(revenue_data)

    ticker_ratio_map = []
    for frames in data_classified_by_quarter:
        combined = pd.concat(frames, axis=0) if frames else pd.DataFrame()
        ticker_ratio_map.append(combined.groupby('ticker').mean())

    return ticker_ratio_map
示例#11
0
def revenue_predict(pred_report_date,
                    tickers,
                    alpha=0.5,
                    ttm_term=4,
                    year=4,
                    industry=None):
    """
    Predict the current-period revenue for ``tickers``.

    Predictions come from ``get_ewma_revenue``. A requested ticker that is
    absent from that result borrows the row of the ticker in the same
    industry whose market value is closest. When the missing ticker has
    revenue data for the previous report, the borrowed ``ewma`` is replaced:
    by last quarter's single-quarter revenue when predicting a first
    quarter, otherwise by last quarter's single-quarter revenue plus last
    quarter's cumulative revenue. Tickers whose industry has no candidate
    are left out.

    :param pred_report_date: report date to predict
    :param tickers: tickers to predict
    :param alpha: EWMA coefficient passed to ``get_ewma_revenue``
    :param ttm_term: number of past periods passed to ``get_ewma_revenue``
    :param year: unused by this implementation; kept for signature
        compatibility
    :param industry: optional collection of industries to restrict to
    :return: DataFrame from ``get_ewma_revenue`` restricted to ``tickers``,
        plus one row per resolvable missing ticker
    """
    ret = get_ewma_revenue(pred_report_date, ttm_term, alpha, industry)

    ret = ret[ret['ticker'].isin(tickers)]
    if len(tickers) > len(ret['ticker']):
        # Build the lookup once instead of rebuilding a list per ticker.
        known_tickers = set(ret['ticker'].tolist())
        missing_tickers = [
            ticker for ticker in tickers if ticker not in known_tickers
        ]
        last_report_date = get_report_date(pred_report_date, -1)
        missing_tickers_property = get_ticker_property(
            missing_tickers, last_report_date, ['industry', 'market_value'])
        if industry is not None:
            missing_tickers_property = missing_tickers_property[
                missing_tickers_property['industry'].isin(industry)]

        quarter = get_quarter(pred_report_date)
        # Loop-invariant revenue tables, fetched lazily on first use.
        last_quarter_income = None
        last_quarter_cumul_income = None

        for ticker in missing_tickers_property['ticker'].tolist():
            ticker_rows = missing_tickers_property[
                missing_tickers_property['ticker'] == ticker]
            market_value = ticker_rows['market_value'].values[0]
            industry_info = ticker_rows['industry'].values[0]
            temp = ret[ret['industry'] == industry_info]
            # Check for candidates BEFORE touching the closest row; the
            # earlier code indexed into an empty frame first.
            if temp.empty:
                continue

            closest_df = temp.iloc[(temp['market_value'] -
                                    market_value).abs().argsort()[:1]].copy()
            closest_df['ticker'] = ticker

            # NOTE(review): the earlier ``df['ticker' == ticker]`` indexed
            # with a plain bool and always raised. Selecting the ticker's
            # ``quarter_revenue`` / ``revenue`` is the presumed intent.
            if last_quarter_income is None:
                last_quarter_income = get_quarter_cumul_income(
                    last_report_date,
                    cumul=False)[['ticker', 'quarter_revenue']]
            own_quarter = last_quarter_income.loc[
                last_quarter_income['ticker'] == ticker,
                'quarter_revenue'].dropna()

            if not own_quarter.empty:
                if quarter == 1:
                    closest_df['ewma'] = own_quarter.iloc[0]
                else:
                    if last_quarter_cumul_income is None:
                        last_quarter_cumul_income = get_quarter_cumul_income(
                            last_report_date)[['ticker', 'revenue']]
                    own_cumul = last_quarter_cumul_income.loc[
                        last_quarter_cumul_income['ticker'] == ticker,
                        'revenue'].dropna()
                    if not own_cumul.empty:
                        closest_df['ewma'] = (own_quarter.iloc[0] +
                                              own_cumul.iloc[0])
            # Without own data the neighbour's ewma stays as the substitute.
            ret = pd.concat([ret, closest_df])

    return ret
示例#12
0
    ret.dropna(inplace=True)

    # 追踪数据
    ret.to_csv('../tracing_data_{0}.csv'.format(test_report_date))

    w_err = weighted_error(pred_y=ret['ewma'].values,
                           test_y=ret['revenue'].values,
                           market_values=ret['market_value'].values)
    print('Baseline model weighted error: {0}'.format(w_err))

    return w_err


if __name__ == '__main__':
    report_dates = [get_report_date('2018-03-31', -i) for i in range(1, 9)]
    # report_dates = ['2017-06-30']
    t_term = 4  # 这也是一个超参数
    print('ttm_term=', t_term)
    models = [
        'ewma_0.4', 'ewma_0.45', 'ewma_0.5', 'ewma_0.55', 'ewma_0.6',
        'ewma_0.65', 'ewma_0.7'
    ]
    errors = []

    result = pd.DataFrame(index=models)
    for report_date in report_dates:
        print('report_date:', report_date)
        for a in range(40, 75, 5):
            alpha = a / 100
            errors.append(