Exemplo n.º 1
0
def summary_release_2_docx(title, img_meta_dic_list, stg_run_id=None, enable_clean_cache=True):
    """
    Build a .docx report of prediction-accuracy trends.

    :param title: report title; also used as the output file name stem
    :param img_meta_dic_list: one dict per report section; keys read:
        trade_date_last_train, trade_date_end, split_point_list,
        module_file_path, predict_test_random_state, in_range_count,
        img_file_path
    :param stg_run_id: forwarded to get_report_folder_path to choose the
        output directory
    :param enable_clean_cache: when True, clean_cache() is invoked after the
        document has been saved
    :return: full path of the saved .docx file
    """
    logger.debug('生成报告开始')
    doc = docx.Document()
    # Default font for the whole document; the extra rFonts tweak is needed
    # for the east-Asian font to actually take effect in python-docx.
    doc.styles['Normal'].font.name = '微软雅黑'
    doc.styles['Normal']._element.rPr.rFonts.set(docx.oxml.ns.qn('w:eastAsia'), '微软雅黑')
    # Custom paragraph style (2nd add_style arg: 1=paragraph, 2=character,
    # 3=table): 40pt, colored, centered, east-Asian font.
    custom_style = doc.styles.add_style('UserStyle1', 1)
    custom_style.font.size = docx.shared.Pt(40)
    custom_style.font.color.rgb = docx.shared.RGBColor(0xff, 0xde, 0x00)
    custom_style.paragraph_format.alignment = docx.enum.text.WD_ALIGN_PARAGRAPH.CENTER
    custom_style.font.name = '微软雅黑'
    custom_style._element.rPr.rFonts.set(docx.oxml.ns.qn('w:eastAsia'), '微软雅黑')

    # Body: centered title followed by two blank spacer paragraphs.
    doc.add_heading(title, 0).alignment = docx.enum.text.WD_ALIGN_PARAGRAPH.CENTER
    doc.add_paragraph('')
    doc.add_paragraph('')
    heading_size = 1
    for num, info_dic in enumerate(img_meta_dic_list, start=1):
        trade_date_last_train = info_dic['trade_date_last_train']
        trade_date_end = info_dic['trade_date_end']
        doc.add_heading(
            f"{num}、{date_2_str(trade_date_last_train)} - {date_2_str(trade_date_end)}", heading_size)
        split_point_list = info_dic['split_point_list']
        if split_point_list is None:
            # No split points: the train-end..data-end span is one segment.
            paragraph = doc.add_paragraph(f"{num}.1) 日期区间段1个:\n")
            paragraph.add_run(f'\t1) {date_2_str(trade_date_last_train)} ~ {date_2_str(trade_date_end)}\n')
        else:
            # N split points produce N-1 consecutive segments.
            paragraph = doc.add_paragraph(f"{num}.1) 日期区间段{len(split_point_list) - 1}个:\n")
            for num2, (point1, point2) in enumerate(
                    iter_2_range(split_point_list, has_left_outer=False, has_right_outer=False), start=1):
                paragraph.add_run(f'\t{num2}) {date_2_str(point1)} ~ {date_2_str(point2)}\n')

        doc.add_paragraph(f"{num}.2) 模型路径:\n\t{info_dic['module_file_path']}")
        doc.add_paragraph(f"{num}.3) 取样状态(random_state):\n\t{info_dic['predict_test_random_state']}")
        doc.add_paragraph(f"{num}.4) 展示数据长度:\n\t{info_dic['in_range_count']}")
        doc.add_paragraph(f"{num}.5) 预测准确率趋势图:")
        doc.add_picture(info_dic['img_file_path'])

    file_name = f"{title}.docx"
    file_path = os.path.join(get_report_folder_path(stg_run_id), file_name)
    doc.save(file_path)
    if enable_clean_cache:
        clean_cache()
    logger.debug('生成报告结束。%s', file_path)
    return file_path
Exemplo n.º 2
0
    def get_df_iter(self,
                    date_start,
                    date_end,
                    step,
                    df_len_limit=3000,
                    deep=0):
        """
        Yield query results for (date_start, date_end] windows of `step` days.

        When a window returns `df_len_limit` rows or more, the provider's
        extraction cap may have truncated the result, so the window is split
        in half (step // 2) and queried recursively until `step` can no
        longer be divided; an un-splittable, still-too-large window is yielded
        anyway with a warning so the remainder can be fetched manually.

        :param date_start: range start (exclusive on pub_date)
        :param date_end: range end (inclusive on pub_date)
        :param step: number of days per query window
        :param df_len_limit: row count that triggers a split
        :param deep: recursion depth, used only to indent log messages
        :return: yields (df, date_from, date_to) tuples
        """
        for num, (date_from, date_to) in enumerate(iter_2_range(
                range_date(date_start, date_end, step),
                has_left_outer=False,
                has_right_outer=False),
                                                   start=1):
            q = query(self.statement).filter(
                self.statement.pub_date > date_2_str(date_from),
                self.statement.pub_date <= date_2_str(date_to))

            df = finance.run_query(q)
            df_len = df.shape[0]
            if df_len >= df_len_limit:
                if step >= 2:
                    self.logger.warning(
                        '%s%s%d) [%s ~ %s] 包含 %d 条数据,可能已经超越 %d 条提取上限,开始进一步分割日期',
                        self.table_name, '  ' * deep, num, date_from, date_to,
                        df_len, df_len_limit)
                    # BUG FIX: propagate df_len_limit to the recursive call;
                    # previously the default (3000) silently replaced any
                    # caller-supplied limit on every recursion level.
                    yield from self.get_df_iter(date_from,
                                                date_to,
                                                step // 2,
                                                df_len_limit=df_len_limit,
                                                deep=deep + 1)
                else:
                    self.logger.warning(
                        '%s%s%d) [%s ~ %s] 包含 %d 条数据,可能已经超越 %d 条提取上限且无法再次分割日期范围,手动需要补充提取剩余数据',
                        self.table_name, '  ' * deep, num, date_from, date_to,
                        df_len, df_len_limit)
                    yield df, date_from, date_to
            else:
                self.logger.debug('%s%s%d) [%s ~ %s] 包含 %d 条数据',
                                  self.table_name, '  ' * deep, num, date_from,
                                  date_to, df_len)
                yield df, date_from, date_to
Exemplo n.º 3
0
def merge_ifind_stock_daily(ths_code_set: set = None, date_from=None):
    """Merge ds/his quote data and financial data into `ifind_stock_daily`.

    For each stock, daily rows whose `time` falls between two consecutive
    report disclosure dates receive the financial columns of the earlier
    report; results are bulk-upserted into the target table in batches.

    :param ths_code_set: when given, only these ths_code stocks are processed
    :param date_from: earliest `time` to merge; defaults to the day after the
        newest record already in the target table (full load when the table
        does not exist yet)
    """
    table_name = 'ifind_stock_daily'
    logging.info("合成 %s 开始", table_name)
    has_table = engine_md.has_table(table_name)
    if date_from is None and has_table:
        # Incremental resume: start the day after the newest existing row.
        sql_str = "select adddate(max(`time`),1) from {table_name}".format(
            table_name=table_name)
        with with_db_session(engine_md) as session:
            date_from = date_2_str(session.execute(sql_str).scalar())
    # Load the individual source tables.
    ifind_his_df = get_ifind_daily_df('ifind_stock_daily_his', date_from)
    ifind_ds_df = get_ifind_daily_df('ifind_stock_daily_ds', date_from)
    ifind_report_date_df = get_ifind_report_date_df('ifind_stock_report_date',
                                                    None)
    ifind_fin_df = get_ifind_daily_df('ifind_stock_fin', None)
    ifind_fin_df_g = ifind_fin_df.groupby('ths_code')
    ths_code_set_4_daily = set(ifind_fin_df_g.size().index)
    # Merge ds and his data; the outer join keeps rows present on only one
    # side, leaving NaN in the missing columns.
    ifind_his_ds_df = pd.merge(ifind_his_df,
                               ifind_ds_df,
                               how='outer',
                               on=['ths_code', 'time'])
    ifind_his_ds_df_g = ifind_his_ds_df.groupby('ths_code')
    logger.debug("提取数据完成")
    # For every (ths_code, disclosure date) pair, select the financial record
    # to apply from that disclosure date onwards.
    # (Simplified: the original iterated a single-element list of groupbys.)
    report_date_g = ifind_report_date_df.groupby(
        ['ths_code', 'ths_regular_report_actual_dd_stock'])
    report_date_dic_dic = {}
    for num, ((ths_code, report_date), data_df) in enumerate(report_date_g,
                                                             start=1):
        if ths_code_set is not None and ths_code not in ths_code_set:
            continue
        if is_nan_or_none(report_date):
            continue
        report_date_dic = report_date_dic_dic.setdefault(ths_code, {})
        if ths_code not in ths_code_set_4_daily:
            logger.error('fin 表中不存在 %s 的財務數據', ths_code)
            continue
        ifind_fin_df_temp = ifind_fin_df_g.get_group(ths_code)
        # BUG FIX: the original tested `report_date not in report_date_dic_dic`,
        # whose keys are ths_codes, so the condition was effectively always
        # true; the intended check is against the per-stock dict.
        if report_date not in report_date_dic:
            ifind_fin_df_temp = ifind_fin_df_temp[
                ifind_fin_df_temp['time'] <= report_date]
            if ifind_fin_df_temp.shape[0] > 0:
                # NOTE(review): iloc[0] after an ascending sort picks the
                # EARLIEST financial record on or before the disclosure date;
                # presumably the latest (iloc[-1]) was intended — confirm
                # before changing.
                report_date_dic[
                    report_date] = ifind_fin_df_temp.sort_values(
                        'time').iloc[0]

    # Column dtypes for the bulk insert.
    dtype = {'report_date': Date}
    for dic in [
            DTYPE_STOCK_DAILY_DS, DTYPE_STOCK_REPORT_DATE,
            DTYPE_STOCK_DAILY_FIN, DTYPE_STOCK_DAILY_HIS
    ]:
        for key, val in dic.items():
            dtype[key] = val

    logger.debug("计算财报日期完成")
    # Assemble daily rows, attaching financial columns per date segment.
    tot_data_count, data_count, data_df_list, for_count = 0, 0, [], len(
        report_date_dic_dic)
    try:
        for num, (ths_code,
                  report_date_dic) in enumerate(report_date_dic_dic.items(),
                                                start=1):
            # TODO: membership test via size() is a stop-gap; improve later.
            if ths_code not in ifind_his_ds_df_g.size():
                logger.error('fin 表中不存在 %s 的財務數據', ths_code)
                continue
            ifind_his_ds_df_cur_ths_code = ifind_his_ds_df_g.get_group(
                ths_code)
            logger.debug('%d/%d) 处理 %s %d 条数据', num, for_count, ths_code,
                         ifind_his_ds_df_cur_ths_code.shape[0])
            report_date_list = list(report_date_dic.keys())
            report_date_list.sort()
            for report_date_from, report_date_to in iter_2_range(
                    report_date_list):
                logger.debug('%d/%d) 处理 %s [%s - %s]', num, for_count,
                             ths_code, date_2_str(report_date_from),
                             date_2_str(report_date_to))
                # Validity window [report_date_from, report_date_to);
                # None marks an open end on either side.
                if report_date_from is None:
                    is_fit = ifind_his_ds_df_cur_ths_code[
                        'time'] < report_date_to
                elif report_date_to is None:
                    is_fit = ifind_his_ds_df_cur_ths_code[
                        'time'] >= report_date_from
                else:
                    is_fit = (ifind_his_ds_df_cur_ths_code['time'] <
                              report_date_to) & (
                                  ifind_his_ds_df_cur_ths_code['time'] >=
                                  report_date_from)
                # Daily rows falling inside the window.
                ifind_his_ds_df_segment = ifind_his_ds_df_cur_ths_code[
                    is_fit].copy()
                segment_count = ifind_his_ds_df_segment.shape[0]
                if segment_count == 0:
                    continue
                fin_s = report_date_dic[
                    report_date_from] if report_date_from is not None else None
                for key in DTYPE_STOCK_DAILY_FIN.keys():
                    if key in ('ths_code', 'time'):
                        continue
                    ifind_his_ds_df_segment[key] = fin_s[
                        key] if fin_s is not None and key in fin_s else None
                ifind_his_ds_df_segment['report_date'] = report_date_from
                # Queue the segment for the next batch insert.
                data_df_list.append(ifind_his_ds_df_segment)
                data_count += segment_count

            if DEBUG and len(data_df_list) > 1:
                break

            # Flush to the database in batches to bound memory usage.
            if data_count > 10000:
                data_df = pd.concat(data_df_list)
                data_count = bunch_insert_on_duplicate_update(
                    data_df, table_name, engine_md, dtype)
                tot_data_count += data_count
                data_count, data_df_list = 0, []

    finally:
        # Flush whatever remains, even when an exception interrupted the loop.
        if len(data_df_list) > 0:
            data_df = pd.concat(data_df_list)
            data_count = bunch_insert_on_duplicate_update(
                data_df, table_name, engine_md, dtype)
            tot_data_count += data_count

        logger.info('%s 新增或更新记录 %d 条', table_name, tot_data_count)
        if not has_table and engine_md.has_table(table_name):
            alter_table_2_myisam(engine_md, [table_name])
            build_primary_key([table_name])
Exemplo n.º 4
0
    return pd.DataFrame(data_list)


def get_ifind_daily_df(table_name, date_from) -> pd.DataFrame:
    """Read rows from *table_name* via engine_md.

    When *date_from* is None the whole table is returned, otherwise only rows
    with time >= date_from.
    """
    if date_from is None:
        return pd.read_sql(
            "select * from {table_name}".format(table_name=table_name),
            engine_md)  # , index_col='ths_code'
    return pd.read_sql(
        "select * from {table_name} where time >= %s".format(
            table_name=table_name),
        engine_md,
        params=[date_from])  # , index_col='ths_code'


def get_wind_daily_df(table_name, date_from) -> pd.DataFrame:
    """Read rows from *table_name* via engine_md.

    When *date_from* is None the whole table is returned, otherwise only rows
    with time >= date_from.
    """
    if date_from is None:
        sql = "select * from {table_name}".format(table_name=table_name)
        query_params = None
    else:
        sql = "select * from {table_name} where time >= %s".format(
            table_name=table_name)
        query_params = [date_from]
    # passing params=None is equivalent to omitting it
    return pd.read_sql(sql, engine_md, params=query_params)  # , index_col='ths_code'


if __name__ == "__main__":
    # Smoke test: print the consecutive pairs produced by iter_2_range over a
    # short list. With no has_left_outer/has_right_outer args, presumably the
    # open-ended boundary pairs are included too — confirm against the
    # iter_2_range definition.
    for x in iter_2_range([1, 2, 3]):
        print(x)
def plot_industry_classified_mid(col_name='ev2_to_ebitda'):
    """Plot the per-industry median of *col_name* over time.

    For every CS sector, takes the constituents at each snapshot date,
    computes the cross-sectional median of the daily *col_name* values until
    the next snapshot, concatenates the pieces into one series per sector,
    exports the merged DataFrame to median.xls and shows the plot.

    :param col_name: column of wind_stock_daily to aggregate
    """
    # Original all-in-SQL attempt, kept for the TODO below:
    # sql_str = """select sector_code, sector_name,base.trade_date, sum(ev2_to_ebitda) tot_val
    #     from (
    #         SELECT * FROM fof_ams_dev.wind_sectorconstituent where sector_name like 'cs%%'
    #     ) base
    #     LEFT JOIN
    #     (
    #     select trade_date, wind_code, ev2_to_ebitda from wind_stock_daily where ev2_to_ebitda is not null
    #     ) val
    #     on base.trade_date = val.trade_date
    #     and base.wind_code = val.wind_code
    #     group by sector_code, base.trade_date
    #     having tot_val is not null"""
    # TODO: generate the distribution plots once the industry data download
    # is complete.
    sector_sql_str = """SELECT sector_name, trade_date, wind_code FROM fof_ams_dev.wind_sectorconstituent 
        where sector_name like 'cs%'"""
    with with_db_session(engine_md) as session:
        table = session.execute(sector_sql_str)
        sector_trade_date_wind_code_list_dic = defaultdict(dict)
        num = 0
        # Index the constituents: sector -> snapshot dates + per-date codes.
        for num, (sector_name, trade_date,
                  wind_code) in enumerate(table.fetchall(), start=1):
            if sector_name not in sector_trade_date_wind_code_list_dic:
                sector_trade_date_wind_code_list_dic[sector_name] = {
                    'trade_date_set': set(),
                    'trade_date_wind_code_list_dic': defaultdict(list)
                }
            sector_trade_date_wind_code_list_dic[sector_name][
                'trade_date_set'].add(trade_date)
            sector_trade_date_wind_code_list_dic[sector_name][
                'trade_date_wind_code_list_dic'][trade_date].append(wind_code)
    sector_count = len(sector_trade_date_wind_code_list_dic)
    logger.debug('获取行业数据 %d 条 %d 个行业', num, sector_count)

    stock_sql_str = f"""select wind_code, trade_date, `{col_name}` from wind_stock_daily 
        where `{col_name}` is not null"""
    data_df = pd.read_sql(stock_sql_str, engine_md)
    logger.debug('获取行情数据 %d 条', data_df.shape[0])
    # Wide table: trade_date rows x wind_code columns.
    pivot_df = data_df.pivot(index='trade_date',
                             columns='wind_code',
                             values=col_name).sort_index()
    logger.debug('转换数据 %s', pivot_df.shape)

    sector_trade_date_val_list_dic, sector_trade_date_val_dic = {}, {}
    logger.debug('计算 %d 个行业中位数', sector_count)
    for num, (sector_name, data_dic) in enumerate(
            sector_trade_date_wind_code_list_dic.items(), start=1):
        trade_date_list = list(data_dic['trade_date_set'])
        trade_date_list.sort()
        trade_date_list_len = len(trade_date_list)
        logger.debug('%d/%d) %s %d 个交易日', num, sector_count, sector_name,
                     trade_date_list_len)
        trade_date_wind_code_list_dic = data_dic[
            'trade_date_wind_code_list_dic']
        # Walk snapshot-date pairs; trade_date_to is None for the last,
        # open-ended window.
        for num2, (trade_date_from, trade_date_to) in enumerate(iter_2_range(
                trade_date_list, has_left_outer=False),
                                                                start=1):
            wind_code_list = trade_date_wind_code_list_dic[trade_date_from]
            # Cross-sectional median of the window's constituents.
            try:
                tmp_df = pivot_df.loc[trade_date_from:trade_date_to,
                                      wind_code_list]
                if tmp_df.shape[0] == 0:
                    continue
            except KeyError:
                # Some constituent codes may be missing from the quote data.
                continue
            val_s = tmp_df.median(axis=1)
            if trade_date_to is not None:
                # Drop the last day: .loc slicing is end-inclusive, and that
                # date belongs to the next window.
                val_s = val_s.iloc[:-1]
            # Collect the window series per sector.
            if sector_name not in sector_trade_date_val_list_dic:
                sector_trade_date_val_list_dic[sector_name] = [val_s]
            else:
                sector_trade_date_val_list_dic[sector_name].append(val_s)

        # Concatenate the window series into one Series per sector.
        if sector_name in sector_trade_date_val_list_dic and len(
                sector_trade_date_val_list_dic[sector_name]) > 0:
            logger.debug('%s %d 个交易日合并数据', sector_name, len(trade_date_list))
            sector_trade_date_val_dic[sector_name] = pd.concat(
                sector_trade_date_val_list_dic[sector_name])

    # Merge all sector series into one DataFrame.
    logger.debug('合并 %d 个行业数据', sector_count)
    data_df = pd.DataFrame(sector_trade_date_val_dic)
    # BUG FIX: DataFrame.to_excel() has no `legend` parameter — the original
    # `legend=False` raised TypeError (that kwarg belongs to DataFrame.plot()).
    data_df.to_excel('median.xls')
    data_df.plot()
    plt.show()