Exemplo n.º 1
0
def get_factor_return_daily(factor_return_name, trading_days=[]):
    """
    Read the daily returns of one factor_return from the local database.

    @factor_return_name (str): name of the factor_return; must be present in
        the "factor_return" schema
    @trading_days (['%Y-%m-%d']): dates to select; an empty list selects
        every row stored in the csv file
    :return: DataFrame indexed by date, or None when the name is unknown or
        when any requested date is missing from the file
    """

    # The name has to be registered in the local factor_return library.
    known_names = get_schema("factor_return")
    if factor_return_name not in known_names:
        Logger.error(
            "{} is not in FACTOR_RETURN library".format(factor_return_name))
        return

    csv_name = "{}.csv".format(factor_return_name)
    table = open_csv_as_df(os.path.join(DB_FACTOR_RETURN_PATH, csv_name),
                           validate=True)

    if trading_days:
        selected = table[table.date.isin(trading_days)]
        # Any requested date that produced no row invalidates the request.
        missing = set(trading_days) - set(selected["date"].tolist())
        if missing:
            Logger.warn("Following dates are invalid: {}".format(missing))
            return
    else:
        selected = table.copy()

    return selected.set_index(['date'])
Exemplo n.º 2
0
def get_index_contents(index_code, date="", approx=False, log=False):
    """
    Read the constituent list of an index on a single date.

    @index_code (str): index code; must belong to IDXCONT_AS_SQL or
        IDXCONT_AS_CSV (the original note lists 'A', 'H', '000905.SH',
        '000300.SH', '000016.SH', 'HSI.HI' as supported)
    @date ('%Y-%m-%d'): single date, must be non-empty
    @approx (Bool): when True, the date is first replaced by
        get_nearest_trading_day(direction='left', self_included=True); used
        when updating report-date indicators whose date is not a trading day
    @log (Bool): whether to print log messages
    :return (list): security codes
    :raises ValueError: on an empty date or an unrecognized index code
    """

    if log:
        message = "Reading index contents of {} on {}".format(index_code, date)
        Logger.info(message, "green")

    if not date:
        Logger.error("Empty date")
        raise ValueError

    lookup_date = date
    if approx:
        lookup_date = get_nearest_trading_day(date=date,
                                              direction='left',
                                              self_included=True)

    if index_code in IDXCONT_AS_SQL:
        return get_index_contents_from_sql(index_code, lookup_date, log=log)

    if index_code in IDXCONT_AS_CSV:
        # The csv-backed reader is not given a date.
        return get_index_contents_from_csv(index_code)

    Logger.error("Unrecognized index code: {}".format(index_code))
    raise ValueError
Exemplo n.º 3
0
def update_calendar(start_date, end_date, log=False):
    """
    Rebuild the calendar table from Wind for the given range.

    Every existing row of the calendar table is deleted and replaced by the
    rows downloaded for [start_date, end_date].

    @start_date ("%Y-%m-%d"): first date; per the original note it must be
        the first day of a month
    @end_date ("%Y-%m-%d"): last date; per the original note it must be the
        last day of a month
    @log (Bool): whether to print log messages
    :raises ValueError: when downloading from Wind or writing to sqlite fails
    """

    Logger.info("Updating calendar ...", "green")

    # Download before touching the database, so that a failed Wind request
    # cannot leave the calendar table emptied.
    try:
        df = load_calendar_from_wind(start_date, end_date)
    except Exception:
        Logger.error("Error occurred when loading")
        traceback.print_exc()
        raise ValueError

    with SqliteProxy(log=log) as proxy:
        proxy.connect(os.path.join(DB_CALENDAR_PATH, "calendar.db"))
        proxy.execute("DELETE FROM calendar")
        try:
            proxy.write_from_dataframe(df, "calendar")
        except Exception:
            Logger.error(
                "Error occurred when writing dataframe into sqlite db")
            traceback.print_exc()
            raise ValueError

    if log:
        Logger.info("calendar was updated from {} to {}".format(
            start_date, end_date),
                    color="green")
        Logger.info("------------------------------------------")
Exemplo n.º 4
0
def get_index_weights(index_code, date=""):
    """
    Read the constituent weights of an index on a single date.

    @index_code (str): index code, one of
        ['000016.SH', '000300.SH', '000905.SH']
    @date (%Y-%m-%d): single date, must be non-empty
    :return: {sec_id: weight}; an empty dict when the query returns no rows
    :raises ValueError: on an empty date or an unsupported index code
    """

    if not date:
        Logger.error("Empty date")
        raise ValueError

    if index_code not in ['000016.SH', '000300.SH', '000905.SH']:
        Logger.error("Invalid index code: {}".format(index_code))
        # Stop here instead of querying a table that is not supported.
        raise ValueError

    # One sqlite file per year, named after the first four chars of the date.
    dbpath = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(date[:4]))
    with SqliteProxy(log=False) as proxy:
        proxy.connect(dbpath)
        query = "SELECT sec_id, weight FROM [{}] WHERE date = '{}' ".format(
            index_code, date)
        df = proxy.query_as_dataframe(query)

        if len(df) == 0:
            Logger.warn("Empty result when reading {} at {}".format(
                index_code, date))
            return {}

        return dict(zip(df['sec_id'], df['weight']))
Exemplo n.º 5
0
def sqlize_db_industry(subdb):
    """
    Load the per-date json files of an industry sub-database into sqlite.

    Each json file under <industry db path>/<subdb>/ is read as a
    {sec_id: industry} mapping and appended to the table named `subdb` in
    the yearly database file <industry db path>/<year>.db.

    @subdb (str): name of the sub-database (also the target table name)
    """

    db_path = DB_PATH_LIB['industry']
    subdb_path = os.path.join(db_path, subdb)
    trading_days = listdir_advanced(subdb_path, 'json', strip_suffix=True)
    with SqliteProxy(log=False) as proxy:
        for year, dates in classify_dates_by_year(trading_days).items():
            path = os.path.join(db_path, '{}.db'.format(year))
            proxy.connect(path)
            if subdb not in proxy.list_tables:
                create_db_03(proxy, subdb)

            for date in dates:
                js = json2dict(os.path.join(subdb_path,
                                            '{}.json'.format(date)))
                df = pd.DataFrame(list(js.items()),
                                  columns=['sec_id', 'industry'])
                df['date'] = date
                try:
                    # Write to the table that was checked/created above
                    # rather than to a hard-coded table name.
                    proxy.write_from_dataframe(df, subdb)
                except Exception:
                    # Best effort: report the failing date and carry on.
                    Logger.error(
                        "Error occurred when sqlizing {} on {}.".format(
                            subdb, date))
                    traceback.print_exc()
Exemplo n.º 6
0
def open_db_folder(db=""):
    """
    Open a database folder in the Windows file explorer.

    @db (str): key of DB_PATH_LIB; an empty string opens DB_PATH itself
    :raises ValueError: when db is neither empty nor a key of DB_PATH_LIB
    """
    if not db:
        path = DB_PATH
    elif db in DB_PATH_LIB:
        path = DB_PATH_LIB[db]
    else:
        Logger.error("db not found: {}".format(db))
        # Without this the code below would fail on the unbound `path`.
        raise ValueError

    subprocess.Popen(r'explorer "{}"'.format(path))
Exemplo n.º 7
0
def get_secs_factor_on_multidays(factor,
                                 sec_ids=[],
                                 trading_days=[],
                                 log=False):
    """
    Read one factor over several dates from the local sqlite databases.

    @factor (str): a single factor name; must be in the "factor" schema
    @sec_ids (list): securities to query; an empty list applies no sec_id
        filter (documented by the original as "all A shares")
    @trading_days (["%Y-%m-%d"]): dates to read, must be non-empty
    @log (Bool): whether to print log messages
    :return: {date: DataFrame}, each DataFrame holding the columns sec_id
        and <factor> returned by the query for that date
    :raises ValueError: on an unknown factor, a non-list sec_ids, an empty
        trading_days, or a failing query
    """

    if factor not in get_schema("factor"):
        Logger.error("Unrecognized factor: {}".format(factor))
        raise ValueError

    if not isinstance(sec_ids, list):
        Logger.error("sec_ids must be list!")
        raise ValueError

    if not trading_days:
        Logger.error("Empty date")
        raise ValueError

    # Logged only after validation: indexing an empty trading_days here
    # would raise IndexError instead of the intended ValueError.
    if log:
        Logger.info(
            "Reading {} from {} to {}".format(factor, trading_days[0],
                                              trading_days[-1]), "green")

    # The sec_id filter does not depend on the date, so build it once.
    if len(sec_ids) == 0:  # empty means no filter
        conds = ""
    elif len(sec_ids) == 1:
        conds = "AND sec_id = '{}'".format(sec_ids[0])
    else:
        conds = "AND sec_id IN {}".format(tuple(sec_ids))

    # A long-lived connection is more efficient, which is why this does not
    # simply call get_secs_factor once per date.
    output = {}
    with SqliteProxy(log=log) as proxy:
        for year, date_list in classify_dates_by_year(trading_days).items():
            path = os.path.join(DB_FACTOR, '{}.db'.format(year))
            proxy.connect(path)
            for date in date_list:
                query = "SELECT sec_id, {} FROM [{}] WHERE date = '{}' {}".format(
                    factor, factor, date, conds)
                try:
                    df = proxy.query_as_dataframe(query)
                except Exception:
                    Logger.error("Error occurred when reading {} at {}".format(
                        factor, date))
                    traceback.print_exc()
                    raise ValueError

                output[date] = df

    return output
Exemplo n.º 8
0
def load_secs_industry_sw_from_wind(index_code, date, level=1):
    """
    Fetch Shenwan (SW) industry data for the constituents of an index.

    @index_code (str): index code; the original note lists "A" and "H"
    @date (%Y-%m-%d): single date
    @level (int): industry level, 1 by default (SW level-1 classification)
    :return: result of get_secs_industry_sw for the index constituents
        (documented by the original as {sec_id: industry name}); an empty
        dict when the index has no constituents on that date
    """

    sec_ids = get_index_contents(index_code, date, log=False)

    if sec_ids:
        return get_secs_industry_sw(sec_ids=sec_ids,
                                    date=date,
                                    level=level,
                                    market=index_code)

    Logger.error("Empty universe at {}!".format(date))
    return {}
Exemplo n.º 9
0
def load_secs_industry_gics_from_wind(index_code, date, level=1):
    """
    Fetch GICS industry data for the constituents of an index.

    @index_code (str): index code passed to get_index_contents
    @date (%Y-%m-%d): single date
    @level (int): industry level, 1 by default; forwarded to
        get_secs_industry_gics
    :return: result of get_secs_industry_gics for the index constituents
        (documented by the original as {sec_id: industry name}); an empty
        dict when the index has no constituents on that date
    """

    universe = get_index_contents(index_code, date)

    if not universe:
        Logger.error("Empty universe at {}!".format(date))
        return {}

    # Forward the caller's level instead of silently forcing level 1.
    return get_secs_industry_gics(sec_ids=universe, level=level)
Exemplo n.º 10
0
def update_indicators(indicators=[],
                      trading_days=[],
                      sec_ids=[],
                      override=False,
                      log=False):
    """
    Update several indicators for a list of dates.

    @indicators (list): indicator names; an empty list means every indicator
        in the "indicator" schema
    @trading_days ([%Y-%m-%d]): dates to update, must be non-empty; the
        first and last entries bound the report-day / trading-day lookup
    @sec_ids (list): forwarded unchanged to update_single_indicator
    @override (Bool): whether to overwrite existing records, False by default
    @log (Bool): whether to print log messages
    :raises ValueError: when trading_days is empty
    """

    if not trading_days:
        # Same guard as the sibling update functions; without it the
        # indexing below raises IndexError.
        Logger.error("Empty date")
        raise ValueError

    SCHEMA = get_schema('indicator')
    if not indicators:
        indicators = list(SCHEMA.keys())

    start = trading_days[0]
    end = trading_days[-1]

    # Dates on which each indicator type may be updated, keyed by the
    # schema's "type" value.
    update_days_map = {
        "财报数据": set(get_report_days(start, end)),
        "时间序列": set(get_trading_days(start, end)),
    }

    for ind in indicators:
        if ind not in SCHEMA:
            Logger.error("Unrecognized indicator: {}".format(ind))
            continue

        # Keep only the requested dates that are valid for this type.
        valid_days = update_days_map[SCHEMA[ind]['type']]
        update_days = [t for t in trading_days if t in valid_days]
        if not update_days:
            Logger.warn("No valid days to update!")
        else:
            update_single_indicator(indicator=ind,
                                    trading_days=update_days,
                                    sec_ids=sec_ids,
                                    override=override,
                                    log=log)
0
def calculate_factor(factor, date):
    """
    Compute the value of a factor on one date from its indicators.

    :param: factor (str): factor name; formula.calculate_raw_<factor> must
        exist
    :param: date (%Y-%m-%d): date to compute
    :return: DataFrame. Normally the output of statistical_process applied
        to the raw formula result. When load_context reports too many
        missing values (missing_flag == 1), today's frame left-merged with
        the values stored for the previous existing day.
    :raises ValueError: when the formula is not implemented, when data is
        missing with no earlier day to copy from, or when the earlier day
        cannot be read
    """
    # The None default is required: without it getattr raises AttributeError
    # and the check below can never trigger.
    func = getattr(formula, "calculate_raw_{}".format(factor), None)
    if func is None:
        Logger.error("Formula not implemented: {}".format(factor))
        raise ValueError

    context, df_today, missing_flag = load_context(factor, date)
    last_day = get_previous_existed_day_in_table(date, DB_FACTOR, factor)

    if missing_flag != 1:
        data_raw = func(context)
        # Post-processing: missing-value separation, winsorize, standardize.
        return statistical_process(
            data=data_raw,
            var=factor,
            winsor_LB=WINSORIZE_LB,
            winsor_UB=WINSORIZE_UB)

    if last_day is None:  # nothing earlier to copy from
        Logger.error("当前日期数据缺失值太多,且之前没有可以复制的文件")
        raise ValueError

    Logger.warn("由于 {} 值缺失太多直接复制于 {}".format(date, last_day))
    try:
        df_last = get_secs_factor(factor,
                                  sec_ids=[],
                                  date=last_day,
                                  log=False)
    except Exception:
        traceback.print_exc()
        Logger.warn("无法提取 {} 上个记录日的数据".format(factor))
        raise ValueError

    # df_last is indexed by sec_id, hence right_index=True.
    return df_today.merge(df_last,
                          how="left",
                          left_on='sec_id',
                          right_index=True)
Exemplo n.º 12
0
def get_index_contents_from_sql(index_code, date="", log=False):
    """
    Read the sec_id list of an index on one date from the yearly sqlite file.

    @index_code (str): index code, used as the table name
    @date ('%Y-%m-%d'): single date; its first four characters select the
        database file
    @log (Bool): passed to SqliteProxy
    :return (list): sec_id values, or an empty list when no row matches
    :raises ValueError: when the query fails
    """
    db_file = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(date[:4]))
    sql = "SELECT sec_id FROM [{}] WHERE date = '{}'".format(index_code, date)

    with SqliteProxy(log=log) as proxy:
        proxy.connect(db_file)
        try:
            rows = proxy.query_as_dataframe(sql)
        except Exception:
            Logger.error("Error occurred when reading {} at {}".format(
                index_code, date))
            traceback.print_exc()
            raise ValueError

    if len(rows) != 0:
        return rows["sec_id"].tolist()

    Logger.warn("Empty result when reading {} at {}".format(index_code, date))
    return []
Exemplo n.º 13
0
def update_index_std(index, cp=3, log=False):
    """
    Update the <index>_std table.

    No date list is needed: every date present in the index table but absent
    from the index_std table is processed and written.

    @index (str): name of the index (not the index_std name)
    @cp (int): critical value for winsorizing, forwarded to process_ts_index
    @log (Bool): accepted for interface compatibility; not used here
    """

    trading_days = get_unique_datelist_from_table("index", index)
    existed_days = get_unique_datelist_from_table("index_std", "{}_std".format(index))
    update_days = sorted(set(trading_days) - set(existed_days))
    if len(update_days) == 0:
        Logger.warn("All given dates has existed. No need to update!!")
        return

    output = process_ts_index(index, update_days, cp)
    if len(output) == 0:
        Logger.error("Fail to process {} on given dates".format(index))
        # Nothing to write; do not fall through and report success.
        return

    df2mysql(USER, PASSWORD, "index_std", index + '_std', output)
    del output, trading_days, update_days
    gc.collect()
    Logger.info("Updated successfully!!")
Exemplo n.º 14
0
def update_industry(industry, trading_days=[], override=False, log=False):
    """
    Update an industry database from Wind.

    @industry (str): name of the industry database; must be in the
        "industry" schema
    @trading_days (['%Y-%m-%d']): dates to update, must be non-empty
    @override (Bool): whether to overwrite existing records, False by
        default; only forwarded to the sql-backed updater
    @log (Bool): whether to print log messages
    :raises ValueError: on an empty trading_days, or when the industry is in
        neither INDUSTRY_AS_SQL nor INDUSTRY_AS_JSON
    """

    Logger.info("Updating industry {}".format(industry), "green")

    # An industry missing from the schema is reported but does not raise.
    if industry not in get_schema('industry'):
        Logger.error("Unrecognized industry: {}".format(industry))
        return

    if not trading_days:
        Logger.error("Empty date")
        raise ValueError

    stored_as_sql = industry in INDUSTRY_AS_SQL
    if not stored_as_sql and industry not in INDUSTRY_AS_JSON:
        Logger.error("Unrecognized industry: {}".format(industry))
        raise ValueError

    if stored_as_sql:
        update_industry_to_sql(industry, trading_days, override, log)
    else:
        # Non-sql data is always refreshed; the updater keeps a backup copy.
        update_industry_to_json(industry, trading_days)

    if log:
        Logger.info("industry/{} is updated.".format(industry), color="green")
        Logger.info("------------------------------------------")
Exemplo n.º 15
0
def update_industry_to_json(industry, trading_days):
    """
    Refresh a json-backed industry database from its loader.

    Only the last entry of trading_days is loaded. The existing json file is
    copied to <industry>_backup.json before being overwritten, and the
    "industry" schema entry is updated explicitly afterwards.

    @industry (str): key of INDEX_LOADER_MAP and name of the json file
    @trading_days (['%Y-%m-%d']): date list, must be non-empty
    :raises ValueError: on an empty trading_days, or when loading, writing
        or the schema update fails
    """
    if not trading_days:
        # Checked up front so that `date` is always bound when the handlers
        # below format their messages.
        Logger.error("Empty date")
        raise ValueError
    date = trading_days[-1]

    try:
        index_code, loader = INDEX_LOADER_MAP[industry]
        info = loader(index_code, date, level=1)
    except Exception:
        Logger.error("Error occurred when loading {} on {}".format(industry, date))
        traceback.print_exc()
        raise ValueError

    try:
        path = os.path.join(DB_INDUSTRY, '{}.json'.format(industry))
        copy_to = os.path.join(DB_INDUSTRY, '{}_backup.json'.format(industry))
        shutil.copy(path, copy_to)  # keep a backup in case the data gets corrupted
        dict2json(info, path, log=False)

        Logger.info("{} on {} is updated successfully".format(industry, date))
    except Exception:
        Logger.error("Error occurred when writing {} on {}".format(industry, date))
        traceback.print_exc()
        raise ValueError

    # json files are different from sql, cannot use update_schema()
    # therefore update schema information explicitly
    try:
        now = datetime.now()
        schema = get_schema('industry')
        schema[industry]["begin date"] = ""
        schema[industry]["end date"] = now.strftime('%Y-%m-%d')
        schema[industry]['last update'] = now.strftime('%Y-%m-%d %H:%M:%S')
        save_schema(schema, 'industry')

        Logger.info("schema updated: {}".format(industry))
    except Exception:
        Logger.error("Error occurred when updating schema of {}".format(industry))
        traceback.print_exc()
        raise ValueError
Exemplo n.º 16
0
def get_secs_IC(ic_code, trading_days=[], log=False):
    """
    Read one IC table over several dates from the MySQL "index" database.

    @ic_code (str): a single IC code, used as the table name (the original
        note mentions "stocks_info" for all A shares)
    @trading_days (["%Y-%m-%d"]): dates to read, must be non-empty
    @log (Bool): whether to print log messages
    :return: a single DataFrame with every column of the table for the
        requested dates
    :raises ValueError: on an empty trading_days or a failing query
    """

    if not trading_days:
        Logger.error("Empty date")
        raise ValueError

    # Logged only after validation so an empty list cannot raise IndexError.
    if log:
        Logger.info(
            "Reading {} from {} to {}".format(ic_code, trading_days[0],
                                              trading_days[-1]), "green")

    # A single value needs "=" with quotes; several values use a tuple
    # literal (a one-element tuple renders as ('x',), which is invalid SQL).
    if len(trading_days) == 1:
        query = "SELECT * FROM {} WHERE date = '{}' ".format(
            ic_code, trading_days[0])
    else:
        query = "SELECT * FROM {} WHERE date in {}".format(
            ic_code, tuple(trading_days))

    with MySQLProxy(log=log) as proxy:
        proxy.connect(USER, PASSWORD, 'index')
        try:
            df = proxy.query_as_dataframe(query)
        except Exception:
            Logger.error("Error occurred when reading {} from {} to {}".format(
                ic_code, trading_days[0], trading_days[-1]))
            traceback.print_exc()
            raise ValueError

    return df
Exemplo n.º 17
0
def update_factors_return(factors_ret_to_update=[],
                          trading_days=[],
                          group_num=10,
                          log=True):
    """
    Update factor_return data for the given dates.

    @factors_ret_to_update (list): factor_return names; an empty list means
        every entry of the "factor_return" schema
    @trading_days ([%Y-%m-%d]): date list, forwarded unchanged
    @group_num (int): number of groups, forwarded unchanged
    @log (Bool): whether to print log messages
    """
    schema = get_schema('factor_return')

    targets = factors_ret_to_update
    if len(targets) == 0:
        targets = list(schema.keys())

    for name in targets:
        if name in schema:
            update_single_factor_return(name, trading_days, group_num, log)
            continue
        # Unknown names are reported and skipped.
        Logger.error("Unrecognized factor return: {}".format(name))
Exemplo n.º 18
0
def get_secs_industry(industry_code, sec_ids=[], date=""):
    """
    Get the industry classification of some securities on one date.

    @industry_code (str): sub-database name; must belong to INDUSTRY_AS_SQL
        or INDUSTRY_AS_JSON (the original note lists "A_SWL1", "H_SWL1",
        "H_GICSL1")
    @sec_ids (list): securities to look up
    @date ("%Y-%m-%d"): single date; only used by the sql-backed reader
    :return: {sec_id: industry} per the original note; an empty dict when
        sec_ids is empty
    :raises ValueError: on an unrecognized industry code
    """

    if len(sec_ids) == 0:
        Logger.warn("Empty sec_ids when reading {} on {}!".format(industry_code, date))
        return {}

    if industry_code in INDUSTRY_AS_SQL:
        return get_secs_industry_from_sql(industry_code, sec_ids, date)

    if industry_code in INDUSTRY_AS_JSON:
        return get_secs_industry_from_json(industry_code, sec_ids)

    Logger.error("Unrecognized industry code: {}".format(industry_code))
    raise ValueError
Exemplo n.º 19
0
def update_factors(factors=[], trading_days=[], override=False, log=False):
    """
    Update several factors for a list of dates.

    @factors (list): factor names; an empty list means every factor of the
        "factor" schema, ordered by its "level" value
    @trading_days ([%Y-%m-%d]): date list, forwarded unchanged
    @override (Bool): whether to overwrite existing records, False by default
    @log (Bool): whether to print log messages
    """

    SCHEMA = get_schema("factor")

    def level_of(name):
        return SCHEMA[name]["level"]

    targets = factors if factors else sorted(SCHEMA, key=level_of)

    for name in targets:
        if name not in SCHEMA:
            Logger.error("Unrecognized factor: {}".format(name))
            Logger.info("------------------------------------------")
            continue

        update_single_factor(factor=name,
                             trading_days=trading_days,
                             override=override,
                             log=log)
Exemplo n.º 20
0
def get_secs_factor(factor, sec_ids=[], date="", log=False):
    """
    Read one factor on a single date from the local sqlite database.

    @factor (str): a single factor name; must be in the "factor" schema
    @sec_ids (list): securities to query; an empty list applies no sec_id
        filter (documented by the original as "all A shares")
    @date ('%Y-%m-%d'): single date, must be non-empty
    @log (Bool): whether to print log messages
    :return: DataFrame sorted and indexed by sec_id, with the factor column
    :raises ValueError: on an unknown factor, a non-list sec_ids, an empty
        date, or a failing query
    """

    if log:
        Logger.info("Reading {} at {}".format(factor, date), "green")

    if factor not in get_schema("factor"):
        Logger.error("Unrecognized factor: {}".format(factor))
        raise ValueError

    if not isinstance(sec_ids, list):
        Logger.error("sec_ids must be list!")
        raise ValueError

    if not date:
        Logger.error("Empty date")
        raise ValueError

    # Optional sec_id filter: none, equality for one id, IN for several.
    count = len(sec_ids)
    if count > 1:
        sec_filter = "AND sec_id IN {}".format(tuple(sec_ids))
    elif count == 1:
        sec_filter = "AND sec_id = '{}'".format(sec_ids[0])
    else:
        sec_filter = ""

    sql = "SELECT sec_id, {} FROM [{}] WHERE date = '{}' {}".format(
        factor, factor, date, sec_filter)
    # One sqlite file per year, named after the first four chars of the date.
    db_file = os.path.join(DB_FACTOR, '{}.db'.format(date[:4]))

    with SqliteProxy(log=log) as proxy:
        proxy.connect(db_file)
        try:
            rows = proxy.query_as_dataframe(sql)
        except Exception:
            Logger.error("Error occurred when reading {} at {}".format(
                factor, date))
            traceback.print_exc()
            raise ValueError
        ordered = rows.sort_values(by=['sec_id'])
        return ordered.set_index(['sec_id'])
Exemplo n.º 21
0
def sqlize_db(db_name, subdb_list=[]):
    """
    Load the per-date csv files of a database into yearly sqlite files.

    For every sub-database, each <DB_PATH>/<db_name>/<subdb>/<date>.csv is
    read, tagged with a date column and appended to table <subdb> of
    <DB_PATH>/<db_name>/<year>.db. Missing tables are created through
    DB_CREATOR_MAP[db_name].

    @db_name (str): database name (schema name and folder under DB_PATH)
    @subdb_list (list): sub-databases to process; an empty list means every
        entry of the schema, and names absent from the schema are dropped
    """

    # Read the schema once rather than once per candidate name.
    schema = get_schema(db_name)
    if not subdb_list:
        subdb_list = list(schema.keys())
    else:
        subdb_list = [s for s in subdb_list if s in schema]

    db_path = os.path.join(DB_PATH, db_name)

    with SqliteProxy(log=False) as proxy:
        for subdb in subdb_list:
            Logger.info("SQLing {}/{}".format(db_name, subdb), "green")

            subdb_path = os.path.join(db_path, subdb)
            trading_days = listdir_advanced(subdb_path,
                                            'csv',
                                            strip_suffix=True)
            for year, dates in classify_dates_by_year(trading_days).items():
                path = os.path.join(db_path, '{}.db'.format(year))
                proxy.connect(path)

                if subdb not in proxy.list_tables:
                    creator = DB_CREATOR_MAP[db_name]
                    creator(proxy, subdb)

                for date in dates:
                    df = pd.read_csv(
                        os.path.join(subdb_path, '{}.csv'.format(date)))
                    df['date'] = date
                    try:
                        proxy.write_from_dataframe(df, subdb)
                    except Exception:
                        # Best effort: report the failing date and carry on.
                        Logger.error(
                            "Error occurred when sqlizing {} on {}.".format(
                                subdb, date))
                        traceback.print_exc()
Exemplo n.º 22
0
def update_index_contents_to_sql(index_code,
                                 trading_days,
                                 override,
                                 log=False):
    """
    Update the sql-backed constituents of an index for the given dates.

    Dates are grouped by year; each year's rows go to table <index_code> of
    <DB_INDEX_CONTENTS>/<year>.db, created when missing. The schema is
    refreshed through update_schema once every date has been processed.

    @index_code (str): index code; table name and key of LOADER_MAP
    @trading_days (['%Y-%m-%d']): dates to update
    @override (Bool): when True, existing rows of a date are deleted and
        rewritten; when False, dates already stored are skipped
    @log (Bool): whether to print log messages
    :raises ValueError: when loading fails, the loader returns None, or
        writing fails
    """
    with SqliteProxy(log=log) as proxy:
        date_classfier = classify_dates_by_year(trading_days)

        for year, date_list in date_classfier.items():
            path = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(year))
            proxy.connect(path)
            if index_code not in proxy.list_tables:
                create_table(proxy, "index_contents", index_code)

            # Dates already stored in this year's table.
            query = "SELECT DISTINCT(date) FROM [{}]".format(index_code)
            lookup = proxy.query_as_dataframe(query)
            lookup = set(lookup['date'].tolist())

            for date in date_list:
                # Already stored and not overriding: skip this date.
                if date in lookup and not override:
                    if log:
                        Logger.warn("{} records on {} is existed.".format(
                            index_code, date))
                    continue

                try:
                    loader = LOADER_MAP[index_code]
                    df = loader(index_code, date)
                    # Tag the rows only when the loader returned data, so a
                    # None result reaches the dedicated check below instead
                    # of failing here as a load error.
                    if df is not None:
                        df['date'] = date
                except Exception:
                    Logger.error("Error occurred when loading {} on {}".format(
                        index_code, date))
                    raise ValueError

                if df is None:  # the download returned nothing
                    Logger.error("Fail to fetch {} data on {}".format(
                        index_code, date))
                    raise ValueError

                try:
                    if date in lookup and override:  # drop the old rows first
                        proxy.execute(
                            "DELETE FROM [{}] WHERE date = '{}'".format(
                                index_code, date))

                    proxy.write_from_dataframe(df, index_code)
                except Exception:
                    Logger.error(
                        "Error occurred when writing {} on {}".format(
                            index_code, date))
                    traceback.print_exc()
                    raise ValueError

                Logger.info("{} on {} is updated successfully".format(
                    index_code, date))

    update_schema('index_contents', index_code)
Exemplo n.º 23
0
def generate_table_template(db, table_name):
    """
    Build the column template used to create a database table.

    Each entry is a 4-tuple (column name, SQL type, flag, flag); the meaning
    of the two boolean flags is defined by the table creator that consumes
    the template, not here.

    @db (str): one of "indicator", "factor", "index_contents", "industry"
    @table_name (str): table name; for "indicator"/"factor" it is also the
        name of the value column
    :return (list): list of column tuples
    :raises ValueError: on an unrecognized db, or an unrecognized table name
        within "index_contents"
    """

    date_col = ("date", "CHAR(10)", False, False)
    sec_col = ("sec_id", "TEXT", False, False)

    if db in ("indicator", "factor"):
        return [date_col, sec_col, (table_name, "REAL", False, True)]

    if db == "industry":
        return [date_col, sec_col, ("industry", "TEXT", False, True)]

    if db != "index_contents":
        Logger.error("Unrecognized db name: {}".format(db))
        raise ValueError

    # index_contents tables: some carry a weight column, some do not.
    name_col = ("sec_name", "TEXT", False, False)
    if table_name == "A_SWL1":
        return [date_col, sec_col, name_col]

    if table_name in ('000016.SH', '000300.SH', '000905.SH'):
        return [date_col, sec_col, name_col, ("weight", "REAL", False, False)]

    Logger.error("Unrecognized table name: {}".format(table_name))
    raise ValueError
Exemplo n.º 24
0
def get_index_contents_on_multidays(index_code, trading_days=[], log=False):
    """
    Read the constituent list of an index for several dates.

    @index_code (str): index code; must belong to IDXCONT_AS_SQL or
        IDXCONT_AS_CSV (the original note lists 'A', '000905.SH',
        '000300.SH', '000016.SH')
    @trading_days (['%Y-%m-%d']): dates to read, must be non-empty
    @log (Bool): whether to print log messages
    :return: {date: list of security codes}; for csv-backed indexes every
        date maps to the same list
    :raises ValueError: on an empty trading_days, an unrecognized index
        code, or a failing query
    """

    if log:
        Logger.info(
            "Reading all {} records between trading_days ...".format(
                index_code), "green")

    if len(trading_days) == 0:
        Logger.error("Empty date")
        raise ValueError
    elif len(trading_days) == 1:
        date = trading_days[0]
        return {date: get_index_contents(index_code, date, log=False)}

    output = {}
    if index_code in IDXCONT_AS_SQL:
        with SqliteProxy(log=log) as proxy:
            for year, date_list in classify_dates_by_year(
                    trading_days).items():
                path = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(year))
                proxy.connect(path)

                # Build the IN list by hand: a year holding a single date
                # would otherwise render as ('d',), which is invalid SQL.
                date_values = ", ".join(
                    "'{}'".format(d) for d in date_list)
                query = "SELECT date, sec_id FROM [{}] WHERE date IN ({})".format(
                    index_code, date_values)
                try:
                    df = proxy.query_as_dataframe(query)
                except Exception:
                    Logger.error(
                        "Error occurred when reading {} from {} to {}".format(
                            index_code, trading_days[0], trading_days[-1]))
                    traceback.print_exc()
                    raise ValueError

                if len(df) == 0:
                    Logger.warn(
                        "Empty result when reading {} from {} to {}".format(
                            index_code, trading_days[0], trading_days[-1]))

                for date in date_list:
                    output[date] = df[df.date == date]['sec_id'].tolist()
    elif index_code in IDXCONT_AS_CSV:
        # The csv-backed reader is not given a date.
        info = get_index_contents_from_csv(index_code)
        output = {date: info for date in trading_days}
    else:
        Logger.error("Unrecognized index code: {}".format(index_code))
        raise ValueError
    return output
Exemplo n.º 25
0
def get_secs_index_std(index_std, sec_ids=[], trading_days=[], log=False):
    """
    Read one index_std table over several dates from MySQL.

    @index_std (str): a single index_std name, used as the table name
    @sec_ids (list): securities to query; an empty list applies no sec_id
        filter (documented by the original as "all A shares")
    @trading_days (["%Y-%m-%d"]): dates to read, must be non-empty
    @log (Bool): whether to print log messages
    :return: a single DataFrame with every column of the table; the date
        column is converted to str
    :raises ValueError: on a non-list sec_ids, an empty trading_days, or a
        failing query
    """

    # NOTE: the original has a check of index_std against the "index_std"
    # schema commented out; no such validation is performed here.

    if not isinstance(sec_ids, list):
        Logger.error("sec_ids must be list!")
        raise ValueError

    if not trading_days:
        Logger.error("Empty date")
        raise ValueError

    # Logged only after validation so an empty list cannot raise IndexError.
    if log:
        Logger.info("Reading {} from {} to {}".format(index_std, trading_days[0], trading_days[-1]), "green")

    # A single value needs "=" with quotes; several values use a tuple
    # literal (a one-element tuple renders as ('x',), which is invalid SQL).
    if len(trading_days) == 1:
        conds = ["date = '{}'".format(trading_days[0])]
    else:
        conds = ["date in {}".format(tuple(trading_days))]

    if len(sec_ids) == 1:
        conds.append("sec_id = '{}'".format(sec_ids[0]))
    elif len(sec_ids) > 1:
        conds.append("sec_id in {}".format(tuple(sec_ids)))

    query = "SELECT * FROM {} WHERE {}".format(index_std, " AND ".join(conds))

    with MySQLProxy(log=log) as proxy:
        proxy.connect(USER, PASSWORD, "index_std")
        try:
            df = proxy.query_as_dataframe(query)
        except Exception:
            Logger.error("Error occurred when reading {} ".format(index_std))
            traceback.print_exc()
            raise ValueError

    df['date'] = df['date'].apply(str)
    return df
Exemplo n.º 26
0
def load_single_indicator_on_single_day_from_wind(indicator,
                                                  sec_ids,
                                                  date,
                                                  log=False):
    """
    Download one indicator for one date from Wind.

    @indicator (str): indicator name (key of SCHEMA); a single indicator only
    @sec_ids (list): security codes; an empty list means the constituents of
        index "A" on that date
    @date ("%Y-%m-%d"): single date
    @log (Bool): forwarded to get_index_contents
    :return: DataFrame with columns ['sec_id', <indicator>]
    :raises ValueError: on an unrecognized indicator type, or when the
        constituent list cannot be fetched
    """

    WindAPI.login(is_quiet=True)

    schema = SCHEMA[indicator]
    # Copy the options: the date key added below must not leak into the
    # shared schema entry.
    options = dict(schema['kwargs'])

    indicator_type = schema["type"]
    if indicator_type == "时间序列":
        date_key, approx = "tradeDate", False
    elif indicator_type == "财报数据":
        # approx=True covers report dates that are not trading days.
        date_key, approx = "rptDate", True
    else:
        Logger.error("Unrecognized indicator type: {}".format(schema["type"]))
        raise ValueError

    if len(sec_ids) != 0:
        universe = sec_ids
    else:  # empty means all A shares
        universe = get_index_contents(index_code="A",
                                      date=date,
                                      approx=approx,
                                      log=log)
        if universe is None:
            Logger.error("Fail to fetch stock lists on {}".format(date))
            raise ValueError

    options[date_key] = date.replace("-", "")

    response = WDServer.wss(codes=",".join(universe),
                            fields=SCHEMA[indicator]['field'],
                            options=options2str(options))
    WindAPI.test_error(response)
    df = {field: response.Data[i] for i, field in enumerate(response.Fields)}
    df = pd.DataFrame(df, index=response.Codes).reset_index()
    df.columns = ["sec_id", indicator]
    return df
Exemplo n.º 27
0
def update_index_contents_to_csv(index_code, trading_days, override=True):
    """
    Refresh the csv-stored contents of one index and its schema entry.

    Only the last date in trading_days is loaded. The existing csv is copied
    to '<index_code>_backup.csv' before it is overwritten.

    @index_code (str): index code; must be a key of LOADER_MAP
    @trading_days (['%Y-%m-%d']): date list; only the last entry is used
    @override (Bool): accepted for signature parity with the sql updater;
        not consulted here, the csv file is always rewritten
    :raise ValueError: when loading, writing or the schema update fails
    """
    try:
        date = trading_days[-1]
        # The loader must be looked up before it is called.
        loader = LOADER_MAP[index_code]
        df = loader(index_code, date)
    except Exception:
        Logger.error("Error occurred when loading {}".format(index_code))
        traceback.print_exc()
        raise ValueError

    try:
        path = os.path.join(DB_INDEX_CONTENTS, '{}.csv'.format(index_code))
        copy_to = os.path.join(DB_INDEX_CONTENTS,
                               '{}_backup.csv'.format(index_code))
        shutil.copy(path, copy_to)  # keep a backup in case the write corrupts data
        df.to_csv(path, encoding="utf-8", index=False)

        Logger.info("{} on {} is updated successfully".format(
            index_code, date))
    except Exception:
        Logger.error("Error occurred when writing {}".format(index_code))
        traceback.print_exc()
        raise ValueError

    # csv files are different from sql, cannot use update_schema()
    # therefore update schema information explicitly
    try:
        now = datetime.now()
        schema = get_schema('index_contents')
        schema[index_code]["begin date"] = ""
        schema[index_code]["end date"] = now.strftime('%Y-%m-%d')
        schema[index_code]['last update'] = now.strftime('%Y-%m-%d %H:%M:%S')
        save_schema(schema, 'index_contents')

        Logger.info("schema updated: {}".format(index_code))
    except Exception:
        Logger.error(
            "Error occurred when updating schema of {}".format(index_code))
        traceback.print_exc()
        raise ValueError
Exemplo n.º 28
0
def update_index_contents(index_code,
                          trading_days=None,
                          override=False,
                          log=False):
    """
    Update index_contents data for one index.

    Dispatches to the sql updater or the csv updater depending on whether
    index_code is listed in IDXCONT_AS_SQL or IDXCONT_AS_CSV.

    @index_code (str): index to update
    @trading_days (['%Y-%m-%d']): date list; must not be empty
    @override (Bool): whether to overwrite existing data, default False
    @log (Bool): whether to print log
    :return: None; also returns early (after logging) when index_code is
        missing from the 'index_contents' schema
    :raise ValueError: when trading_days is empty or index_code belongs to
        neither storage backend
    """

    Logger.info("Updating index_contents {}".format(index_code), "green")

    if index_code not in get_schema('index_contents'):
        Logger.error("Unrecognized index: {}".format(index_code))
        return

    if not trading_days:
        Logger.error("Empty date")
        raise ValueError

    if index_code in IDXCONT_AS_SQL:
        update_index_contents_to_sql(index_code, trading_days, override, log)
    elif index_code in IDXCONT_AS_CSV:
        # csv-backed data is always rewritten; the csv updater keeps a backup.
        # override is still passed because the csv updater's signature takes it.
        update_index_contents_to_csv(index_code, trading_days, override)
    else:
        Logger.error("Unrecognized index code: {}".format(index_code))
        raise ValueError

    if log:
        Logger.info("index_content/{} is updated.".format(index_code),
                    color="green")
        Logger.info("------------------------------------------")
Exemplo n.º 29
0
def get_single_index_daily_return(index_std,
                                  trading_days,
                                  cycle,
                                  groups=10,
                                  save_plot=True,
                                  save_daily=True,
                                  save_cum=True):
    """
    Back-test a single index_std and build its per-period group return table.

    Every `cycle`-th trading day (starting at position `cycle` of the sorted
    list) is a rebalance date. On each rebalance date securities are sorted
    by index_std, split into `groups` buckets, and each bucket's return to
    the next rebalance date is computed, weighted by the current close.

    @index_std (str): name of the index_std column to sort by
    @trading_days (list of date): back-test period
    @cycle (int): rebalance period, in trading days
    @groups (int): number of buckets, default 10
    @save_plot (Bool): save the cumulative-return plot as png
    @save_daily (Bool): save the per-period return table as csv
    @save_cum (Bool): save the cumulative return table as csv
    :return: None; results are written to files under PROJECT_FILES_PATH
    :raise ValueError: when trading_days is empty, when fewer than two
        rebalance dates result from `cycle`, or when a rebalance date has
        fewer securities than `groups`
    """

    if len(trading_days) == 0:
        Logger.error("Empty date!!")
        raise ValueError("Empty date!!")

    # Work out the rebalance dates implied by the cycle.
    trading_days = sorted(trading_days)
    selected_days = [
        str(trading_days[i]) for i in range(cycle, len(trading_days), cycle)
    ]
    if len(selected_days) < 2:
        # A return needs a pair of consecutive rebalance dates.
        Logger.error("Not enough trading days for cycle {}".format(cycle))
        raise ValueError("Not enough trading days for cycle {}".format(cycle))

    # Fetch close prices and index_std for all rebalance dates in one go.
    df_close = get_secs_index(index='close',
                              sec_ids=[],
                              trading_days=selected_days)
    df_index = get_secs_index_std(index_std=index_std,
                                  sec_ids=[],
                                  trading_days=selected_days)

    group_names = ['group{:0>2}'.format(i) for i in range(1, groups + 1)]
    daily_frames = []
    for date_now, date_next in zip(selected_days, selected_days[1:]):
        index_now = df_index[df_index.date == date_now]
        close_now = df_close[df_close.date == date_now].rename(
            columns={'close': 'close_now'}).drop('date', axis=1)
        close_next = df_close[df_close.date == date_next].rename(
            columns={'close': 'close_next'}).drop('date', axis=1)

        df_all = index_now.merge(close_now, how='inner', on=['sec_id'])
        df_all = df_all.merge(close_next, how='inner', on=['sec_id'])
        df_all = df_all.sort_values(by=[index_std])
        df_all['return_rate'] = df_all['close_next'] / df_all['close_now'] - 1

        # Bucket by sorted position; when the rows do not divide evenly the
        # remainder goes into the last bucket.
        dist = int(df_all.shape[0] / groups)
        if dist == 0:
            message = "Only {} securities on {}, cannot form {} groups".format(
                df_all.shape[0], date_now, groups)
            Logger.error(message)
            raise ValueError(message)
        positions = np.arange(df_all.shape[0])
        df_all['group'] = np.minimum(positions // dist, groups - 1) + 1

        # Bucket return: return_rate weighted by each security's share of the
        # bucket's total close_now.
        df_group = df_all.groupby([
            'group'
        ]).apply(lambda x: sum(x.return_rate *
                               (x.close_now / x.close_now.sum()))).to_frame()
        df_group = df_group.rename(columns={0: 'return_rate'}).transpose()
        df_group.columns = group_names
        df_group['date'] = date_now
        daily_frames.append(df_group)

    # pd.concat replaces the removed DataFrame.append.
    df_daily = pd.concat(daily_frames).reset_index(drop=True)
    df_daily['diff'] = df_daily['group{:0>2}'.format(
        groups)] - df_daily['group01']
    if save_daily:
        df_daily.to_csv(os.path.join(
            PROJECT_FILES_PATH, 'daily_return',
            "{}_cycle_{}_daily.csv".format(index_std, cycle)),
                        index=False,
                        encoding='utf-8')

    # Simple-interest cumulative return: start at 1, then add the previous
    # period's return. Columns are addressed by name, not by position.
    df_cum = df_daily.copy()
    for i in range(df_daily.shape[0]):
        if i == 0:
            df_cum.loc[i, group_names] = 1
        else:
            df_cum.loc[i, group_names] = (df_cum.loc[i - 1, group_names] +
                                          df_daily.loc[i - 1, group_names])
    if save_cum:
        df_cum.to_csv(os.path.join(
            PROJECT_FILES_PATH, 'cum_return',
            "{}_cycle_{}_cum.csv".format(index_std, cycle)),
                      index=False,
                      encoding='utf-8')

    # Plot the lowest and highest bucket.
    plt.figure(figsize=(12, 6))
    plt.ylabel("单利累计收益")
    ax = plt.subplot(111)
    df_cum = df_cum.set_index('date')
    df_cum[['group01', 'group{:0>2}'.format(groups)]].plot(ax=ax)
    ax.set_title("{}单利累计收益图  cycle={}".format(index_std, cycle))
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    if save_plot:
        picfile = os.path.join(PROJECT_FILES_PATH, 'plot',
                               "{}_cycle_{}_cum.png".format(index_std, cycle))
        plt.savefig(picfile)
        print("plot is saved to: {}".format(picfile))
Exemplo n.º 30
0
def update_single_indicator(indicator,
                            sec_ids=None,
                            trading_days=None,
                            override=False,
                            log=False):
    """
    Update one indicator for the given list of dates.

    Dates are grouped by year and written to '<year>.db' under DB_INDICATOR.
    Dates that already have records are skipped unless override is set, in
    which case the matching records are deleted before the new ones are
    written.

    @indicator (str): name of a single indicator
    @sec_ids (list): security codes; empty or None means every security
    @trading_days (['%Y-%m-%d']): date list; must not be empty
    @override (Bool): whether to overwrite existing records, default False
    @log (Bool): whether to print log
    :raise ValueError: unknown indicator, empty trading_days, or a failure
        while loading from Wind or writing to the database
    """

    # None defaults avoid sharing one list object across calls.
    sec_ids = [] if sec_ids is None else sec_ids

    if log:
        Logger.info("Updating indicator {}".format(indicator), "green")

    if indicator not in get_schema('indicator'):
        Logger.error("Unrecognized indicator: {}".format(indicator))
        raise ValueError

    if not trading_days:
        Logger.error("Empty date")
        raise ValueError

    with SqliteProxy(log=log) as proxy:
        date_classfier = classify_dates_by_year(trading_days)

        for year, date_list in date_classfier.items():
            path = os.path.join(DB_INDICATOR, '{}.db'.format(year))
            proxy.connect(path)

            if indicator not in proxy.list_tables:
                create_table(proxy, "indicator", indicator)

            # Find which of the requested dates already have records.
            # A one-element tuple formats as ('x',), which is not valid SQL,
            # hence the separate single-date query.
            if len(date_list) == 1:
                query = "SELECT DISTINCT(date) FROM {} WHERE date = '{}'".format(
                    indicator, date_list[0])
            else:
                query = "SELECT DISTINCT(date) FROM {} WHERE date in {}".format(
                    indicator, tuple(date_list))
            lookup = proxy.query_as_dataframe(query)
            lookup = set(lookup['date'].tolist())

            for date in date_list:
                # Existing date and no override: skip it.
                if date in lookup and not override:
                    if log:
                        Logger.warn("{} records on {} is existed.".format(
                            indicator, date))
                    continue

                try:
                    df = load_single_indicator_on_single_day_from_wind(
                        indicator=indicator,
                        sec_ids=sec_ids,
                        date=date,
                        log=log)
                except Exception:
                    Logger.error("Error occurred when loading {} on {}".format(
                        indicator, date))
                    traceback.print_exc()
                    raise ValueError

                if df is None:  # download from Wind failed
                    Logger.error("Fail to fetch {} data on {}".format(
                        indicator, date))
                    raise ValueError

                if date in lookup and override:
                    # Delete the records being replaced. Exactly one of the
                    # three statements runs.
                    if len(sec_ids) == 0:
                        proxy.execute(
                            "DELETE FROM [{}] WHERE date = '{}'".format(
                                indicator, date))
                    elif len(sec_ids) == 1:
                        proxy.execute(
                            "DELETE FROM [{}] WHERE date = '{}' and sec_id = '{}'"
                            .format(indicator, date, sec_ids[0]))
                    else:
                        proxy.execute(
                            "DELETE FROM [{}] WHERE date = '{}' and sec_id in {}"
                            .format(indicator, date, tuple(sec_ids)))

                df['date'] = date
                try:
                    proxy.write_from_dataframe(df, indicator)
                except Exception:
                    Logger.error(
                        "Error occurred when writing {} on {}".format(
                            indicator, date))
                    traceback.print_exc()
                    raise ValueError
                if log:
                    Logger.info("{} on {} is updated successfully".format(
                        indicator, date))

    update_schema(db_name="indicator", sub_name=indicator)

    if log:
        Logger.info("indicator {} is updated.".format(indicator),
                    color="green")
        Logger.info("------------------------------------------")