Example #1
def update_schema(db_name, sub_name):
    """
    更新schema相关的begin date,end date, last update 适用于非factor_return相关的数据库

    @db_name (str): db的名称 eg. FACTOR 排除factor_return
    @sub_name (str): db中各子数据库的名称 eg. VALUE GROWTH
    """

    schema = json2dict(os.path.join(DB_PATH_LIB[db_name], 'schema'))

    assert sub_name

    date_list = get_date_lists_in_table(DB_PATH_LIB[db_name], sub_name)

    schema[sub_name]['begin date'] = date_list[0]
    schema[sub_name]['end date'] = date_list[-1]
    schema[sub_name]['last update'] = datetime.now().strftime(
        '%Y-%m-%d %H:%M:%S')

    Logger.info("schema updated: {}".format(sub_name))
    dict2json(schema, os.path.join(DB_PATH_LIB[db_name], 'schema'), log=False)
    a = pd.DataFrame(schema).T
    col_names = [
        'aspect', 'type', 'begin date', 'end date', 'last update', 'col_names',
        'field', 'kwargs', 'explanation'
    ]
    b = a.reindex(columns=col_names).reset_index().rename(columns={
        'index': 'indicator'
    }).sort_values(['type', 'aspect', 'field'])
    b.to_csv(os.path.join(DB_PATH_LIB[db_name], 'schema.csv'), index=False)
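
A minimal usage sketch (not from the original source), mirroring the call made in the factor updater below; 'factor' and 'VALUE' follow the docstring examples and are assumed to exist under DB_PATH_LIB:

# Refresh the recorded date range and timestamp for the VALUE sub-database of the factor db
update_schema(db_name='factor', sub_name='VALUE')
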
Example #2
def update_industry_to_json(industry, trading_days):
    date = trading_days[-1]
    try:
        index_code, loader = INDEX_LOADER_MAP[industry]
        info = loader(index_code, date, level=1)
    except Exception:
        Logger.error("Error occurred when loading {} on {}".format(industry, date))
        raise ValueError

    try:
        path = os.path.join(DB_INDUSTRY, '{}.json'.format(industry))
        copy_to = os.path.join(DB_INDUSTRY, '{}_backup.json'.format(industry))
        shutil.copy(path, copy_to)  # keep a backup copy in case the data gets corrupted
        dict2json(info, path, log=False)

        Logger.info("{} on {} is updated successfully".format(industry, date))
    except Exception:
        Logger.error("Error occurred when writing {} on {}".format(industry, date))
        raise ValueError

    # json files are different from sql, cannot use update_schema()
    # therefore update schema information explicitly
    try:
        now = datetime.now()
        schema = get_schema('industry')
        schema[industry]["begin date"] = ""
        schema[industry]["end date"] = now.strftime('%Y-%m-%d')
        schema[industry]['last update'] = now.strftime('%Y-%m-%d %H:%M:%S')
        save_schema(schema, 'industry')

        Logger.info("schema updated: {}".format(industry))
    except Exception:
        Logger.error("Error occurred when updating schema of {}".format(industry))
        traceback.print_exc()
        raise ValueError
Example #3
def get_index_contents(index_code, date="", approx=False, log=False):
    """
    读取单个日期指数成分股列表

    @index_code (str): 指数代码,目前支持 ['A', 'H', '000905.SH', '000300.SH', '000016.SH', 'HSI.HI']
    @date ('%Y-%m-%d'): 单个日期
    @log (Bool): 是否打印log
    :return (list): 股票代码列表
    """

    if log:
        Logger.info(
            "Reading index contents of {} on {}".format(index_code, date),
            "green")

    if not date:
        Logger.error("Empty date")
        raise ValueError

    # approx covers the case where a financial-report date is not a trading day when updating indicator data
    if approx:
        date = get_nearest_trading_day(date=date,
                                       direction='left',
                                       self_included=True)

    if index_code in IDXCONT_AS_SQL:
        output = get_index_contents_from_sql(index_code, date, log=log)
    elif index_code in IDXCONT_AS_CSV:
        output = get_index_contents_from_csv(index_code)
    else:
        Logger.error("Unrecognized index code: {}".format(index_code))
        raise ValueError
    return output
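
A usage sketch, assuming the local index_contents databases are in place; the dates and index codes are illustrative:

# Constituents of CSI 300 on a given date (returns a list of sec_ids)
hs300 = get_index_contents('000300.SH', date='2019-12-31', log=True)

# A report date that is not a trading day can be mapped to the nearest earlier trading day
csi500 = get_index_contents('000905.SH', date='2019-06-30', approx=True)
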
Example #4
def get_secs_factor_on_multidays(factor,
                                 sec_ids=[],
                                 trading_days=[],
                                 log=False):
    """
    从本地数据库中获取一段日期的单个factor的值,并返回 dict of DataFrame

    @factor (str): 单个factor
    @sec_ids (list): 支持多个股票查询,默认为[],表示查询范围是全A股
    @trading_days (["%Y-%m-%d"]): 日期列表
    @log (Bool): 是否打印log
    :return: {date: Dataframe},其中 DataFrame 列为factor名,index为sec_id
    """

    if log:
        Logger.info(
            "Reading {} from {} to {}".format(factor, trading_days[0],
                                              trading_days[-1]), "green")

    if factor not in get_schema("factor"):
        Logger.error("Unrecognized factor: {}".format(factor))
        raise ValueError

    if not isinstance(sec_ids, list):
        Logger.error("sec_ids must be list!")
        raise ValueError

    if not trading_days:
        Logger.error("Empty date")
        raise ValueError

    # A persistent connection is more efficient, so the query is re-implemented here instead of reusing get_secs_factor
    with SqliteProxy(log=log) as proxy:
        output = {}
        for year, date_list in classify_dates_by_year(trading_days).items():
            path = os.path.join(DB_FACTOR, '{}.db'.format(year))
            proxy.connect(path)
            for date in date_list:
                if len(sec_ids) == 0:  # empty list defaults to the whole A-share universe
                    conds = ""
                elif len(sec_ids) == 1:
                    conds = "AND sec_id = '{}'".format(sec_ids[0])
                else:
                    conds = "AND sec_id IN {}".format(tuple(sec_ids))
                query = "SELECT sec_id, {} FROM [{}] WHERE date = '{}' {}".format(
                    factor, factor, date, conds)
                try:
                    df = proxy.query_as_dataframe(query)
                except Exception:
                    Logger.error("Error occurred when reading {} at {}".format(
                        factor, date))
                    traceback.print_exc()
                    raise ValueError

                output[date] = df

    return output
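
A sketch of how the returned dict of DataFrames might be consumed; the factor name and sec_ids are hypothetical and must exist in the factor schema:

days = ['2020-01-02', '2020-01-03', '2020-01-06']
# 'pe_ttm' is a hypothetical factor name registered in get_schema("factor")
data = get_secs_factor_on_multidays('pe_ttm',
                                    sec_ids=['000001.SZ', '600000.SH'],
                                    trading_days=days)

# Optionally stack the per-date frames into one long DataFrame
import pandas as pd
panel = pd.concat(data, names=['date']).reset_index(level=0)
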
Example #5
def update_index_contents_to_sql(index_code,
                                 trading_days,
                                 override,
                                 log=False):
    with SqliteProxy(log=log) as proxy:
        date_classifier = classify_dates_by_year(trading_days)

        for year, date_list in date_classifier.items():
            path = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(year))
            proxy.connect(path)
            if index_code not in proxy.list_tables:
                create_table(proxy, "index_contents", index_code)

            # check which dates already exist
            query = "SELECT DISTINCT(date) FROM [{}]".format(index_code)
            lookup = proxy.query_as_dataframe(query)
            lookup = set(lookup['date'].tolist())

            for date in date_list:
                if date in lookup and not override:  # skip dates that already exist in the database when not overriding
                    if log:
                        Logger.warn("{} records on {} already exist.".format(
                            index_code, date))
                    continue

                try:
                    loader = LOADER_MAP[index_code]
                    df = loader(index_code, date)
                    df['date'] = date
                except Exception:
                    Logger.error("Error occurred when loading {} on {}".format(
                        index_code, date))
                    raise ValueError

                if df is not None:  # data was downloaded from Wind successfully
                    try:
                        if date in lookup and override:  # when overriding, delete the existing records first
                            proxy.execute(
                                "DELETE FROM [{}] WHERE date = '{}'".format(
                                    index_code, date))

                        proxy.write_from_dataframe(df, index_code)
                    except Exception:
                        Logger.error(
                            "Error occurred when writing {} on {}".format(
                                index_code, date))
                        traceback.print_exc()
                        raise ValueError

                    Logger.info("{} on {} is updated successfully".format(
                        index_code, date))
                else:  # failed to fetch data from Wind
                    Logger.error("Fail to fetch {} data on {}".format(
                        index_code, date))
                    raise ValueError

    update_schema('index_contents', index_code)
Example #6
def get_index_contents_on_multidays(index_code, trading_days=[], log=False):
    """
    读取多个日期某指数全部股票列表

    @index_code (str): 指数代码,目前支持 ['A', '000905.SH', '000300.SH', '000016.SH']
    @trading_days (['%Y-%m-%d']): 日期列表
    @log (Bool): 是否打印log
    :return: ({date: list}), key为date value为 股票代码列表
    """

    if log:
        Logger.info(
            "Reading all {} records between trading_days ...".format(
                index_code), "green")

    if len(trading_days) == 0:
        Logger.error("Empty date")
        raise ValueError
    elif len(trading_days) == 1:
        date = trading_days[0]
        return {date: get_index_contents(index_code, date, log=False)}

    output = {}
    if index_code in IDXCONT_AS_SQL:
        with SqliteProxy(log=log) as proxy:
            for year, date_list in classify_dates_by_year(
                    trading_days).items():
                path = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(year))
                proxy.connect(path)

                query = "SELECT date, sec_id FROM [{}] WHERE date IN {}".format(
                    index_code, tuple(date_list))
                try:
                    df = proxy.query_as_dataframe(query)
                except Exception:
                    Logger.error(
                        "Empty result when reading {} from {} to {}".format(
                            index_code, trading_days[0], trading_days[-1]))
                    traceback.print_exc()
                    raise ValueError

                if len(df) == 0:
                    Logger.warn(
                        "Empty result when reading {} from {} to {}".format(
                            index_code, trading_days[0], trading_days[-1]))

                for date in date_list:
                    output[date] = df[df.date == date]['sec_id'].tolist()
    elif index_code in IDXCONT_AS_CSV:
        info = get_index_contents_from_csv(index_code)
        output = {date: info for date in trading_days}
    else:
        Logger.error("Unrecognized index code: {}".format(index_code))
        raise ValueError
    return output
Example #7
def get_secs_index_std(index_std, sec_ids=[], trading_days=[], log=False):
    """
    从本地数据库中获取一段日期的单个index_std的值,并返回 DataFrame

    @index_std (str): 单个index_std
    @sec_ids (list): 支持多个股票查询,默认为[],表示查询范围是全A股
    @trading_days (["%Y-%m-%d"]): 日期列表
    @log (Bool): 是否打印log
    :return: {date: Dataframe},其中 DataFrame 列为index_std名,index_std为sec_id
    """

    if log:
        Logger.info("Reading {} from {} to {}".format(index_std, trading_days[0], trading_days[-1]), "green")

    # if index_std not in get_schema("index_std"):
    #     Logger.error("Unrecognized index_std: {}".format(index_std))
    #     raise ValueError

    if not isinstance(sec_ids, list):
        Logger.error("sec_ids must be list!")
        raise ValueError

    if not trading_days:
        Logger.error("Empty date")
        raise ValueError

    with MySQLProxy(log=log) as proxy:
        proxy.connect(USER, PASSWORD, "index_std")
        # Note: a single value uses = and needs quotes; multiple values use a tuple
        if len(sec_ids) == 0:
            if len(trading_days) == 1:
                query = "SELECT * FROM {} WHERE date = '{}' ".format(index_std, trading_days[0])
            else:
                query = "SELECT * FROM {} WHERE date in {}".format(index_std, tuple(trading_days))
        elif len(sec_ids) == 1:
            if len(trading_days) == 1:
                query = "SELECT * FROM {} WHERE date = '{}' AND sec_id = '{}' ".format(index_std, trading_days[0], sec_ids[0])
            else:
                query = "SELECT * FROM {} WHERE date in {} AND sec_id = '{}' ".format(index_std, tuple(trading_days), sec_ids[0])
        else:
            if len(trading_days) == 1:
                query = "SELECT * FROM {} WHERE date = '{}' AND sec_id in {}".format(index_std, trading_days[0], tuple(sec_ids))
            else:
                query = "SELECT * FROM {} WHERE date in {} AND sec_id in {}".format(index_std, tuple(trading_days), tuple(sec_ids))

        try:
            df = proxy.query_as_dataframe(query)
        except Exception:
            Logger.error("Error occurred when reading {}".format(index_std))
            traceback.print_exc()
            raise ValueError
    df['date'] = df['date'].apply(lambda x: str(x))
    return df
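
A hedged usage sketch; the index_std table name is hypothetical, and the MySQL credentials (USER, PASSWORD) come from the module configuration:

days = ['2020-03-02', '2020-03-03']
# 'momentum_std' is a hypothetical table in the index_std MySQL database
df = get_secs_index_std('momentum_std', sec_ids=['000001.SZ'], trading_days=days, log=True)
print(df.head())
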
Example #8
def get_secs_factor(factor, sec_ids=[], date="", log=False):
    """
    从本地数据库中获取单个日期的单个factor的值,并返回 DataFrame

    @factor (str): 单个factor
    @sec_ids (list): 支持多个股票查询,默认为[],表示查询范围是全A股
    @date ('%Y-%m-%d'): 单个日期
    @log (Bool): 是否打印log
    :return: Dataframe 列为factor名,index为sec_id
    """

    if log:
        Logger.info("Reading {} at {}".format(factor, date), "green")

    if factor not in get_schema("factor"):
        Logger.error("Unrecognized factor: {}".format(factor))
        raise ValueError

    if not isinstance(sec_ids, list):
        Logger.error("sec_ids must be list!")
        raise ValueError

    if not date:
        Logger.error("Empty date")
        raise ValueError

    with SqliteProxy(log=log) as proxy:
        path = os.path.join(DB_FACTOR, '{}.db'.format(date[:4]))
        proxy.connect(path)

        if len(sec_ids) == 0:  # empty list defaults to the whole A-share universe
            conds = ""
        elif len(sec_ids) == 1:
            conds = "AND sec_id = '{}'".format(sec_ids[0])
        else:
            conds = "AND sec_id IN {}".format(tuple(sec_ids))

        query = "SELECT sec_id, {} FROM [{}] WHERE date = '{}' {}".format(
            factor, factor, date, conds)
        try:
            df = proxy.query_as_dataframe(query)
        except Exception:
            Logger.error("Error occurred when reading {} at {}".format(
                factor, date))
            traceback.print_exc()
            raise ValueError
        return df.sort_values(by=['sec_id']).set_index(['sec_id'])
Example #9
def update_index_std(index, cp=3, log=False):
    """
    更新index_std
    更新原理: 无需指定trading_days 更新全部index中有的日期但在index_std中没有的日期
    @index <str>: index名称 不是index_std名称
    @cp <int>: winsorize的临界值
    """

    trading_days = get_unique_datelist_from_table("index", index)
    existed_days = get_unique_datelist_from_table("index_std", "{}_std".format(index))
    update_days = sorted(list(set(trading_days) - set(existed_days)))
    if len(update_days) == 0:
        Logger.warn("All given dates has existed. No need to update!!")
        return
    output = process_ts_index(index, update_days, cp)
    if len(output) == 0:
        Logger.error("Fail to process {} on given dates".format(index))
        return
    df2mysql(USER, PASSWORD, "index_std", index + '_std', output)
    del output, trading_days, update_days
    gc.collect()
    Logger.info("Updated successfully!!")
Example #10
def update_industry(industry, trading_days=[], override=False, log=False):
    """
    从Wind更新某指数成分股申万一级行业数据

    @industry (str): 行业数据库名称
    @trading_days (['%Y-%m-%d']): 日期列表
    @override (Bool): 是否覆盖原记录 默认为False 表示不覆盖
    @log (Bool): 是否打印日志信息
    """

    Logger.info("Updating industry {}".format(industry), "green")

    if industry not in get_schema('industry'):
        Logger.error("Unrecognized industry: {}".format(industry))
        return

    if not trading_days:
        Logger.error("Empty date")
        raise ValueError

    if industry in INDUSTRY_AS_SQL:
        update_industry_to_sql(industry, trading_days, override, log)
    elif industry in INDUSTRY_AS_JSON:
        # non-SQL data is always force-updated; a backup copy of the existing file is kept automatically
        update_industry_to_json(industry, trading_days)
    else:
        Logger.error("Unrecognized industry: {}".format(industry))
        raise ValueError

    if log:
        Logger.info("industry/{} is updated.".format(industry), color="green")
        Logger.info("------------------------------------------")
Example #11
def update_calendar(start_date, end_date, log=False):
    """
    从Wind更新calendar相关数据 每次更新将删除原有所有数据 更新到当前区间

    @start_date ("%Y-%m-%d"): 开始日日期 必须是月初日期
    @end_date ("%Y-%m-%d"): 结束日日期 必须是月月末日期
    @log (Bool): 是否打印log
    """

    Logger.info("Updating calendar ...", "green")

    with SqliteProxy(log=log) as proxy:
        proxy.connect(os.path.join(DB_CALENDAR_PATH, "calendar.db"))
        proxy.execute("DELETE FROM calendar")
        try:
            df = load_calendar_from_wind(start_date, end_date)
        except Exception:
            Logger.error("Error occurred when loading")
            raise ValueError
        try:
            proxy.write_from_dataframe(df, "calendar")
        except Exception:
            Logger.error(
                "Error occurred when writing dataframe into sqlite db")
            traceback.print_exc()
            raise ValueError
    if log:
        Logger.info("calendar was updated from {} to {}".format(
            start_date, end_date),
                    color="green")
        Logger.info("------------------------------------------")
Example #12
def update_index_contents_to_csv(index_code, trading_days, override):
    try:
        date = trading_days[-1]
        loader = LOADER_MAP[index_code]
        df = loader(index_code, date)
    except Exception:
        Logger.error("Error occurred when loading {}".format(index_code))
        raise ValueError

    try:
        path = os.path.join(DB_INDEX_CONTENTS, '{}.csv'.format(index_code))
        copy_to = os.path.join(DB_INDEX_CONTENTS,
                               '{}_backup.csv'.format(index_code))
        shutil.copy(path, copy_to)  # keep a backup copy in case the data gets corrupted
        df.to_csv(path, encoding="utf-8", index=False)

        Logger.info("{} on {} is updated successfully".format(
            index_code, date))
    except Exception:
        Logger.error("Error occurred when writing {}".format(index_code))
        traceback.print_exc()
        raise ValueError

    # csv files are different from sql, cannot use update_schema()
    # therefore update schema information explicitly
    try:
        now = datetime.now()
        schema = get_schema('index_contents')
        schema[index_code]["begin date"] = ""
        schema[index_code]["end date"] = now.strftime('%Y-%m-%d')
        schema[index_code]['last update'] = now.strftime('%Y-%m-%d %H:%M:%S')
        save_schema(schema, 'index_contents')

        Logger.info("schema updated: {}".format(index_code))
    except Exception:
        Logger.error(
            "Error occurred when updating schema of {}".format(index_code))
        traceback.print_exc()
        raise ValueError
Example #13
def get_secs_IC(ic_code, trading_days=[], log=False):
    """
    从本地数据库中获取一段日期的单个IC的值,并返回 dict of DataFrame

    @ic_code (str): 单个IC stocks_info: 全A股
    @trading_days (["%Y-%m-%d"]): 日期列表
    @log (Bool): 是否打印log
    :return: {date: Dataframe},date, sec_id, sec_name, is_st, is_trade, ...
    """

    if log:
        Logger.info(
            "Reading {} from {} to {}".format(ic_code, trading_days[0],
                                              trading_days[-1]), "green")

    if not trading_days:
        Logger.error("Empty date")
        raise ValueError

    with MySQLProxy(log=log) as proxy:
        proxy.connect(USER, PASSWORD, 'index')
        # Note: a single value uses = and needs quotes; multiple values use a tuple
        if len(trading_days) == 1:
            query = "SELECT * FROM {} WHERE date = '{}' ".format(
                ic_code, trading_days[0])
        else:
            query = "SELECT * FROM {} WHERE date in {}".format(
                ic_code, tuple(trading_days))

        try:
            df = proxy.query_as_dataframe(query)
        except Exception:
            Logger.error("Error occurred when reading {} at {}".format(
                ic_code, date))
            traceback.print_exc()
            raise ValueError

    return df
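
A usage sketch using the stocks_info table mentioned in the docstring; the dates are illustrative:

days = ['2020-05-06', '2020-05-07']
# stocks_info covers the whole A-share universe
df = get_secs_IC('stocks_info', trading_days=days, log=True)
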
Example #14
def update_factors(factors=[], trading_days=[], override=False, log=False):
    """
    更新多个factor的指定日期列表的数据

    @factors (<list>):factor名称构成的列表
    @trading_days ([%Y-%m-%d]): 日期列表
    @override (<Bool>): 是否覆盖原记录 默认为False 表示不覆盖
    @log (<Bool>): 是否打印log
    """

    SCHEMA = get_schema("factor")
    if not factors:
        factors = sorted(SCHEMA, key=lambda x: SCHEMA[x]["level"])

    for fac in factors:
        if fac in SCHEMA:
            update_single_factor(factor=fac,
                                 trading_days=trading_days,
                                 override=override,
                                 log=log)
        else:
            Logger.error("Unrecognized factor: {}".format(fac))
            Logger.info("------------------------------------------")
Example #15
def update_factor_return_schema(factor):
    """
    更新factor_return的schema相关的begin date,end date, last update

    @factor (str): factor的名称
    """

    schema = json2dict(os.path.join(DB_PATH_LIB['factor_return'], 'schema'))

    filepath = os.path.join(DB_PATH_LIB['factor_return'],
                            "{}.csv".format(factor))
    df = pd.read_csv(filepath, encoding="utf-8")["date"]
    schema[factor]['begin date'] = df.min()

    schema[factor]['end date'] = df.max()

    schema[factor]['last update'] = \
        datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    Logger.info("schema updated: {}".format(factor))
    dict2json(schema,
              os.path.join(DB_PATH_LIB['factor_return'], 'schema'),
              log=False)
Example #16
def sqlize_db(db_name, subdb_list=[]):
    """将数据库sql化"""

    if not subdb_list:
        subdb_list = list(get_schema(db_name).keys())
    else:
        subdb_list = [s for s in subdb_list if s in get_schema(db_name)]

    db_path = os.path.join(DB_PATH, db_name)

    with SqliteProxy(log=False) as proxy:
        for subdb in subdb_list:
            Logger.info("SQLing {}/{}".format(db_name, subdb), "green")

            subdb_path = os.path.join(db_path, subdb)
            trading_days = listdir_advanced(subdb_path,
                                            'csv',
                                            strip_suffix=True)
            for year, dates in classify_dates_by_year(trading_days).items():
                path = os.path.join(db_path, '{}.db'.format(year))
                proxy.connect(path)

                if subdb not in proxy.list_tables:
                    creator = DB_CREATOR_MAP[db_name]
                    creator(proxy, subdb)

                for date in dates:
                    df = pd.read_csv(
                        os.path.join(subdb_path, '{}.csv'.format(date)))
                    df['date'] = date
                    try:
                        proxy.write_from_dataframe(df, subdb)
                    except Exception:
                        Logger.error(
                            "Error occurred when sqlizing {} on {}.".format(
                                subdb, date))
                        traceback.print_exc()
Example #17
def update_index_contents(index_code,
                          trading_days=[],
                          override=False,
                          log=False):
    """
    从Wind更新index_contents相关数据

    @index_code (str): 要更新的指标
    @trading_days (['%Y-%m-%d']): 传入的日期列表
    @override (Bool): 是否覆盖旧数据,默认为False,表示不覆盖
    @log (Bool): 是否打印log
    """

    Logger.info("Updating index_contents {}".format(index_code), "green")

    if index_code not in get_schema('index_contents'):
        Logger.error("Unrecognized index: {}".format(index_code))
        return

    if not trading_days:
        Logger.error("Empty date")
        raise ValueError

    if index_code in IDXCONT_AS_SQL:
        update_index_contents_to_sql(index_code, trading_days, override, log)
    elif index_code in IDXCONT_AS_CSV:
        # non-SQL data is always force-updated; a backup copy of the existing file is kept automatically
        update_index_contents_to_csv(index_code, trading_days, override)
    else:
        Logger.error("Unrecognized index code: {}".format(index_code))
        raise ValueError

    if log:
        Logger.info("index_content/{} is updated.".format(index_code),
                    color="green")
        Logger.info("------------------------------------------")
Example #18
def update_single_factor_return(factor_return,
                                trading_days=[],
                                group_num=10,
                                log=True):
    """
    根据trading_days更新factor_return数据

    @factor_return (<str>): factor的名称
    @trading_days (<[%Y-%m-%d]>) : 日期列表
    @group_num (<int>): 分组个数
    """

    if log:
        Logger.info("Updating factor_return {}...".format(factor_return))

    if factor_return not in get_schema("factor_return"):
        Logger.error("Unrecognized factor_return: {}".format(factor_return))
        return
    factor_path = DB_PATH_LIB['factor']
    factor_exist_dates = get_date_lists_in_table(factor_path, factor_return)
    not_found_date = list(set(trading_days) - set(factor_exist_dates))
    if len(not_found_date) != 0 and log:
        Logger.warn(
            "Failed to update factor returns on the following dates because the factor data is missing: {}"
            .format(not_found_date))

    trading_days = list(set(trading_days) - set(not_found_date))
    if len(trading_days) == 0:
        Logger.error("No valid date to update")
        return

    trading_days = sorted(trading_days)
    db_factor_return_path = os.path.join(DB_PATH, "factor_return")
    filepath = os.path.join(db_factor_return_path,
                            '{}.csv'.format(factor_return))
    df_new = load_single_factor_return_on_multidays(factor_return,
                                                    trading_days, group_num)

    _n_updated_date = len(df_new)

    if not os.path.exists(filepath):  # 没有已经更新过的记录
        Logger.info("首次更新 {}数据".format(factor_return))
        output = df_new.copy()
        output.to_csv(filepath, encoding="utf-8")
    else:
        df_old = pd.read_csv(filepath, encoding="utf-8")  # all existing return data
        min_exist_date = normalize(df_old['date'].min(), "%Y-%m-%d")
        max_exist_date = normalize(df_old['date'].max(), "%Y-%m-%d")
        max_update_date = trading_days[-1]
        min_update_date = trading_days[1]  # returns can only be computed starting from the second date

        if (max(min_update_date, max_update_date) < min_exist_date) or \
           (min(min_update_date, max_update_date) > max_exist_date):
            Logger.error("非法更新:待更新时间段孤立于现有的时间段")
            Logger.error("开始更新日期:{}  结束更新日期:{}".format(min_update_date,
                                                       max_update_date))
            Logger.error("原有开始日期:{}  原有结束日期:{}".format(min_exist_date,
                                                       max_exist_date))
            return

        if (min_update_date < min_exist_date) and \
           (max_update_date <= max_exist_date) and \
           (max_update_date >= min_exist_date):
            Logger.info("左更新:更新之前记录")

        if (min_update_date >= min_exist_date) and (max_update_date <=
                                                    max_exist_date):
            Logger.info("存量更新:更新当前已经有的记录")

        if (min_update_date >= min_exist_date) and \
           (min_update_date <= max_exist_date) and \
           (max_update_date > max_exist_date):
            Logger.info("右更新:更新未来的记录")

        if (min_update_date < min_exist_date) and \
           (max_update_date > max_exist_date):
            Logger.info("全更新:当前已经存在的日期是待更新日期的子集")

        df_old['date'] = df_old['date'].apply(
            lambda x: normalize(x, "%Y-%m-%d"))
        df_new['date'] = df_new['date'].apply(
            lambda x: normalize(x, "%Y-%m-%d"))
        bool_list = df_old['date'].isin(df_new['date']).apply(
            lambda x: not x)  # True when an existing date is not among the dates being updated
        # keep the existing dates that fall outside this update; they are copied over directly, no recomputation needed
        df_old = df_old[bool_list]
        output = df_old.append(df_new).sort_values(by=['date'])

    output = output.set_index(['date'])
    format_var_name_list = [
        'group{:0>2}'.format(i) for i in range(1, group_num + 1)
    ]
    format_var_name_list.append('{}'.format(factor_return))
    output = output.reindex(columns=format_var_name_list)
    output.to_csv(filepath, encoding="utf-8")

    update_factor_return_schema(factor_return)
    if log:
        _n_all_date = len(output)
        _n_existed_date = _n_all_date - _n_updated_date

        Logger.info("传入日期数:{}  已经存在个数:{}  实际写入次数:{}".format(
            _n_all_date, _n_existed_date, _n_updated_date))
        Logger.info("factor_return {} is updated.".format(factor_return),
                    color="green")
        Logger.info("------------------------------------------")
Example #19
def update_single_factor(factor, trading_days=[], override=False, log=False):
    """
    更新单个factor的指定日期列表的数据

    @factor (str): factor名称
    @trading_days ([%Y-%m-%d]): 日期列表
    @override (Bool): 是否覆盖原记录,默认为False,表示不覆盖
    @log (Bool): 是否打印log
    """

    Logger.info("Updating factor {}".format(factor), "green")

    _n_updated_date = 0

    if factor not in get_schema('factor'):
        Logger.error("Unrecognized factor: {}".format(factor))
        raise ValueError

    if not trading_days:
        Logger.error("Empty date")
        raise ValueError

    with SqliteProxy(log=log) as proxy:
        date_classifier = classify_dates_by_year(trading_days)

        for year, date_list in date_classifier.items():
            path = os.path.join(DB_FACTOR, '{}.db'.format(year))
            proxy.connect(path)

            if factor not in proxy.list_tables:
                create_table(proxy, "factor", factor)

            # check which dates already exist
            if len(date_list) == 1:
                query = "SELECT DISTINCT(date) FROM {} WHERE date = '{}'".format(
                    factor, date_list[0])
            else:
                query = "SELECT DISTINCT(date) FROM {} WHERE date in {}".format(
                    factor, tuple(date_list))
            lookup = proxy.query_as_dataframe(query)
            lookup = set(lookup['date'].tolist())

            for date in date_list:
                if date in lookup and not override:  # skip dates that already exist in the database when not overriding
                    if log:
                        Logger.warn("{} records on {} already exist.".format(
                            factor, date))
                    continue

                try:
                    df = load_single_factor_on_single_day(factor=factor,
                                                          date=date)
                except Exception:
                    Logger.error("Error occurred when loading {} on {}".format(
                        factor, date))
                    traceback.print_exc()
                    continue

                if df is not None:  # the factor data was loaded successfully
                    if date in lookup and override:  # when overriding, delete the existing record first
                        proxy.execute(
                            "DELETE FROM [{}] WHERE date = '{}'".format(
                                factor, date))

                    df['date'] = date
                    try:
                        proxy.write_from_dataframe(df, factor)
                    except Exception:
                        Logger.error(
                            "Error occurred when writing {} on {}".format(
                                factor, date))
                        traceback.print_exc()
                        raise ValueError

                    if log:
                        Logger.info("{} on {} is updated successfully".format(
                            factor, date))
                    _n_updated_date += 1
                else:  # failed to fetch data from Wind
                    Logger.error("Fail to fetch {} data on {}".format(
                        factor, date))
                    raise ValueError

    update_schema(db_name="factor", sub_name=factor)

    if log:
        _n_all_date = len(trading_days)
        _n_existed_date = _n_all_date - _n_updated_date
        Logger.info("传入日期数:{}  已经存在个数:{}  实际写入次数:{}".format(
            _n_all_date, _n_existed_date, _n_updated_date))
        Logger.info("factor {} is updated.".format(factor), color="green")
        Logger.info("------------------------------------------")
Example #20
def update_single_indicator(indicator,
                            sec_ids=[],
                            trading_days=[],
                            override=False,
                            log=False):
    """
    更新单个indicator的指定日期列表的数据

    @indicator (str): 单个indicator的名称
    @sec_ids<list> : 股票代码列表
    @trading_days ([%Y-%m-%d]): 日期列表
    @override (Bool): 是否覆盖原记录 默认为False 表示不覆盖
    @log (Bool): 是否打印log
    """

    if log:
        Logger.info("Updating indicator {}".format(indicator), "green")

    if indicator not in get_schema('indicator'):
        Logger.error("Unrecognized indicator: {}".format(indicator))
        raise ValueError

    if not trading_days:
        Logger.error("Empty date")
        raise ValueError

    with SqliteProxy(log=log) as proxy:
        date_classifier = classify_dates_by_year(trading_days)

        for year, date_list in date_classifier.items():
            path = os.path.join(DB_INDICATOR, '{}.db'.format(year))
            proxy.connect(path)

            if indicator not in proxy.list_tables:
                create_table(proxy, "indicator", indicator)

            # check which dates already exist
            if len(date_list) == 1:
                query = "SELECT DISTINCT(date) FROM {} WHERE date = '{}'".format(
                    indicator, date_list[0])
            else:
                query = "SELECT DISTINCT(date) FROM {} WHERE date in {}".format(
                    indicator, tuple(date_list))
            lookup = proxy.query_as_dataframe(query)
            lookup = set(lookup['date'].tolist())

            for date in date_list:
                if date in lookup and not override:  # skip dates that already exist in the database when not overriding
                    if log:
                        Logger.warn("{} records on {} already exist.".format(
                            indicator, date))
                    continue

                try:
                    df = load_single_indicator_on_single_day_from_wind(
                        indicator=indicator, sec_ids=sec_ids, date=date)
                except Exception:
                    Logger.error("Error occurred when loading {} on {}".format(
                        indicator, date))
                    raise ValueError

                if df is not None:  # data was downloaded from Wind successfully
                    if date in lookup and override:  # when overriding, delete the existing records first
                        if len(sec_ids) == 0:
                            proxy.execute(
                                "DELETE FROM [{}] WHERE date = '{}'".format(
                                    indicator, date))
                        elif len(sec_ids) == 1:
                            proxy.execute(
                                "DELETE FROM [{}] WHERE date = '{}' and sec_id = '{}'"
                                .format(indicator, date, sec_ids[0]))
                        else:
                            proxy.execute(
                                "DELETE FROM [{}] WHERE date = '{}' and sec_id in {}"
                                .format(indicator, date, tuple(sec_ids)))
                    df['date'] = date
                    try:
                        proxy.write_from_dataframe(df, indicator)
                    except Exception:
                        Logger.error(
                            "Error occurred when writing {} on {}".format(
                                indicator, date))
                        traceback.print_exc()
                        raise ValueError
                    if log:
                        Logger.info("{} on {} is updated successfully".format(
                            indicator, date))

                else:  # failed to fetch data from Wind
                    Logger.error("Fail to fetch {} data on {}".format(
                        indicator, date))
                    raise ValueError

    update_schema(db_name="indicator", sub_name=indicator)

    if log:
        Logger.info("indicator {} is updated.".format(indicator),
                    color="green")
        Logger.info("------------------------------------------")