Example #1
import os

import pandas as pd

# get_idx_cons, load_json_file, get_all_trading_dates, get_cons_by_date,
# get_source_root and the data_fetcher object are assumed to come from the
# surrounding quant_models package.


def retrieve_factor(start_date='', end_date=''):
    code_dates_mapping = get_idx_cons('000300.XSHG')
    feature_mapping = load_json_file(r'E:\pycharm\algo_trading\quant_models\quant_models\conf\feature_mapping.json')
    all_trading_dates = get_all_trading_dates(start_date, end_date)

    testing_mappings = {'return': feature_mapping.get('return')}
    for f, lst in feature_mapping.items():
        dfs = []
        for d in all_trading_dates:
            code_lst = get_cons_by_date(code_dates_mapping, d)
            rows, desc = data_fetcher.get_equ_factor(factor_type=f, security_ids=code_lst, start_date=d,
                                                     end_date=d, fields=lst)
            dfs.append(pd.DataFrame(rows, columns=desc))
        # DataFrame.append was removed in pandas 2.0; concatenate instead
        df = pd.concat(dfs, ignore_index=True)
        del dfs
        df.to_csv(
            r'E:\pycharm\algo_trading\quant_models\quant_models\data\features\{0}_{1}_{2}.csv'.format(
                f, start_date, end_date))
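
A minimal invocation sketch (the date strings are illustrative; one CSV per feature type lands under data\features):

    retrieve_factor(start_date='20180101', end_date='20181231')
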
def get_source_feature_mappings(feature_types=None):
    root = get_source_root()
    feature_mapping = load_json_file(
        os.path.join(os.path.realpath(root), 'conf', 'feature_mapping.json'))
    if not feature_types:
        return feature_mapping
    return {k: v for k, v in feature_mapping.items() if k in feature_types}
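
A hedged usage sketch, assuming feature_mapping.json maps feature-type names to field lists:

    all_mappings = get_source_feature_mappings()
    ma_only = get_source_feature_mappings(feature_types=['ma'])  # only the 'ma' entry survives
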
Example #3
import os
from collections import defaultdict

import pandas as pd

# get_idx_cons, load_json_file, get_all_trading_dates, get_source_root and
# logger are assumed to come from the surrounding quant_models package.


def retrieve_announcement(start_date='', end_date=''):
    code_dates_mapping = get_idx_cons('000300.XSHG')
    feature_mapping = load_json_file(r'E:\pycharm\algo_trading\quant_models\quant_models\conf\feature_mapping.json')
    all_trading_dates = get_all_trading_dates(start_date, end_date)

    testing_mappings = {'return': feature_mapping.get('return')}
    d = start_date
    from quant_models.utils.date_utils import datetime_delta
def get_significant_features(top_ratio=0.5, bottom_ratio=0.2):
    root = get_source_root()
    ret = defaultdict(dict)
    f_types = set()

    # for start_date, end_date in [('20180103', '20181230'), ('20140603', '20160103'), ('20160103', '20171230')]:
    for start_date, end_date in [('20150103', '20181231')]:
        corr_path = os.path.join(
            os.path.realpath(root), 'conf',
            'score_{0}_{1}.csv'.format(start_date, end_date))
        df = pd.read_csv(corr_path)
        score_ret = defaultdict(list)
        # each row is (index, feature_name, score, feature_type)
        for idx, k, s, ft in df.values:
            score_ret[ft].append([k, s])
            f_types.add(ft)
        for ft, val in score_ret.items():
            logger.debug('%s: %d scored features', ft, len(val))
            val.sort(key=lambda x: x[1], reverse=True)
            t_dict = ret[ft]
            top_features = t_dict.setdefault('top_features', [])
            bottom_features = t_dict.setdefault('bottom_features', [])
            # keep at least 5 top features and 1 bottom feature per type
            top_idx = max(int(len(val) * top_ratio), 5)
            bottom_idx = max(int(len(val) * bottom_ratio), 1)
            top_features.extend(item[0] for item in val[:top_idx])
            bottom_features.extend(item[0] for item in val[-bottom_idx:])
    for ft, val in ret.items():
        top_lst = list(set(val['top_features']))
        bottom_lst = list(set(val['bottom_features']))
        top_lst.extend(bottom_lst)
        ret[ft] = top_lst
    return ret
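
A hedged sketch of the return shape (the type and feature names depend on the score CSVs):

    sig = get_significant_features(top_ratio=0.5, bottom_ratio=0.2)
    # e.g. {'ma': ['ACD6', 'ACD20', ...], 'return': [...], ...}
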
def feature_refine():
    root = get_source_root()
    feature_mapping = load_json_file(
        os.path.join(os.path.realpath(root), 'conf', 'feature_mapping.json'))
    score_file = 'testing_train_features_score_20160103_20171230.csv'
    score_file = os.path.join(os.path.realpath(root), 'conf', score_file)
    df = pd.read_csv(score_file)
    values = df.values
    type_lst = []
    for item in values:
        f, s = item[1:]
        for k, v in feature_mapping.items():
            if f in v:
                type_lst.append(k)
                # assume each feature belongs to exactly one type; without the
                # break, a feature listed under several types would misalign
                # type_lst with the DataFrame
                break
    df['feature_type'] = type_lst
    save_file = os.path.join(os.path.realpath(root), 'conf',
                             'score_20160103_20171230.csv')
    df.to_csv(save_file)
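
A usage note: get_significant_features later unpacks each row of this score CSV as (idx, k, s, ft), i.e. four columns including the index that to_csv writes by default, so the saved column count has to line up with that reader; pass index=False if the input file already carries an index column.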
Example #6
import copy
import gc
import os
import time
from collections import defaultdict

# SQLiteHelper, get_source_root, load_json_file, logger, g_db_fetcher and
# get_cal_features are assumed to come from the surrounding quant_models package.


def create_features_table():
    db = SQLiteHelper()
    root = get_source_root()
    # get the file name of the features
    feature_mapping_source = os.path.join(os.path.realpath(root), 'conf', 'feature_mapping.json')
    feature_mapping = load_json_file(feature_mapping_source)
    _vals = list(feature_mapping.values())
    fields = []
    for item in _vals:
        fields.extend(item)
    table_name = 'FEATURE_CACHE'
    columns = ['TICKER_SYMBOL INT', 'TRADE_DATE TEXT', 'SECURITY_ID TEXT',
               'D_LABEL REAL', 'M_LABEL REAL']
    columns.extend('{0} REAL'.format(f) for f in fields)
    s1 = 'CREATE TABLE {0} ({1})'.format(table_name, ', '.join(columns))
    logger.debug(s1)
    try:
        db.execute_sql(s1)
    except Exception as ex:
        # the table may already exist; log the error and carry on
        logger.error(ex)
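
For illustration, with fields = ['ACD6', 'ACD20'] the generated statement would be:

    CREATE TABLE FEATURE_CACHE (TICKER_SYMBOL INT, TRADE_DATE TEXT, SECURITY_ID TEXT, D_LABEL REAL, M_LABEL REAL, ACD6 REAL, ACD20 REAL)
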
def get_equity_daily_features(security_ids=None,
                              features=None,
                              start_date=20181101,
                              end_date=20181102,
                              trade_date=None,
                              source=0):
    # avoid mutable default arguments
    security_ids = security_ids or []
    features = features or {'ma': ['ACD6', 'ACD20']}
    logger.info(
        'Start calculating features from {0} to {1} for {2} sec_ids and {3} feature types'
        .format(start_date, end_date, len(security_ids), len(features)))

    ret_features = defaultdict(dict)
    # query on one date
    if trade_date:
        start_date = end_date = trade_date
    if isinstance(start_date, str):
        start_date = int(start_date)
    if isinstance(end_date, str):
        end_date = int(end_date)
    retrieve_feature_names = list()
    for f_type, f_val in features.items():
        retrieve_feature_names.extend(f_val)
    retrieve_feature_names = list(set(retrieve_feature_names))
    root = get_source_root()
    feature_mapping = load_json_file(
        os.path.join(os.path.realpath(root), 'conf', 'feature_mapping.json'))
    source_features = []
    for item in list(feature_mapping.values()):
        source_features.extend(item)
    cal_features = list(set(retrieve_feature_names) - set(source_features))
    _df = g_db_fetcher.get_data_fetcher_obj(source)
    excluded = [
        'CREATE_TIME', 'UPDATE_TIME', 'TMSTAMP', 'ID', 'SECURITY_ID_INT',
        'SECURITY_ID', 'TRADE_DATE', 'TICKER_SYMBOL'
    ]
    retrieve_feature_names = [item.upper() for item in retrieve_feature_names]
    for f_type, f_fields in features.items():
        # skip factor types with no requested fields before hitting the database
        if not f_fields:
            continue
        rows, desc = _df.get_equ_factor(fields=f_fields,
                                        factor_type=f_type,
                                        security_ids=security_ids,
                                        start_date=start_date,
                                        end_date=end_date)
        id_idx = desc.index('SECURITY_ID')
        date_idx = desc.index('TRADE_DATE')
        # the column positions depend only on desc, so compute them once per factor type
        idx_lst = [idx for idx, val in enumerate(desc)
                   if val.upper() in retrieve_feature_names]
        for item in rows:
            sec_id, date = item[id_idx], item[date_idx]
            curr_dict = ret_features[date].get(sec_id, {})
            tmp_dict = {desc[idx]: item[idx] for idx in idx_lst
                        if desc[idx] not in excluded}
            # add the pre-defined calculated features
            tmp_dict = get_cal_features(tmp_dict, cal_features)
            if tmp_dict:
                curr_dict.update(tmp_dict)
            if curr_dict:
                ret_features[date][sec_id] = curr_dict
        del rows
        gc.collect()
        time.sleep(3)
    # normalise every (date, sec_id) record to exactly the requested feature set
    for date, val in ret_features.items():
        for sec_id, _val in val.items():
            _keys = set(_val.keys())
            for _k in set(retrieve_feature_names) - _keys:
                _val[_k] = None
            for _k in _keys - set(retrieve_feature_names):
                _val.pop(_k)
    # FIXME: check that every record now carries the same number of features
    return ret_features
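
A hedged call sketch (the security id and date are illustrative):

    feats = get_equity_daily_features(security_ids=['000001.XSHE'],
                                      features={'ma': ['ACD6', 'ACD20']},
                                      trade_date=20181101)
    # feats maps trade date -> security id -> {feature name: value}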