def retrieve_factor(start_date='', end_date=''):
    """Fetch factor rows for CSI 300 constituents on every trading date
    and dump one CSV per factor type.

    Args:
        start_date: inclusive start date string (e.g. '20180101').
        end_date: inclusive end date string.

    Side effects:
        Writes one CSV per factor type into the project data/features dir.
    """
    code_dates_mapping = get_idx_cons('000300.XSHG')
    feature_mapping = load_json_file('E:\pycharm\\algo_trading\quant_models\quant_models\conf\\feature_mapping.json')
    all_trading_dates = get_all_trading_dates(start_date, end_date)
    for f, lst in feature_mapping.items():
        dfs = []
        for d in all_trading_dates:
            # index membership changes over time, so resolve it per date
            code_lst = get_cons_by_date(code_dates_mapping, d)
            rows, desc = data_fetcher.get_equ_factor(
                factor_type=f, security_ids=code_lst,
                start_date=d, end_date=d, fields=lst)
            dfs.append(pd.DataFrame(rows, columns=desc))
        # single concat is linear overall; the previous per-frame
        # DataFrame.append loop was quadratic and is removed in pandas 2.x
        df = pd.concat(dfs, ignore_index=True)
        del dfs
        df.to_csv(
            "E:\pycharm\\algo_trading\quant_models\quant_models\data\\features\\{0}_{1}_{2}.csv".format(f, start_date, end_date))
def get_source_feature_mappings(feature_types=None):
    """Load the feature-type -> feature-field-list mapping from conf.

    Args:
        feature_types: optional iterable of type keys; when given, only
            those keys are kept in the result.

    Returns:
        dict mapping feature type -> list of feature field names.
    """
    root = get_source_root()
    feature_mapping = load_json_file(
        os.path.join(os.path.realpath(root), 'conf', 'feature_mapping.json'))
    if not feature_types:
        return feature_mapping
    # Build the filtered dict directly instead of deep-copying the whole
    # mapping and popping keys while iterating a shadow copy.
    return {k: v for k, v in feature_mapping.items() if k in feature_types}
def retrieve_announcement(start_date='', end_date=''):
    """Prepare inputs for retrieving announcement data.

    NOTE(review): this function looks unfinished — it builds the
    constituent mapping and trading-date list, binds ``d`` to
    ``start_date`` and imports ``datetime_delta``, but never fetches or
    persists any announcement rows.  Confirm the intended behavior with
    the author before relying on it.
    """
    code_dates_mapping = get_idx_cons('000300.XSHG')
    feature_mapping = load_json_file('E:\pycharm\\algo_trading\quant_models\quant_models\conf\\feature_mapping.json')
    all_trading_dates = get_all_trading_dates(start_date, end_date)
    # commented-out prototype of a per-date fetch loop, kept for reference:
    # # code_lst = get_cons_by_date(code_dates_mapping, d)
    # for f, lst in feature_mapping:
    #     rows, desc = data_fetcher.get_equ_factor(factor_type=f, security_ids=code_lst, start_date=d, end_date=d, fields=lst)
    #     dfs.append(pd.DataFrame(rows, columns=desc))
    testing_mappings = {'return': feature_mapping.get('return')}  # unused in the visible body
    d = start_date
    # deferred project import; datetime_delta is never used after this
    # point in the visible body
    from quant_models.utils.date_utils import datetime_delta
def get_significant_features(top_ratio=0.5, bottom_ratio=0.2):
    """Select top- and bottom-scoring feature names per feature type.

    Reads the score CSV(s) from conf/ for the configured date windows,
    ranks features within each feature type by score (descending), and
    returns, per type, the union of the top ``top_ratio`` fraction (at
    least 5 names) and the bottom ``bottom_ratio`` fraction (at least 1).

    Args:
        top_ratio: fraction of highest-scoring features to keep.
        bottom_ratio: fraction of lowest-scoring features to keep.

    Returns:
        dict mapping feature type -> deduplicated list of feature names.
    """
    root = get_source_root()
    ret = defaultdict(dict)
    # NOTE(review): other windows were evaluated previously, e.g.
    # ('20180103', '20181230'), ('20140603', '20160103'),
    # ('20160103', '20171230').
    for start_date, end_date in [('20150103', '20181231')]:
        corr_path = os.path.join(
            os.path.realpath(root), 'conf',
            'score_{0}_{1}.csv'.format(start_date, end_date))
        df = pd.read_csv(corr_path)
        # group [feature_name, score] pairs by feature type
        score_ret = defaultdict(list)
        for idx, k, s, ft in df.values:
            score_ret[ft].append([k, s])
        for ft, val in score_ret.items():
            # lazy %-style logging args; the old logger.debug(ft, len(val))
            # used ft itself as the format string and failed on emit
            logger.debug('%s %s', ft, len(val))
            val.sort(key=lambda x: x[1], reverse=True)
            t_dict = ret.get(ft) or dict()
            top_dict = t_dict.get('top_features') or list()
            bottom_dict = t_dict.get('bottom_features') or list()
            # clamp to sensible minimums for small groups
            top_idx = max(int(len(val) * top_ratio), 5)
            bottom_idx = max(int(len(val) * bottom_ratio), 1)
            top_dict.extend(item[0] for item in val[:top_idx])
            bottom_dict.extend(item[0] for item in val[-bottom_idx:])
            t_dict.update({'top_features': top_dict})
            t_dict.update({'bottom_features': bottom_dict})
            ret.update({ft: t_dict})
    # collapse the per-type dicts into one deduplicated flat name list
    for ft, val in ret.items():
        top_lst = list(set(val['top_features']))
        bottom_lst = list(set(val['bottom_features']))
        top_lst.extend(bottom_lst)
        ret.update({ft: top_lst})
    return ret
def feature_refine():
    """Annotate the per-feature score file with each feature's type.

    Reads the train-score CSV, maps every feature name to its feature
    type via conf/feature_mapping.json, and writes the augmented frame
    to conf/score_20160103_20171230.csv.
    """
    root = get_source_root()
    feature_mapping = load_json_file(
        os.path.join(os.path.realpath(root), 'conf', 'feature_mapping.json'))
    score_file = 'testing_train_features_score_20160103_20171230.csv'
    score_file = os.path.join(os.path.realpath(root), 'conf', score_file)
    df = pd.read_csv(score_file)
    type_lst = []
    for item in df.values:
        f, s = item[1:]
        # exactly one type per row: stop at the first mapping hit and
        # fall back to None, so len(type_lst) always equals len(df) —
        # the old version raised on assignment whenever a feature
        # matched zero or multiple types
        for k, v in feature_mapping.items():
            if f in v:
                type_lst.append(k)
                break
        else:
            type_lst.append(None)
    df['feature_type'] = type_lst
    save_file = os.path.join(
        os.path.realpath(root), 'conf', 'score_20160103_20171230.csv')
    df.to_csv(save_file)
def create_features_table():
    """Create the FEATURE_CACHE table with one REAL column per feature.

    Feature column names come from conf/feature_mapping.json; the fixed
    leading columns identify the security and trade date plus the
    day/month label targets.
    """
    db = SQLiteHelper()
    root = get_source_root()
    # get the file name of the features
    feature_mapping_source = os.path.join(
        os.path.realpath(root), 'conf', 'feature_mapping.json')
    feature_mapping = load_json_file(feature_mapping_source)
    fields = []
    for item in feature_mapping.values():
        fields.extend(item)
    table_name = 'FEATURE_CACHE'
    # join once instead of appending "<f> REAL," per field and trimming
    # the trailing comma afterwards
    feature_cols = ",".join("{0} REAL".format(f) for f in fields)
    s1 = ("CREATE TABLE {0} (TICKER_SYMBOL INT, TRADE_DATE TEXT,"
          "SECURITY_ID TEXT,D_LABEL REAL,M_LABEL REAL, ".format(table_name)
          + feature_cols + ')')
    print(s1)
    try:
        db.execute_sql(s1)
    except Exception as ex:
        # best-effort: the table may already exist; surface the error only
        print(ex)
def get_equity_daily_features(security_ids=None,
                              features=None,
                              start_date=20181101,
                              end_date=20181102,
                              trade_date=None,
                              source=0):
    """Fetch per-security, per-date feature values and normalize keys.

    Args:
        security_ids: list of security ids to query (default: empty list).
        features: mapping of factor type -> list of field names
            (default: {'ma': ['ACD6', 'ACD20']}).
        start_date: int or 'yyyymmdd' str, inclusive lower bound.
        end_date: int or 'yyyymmdd' str, inclusive upper bound.
        trade_date: when given, overrides both start_date and end_date.
        source: data-source selector passed to the fetcher factory.

    Returns:
        dict: trade_date -> {security_id -> {FEATURE_NAME: value}} where
        each inner dict carries exactly the requested feature names
        (missing ones filled with None, extras removed).
    """
    # avoid the shared-mutable-default-argument pitfall; the effective
    # defaults are unchanged for callers that pass nothing
    if security_ids is None:
        security_ids = []
    if features is None:
        features = {'ma': ['ACD6', 'ACD20']}
    logger.info(
        'Start calculate features from {0} to {1} for sec_ids:{2} and features types{3}'
        .format(start_date, end_date, len(security_ids), len(features)))
    ret_features = defaultdict(dict)
    # query on one date
    if trade_date:
        start_date = end_date = trade_date
    if isinstance(start_date, str):
        start_date = int(start_date)
    if isinstance(end_date, str):
        end_date = int(end_date)
    retrieve_feature_names = list()
    for f_type, f_val in features.items():
        retrieve_feature_names.extend(f_val)
    retrieve_feature_names = list(set(retrieve_feature_names))
    root = get_source_root()
    feature_mapping = load_json_file(
        os.path.join(os.path.realpath(root), 'conf', 'feature_mapping.json'))
    source_features = []
    for item in list(feature_mapping.values()):
        source_features.extend(item)
    # requested names absent from the source mapping must be derived
    # locally by get_cal_features below
    cal_features = list(set(retrieve_feature_names) - set(source_features))
    _df = g_db_fetcher.get_data_fetcher_obj(source)
    excluded = [
        'CREATE_TIME', 'UPDATE_TIME', 'TMSTAMP', 'ID', 'SECURITY_ID_INT',
        'SECURITY_ID', 'TRADE_DATE', 'TICKER_SYMBOL'
    ]
    retrieve_feature_names = [item.upper() for item in retrieve_feature_names]
    for f_type, f_fields in features.items():
        rows, desc = _df.get_equ_factor(fields=f_fields, factor_type=f_type,
                                        security_ids=security_ids,
                                        start_date=start_date,
                                        end_date=end_date)
        id_idx = desc.index('SECURITY_ID')
        date_idx = desc.index('TRADE_DATE')
        if not f_fields:
            continue
        for item in rows:
            sec_id, date = item[id_idx], item[date_idx]
            date_dict = ret_features[date] or {}
            if date_dict and sec_id in date_dict:
                curr_dict = date_dict[sec_id]
            else:
                curr_dict = {}
                date_dict[sec_id] = {}
            # columns of this row that carry requested feature names
            idx_lst = []
            for idx, val in enumerate(desc):
                if val.upper() in retrieve_feature_names:
                    idx_lst.append(idx)
            tmp_lst = [item[idx] for idx in idx_lst]
            tmp_dict = dict(zip([desc[idx] for idx in idx_lst], tmp_lst))
            tmp_dict1 = copy.deepcopy(tmp_dict)
            for k1, v1 in tmp_dict1.items():
                if k1 in excluded:
                    tmp_dict.pop(k1)
            # add the pre-defined calculated features
            tmp_dict = get_cal_features(tmp_dict, cal_features)
            if tmp_dict:
                curr_dict.update(tmp_dict)
            if curr_dict:
                ret_features[date][sec_id] = curr_dict
        del rows
        gc.collect()
        # NOTE(review): presumably throttles successive source queries —
        # confirm whether the 3s pause is still required
        time.sleep(3)
    # align every security's feature dict to the requested name set:
    # add missing names as None, drop names that were not requested
    for date, val in ret_features.items():
        for sec_id, _val in val.items():
            _keys = set(_val.keys())
            _add_keys = set(retrieve_feature_names) - _keys
            _remove_keys = _keys - set(retrieve_feature_names)
            for _k in _add_keys:
                _val.update({_k: None})
            for _k in _remove_keys:
                _val.pop(_k)
    # FIXME check whether the length of the features are the same now
    return ret_features