def check_data_structure(x):
    '''
    All data structures should belong to the following list:
        1. single-indexed Series (index.name and series.name are required)
        2. single-indexed DataFrame with multiple columns (index.name and columns.name are required)
        3. multi-indexed DataFrame with multiple columns (index.names and columns.names are required)

    Rules:
        1. If there is a 't' axis, always put it in the index.
        2. For a multi-indexed DataFrame, if there are 't' and 'sid', they should be
           put in the index, with 't' as level 0 and 'sid' as level 1, just like:

                                col1  col2
                t       sid
                '1990'  '1'     a1    a2
                        '2'     a1    a2
                '1991'  '1'     a1    a2
                        '2'     a1    a2
    '''
    if x.ndim == 1 and isinstance(x.index, pd.MultiIndex):
        raise MyError(
            "Series with MultiIndex is not allowed! You'd better convert it into a single-indexed DataFrame!"
        )
    elif x.ndim == 2 and x.shape[1] == 1:
        raise MyError(
            "DataFrame with only one column is not allowed. You'd better convert it to a Series!"
        )
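# Hypothetical demo helper (not part of the original module): a minimal sketch
# showing a toy multi-indexed DataFrame that follows the rules documented in
# check_data_structure, and a one-column DataFrame that should be rejected.
# The dates, sids and values are dummies used for illustration only.
def _demo_check_data_structure():
    index = pd.MultiIndex.from_product(
        [pd.to_datetime(['1990-01-31', '1991-01-31']), ['1', '2']],
        names=['t', 'sid'])
    good = pd.DataFrame({'col1': 1.0, 'col2': 2.0}, index=index)
    check_data_structure(good)  # passes silently

    bad = good[['col1']]  # a one-column DataFrame violates the rules above
    try:
        check_data_structure(bad)
    except MyError as e:
        print(e)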
def _check_s_for_saving_name(s, name):
    if not s.name:
        raise MyError('No name for Series')
    elif s.name != name:
        raise MyError(
            'The file name "{}" to save is different from the name of the Series "{}"'
            .format(name, s.name))
def _check_multiIndex(axis):
    dic = {'t': pd.Timestamp, 'sid': str}
    names = axis.names
    values = axis[0]
    # check the data type of each level against the expected type
    for n, v in zip(names, values):
        if not isinstance(v, dic[n]):
            raise MyError(
                'The data type of "{}" should be "{}", rather than "{}"!'.format(
                    n, dic[n], type(v)))
    # check duplicates (use .names since a MultiIndex has no single .name)
    if axis.has_duplicates:
        raise MyError('The axis "{}" has duplicates'.format(axis.names))
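# Hypothetical demo helper (not part of the original module): a MultiIndex
# passes _check_multiIndex when the 't' level holds pd.Timestamp values, the
# 'sid' level holds str values, and there are no duplicated (t, sid) pairs.
# The dates and codes below are dummies.
def _demo_check_multiIndex():
    idx = pd.MultiIndex.from_tuples(
        [(pd.Timestamp('1990-01-31'), '000001'),
         (pd.Timestamp('1990-02-28'), '000001')],
        names=['t', 'sid'])
    _check_multiIndex(idx)  # passes: correct types, no duplicates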
def single_sorting_factor(indicator, q, weight=False):
    # method 1: independent way
    '''
    Construct a new factor from a given indicator.

    Each month, stocks are grouped into "q" portfolios based on the rank of
    "indicator". In the following month, the corresponding monthly
    value-weighted (if weight is True) or equal-weighted portfolio returns are
    calculated. The factor return is the spread between the returns of the top
    and bottom portfolios.

    :param indicator:
    :param q:
    :param weight:
    :return: Series
    '''
    if isinstance(q, int):
        labels = ['g{}'.format(i) for i in range(1, q + 1)]
    elif isinstance(q, (list, tuple)):
        labels = ['g{}'.format(i) for i in range(1, len(q))]
    else:
        raise MyError('q:"{}" is wrong!'.format(repr(q)))

    comb = combine_with_datalagged([indicator])
    comb['g'] = comb.groupby('t', group_keys=False).apply(
        lambda df: assign_port_id(df[indicator], q, labels))

    if weight:
        panel = comb.groupby(['t', 'g']).apply(
            lambda df: my_average(df, 'stockEretM', wname='weight'))\
            .unstack(level=['g'])
    else:
        panel = comb.groupby(['t', 'g'])['stockEretM'].mean().unstack(level=['g'])

    factor = panel[labels[-1]] - panel[labels[0]]
    return factor
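# Hypothetical demo helper (not part of the original module): a minimal usage
# sketch for single_sorting_factor. 'bm' is an assumed indicator name; replace
# it with any indicator actually available to combine_with_datalagged in this
# repository.
def _demo_single_sorting_factor():
    factor_ew = single_sorting_factor('bm', q=10)               # equal-weighted deciles
    factor_vw = single_sorting_factor('bm', q=10, weight=True)  # value-weighted deciles
    print(factor_ew.head())
    print(factor_vw.head())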
def construct_playingField(vars, model):
    '''
    :param vars: list of sorting variables
    :param model: one of {'5x5', '2x4x4'}
    :return:
    '''
    if model == '5x5':
        v1, v2 = tuple(vars)
        comb = data_for_bivariate(v1, v2, 5, 5, independent=True)
        assets = comb.groupby(['t', 'g1', 'g2']).apply(
            lambda df: my_average(df, 'stockEretM', wname='weight'))\
            .unstack(level=['g1', 'g2'])
    elif model == '2x4x4':
        # v1 must belong to the size category
        v1, v2, v3 = tuple(vars)
        comb = combine_with_datalagged([v1, v2, v3])
        comb = comb.dropna()
        comb['g1'] = comb.groupby('t', group_keys=False).apply(
            lambda df: assign_port_id(df[v1], 2, range(1, 3)))
        comb['g2'] = comb.groupby(['t', 'g1'], group_keys=False).apply(
            lambda df: assign_port_id(df[v2], 4, range(1, 5)))
        comb['g3'] = comb.groupby(['t', 'g1'], group_keys=False).apply(
            lambda df: assign_port_id(df[v3], 4, range(1, 5)))
        assets = comb.groupby(['t', 'g1', 'g2', 'g3']).apply(
            lambda df: my_average(df, 'stockEretM', wname='weight')) \
            .unstack(level=['g1', 'g2', 'g3'])
    else:
        raise MyError('Model "{}" is not supported currently'.format(model))
    return assets
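# Hypothetical demo helper (not part of the original module): '5x5' takes two
# sorting variables, while '2x4x4' takes three with the first acting as the
# size proxy. 'size', 'bm' and 'mom' are assumed indicator names used only for
# illustration.
def _demo_construct_playingField():
    assets_5x5 = construct_playingField(['size', 'bm'], model='5x5')
    assets_244 = construct_playingField(['size', 'bm', 'mom'], model='2x4x4')
    print(assets_5x5.shape, assets_244.shape)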
def control_sid(conditions):
    '''
    Supported condition flags:
        is_sz, is_sh, is_gem (ChiNext / Growth Enterprise Market),
        is_cross, not_financial, is_industry

    :param conditions: a single condition (str) or a list of conditions
    :return: a list of stock codes
    '''
    # TODO: is_gem, is_industry
    condition_set = ['is_sz', 'is_sh', 'not_cross', 'not_financial']
    info = read_unfiltered('listInfo')

    def _one_condition(condition):
        if condition in condition_set:
            sids = info[info[condition]].index.tolist()
            return sids
        else:
            raise ValueError('The "conditions" should be one of {}'.format(repr(condition_set)))

    if isinstance(conditions, str):
        return _one_condition(conditions)
    elif isinstance(conditions, list):
        l_sids = [_one_condition(con) for con in conditions]
        return sorted(list(set.intersection(*map(set, l_sids))))
    else:
        raise MyError('No such condition as {}'.format(conditions))
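# Hypothetical demo helper (not part of the original module): a single
# condition returns the matching stock codes, while a list of conditions
# returns the sorted intersection of the individual results.
def _demo_control_sid():
    sh_sids = control_sid('is_sh')
    clean_sids = control_sid(['is_sh', 'not_financial'])
    print(len(sh_sids), len(clean_sids))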
def _check_singleIndex(axis):
    # for single index
    dic = {'sid': str, 't': pd.Timestamp, 'type': str}

    # check data type
    if not axis.name:
        raise MyError('axis name is missing!')
    elif axis.name not in dic.keys():
        raise MyError('The axis name is "{}", not included in {}'.format(
            axis.name, str(dic.keys())))
    elif not isinstance(axis[0], dic[axis.name]):
        raise MyError(
            'The data type of "{}" should be "{}", rather than "{}"!'.format(
                axis.name, dic[axis.name], type(axis[0])))

    # check duplicates
    if axis.has_duplicates:
        raise MyError('The axis "{}" has duplicates'.format(axis.name))
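# Hypothetical demo helper (not part of the original module): a single index
# passes _check_singleIndex only when it is named 'sid', 't' or 'type', its
# values match the expected type, and it has no duplicates. The codes below
# are dummies.
def _demo_check_singleIndex():
    idx = pd.Index(['000001', '000002'], name='sid')
    _check_singleIndex(idx)  # passes

    bad = pd.Index(['000001', '000001'], name='sid')  # duplicated sid
    try:
        _check_singleIndex(bad)
    except MyError as e:
        print(e)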
def get_single_sorting_assets(indicator, q, weight=True):
    if isinstance(q, int):
        labels = ['g{}'.format(i) for i in range(1, q + 1)]
    elif isinstance(q, (list, tuple)):
        labels = ['g{}'.format(i) for i in range(1, len(q))]
    else:
        raise MyError('q:"{}" is wrong!'.format(repr(q)))

    comb = combine_with_datalagged([indicator])
    comb['g'] = comb.groupby('t', group_keys=False).apply(
        lambda df: assign_port_id(df[indicator], q, labels))

    if weight:
        assets = comb.groupby(['t', 'g']).apply(
            lambda df: my_average(df, 'stockEretM', wname='weight'))\
            .unstack(level=['g'])
    else:
        assets = comb.groupby(['t', 'g'])['stockEretM'].mean().unstack(level=['g'])
    return assets
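# Hypothetical demo helper (not part of the original module): unlike
# single_sorting_factor, this returns the full panel of portfolio returns
# (one column per group) rather than the top-minus-bottom spread. 'bm' is
# again an assumed indicator name.
def _demo_get_single_sorting_assets():
    assets = get_single_sorting_assets('bm', q=5, weight=False)
    print(assets.columns.tolist())  # expected: ['g1', ..., 'g5']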
def load_data(name):
    '''
    Load the filtered version of the data if it exists; otherwise fall back to
    the unfiltered version.

    Args:
        name:

    Returns:

    '''
    fns1 = os.listdir(PKL_FILTERED_PATH)
    fns2 = os.listdir(PKL_UNFILTERED_PATH)
    if name + '.pkl' in fns1:
        x = pd.read_pickle(os.path.join(PKL_FILTERED_PATH, name + '.pkl'))
        return x
    elif name + '.pkl' in fns2:
        x = read_unfiltered(name)
        return x
    else:
        raise MyError(
            'There is no such data named "{}.pkl" in the repository!'.format(
                name))
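# Hypothetical demo helper (not part of the original module): load_data prefers
# the filtered pickle and silently falls back to the unfiltered one. 'listInfo'
# is used here because it is read elsewhere in this module; any other name is
# an assumption about the repository's contents.
def _demo_load_data():
    info = load_data('listInfo')
    print(info.shape)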