Пример #1
0
def check_data_structure(x):
    '''
    Reject pandas objects whose layout breaks the project conventions.

    Conventions (every object should be one of):
    1. single-indexed Series (index.name and series.name are required)
    2. single-indexed DataFrame with multiple columns
       (index.name and columns.name are required)
    3. multi-indexed DataFrame with multiple columns
       (index.names and columns.names are required)

    Layout rules:
    1. If there is a "t" axis, always put it in the index.
    2. For a multi-indexed DataFrame holding both 't' and 'sid', both go in
       the index, with 't' as level 0 and 'sid' as level 1, e.g.
                            'col1'  'col2'
            't'     'sid'     a1      a2
            '1990'   '1'      a1      a2
            '1990'   '2'      a1      a2
            '1991'   '1'      a1      a2
            '1991'   '2'      a1      a2

    Only two of these conventions are enforced by this function: a
    one-dimensional object must not carry a MultiIndex, and a two-dimensional
    object must not have exactly one column.

    :param x: object exposing ``ndim``, ``index`` and ``shape``
    :return: None when neither check fails
    :raises MyError: if either check fails
    '''
    dims = x.ndim

    if dims == 1:
        # One-dimensional data: the only forbidden case is a MultiIndex.
        if isinstance(x.index, pd.MultiIndex):
            raise MyError(
                "Series with MultiIndex is not allowed ! you'd betterconvert it into singleIndexed DataFrame !"
            )
        return

    if dims == 2 and x.shape[1] == 1:
        # A single-column table should have been stored as a Series.
        raise MyError(
            "DataFrame with only one column is not allowed,you'd better convert it to Series !"
        )
Пример #2
0
def _check_s_for_saving_name(s, name):
    '''
    Verify that a Series is named and that its name equals the file name
    it is about to be saved under.

    :param s: object exposing a ``name`` attribute (a Series)
    :param name: file name (without extension) intended for saving
    :return: None when the name is present and matches
    :raises MyError: if ``s.name`` is falsy or differs from ``name``
    '''
    series_name = s.name

    if not series_name:
        raise MyError('No name for Series')

    if series_name != name:
        template = 'The file name "{}" to save is different with the name of Series "{}"'
        raise MyError(template.format(name, series_name))
Пример #3
0
def _check_multiIndex(axis):
    '''
    Validate a MultiIndex whose levels are named 't' and/or 'sid'.

    The first entry of the axis is taken as a sample and each of its
    components is checked against the type expected for its level name
    ('t' -> pd.Timestamp, 'sid' -> str). The axis must also be free of
    duplicated entries.

    :param axis: MultiIndex to validate
    :return: None when every check passes
    :raises MyError: if a sampled value has the wrong type, or the axis
        contains duplicates
    :raises KeyError: if a level name is neither 't' nor 'sid'
    '''
    dic = {'t': pd.Timestamp, 'sid': str}

    # An empty axis has no first entry to sample; skip the type check
    # rather than failing with IndexError on axis[0].
    if len(axis) > 0:
        for n, v in zip(axis.names, axis[0]):
            expected = dic[n]
            if not isinstance(v, expected):
                raise MyError(
                    'The data type of "{}" should be "{}",rather than "{}"!'.
                    format(n, expected, type(v)))

    if axis.has_duplicates:
        # A MultiIndex is identified by its per-level names; its single
        # `name` attribute is normally unset, so report `names` instead.
        raise MyError('The axis "{}" has duplicates'.format(list(axis.names)))
Пример #4
0
def single_sorting_factor(indicator, q, weight=False):
    # method1 independent way
    '''
    Build a factor return series from a single sorting indicator.

    Within each 't' group the stocks are assigned to portfolios by
    ``assign_port_id`` according to ``indicator``. The 'stockEretM' column is
    then aggregated per ('t', portfolio) and the factor is the spread between
    the last-labelled and the first-labelled portfolio.

    NOTE(review): the data comes from ``combine_with_datalagged``, so the
    indicator is presumably lagged relative to the returns - confirm against
    that helper.

    :param indicator: name of the indicator column used for sorting
    :param q: an int (number of portfolios) or a list/tuple
        (presumably breakpoints; it yields ``len(q) - 1`` portfolios)
    :param weight: if True, aggregate with ``my_average`` using the 'weight'
        column; otherwise use the plain mean
    :return: the spread ``panel[last label] - panel[first label]``
    :raises MyError: if ``q`` is neither an int nor a list/tuple
    '''
    # Work out how many portfolio labels ("g1", "g2", ...) are needed.
    if isinstance(q, int):
        n_ports = q
    elif isinstance(q, (list, tuple)):
        n_ports = len(q) - 1
    else:
        raise MyError('q:"{}"  is wrong!'.format(repr(q)))
    labels = ['g{}'.format(i) for i in range(1, n_ports + 1)]

    comb = combine_with_datalagged([indicator])

    def _assign_ports(df):
        return assign_port_id(df[indicator], q, labels)

    comb['g'] = comb.groupby('t', group_keys=False).apply(_assign_ports)

    by_port = comb.groupby(['t', 'g'])
    if weight:
        port_ret = by_port.apply(
            lambda df: my_average(df, 'stockEretM', wname='weight'))
    else:
        port_ret = by_port['stockEretM'].mean()
    panel = port_ret.unstack(level=['g'])

    top, bottom = labels[-1], labels[0]
    return panel[top] - panel[bottom]
Пример #5
0
def construct_playingField(vars, model):
    '''
    Build the panel of test-asset returns for a given sorting scheme.

    For '5x5' the two variables are sorted by ``data_for_bivariate`` with
    5 x 5 groups (``independent=True``). For '2x4x4' the first variable is
    split into 2 groups within each 't', then the second and third variables
    are each split into 4 groups within each ('t', 'g1') group. In both cases
    'stockEretM' is aggregated per group with ``my_average`` using the
    'weight' column, and the group levels are unstacked into columns.

    :param vars: list of variable names; two for '5x5', three for '2x4x4'
        (for '2x4x4' the first must belong to the size category)
    :param model: one of {'5x5', '2x4x4'}
    :return: the unstacked result, one column per group combination
    :raises MyError: if ``model`` is not supported
    '''
    def _port_return(df):
        return my_average(df, 'stockEretM', wname='weight')

    if model == '5x5':
        v1, v2 = tuple(vars)
        comb = data_for_bivariate(v1, v2, 5, 5, independent=True)
        group_cols = ['g1', 'g2']
    elif model == '2x4x4':
        #v1 must belong to size category
        v1, v2, v3 = tuple(vars)
        comb = combine_with_datalagged([v1, v2, v3]).dropna()

        def _sort_within(by, var, n):
            # Split `var` into n groups labelled 1..n inside every `by` group.
            return comb.groupby(by, group_keys=False).apply(
                lambda df: assign_port_id(df[var], n, range(1, n + 1)))

        comb['g1'] = _sort_within('t', v1, 2)
        comb['g2'] = _sort_within(['t', 'g1'], v2, 4)
        comb['g3'] = _sort_within(['t', 'g1'], v3, 4)
        group_cols = ['g1', 'g2', 'g3']
    else:
        raise MyError('Model "{}" is not supported currently'.format(model))

    assets = comb.groupby(['t'] + group_cols).apply(_port_return)
    return assets.unstack(level=group_cols)
Пример #6
0
def control_sid(conditions):
    '''
    Select stock codes that satisfy one or several listing conditions.

    Supported condition names:
        is_sz
        is_sh
        not_cross
        not_financial

    The conditions are looked up as columns of the 'listInfo' table returned
    by ``read_unfiltered`` and used as row masks (presumably boolean flag
    columns - confirm against the data).

    :param conditions: a single condition name (str) or a list of names;
        a list yields the codes that satisfy all of them
    :return: a list of stock codes; sorted when a list of conditions is given
    :raises ValueError: if a condition name is not supported
    :raises MyError: if ``conditions`` is neither a str nor a list
    '''
    #TODO: is_gem (ChiNext board), is_industry
    condition_set = ['is_sz', 'is_sh', 'not_cross', 'not_financial']
    info = read_unfiltered('listInfo')

    def _one_condition(condition):
        if condition not in condition_set:
            raise ValueError('The "conditions" should be one of {}'.format(repr(condition_set)))
        return info[info[condition]].index.tolist()

    if isinstance(conditions, str):
        return _one_condition(conditions)

    if isinstance(conditions, list):
        sid_sets = [set(_one_condition(con)) for con in conditions]
        # Keep only the codes common to every condition.
        return sorted(set.intersection(*sid_sets))

    raise MyError('no such conditon as {}'.format(conditions))
Пример #7
0
def _check_singleIndex(axis):
    '''
    Validate a single-level index.

    The axis must be named 'sid', 't' or 'type'. Its first element is taken
    as a sample and must be of the type expected for that name
    ('sid' -> str, 't' -> pd.Timestamp, 'type' -> str). The axis must also be
    free of duplicates.

    :param axis: single-level index to validate
    :return: None when every check passes
    :raises MyError: if the name is missing or unsupported, the sampled
        value has the wrong type, or the axis contains duplicates
    '''
    #for single index
    dic = {'sid': str, 't': pd.Timestamp, 'type': str}

    # check data type
    if not axis.name:
        raise MyError('axis name is missing !')
    elif axis.name not in dic:
        raise MyError('The axis name is "{}",not included in {}'.format(
            axis.name, str(dic.keys())))
    # An empty axis has no first element to sample; skip the type check
    # rather than failing with IndexError on axis[0].
    elif len(axis) > 0 and not isinstance(axis[0], dic[axis.name]):
        raise MyError(
            'The data type of "{}" should be "{}",rather than "{}"!'.format(
                axis.name, dic[axis.name], type(axis[0])))

    # check duplicates
    if axis.has_duplicates:
        raise MyError('The axis "{}" has duplicates'.format(axis.name))
Пример #8
0
def get_single_sorting_assets(indicator, q, weight=True):
    '''
    Build the panel of portfolio returns from a single sorting indicator.

    Within each 't' group the stocks are assigned to portfolios by
    ``assign_port_id`` according to ``indicator``; 'stockEretM' is then
    aggregated per ('t', portfolio) and the portfolio level is unstacked
    into columns labelled "g1", "g2", ...

    :param indicator: name of the indicator column used for sorting
    :param q: an int (number of portfolios) or a list/tuple
        (presumably breakpoints; it yields ``len(q) - 1`` portfolios)
    :param weight: if True, aggregate with ``my_average`` using the 'weight'
        column; otherwise use the plain mean
    :return: the unstacked result, one column per portfolio label
    :raises MyError: if ``q`` is neither an int nor a list/tuple
    '''
    # Work out how many portfolio labels are needed.
    if isinstance(q, int):
        n_ports = q
    elif isinstance(q, (list, tuple)):
        n_ports = len(q) - 1
    else:
        raise MyError('q:"{}"  is wrong!'.format(repr(q)))
    labels = ['g{}'.format(i) for i in range(1, n_ports + 1)]

    comb = combine_with_datalagged([indicator])

    def _assign_ports(df):
        return assign_port_id(df[indicator], q, labels)

    comb['g'] = comb.groupby('t', group_keys=False).apply(_assign_ports)

    by_port = comb.groupby(['t', 'g'])
    if weight:
        port_ret = by_port.apply(
            lambda df: my_average(df, 'stockEretM', wname='weight'))
    else:
        port_ret = by_port['stockEretM'].mean()

    return port_ret.unstack(level=['g'])
Пример #9
0
def load_data(name):
    '''
    Load a stored dataset by name, preferring the filtered copy.

    Both storage directories are listed first. If "<name>.pkl" is in the
    filtered directory it is read with ``pd.read_pickle``; otherwise, if it
    is in the unfiltered directory, it is loaded through
    ``read_unfiltered``.

    Args:
        name: dataset name without the ".pkl" extension

    Returns:
        The object loaded from the matching pickle file.

    Raises:
        MyError: if neither directory contains "<name>.pkl"
    '''
    filtered_files = os.listdir(PKL_FILTERED_PATH)
    unfiltered_files = os.listdir(PKL_UNFILTERED_PATH)
    pkl_name = name + '.pkl'

    if pkl_name in filtered_files:
        return pd.read_pickle(os.path.join(PKL_FILTERED_PATH, pkl_name))

    if pkl_name in unfiltered_files:
        return read_unfiltered(name)

    raise MyError(
        'There is no such data named "{}.pkl" in the repository!'.format(
            name))