예제 #1
0
def categorical_to_int(data, levels, NA_action, origin=None):
    assert isinstance(levels, tuple)
    # In this function, missing values are always mapped to -1
    if isinstance(data, pd.Categorical):
        data_levels_tuple = tuple(data.levels)
        if not data_levels_tuple == levels:
            raise PatsyError("mismatching levels: expected %r, got %r"
                             % (levels, data_levels_tuple), origin)
        # pd.Categorical also uses -1 to indicate NA, and we don't try to
        # second-guess its NA detection, so we can just pass it back.
        return data.labels
    elif hasattr(data, 'dtype') and hasattr(data, 'astype') and \
            np.issubdtype(data.dtype, np.bool_):
        return data.astype('int')
    if isinstance(data, _CategoricalBox):
        if data.levels is not None and tuple(data.levels) != levels:
            raise PatsyError("mismatching levels: expected %r, got %r"
                             % (levels, tuple(data.levels)), origin)
        data = data.data
    if hasattr(data, "shape") and len(data.shape) > 1:
        raise PatsyError("categorical data must be 1-dimensional",
                         origin)
    if not iterable(data) or isinstance(data, basestring):
        raise PatsyError("categorical data must be an iterable container")
    try:
        level_to_int = dict(zip(levels, xrange(len(levels))))
    except TypeError:
        raise PatsyError("Error interpreting categorical data: "
                         "all items must be hashable", origin)
    out = np.empty(len(data), dtype=int)
    for i, value in enumerate(data):
        if NA_action.is_categorical_NA(value):
            out[i] = -1
        else:
            try:
                out[i] = level_to_int[value]
            except KeyError:
                SHOW_LEVELS = 4
                level_strs = []
                if len(levels) <= SHOW_LEVELS:
                    level_strs += [repr(level) for level in levels]
                else:
                    level_strs += [repr(level)
                                   for level in levels[:SHOW_LEVELS//2]]
                    level_strs.append("...")
                    level_strs += [repr(level)
                                   for level in levels[-SHOW_LEVELS//2:]]
                level_str = "[%s]" % (", ".join(level_strs))
                raise PatsyError("Error converting data to categorical: "
                                 "observation with value %r does not match "
                                 "any of the expected levels (expected: %s)"
                                 % (value, level_str), origin)
            except TypeError:
                raise PatsyError("Error converting data to categorical: "
                                 "encountered unhashable value %r"
                                 % (value,), origin)
    if isinstance(data, pd.Series):
        out = pd.Series(out, index=data.index)
    return out
예제 #2
0
def _categorical_shape_fix(data):
    # helper function
    # data should not be a _CategoricalBox or pandas Categorical or anything
    # -- it should be an actual iterable of data, but which might have the
    # wrong shape.
    if hasattr(data, "ndim") and data.ndim > 1:
        raise PatsyError("categorical data cannot be >1-dimensional")
    # coerce scalars into 1d, which is consistent with what we do for numeric
    # factors. (See statsmodels/statsmodels#1881)
    if (not iterable(data)
        or isinstance(data, (six.text_type, six.binary_type))):
        data = [data]
    return data
예제 #3
0
def _categorical_shape_fix(data):
    # helper function
    # data should not be a _CategoricalBox or pandas Categorical or anything
    # -- it should be an actual iterable of data, but which might have the
    # wrong shape.
    if hasattr(data, "ndim") and data.ndim > 1:
        raise PatsyError("categorical data cannot be >1-dimensional")
    # coerce scalars into 1d, which is consistent with what we do for numeric
    # factors. (See statsmodels/statsmodels#1881)
    if (not iterable(data) or isinstance(data,
                                         (six.text_type, six.binary_type))):
        data = [data]
    return data