Пример #1
0
    def ordinal_encoding_dict_lookup(X_in,
                                     mapping=None,
                                     cols=None,
                                     handle_unknown='value',
                                     handle_missing='value'):
        """
        Ordinal encoding variant that stores each column mapping as a plain dict.

        When ``mapping`` is given, every listed column of a deep copy of ``X_in`` is
        re-coded with a dict lookup; values that are not keys of the dict become -1.
        ``handle_unknown`` and ``handle_missing`` are not consulted on this path.

        When ``mapping`` is None, a mapping is built for each column in ``cols``
        (all columns by default): distinct values receive the codes 1..n in the
        order pandas reports them, and -2 is stored for NaN depending on
        ``handle_missing``. The returned frame is the untouched deep copy.

        Original author note: this implementation does not support nan in the data.

        Returns a tuple ``(frame, mapping_out)`` where ``mapping_out`` is a list of
        dicts with the keys 'col', 'mapping' and 'data_type' (or the ``mapping``
        argument itself when one was supplied).
        """

        frame = X_in.copy(deep=True)

        # Transform path: a ready-made mapping was supplied.
        if mapping is not None:
            for entry in mapping:
                target = entry.get('col')
                lookup = entry['mapping']
                # -1 marks values absent from the fitted dict.
                frame[target] = [lookup.get(value, -1) for value in frame[target]]
            return frame, mapping

        # Fit path: derive one dict per requested column.
        if cols is None:
            cols = frame.columns.values

        fitted = []
        for col in cols:
            series = frame[col]
            nan_identity = np.nan

            if util.is_category(series.dtype):
                levels = series.cat.categories.tolist()
                # Declared categories never contain NaN, so add it when the data does.
                if series.isna().any():
                    levels.append(np.nan)
            else:
                levels = series.unique()

            distinct = pd.Series(levels).fillna(nan_identity).unique()
            codes = pd.Series(index=distinct, data=range(1, len(distinct) + 1))

            # Both original branches store the same -2 code; only the trigger differs.
            if handle_missing == 'return_nan':
                store_nan_code = True
            elif handle_missing == 'value':
                store_nan_code = not codes.index.isnull().any()
            else:
                store_nan_code = False

            if store_nan_code:
                codes.loc[nan_identity] = -2

            fitted.append(
                {
                    'col': col,
                    'mapping': codes.to_dict(),
                    'data_type': series.dtype
                }, )

        return frame, fitted
Пример #2
0
    def ordinal_encoding(X_in, mapping=None, cols=None, impute_missing=True, handle_unknown='impute'):
        """
        Represent each categorical column by a single column of integer codes.

        Without ``mapping`` a mapping is built for every column in ``cols`` (all
        columns by default): the distinct values are numbered 1..n in the order
        they are enumerated and stored as a list of ``(value, code)`` pairs. The
        returned frame is an unmodified deep copy of ``X_in`` in that case.

        With ``mapping`` each listed column is translated through its pairs;
        values without a pair become NaN. When ``impute_missing`` is true,
        ``handle_unknown='impute'`` replaces those NaN with 0 and
        ``handle_unknown='error'`` raises ``ValueError`` if any are present.

        Returns ``(frame, mapping_out)``.
        """

        frame = X_in.copy(deep=True)

        if cols is None:
            cols = frame.columns.values

        # Fit path: no mapping supplied, so build one per column and return early.
        if mapping is None:
            fitted = []
            for col in cols:
                series = frame[col]
                if util.is_category(series.dtype):
                    levels = series.cat.categories
                else:
                    levels = [level for level in pd.unique(series.values) if level is not None]

                codes = {level: code for code, level in enumerate(levels, start=1)}
                fitted.append({'col': col, 'mapping': list(codes.items()), 'data_type': series.dtype}, )
            return frame, fitted

        # Transform path: apply the supplied mapping column by column.
        for entry in mapping:
            lookup = dict(entry.get('mapping'))
            target = entry.get('col')
            encoded = frame[target].map(lambda value: lookup.get(value, np.nan))

            # An integer cast fails while NaN is present; fall back to float then.
            try:
                encoded = encoded.astype(int)
            except ValueError:
                encoded = encoded.astype(float)

            if impute_missing:
                if handle_unknown == 'impute':
                    encoded = encoded.fillna(0)
                elif handle_unknown == 'error' and encoded.isnull().any():
                    raise ValueError('Unexpected categories found in column %s' % target)

            frame[target] = encoded

        return frame, mapping
    def ordinal_encoding(X_in,
                         mapping=None,
                         cols=None,
                         handle_unknown='value',
                         handle_missing='value'):
        """
        Represent each categorical column by a single column of integer codes.

        Transform mode (``mapping`` given): every listed column of a deep copy of
        ``X_in`` is passed through ``Series.map`` with the entry's 'mapping'.
        The result is cast to int when possible and to float otherwise (values
        without a code are NaN at that point). Then:

        * ``handle_unknown='value'`` replaces the remaining NaN with -1;
        * ``handle_unknown='error'`` raises ``ValueError`` if any NaN remains;
        * ``handle_missing='return_nan'`` turns the code -2 back into NaN.

        Fit mode (``mapping`` is None): for every column in ``cols`` (all columns
        by default) a ``pd.Series`` is built whose index holds the distinct values
        and whose values are the codes 1..n, assigned sequentially in the order
        pandas reports the values (not at random). The code -2 is stored for NaN
        when ``handle_missing='return_nan'``, or when ``handle_missing='value'``
        and NaN is not already among the values. The frame itself is not encoded
        in this mode.

        Returns ``(X, mapping_out)`` where ``mapping_out`` is the supplied
        ``mapping`` or a list of dicts with keys 'col', 'mapping', 'data_type'.

        Raises ``ValueError`` for unseen categories when ``handle_unknown='error'``.
        """

        # Lookup table used to turn the "missing" code -2 back into NaN.
        return_nan_series = pd.Series(data=[np.nan], index=[-2])

        X = X_in.copy(deep=True)

        if cols is None:
            cols = X.columns.values

        if mapping is not None:
            mapping_out = mapping
            for switch in mapping:
                column = switch.get('col')
                encoded = X[column].map(switch['mapping'])

                # An integer cast fails while NaN is present; fall back to float then.
                try:
                    encoded = encoded.astype(int)
                except ValueError:
                    encoded = encoded.astype(float)

                if handle_unknown == 'value':
                    # Plain assignment instead of ``X[column].fillna(..., inplace=True)``:
                    # the chained in-place form does not update X under pandas
                    # Copy-on-Write.
                    encoded = encoded.fillna(-1)
                elif handle_unknown == 'error':
                    if encoded.isnull().any():
                        raise ValueError(
                            'Unexpected categories found in column %s' %
                            column)

                if handle_missing == 'return_nan':
                    # Mapping through return_nan_series yields NaN everywhere; keep
                    # that NaN only where the code is -2, the original code elsewhere.
                    encoded = encoded.map(return_nan_series).where(
                        encoded == -2, encoded)

                X[column] = encoded

        else:
            mapping_out = []
            for col in cols:

                nan_identity = np.nan

                if util.is_category(X[col].dtype):
                    categories = X[col].cat.categories
                else:
                    categories = X[col].unique()

                index = pd.Series(categories).fillna(nan_identity).unique()

                data = pd.Series(index=index, data=range(1, len(index) + 1))

                # ``not`` rather than ``~``: bitwise inversion is only a logical
                # negation for numpy booleans, never for a plain Python bool.
                if handle_missing == 'value' and not data.index.isnull().any():
                    data.loc[nan_identity] = -2
                elif handle_missing == 'return_nan':
                    data.loc[nan_identity] = -2

                mapping_out.append(
                    {
                        'col': col,
                        'mapping': data,
                        'data_type': X[col].dtype
                    }, )

        return X, mapping_out
Пример #4
0
    def ordinal_encoding(X_in,
                         mapping=None,
                         cols=None,
                         impute_missing=True,
                         handle_unknown='impute'):
        """
        Represent each categorical column by a single column of integer codes.

        Transform mode (``mapping`` given): every listed column of a deep copy of
        ``X_in`` is passed through ``Series.map`` with the entry's 'mapping'.
        The result is cast to int when possible and to float otherwise (values
        without a code are NaN at that point). When ``impute_missing`` is true,
        ``handle_unknown='impute'`` replaces the remaining NaN with 0 and
        ``handle_unknown='error'`` raises ``ValueError`` if any NaN remains.

        Fit mode (``mapping`` is None): for every column in ``cols`` (all columns
        by default) a ``pd.Series`` is built that maps each distinct value to a
        code 1..n, assigned sequentially in enumeration order (not at random).
        ``None`` entries are skipped for non-categorical columns. The frame
        itself is not encoded in this mode.

        Returns ``(X, mapping_out)`` where ``mapping_out`` is the supplied
        ``mapping`` or a list of dicts with keys 'col', 'mapping', 'data_type'.

        Raises ``ValueError`` for unseen categories when ``impute_missing`` is
        true and ``handle_unknown='error'``.
        """

        X = X_in.copy(deep=True)

        if cols is None:
            cols = X.columns.values

        if mapping is not None:
            mapping_out = mapping
            for switch in mapping:
                column = switch.get('col')
                encoded = X[column].map(switch['mapping'])

                # An integer cast fails while NaN is present; fall back to float then.
                try:
                    encoded = encoded.astype(int)
                except ValueError:
                    encoded = encoded.astype(float)

                if impute_missing:
                    if handle_unknown == 'impute':
                        # Plain assignment instead of
                        # ``X[column].fillna(..., inplace=True)``: the chained
                        # in-place form does not update X under pandas
                        # Copy-on-Write.
                        encoded = encoded.fillna(0)
                    elif handle_unknown == 'error':
                        if encoded.isnull().any():
                            raise ValueError(
                                'Unexpected categories found in column %s' %
                                column)

                X[column] = encoded

        else:
            mapping_out = []
            for col in cols:

                if util.is_category(X[col].dtype):
                    categories = X[col].cat.categories
                else:
                    categories = [
                        x for x in pd.unique(X[col].values) if x is not None
                    ]

                # Named col_mapping so the ``mapping`` parameter is not rebound.
                col_mapping = pd.Series(
                    data=list(range(1, len(categories) + 1)),
                    index=list(categories))

                mapping_out.append(
                    {
                        'col': col,
                        'mapping': col_mapping,
                        'data_type': X[col].dtype
                    }, )

        return X, mapping_out