Example 1
def preprocessing_schema():
    return {
        "char_tokenizer": {
            "type": "string",
            "enum": sorted(list(tokenizer_registry.keys()))
        },
        "char_vocab_file": {
            "type": ["string", "null"]
        },
        "char_sequence_length_limit": {
            "type": "integer",
            "minimum": 0
        },
        "char_most_common": {
            "type": "integer",
            "minimum": 0
        },
        "word_tokenizer": {
            "type": "string",
            "enum": sorted(list(tokenizer_registry.keys()))
        },
        "pretrained_model_name_or_path": {
            "type": ["string", "null"]
        },
        "word_vocab_file": {
            "type": ["string", "null"]
        },
        "word_sequence_length_limit": {
            "type": "integer",
            "minimum": 0
        },
        "word_most_common": {
            "type": "integer",
            "minimum": 0
        },
        "padding_symbol": {
            "type": "string"
        },
        "unknown_symbol": {
            "type": "string"
        },
        "padding": {
            "type": "string",
            "enum": ["right", "left"]
        },
        "lowercase": {
            "type": "boolean"
        },
        "missing_value_strategy": {
            "type": "string",
            "enum": MISSING_VALUE_STRATEGY_OPTIONS
        },
        "fill_value": {
            "type": "string"
        },
        "computed_fill_value": {
            "type": "string"
        },
    }
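This is a JSON-Schema-style fragment whose enum values come from Ludwig's tokenizer_registry and MISSING_VALUE_STRATEGY_OPTIONS. A minimal sketch of how such a fragment can be checked with the jsonschema package follows; the registry and options stubs are hypothetical stand-ins for Ludwig's internals, just for illustration.

import jsonschema

# Hypothetical stubs standing in for Ludwig's internals.
tokenizer_registry = {"space": None, "characters": None}
MISSING_VALUE_STRATEGY_OPTIONS = ["fill_with_const", "drop_row"]

schema = {
    "type": "object",
    "properties": {
        "char_tokenizer": {
            "type": "string",
            "enum": sorted(list(tokenizer_registry.keys())),
        },
        "padding": {"type": "string", "enum": ["right", "left"]},
        "char_most_common": {"type": "integer", "minimum": 0},
    },
}

# Passes silently:
jsonschema.validate({"padding": "right", "char_most_common": 70}, schema)
# Would raise jsonschema.ValidationError:
# jsonschema.validate({"padding": "top"}, schema)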
Example 2
class SetFeatureMixin:
    type = SET
    preprocessing_defaults = {
        "tokenizer": "space",
        "most_common": 10000,
        "lowercase": False,
        "missing_value_strategy": FILL_WITH_CONST,
        "fill_value": UNKNOWN_SYMBOL,
    }

    preprocessing_schema = {
        "tokenizer": {"type": "string", "enum": sorted(list(tokenizer_registry.keys()))},
        "most_common": {"type": "integer", "minimum": 0},
        "lowercase": {"type": "boolean"},
        "missing_value_strategy": {"type": "string", "enum": MISSING_VALUE_STRATEGY_OPTIONS},
        "fill_value": {"type": "string"},
        "computed_fill_value": {"type": "string"},
    }

    @staticmethod
    def cast_column(column, backend):
        return column

    @staticmethod
    def get_feature_meta(column, preprocessing_parameters, backend):
        column = column.astype(str)
        idx2str, str2idx, str2freq, max_size, _, _, _ = create_vocabulary(
            column,
            preprocessing_parameters["tokenizer"],
            num_most_frequent=preprocessing_parameters["most_common"],
            lowercase=preprocessing_parameters["lowercase"],
            processor=backend.df_engine,
        )
        return {
            "idx2str": idx2str,
            "str2idx": str2idx,
            "str2freq": str2freq,
            "vocab_size": len(str2idx),
            "max_set_size": max_size,
        }

    @staticmethod
    def feature_data(column, metadata, preprocessing_parameters, backend):
        def to_dense(x):
            feature_vector = set_str_to_idx(x, metadata["str2idx"], preprocessing_parameters["tokenizer"])

            set_vector = np.zeros((len(metadata["str2idx"]),))
            set_vector[feature_vector] = 1
            return set_vector.astype(bool)  # np.bool was removed in NumPy 1.24

        return backend.df_engine.map_objects(column, to_dense)

    @staticmethod
    def add_feature_data(
        feature, input_df, proc_df, metadata, preprocessing_parameters, backend, skip_save_processed_input
    ):
        proc_df[feature[PROC_COLUMN]] = SetFeatureMixin.feature_data(
            input_df[feature[COLUMN]].astype(str), metadata[feature[NAME]], preprocessing_parameters, backend
        )
        return proc_df
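A toy walkthrough of the multi-hot encoding performed by to_dense above, using a made-up str2idx vocabulary rather than Ludwig's real metadata:

import numpy as np

# Hypothetical vocabulary; index 0 is the unknown symbol.
str2idx = {"<UNK>": 0, "blue": 1, "green": 2, "red": 3}

# Ids that set_str_to_idx might return for the raw value "red blue".
feature_vector = [3, 1]

set_vector = np.zeros((len(str2idx),))
set_vector[feature_vector] = 1
print(set_vector.astype(bool))  # [False  True False  True]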
Example 3
def preprocessing_schema():
    return {
        "timeseries_length_limit": {
            "type": "integer",
            "minimum": 0
        },
        "padding_value": {
            "type": "number"
        },
        "padding": {
            "type": "string",
            "enum": ["right", "left"]
        },
        "tokenizer": {
            "type": "string",
            "enum": sorted(list(tokenizer_registry.keys()))
        },
        "missing_value_strategy": {
            "type": "string",
            "enum": MISSING_VALUE_STRATEGY_OPTIONS
        },
        "fill_value": {
            "type": "string"
        },
        "computed_fill_value": {
            "type": "string"
        },
    }
Example 4
def preprocessing_schema():
    return {
        "tokenizer": {
            "type": "string",
            "enum": sorted(list(tokenizer_registry.keys()))
        },
        "most_common": {
            "type": "integer",
            "minimum": 0
        },
        "lowercase": {
            "type": "boolean"
        },
        "missing_value_strategy": {
            "type": "string",
            "enum": MISSING_VALUE_STRATEGY_OPTIONS
        },
        "fill_value": {
            "type": "string"
        },
        "computed_fill_value": {
            "type": "string"
        },
    }
Example 5
class SequenceFeatureMixin:
    type = SEQUENCE

    preprocessing_defaults = {
        'sequence_length_limit': 256,
        'most_common': 20000,
        'padding_symbol': PADDING_SYMBOL,
        'unknown_symbol': UNKNOWN_SYMBOL,
        'padding': 'right',
        'tokenizer': 'space',
        'lowercase': False,
        'vocab_file': None,
        'missing_value_strategy': FILL_WITH_CONST,
        'fill_value': UNKNOWN_SYMBOL
    }

    preprocessing_schema = {
        'sequence_length_limit': {
            'type': 'integer',
            'minimum': 0
        },
        'most_common': {
            'type': 'integer',
            'minimum': 0
        },
        'padding_symbol': {
            'type': 'string'
        },
        'unknown_symbol': {
            'type': 'string'
        },
        'padding': {
            'type': 'string',
            'enum': ['right', 'left']
        },
        'tokenizer': {
            'type': 'string',
            'enum': sorted(list(tokenizer_registry.keys()))
        },
        'lowercase': {
            'type': 'boolean'
        },
        'missing_value_strategy': {
            'type': 'string',
            'enum': MISSING_VALUE_STRATEGY_OPTIONS
        },
        'fill_value': {
            'type': 'string'
        },
        'computed_fill_value': {
            'type': 'string'
        },
    }

    @staticmethod
    def cast_column(column, backend):
        return column

    @staticmethod
    def get_feature_meta(column, preprocessing_parameters, backend):
        column = column.astype(str)
        idx2str, str2idx, str2freq, max_length, _, _, _ = create_vocabulary(
            column,
            preprocessing_parameters['tokenizer'],
            lowercase=preprocessing_parameters['lowercase'],
            num_most_frequent=preprocessing_parameters['most_common'],
            vocab_file=preprocessing_parameters['vocab_file'],
            unknown_symbol=preprocessing_parameters['unknown_symbol'],
            padding_symbol=preprocessing_parameters['padding_symbol'],
            processor=backend.df_engine)
        max_length = min(preprocessing_parameters['sequence_length_limit'],
                         max_length)
        return {
            'idx2str': idx2str,
            'str2idx': str2idx,
            'str2freq': str2freq,
            'vocab_size': len(idx2str),
            'max_sequence_length': max_length
        }

    @staticmethod
    def feature_data(column, metadata, preprocessing_parameters, backend):
        sequence_data = build_sequence_matrix(
            sequences=column,
            inverse_vocabulary=metadata['str2idx'],
            tokenizer_type=preprocessing_parameters['tokenizer'],
            length_limit=metadata['max_sequence_length'],
            padding_symbol=preprocessing_parameters['padding_symbol'],
            padding=preprocessing_parameters['padding'],
            unknown_symbol=preprocessing_parameters['unknown_symbol'],
            lowercase=preprocessing_parameters['lowercase'],
            tokenizer_vocab_file=preprocessing_parameters['vocab_file'],
            processor=backend.df_engine)
        return sequence_data

    @staticmethod
    def add_feature_data(feature, input_df, proc_df, metadata,
                         preprocessing_parameters, backend,
                         skip_save_processed_input):
        sequence_data = SequenceFeatureMixin.feature_data(
            input_df[feature[COLUMN]].astype(str), metadata[feature[NAME]],
            preprocessing_parameters, backend)
        proc_df[feature[PROC_COLUMN]] = sequence_data
        return proc_df
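build_sequence_matrix is Ludwig's helper; as a rough, hypothetical sketch of what it produces for the 'space' tokenizer, each token is mapped through str2idx (falling back to the unknown symbol) and the row is right-padded to max_sequence_length. The real implementation supports many tokenizers and distributed backends.

import numpy as np

def to_idx_row(text, str2idx, max_len, unk="<UNK>", pad="<PAD>"):
    # Tokenize, map to ids with unknown fallback, truncate, then right-pad.
    ids = [str2idx.get(t, str2idx[unk]) for t in text.split()][:max_len]
    row = np.full(max_len, str2idx[pad], dtype=np.int32)
    row[:len(ids)] = ids
    return row

str2idx = {"<PAD>": 0, "<UNK>": 1, "the": 2, "cat": 3}
print(to_idx_row("the cat sat", str2idx, max_len=5))  # [2 3 1 0 0]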
Example 6
class TimeseriesFeatureMixin:
    type = TIMESERIES

    preprocessing_defaults = {
        'timeseries_length_limit': 256,
        'padding_value': 0,
        'padding': 'right',
        'tokenizer': 'space',
        'missing_value_strategy': FILL_WITH_CONST,
        'fill_value': ''
    }

    preprocessing_schema = {
        'timeseries_length_limit': {
            'type': 'integer',
            'minimum': 0
        },
        'padding_value': {
            'type': 'number'
        },
        'padding': {
            'type': 'string',
            'enum': ['right', 'left']
        },
        'tokenizer': {
            'type': 'string',
            'enum': sorted(list(tokenizer_registry.keys()))
        },
        'missing_value_strategy': {
            'type': 'string',
            'enum': MISSING_VALUE_STRATEGY_OPTIONS
        },
        'fill_value': {
            'type': 'string'
        },
        'computed_fill_value': {
            'type': 'string'
        },
    }

    @staticmethod
    def cast_column(feature, dataset_df, backend):
        return dataset_df

    @staticmethod
    def get_feature_meta(column, preprocessing_parameters, backend):
        column = column.astype(str)
        tokenizer = get_from_registry(preprocessing_parameters['tokenizer'],
                                      tokenizer_registry)()
        max_length = 0
        for timeseries in column:
            processed_line = tokenizer(timeseries)
            max_length = max(max_length, len(processed_line))
        max_length = min(preprocessing_parameters['timeseries_length_limit'],
                         max_length)

        return {'max_timeseries_length': max_length}

    @staticmethod
    def build_matrix(timeseries, tokenizer_name, length_limit, padding_value,
                     padding, backend):
        tokenizer = get_from_registry(tokenizer_name, tokenizer_registry)()

        ts_vectors = backend.df_engine.map_objects(
            timeseries, lambda ts: np.array(tokenizer(ts)).astype(np.float32))

        max_length = backend.df_engine.compute(ts_vectors.map(len).max())
        if max_length < length_limit:
            logger.debug('max length of {0}: {1} < limit: {2}'.format(
                tokenizer_name, max_length, length_limit))
        max_length = length_limit

        def pad(vector):
            padded = np.full((max_length, ), padding_value, dtype=np.float32)
            limit = min(vector.shape[0], max_length)
            if padding == 'right':
                padded[:limit] = vector[:limit]
            else:  # padding == 'left'
                padded[max_length - limit:] = vector[:limit]
            return padded

        return backend.df_engine.map_objects(ts_vectors, pad)

    @staticmethod
    def feature_data(column, metadata, preprocessing_parameters, backend):
        timeseries_data = TimeseriesFeatureMixin.build_matrix(
            column, preprocessing_parameters['tokenizer'],
            metadata['max_timeseries_length'],
            preprocessing_parameters['padding_value'],
            preprocessing_parameters['padding'], backend)
        return timeseries_data

    @staticmethod
    def add_feature_data(feature, input_df, proc_df, metadata,
                         preprocessing_parameters, backend,
                         skip_save_processed_input):
        proc_df[feature[PROC_COLUMN]] = TimeseriesFeatureMixin.feature_data(
            input_df[feature[COLUMN]].astype(str), metadata[feature[NAME]],
            preprocessing_parameters, backend)
        return proc_df
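The pad closure above can be demonstrated standalone: a vector shorter than max_length is filled with padding_value, and a longer one is truncated.

import numpy as np

def pad(vector, max_length, padding_value=0.0, padding="right"):
    padded = np.full((max_length,), padding_value, dtype=np.float32)
    limit = min(vector.shape[0], max_length)
    if padding == "right":
        padded[:limit] = vector[:limit]
    else:  # padding == "left"
        padded[max_length - limit:] = vector[:limit]
    return padded

print(pad(np.array([1.0, 2.0, 3.0], dtype=np.float32), 5))
# -> [1. 2. 3. 0. 0.]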
Example 7
class TextFeatureMixin:
    type = TEXT

    preprocessing_defaults = {
        "char_tokenizer": "characters",
        "char_vocab_file": None,
        "char_sequence_length_limit": 1024,
        "char_most_common": 70,
        "word_tokenizer": "space_punct",
        "pretrained_model_name_or_path": None,
        "word_vocab_file": None,
        "word_sequence_length_limit": 256,
        "word_most_common": 20000,
        "padding_symbol": PADDING_SYMBOL,
        "unknown_symbol": UNKNOWN_SYMBOL,
        "padding": "right",
        "lowercase": True,
        "missing_value_strategy": FILL_WITH_CONST,
        "fill_value": UNKNOWN_SYMBOL,
    }

    preprocessing_schema = {
        "char_tokenizer": {"type": "string", "enum": sorted(list(tokenizer_registry.keys()))},
        "char_vocab_file": {"type": ["string", "null"]},
        "char_sequence_length_limit": {"type": "integer", "minimum": 0},
        "char_most_common": {"type": "integer", "minimum": 0},
        "word_tokenizer": {"type": "string", "enum": sorted(list(tokenizer_registry.keys()))},
        "pretrained_model_name_or_path": {"type": ["string", "null"]},
        "word_vocab_file": {"type": ["string", "null"]},
        "word_sequence_length_limit": {"type": "integer", "minimum": 0},
        "word_most_common": {"type": "integer", "minimum": 0},
        "padding_symbol": {"type": "string"},
        "unknown_symbol": {"type": "string"},
        "padding": {"type": "string", "enum": ["right", "left"]},
        "lowercase": {"type": "boolean"},
        "missing_value_strategy": {"type": "string", "enum": MISSING_VALUE_STRATEGY_OPTIONS},
        "fill_value": {"type": "string"},
        "computed_fill_value": {"type": "string"},
    }

    @staticmethod
    def cast_column(column, backend):
        return column

    @staticmethod
    def feature_meta(column, preprocessing_parameters, backend):
        (
            char_idx2str,
            char_str2idx,
            char_str2freq,
            char_max_len,
            char_pad_idx,
            char_pad_symbol,
            char_unk_symbol,
        ) = create_vocabulary(
            column,
            tokenizer_type="characters",
            num_most_frequent=preprocessing_parameters["char_most_common"],
            lowercase=preprocessing_parameters["lowercase"],
            unknown_symbol=preprocessing_parameters["unknown_symbol"],
            padding_symbol=preprocessing_parameters["padding_symbol"],
            pretrained_model_name_or_path=preprocessing_parameters["pretrained_model_name_or_path"],
            processor=backend.df_engine,
        )
        (
            word_idx2str,
            word_str2idx,
            word_str2freq,
            word_max_len,
            word_pad_idx,
            word_pad_symbol,
            word_unk_symbol,
        ) = create_vocabulary(
            column,
            tokenizer_type=preprocessing_parameters["word_tokenizer"],
            num_most_frequent=preprocessing_parameters["word_most_common"],
            lowercase=preprocessing_parameters["lowercase"],
            vocab_file=preprocessing_parameters["word_vocab_file"],
            unknown_symbol=preprocessing_parameters["unknown_symbol"],
            padding_symbol=preprocessing_parameters["padding_symbol"],
            pretrained_model_name_or_path=preprocessing_parameters["pretrained_model_name_or_path"],
            processor=backend.df_engine,
        )
        return (
            char_idx2str,
            char_str2idx,
            char_str2freq,
            char_max_len,
            char_pad_idx,
            char_pad_symbol,
            char_unk_symbol,
            word_idx2str,
            word_str2idx,
            word_str2freq,
            word_max_len,
            word_pad_idx,
            word_pad_symbol,
            word_unk_symbol,
        )

    @staticmethod
    def get_feature_meta(column, preprocessing_parameters, backend):
        column = column.astype(str)
        tf_meta = TextFeatureMixin.feature_meta(column, preprocessing_parameters, backend)
        (
            char_idx2str,
            char_str2idx,
            char_str2freq,
            char_max_len,
            char_pad_idx,
            char_pad_symbol,
            char_unk_symbol,
            word_idx2str,
            word_str2idx,
            word_str2freq,
            word_max_len,
            word_pad_idx,
            word_pad_symbol,
            word_unk_symbol,
        ) = tf_meta
        char_max_len = min(preprocessing_parameters["char_sequence_length_limit"], char_max_len)
        word_max_len = min(preprocessing_parameters["word_sequence_length_limit"], word_max_len)
        return {
            "char_idx2str": char_idx2str,
            "char_str2idx": char_str2idx,
            "char_str2freq": char_str2freq,
            "char_vocab_size": len(char_idx2str),
            "char_max_sequence_length": char_max_len,
            "char_pad_idx": char_pad_idx,
            "char_pad_symbol": char_pad_symbol,
            "char_unk_symbol": char_unk_symbol,
            "word_idx2str": word_idx2str,
            "word_str2idx": word_str2idx,
            "word_str2freq": word_str2freq,
            "word_vocab_size": len(word_idx2str),
            "word_max_sequence_length": word_max_len,
            "word_pad_idx": word_pad_idx,
            "word_pad_symbol": word_pad_symbol,
            "word_unk_symbol": word_unk_symbol,
        }

    @staticmethod
    def feature_data(column, metadata, preprocessing_parameters, backend):
        char_data = build_sequence_matrix(
            sequences=column,
            inverse_vocabulary=metadata["char_str2idx"],
            tokenizer_type=preprocessing_parameters["char_tokenizer"],
            length_limit=metadata["char_max_sequence_length"],
            padding_symbol=metadata["char_pad_symbol"],
            padding=preprocessing_parameters["padding"],
            unknown_symbol=metadata["char_unk_symbol"],
            lowercase=preprocessing_parameters["lowercase"],
            tokenizer_vocab_file=preprocessing_parameters["char_vocab_file"],
            pretrained_model_name_or_path=preprocessing_parameters["pretrained_model_name_or_path"],
            processor=backend.df_engine,
        )
        word_data = build_sequence_matrix(
            sequences=column,
            inverse_vocabulary=metadata["word_str2idx"],
            tokenizer_type=preprocessing_parameters["word_tokenizer"],
            length_limit=metadata["word_max_sequence_length"],
            padding_symbol=metadata["word_pad_symbol"],
            padding=preprocessing_parameters["padding"],
            unknown_symbol=metadata["word_unk_symbol"],
            lowercase=preprocessing_parameters["lowercase"],
            tokenizer_vocab_file=preprocessing_parameters["word_vocab_file"],
            pretrained_model_name_or_path=preprocessing_parameters["pretrained_model_name_or_path"],
            processor=backend.df_engine,
        )

        return char_data, word_data

    @staticmethod
    def add_feature_data(
        feature, input_df, proc_df, metadata, preprocessing_parameters, backend, skip_save_processed_input
    ):
        chars_data, words_data = TextFeatureMixin.feature_data(
            input_df[feature[COLUMN]].astype(str), metadata[feature[NAME]], preprocessing_parameters, backend
        )
        proc_df[f"{feature[PROC_COLUMN]}_char"] = chars_data
        proc_df[f"{feature[PROC_COLUMN]}_word"] = words_data
        return proc_df
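The mixin builds two vocabularies over the same column because character- and word-level tokenization see different units. The tokenizers below are simplified stand-ins for the 'characters' and 'space_punct' entries in tokenizer_registry, just to contrast the two token streams:

import re

text = "Hello, world!"
char_tokens = list(text.lower())                    # character-level units
word_tokens = re.findall(r"\w+|[^\w\s]", text.lower())  # words and punctuation
print(char_tokens[:5])  # ['h', 'e', 'l', 'l', 'o']
print(word_tokens)      # ['hello', ',', 'world', '!']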
Example 8
class SequenceFeatureMixin:
    type = SEQUENCE

    preprocessing_defaults = {
        "sequence_length_limit": 256,
        "most_common": 20000,
        "padding_symbol": PADDING_SYMBOL,
        "unknown_symbol": UNKNOWN_SYMBOL,
        "padding": "right",
        "tokenizer": "space",
        "lowercase": False,
        "vocab_file": None,
        "missing_value_strategy": FILL_WITH_CONST,
        "fill_value": UNKNOWN_SYMBOL,
    }

    preprocessing_schema = {
        "sequence_length_limit": {
            "type": "integer",
            "minimum": 0
        },
        "most_common": {
            "type": "integer",
            "minimum": 0
        },
        "padding_symbol": {
            "type": "string"
        },
        "unknown_symbol": {
            "type": "string"
        },
        "padding": {
            "type": "string",
            "enum": ["right", "left"]
        },
        "tokenizer": {
            "type": "string",
            "enum": sorted(list(tokenizer_registry.keys()))
        },
        "lowercase": {
            "type": "boolean"
        },
        "vocab_file": {
            "type": ["string", "null"]
        },
        "missing_value_strategy": {
            "type": "string",
            "enum": MISSING_VALUE_STRATEGY_OPTIONS
        },
        "fill_value": {
            "type": "string"
        },
        "computed_fill_value": {
            "type": "string"
        },
    }

    @staticmethod
    def cast_column(column, backend):
        return column

    @staticmethod
    def get_feature_meta(column, preprocessing_parameters, backend):
        column = column.astype(str)
        idx2str, str2idx, str2freq, max_length, _, _, _ = create_vocabulary(
            column,
            preprocessing_parameters["tokenizer"],
            lowercase=preprocessing_parameters["lowercase"],
            num_most_frequent=preprocessing_parameters["most_common"],
            vocab_file=preprocessing_parameters["vocab_file"],
            unknown_symbol=preprocessing_parameters["unknown_symbol"],
            padding_symbol=preprocessing_parameters["padding_symbol"],
            processor=backend.df_engine,
        )
        max_length = min(preprocessing_parameters["sequence_length_limit"],
                         max_length)
        return {
            "idx2str": idx2str,
            "str2idx": str2idx,
            "str2freq": str2freq,
            "vocab_size": len(idx2str),
            "max_sequence_length": max_length,
        }

    @staticmethod
    def feature_data(column, metadata, preprocessing_parameters, backend):
        sequence_data = build_sequence_matrix(
            sequences=column,
            inverse_vocabulary=metadata["str2idx"],
            tokenizer_type=preprocessing_parameters["tokenizer"],
            length_limit=metadata["max_sequence_length"],
            padding_symbol=preprocessing_parameters["padding_symbol"],
            padding=preprocessing_parameters["padding"],
            unknown_symbol=preprocessing_parameters["unknown_symbol"],
            lowercase=preprocessing_parameters["lowercase"],
            tokenizer_vocab_file=preprocessing_parameters["vocab_file"],
            processor=backend.df_engine,
        )
        return sequence_data

    @staticmethod
    def add_feature_data(feature, input_df, proc_df, metadata,
                         preprocessing_parameters, backend,
                         skip_save_processed_input):
        sequence_data = SequenceFeatureMixin.feature_data(
            input_df[feature[COLUMN]].astype(str), metadata[feature[NAME]],
            preprocessing_parameters, backend)
        proc_df[feature[PROC_COLUMN]] = sequence_data
        return proc_df
Example 9
class SetFeatureMixin:
    type = SET
    preprocessing_defaults = {
        'tokenizer': 'space',
        'most_common': 10000,
        'lowercase': False,
        'missing_value_strategy': FILL_WITH_CONST,
        'fill_value': UNKNOWN_SYMBOL
    }

    preprocessing_schema = {
        'tokenizer': {
            'type': 'string',
            'enum': sorted(list(tokenizer_registry.keys()))
        },
        'most_common': {
            'type': 'integer',
            'minimum': 0
        },
        'lowercase': {
            'type': 'boolean'
        },
        'missing_value_strategy': {
            'type': 'string',
            'enum': MISSING_VALUE_STRATEGY_OPTIONS
        },
        'fill_value': {
            'type': 'string'
        },
        'computed_fill_value': {
            'type': 'string'
        },
    }

    @staticmethod
    def cast_column(feature, dataset_df, backend):
        return dataset_df

    @staticmethod
    def get_feature_meta(column, preprocessing_parameters, backend):
        column = column.astype(str)
        idx2str, str2idx, str2freq, max_size, _, _, _ = create_vocabulary(
            column,
            preprocessing_parameters['tokenizer'],
            num_most_frequent=preprocessing_parameters['most_common'],
            lowercase=preprocessing_parameters['lowercase'],
            processor=backend.df_engine)
        return {
            'idx2str': idx2str,
            'str2idx': str2idx,
            'str2freq': str2freq,
            'vocab_size': len(str2idx),
            'max_set_size': max_size
        }

    @staticmethod
    def feature_data(column, metadata, preprocessing_parameters, backend):
        def to_dense(x):
            feature_vector = set_str_to_idx(
                x, metadata['str2idx'], preprocessing_parameters['tokenizer'])

            set_vector = np.zeros((len(metadata['str2idx']), ))
            set_vector[feature_vector] = 1
            return set_vector.astype(bool)  # np.bool was removed in NumPy 1.24

        return backend.df_engine.map_objects(column, to_dense)

    @staticmethod
    def add_feature_data(feature, input_df, proc_df, metadata,
                         preprocessing_parameters, backend,
                         skip_save_processed_input):
        proc_df[feature[PROC_COLUMN]] = SetFeatureMixin.feature_data(
            input_df[feature[COLUMN]].astype(str), metadata[feature[NAME]],
            preprocessing_parameters, backend)
        return proc_df
Example 10
class TextFeatureMixin:
    type = TEXT

    preprocessing_defaults = {
        'char_tokenizer': 'characters',
        'char_vocab_file': None,
        'char_sequence_length_limit': 1024,
        'char_most_common': 70,
        'word_tokenizer': 'space_punct',
        'pretrained_model_name_or_path': None,
        'word_vocab_file': None,
        'word_sequence_length_limit': 256,
        'word_most_common': 20000,
        'padding_symbol': PADDING_SYMBOL,
        'unknown_symbol': UNKNOWN_SYMBOL,
        'padding': 'right',
        'lowercase': True,
        'missing_value_strategy': FILL_WITH_CONST,
        'fill_value': UNKNOWN_SYMBOL
    }

    preprocessing_schema = {
        'char_tokenizer': {
            'type': 'string',
            'enum': sorted(list(tokenizer_registry.keys()))
        },
        'char_vocab_file': {
            'type': ['string', 'null']
        },
        'char_sequence_length_limit': {
            'type': 'integer',
            'minimum': 0
        },
        'char_most_common': {
            'type': 'integer',
            'minimum': 0
        },
        'word_tokenizer': {
            'type': 'string',
            'enum': sorted(list(tokenizer_registry.keys()))
        },
        'pretrained_model_name_or_path': {
            'type': ['string', 'null']
        },
        'word_vocab_file': {
            'type': ['string', 'null']
        },
        'word_sequence_length_limit': {
            'type': 'integer',
            'minimum': 0
        },
        'word_most_common': {
            'type': 'integer',
            'minimum': 0
        },
        'padding_symbol': {
            'type': 'string'
        },
        'unknown_symbol': {
            'type': 'string'
        },
        'padding': {
            'type': 'string',
            'enum': ['right', 'left']
        },
        'lowercase': {
            'type': 'boolean'
        },
        'missing_value_strategy': {
            'type': 'string',
            'enum': MISSING_VALUE_STRATEGY_OPTIONS
        },
        'fill_value': {
            'type': 'string'
        },
        'computed_fill_value': {
            'type': 'string'
        },
    }

    @staticmethod
    def cast_column(column, backend):
        return column

    @staticmethod
    def feature_meta(column, preprocessing_parameters, backend):
        (
            char_idx2str,
            char_str2idx,
            char_str2freq,
            char_max_len,
            char_pad_idx,
            char_pad_symbol,
            char_unk_symbol,
        ) = create_vocabulary(
            column,
            tokenizer_type='characters',
            num_most_frequent=preprocessing_parameters['char_most_common'],
            lowercase=preprocessing_parameters['lowercase'],
            unknown_symbol=preprocessing_parameters['unknown_symbol'],
            padding_symbol=preprocessing_parameters['padding_symbol'],
            pretrained_model_name_or_path=preprocessing_parameters[
                'pretrained_model_name_or_path'],
            processor=backend.df_engine)
        (
            word_idx2str,
            word_str2idx,
            word_str2freq,
            word_max_len,
            word_pad_idx,
            word_pad_symbol,
            word_unk_symbol,
        ) = create_vocabulary(
            column,
            tokenizer_type=preprocessing_parameters['word_tokenizer'],
            num_most_frequent=preprocessing_parameters['word_most_common'],
            lowercase=preprocessing_parameters['lowercase'],
            vocab_file=preprocessing_parameters['word_vocab_file'],
            unknown_symbol=preprocessing_parameters['unknown_symbol'],
            padding_symbol=preprocessing_parameters['padding_symbol'],
            pretrained_model_name_or_path=preprocessing_parameters[
                'pretrained_model_name_or_path'],
            processor=backend.df_engine)
        return (
            char_idx2str,
            char_str2idx,
            char_str2freq,
            char_max_len,
            char_pad_idx,
            char_pad_symbol,
            char_unk_symbol,
            word_idx2str,
            word_str2idx,
            word_str2freq,
            word_max_len,
            word_pad_idx,
            word_pad_symbol,
            word_unk_symbol,
        )

    @staticmethod
    def get_feature_meta(column, preprocessing_parameters, backend):
        column = column.astype(str)
        tf_meta = TextFeatureMixin.feature_meta(column,
                                                preprocessing_parameters,
                                                backend)
        (
            char_idx2str,
            char_str2idx,
            char_str2freq,
            char_max_len,
            char_pad_idx,
            char_pad_symbol,
            char_unk_symbol,
            word_idx2str,
            word_str2idx,
            word_str2freq,
            word_max_len,
            word_pad_idx,
            word_pad_symbol,
            word_unk_symbol,
        ) = tf_meta
        char_max_len = min(
            preprocessing_parameters['char_sequence_length_limit'],
            char_max_len)
        word_max_len = min(
            preprocessing_parameters['word_sequence_length_limit'],
            word_max_len)
        return {
            'char_idx2str': char_idx2str,
            'char_str2idx': char_str2idx,
            'char_str2freq': char_str2freq,
            'char_vocab_size': len(char_idx2str),
            'char_max_sequence_length': char_max_len,
            'char_pad_idx': char_pad_idx,
            'char_pad_symbol': char_pad_symbol,
            'char_unk_symbol': char_unk_symbol,
            'word_idx2str': word_idx2str,
            'word_str2idx': word_str2idx,
            'word_str2freq': word_str2freq,
            'word_vocab_size': len(word_idx2str),
            'word_max_sequence_length': word_max_len,
            'word_pad_idx': word_pad_idx,
            'word_pad_symbol': word_pad_symbol,
            'word_unk_symbol': word_unk_symbol,
        }

    @staticmethod
    def feature_data(column, metadata, preprocessing_parameters, backend):
        char_data = build_sequence_matrix(
            sequences=column,
            inverse_vocabulary=metadata['char_str2idx'],
            tokenizer_type=preprocessing_parameters['char_tokenizer'],
            length_limit=metadata['char_max_sequence_length'],
            padding_symbol=metadata['char_pad_symbol'],
            padding=preprocessing_parameters['padding'],
            unknown_symbol=metadata['char_unk_symbol'],
            lowercase=preprocessing_parameters['lowercase'],
            tokenizer_vocab_file=preprocessing_parameters['char_vocab_file'],
            pretrained_model_name_or_path=preprocessing_parameters[
                'pretrained_model_name_or_path'],
            processor=backend.df_engine)
        word_data = build_sequence_matrix(
            sequences=column,
            inverse_vocabulary=metadata['word_str2idx'],
            tokenizer_type=preprocessing_parameters['word_tokenizer'],
            length_limit=metadata['word_max_sequence_length'],
            padding_symbol=metadata['word_pad_symbol'],
            padding=preprocessing_parameters['padding'],
            unknown_symbol=metadata['word_unk_symbol'],
            lowercase=preprocessing_parameters['lowercase'],
            tokenizer_vocab_file=preprocessing_parameters['word_vocab_file'],
            pretrained_model_name_or_path=preprocessing_parameters[
                'pretrained_model_name_or_path'],
            processor=backend.df_engine)

        return char_data, word_data

    @staticmethod
    def add_feature_data(feature, input_df, proc_df, metadata,
                         preprocessing_parameters, backend,
                         skip_save_processed_input):
        chars_data, words_data = TextFeatureMixin.feature_data(
            input_df[feature[COLUMN]].astype(str), metadata[feature[NAME]],
            preprocessing_parameters, backend)
        proc_df['{}_char'.format(feature[PROC_COLUMN])] = chars_data
        proc_df['{}_word'.format(feature[PROC_COLUMN])] = words_data
        return proc_df