def __init__(self,
                 field_names,
                 pad_symbol,
                 symbol_to_mask=None,
                 padding_mode='both',
                 axis=1,
                 new_mask_field_name_suffix="mask",
                 **kwargs):
        """
        :param field_names: str or list of str; names of the fields that
                            should be padded.
        :param pad_symbol: the symbol to use for padding.
        :param symbol_to_mask: a symbol (token) that should be masked in
                               sequences, e.g. <UNK> tokens.
        :param padding_mode: 'left', 'right', or 'both'; the side(s) on
                             which padding symbols are appended.
        :param axis: the axis of the data to which padding is applied.
                     Currently only axes 1 and 2 are supported.
        :param new_mask_field_name_suffix: the suffix of the new mask field
                                           created for each entry in
                                           field_names.
                                           See create_mask_field_name().
        """
        validate_field_names(field_names)  # raises on invalid input

        super(Padder, self).__init__(**kwargs)
        self.field_names = listify(field_names)
        self.pad_symbol = pad_symbol
        self.symbol_to_mask = symbol_to_mask
        self.padding_mode = padding_mode
        self.axis = axis
        self.new_mask_fn_suffix = new_mask_field_name_suffix
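# --- Usage sketch (not from the original source) ---
# Constructing a Padder for a hypothetical "tokens" field; "<PAD>" and
# "<UNK>" are illustrative symbols, not names taken from the original code.
padder = Padder(field_names="tokens",
                pad_symbol="<PAD>",
                symbol_to_mask="<UNK>",   # mask unknown tokens
                padding_mode="both",      # pad on both sides
                axis=1)
# For each padded field a companion mask field is created; its name is
# built from the field name and new_mask_field_name_suffix via
# create_mask_field_name().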
    def __init__(self,
                 field_names,
                 window_size=5,
                 step_size=1,
                 only_full_windows=False,
                 new_window_field_name_suffix='window',
                 **kwargs):
        """
        :param field_names: str or list of str; names of the fields that
                            should be slid over.
        :param window_size: the number of elements in each window.
        :param step_size: the stride between the starts of consecutive
                          windows.
        :param only_full_windows: if True, guarantees that all windows
                                  will be of the same size.
        :param new_window_field_name_suffix: suffix for all newly created
                                             fields.
        """
        validate_field_names(field_names)  # raises on invalid input

        super(WindowSlider, self).__init__(**kwargs)
        self.field_names = listify(field_names)
        self.window_size = window_size
        self.step_size = step_size
        self.only_full_windows = only_full_windows
        self.new_window_fn_suffix = new_window_field_name_suffix
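# --- Usage sketch (not from the original source) ---
# A WindowSlider producing windows of 5 elements with a stride of 1 over a
# hypothetical "tokens" field; only_full_windows=True drops trailing
# windows shorter than window_size.
slider = WindowSlider(field_names="tokens",
                      window_size=5,
                      step_size=1,
                      only_full_windows=True)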
    def create(self, data_source, data_field_names):
        """
        Create vocabulary by passing data_source to the corresponding data-chunk
        iterable and fetching chunks out of it.

        Assumes that tokens are strings; if they are not, an attempt is made
        to convert them to strings.

        :param data_source: dictionary of attributes that should be passed to
                            the data_chunk_iterable.
        :param data_field_names: str or list of str; names of the data-chunk
                                 fields whose symbols should be used to
                                 create the vocabulary.
        """
        logger.info("Creating a vocabulary for data_attributes: '%s'",
                    data_field_names)
        validate_field_names(data_field_names)  # raises on invalid input

        data_field_names = listify(data_field_names)
        temp_token_to_count = {}
        for data_chunk in self._data_chunk_iterable.iter(**data_source):
            for data_attr in data_field_names:
                for tokens in data_chunk[data_attr]:

                    if not isinstance(tokens, (list, np.ndarray)):
                        tokens = [tokens]

                    for token in flatten(tokens):
                        if token == '':
                            continue

                        if not isinstance(token, (int, float, str)):
                            raise TypeError("Token is not of a correct type"
                                            " (should be int, float, or"
                                            " str)")

                        if isinstance(token, (int, float)):
                            token = str(token)

                        if token not in temp_token_to_count:
                            temp_token_to_count[token] = 0
                        temp_token_to_count[token] += 1

        # populate the collectors
        for token, count in sort_hash(temp_token_to_count, by_key=False):
            if self.max_size and len(self) >= self.max_size:
                break
            if count >= self.min_count:
                symbol = self._add_symbol(token, count)
                self._total_count += count
                if match_special_symbol(token):
                    self.special_symbols[token] = symbol
        if self.add_default_special_symbols:
            self._add_special_symbols(DEFAULT_SPECIAL_TOKENS)
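# --- Usage sketch (not from the original source) ---
# The vocabulary class itself is not shown above; "vocab" stands for an
# already-constructed instance, and the "data_path" key is a hypothetical
# attribute forwarded to the data-chunk iterable via **data_source.
vocab.create(data_source={"data_path": "train.csv"},
             data_field_names=["tokens", "labels"])
# Tokens are accumulated across all chunks, sorted by count, and added to
# the vocabulary until max_size is reached; tokens with fewer than
# min_count occurrences are skipped.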
    def __init__(self, field_names, **kwargs):
        """
        :param field_names: str or list of str; names of the fields that
                            should be kept in data-chunks. All other fields
                            are discarded.
        """
        validate_field_names(field_names)  # raises on invalid input

        super(FieldsSelector, self).__init__(**kwargs)
        self.field_names = listify(field_names)
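# --- Usage sketch (not from the original source) ---
# Keep only two hypothetical fields of each data-chunk and discard the rest.
selector = FieldsSelector(field_names=["tokens", "labels"])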
    def __init__(self, field_name_to_func, **kwargs):
        """
        :param field_name_to_func: a dict mapping field names to functions
                                   of the form x -> y that are applied to
                                   the corresponding fields' values.
        """
        validate_field_names(field_name_to_func.keys())  # raises on invalid input
        for f in field_name_to_func.values():
            if not callable(f):
                raise ValueError(
                    "All values of field_name_to_func must be callable.")

        super(FunctionApplier, self).__init__(**kwargs)
        self.field_name_to_func = field_name_to_func
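# --- Usage sketch (not from the original source) ---
# Apply a per-field transformation to each data-chunk: here a hypothetical
# "text" field is lower-cased and a "rating" field is cast to float.
applier = FunctionApplier(field_name_to_func={
    "text": lambda x: x.lower(),
    "rating": float,
})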
    def __init__(self, field_names, tokenization_func=lambda x: x.split(),
                 token_cleaning_func=None, token_matching_func=None,
                 lower_case=True, **kwargs):
        """
        :param field_names: str or list of str; names of the fields that
                            should be tokenized.
        :param tokenization_func: a function that splits string sequences into
                                  sequences of tokens. The form should be:
                                  x -> y where x is a str and y is a list/array
                                  of tokens.
        :param token_cleaning_func: the function responsible for normalization
                                    of tokens, elimination of unwanted
                                    characters, etc. format: x -> y, where x is
                                    a str token, and y is a clean str token.
        :param token_matching_func: a function that maps raw text tokens
                                    to a special set of tokens, e.g. twitter
                                    emoticons ':)' -> '<POSIT_EMOT>'.
                                    The format: x -> y, where x is a str
                                    token, and y is either False, if it does
                                    not match, or a string token otherwise.
        :param lower_case: whether to lower-case strings before tokenization.
        """
        validate_field_names(field_names)  # raises on invalid input
        msg = "Please provide a valid callable %s function."
        if not callable(tokenization_func):
            raise ValueError(msg % "tokenization")
        if token_cleaning_func is not None and not callable(token_cleaning_func):
            raise ValueError(msg % "token cleaning")
        if token_matching_func is not None and not callable(token_matching_func):
            raise ValueError(msg % "token matching")

        super(TokenProcessor, self).__init__(**kwargs)
        self.field_names = listify(field_names)
        self.tokenization_func = tokenization_func
        self.token_cleaning_func = token_cleaning_func
        self.token_matching_func = token_matching_func
        self.lower_case = lower_case
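# --- Usage sketch (not from the original source) ---
# A TokenProcessor over a hypothetical "text" field: whitespace
# tokenization, stripping of leading/trailing punctuation, and mapping of
# the ':)' emoticon to a special token. The matching function returns
# False for non-matches, as the docstring requires.
processor = TokenProcessor(
    field_names="text",
    tokenization_func=lambda s: s.split(),
    token_cleaning_func=lambda t: t.strip(".,!?"),
    token_matching_func=lambda t: "<POSIT_EMOT>" if t == ":)" else False,
    lower_case=True)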