def __init__(self, fname, new_mask_fname, pad_symbol, symbol_to_mask=None,
             padding_mode='both', axis=1, **kwargs):
    """
    :param fname: str or list of str names of the fields that should be
                  padded.
    :param new_mask_fname: str or list of str names of the fields that will
                           contain the masks.
    :param pad_symbol: a symbol that should be used for padding.
    :param symbol_to_mask: a symbol (token) that should be masked in
                           sequences, e.g. it can be used to mask <UNK>
                           tokens.
    :param padding_mode: 'left', 'right', or 'both'. Defines the side(s) to
                         which padding symbols are appended.
    :param axis: the axis of the data to which padding should be applied.
                 Currently only axes 1 and 2 are supported.
    """
    validate_field_names(fname)
    super(Padder, self).__init__(**kwargs)
    self.fnames = listify(fname)
    self.mask_fnames = listify(new_mask_fname)
    assert len(self.mask_fnames) == len(self.fnames)
    self.pad_symbol = pad_symbol
    self.symbol_to_mask = symbol_to_mask
    self.padding_mode = padding_mode
    self.axis = axis
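# Illustrative usage sketch (not from the library itself): configuring a
# Padder step to pad a tokenized field and emit a corresponding mask field.
# The field names 'tokens' and 'tokens_mask' are hypothetical.
padder = Padder(fname='tokens', new_mask_fname='tokens_mask',
                pad_symbol='<PAD>', symbol_to_mask='<UNK>',
                padding_mode='both', axis=1)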
def __init__(self, field_names, window_size=5, step_size=1,
             only_full_windows=False, new_window_field_name_suffix='window',
             **kwargs):
    """
    :param field_names: str or list of str names of the fields over which
                        the window should slide.
    :param window_size: the number of elements in each window.
    :param step_size: the number of elements by which the window is shifted
                      at each step.
    :param only_full_windows: if set to True, guarantees that all windows
                              will be of the same size.
    :param new_window_field_name_suffix: suffix for all newly created fields.
    """
    validate_field_names(field_names)
    super(WindowSlider, self).__init__(**kwargs)
    self.field_names = listify(field_names)
    self.window_size = window_size
    self.step_size = step_size
    self.only_full_windows = only_full_windows
    self.new_windw_fn_suffix = new_window_field_name_suffix
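# Illustrative usage sketch (not from the library itself): sliding windows
# of 3 elements with a step of 1 over a hypothetical 'tokens' field; new
# fields receive the default 'window' suffix (the exact naming of the new
# fields is an assumption here).
slider = WindowSlider(field_names='tokens', window_size=3, step_size=1,
                      only_full_windows=True)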
def __init__(self, fnames, **kwargs):
    """
    :param fnames: str or list of str names of the fields that should be
                   selected from data-chunks. All other fields are discarded.
    """
    validate_field_names(fnames)
    super(FieldSelector, self).__init__(**kwargs)
    self.fnames = listify(fnames)
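# Illustrative usage sketch (not from the library itself): keep only the
# hypothetical 'tokens' and 'label' fields of each data-chunk.
selector = FieldSelector(fnames=['tokens', 'label'])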
def __init__(self, field_name_to_func, **kwargs):
    """
    :param field_name_to_func: a dict mapping field names to functions of
                               the form x -> y.
    """
    validate_field_names(list(field_name_to_func.keys()))
    for f in field_name_to_func.values():
        if not callable(f):
            raise ValueError("Please provide valid callable functions.")
    super(FunctionApplier, self).__init__(**kwargs)
    self.field_name_to_func = field_name_to_func
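# Illustrative usage sketch (not from the library itself): lower-case a
# hypothetical 'text' field and cast a hypothetical 'rating' field to int.
# Whether each function is applied per element or to the whole field is
# determined by the step's implementation, which is not shown here.
applier = FunctionApplier({'text': lambda x: x.lower(), 'rating': int})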
def __init__(self, fnames, tokenization_func=lambda x: x.split(),
             token_cleaning_func=None, token_matching_func=None,
             lower_case=True, **kwargs):
    """
    :param fnames: str or list of str names of the fields that should be
                   tokenized.
    :param tokenization_func: a function that splits string sequences into
                              sequences of tokens. The form is x -> y, where
                              x is a str and y is a list/array of tokens.
    :param token_cleaning_func: a function responsible for normalization of
                                tokens, elimination of unwanted characters,
                                etc. The form is x -> y, where x is a str
                                token and y is a clean str token.
    :param token_matching_func: a function that matches raw text tokens to a
                                special set of tokens, e.g. Twitter emoticons
                                ':)' -> '<POSIT_EMOT>'. The form is x -> y,
                                where x is a str token and y is either False,
                                if it does not match, or a str token
                                otherwise.
    :param lower_case: whether to lower-case strings before tokenization.
    """
    validate_field_names(fnames)
    msg = "Please provide a valid callable %s function."
    if not callable(tokenization_func):
        raise ValueError(msg % "tokenization")
    if token_cleaning_func is not None and not callable(token_cleaning_func):
        raise ValueError(msg % "token cleaning")
    if token_matching_func is not None and not callable(token_matching_func):
        raise ValueError(msg % "token matching")
    super(TokenProcessor, self).__init__(**kwargs)
    self.field_names = listify(fnames)
    self.tokenization_func = tokenization_func
    self.token_cleaning_func = token_cleaning_func
    self.token_matching_func = token_matching_func
    self.lower_case = lower_case
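# Illustrative usage sketch (not from the library itself): whitespace
# tokenization of a hypothetical 'review' field with simple punctuation
# stripping and emoticon matching.
processor = TokenProcessor(
    fnames='review',
    tokenization_func=lambda x: x.split(),
    token_cleaning_func=lambda t: t.strip('.,!?'),
    token_matching_func=lambda t: '<POSIT_EMOT>' if t == ':)' else False,
    lower_case=True)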
def create(self, data_source, data_fnames, min_count=1, max_size=None,
           add_default_special_symbols=True):
    """
    Creates the vocabulary by passing data_source to the corresponding
    data-chunk iterable and fetching chunks out of it. Assumes that tokens
    are strings; if they are not, it will try to convert them to strings.

    :param data_source: dict of attributes that should be passed to the
                        data_chunk_iterable.
    :param data_fnames: str or list of str names of the chunk fields whose
                        symbols should be used to create the vocabulary.
    :param min_count: minimum frequency of a token to be added to the
                      vocabulary.
    :param max_size: maximum number of symbols to store in the vocabulary.
    :param add_default_special_symbols: whether default symbols, such as
                                        <PAD> and <UNK>, should be added. In
                                        some cases, e.g. a labels vocabulary,
                                        those symbols are not necessary.
    """
    validate_field_names(data_fnames)
    data_fnames = listify(data_fnames)
    dfn_formatted_str = ', '.join(["'%s'" % dfn for dfn in data_fnames])
    logger.info("Creating a vocabulary from %s data_source, and %s"
                " chunk field(s). min_count: %d, max_vocab_size: %s."
                % (data_source, dfn_formatted_str, min_count, str(max_size)))

    # count token occurrences over all chunks and fields
    temp_token_to_count = {}
    for data_chunk in self._data_chunk_iterable.iter(**data_source):
        for data_attr in data_fnames:
            for tokens in data_chunk[data_attr]:
                if not isinstance(tokens, (list, np.ndarray)):
                    tokens = [tokens]
                for token in flatten(tokens):
                    if token == '':
                        continue
                    if not isinstance(token, (int, float, str)):
                        raise TypeError("Token is not of a correct type"
                                        " (should be int, float, or str).")
                    if isinstance(token, (int, float)):
                        token = str(token)
                    if token not in temp_token_to_count:
                        temp_token_to_count[token] = 0
                    temp_token_to_count[token] += 1

    # populate the collectors
    for token, count in sort_hash(temp_token_to_count, by_key=False):
        if max_size and len(self) >= max_size:
            break
        if count >= min_count:
            symbol = self.add_symbol(token, count)
            self._total_count += count
            if match_special_symbol(token):
                self.special_symbols[token] = symbol

    if add_default_special_symbols:
        self.add_special_symbols(DEFAULT_SPECIAL_TOKENS)

    logger.info("Created the vocabulary.")
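# Illustrative usage sketch (not from the library itself): assuming `vocab`
# is an already constructed vocabulary object wired to a data-chunk
# iterable, build it over a hypothetical 'tokens' field. The 'data_path'
# key of data_source and the file name are assumptions.
vocab.create(data_source={'data_path': 'train.csv'},
             data_fnames='tokens', min_count=2, max_size=50000)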