def __init__(self, field_names, pad_symbol, symbol_to_mask=None,
             padding_mode='both', axis=1,
             new_mask_field_name_suffix="mask", **kwargs):
    """Initialize a Padder that pads (and optionally masks) data fields.

    :param field_names: str or list of str names of fields that should be
        padded.
    :param pad_symbol: a symbol that should be used for padding.
    :param symbol_to_mask: a symbol (token) that should be masked in
        sequences. E.g. can be used to mask <UNK> tokens.
    :param padding_mode: 'left', 'right', or 'both'. Defines the side to
        which padding symbols should be appended.
    :param axis: defines an axis of data to which padding should be applied.
        Currently only axes 1 or 2 are supported.
    :param new_mask_field_name_suffix: a suffix of a new padded field that is
        created for each of field_names. See create_mask_field_name().
    """
    # Let validation errors propagate as-is: wrapping the call in
    # `try/except StandardError as e: raise e` was a no-op that, in
    # Python 2, also destroyed the original traceback.
    validate_field_names(field_names)
    super(Padder, self).__init__(**kwargs)
    self.field_names = listify(field_names)
    self.pad_symbol = pad_symbol
    self.symbol_to_mask = symbol_to_mask
    self.padding_mode = padding_mode
    self.axis = axis
    self.new_mask_fn_suffix = new_mask_field_name_suffix
def __init__(self, field_names, window_size=5, step_size=1,
             only_full_windows=False, new_window_field_name_suffix='window',
             **kwargs):
    """Initialize a WindowSlider that produces sliding windows over fields.

    :param field_names: str or list of str corresponding to fields which
        should be slided over.
    :param window_size: self-explanatory.
    :param step_size: self-explanatory.
    :param only_full_windows: if set to True guarantees that all windows
        will be of the same size.
    :param new_window_field_name_suffix: suffix for all newly created fields.
    """
    # Validation errors propagate directly; the former
    # `except StandardError as e: raise e` wrapper added nothing and
    # truncated the traceback under Python 2.
    validate_field_names(field_names)
    super(WindowSlider, self).__init__(**kwargs)
    self.field_names = listify(field_names)
    self.window_size = window_size
    self.step_size = step_size
    self.only_full_windows = only_full_windows
    # NOTE: attribute name keeps the historical 'windw' typo because
    # external code may already read it.
    self.new_windw_fn_suffix = new_window_field_name_suffix
def create(self, data_source, data_field_names):
    """Create the vocabulary from data chunks.

    Passes data_source to the corresponding data-chunk iterable and fetches
    chunks out of it. Assumes that tokens are strings; if they are not, it
    will try to convert them to strings (ints/floats are stringified).

    :param data_source: dictionary of attributes that should be passed to
        the data_chunk_iterable.
    :param data_field_names: str or list of str attributes that map to the
        symbols which should be used to create the vocabulary.
    :raises TypeError: if a token is not int, float, str, or unicode.
    """
    logger.info("Creating a vocabulary for data_attributes: '%s'"
                % data_field_names)
    # Validation errors propagate directly; re-raising them from an
    # except-block was a no-op that lost the traceback in Python 2.
    validate_field_names(data_field_names)
    data_field_names = listify(data_field_names)

    # First pass: accumulate raw token frequencies.
    temp_token_to_count = {}
    for data_chunk in self._data_chunk_iterable.iter(**data_source):
        for data_attr in data_field_names:
            for tokens in data_chunk[data_attr]:
                # Scalar field values are treated as one-token sequences.
                if not isinstance(tokens, (list, np.ndarray)):
                    tokens = [tokens]
                for token in flatten(tokens):
                    if token == '':
                        continue
                    if not isinstance(token, (int, float, unicode, str)):
                        raise TypeError("Token is not of a correct type"
                                        " (should be int, float, str,"
                                        " unicode)")
                    if isinstance(token, (int, float)):
                        token = str(token)
                    temp_token_to_count[token] = \
                        temp_token_to_count.get(token, 0) + 1

    # Second pass: populate the collectors, most frequent tokens first
    # (sort_hash with by_key=False sorts by count), honoring max_size
    # and min_count limits.
    for token, count in sort_hash(temp_token_to_count, by_key=False):
        if self.max_size and len(self) >= self.max_size:
            break
        if count >= self.min_count:
            symbol = self._add_symbol(token, count)
            self._total_count += count
            if match_special_symbol(token):
                self.special_symbols[token] = symbol

    if self.add_default_special_symbols:
        self._add_special_symbols(DEFAULT_SPECIAL_TOKENS)
def __init__(self, field_names, **kwargs):
    """Initialize a FieldsSelector.

    :param field_names: str or list of str names of fields that should be
        selected from data-chunks. Other fields are discarded.
    """
    # Validation errors propagate directly; the former
    # `except StandardError as e: raise e` wrapper was redundant and
    # truncated the traceback under Python 2.
    validate_field_names(field_names)
    super(FieldsSelector, self).__init__(**kwargs)
    self.field_names = listify(field_names)
def __init__(self, field_name_to_func, **kwargs):
    """Initialize a FunctionApplier.

    :param field_name_to_func: a dict of mappings, where values are
        functions of the form: x -> y.
    :raises ValueError: if any mapped value is not callable.
    """
    # Validation errors propagate directly; re-raising them from an
    # except-block was a no-op that lost the traceback in Python 2.
    validate_field_names(field_name_to_func.keys())
    for f in field_name_to_func.values():
        if not callable(f):
            raise ValueError(
                "Please provide all valid callable functions.")
    super(FunctionApplier, self).__init__(**kwargs)
    self.field_name_to_func = field_name_to_func
def __init__(self, field_names, tokenization_func=lambda x: x.split(),
             token_cleaning_func=None, token_matching_func=None,
             lower_case=True, **kwargs):
    """Initialize a TokenProcessor.

    :param field_names: str or list of str corresponding to fields that
        should be tokenized.
    :param tokenization_func: a function that splits string sequences into
        sequences of tokens. The form should be: x -> y where x is a str
        and y is a list/array of tokens.
    :param token_cleaning_func: the function responsible for normalization
        of tokens, elimination of unwanted characters, etc. Format: x -> y,
        where x is a str token, and y is a clean str token.
    :param token_matching_func: a function that matches raw text tokens to
        a special set of tokens. E.g. to twitter emoticons
        ':)' -> '<POSIT_EMOT>'. The format: x -> y, where x is a str token,
        and y is either False, if it does not match, or a string token
        otherwise.
    :param lower_case: whether to lower-case strings before tokenization.
    :raises ValueError: if a provided function is not callable.
    """
    # Validation errors propagate directly; the former
    # `except StandardError as e: raise e` wrapper added nothing and
    # truncated the traceback under Python 2.
    validate_field_names(field_names)
    msg = "Please provide a valid callable %s function."
    if not callable(tokenization_func):
        raise ValueError(msg % "tokenization")
    if token_cleaning_func is not None and not callable(token_cleaning_func):
        raise ValueError(msg % "token cleaning")
    if token_matching_func is not None and not callable(token_matching_func):
        raise ValueError(msg % "token matching")
    super(TokenProcessor, self).__init__(**kwargs)
    self.field_names = listify(field_names)
    self.tokenization_func = tokenization_func
    self.token_cleaning_func = token_cleaning_func
    self.token_matching_func = token_matching_func
    self.lower_case = lower_case