def preprocess(self, x):
    """Load a single example using this field, tokenizing if necessary.

    If the input is a Python 2 `str`, it will be converted to Unicode
    first. If `sequential=True`, it will be tokenized. Then the input
    will be optionally lowercased and passed to the user-provided
    `preprocessing` Pipeline."""
    # On Python 2, decode byte strings to Unicode before any further processing.
    if (six.PY2 and isinstance(x, six.string_types) and
            not isinstance(x, six.text_type)):
        x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
    # Sequential fields are tokenized after stripping the trailing newline.
    if self.sequential and isinstance(x, six.text_type):
        x = self.tokenize(x.rstrip('\n'))
    # Lowercase the raw string, or each token if `x` is already a token list;
    # Pipeline applies the wrapped function element-wise to lists.
    if self.lower:
        x = Pipeline(six.text_type.lower)(x)
    # `preprocessing` is the Pipeline applied after tokenizing but before
    # numericalizing. Many Datasets replace this attribute with a custom
    # preprocessor. Default: None.
    if self.preprocessing is not None:
        return self.preprocessing(x)
    else:
        return x
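# A minimal usage sketch, assuming this method belongs to torchtext's legacy,
# six-based `Field` class (where `Field` is importable from `torchtext.data`);
# the constructor arguments and example inputs below are illustrative
# assumptions, not taken from the surrounding source.
if __name__ == '__main__':
    from torchtext.data import Field

    # Sequential field: split on whitespace (the default tokenizer), then lowercase.
    text_field = Field(sequential=True, lower=True)
    print(text_field.preprocess("The quick Brown Fox\n"))  # ['the', 'quick', 'brown', 'fox']

    # Non-sequential field: no tokenization, only lowercasing of the raw string.
    label_field = Field(sequential=False, lower=True)
    print(label_field.preprocess("POSITIVE"))  # 'positive'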