Exemplo n.º 1
0
 def __init__(self, doc=None, regex=CRE_TOKEN, strip=True, nonwords=False, nonwords_set=None, nonwords_regex=RE_NONWORD,
              lower=None, stem=None, ngrams=1):
     """Store the document and the tokenization options on the instance.

     Args:
       doc: initial document; stored as `self.doc` after passing through `to_ascii`.
       regex: stored as `self.regex`; a string is compiled with `re.compile`,
         anything else is stored unchanged.
       strip: stripping configuration, which drives two attributes:
         `self.strip_chars` is the string itself when `strip` is a string, a default
         set of characters when `strip is True`, and None otherwise;
         `self.strip` is None for a falsy value, `strip` itself when it is callable,
         and `str_strip` for any other truthy value (including a string or True).
       nonwords: stored as `self.nonwords`. When it is truthy and `nonwords_regex` is
         not a string, it is also converted to a set that replaces `nonwords_set`;
         if it is not iterable (e.g. True) a small built-in set is used instead and
         `self.nonwords` is reset to False.
       nonwords_set: stored as `self.nonwords_set`; a falsy value becomes an empty set.
       nonwords_regex: stored as `self.nonwords_regex`; a string is compiled with
         `re.compile`, anything else is stored unchanged.
       lower: `self.lower` is `lower` itself when callable, `str_lower` for any other
         truthy value, and None otherwise.
       stem: handed to `make_named_stemmer`, whose two results are stored as
         `self.stemmer_name` and `self.stem`.
       ngrams: ngram degree, stored as `self.ngrams`; a falsy value becomes 1.
     """
     # Specific set of characters to strip, if any.
     if isinstance(strip, basestring):
         self.strip_chars = strip
     elif strip is True:
         self.strip_chars = '-_*`()"' + '"'
     else:
         self.strip_chars = None

     # Strip function. Truthiness is tested before callability, so a falsy value
     # (even a callable one) disables it. A string `strip` is truthy and therefore
     # ends up with `str_strip` here in addition to `strip_chars` above.
     if not strip:
         self.strip = None
     elif callable(strip):
         self.strip = strip
     else:
         self.strip = str_strip

     self.doc = to_ascii(doc)

     # Accept either a pattern string or an already compiled pattern.
     self.regex = re.compile(regex) if isinstance(regex, basestring) else regex

     self.nonwords = nonwords
     self.nonwords_set = nonwords_set or set()
     self.nonwords_regex = nonwords_regex

     # Lowercasing function: a callable is used as given, any other truthy value
     # selects `str_lower`.
     if callable(lower):
         self.lower = lower
     elif lower:
         self.lower = str_lower
     else:
         self.lower = None

     # `stem` can be a callable Stemmer instance or just a function.
     self.stemmer_name, self.stem = make_named_stemmer(stem)

     # ngram degree, number of ngrams per token
     self.ngrams = ngrams or 1

     # NOTE(review): because this is an if/elif pair, `nonwords` is only examined
     # when `nonwords_regex` is NOT a string -- confirm that this is intended.
     if isinstance(nonwords_regex, basestring):
         self.nonwords_regex = re.compile(nonwords_regex)
     elif nonwords:
         fallback_nonwords = ['None', 'none', 'and', 'but']
         try:
             # An iterable of nonwords replaces whatever `nonwords_set` was given.
             self.nonwords_set = set(nonwords)
         except TypeError:
             # Not iterable (e.g. True): fall back to the built-in nonword list.
             self.nonwords_set = set(fallback_nonwords)
             # `nonwords` is truthy in this branch, so this stores False.
             self.nonwords = not bool(nonwords)
Exemplo n.º 2
0
    def __call__(self, doc):
        """Point this tokenizer at a new document and return the tokenizer itself.

        Nothing is tokenized here: the document is only passed through `to_ascii`
        and stored on the instance, to be consumed when the instance is iterated.

        >>> list(Tokenizer()('new string to parse'))
        ['new', 'string', 'to', 'parse']
        """
        ascii_doc = to_ascii(doc)
        self.doc = ascii_doc
        # Returning the instance is what makes the chained form work:
        # Tokenizer()('doc (str) to parse even though default doc is None')
        return self