import re

# smart_str / smart_unicode are Django's encoding helpers
# (django.utils.encoding; smart_unicode was renamed smart_text in Django 1.5).
from django.utils.encoding import smart_str, smart_unicode


def Tokenize(text, minsize=3, use_stopwords=True, stem=True):
    """Split text into lowercase tokens of at least minsize characters.

    Note: stem is accepted for API compatibility but is currently unused.
    """
    tokens = []
    try:
        text = smart_unicode(text)
    except UnicodeDecodeError:
        # Fall back to latin-1; if the input is not a bytestring either,
        # tokenize it as-is.
        try:
            text = text.decode("latin-1")
        except (UnicodeDecodeError, AttributeError):
            pass
    for cur_token in re_word.findall(text):
        if len(cur_token) < minsize:
            continue
        lt = cur_token.lower()
        if not use_stopwords or lt not in STOPWORDS:
            tokens.append(smart_str(lt))
            # Also index the parts of compound tokens (e.g. "e-mail" ->
            # "mail"), skipping duplicates and stopwords.
            for t in re_secondary.findall(lt):
                if smart_str(t) not in tokens and (not use_stopwords or t not in STOPWORDS):
                    tokens.append(smart_str(t))
    return tokens
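# Tokenize() relies on three module-level names that are defined elsewhere in
# the original source: re_word, re_secondary and STOPWORDS. The definitions
# below are a minimal sketch of plausible values, not the originals, so the
# function can be exercised standalone.
re_word = re.compile(r"[\w\-]+", re.UNICODE)            # whole words, incl. compounds
re_secondary = re.compile(r"[a-z0-9]{2,}", re.UNICODE)  # parts of compound words
STOPWORDS = set(["the", "and", "for", "not"])

# Example call under the sketch definitions above:
#   Tokenize("The quick e-mail for testing")
#   -> ['quick', 'e-mail', 'mail', 'testing']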
def TokenizeInput(text, minsize=3, use_stopwords=True, stem=True):
    """Parse a search query into plain, required (+) and excluded (-) terms.

    Returns a (terms, including, excluding) tuple of token lists. As with
    Tokenize(), stem is accepted but currently unused.
    """
    terms = []
    including = []
    excluding = []
    text = smart_unicode(text)
    for cur_token in re_input.findall(text):
        if len(cur_token) < minsize:
            continue
        lt = cur_token.lower()
        if lt[0] == '+':
            # "+term": the term is required.
            lt = lt.lstrip('+')
            if not use_stopwords or lt not in STOPWORDS:
                including.append(smart_str(lt))
        elif lt[0] == '-':
            # "-term": the term is excluded.
            lt = lt.lstrip('-')
            if not use_stopwords or lt not in STOPWORDS:
                excluding.append(smart_str(lt))
        else:
            if not use_stopwords or lt not in STOPWORDS:
                terms.append(smart_str(lt))
                # Index the parts of compound tokens, as in Tokenize().
                for t in re_secondary.findall(lt):
                    if smart_str(t) not in terms and (not use_stopwords or t not in STOPWORDS):
                        terms.append(smart_str(t))
    return terms, including, excluding
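# TokenizeInput() additionally expects a module-level re_input pattern that
# keeps any leading +/- operator attached to each token (the code inspects
# lt[0] for it). This is again an assumed sketch, not the original pattern.
re_input = re.compile(r"[\+\-]?[\w\-]+", re.UNICODE)

# Example query parse under the sketch definitions above:
#   TokenizeInput("python +django -flask")
#   -> (['python'], ['django'], ['flask'])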