Example #1
# Requires Django's encoding helpers (Python 2 / Django < 1.5 era).
# re_word, re_secondary, and STOPWORDS are module-level definitions
# assumed by this snippet but not shown in the original.
import re
from django.utils.encoding import smart_unicode, smart_str

def Tokenize(text, minsize=3, use_stopwords=True, stem=True):
    """Split text into lowercase tokens of at least `minsize` characters.
    The `stem` argument is accepted but never used."""
    tokens = []
    # Normalize the input to unicode, falling back to Latin-1 for byte
    # strings that smart_unicode cannot decode.
    try:
        text = smart_unicode(text)
    except UnicodeDecodeError:
        try:
            text = text.decode("latin-1")
        except (UnicodeError, AttributeError):
            pass
    for cur_token in re_word.findall(text):
        if len(cur_token) >= minsize:
            lt = cur_token.lower()
            # Filter stopwords only when stopword filtering is enabled.
            if not use_stopwords or lt not in STOPWORDS:
                tokens.append(smart_str(lt))
                # Also collect secondary sub-tokens, skipping duplicates.
                for t in re_secondary.findall(lt):
                    if smart_str(t) not in tokens and (not use_stopwords or t not in STOPWORDS):
                        tokens.append(smart_str(t))
    return tokens
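
A minimal sketch of how Tokenize might behave, assuming purely hypothetical definitions for re_word, re_secondary, and STOPWORDS (the originals are not shown in the snippet):

# Hypothetical definitions, for illustration only:
re_word = re.compile(r"[\w'-]+", re.UNICODE)   # whole words, incl. hyphenated
re_secondary = re.compile(r"[a-z0-9]+")        # sub-tokens within a word
STOPWORDS = set(["the", "and", "for"])

print Tokenize("The quick-thinking foxes")
# -> ['quick-thinking', 'quick', 'thinking', 'foxes']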
Example #2
def TokenizeInput(text, minsize=3, use_stopwords=True, stem=True):
    """Parse a search query into plain terms plus '+' (required) and
    '-' (excluded) terms. Uses the same module-level re_input,
    re_secondary, and STOPWORDS; `stem` is accepted but never used."""
    terms = []
    including = []
    excluding = []
    text = smart_unicode(text)
    for cur_token in re_input.findall(text):
        if len(cur_token) >= minsize:
            lt = cur_token.lower()
            if lt[0] == '+':
                # Required term: drop the leading '+' marker(s).
                lt = lt.lstrip('+')
                if not use_stopwords or lt not in STOPWORDS:
                    including.append(smart_str(lt))
            elif lt[0] == '-':
                # Excluded term: drop the leading '-' marker(s).
                lt = lt.lstrip('-')
                if not use_stopwords or lt not in STOPWORDS:
                    excluding.append(smart_str(lt))
            else:
                # Ordinary term; also collect secondary sub-tokens,
                # skipping duplicates.
                if not use_stopwords or lt not in STOPWORDS:
                    terms.append(smart_str(lt))
                    for t in re_secondary.findall(lt):
                        if smart_str(t) not in terms and (not use_stopwords or t not in STOPWORDS):
                            terms.append(smart_str(t))
    return terms, including, excluding
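
Under the same illustrative assumptions, with a hypothetical re_input that keeps a leading '+' or '-' attached to each token, the query parser might behave like this:

# Hypothetical definition, for illustration only:
re_input = re.compile(r"[+\-]?[\w'-]+", re.UNICODE)

print TokenizeInput("quick +foxes -dogs")
# -> (['quick'], ['foxes'], ['dogs'])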