Example #1
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


def run_tfidf(train, test, grams='123', n_dim=40000, clf=None):
    """Fit a TF-IDF vectorizer on the training texts and train a classifier.

    `train` and `test` are expected to have a 'text' column; `train` also
    needs a 'label' column. TextUtility.text_to_wordlist is an external
    helper that turns raw text into a list of words.
    """
    if clf is None:
        # class_weight="auto" has been removed from scikit-learn; "balanced" is the replacement
        clf = LogisticRegression(class_weight="balanced")

    clean_train = []
    for text in train['text']:
        clean_train.append(" ".join(TextUtility.text_to_wordlist(text)))

    clean_test = []
    for text in test['text']:
        clean_test.append(" ".join(TextUtility.text_to_wordlist(text)))

    # '123' means unigrams through trigrams: ngram_range = (1, 3)
    ngram_range = (int(grams[0]), int(grams[-1]))
    vectorizer = TfidfVectorizer(max_features=n_dim, ngram_range=ngram_range, sublinear_tf=True)

    X_train = vectorizer.fit_transform(clean_train)
    X_test = vectorizer.transform(clean_test)
    y_train = train['label']

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    try:
        y_prob = clf.predict_proba(X_test)
    except (AttributeError, NotImplementedError):
        # SVC only exposes predict_proba when probability=True,
        # so enable it, refit, and then ask for probabilities.
        clf.set_params(probability=True)
        clf.fit(X_train, y_train)
        y_prob = clf.predict_proba(X_test)

    return y_pred, y_prob
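
A minimal usage sketch (not from the original): it assumes train and test are pandas DataFrames with a 'text' column (plus 'label' in train) and that TextUtility is importable; the CSV file names are placeholders.

import pandas as pd
from sklearn.svm import SVC

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Default logistic regression over uni- to tri-gram TF-IDF features
y_pred, y_prob = run_tfidf(train, test, grams='123', n_dim=40000)

# Same pipeline with an SVC; probabilities come from the refit fallback
y_pred_svm, y_prob_svm = run_tfidf(train, test, clf=SVC(kernel='linear'))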
Example #2
def tokenize(sentence, grams):
    """Turn a sentence into n-gram tokens; `grams` is an iterable of
    n-gram sizes, e.g. [1, 2] for unigrams and bigrams."""
    words = TextUtility.text_to_wordlist(sentence)
    tokens = []
    for gram in grams:
        # Slide a window of length `gram` over the word list and join each
        # window with "_*_" so every n-gram stays a single token.
        for i in range(len(words) - gram + 1):
            tokens.append("_*_".join(words[i:i + gram]))
    return tokens
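
An illustrative call (added for clarity), assuming TextUtility.text_to_wordlist("this movie is great") returns ['this', 'movie', 'is', 'great']:

tokens = tokenize("this movie is great", grams=[1, 2])
# -> ['this', 'movie', 'is', 'great',
#     'this_*_movie', 'movie_*_is', 'is_*_great']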
Example #3
def getClean(data):
    """Return one cleaned word list per row of data['text'].
    The second argument to text_to_wordlist is a flag of the external
    TextUtility helper, kept exactly as in the original call."""
    clean_data = []
    for text in data["text"]:
        clean_data.append(TextUtility.text_to_wordlist(text, True))
    return clean_data
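
A hypothetical call (added for illustration), assuming data is a pandas DataFrame with a 'text' column:

import pandas as pd

data = pd.DataFrame({"text": ["An example review.", "Another one."]})
clean = getClean(data)  # list of word lists, one per row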