import pickle
import re
import string

import nltk
from nltk import FreqDist
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
from sklearn.feature_extraction.text import CountVectorizer

# `tokenize_nltk`, `stopwords`, and the path/directory variables referenced below
# are assumed to be defined elsewhere in the accompanying code.


def get_most_frequent_words(text):
    # Tokenize, then drop stopwords, punctuation, and tokens without any letters.
    word_list = tokenize_nltk(text)
    word_list = [word for word in word_list
                 if word not in stopwords and word not in string.punctuation
                 and re.search("[a-zA-Z]", word)]
    # Return the 200 most frequent remaining words.
    freq_dist = FreqDist(word_list)
    top_200 = freq_dist.most_common(200)
    top_200 = [word[0] for word in top_200]
    return top_200
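# Hypothetical usage sketch (not part of the original code): the file name below is an
# assumption; any plain-text file works, provided the helpers above are in scope.
def example_most_frequent_words():
    text = open("sherlock_holmes.txt", encoding="utf-8").read()  # assumed sample file
    frequent_words = get_most_frequent_words(text)
    print(frequent_words[:20])  # inspect the top of the 200-word list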
def main():
    # Gather and tokenize the corpus sentences (only needed when training from scratch).
    sentences = get_all_book_sentences(books_dir)
    sentences = [tokenize_nltk(s.lower()) for s in sentences]
    #model = train_word2vec(sentences)
    #test_model()
    # Load the previously trained word2vec model instead of retraining it.
    model = pickle.load(open(word2vec_model_path, 'rb'))
    #accuracy_list = evaluate_model(model, evaluation_file)
    #print(accuracy_list)
    # Score the trained model on the standard word analogy task.
    (analogy_score, word_list) = model.wv.evaluate_word_analogies(
        datapath('questions-words.txt'))
    print(analogy_score)
    # Compare against a pretrained word2vec model loaded from disk.
    pretrained_model = KeyedVectors.load_word2vec_format(pretrained_model_path,
                                                         binary=True)
    (analogy_score, word_list) = pretrained_model.evaluate_word_analogies(
        datapath('questions-words.txt'))
    print(analogy_score)
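# Hypothetical companion sketch (not part of main()): sanity-check a loaded model by
# listing the nearest neighbours of a word; words absent from the training vocabulary
# raise KeyError, which is caught here.
def inspect_similar_words(model, word="river", topn=5):
    try:
        for neighbour, score in model.wv.most_similar(word, topn=topn):
            print(f"{neighbour}\t{score:.3f}")
    except KeyError:
        print(f"'{word}' is not in the model vocabulary")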
def create_vectorizers(data_dict):
    # Build one CountVectorizer per topic, restricted to that topic's
    # 200 most frequent non-stopword, non-punctuation words.
    topic_list = list(data_dict.keys())
    vectorizer_dict = {}
    for topic in topic_list:
        text_array = data_dict[topic]
        text = " ".join(text_array)
        word_list = tokenize_nltk(text)
        word_list = [word for word in word_list if word not in stopwords]
        freq_dist = FreqDist(word_list)
        top_200 = freq_dist.most_common(200)
        vocab = [wtuple[0] for wtuple in top_200
                 if wtuple[0] not in stopwords
                 and wtuple[0] not in string.punctuation]
        vectorizer_dict[topic] = CountVectorizer(vocabulary=vocab)
    return vectorizer_dict
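# Hypothetical usage sketch (not from the original code): encode a new document with one
# of the per-topic vectorizers. Because the vocabulary is fixed at construction time,
# transform() can be called without fitting first.
def vectorize_for_topic(vectorizer_dict, topic, document):
    vectorizer = vectorizer_dict[topic]
    counts = vectorizer.transform([document])  # sparse matrix: 1 row, one column per vocab word
    return counts.toarray()[0]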
def get_stats(text, num_words=200):
    # Print and return the frequency distribution of the alphabetic,
    # non-stopword tokens in the text.
    word_list = tokenize_nltk(text)
    word_list = [word for word in word_list
                 if word not in stopwords and re.search("[A-Za-z]", word)]
    freq_dist = FreqDist(word_list)
    print(freq_dist.most_common(num_words))
    return freq_dist
def pos_tag_nltk(text):
    words = tokenize_nltk(text)
    words_with_pos = nltk.pos_tag(words)
    return words_with_pos
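# Hypothetical usage sketch: tag a single sentence. nltk.pos_tag requires the
# 'averaged_perceptron_tagger' resource (nltk.download('averaged_perceptron_tagger')).
def example_pos_tags():
    sentence = "The quick brown fox jumps over the lazy dog."
    print(pos_tag_nltk(sentence))  # list of (token, Penn Treebank tag) tuples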