# Create clean_train_reviews and clean_test_reviews as we did before #

import pandas as pd
import numpy as np
import Preprocessing_nlp as pre

# Read data from files (data_path should point at the directory holding the TSV files)
train = pd.read_csv(data_path + 'labeledTrainData.tsv', header=0,
                    delimiter='\t', quoting=3)
test = pd.read_csv(data_path + 'testData.tsv', header=0,
                   delimiter='\t', quoting=3)
unlabeled_train = pd.read_csv(data_path + 'unlabeledTrainData.tsv', header=0,
                              delimiter='\t', quoting=3)

print "Cleaning training reviews"
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append(pre.review_to_wordlist(review, remove_stopwords=True))

print "Cleaning test reviews"
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append(pre.review_to_wordlist(review, remove_stopwords=True))

# ****** Create bags of centroids ******
#
# Pre-allocate an array for the training set bags of centroids (for speed).
# num_clusters comes from the K-means clustering step run earlier.
train_centroids = np.zeros((train["review"].size, num_clusters), dtype="float32")

# Transform the training set reviews into bags of centroids
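# The transformation itself is not shown above; the following is a minimal
# sketch of one way to do it. It assumes word_centroid_map, a word ->
# cluster-index dict built from the earlier K-means fit (with num_clusters
# clusters); the helper name create_bag_of_centroids is illustrative.
def create_bag_of_centroids(wordlist, word_centroid_map):
    # One counter slot per centroid
    num_centroids = max(word_centroid_map.values()) + 1
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")
    # Count how many words of the review fall into each cluster
    for word in wordlist:
        if word in word_centroid_map:
            bag_of_centroids[word_centroid_map[word]] += 1
    return bag_of_centroids

# Fill one row of train_centroids per cleaned review
for counter, review in enumerate(clean_train_reviews):
    train_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)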
df = pd.read_csv(data_path + 'labeledTrainData.tsv', header=0,
                 delimiter='\t', quoting=3)
num_docus = df['review'].size

# 1. Remove the HTML markup (like <br>), remove non-letters, convert to lower
#    case, split into words, remove stopwords, and join the words back into
#    one string separated by spaces.
import Preprocessing_nlp as pre

clean_docus = []
for i in xrange(0, num_docus):
    if (i + 1) % 1000 == 0:
        print "review %d of %d\n" % (i + 1, num_docus)
    clean_docus.append(pre.review_to_words(df['review'][i], filter_words='timeline'))

# 1.2 Further filtering of words (optional)

#####################################################
# 2.1 Create features from the bag of words
print 'creating the bag of words...\n'
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer='word',
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)
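# The vectorizer above is constructed but never applied; a minimal sketch of
# the natural next step follows, assuming clean_docus from step 1.
# fit_transform learns the (at most 5000-word) vocabulary and returns a
# sparse document-term count matrix, converted to a dense numpy array here.
train_data_features = vectorizer.fit_transform(clean_docus)
train_data_features = train_data_features.toarray()
print 'bag-of-words feature matrix shape:', train_data_features.shape
# -> (num_docus, up to 5000 columns)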