# Create clean_train_reviews and clean_test_reviews as we did before #

import pandas as pd
import numpy as np
import Preprocessing_nlp as pre

# Read data from files (data_path should point at the directory holding the TSV files)
train = pd.read_csv(data_path + 'labeledTrainData.tsv', header=0,
                    delimiter='\t', quoting=3)
test = pd.read_csv(data_path + 'testData.tsv', header=0,
                   delimiter='\t', quoting=3)
unlabeled_train = pd.read_csv(data_path + 'unlabeledTrainData.tsv', header=0,
                              delimiter='\t', quoting=3)

print "Cleaning training reviews"
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append(pre.review_to_wordlist(review, remove_stopwords=True))

print "Cleaning test reviews"
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append(pre.review_to_wordlist(review, remove_stopwords=True))

# ****** Create bags of centroids ******
#
# Pre-allocate an array for the training set bags of centroids (for speed).
# num_clusters comes from the K-means clustering step run earlier.
train_centroids = np.zeros((train["review"].size, num_clusters), dtype="float32")

# Transform the training set reviews into bags of centroids
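# The transformation itself is not shown above; the following is a minimal
# sketch of one way to do it. It assumes word_centroid_map, a word ->
# cluster-index dict built from the earlier K-means fit (with num_clusters
# clusters); the helper name create_bag_of_centroids is illustrative.
def create_bag_of_centroids(wordlist, word_centroid_map):
    # One counter slot per centroid
    num_centroids = max(word_centroid_map.values()) + 1
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")
    # Count how many words of the review fall into each cluster
    for word in wordlist:
        if word in word_centroid_map:
            bag_of_centroids[word_centroid_map[word]] += 1
    return bag_of_centroids

# Fill one row of train_centroids per cleaned review
for counter, review in enumerate(clean_train_reviews):
    train_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)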
df = pd.read_csv(data_path + 'labeledTrainData.tsv', header=0,
                 delimiter='\t', quoting=3)
num_docus = df['review'].size

# 1. Remove the HTML markup (like <br>), remove non-letters, convert to lower
#    case, split into words, remove stopwords, and join the words back into
#    one string separated by spaces.
import Preprocessing_nlp as pre

clean_docus = []
for i in xrange(0, num_docus):
    if (i + 1) % 1000 == 0:
        print "review %d of %d\n" % (i + 1, num_docus)
    clean_docus.append(pre.review_to_words(df['review'][i], filter_words='timeline'))

# 1.2 Further filtering of words (optional)

#####################################################
# 2.1 Create features from the bag of words
print 'creating the bag of words...\n'
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer='word',
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)
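# The vectorizer above is constructed but never applied; a minimal sketch of
# the natural next step follows, assuming clean_docus from step 1.
# fit_transform learns the (at most 5000-word) vocabulary and returns a
# sparse document-term count matrix, converted to a dense numpy array here.
train_data_features = vectorizer.fit_transform(clean_docus)
train_data_features = train_data_features.toarray()
print 'bag-of-words feature matrix shape:', train_data_features.shape
# -> (num_docus, up to 5000 columns)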