from helper_01_dataCleaning import review_to_words
from sklearn.feature_extraction.text import CountVectorizer

# Reuse the vocabulary learned during training so the test features
# line up with the columns the forest was trained on. `vocab`,
# `forest`, and the single raw review string `test` are assumed to be
# defined earlier (see the training sketch below).
vectorizer = CountVectorizer(analyzer="word",
                             vocabulary=vocab,
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)

############################################################
# Make predictions
############################################################

# Create an empty list and append the clean review
clean_test_reviews = []

print "Cleaning and parsing the test set movie reviews...\n"
clean_review = review_to_words(test)
clean_test_reviews.append(clean_review)

# Get a bag of words for the test review, and convert to a numpy array
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

# Use the random forest to make a sentiment label prediction
result = forest.predict(test_data_features)

# Report the predicted label for the single review
print result
if result[0] == 0:
    # Assumed completion: the original snippet was cut off after the
    # `if` line, so the two prints below are illustrative.
    print "negative review"
else:
    print "positive review"
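############################################################
# The prediction fragment above relies on `vocab` and `forest`,
# which are never defined in these snippets. The sketch below is
# one plausible version of that missing training step, following
# the usual bag-of-words workflow; the variable names, the
# n_estimators value, and the reuse of the serialized cleaned
# reviews are assumptions, not recovered code.
############################################################
import codecs
import json

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

# Reload the cleaned training reviews serialized by the cleaning script
with codecs.open('output/clean_train_reviews.json', 'r', encoding='utf-8') as f:
    clean_train_reviews = json.load(f)

# Reload the labels that go with the cleaned reviews
train = pd.read_csv("../data/labeledTrainData.tsv", header=0,
                    delimiter="\t", quoting=3)

# Learn the bag-of-words vocabulary from the cleaned training reviews
train_vectorizer = CountVectorizer(analyzer="word", tokenizer=None,
                                   preprocessor=None, stop_words=None,
                                   max_features=5000)
train_data_features = train_vectorizer.fit_transform(clean_train_reviews).toarray()

# The learned vocabulary is what `vocab` above is assumed to hold
vocab = train_vectorizer.get_feature_names()

# Train the random forest assumed to exist as `forest` above
forest = RandomForestClassifier(n_estimators=100)
forest = forest.fit(train_data_features, train["sentiment"])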
import codecs
import json

import pandas as pd

from helper_01_dataCleaning import review_to_words

########################################################
# Clean all the data
# See method "review_to_words" in helper_01_dataCleaning
########################################################

train = pd.read_csv("../data/labeledTrainData.tsv", header=0,
                    delimiter="\t", quoting=3)

# Get the number of reviews based on the dataframe column size
num_reviews = train["review"].size

# Initialize an empty list to hold the clean reviews
clean_train_reviews = []

print "Cleaning and parsing the training set movie reviews...\n"

# Loop over each review, with the index i running over the whole
# movie review list
for i in xrange(0, num_reviews):
    # Call our cleaning function on each review and add the result
    # to the list of clean reviews
    clean_train_reviews.append(review_to_words(train["review"][i]))

    # Print a progress message every 1000 reviews
    if (i + 1) % 1000 == 0:
        print "Review %d of %d\n" % (i + 1, num_reviews)

# Serialize the cleaned data so later steps can reload it
with codecs.open('output/clean_train_reviews.json', 'w', encoding='utf-8') as f:
    json.dump(clean_train_reviews, f, indent=3)
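########################################################
# `review_to_words` is imported from helper_01_dataCleaning,
# whose source is not included here. The sketch below shows the
# standard cleaning steps such a helper usually performs for this
# dataset (strip HTML, keep letters, lowercase, drop stop words);
# treat it as an illustration, not the actual helper source.
########################################################
import re

from bs4 import BeautifulSoup
from nltk.corpus import stopwords  # requires nltk.download("stopwords")

def review_to_words(raw_review):
    # 1. Remove HTML markup left over from the scraped reviews
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # 2. Keep letters only
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Lowercase and split into individual words
    words = letters_only.lower().split()
    # 4. Remove English stop words (a set makes the lookup fast)
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if w not in stops]
    # 5. Join back into one space-separated string
    return " ".join(meaningful_words)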
from helper_01_dataCleaning import review_to_words

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Rebuild the vectorizer from the saved training vocabulary; `vocab`,
# `forest`, and the `test` dataframe are assumed to be defined earlier.
vectorizer = CountVectorizer(analyzer="word",
                             vocabulary=vocab,
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)

############################################################
# Make predictions
############################################################

# Create an empty list and append the clean reviews one by one
num_reviews = len(test["review"])
clean_test_reviews = []

print "Cleaning and parsing the test set movie reviews...\n"
for i in xrange(0, num_reviews):
    # Print a progress message every 1000 reviews
    if (i + 1) % 1000 == 0:
        print "Review %d of %d\n" % (i + 1, num_reviews)
    clean_review = review_to_words(test["review"][i])
    clean_test_reviews.append(clean_review)

# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)

# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})

# Use pandas to write the comma-separated output file
output.to_csv("results/Bag_of_Words_model_2.csv", index=False, quoting=3)
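############################################################
# The prediction script above also needs a `test` dataframe with
# "id" and "review" columns, which is never loaded in these
# snippets. Assuming a tab-separated test file sitting alongside
# the training data, it could be read the same way the training
# file is; the file name below is an assumption.
############################################################
import pandas as pd

test = pd.read_csv("../data/testData.tsv", header=0,
                   delimiter="\t", quoting=3)
print "Read %d test reviews\n" % len(test["review"])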