# In[7]:

# Preview the first 10 review texts to sanity-check the loaded data.
# (`data` is presumably a pandas DataFrame with a 'text' column,
#  loaded earlier in the file -- confirm against the loading cell.)
print(data.iloc[:10]['text'])

# In[8]:

# Split every review into cleaned, tokenized sentences for word2vec training.
review_sents = []
print("Cleaning and parsing the reviews...\n")
for i in range(len(data["text"])):
    review_sents += Word2VecUtility.review_to_sentences(data.iloc[i]["text"], tokenizer)

# In[53]:

# Persist the parsed sentences so the expensive parsing step can be skipped
# on later runs.  A context manager guarantees the file is closed even if
# pickle.dump raises.
with open('review_sents_1859888.pkl', 'wb') as out:
    pickle.dump(review_sents, out)

# In[11]:

# review_sents = pickle.load(open('review_sents_1859888.pkl', 'rb'))
# Verify the number of reviews that were read (100,000 in total).
# NOTE: converted from Python 2 `print` statements to Python 3 print()
# calls for consistency with the rest of the file.
print("Read %d labeled train reviews, %d labeled test reviews, "
      "and %d unlabeled reviews\n" % (train["Paper_content"].size,
                                      test["Paper_content"].size,
                                      unlabeled_train["Paper_content"].size))

# Load the punkt sentence tokenizer used to split reviews into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# ****** Split the labeled and unlabeled training sets into clean sentences
#
sentences = []  # Initialize an empty list of sentences

print("Parsing sentences from training set")
for review in train["Paper_content"]:
    sentences += Word2VecUtility.review_to_sentences(review, tokenizer)

print("Parsing sentences from unlabeled set")
for review in unlabeled_train["Paper_content"]:
    sentences += Word2VecUtility.review_to_sentences(review, tokenizer)

# ****** Set parameters and train the word2vec model
#
# Import the built-in logging module and configure it so that Word2Vec
# creates nice output messages
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Set values for various parameters
num_features = 300     # Word vector dimensionality
min_word_count = 15    # Minimum word count
# In[7]:

# Preview the first 10 review texts to sanity-check the loaded data.
# NOTE: this chunk was Python 2 (`print` statements, `xrange`); converted
# to Python 3 for consistency with the rest of the file.
print(data.iloc[:10]['text'])

# In[8]:

# Split every review into cleaned, tokenized sentences for word2vec training.
review_sents = []
print("Cleaning and parsing the reviews...\n")
for i in range(len(data["text"])):  # xrange -> range (Python 3)
    review_sents += Word2VecUtility.review_to_sentences(data.iloc[i]["text"], tokenizer)

# In[53]:

# Persist the parsed sentences; the context manager closes the file even
# if pickle.dump raises.
with open('review_sents_1859888.pkl', 'wb') as out:
    pickle.dump(review_sents, out)

# In[11]:

# Reload the cached sentences and sanity-check the result.  Using `with`
# fixes the file-handle leak in the original `pickle.load(open(...))`.
with open('review_sents_1859888.pkl', 'rb') as f:
    review_sents = pickle.load(f)
print(len(review_sents))
print(review_sents[:5])