def get_volcabulary_and_list_words(data):
    """Convert every review to a word list and collect the distinct words.

    Each entry of ``data["text"]`` is passed through
    ``Word2VecUtility.review_to_wordlist`` with ``remove_stopwords=True``.

    Args:
        data: Mapping-like object (for example a pandas DataFrame) whose
            ``"text"`` entry iterates over the raw reviews.

    Returns:
        A ``(volcabulary, reviews_words)`` tuple. ``volcabulary`` is the set
        of distinct words seen across all reviews; ``reviews_words`` holds
        one word list per review, in input order.
    """
    reviews_words = []
    # Accumulate straight into a set so memory grows with the number of
    # distinct words rather than with every token occurrence in the corpus.
    volcabulary = set()
    for review in data["text"]:
        review_words = Word2VecUtility.review_to_wordlist(
            review, remove_stopwords=True)
        reviews_words.append(review_words)
        volcabulary.update(review_words)
    return volcabulary, reviews_words
#input("Press Enter to continue...")

#print ('Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...')
#nltk.download()  # Download text data sets, including stop words

# Clean every training review and store it as one space-separated string.
# The column is iterated positionally instead of being looked up by label
# (train["review"][i]), so the loop does not depend on the frame having a
# default 0..n-1 index.
print("Cleaning and parsing the training set movie reviews...\n")
clean_train_reviews = [
    " ".join(Word2VecUtility.review_to_wordlist(review, True))
    for review in train["review"]
]

# ****** Create a bag of words from the training set
#
print("Creating the bag of words...\n")

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# Normalise the deleted-post ids to strings so they compare equal to the
# "Id" attribute values read from the XML.
delId = np.array(delId).tolist()
delId = [str(int(i)) for i in delId]
# Set copy used only for the membership test below (constant-time lookups).
deleted_ids = set(delId)

post_tree = ET.parse('/Users/XW/Desktop/datascience.stackexchange.com/Posts.xml')
# Keep rows with PostTypeId '2' whose Id appears in neither aaId nor delId.
post = [(row.attrib.get("PostTypeId"),
         row.attrib.get("CreationDate"),
         row.attrib.get("Body"))
        for row in post_tree.getroot()
        if row.attrib.get("PostTypeId") == '2'
        and row.attrib.get("Id") not in aaId
        and row.attrib.get("Id") not in deleted_ids]
post_frame = DataFrame(post, columns=['PostTypeId', 'CreationDate', 'Body'])
post_body = post_frame.loc[:, 'Body']

clean_post = []
print("Cleaning and parsing the posts...\n")
for i, body in enumerate(post_body):
    # Strip the HTML markup; posts with no text left are not written out.
    tmp = BeautifulSoup(body.replace('\n', ""), 'html.parser').get_text()
    if tmp == '':
        continue
    clean_post = " ".join(Word2VecUtility.review_to_wordlist(tmp, True))
    # One output file per post, named after the post's position in post_body.
    # The with-block closes each file as soon as it is written, so handles
    # do not pile up over the whole run.
    with open('/Users/XW/Desktop/datascience.stackexchange.com/parse/' + str(i), 'wb') as f:
        f.write(clean_post.encode('utf-8'))

import textmining
import os

xDIR = '/Users/XW/Desktop/datascience.stackexchange.com/parse'


def termdocumentmatrix_example(xDIR):
    # Initialize class to create term-document matrix
    count = 0
    tdm = textmining.TermDocumentMatrix()
    for i in os.listdir(xDIR):
        Res = tdm.add_doc(open(os.path.join(xDIR, i)).read())
'/Users/XW/Desktop/datascience.stackexchange.com/Posts.xml') post = [(i.attrib.get("PostTypeId"), i.attrib.get("CreationDate"), i.attrib.get("Body")) for i in post_tree.getroot() if i.attrib.get("PostTypeId") == '2' and i.attrib.get("Id") not in aaId and i.attrib.get("Id") not in delId] post_frame = DataFrame(post, columns=['PostTypeId', 'CreationDate', 'Body']) post_body = post_frame.loc[:, 'Body'] clean_post = [] print "Cleaning and parsing the posts...\n" for i in xrange(0, len(post_body)): tmp = BeautifulSoup(post_body[i].replace('\n', ""), 'html.parser').get_text() if tmp == '': continue clean_post = " ".join(Word2VecUtility.review_to_wordlist(tmp, True)) f = file('/Users/XW/Desktop/datascience.stackexchange.com/parse/' + str(i), 'w') f.write(clean_post.encode('utf-8')) import textmining import os xDIR = '/Users/XW/Desktop/datascience.stackexchange.com/parse' def termdocumentmatrix_example(xDIR): # Initialize class to create term-document matrix count = 0 tdm = textmining.TermDocumentMatrix() for i in os.listdir(xDIR):
# Distinct words known to the model.
words_set = set(model.index2word)

# Forward map (word -> id shifted by index_from) and its inverse.
word2index = dict((word, i + index_from) for i, word in enumerate(words_set))
index2word = dict((i, word) for word, i in word2index.items())
# Ids 0, 1 and 2 map to the placeholder tokens '0', '1' and '2'.
index2word.update({0: '0', 1: '1', 2: '2'})

# 'Word2Vec' object does not support item assignment, so the vectors for
# the three placeholder tokens live in a separate dict.
padding_model = {}
for token in ('0', '1', '2'):
    padding_model[token] = np.random.standard_normal(num_features)

reviews_words = []
for review in data["text"]:
    review_words = Word2VecUtility.review_to_wordlist(review, remove_stopwords=True)
    # each word index increased with 3.
    # Words missing from word2index (whose keys are exactly words_set)
    # are encoded as oov; every review is prefixed with the start id.
    review_words = [start] + [word2index.get(w, oov) for w in review_words]
    # review_words = [oov if (ix > (max_words + index_from)) else ix for ix in review_words]
    reviews_words.append(review_words)

# padding with 0, each review has max_length now.
reviews_words = sequence.pad_sequences(
    reviews_words,
    maxlen=max_length,
    padding='post',
    truncating='post',
)
print(reviews_words.shape)

# In[47]:
data_matrix = np.empty((reviews_words.shape[0], max_length, num_features))
print(data_matrix.shape)
def getCleanReviews(reviews):
    """Return one cleaned word list per entry of ``reviews["Paper_content"]``.

    Every entry is passed to ``Word2VecUtility.review_to_wordlist`` with
    ``remove_stopwords=True``; the results are returned in input order.
    """
    return [
        Word2VecUtility.review_to_wordlist(review, remove_stopwords=True)
        for review in reviews["Paper_content"]
    ]
import numpy as np

if __name__ == '__main__':
    # The directory and file name belong inside os.path.join so that
    # read_csv receives one complete path as its first argument.
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    train = pd.read_csv(os.path.join(data_dir, 'labeledTrainData.tsv'),
                        header=0, delimiter="\t", quoting=3)
    test = pd.read_csv(os.path.join(data_dir, 'testData.tsv'),
                       header=0, delimiter="\t", quoting=3)

    print('The first review is:')
    print(train["review"][0])
    raw_input("Press Enter to continue..")

    # Initialize an empty list to hold clean_reviews
    clean_train_reviews = []

    # Clean every review of the movie review list and store it as one string.
    print("Cleaning and parsing the training set movie reviews...\n")
    for review in train["review"]:
        # Join with a single space: the vectorizer below needs the words to
        # stay separate, otherwise a whole review collapses into one token.
        clean_train_reviews.append(
            " ".join(Word2VecUtility.review_to_wordlist(review, True)))

    # ****** Create a bag of words from the training set
    print("Creating the bag of words...\n")

    # Initialize CountVectorizer object which is Scikit-learn bag of words tool
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)

    # fit_transform does two functions: First it fits the model and learns the vocabulary
    # second; it transforms our training data into feature vectors. The input should be a list of strings
    train_data_features = vectorizer.fit_transform(clean_train_reviews)

    # Convert to numpy array
    train_data_features = train_data_features.toarray()
from Word2VecUtility import Word2VecUtility
import sklearn
import sklearn.feature_extraction

post_tree = ET.parse(
    '/Users/Zhen/Desktop/Courses/BigData/stackexchange/data/Posts.xml')

# One (PostTypeId, CreationDate, Body) tuple per child of the XML root.
post = []
for row in post_tree.getroot():
    attrs = row.attrib
    post.append((attrs.get("PostTypeId"),
                 attrs.get("CreationDate"),
                 attrs.get("Body")))

post_frame = DataFrame(post, columns=['PostTypeId', 'CreationDate', 'Body'])
post_body = post_frame.loc[:, 'Body']

# Each post body becomes one space-separated string of cleaned words.
print("Cleaning and parsing the posts...\n")
clean_post = [
    " ".join(Word2VecUtility.review_to_wordlist(body, True))
    for body in post_body
]

# Keep a copy of the cleaned bodies on disk.
clean_postdf = pd.DataFrame(clean_post)
clean_postdf.to_csv('post_body.csv', sep=',', encoding='utf-8')

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
    min_df=1,
)

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
data_features = vectorizer.fit_transform(clean_post)
from time import mktime
import sys
# Make the project directory importable before loading Word2VecUtility.
sys.path.insert(0, '/Users/Zhen/Desktop/Courses/BigData/stackexchange/')
from Word2VecUtility import Word2VecUtility
import sklearn
import sklearn.feature_extraction

post_tree = ET.parse(
    '/Users/Zhen/Desktop/Courses/BigData/stackexchange/data/Posts.xml')

# One (PostTypeId, CreationDate, Body) tuple per child of the XML root.
post = []
for row in post_tree.getroot():
    attrs = row.attrib
    post.append((attrs.get("PostTypeId"),
                 attrs.get("CreationDate"),
                 attrs.get("Body")))

post_frame = DataFrame(post, columns=['PostTypeId', 'CreationDate', 'Body'])
post_body = post_frame.loc[:, 'Body']

# Each post body becomes one space-separated string of cleaned words.
print("Cleaning and parsing the posts...\n")
clean_post = [
    " ".join(Word2VecUtility.review_to_wordlist(body, True))
    for body in post_body
]

# Keep a copy of the cleaned bodies on disk.
clean_postdf = pd.DataFrame(clean_post)
clean_postdf.to_csv('post_body.csv', sep=',', encoding='utf-8')

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
    min_df=1,
)

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
# Forward map (word -> id shifted by index_from) and its inverse.
word2index = dict((word, i + index_from) for i, word in enumerate(words_set))
index2word = dict((i, word) for word, i in word2index.items())
# Ids 0, 1 and 2 map to the placeholder tokens '0', '1' and '2'.
index2word.update({0: '0', 1: '1', 2: '2'})

# 'Word2Vec' object does not support item assignment, so the vectors for
# the three placeholder tokens live in a separate dict.
padding_model = {}
for token in ('0', '1', '2'):
    padding_model[token] = np.random.standard_normal(num_features)

data = pd.read_csv(
    'review_sub_399850.tsv',
    header=0,
    delimiter="\t",
    quoting=3,
    encoding='utf-8',
)

# Ids at or above this bound are replaced by oov below.
index_limit = max_words + index_from

reviews_words = []
for review in data["text"]:
    review_words = Word2VecUtility.review_to_wordlist(review, remove_stopwords=True)
    # each word index has already been increased by 3.
    # Words the model does not contain are encoded as oov; every review is
    # prefixed with the start id.
    review_words = [start] + [word2index[w] if w in model else oov
                              for w in review_words]
    # index from 0,1,... to 5002
    review_words = [ix if ix < index_limit else oov for ix in review_words]
    reviews_words.append(review_words)

# padding with 0, each review has max_length now.
reviews_words = sequence.pad_sequences(
    reviews_words,
    maxlen=max_length,
    padding='post',
    truncating='post',
)
print(reviews_words[:20, :12])
print(reviews_words.shape)

labels = data["stars"]
# print labels[:10], labels.shape
labels[labels <= 3] = 0
# Read data from files
train = pd.read_csv(
    os.path.join(os.path.dirname(__file__), 'data2', 'labeledTrainData.csv'),
    header=0,
    delimiter="\t",
    quoting=3,
)
test = pd.read_csv(
    os.path.join(os.path.dirname(__file__), 'data2', 'testData.csv'),
    header=0,
    delimiter="\t",
    quoting=3,
)

# One word list per "Paper_content" entry, produced with remove_stopwords=True.
print("Cleaning training content")
clean_train_reviews = [
    Word2VecUtility.review_to_wordlist(review, remove_stopwords=True)
    for review in train["Paper_content"]
]

print("Cleaning test content")
clean_test_reviews = [
    Word2VecUtility.review_to_wordlist(review, remove_stopwords=True)
    for review in test["Paper_content"]
]

# ****** Create bags of centroids
#
# Pre-allocate an array for the training set bags of centroids (for speed):
# one row per training entry, one column per cluster.
train_centroids = np.zeros(
    (train["Paper_content"].size, num_clusters),
    dtype="float32",
)

# Transform the training set reviews into bags of centroids
counter = 0
# In[15]:
# Inspect the model's word list: its type, its size and the first 100 entries.
review_words = []
print(type(model.index2word))
print(len(model.index2word))
print(model.index2word[:100])
index2word_set = set(model.index2word)
print(len(index2word_set))

# In[16]:
# For the first review, show which of its words the model's word list contains.
words = Word2VecUtility.review_to_wordlist(data.iloc[0]['text'])
print(words)
for word in words:
    print(word in index2word_set)

# In[8]:
# Binarise the star ratings on an array copy: values <= 3 become 0 and
# values > 3 become 1.
clean_labels = np.array(data["stars"])
# The inner parentheses build a tuple, printed as a single object.
print((clean_labels[:10], clean_labels.shape))
clean_labels[clean_labels <= 3] = 0
clean_labels[clean_labels > 3] = 1
print(clean_labels[:10])
# num of positive reviews
print((clean_labels == 1).sum())