def get_volcabulary_and_list_words(data):
    reviews_words = []
    volcabulary = []
    for review in data["text"]:
        review_words = Word2VecUtility.review_to_wordlist(
            review, remove_stopwords=True)
        reviews_words.append(review_words)
        for word in review_words:
            volcabulary.append(word)
    volcabulary = set(volcabulary)
    return volcabulary, reviews_words
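For context, a minimal self-contained sketch of how a helper like this behaves. SimpleTokenizer.review_to_wordlist below is a hypothetical stand-in for Word2VecUtility.review_to_wordlist, the DataFrame is toy data, and the sketch spells "vocabulary" conventionally and builds the set incrementally instead of collecting every word in a list first:

import re
import pandas as pd

class SimpleTokenizer(object):
    """Hypothetical stand-in for Word2VecUtility: lowercase and keep letter runs only."""
    @staticmethod
    def review_to_wordlist(review, remove_stopwords=True):
        words = re.findall(r"[a-z]+", review.lower())
        stops = {"the", "a", "and", "is", "was"} if remove_stopwords else set()
        return [w for w in words if w not in stops]

def get_vocabulary_and_list_words(data, tokenizer=SimpleTokenizer):
    reviews_words = []
    vocabulary = set()
    for review in data["text"]:
        review_words = tokenizer.review_to_wordlist(review, remove_stopwords=True)
        reviews_words.append(review_words)
        vocabulary.update(review_words)  # grow the vocabulary set as we go
    return vocabulary, reviews_words

data = pd.DataFrame({"text": ["The food is great", "Service was slow and the food was cold"]})
vocab, tokenized = get_vocabulary_and_list_words(data)
print("%d unique words; first review tokens: %s" % (len(vocab), tokenized[0]))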
Example No. 3
# print 'Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...'
# nltk.download()  # Download text data sets, including stop words

# # Initialize an empty list to hold the clean reviews
clean_train_reviews = []

# # Loop over each review
print "Cleaning and parsing the training set reviews...\n"

num_reviews = len(train["review"])
for i in xrange(0, num_reviews):
    if ((i + 1) % 10000 == 0):
        print "Processing review %d out of %d" % (i + 1, num_reviews)
    clean_train_reviews.append(" ".join(
        Word2VecUtility.review_to_wordlist(train["review"][i], True)))

# ****** Create a bag of words from the training set
print "Creating the bag of words...\n"

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)  # keep only the 5000 most frequent words

# fit_transform() does two things: first, it fits the model and learns the
# vocabulary; second, it transforms our training data into feature vectors.
# The input to fit_transform should be a list of strings.
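To make that comment concrete, here is a small stand-alone sketch of CountVectorizer's fit/transform behaviour on toy strings (independent of the training data above; the documents and counts are illustrative):

from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat on the mat", "the dog sat"]
vectorizer = CountVectorizer(analyzer="word", max_features=5000)

# fit_transform learns the vocabulary and returns a sparse document-term matrix
features = vectorizer.fit_transform(docs)
print(features.shape)       # (2, number_of_distinct_terms)
print(features.toarray())   # dense counts, one row per document

# transform (without fitting again) reuses the learned vocabulary on new text
print(vectorizer.transform(["the cat and the dog"]).toarray())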
Example No. 4
    '/Users/XW/Desktop/datascience.stackexchange.com/Posts.xml')
post = [(i.attrib.get("PostTypeId"), i.attrib.get("CreationDate"),
         i.attrib.get("Body")) for i in post_tree.getroot()
        if i.attrib.get("PostTypeId") == '2' and i.attrib.get("Id") not in aaId
        and i.attrib.get("Id") not in delId]
post_frame = DataFrame(post, columns=['PostTypeId', 'CreationDate', 'Body'])
post_body = post_frame.loc[:, 'Body']

clean_post = []
print "Cleaning and parsing the posts...\n"
for i in xrange(0, len(post_body)):
    tmp = BeautifulSoup(post_body[i].replace('\n', ""),
                        'html.parser').get_text()
    if tmp == '':
        continue
    clean_post = " ".join(Word2VecUtility.review_to_wordlist(tmp, True))
    f = file('/Users/XW/Desktop/datascience.stackexchange.com/parse/' + str(i),
             'w')
    f.write(clean_post.encode('utf-8'))

import textmining
import os
xDIR = '/Users/XW/Desktop/datascience.stackexchange.com/parse'


def termdocumentmatrix_example(xDIR):

    # Initialize class to create term-document matrix
    count = 0
    tdm = textmining.TermDocumentMatrix()
    for i in os.listdir(xDIR):
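The excerpt stops inside this loop; below is a hedged sketch of how the term-document matrix build might be completed, assuming the textmining package's add_doc and write_csv methods (add_doc is used the same way in a later excerpt) and that xDIR holds one cleaned post per file:

import os
import textmining

def termdocumentmatrix_example(xDIR, out_csv='term_document_matrix.csv'):
    tdm = textmining.TermDocumentMatrix()
    count = 0
    for name in os.listdir(xDIR):
        # each file in xDIR holds one cleaned post
        with open(os.path.join(xDIR, name)) as f:
            tdm.add_doc(f.read())
        count += 1
    # keep every term that appears in at least one document
    tdm.write_csv(out_csv, cutoff=1)
    return count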
Example No. 5
    # test = raw_input()

    print ('\n\nPlease uncomment nltk.download() to download text data sets \n')
    #nltk.download()  # Download text data sets, including stop words

    # Initialize an empty list for the clean symptoms and a dict for the summaries
    clean_train_symptom = []
    clean_train_summary = {}

    print ("Cleaning and parsing the training set symptoms...\n")
    for i in xrange( 0, len(train["symptom"])):
        clean_train_symptom.append(" ".join(Word2VecUtility.symptoms_to_wordlist(train["symptom"][i],True)))
    print ("Cleaning and parsing the training set summary...\n")
    for i in xrange( 0, len(train["summary"])):
        
	clean_train_summary[train["disease"][i]] = "".join(Word2VecUtility.summary_to_wordlist(train["summary"][i]))
	
    
    # ****** Create a bag of words from the training set
    #
    print ("Creating the bag of words...\n")
    

    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
Example No. 6
words_set = set(model.index2word)
word2index = { word : (i + index_from) for i,word in enumerate(words_set) }
index2word = { i : word for word, i in list(word2index.items()) }
index2word[0] = '0'
index2word[1] = '1'
index2word[2] = '2'
# 'Word2Vec' object does not support item assignment
padding_model = {}
padding_model['0'] = np.random.standard_normal(num_features)
padding_model['1'] = np.random.standard_normal(num_features)
padding_model['2'] = np.random.standard_normal(num_features)


reviews_words = []
for review in data["text"]:
    review_words = Word2VecUtility.review_to_wordlist(review, remove_stopwords = True)
    # word indices are shifted up by index_from (3), leaving 0-2 for the padding/start/oov markers.
    review_words = [start] + [word2index[w] if (w in words_set) else oov for w in review_words]
#   review_words = [oov if (ix > (max_words + index_from)) else ix for ix in review_words]
    reviews_words.append(review_words)

# Pad with 0 so every review has length max_length.
reviews_words = sequence.pad_sequences(reviews_words, maxlen=max_length, padding='post', truncating='post')
print(reviews_words.shape)


# In[47]:


data_matrix = np.empty((reviews_words.shape[0], max_length, num_features))
print(data_matrix.shape)
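The excerpt ends after allocating data_matrix. A hedged continuation sketch of how it might be filled, reusing the variables defined above (reviews_words, index2word, padding_model, model) and assuming model[word] returns a num_features-dimensional vector (on newer gensim this would be model.wv[word]):

# Fill data_matrix with one embedding per (review, position) pair.
for i, review in enumerate(reviews_words):
    for j, ix in enumerate(review):
        word = index2word[ix]
        if word in padding_model:            # reserved indices map to '0', '1', '2'
            data_matrix[i, j] = padding_model[word]
        else:
            data_matrix[i, j] = model[word]  # vector from the trained Word2Vec model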
Example No. 7
def getCleanReviews(reviews):
    clean_reviews = []
    for review in reviews["Paper_content"]:
        clean_reviews.append(
            Word2VecUtility.review_to_wordlist(review, remove_stopwords=True))
    return clean_reviews
# In[7]:


# print data.ix[0:10]
print(data.iloc[:10]['text'])
# print data['text'][2]


# In[8]:


review_sents = []
print ("Cleaning and parsing the reviews...\n")
for i in range( 0, len(data["text"])):
    # sent_reviews += Word2VecUtility.review_to_sentences(data["text"][i], tokenizer)
    review_sents += Word2VecUtility.review_to_sentences(data.iloc[i]["text"], tokenizer)
    


# In[53]:


out = open('review_sents_1859888.pkl', 'wb')
pickle.dump(review_sents, out)
out.close()


# In[11]:


# review_sents = pickle.load(open('review_sents_1859888.pkl', 'rb'))
Example No. 9
import numpy as np

if __name__ == '__main__':
    train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'),
                        header=0, delimiter="\t", quoting=3)
    test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'),
                       header=0, delimiter="\t", quoting=3)
    print 'The first review is:'
    print train["review"][0]
    raw_input("Press Enter to continue..")
    # Initialize an empty list to hold clean_reviews
    clean_train_reviews = []
    # Loop over each review; create an index i that goes from 0 to the length of the movie review list
    print "Cleaning and parsing the training set movie reviews...\n"
    for i in xrange(0, len(train["review"])):
        clean_train_reviews.append("".join(Word2VecUtility.review_to_wordlist(train["review"][i], True)))

    # ****** Create a bag of words from the training set
    print "Creating the bag of words...\n"
    # Initialize CountVectorizer object which is Scikit-learn bag of words tool
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)
    # fit_transform() does two things: first, it fits the model and learns the vocabulary;
    # second, it transforms our training data into feature vectors. The input should be a list of strings.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)
    # Convert to numpy array
    train_data_features = train_data_features.toarray()
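The excerpt stops after densifying the feature matrix. A hedged sketch of a typical next step for this kind of bag-of-words pipeline (not shown in the excerpt): inspect the learned vocabulary and fit a simple classifier, assuming the labeled training frame has a 'sentiment' column; get_feature_names() is the pre-1.2 scikit-learn spelling (newer releases use get_feature_names_out()):

import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Look at the most frequent terms in the learned vocabulary
vocab = vectorizer.get_feature_names()
counts = np.sum(train_data_features, axis=0)
for term, count in sorted(zip(vocab, counts), key=lambda tc: -tc[1])[:10]:
    print("%6d  %s" % (count, term))

# Fit a classifier on the bag-of-words features (assumes a 'sentiment' label column)
forest = RandomForestClassifier(n_estimators=100)
forest = forest.fit(train_data_features, train["sentiment"])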
Example No. 10
from Word2VecUtility import Word2VecUtility
import sklearn
import sklearn.feature_extraction

post_tree = ET.parse(
    '/Users/Zhen/Desktop/Courses/BigData/stackexchange/data/Posts.xml')
post = [(i.attrib.get("PostTypeId"), i.attrib.get("CreationDate"),
         i.attrib.get("Body")) for i in post_tree.getroot()]
post_frame = DataFrame(post, columns=['PostTypeId', 'CreationDate', 'Body'])
post_body = post_frame.loc[:, 'Body']

clean_post = []
print "Cleaning and parsing the posts...\n"
for i in xrange(0, len(post_body)):
    clean_post.append(" ".join(
        Word2VecUtility.review_to_wordlist(post_body[i], True)))
clean_postdf = pd.DataFrame(clean_post)
clean_postdf.to_csv('post_body.csv', sep=',', encoding='utf-8')

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    analyzer="word", tokenizer=None, preprocessor=None,
    stop_words=None, max_features=5000, min_df=1)
# fit_transform() does two things: first, it fits the model and learns the
# vocabulary; second, it transforms our training data into feature vectors.
# The input to fit_transform should be a list of strings.
data_features = vectorizer.fit_transform(clean_post)
Example No. 11
from time import mktime
import sys
sys.path.insert(0, '/Users/Zhen/Desktop/Courses/BigData/stackexchange/')
from Word2VecUtility import Word2VecUtility
import sklearn
import sklearn.feature_extraction

post_tree = ET.parse('/Users/Zhen/Desktop/Courses/BigData/stackexchange/data/Posts.xml')
post = [(i.attrib.get("PostTypeId"), i.attrib.get("CreationDate"), i.attrib.get("Body"))
        for i in post_tree.getroot()]
post_frame = DataFrame(post, columns=['PostTypeId', 'CreationDate', 'Body'])
post_body = post_frame.loc[:, 'Body']

clean_post = []
print "Cleaning and parsing the posts...\n"
for i in xrange(0, len(post_body)):
    clean_post.append(" ".join(Word2VecUtility.review_to_wordlist(post_body[i], True)))
clean_postdf = pd.DataFrame(clean_post)
clean_postdf.to_csv('post_body.csv', sep=',', encoding='utf-8')


 # Initialize the "CountVectorizer" object, which is scikit-learn's
 # bag of words tool.
vectorizer = sklearn.feature_extraction.text.CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000,min_df=1)
# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
word2index = { word : (i + index_from) for i,word in enumerate(words_set) }
index2word = { i : word for word, i in word2index.items() }
index2word[0] = '0'
index2word[1] = '1'
index2word[2] = '2'
# 'Word2Vec' object does not support item assignment
padding_model = {}
padding_model['0'] = np.random.standard_normal(num_features)
padding_model['1'] = np.random.standard_normal(num_features)
padding_model['2'] = np.random.standard_normal(num_features)

data = pd.read_csv('review_sub_399850.tsv', header=0, delimiter="\t", quoting=3, encoding='utf-8')

reviews_words = []
for review in data["text"]:
    review_words = Word2VecUtility.review_to_wordlist(review, remove_stopwords = True)
    # each word index has already been increased by 3.
    review_words = [start] + [word2index[w] if (w in model) else oov for w in review_words]
    # valid indices run from 0 to max_words + index_from - 1 (5002 here); anything larger maps to oov
    review_words = [oov if (ix >= (max_words + index_from)) else ix for ix in review_words]
    reviews_words.append(review_words)

# Pad with 0 so every review has length max_length.
reviews_words = sequence.pad_sequences(reviews_words, maxlen=max_length, padding='post', truncating='post')

print reviews_words[:20, :12]
print reviews_words.shape

labels = data["stars"]
# print labels[:10], labels.shape
labels[labels <= 3] = 0
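The excerpt cuts off at the labeling step; a hedged sketch of how the star ratings might be turned into binary sentiment labels, assuming anything above 3 stars counts as positive (the complement of the <= 3 case above):

labels = data["stars"].copy()      # work on a copy to avoid chained-assignment surprises
labels[labels <= 3] = 0            # 1-3 stars -> negative
labels[labels > 3] = 1             # 4-5 stars -> positive (assumed threshold)
labels = labels.values.astype('int32')
print("positive fraction: %.3f" % labels.mean())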
Example No. 13
    # Read data from files
    train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data2',
                                     'labeledTrainData.csv'),
                        header=0,
                        delimiter="\t",
                        quoting=3)
    test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data2',
                                    'testData.csv'),
                       header=0,
                       delimiter="\t",
                       quoting=3)

    print "Cleaning training content"
    clean_train_reviews = []
    for review in train["Paper_content"]:
        clean_train_reviews.append(Word2VecUtility.review_to_wordlist(
            review, remove_stopwords=True))

    print "Cleaning test content"
    clean_test_reviews = []
    for review in test["Paper_content"]:
        clean_test_reviews.append(Word2VecUtility.review_to_wordlist(
            review, remove_stopwords=True))

    # ****** Create bags of centroids
    #
    # Pre-allocate an array for the training set bags of centroids (for speed)
    train_centroids = np.zeros((train["Paper_content"].size, num_clusters),
                               dtype="float32")

    # Transform the training set reviews into bags of centroids
    counter = 0
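The excerpt ends just before the transformation loop. A hedged sketch of a helper that could fill train_centroids, assuming a word_centroid_map dict (not shown in this excerpt) that maps each vocabulary word to its KMeans cluster index:

import numpy as np

def create_bag_of_centroids(wordlist, word_centroid_map, num_clusters):
    # one counter per cluster: how many words of the review fall in each cluster
    bag = np.zeros(num_clusters, dtype="float32")
    for word in wordlist:
        if word in word_centroid_map:
            bag[word_centroid_map[word]] += 1
    return bag

# Hypothetical usage with the variables defined above
for counter, review in enumerate(clean_train_reviews):
    train_centroids[counter] = create_bag_of_centroids(review, word_centroid_map, num_clusters)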
Example No. 14
# print train["stars"][0]

# print 'Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...'
# nltk.download()  # Download text data sets, including stop words

# # Initialize an empty list to hold the clean reviews
clean_train_reviews = []

# # Loop over each review
print "Cleaning and parsing the training set reviews...\n"

num_reviews = len(train["review"])
for i in xrange(0, num_reviews):
    if (i + 1) % 10000 == 0:
        print "Processing review %d out of %d" % (i + 1, num_reviews)
    clean_train_reviews.append(" ".join(
        Word2VecUtility.review_to_wordlist(train["review"][i], True)))


# ****** Create a bag of words from the training set
print "Creating the bag of words...\n"


# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)  # keep only the 5000 most frequent words

# fit_transform() does two things: first, it fits the model and learns the
# vocabulary; second, it transforms our training data into feature vectors.
Example No. 15
delId = np.array(delId).tolist()
delId = [str(int(i)) for i in delId]

post_tree = ET.parse('/Users/XW/Desktop/datascience.stackexchange.com/Posts.xml')
post = [(i.attrib.get("PostTypeId"), i.attrib.get("CreationDate"), i.attrib.get("Body"))
        for i in post_tree.getroot()
        if i.attrib.get("PostTypeId") == '2'
        and i.attrib.get("Id") not in aaId and i.attrib.get("Id") not in delId]
post_frame = DataFrame(post, columns=['PostTypeId', 'CreationDate', 'Body'])
post_body = post_frame.loc[:, 'Body']

clean_post = []
print "Cleaning and parsing the posts...\n"
for i in xrange(0, len(post_body)):
    tmp = BeautifulSoup(post_body[i].replace('\n', ""), 'html.parser').get_text()
    if tmp == '':
        continue
    clean_post = " ".join(Word2VecUtility.review_to_wordlist(tmp, True))
    # Write each cleaned post to its own file; the with-block closes the handle.
    with open('/Users/XW/Desktop/datascience.stackexchange.com/parse/' + str(i), 'w') as f:
        f.write(clean_post.encode('utf-8'))


import textmining
import os
xDIR = '/Users/XW/Desktop/datascience.stackexchange.com/parse'
def termdocumentmatrix_example(xDIR):

    # Initialize class to create term-document matrix
    count = 0
    tdm = textmining.TermDocumentMatrix()
    for i in os.listdir(xDIR):
        Res = tdm.add_doc(open(os.path.join(xDIR, i)).read())
Example No. 16
    # Verify the number of reviews that were read (100,000 in total)
    print "Read %d labeled train reviews, %d labeled test reviews, " \
     "and %d unlabeled reviews\n" % (train["Paper_content"].size,
     test["Paper_content"].size, unlabeled_train["Paper_content"].size )

    # Load the punkt tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # ****** Split the labeled and unlabeled training sets into clean sentences
    #
    sentences = []  # Initialize an empty list of sentences

    print "Parsing sentences from training set"
    for review in train["Paper_content"]:
        sentences += Word2VecUtility.review_to_sentences(review, tokenizer)

    print "Parsing sentences from unlabeled set"
    for review in unlabeled_train["Paper_content"]:
        sentences += Word2VecUtility.review_to_sentences(review, tokenizer)

    # ****** Set parameters and train the word2vec model
    #
    # Import the built-in logging module and configure it so that Word2Vec
    # creates nice output messages
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Set values for various parameters
    num_features = 300  # Word vector dimensionality
    min_word_count = 15  # Minimum word count
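The parameter list is cut off here; below is a hedged sketch of how settings like these are typically passed to gensim's Word2Vec. The extra parameters (num_workers, context, downsampling) and the saved model name are illustrative, and the dimensionality argument is called size on older gensim releases (vector_size on 4.x):

from gensim.models import Word2Vec

num_features = 300      # word vector dimensionality
min_word_count = 15     # ignore words rarer than this
num_workers = 4         # assumed: number of worker threads
context = 10            # assumed: context window size
downsampling = 1e-3     # assumed: downsampling of very frequent words

model = Word2Vec(sentences, workers=num_workers, size=num_features,
                 min_count=min_word_count, window=context, sample=downsampling)

# Make the model memory-efficient and save it for reuse
model.init_sims(replace=True)
model.save("300features_15minwords_10context")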
Example No. 17
    #input("Press Enter to continue...")

    #print ('Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...')
    #nltk.download()  # Download text data sets, including stop words

    # Initialize an empty list to hold the clean reviews
    clean_train_reviews = []

    # Loop over each review; create an index i that goes from 0 to the length
    # of the movie review list

    print("Cleaning and parsing the training set movie reviews...\n")
    for i in range(0, len(train["review"])):
        clean_train_reviews.append(" ".join(
            Word2VecUtility.review_to_wordlist(train["review"][i], True)))

    # ****** Create a bag of words from the training set
    #
    print("Creating the bag of words...\n")

    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)

    # fit_transform() does two things: first, it fits the model and learns the
    # vocabulary; second, it transforms our training data into feature vectors.
Example No. 18

# In[7]:

# print data.ix[0:10]
print data.iloc[:10]['text']
# print data['text'][2]


# In[8]:

review_sents = []
print "Cleaning and parsing the reviews...\n"
for i in xrange( 0, len(data["text"])):
    # sent_reviews += Word2VecUtility.review_to_sentences(data["text"][i], tokenizer)
    review_sents += Word2VecUtility.review_to_sentences(data.iloc[i]["text"], tokenizer)


# # In[53]:

out = open('review_sents_1859888.pkl', 'wb')
pickle.dump(review_sents, out)
out.close()



# # In[11]:

review_sents = pickle.load(open('review_sents_1859888.pkl', 'rb'))
print len(review_sents)
print review_sents[:5]