Python Word2VecUtility.review_to_wordlist示例，Word2VecUtility.Word2VecUtility.review_to_wordlist Python示例

示例#1

0

显示文件

def get_volcabulary_and_list_words(data):
    """Tokenize every review and collect the distinct words seen.

    Each entry of ``data["text"]`` is passed to
    ``Word2VecUtility.review_to_wordlist`` with ``remove_stopwords=True``.

    Returns:
        A ``(volcabulary, reviews_words)`` pair: the set of all words found
        in any review, and the list of per-review tokenization results in
        the same order as ``data["text"]``.
    """
    tokenized_reviews = [
        Word2VecUtility.review_to_wordlist(text, remove_stopwords=True)
        for text in data["text"]
    ]
    distinct_words = set()
    for tokens in tokenized_reviews:
        distinct_words.update(tokens)
    return distinct_words, tokenized_reviews

示例#2

0

显示文件

文件： train_keras_embedding.py 项目： Zhiyu-Chen/CNN-yelp-challenge-2016-sentiment-classification

def get_volcabulary_and_list_words(data):
    """Return the vocabulary and the tokenized form of every review.

    Reviews are read from ``data["text"]`` and tokenized with
    ``Word2VecUtility.review_to_wordlist(..., remove_stopwords=True)``.

    Returns:
        ``(volcabulary, reviews_words)`` where ``volcabulary`` is a set of
        every word produced for any review and ``reviews_words`` holds one
        tokenization result per review, in input order.
    """
    reviews_words = []
    for text in data["text"]:
        reviews_words.append(
            Word2VecUtility.review_to_wordlist(text, remove_stopwords=True))
    # Flatten the per-review results into a single de-duplicated set.
    volcabulary = {word for words in reviews_words for word in words}
    return volcabulary, reviews_words

示例#3

0

显示文件

文件： BagOfWords.py 项目： RealityCtrl/chatbot

    #input("Press Enter to continue...")

    #print ('Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...')
    #nltk.download()  # Download text data sets, including stop words

    # Initialize an empty list to hold the clean reviews
    clean_train_reviews = []

    # Loop over each review; create an index i that goes from 0 to the length
    # of the movie review list

    print("Cleaning and parsing the training set movie reviews...\n")
    for i in range(0, len(train["review"])):
        clean_train_reviews.append(" ".join(
            Word2VecUtility.review_to_wordlist(train["review"][i], True)))

    # ****** Create a bag of words from the training set
    #
    print("Creating the bag of words...\n")

    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool.
    vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000)

    # fit_transform() does two functions: First, it fits the model
    # and learns the vocabulary; second, it transforms our training data

示例#4

0

显示文件

文件： transform.py 项目： Sapphirine/stackexchange

# Normalise delId to a plain list of integer strings so its entries can be
# compared with the string "Id" attributes read from the XML below.
delId = np.array(delId).tolist()
delId = [str(int(i)) for i in delId]

# Parse the Posts.xml dump and keep (PostTypeId, CreationDate, Body) for every
# row whose PostTypeId is '2' and whose Id appears in neither aaId nor delId.
post_tree=ET.parse('/Users/XW/Desktop/datascience.stackexchange.com/Posts.xml')
post=[(i.attrib.get("PostTypeId"),i.attrib.get("CreationDate"),i.attrib.get("Body") ) for i in post_tree.getroot() if i.attrib.get("PostTypeId") =='2' 
       and i.attrib.get("Id") not in aaId and i.attrib.get("Id") not in delId] 
# Tabulate the selected rows and pull out the Body column for cleaning.
post_frame=DataFrame(post,columns=['PostTypeId','CreationDate','Body'])
post_body=post_frame.loc[:,'Body']

clean_post = []
print("Cleaning and parsing the posts...\n")
for i in xrange(0, len(post_body)):
    # Drop newlines and HTML markup so only the post's plain text remains.
    tmp = BeautifulSoup(post_body[i].replace('\n', ""), 'html.parser').get_text()
    if tmp == '':
        # Nothing to tokenize; skip so that no empty file is written.
        continue
    clean_post = " ".join(Word2VecUtility.review_to_wordlist(tmp, True))
    # One output file per post, named after the post's row index.  The
    # context manager closes (and therefore flushes) each file right after
    # it is written, so no handles leak and every file is complete on disk
    # before anything reads the directory back.
    with open('/Users/XW/Desktop/datascience.stackexchange.com/parse/' + str(i), 'w') as f:
        f.write(clean_post.encode('utf-8'))


import textmining
import os
xDIR = '/Users/XW/Desktop/datascience.stackexchange.com/parse'
def termdocumentmatrix_example(xDIR):
    
    # Initialize class to create term-document matrix
    count=0
    tdm = textmining.TermDocumentMatrix()
    for i in os.listdir(xDIR):
        Res = tdm.add_doc(open(os.path.join(xDIR,i)).read())

示例#5

0

显示文件

文件： transform.py 项目： Sapphirine/stackexchange

    '/Users/XW/Desktop/datascience.stackexchange.com/Posts.xml')
post = [(i.attrib.get("PostTypeId"), i.attrib.get("CreationDate"),
         i.attrib.get("Body")) for i in post_tree.getroot()
        if i.attrib.get("PostTypeId") == '2' and i.attrib.get("Id") not in aaId
        and i.attrib.get("Id") not in delId]
post_frame = DataFrame(post, columns=['PostTypeId', 'CreationDate', 'Body'])
post_body = post_frame.loc[:, 'Body']

clean_post = []
print("Cleaning and parsing the posts...\n")
for i in xrange(0, len(post_body)):
    # Remove newlines, then strip the HTML markup to get plain text.
    tmp = BeautifulSoup(post_body[i].replace('\n', ""),
                        'html.parser').get_text()
    if tmp == '':
        # Posts with no text left are skipped; no file is created for them.
        continue
    clean_post = " ".join(Word2VecUtility.review_to_wordlist(tmp, True))
    # Write the cleaned text to a file named after the row index.  Using a
    # ``with`` block guarantees the handle is closed and its buffer flushed
    # on every iteration instead of being left open.
    with open('/Users/XW/Desktop/datascience.stackexchange.com/parse/' + str(i),
              'w') as f:
        f.write(clean_post.encode('utf-8'))

import textmining
import os
xDIR = '/Users/XW/Desktop/datascience.stackexchange.com/parse'


def termdocumentmatrix_example(xDIR):

    # Initialize class to create term-document matrix
    count = 0
    tdm = textmining.TermDocumentMatrix()
    for i in os.listdir(xDIR):

示例#6

0

显示文件

# Vocabulary of the trained model as a set, for fast membership tests.
words_set = set(model.index2word)

# Give every vocabulary word an integer id, shifted up by index_from, and
# build the reverse (id -> word) table from it.
word2index = {word: offset + index_from for offset, word in enumerate(words_set)}
index2word = {idx: word for word, idx in word2index.items()}

# Ids 0, 1 and 2 map to the placeholder tokens '0', '1' and '2'.  Their
# vectors live in a separate dict because a 'Word2Vec' object does not
# support item assignment; each placeholder gets a random vector of
# num_features standard-normal values.
padding_model = {}
for placeholder in ('0', '1', '2'):
    index2word[int(placeholder)] = placeholder
    padding_model[placeholder] = np.random.standard_normal(num_features)


# Encode each review as a list of integer ids: the `start` id first, then one
# id per token (the word's shifted index, or `oov` for words outside the
# model vocabulary).
reviews_words = []
for review in data["text"]:
    tokens = Word2VecUtility.review_to_wordlist(review, remove_stopwords=True)
    review_words = [start]
    review_words.extend(word2index[w] if w in words_set else oov for w in tokens)
    # Capping ids to the max_words most frequent entries is currently disabled:
    #   review_words = [oov if (ix > (max_words + index_from)) else ix for ix in review_words]
    reviews_words.append(review_words)

# Pad with 0 / truncate at the end so every review has exactly max_length ids.
reviews_words = sequence.pad_sequences(
    reviews_words,
    maxlen=max_length,
    padding='post',
    truncating='post',
)
print(reviews_words.shape)


# In[47]:


# Allocate (without initializing) the feature tensor: one num_features-sized
# slot per token position of every padded review, i.e. shape
# (number of reviews, max_length, num_features).
data_matrix = np.empty((reviews_words.shape[0], max_length, num_features))
print(data_matrix.shape)

示例#7

0

显示文件

def getCleanReviews(reviews):
    """Tokenize every entry of ``reviews["Paper_content"]``.

    Each entry is passed to ``Word2VecUtility.review_to_wordlist`` with
    ``remove_stopwords=True``.

    Returns:
        A list with one tokenization result per entry, in input order.
    """
    return [
        Word2VecUtility.review_to_wordlist(content, remove_stopwords=True)
        for content in reviews["Paper_content"]
    ]

示例#8

0

显示文件

文件： BagOfWords.py 项目： saniyasaifee/MS_Thesis

import numpy as np

if __name__ == '__main__':
    # Build the data file paths relative to this script.  The directory and
    # file name must all be arguments of os.path.join(); if they are passed
    # to read_csv instead, they are taken as its separator/delimiter
    # arguments and the call fails.
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    # Tab-separated files with a header row; quoting=3 is csv.QUOTE_NONE.
    train = pd.read_csv(os.path.join(data_dir, 'labeledTrainData.tsv'),
                        header=0, delimiter="\t", quoting=3)
    test = pd.read_csv(os.path.join(data_dir, 'testData.tsv'),
                       header=0, delimiter="\t", quoting=3)
    print('The first review is:')
    print(train["review"][0])
    raw_input("Press Enter to continue..")

    # Clean every training review and store it as one space-separated string.
    clean_train_reviews = []
    print("Cleaning and parsing the training set movie reviews...\n")
    for review in train["review"]:
        # The words must be joined with a space: CountVectorizer tokenizes
        # each string again, so joining with "" would fuse a whole review
        # into a single token.
        clean_train_reviews.append(
            " ".join(Word2VecUtility.review_to_wordlist(review, True)))

    # ****** Create a bag of words from the training set
    print("Creating the bag of words...\n")
    # CountVectorizer is scikit-learn's bag-of-words tool; keep at most
    # 5000 features and leave tokenizing/preprocessing at the defaults.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)
    # fit_transform does two things: it fits the model (learns the
    # vocabulary) and transforms the training data into feature vectors.
    # Its input should be a list of strings.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)
    # Convert the result to a dense numpy array.
    train_data_features = train_data_features.toarray()

示例#9

0

显示文件

from Word2VecUtility import Word2VecUtility
import sklearn
import sklearn.feature_extraction

# Read the Posts.xml dump and collect (PostTypeId, CreationDate, Body) for
# every row under the document root.
post_tree = ET.parse(
    '/Users/Zhen/Desktop/Courses/BigData/stackexchange/data/Posts.xml')
post = []
for row in post_tree.getroot():
    attrs = row.attrib
    post.append((attrs.get("PostTypeId"),
                 attrs.get("CreationDate"),
                 attrs.get("Body")))
post_frame = DataFrame(post, columns=['PostTypeId', 'CreationDate', 'Body'])
post_body = post_frame.loc[:, 'Body']

# Turn every post body into one space-separated string of cleaned words and
# save the result as post_body.csv.
print("Cleaning and parsing the posts...\n")
clean_post = [
    " ".join(Word2VecUtility.review_to_wordlist(body, True))
    for body in post_body
]
clean_postdf = pd.DataFrame(clean_post)
clean_postdf.to_csv('post_body.csv', sep=',', encoding='utf-8')

# Set up scikit-learn's bag-of-words tool, limited to 5000 features.
vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
    min_df=1,
)
# fit_transform() first fits the model (learns the vocabulary) and then
# transforms the data into feature vectors.  It expects a list of strings.
data_features = vectorizer.fit_transform(clean_post)

示例#10

0

显示文件

文件： postdataProcessing.py 项目： Sapphirine/stackexchange

from time import mktime
import sys
sys.path.insert(0, '/Users/Zhen/Desktop/Courses/BigData/stackexchange/')
from Word2VecUtility import Word2VecUtility
import sklearn
import sklearn.feature_extraction

# Parse the Posts.xml dump; each row contributes its PostTypeId, CreationDate
# and Body attributes (None when an attribute is absent).
post_tree = ET.parse('/Users/Zhen/Desktop/Courses/BigData/stackexchange/data/Posts.xml')
wanted_attributes = ("PostTypeId", "CreationDate", "Body")
post = [
    tuple(row.attrib.get(name) for name in wanted_attributes)
    for row in post_tree.getroot()
]
post_frame = DataFrame(post, columns=['PostTypeId', 'CreationDate', 'Body'])
post_body = post_frame.loc[:, 'Body']

# Clean each post body into a single space-separated string of words.
clean_post = []
print("Cleaning and parsing the posts...\n")
for body in post_body:
    cleaned_words = Word2VecUtility.review_to_wordlist(body, True)
    clean_post.append(" ".join(cleaned_words))
# Persist the cleaned posts as post_body.csv.
clean_postdf = pd.DataFrame(clean_post)
clean_postdf.to_csv('post_body.csv', sep=',', encoding='utf-8')


# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    analyzer="word", tokenizer=None, preprocessor=None,
    stop_words=None, max_features=5000, min_df=1)
# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.

示例#11

0

显示文件

文件： train_with_word2vec_embedding.py 项目： 12190143/CNN-yelp-challenge-2016-sentiment-classification

# Assign each word in words_set an integer id offset by index_from, then
# invert the mapping to get id -> word.
word2index = dict(
    (word, position + index_from) for position, word in enumerate(words_set))
index2word = dict((idx, word) for word, idx in word2index.items())
# Ids 0, 1 and 2 are mapped to the placeholder tokens '0', '1' and '2'.
index2word.update({0: '0', 1: '1', 2: '2'})

# The placeholder vectors are kept in their own dict because a 'Word2Vec'
# object does not support item assignment.  Each one is a random vector of
# num_features standard-normal values.
padding_model = {
    token: np.random.standard_normal(num_features)
    for token in ('0', '1', '2')
}

data = pd.read_csv('review_sub_399850.tsv', header=0, delimiter="\t", quoting=3, encoding='utf-8')

# Ids at or above this limit fall outside the capped vocabulary and are
# replaced by `oov`.  The limit does not change, so compute it once.
index_limit = max_words + index_from

reviews_words = []
for review in data["text"]:
    review_words = Word2VecUtility.review_to_wordlist(review, remove_stopwords=True)
    # Every id in word2index already includes the index_from offset.  Look
    # each word up in word2index itself and fall back to `oov`; guarding the
    # lookup with a membership test on a different container (the model)
    # would raise KeyError for any word the two disagree on.
    review_words = [start] + [word2index.get(w, oov) for w in review_words]
    # Apply the vocabulary cap to the whole sequence, `start` id included.
    review_words = [oov if ix >= index_limit else ix for ix in review_words]
    reviews_words.append(review_words)

# Pad with 0 / truncate at the end so every review has exactly max_length ids.
reviews_words = sequence.pad_sequences(reviews_words, maxlen=max_length, padding='post', truncating='post')

print reviews_words[:20, :12]
print reviews_words.shape

labels = data["stars"]
# print labels[:10], labels.shape
labels[labels <= 3] = 0

示例#12

0

显示文件

    # Read data from files
    train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data2',
                                     'labeledTrainData.csv'),
                        header=0,
                        delimiter="\t",
                        quoting=3)
    test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data2',
                                    'testData.csv'),
                       header=0,
                       delimiter="\t",
                       quoting=3)

    print "Cleaning training content"
    clean_train_reviews = []
    for review in train["Paper_content"]:
        clean_train_reviews.append( Word2VecUtility.review_to_wordlist( review, \
            remove_stopwords=True ))

    print "Cleaning test content"
    clean_test_reviews = []
    for review in test["Paper_content"]:
        clean_test_reviews.append( Word2VecUtility.review_to_wordlist( review, \
            remove_stopwords=True ))

    # ****** Create bags of centroids
    #
    # Pre-allocate an array for the training set bags of centroids (for speed)
    train_centroids = np.zeros( (train["Paper_content"].size, num_clusters), \
        dtype="float32" )

    # Transform the training set reviews into bags of centroids
    counter = 0

示例#13

0

显示文件

文件： word2vec_model.py 项目： GongQin721/CNN-yelp2016-sentiment-analysis-master

# In[15]:


review_words = []
# Inspect the model vocabulary: its container type, its size and its first
# 100 entries.
vocab_entries = model.index2word
print(type(vocab_entries))
print(len(vocab_entries))
print(vocab_entries[:100])
# Set form of the vocabulary for fast membership tests; print its size.
index2word_set = set(vocab_entries)
print(len(index2word_set))


# In[16]:


# Tokenize the text of the first row and report, word by word, whether it is
# present in the model vocabulary.
first_review_text = data.iloc[0]['text']
words = Word2VecUtility.review_to_wordlist(first_review_text)
print(words)
for word in words:
    in_vocabulary = word in index2word_set
    print(in_vocabulary)


# In[8]:


# Copy the star ratings into an array and show the first ten with the shape
# (printed together as a single tuple).
clean_labels = np.array(data["stars"])
preview = (clean_labels[:10], clean_labels.shape)
print(preview)

# Binarize in place: ratings of 3 or lower become 0, ratings above 3 become 1.
low_rating = clean_labels <= 3
clean_labels[low_rating] = 0
high_rating = clean_labels > 3
clean_labels[high_rating] = 1
print(clean_labels[:10])

# Number of positive reviews.
positive_count = (clean_labels == 1).sum()
print(positive_count)