def get_avgfeatures_word2vec(
        data, column, model, num_features=300,
        writeFeaturesFileName="./model/imdb_avgfeatures.pickle"):
    """Return one feature vector per review, using an on-disk pickle cache.

    If ``writeFeaturesFileName`` already exists, its unpickled content is
    returned as-is and ``data``, ``column``, ``model`` and ``num_features``
    are not consulted. Otherwise the reviews are extracted with
    ``read_article.data_to_reviews(data, column)``, one row per review is
    filled with ``makeAvgVec(review, model)``, and the resulting matrix is
    pickled to ``writeFeaturesFileName`` before being returned.

    Args:
        data: Forwarded to ``read_article.data_to_reviews``.
        column: Forwarded to ``read_article.data_to_reviews``.
        model: Forwarded to ``makeAvgVec`` for every review.
        num_features: Number of columns of the preallocated matrix.
            NOTE(review): it is not passed to ``makeAvgVec``; presumably
            that helper returns vectors of this length -- confirm.
        writeFeaturesFileName: Path of the pickle cache file.

    Returns:
        The cached object when the cache file exists; otherwise a float32
        numpy array of shape ``(len(reviews), num_features)``.
    """
    if os.path.isfile(writeFeaturesFileName):
        # Pickle streams are binary, so the file must be opened in 'rb';
        # text mode fails on Python 3. The context manager also closes the
        # handle, which the bare open() call never did.
        # NOTE(review): unpickling can execute arbitrary code -- only point
        # this at cache files produced by this project.
        with open(writeFeaturesFileName, 'rb') as cacheFile:
            return cPickle.load(cacheFile)

    reviews = read_article.data_to_reviews(data, column)
    numReviews = len(reviews)

    # Preallocate the whole matrix once instead of growing it per review.
    reviewFeatureVecs = np.zeros((numReviews, num_features), dtype="float32")

    for counter, review in enumerate(reviews):
        # Progress message every 1000th review.
        if counter % 1000 == 0:
            print("Review %d of %d" % (counter, numReviews))
        reviewFeatureVecs[counter] = makeAvgVec(review, model)

    # Write in binary mode for the same reason the cache is read in 'rb'.
    with open(writeFeaturesFileName, 'wb') as cacheFile:
        cPickle.dump(reviewFeatureVecs, cacheFile)
    return reviewFeatureVecs
def get_indices_word2vec(data, column, model, maxLength=50,
                         writeIndexFileName="./model/word2vec_indices.pickle",
                         padLeft=True, keep_freqwords=[]):
    """Return one index vector per review, using an on-disk pickle cache.

    If ``writeIndexFileName`` already exists, its unpickled content is
    returned as-is and every other argument is ignored. Otherwise the
    reviews are extracted with ``read_article.data_to_reviews``, one row per
    review is filled with ``makeIndexVec(review, model, maxLength,
    padLeft=padLeft)``, and the resulting matrix is pickled to
    ``writeIndexFileName`` before being returned.

    Args:
        data: Forwarded to ``read_article.data_to_reviews``.
        column: Forwarded to ``read_article.data_to_reviews``.
        model: Forwarded to ``makeIndexVec`` for every review.
        maxLength: Number of columns of the preallocated matrix; also
            forwarded to ``makeIndexVec``.
        writeIndexFileName: Path of the pickle cache file.
        padLeft: Forwarded to ``makeIndexVec`` as ``padLeft``.
        keep_freqwords: Forwarded unchanged to
            ``read_article.data_to_reviews``. The default list is shared
            between calls; this function itself never mutates it.

    Returns:
        The cached object when the cache file exists; otherwise an int32
        numpy array of shape ``(len(reviews), maxLength)``.
    """
    if os.path.isfile(writeIndexFileName):
        # Use a context manager so the cache file handle is closed
        # deterministically instead of being leaked.
        # NOTE(review): unpickling can execute arbitrary code -- only point
        # this at cache files produced by this project.
        with open(writeIndexFileName, 'rb') as cacheFile:
            return pickle.load(cacheFile)

    reviews = read_article.data_to_reviews(
        data, column, keep_freqwords=keep_freqwords)
    numReviews = len(reviews)

    # Preallocate the whole matrix once instead of growing it per review.
    reviewIndexVecs = np.zeros((numReviews, maxLength), dtype="int32")

    for counter, review in enumerate(reviews):
        # Progress message every 1000th review.
        if counter % 1000 == 0:
            print("Review %d of %d" % (counter, numReviews))
        # Build this review's index vector via makeIndexVec.
        reviewIndexVecs[counter] = makeIndexVec(
            review, model, maxLength, padLeft=padLeft)

    with open(writeIndexFileName, 'wb') as cacheFile:
        pickle.dump(reviewIndexVecs, cacheFile)
    return reviewIndexVecs