def w2v_train(test=True):
    """Build a Word2Vec model from the review corpus and save it twice.

    The corpus path and both output paths are looked up with
    ``get_constant`` using the ``test`` flag. The model is created with
    ``size=200`` and then written with ``save`` and, as a non-binary
    word2vec-format file, with ``save_word2vec_format``.

    Args:
        test: Flag forwarded to ``get_constant`` for every path lookup.
    """
    path_keys = ('W2V_SRC_PATH', 'W2V_MODEL_PATH', 'W2V_MODEL_TXT_PATH')
    corpus_path, native_path, text_path = [
        get_constant(key, test) for key in path_keys
    ]

    # MySentences is described by the original author as a
    # memory-friendly iterator over the corpus file.
    corpus = MySentences(corpus_path)
    w2v = gensim.models.Word2Vec(corpus, size=200)

    w2v.save(native_path)
    w2v.save_word2vec_format(text_path, binary=False)
def scrape(stop_at=None, train=0.85, test=True):
    """Extract the review text and rating from the raw review dump.

    Each input line is parsed as JSON. Its ``reviewText`` is stripped of
    punctuation, lower-cased and stripped of stop words. The cleaned
    review, together with the ``overall`` rating, is written as one JSON
    line to the training file for the first ``train`` fraction of the
    reviews and to the validation file for the remainder. Every cleaned
    review is also written as plain text to the Word2Vec source file.

    Progress is printed roughly once per percent of the input.

    Args:
        stop_at: Optional progress percentage at which to stop early. It
            is compared against the last printed progress value.
        train: Fraction of the total reviews routed to the training file.
        test: Flag forwarded to ``get_constant`` for every path lookup.
    """
    HUNDRED = 100
    data_path = get_constant('DATA_PATH', test)
    train_path = get_constant('TRAIN_PATH', test)
    validate_path = get_constant('VALIDATE_PATH', test)
    reviews_path = get_constant('W2V_SRC_PATH', test)

    total_num_reviews = num_of_reviews(data_path)
    # Clamp to 1 so inputs with fewer than 100 reviews do not trigger a
    # modulo-by-zero in the progress check below.
    unit_percent_reviews = max(1, int(total_num_reviews / HUNDRED))
    train_limit = train * total_num_reviews

    progress = 0
    # The context managers guarantee that all four files are closed even
    # when parsing or cleaning a line raises.
    with open(data_path, 'r') as data_file, \
            open(train_path, 'w') as train_file, \
            open(validate_path, 'w') as validate_file, \
            open(reviews_path, 'w') as reviews_file:
        for count, line in enumerate(data_file, 1):
            data = json.loads(line.strip())
            review = del_punctuations(data['reviewText'])
            review = review.lower()
            review = del_stopwords(review)
            output = {"overall": data['overall'], "reviewText": review}

            target = train_file if count <= train_limit else validate_file
            target.write(json.dumps(output) + '\n')
            # NOTE(review): reconstructed as running for every review,
            # not only validation ones - confirm against the original
            # layout.
            reviews_file.write(review + '\n')

            if count % unit_percent_reviews == 0:
                progress = int(count * HUNDRED / float(total_num_reviews))
                print('{}%'.format(progress))
            elif count == total_num_reviews:
                print('100%')

            if stop_at and progress == stop_at:
                break
def test_cascade(mode=myconstants.Mode.MODE_MEAN, classification=None):
    """Compare cascaded prediction against direct prediction per rating.

    For every validation review the true rating is taken as the 1-based
    index of the largest entry of its rating vector. The cascade
    repeatedly calls ``predict`` on the current class grouping and
    narrows it with ``form_classes`` until a single class is left; the
    direct path calls ``predict`` once with ``myconstants.CLASSES``.
    Correct predictions are counted per rating for both approaches.

    Args:
        mode: One of the ``myconstants.Mode`` constants; selects the
            embedding model and vector dimension.
        classification: Initial class grouping for the cascade. It must
            be a sequence; the default ``None`` fails as soon as there
            is a review to classify.

    Returns:
        A tuple ``(accuracy_normal, accuracy_cascade)`` of dicts mapping
        rating (1-5) to the number of correct predictions.

    Raises:
        ValueError: If ``mode`` is not one of the supported modes.
    """
    # Imported once here instead of on every loop iteration.
    import operator

    test = False
    accuracy_path = get_constant('ACCURACY_PATH', test)
    w2v_model_path = get_constant('W2V_MODEL_PATH', test)
    d2v_model_path = get_constant('D2V_MODEL_PATH', test)

    # Load model
    if mode == myconstants.Mode.MODE_PCA:
        vec_dim = myconstants.Mode.PCA_COMPONENTS * myconstants.W2V_DIM
        model = gensim.models.Word2Vec.load(w2v_model_path)
    elif mode == myconstants.Mode.MODE_MEAN:
        vec_dim = myconstants.W2V_DIM
        model = gensim.models.Word2Vec.load(w2v_model_path)
    elif mode == myconstants.Mode.MODE_D2V:
        vec_dim = myconstants.D2V_DIM
        model = gensim.models.Doc2Vec.load(d2v_model_path)
    else:
        # Previously fell through and failed later on an unbound local.
        raise ValueError('unsupported mode: {!r}'.format(mode))

    accuracy_ratings, accuracy_reviews = get_validate_data(
        accuracy_path, model, mode)

    accuracy_normal = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
    accuracy_cascade = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
    largest_entry = operator.itemgetter(1)

    for idx, reviews in enumerate(accuracy_reviews):
        # Position of the largest entry, shifted to a 1-based rating.
        rating, _ = max(enumerate(accuracy_ratings[idx]), key=largest_entry)
        rating += 1

        # Cascade: keep narrowing the grouping until one class remains.
        temp_classes = classification
        while len(temp_classes) > 1:
            p = predict(reviews, temp_classes, vec_dim, mode)
            temp_classes = form_classes(p)

        cascade_rating = temp_classes[0][0]
        if cascade_rating == rating:
            accuracy_cascade[cascade_rating] += 1

        # Direct prediction over the full class list for comparison.
        p = predict(reviews, myconstants.CLASSES, vec_dim, mode)
        if p[0] == rating:
            accuracy_normal[p[0]] += 1

        print(idx)

    print('{} {}'.format(accuracy_normal, accuracy_cascade))
    return accuracy_normal, accuracy_cascade
def predict(sentence):
    """Interactively predict and print the rating class of one review.

    The sentence is cleaned (punctuation removed, lower-cased, stop words
    removed). While the cleaned sentence is empty the user is prompted
    for a replacement. The cleaned sentence is turned into a vector with
    ``sentence_mean``, multiplied with the saved weights, and the class
    with the highest softmax score is printed.

    If the saved weight files cannot be read, a hint is printed and the
    function returns without predicting.

    Args:
        sentence: Raw review text.

    Raises:
        ValueError: If the module-level ``mode`` is not a supported mode.
    """
    # Clean, and re-prompt for as long as nothing meaningful is left.
    while True:
        print('your review is : %s' % sentence)
        sentence = ml.del_punctuations(sentence)
        sentence = sentence.lower()
        sentence = ml.del_stopwords(sentence)
        if sentence != '':
            break
        sentence = raw_input(
            "Meaningless sentence! Please enter another sentence :")
    print('your key words for vector are : %s' % sentence)

    test = False
    w2v_model_path = get_constant('W2V_MODEL_PATH', test)
    d2v_model_path = get_constant('D2V_MODEL_PATH', test)

    # NOTE(review): ``mode`` is not a parameter of this function; it is
    # presumably a module-level variable - confirm.
    if mode == myconstants.Mode.MODE_PCA:
        vec_dim = myconstants.Mode.PCA_COMPONENTS * myconstants.W2V_DIM
        model = gensim.models.Word2Vec.load(w2v_model_path)
    elif mode == myconstants.Mode.MODE_MEAN:
        vec_dim = myconstants.W2V_DIM
        model = gensim.models.Word2Vec.load(w2v_model_path)
    elif mode == myconstants.Mode.MODE_D2V:
        vec_dim = myconstants.D2V_DIM
        model = gensim.models.Doc2Vec.load(d2v_model_path)
    else:
        raise ValueError('unsupported mode: {!r}'.format(mode))

    # NOTE(review): the vector always comes from sentence_mean, whatever
    # the mode; the reshape below fails if its length is not vec_dim.
    vector = np.array(sentence_mean(sentence, model))
    vector.shape = [1, vec_dim]

    try:
        weights = np.load(get_save_name(myconstants.NUMPY_W, mode))
        biases = np.load(get_save_name(myconstants.NUMPY_B, mode))
    except (IOError, OSError):
        # Stop here: the original fell through and crashed on unbound
        # W / b right after printing this hint.
        print('you have not trained your model yet!')
        return

    # The context manager closes the session even if evaluation fails.
    with tf.Session() as sess:
        x = tf.Variable(vector)
        W = tf.Variable(weights)
        b = tf.Variable(biases)
        y = tf.argmax(tf.nn.softmax(tf.matmul(x, W) + b), 1)
        sess.run(tf.initialize_all_variables())
        result = sess.run(y)

    total_class = myconstants.CLASSES
    # ``result`` is a one-element array; index the list with a plain int.
    print('predicted rating :  {}'.format(total_class[int(result[0])]))
    print('')
def d2v_train(test=True, combine=False):
    """Train a Doc2Vec model over the labelled review corpus and save it.

    The model starts with ``alpha`` and ``min_alpha`` both at 0.025 and is
    trained for five passes. After each pass ``alpha`` is lowered by 0.002
    and ``min_alpha`` is set to the same value.

    Args:
        test: Flag forwarded to ``get_constant`` for every path lookup.
        combine: When true, the accuracy file (tagged ``'TEST'`` and
            ``'JSON'``) is added to the corpus next to the training file.
    """
    train_path = get_constant('D2V_SRC_PATH', test)
    accuracy_path = get_constant('ACCURACY_PATH', test)
    model_path = get_constant('D2V_MODEL_PATH', test)

    # The training source is always present; the accuracy file is optional.
    sources = [[train_path, 'TRAIN']]
    if combine:
        sources.append([accuracy_path, 'TEST', 'JSON'])
    corpus = LabeledLineSentence(sources)

    # Equal alpha and min_alpha: the rate is stepped down by hand below.
    d2v = Doc2Vec(alpha=0.025, min_alpha=0.025)
    d2v.build_vocab(corpus)
    for _ in range(5):
        d2v.train(corpus)
        d2v.alpha -= 0.002
        d2v.min_alpha = d2v.alpha

    d2v.save(model_path)
def test_similarity(stop_at=None, max_count=2, test=True,
                    mode=myconstants.Mode.MODE_PCA):
    """Collect sentence representations per rating and print a comparison.

    Reads the training file line by line, converts each review into a
    representation (PCA or mean, depending on ``mode``) and hands it to
    ``form_sen_list`` together with its rating. When ``form_sen_list``
    reports that the collection is full, or ``stop_at`` lines have been
    read, ``print_comparison`` is called, the line count is printed and
    reading stops.

    Args:
        stop_at: Optional number of lines after which to stop.
        max_count: Forwarded to ``form_sen_list`` and ``print_comparison``.
        test: Flag forwarded to ``get_constant`` for the training path.
            The Word2Vec model path is always looked up with
            ``test=False``.
        mode: ``MODE_PCA`` or ``MODE_MEAN`` from ``myconstants.Mode``.

    Raises:
        ValueError: If ``mode`` is neither ``MODE_PCA`` nor ``MODE_MEAN``.
    """
    sentence_path = get_constant('TRAIN_PATH', test)
    model_path = get_constant('W2V_MODEL_PATH', test=False)
    model = gensim.models.Word2Vec.load(model_path)

    sen_list = {i: [] for i in range(1, 6)}

    # The context manager closes the file on early exit and on errors;
    # the original never closed it.
    with open(sentence_path, 'r') as train_file:
        for count, line in enumerate(train_file, 1):
            data = json.loads(line.strip())
            current_rating = data['overall']
            sentence = data['reviewText']

            if mode == myconstants.Mode.MODE_PCA:
                sen_represent = sentence_pca(sentence, model)
            elif mode == myconstants.Mode.MODE_MEAN:
                sen_represent = sentence_mean(sentence, model)
            else:
                # Previously failed later on an unbound local.
                raise ValueError('unsupported mode: {!r}'.format(mode))

            sen_list, full_flag = form_sen_list(
                sen_list, sen_represent, current_rating, max_count)

            if full_flag or (stop_at and stop_at == count):
                print_comparison(sen_list, max_count)
                print(count)
                break
    return
def next_batch(size, test, model, mode):
    """Draw a random training batch of sentence vectors and rating vectors.

    ``size`` line indices are drawn at random from ``CHOICES_TEST`` (when
    ``test`` is true) or ``CHOICES``. For each index the matching line of
    the training file is parsed and converted into a representation
    according to ``mode``. When the conversion yields ``None``, another
    random line is tried until one succeeds.

    Args:
        size: Number of samples in the batch.
        test: Selects the index pool; also forwarded to ``get_constant``.
        model: Embedding model passed to the sentence conversion helpers.
        mode: One of the ``myconstants.Mode`` constants.

    Returns:
        A two-element list ``[representations, ratings]`` of numpy
        arrays; ratings are the ``one_hot`` encoding of each ``overall``.

    Raises:
        ValueError: If ``mode`` is not a supported mode.
    """
    from random import choice

    choices = CHOICES_TEST if test else CHOICES
    picks = [choice(choices) for _ in range(size)]

    sentence_path = get_constant('TRAIN_PATH', test)
    with open(sentence_path, 'r') as train_file:
        lines = train_file.readlines()

    mean_batch = []
    rating_batch = []
    for current_i in picks:
        sen_represent = None
        # Identity check: ``== None`` on a numpy array compares
        # element-wise and gives no single truth value.
        while sen_represent is None:
            data = json.loads(lines[current_i])
            rating = data['overall']
            sentence = data['reviewText']
            if mode == myconstants.Mode.MODE_PCA:
                sen_represent = sentence_pca(sentence, model)
            elif mode == myconstants.Mode.MODE_MEAN:
                sen_represent = sentence_mean(sentence, model)
            elif mode == myconstants.Mode.MODE_D2V:
                sen_represent = sentence_d2v(sentence, current_i, model)
            else:
                # Without this the representation stays None and the
                # retry loop never terminates.
                raise ValueError('unsupported mode: {!r}'.format(mode))
            # Line to try next if this one produced no representation.
            current_i = choice(choices)

        rating_vec = one_hot(rating)
        mean_batch.append(sen_represent)
        rating_batch.append(rating_vec)

    return [np.array(mean_batch), np.array(rating_batch)]
def tf_train(test=True, learning_rate=0.5, mode=myconstants.Mode.MODE_MEAN,
             loops=1000):
    """Train a softmax classifier on sentence vectors and save its weights.

    The weights ``W`` and bias ``b`` are loaded from the saved numpy
    files when available and randomly initialised otherwise. The model
    is trained with gradient descent on batches of 100 samples from
    ``next_batch``. Every 10 loops the accuracy on the validation data is
    printed and appended to a result file, whose name is built from the
    result directory, the mode name and the class grouping in
    ``myconstants.CLASSES``. After training, ``W`` and ``b`` are saved
    back with ``np.save``.

    Args:
        test: Flag forwarded to ``get_constant`` and ``next_batch``.
        learning_rate: Step size for the gradient descent optimiser.
        mode: One of the ``myconstants.Mode`` constants; selects the
            embedding model and vector dimension.
        loops: Number of training steps.

    Raises:
        ValueError: If ``mode`` is not a supported mode.
    """
    import os

    accuracy_path = get_constant('ACCURACY_PATH', test)
    w2v_model_path = get_constant('W2V_MODEL_PATH', test)
    d2v_model_path = get_constant('D2V_MODEL_PATH', test)
    result_folder = get_constant('RESULT_DIR', test)

    # Result file name: <dir><mode name>_<digits of class 1>_<...>.txt
    result_path = result_folder + myconstants.Mode.MODE_NAME[mode]
    for cla in myconstants.CLASSES:
        result_path += '_' + ''.join(str(rating) for rating in cla)
    result_path += '.txt'

    appending = os.path.isfile(result_path)
    # The context manager closes the result file even when model loading
    # or training raises.
    with open(result_path, 'a' if appending else 'w') as result_file:
        if appending:
            # Blank lines separate this run from earlier ones.
            result_file.write('\n\n')

        # Load model
        if mode == myconstants.Mode.MODE_PCA:
            vec_dim = myconstants.Mode.PCA_COMPONENTS * myconstants.W2V_DIM
            model = gensim.models.Word2Vec.load(w2v_model_path)
        elif mode == myconstants.Mode.MODE_MEAN:
            vec_dim = myconstants.W2V_DIM
            model = gensim.models.Word2Vec.load(w2v_model_path)
        elif mode == myconstants.Mode.MODE_D2V:
            vec_dim = myconstants.D2V_DIM
            model = gensim.models.Doc2Vec.load(d2v_model_path)
        else:
            # Previously failed later on an unbound local.
            raise ValueError('unsupported mode: {!r}'.format(mode))

        accuracy_ratings, accuracy_reviews = get_validate_data(
            accuracy_path, model, mode)

        # tf training graph
        graph = tf.Graph()
        with graph.as_default():
            sess = tf.InteractiveSession()
            try:
                try:
                    W = tf.Variable(
                        np.load(get_save_name(myconstants.NUMPY_W, mode)))
                    b = tf.Variable(
                        np.load(get_save_name(myconstants.NUMPY_B, mode)))
                    print('Exist {}: {}'.format(
                        myconstants.Mode.MODE_NAME[mode],
                        myconstants.CLASSES))
                except Exception:
                    # Any load failure means "start from scratch", but
                    # KeyboardInterrupt / SystemExit are not swallowed.
                    W = tf.Variable(
                        tf.truncated_normal([vec_dim, TOTAL_CLASSES]))
                    b = tf.Variable(tf.truncated_normal([TOTAL_CLASSES]))
                    print('New {}: {}'.format(
                        myconstants.Mode.MODE_NAME[mode],
                        myconstants.CLASSES))

                x = tf.placeholder(tf.float32, [None, vec_dim])
                y = tf.matmul(x, W) + b
                y_ = tf.placeholder(tf.float32, [None, TOTAL_CLASSES])

                cross_entropy = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(y, y_))
                train_step = tf.train.GradientDescentOptimizer(
                    learning_rate).minimize(cross_entropy)

                correct_prediction = tf.equal(
                    tf.argmax(y, 1), tf.argmax(y_, 1))
                accuracy = tf.reduce_mean(
                    tf.cast(correct_prediction, tf.float32))

                sess.run(tf.initialize_all_variables())

                for count in range(1, loops + 1):
                    mean_batch, rating_batch = next_batch(
                        100, test, model, mode)
                    sess.run(train_step,
                             feed_dict={x: mean_batch, y_: rating_batch})

                    if count % 10 == 0:
                        # accuracy.eval relies on the InteractiveSession
                        # being the default session.
                        print_value = "percent test accuracy {}, ".format(
                            accuracy.eval(feed_dict={
                                x: accuracy_reviews,
                                y_: accuracy_ratings
                            }))
                        print_value += 'loop: {}'.format(count)
                        result_file.write(print_value + '\n')
                        print(print_value)

                print('{}: {}'.format(myconstants.Mode.MODE_NAME[mode],
                                      myconstants.CLASSES))
                np.save(get_save_name(myconstants.NUMPY_W, mode), sess.run(W))
                np.save(get_save_name(myconstants.NUMPY_B, mode), sess.run(b))
            finally:
                # Release the session on success and on failure.
                sess.close()
    return