def predict():
    """Prompt for an English review, classify it with the trained model and
    print the predicted star rating.

    The single cleaned/tokenized sequence is tiled to a full batch (the graph
    expects `args.batch_size` rows); the final rating is the majority vote
    (argmax of the bincount) over the per-row argmax predictions.

    Relies on module-level `tokenizer`, `args`, `train_graph`, `graph`,
    `utils` and `get_contractions`, and on the checkpoint saved at
    ./saves/best_model.ckpt.
    """
    pred_text = input("Please enter a review in english: ")
    contractions = get_contractions()
    pred_text = utils.clean_text(pred_text, contractions)
    pred_seq = tokenizer.text_to_sequence(pred_text, pred=True)
    # The graph expects a full batch, so replicate the single sequence.
    pred_seq = np.tile(pred_seq, (args.batch_size, 1))
    checkpoint = "./saves/best_model.ckpt"
    all_preds = []
    with tf.Session(graph=train_graph) as sess:
        saver = tf.train.Saver()
        # Load the model
        saver.restore(sess, checkpoint)
        state = sess.run(graph.initial_state)
        feed = {
            graph.input_data: pred_seq,
            # BUG FIX: disable dropout at inference time. The original fed
            # the training value (args.keep_prob), which randomly drops units
            # and makes predictions noisy/non-deterministic.
            graph.keep_prob: 1.0,
            graph.initial_state: state,
        }
        preds = sess.run(graph.predictions, feed_dict=feed)
        for row in preds:
            all_preds.append(row)
    all_preds = np.asarray(all_preds)
    y_predictions = np.argmax(all_preds, axis=1)
    counts = np.bincount(y_predictions)
    print("\nYou rated the restaurant: " + str(np.argmax(counts)) + " stars!")
def clean_str(text, max_length, enable_max):
    """Clean sentence: expand contractions, strip URLs/markup/punctuation,
    drop English stopwords and optionally pad/truncate to a fixed length.

    :param text: raw input string
    :param max_length: target token count, used only when enable_max is truthy
    :param enable_max: if truthy, truncate to max_length tokens or right-pad
                       with "PAD" tokens up to exactly max_length
    :return: cleaned, space-joined, stripped string
    """
    words = text.lower().split()
    contractions = get_contractions()
    # Expand contractions (e.g. "can't" -> "cannot") before punctuation is
    # stripped, so the apostrophe forms still match the lookup keys.
    text = " ".join(contractions.get(word, word) for word in words)
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    tokens = text.split(' ')
    stops = set(stopwords.words("english"))
    tokens = [w for w in tokens if w not in stops]
    tokens = empty_remover(tokens)
    if enable_max:
        if len(tokens) >= max_length:
            tokens = tokens[:max_length]
        else:
            # Right-pad short sequences to exactly max_length tokens.
            # (The original re-sliced after padding, which was a no-op.)
            tokens = tokens + ["PAD"] * (max_length - len(tokens))
    return ' '.join(tokens).strip()
def gen_summary(text, max_length):
    """Build a cleaned, 'GO'-prefixed token sequence from a text summary.

    Concatenates the tokens of `summarize(text)` with the tokens of the
    original text, expands contractions, strips URLs/markup/punctuation and
    stopwords, prepends the 'GO' marker, then truncates or right-pads with
    "PAD" to exactly `max_length` tokens.

    :param text: raw input string
    :param max_length: exact token count of the returned sequence
    :return: space-joined token string of length max_length
    """
    bow = summarize(text).lower().split()
    #bow = bow + keywords(text,split = True)
    bow = bow + text.lower().split()
    contractions = get_contractions()
    # Expand contractions before punctuation removal so apostrophe forms
    # still match the lookup keys.
    text = " ".join(contractions.get(word, word) for word in bow)
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    tokens = text.split(' ')
    stops = set(stopwords.words("english"))
    tokens = [w for w in tokens if w not in stops]
    tokens = ['GO'] + tokens
    tokens = empty_remover(tokens)
    if len(tokens) >= max_length:
        tokens = tokens[:max_length]
    else:
        # Right-pad to exactly max_length tokens.
        # (The original re-sliced after padding, which was a no-op.)
        tokens = tokens + ["PAD"] * (max_length - len(tokens))
    return ' '.join(tokens)
def process_reviews(bus_file='./data/dataset/business.json', rev_file='./data/dataset/review.json'):
    """
    Function will initialize the review preprocessing pipeline. It will expand
    contractions of text and then perform text cleaning.

    :param bus_file: Type string, path to business json file
                     (BUG FIX: default was '.data/...' — missing slash,
                     inconsistent with rev_file's default)
    :param rev_file: Type string, path to reviews json file
    :return: (revs_list, stars_list) — parallel lists of cleaned review text
             and their star ratings
    """
    assert isinstance(bus_file, str)
    assert isinstance(rev_file, str)

    # Collect IDs of food-related businesses. A set gives O(1) membership
    # tests in the review loop (the original list was an O(n) scan per
    # review); `with` ensures the file handle is closed.
    rest_ids = set()
    with open(bus_file, 'r') as bus_f:
        for line in bus_f:
            data = json.loads(line)
            categories = data['categories']
            # NOTE(review): categories can be null in the Yelp dump; the
            # original `'Restaurants' in data['categories']` would raise
            # TypeError on such rows — guard before testing membership.
            if categories and ('Restaurants' in categories or 'Food' in categories):
                rest_ids.add(data['business_id'])
    print("There are %d restaurants" % (len(rest_ids)))

    contractions = get_contractions()
    # BUG FIX: start with empty lists. The originals were seeded with a
    # spurious empty element ([[]]) that polluted the returned data and
    # inflated every length-based count/progress message by one.
    revs_list = []
    stars_list = []
    nolang = []
    k = 0  # Count of accepted (English, restaurant) reviews
    with open(rev_file, 'r') as rev_f:  # encoding='utf-8'
        for line in rev_f:
            if k >= args.num_reviews:
                break
            data = json.loads(line)
            text = data['text']
            star = data['stars']
            ID = data['business_id']
            # Skip incomplete rows and non-restaurant businesses.
            if text is None or star is None or ID not in rest_ids:
                continue
            try:
                # Check language: keep only reviews detected as English.
                if detect(text) == 'en':
                    revs_list.append(utils.clean_text(text, contractions))
                    stars_list.append(star)
                    k += 1
                    # Notify for every 5000 reviews
                    if len(revs_list) % 5000 == 0:
                        print("Currently processed %d reviews" % len(revs_list))
            except ValueError:
                nolang.append(text)
                print("Detected text with no language! Now at: %d" % len(nolang))
    # BUG FIX: the original print ran the two figures together with no
    # separator between "...Reviews: N" and "Length of Stars".
    print("Length of Reviews:\t" + str(len(revs_list)) + "\tLength of Stars:\t" + str(len(stars_list)))
    return revs_list, stars_list