# 示例#1 (Example #1)
# 0
def predict():
    """Prompt the user for an English review, run it through the trained
    model, and print the predicted star rating (majority vote over the
    replicated batch)."""
    pred_text = input("Please enter a review in english: ")
    contractions = get_contractions()
    pred_text = utils.clean_text(pred_text, contractions)
    pred_seq = tokenizer.text_to_sequence(pred_text, pred=True)
    # The model expects a full batch, so replicate the single sequence.
    pred_seq = np.tile(pred_seq, (args.batch_size, 1))

    with tf.Session(graph=train_graph) as sess:
        checkpoint = "./saves/best_model.ckpt"
        all_preds = []
        saver = tf.train.Saver()
        # Restore the best weights saved during training.
        saver.restore(sess, checkpoint)
        # NOTE(review): session is built on `train_graph` but ops come from
        # `graph` -- assumed these name the same graph object; confirm.
        state = sess.run(graph.initial_state)
        feed = {
            graph.input_data: pred_seq,
            # Bug fix: dropout must be disabled at inference time; feeding
            # the training keep_prob made predictions stochastic.
            graph.keep_prob: 1.0,
            graph.initial_state: state
        }

        preds = sess.run(graph.predictions, feed_dict=feed)
        for row in preds:
            all_preds.append(row)
    all_preds = np.asarray(all_preds)
    y_predictions = np.argmax(all_preds, axis=1)
    # Majority vote across the batch copies.
    counts = np.bincount(y_predictions)
    print("\nYou rated the restaurant: " + str(np.argmax(counts)) + " stars!")
def clean_str(text, max_length, enable_max):
    """Clean sentence: lower-case, expand contractions, strip URLs, markup,
    punctuation and English stop words; optionally truncate or PAD to a
    fixed word count.

    :param text: Type string, raw sentence to clean
    :param max_length: Type int, target word count when enable_max is True
    :param enable_max: Type bool, enable truncation/padding to max_length
    :return: Cleaned sentence as a single space-joined string
    """
    # Expand contractions word by word (e.g. "don't" -> "do not").
    contractions = get_contractions()
    words = [contractions.get(word, word) for word in text.lower().split()]
    text = " ".join(words)
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Drop English stop words.
    stops = set(stopwords.words("english"))
    words = [w for w in text.split(' ') if w not in stops]

    words = empty_remover(words)
    if enable_max:
        if len(words) >= max_length:
            words = words[:max_length]
        else:
            # Pad short sentences up to the fixed length.  (The original's
            # re-slice after padding was a no-op and has been removed.)
            words = words + ["PAD"] * (max_length - len(words))

    return ' '.join(words).strip()
# 示例#3 (Example #3)
# 0
def gen_summary(text, max_length):
    """Build a fixed-length token sequence from a summary of *text* plus the
    full text, cleaned of markup, punctuation and stop words.

    :param text: Type string, document to summarize
    :param max_length: Type int, exact token count of the returned sequence
    :return: Space-joined token string starting with 'GO', truncated or
             padded with 'PAD' to exactly max_length tokens
    """
    # Summary words first, then the full lower-cased text (the original's
    # intermediate `bow = sentence` alias was redundant and is removed).
    bow = summarize(text).lower().split()
    #bow = bow + keywords(text,split = True)
    bow = bow + text.lower().split()
    # Expand contractions (e.g. "don't" -> "do not").
    contractions = get_contractions()
    new_text = [contractions.get(word, word) for word in bow]
    text = " ".join(new_text)
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    # Drop English stop words.
    stops = set(stopwords.words("english"))
    text = [w for w in text.split(' ') if w not in stops]
    # 'GO' marks the start of the sequence for the decoder.
    text = ['GO'] + text
    text = empty_remover(text)
    if len(text) >= max_length:
        text = text[:max_length]
    else:
        # Pad short sequences to the fixed length.  (The original's
        # re-slice after padding was a no-op and has been removed.)
        text = text + ["PAD"] * (max_length - len(text))
    return ' '.join(text)
def process_reviews(bus_file='./data/dataset/business.json',
                    rev_file='./data/dataset/review.json'):
    """
    Function will initialize the review preprocessing pipeline. It will expand contractions of text
    and then perform text cleaning
    :param bus_file: Type string, path to business json file
    :param rev_file: Type string, path to reviews json file
    :return: Tuple (revs_list, stars_list) of cleaned English review texts
             and their corresponding star ratings
    """
    # Bug fix: the bus_file default was '.data/...' (missing slash), which
    # could never resolve; now matches rev_file's './data/...' form.
    assert isinstance(bus_file, str)
    assert isinstance(rev_file, str)

    # Use a set: the review loop below tests membership once per review,
    # and a list made that O(n) per test.
    rest_ids = set()
    with open(bus_file, 'r') as bus_fh:
        for line in bus_fh:
            data = json.loads(line)
            categories = data['categories']
            # Guard: 'categories' can be null in the JSON; `in None` raises.
            if categories and ('Restaurants' in categories or 'Food' in categories):
                rest_ids.add(data['business_id'])
    print("There are %d restaurants" % (len(rest_ids)))

    contractions = get_contractions()

    # Bug fix: these were seeded as [[]], leaving a spurious empty-list
    # element in the returned data and skewing every length count by one.
    revs_list = []
    stars_list = []
    k = 0  # Count of accepted reviews
    nolang = []  # Texts whose language could not be detected
    with open(rev_file, 'r') as rev_fh:  # encoding='utf-8'
        for line in rev_fh:
            if k >= args.num_reviews:
                break
            data = json.loads(line)
            text = data['text']
            star = data['stars']
            ID = data['business_id']
            # Skip incomplete records and non-restaurant businesses.
            if text is None or star is None:
                continue
            if ID not in rest_ids:
                continue
            try:
                # Keep English reviews only.
                if detect(text) == 'en':
                    revs_list.append(utils.clean_text(text, contractions))
                    stars_list.append(star)
                    k += 1
                    # Notify for every 5000 reviews
                    if len(revs_list) % 5000 == 0:
                        print("Currently processed %d reviews" % len(revs_list))
            except ValueError:
                # NOTE(review): assumes the detector raises ValueError on
                # undetectable text -- confirm against the detect() API.
                nolang.append(text)
                print("Detected text with no language! Now at: %d" % len(nolang))
    print("Length of Reviews:\t" + str(len(revs_list)) + "Length of Stars:\t" +
          str(len(stars_list)))
    return revs_list, stars_list