Пример #1
0
def _read_and_decode(split, max_ngram_len, feature='n-gram'):
    """Load the movie-review corpus and return one data split as features.

    Args:
        split: 'train', 'valid', or anything else for the test split.
        max_ngram_len: maximum n-gram sequence length for FeatureLoader.
        feature: 'n-gram' builds a character n-gram table; any other
            value builds a word table.

    Returns:
        Tuple of (features dict with 'text'/'labels'/'recons_labels',
        number of users, feature-vocabulary size, number of samples).
    """
    voca = Vocabulary(ku.voca_root)
    userhelper = UserHelper()
    reviews = ReviewLoader(ku.Movie, product_num=50).get_data()

    users = userhelper.get_users(reviews)
    user2idx = userhelper.user2idx(users)
    if feature == 'n-gram':
        feature2idx = voca.character_n_gram_table(reviews, min_threshold=6)
    else:
        feature2idx = voca.word_table(reviews, min_threshold=5)
    print('--------------------feature2idx-----------------', len(feature2idx))
    feature_loader = FeatureLoader(user2idx=user2idx,
                                   max_ngram_len=max_ngram_len,
                                   ngram2idx=feature2idx)
    # 80% of the corpus is train+valid; the last 20% of that is validation,
    # so the layout is: [0, valid_split) train, [valid_split, training_split)
    # valid, [training_split, end) test.
    training_split = int(len(reviews) * 0.8)
    valid_split = training_split - int(training_split * 0.2)
    if split == 'train':
        X, Y = feature_loader.load_n_gram_idx_feature_label(
            reviews[:valid_split])
    elif split == 'valid':
        # BUG FIX: this branch previously loaded reviews[:valid_split] —
        # exactly the training slice — so validation ran on training data.
        X, Y = feature_loader.load_n_gram_idx_feature_label(
            reviews[valid_split:training_split])
    else:
        X, Y = feature_loader.load_n_gram_idx_feature_label(
            reviews[training_split:])
    recons_Y = Y  # keep integer labels before one-hot encoding
    Y = keras.utils.to_categorical(Y, num_classes=len(user2idx))
    features = {'text': X, 'labels': Y, 'recons_labels': recons_Y}
    print('X.shape: ', X.shape)
    print('Y.shape: ', Y.shape)
    return features, len(user2idx), len(feature2idx), X.shape[0]
Пример #2
0
 def feature_label(self):
     """Build n-gram index features and labels for this instance's reviews."""
     loader = FeatureLoader(max_ngram_len=self.max_len,
                            user2idx=self.user2idx,
                            ngram2idx=self.feature2idx)
     return loader.load_n_gram_idx_feature_label(self.reviews)
Пример #3
0
 def feature_label(self):
     """Convert review text into binary n-gram features and their labels."""
     loader = FeatureLoader(user2idx=self.user2idx,
                            ngram2idx=self.feature2idx)
     return loader.load_n_gram_binary_feature_label(self.reviews)
Пример #4
0
def get_feature(reviews):
    """Build the feature table for *reviews*; return (X, Y, vocab size)."""
    if feature_name == 'n-gram':
        table = voca.character_n_gram_table(reviews, min_threshold=6)
    else:
        table = voca.word_table(reviews, min_threshold=5)
    loader = FeatureLoader(user2idx=user2idx,
                           max_ngram_len=max_len,
                           ngram2idx=table)
    x, y = loader.load_n_gram_idx_feature_label(reviews)
    return x, y, len(table)
Пример #5
0
 def load_feature_label(self, split):
     """Return (x, y, vocab_size) long tensors for the requested split.

     Layout of the sample axis: first 64% train, next 16% valid,
     final 20% test.
     """
     feature2idx = self.feature2idx()
     loader = FeatureLoader(max_ngram_len=self.max_len,
                            user2idx=self.user2idx,
                            ngram2idx=feature2idx)
     x, y = loader.load_n_gram_idx_feature_label(self.reviews)
     train_end = int(x.shape[0] * 0.8)
     valid_start = train_end - int(train_end * 0.2)
     if split == 'train':
         rows = slice(None, valid_start)
     elif split == 'valid':
         rows = slice(valid_start, train_end)
     else:
         rows = slice(train_end, None)
     x, y = x[rows, :], y[rows]
     return (torch.tensor(x, dtype=torch.long),
             torch.tensor(y, dtype=torch.long),
             len(feature2idx))
Пример #6
0
def get_feature(reviews, split):
    """Build features for one data split and one-hot encode the labels.

    Splits follow the convention of the other loaders in this project:
    [0, valid_split) train (64%), [valid_split, training_split) valid
    (16%), [training_split, end) test (20%).
    """
    if feature_name == 'n-gram':
        feature2idx = voca.character_n_gram_table(reviews, min_threshold=6)
    else:
        feature2idx = voca.word_table(reviews, min_threshold=5)
    feature_loader = FeatureLoader(user2idx=user2idx,
                                   max_ngram_len=max_len,
                                   ngram2idx=feature2idx)
    training_split = int(len(reviews) * 0.8)
    # valid_split is the index where the validation slice begins.
    valid_split = training_split - int(training_split * 0.2)
    if split == 'train':
        # BUG FIX: train previously took reviews[:training_split - valid_split]
        # (~16% of the data) while valid got the much larger leading slice —
        # valid_split was subtracted twice. Use the boundary index directly.
        X, Y = feature_loader.load_n_gram_idx_feature_label(
            reviews[:valid_split])
    elif split == 'valid':
        X, Y = feature_loader.load_n_gram_idx_feature_label(
            reviews[valid_split:training_split])
    else:
        X, Y = feature_loader.load_n_gram_idx_feature_label(
            reviews[training_split:])

    Y = keras.utils.to_categorical(Y, num_classes=len(user2idx))
    return X, Y, len(feature2idx)
Пример #7
0
# Index tables for the syntax-CNN experiment.
ngram2idx = voca.character_n_gram_table(reviews, min_threshold=ngram_min_threshold)
pos2idx = userhelper.pos2idx()


# Parameters consumed by FeatureLoader.
data_params = {
    ku.max_ngram_len: 600,
    ku.max_pos_num: max_pos_num,
    ku.max_words_num: max_words_num,
    ku.user2idx: user2idx,
    ku.ngram2idx: ngram2idx,
    ku.pos2idx: pos2idx,
}

# Network hyper-parameters derived from the vocabulary sizes above.
net_params = {
    ku.max_words_num: max_words_num,
    ku.max_pos_num: max_pos_num,
    'syntax_dim': 60,
    'ngram_dim': 300,
    'pos_type_num': len(pos2idx),
    'out_dim': len(user2idx),
    'vocab_size': len(ngram2idx),
    'batch_size': 32,
    'filters': 300,
    'kernel_size': 3,
    'loss': 'categorical_crossentropy',
}

feature_loader = FeatureLoader(**data_params)
feature = feature_loader.syntax_cnn_feature_label(reviews)

pos_id = feature[ku.pos_id]
position_id = feature[ku.pos_order_id]
ngram_id = feature[ku.ngram_id]
user_id = feature[ku.user_id]

for name, arr in (('pos_id', pos_id), ('position_id', position_id),
                  ('ngram_id', ngram_id), ('user_id', user_id)):
    print(name + ': ', arr.shape)

# 80/20 train/test partition along the sample axis.
training_split = int(0.8 * ngram_id.shape[0])
training_ngram_id = ngram_id[:training_split, :]
testing_ngram_id = ngram_id[training_split:, :]
training_pos_id = pos_id[:training_split, :]
testing_pos_id = pos_id[training_split:, :]
training_position_id = position_id[:training_split, :]
testing_position_id = position_id[training_split:, :]
training_x = [training_ngram_id, training_pos_id, training_position_id]
Пример #8
0
from utils.data_utils import FeatureLoader, UserHelper, DataHelper
import utils.key_utils as ku
import numpy as np
from collections import Counter
from scipy import sparse

# Module-level helper instances shared by the functions below.
userhelper = UserHelper()
datahelper = DataHelper()
feature_loader = FeatureLoader()


def get_users(reviews):
    """Return each review's reviewer as an integer id, as a numpy array."""
    user2idx = userhelper.user2idx(userhelper.get_users(reviews))
    return np.array([user2idx[review[ku.reviewer_ID]] for review in reviews])


#
#
# def get_products_id(reviews):
#     products = datahelper.get_products(reviews)
#     product2idx = datahelper.product2idx(products)
#     products_id = datahelper.load_products_id(products, product2idx)
#     return products_id, len(product2idx)


def load_feature_label(reviews, products_id):
    y = get_users(reviews)
Пример #9
0
from baselines.gcforest.GcForest import GCForest
from utils.vocabulary_utils import Vocabulary
import utils.key_utils as ku
from utils.data_utils import ReviewLoader, FeatureLoader, DataHelper, UserHelper
import sklearn.utils as sku
from sklearn.metrics import accuracy_score
import os
import pickle

# Shared helpers for the GcForest baseline.
datahelper = DataHelper()
voca = Vocabulary(ku.voca_root)
userhelper = UserHelper()
feature_loader = FeatureLoader()

# Load 50-product movie reviews and index users and character n-grams.
reviews = ReviewLoader(ku.Movie, product_num=50).get_data()
users = userhelper.get_users(reviews)

user2idx = userhelper.user2idx(users)
ngram2idx = voca.character_n_gram_table(reviews, min_threshold=2)
# Persist the n-gram table so later runs can reload it.
voca.dump_n_grams(ngram2idx, type=ku.charngram2idx)


def get_toy_config():
    config = {}
    ca_config = {}
    ca_config["random_state"] = 0
    ca_config["max_layers"] = 100
    ca_config["early_stopping_rounds"] = 3
    ca_config["n_classes"] = 203
    ca_config["estimators"] = []
    ca_config["estimators"].append({
Пример #10
0
 def feature_label(self):
     """Return binary n-gram features and user labels for self.reviews."""
     table = self.feature2idx(None)
     loader = FeatureLoader(feature2idx=table, user2idx=self.user2idx)
     return loader.load_n_gram_binary_feature_label(self.reviews)
Пример #11
0
# Build vocabulary and user index from the 50-product movie-review corpus.
voca = Vocabulary(ku.voca_root)
userhelper = UserHelper()

reviews = ReviewLoader(ku.Movie, product_num=50).get_data()

users = userhelper.get_users(reviews)
user2idx = userhelper.user2idx(users)
ngram2idx = voca.character_n_gram_table(reviews,
                                        min_threshold=ngram_min_threshold)
print(len(ngram2idx))
# Parameters for FeatureLoader (max_ngram_len is defined elsewhere in the file).
data_params = {
    'max_ngram_len': max_ngram_len,
    'user2idx': user2idx,
    'ngram2idx': ngram2idx
}
feature_loader = FeatureLoader(**data_params)

# Hyper-parameters for the CNN model trained on these features.
param = {
    'kernel_size': [3, 5, 7],
    'batch_size': 32,
    'epochs': 100,
    'loss': 'categorical_crossentropy',
    'embedding_dim': 100,
    'user_num': len(user2idx),
    'max_ngram_len': max_ngram_len,
    'feature_num': 300,
    'vocab_size': len(ngram2idx)
}
#
#
x, y = feature_loader.load_n_gram_idx_feature_label(reviews)