def _read_and_decode(split, max_ngram_len, feature='n-gram'):
    """Load reviews, build lookup tables and return the features of one split.

    The reviews come from ``ReviewLoader(ku.Movie, product_num=50)``. The
    first 80% of them form the training portion, whose last 20% is held out
    for validation; the remaining reviews form the test portion.

    Args:
        split: ``'train'`` or ``'valid'``; any other value selects the test
            portion.
        max_ngram_len: forwarded to ``FeatureLoader`` as ``max_ngram_len``.
        feature: ``'n-gram'`` builds a character n-gram table
            (``min_threshold=6``); any other value builds a word table
            (``min_threshold=5``).

    Returns:
        A 4-tuple ``(features, num_users, num_features, num_samples)`` where
        ``features`` is a dict with keys ``'text'``, ``'labels'`` (the
        ``to_categorical`` encoding of the labels) and ``'recons_labels'``
        (the labels exactly as returned by the loader), and ``num_samples``
        is ``X.shape[0]``.
    """
    voca = Vocabulary(ku.voca_root)
    userhelper = UserHelper()
    reviews = ReviewLoader(ku.Movie, product_num=50).get_data()
    users = userhelper.get_users(reviews)
    user2idx = userhelper.user2idx(users)

    if feature == 'n-gram':
        feature2idx = voca.character_n_gram_table(reviews, min_threshold=6)
    else:
        feature2idx = voca.word_table(reviews, min_threshold=5)
    print('--------------------feature2idx-----------------', len(feature2idx))

    feature_loader = FeatureLoader(user2idx=user2idx,
                                   max_ngram_len=max_ngram_len,
                                   ngram2idx=feature2idx)

    training_split = int(len(reviews) * 0.8)
    valid_split = training_split - int(training_split * 0.2)
    if split == 'train':
        selected = reviews[:valid_split]
    elif split == 'valid':
        # Held-out tail of the training portion. This used to reuse the
        # training slice, which made validation data identical to train data.
        selected = reviews[valid_split:training_split]
    else:
        selected = reviews[training_split:]
    X, Y = feature_loader.load_n_gram_idx_feature_label(selected)

    # Keep the labels as returned by the loader alongside the categorical ones.
    recons_Y = Y
    Y = keras.utils.to_categorical(Y, num_classes=len(user2idx))
    features = {'text': X, 'labels': Y, 'recons_labels': recons_Y}
    print('X.shape: ', X.shape)
    print('Y.shape: ', Y.shape)
    return features, len(user2idx), len(feature2idx), X.shape[0]
class ReviewInfo:
    """Hold a review collection together with its lookup tables and features.

    NOTE: ``feature2idx``, ``product2idx`` and ``user2idx`` are defined as
    methods, but ``__init__`` rebinds each of those names on the instance to
    the value the method returns. After construction they are therefore data
    attributes, not callables.
    """

    def __init__(self, reviews, feature='n-gram'):
        self.reviews = reviews
        self.feature = feature
        self.vocab = Vocabulary(ku.voca_root)

        # Each call below replaces the method of the same name on this
        # instance with the table it produced.
        self.feature2idx = self.feature2idx(6)
        self.product2idx = self.product2idx()
        self.user2idx = self.user2idx()

        self.user_num = len(self.user2idx)
        self.product_num = len(self.product2idx)

        features, labels = self.feature_label()
        self.x = features
        self.users = labels
        self.products = self.get_products()

    def user2idx(self):
        """Return the user lookup table built by ``UserHelper``."""
        helper = UserHelper()
        found_users = helper.get_users(self.reviews)
        return helper.user2idx(found_users)

    def product2idx(self):
        """Return the product lookup table built by ``DataHelper``."""
        helper = DataHelper()
        found_products = helper.get_products(self.reviews)
        return helper.product2idx(found_products)

    def feature2idx(self, min_threshold):
        """Build the feature lookup table for the configured feature kind.

        ``min_threshold`` is only forwarded for ``'n-gram'`` features; the
        word table is always built with ``min_threshold=0``.
        """
        if self.feature == 'n-gram':
            return self.vocab.character_n_gram_table(
                self.reviews, min_threshold=min_threshold)
        assert self.feature == 'word'
        return self.vocab.word_table(self.reviews, min_threshold=0)

    def feature_label(self):
        """Convert the reviews to binary features and their labels."""
        loader = FeatureLoader(user2idx=self.user2idx,
                               ngram2idx=self.feature2idx)
        features, labels = loader.load_n_gram_binary_feature_label(
            self.reviews)
        return features, labels

    def get_products(self):
        """Return the product index of every review, in review order."""
        return [self.product2idx[review[ku.asin]] for review in self.reviews]

    def split_data(self):
        """Split features and labels 60/20/20 into train, valid and test.

        Returns:
            Three ``(x, y)`` pairs: training, validation and test.
        """
        total = len(self.reviews)
        train_end = int(total * 0.6)
        valid_end = train_end + int(total * 0.2)

        train_part = (self.x[:train_end], self.users[:train_end])
        valid_part = (self.x[train_end:valid_end],
                      self.users[train_end:valid_end])
        test_part = (self.x[valid_end:], self.users[valid_end:])
        return train_part, valid_part, test_part
class ReviewInfo:
    """Hold reviews with their lookup tables, index features and fake data.

    NOTE: ``user2idx`` and ``feature2idx`` are defined as methods, but
    ``__init__`` rebinds both names on the instance to the value each method
    returns, so after construction they are data attributes, not callables.
    """

    def __init__(self, reviews, min_threshold=6, feature_name='n-gram',
                 max_len=3500):
        self.vocab = Vocabulary(ku.voca_root)
        self.reviews = reviews
        self.feature = feature_name
        self.min_threshold = min_threshold
        self.max_len = max_len

        # Both calls replace the method of the same name on this instance.
        self.user2idx = self.user2idx()
        self.feature2idx = self.feature2idx(min_threshold)

        self.vocab_size = len(self.feature2idx)
        self.num_classes = len(self.user2idx)

        real_x, real_y = self.feature_label()
        self.x = real_x
        self.y = real_y
        random_x, random_y = self.fake_feature_label()
        self.fake_x = random_x
        self.fake_y = random_y

    def user2idx(self):
        """Return the user lookup table built by ``UserHelper``."""
        helper = UserHelper()
        found_users = helper.get_users(self.reviews)
        return helper.user2idx(found_users)

    def feature2idx(self, min_threshold):
        """Build the feature lookup table for the configured feature kind.

        ``min_threshold`` is only forwarded for ``'n-gram'`` features; the
        word table is always built with ``min_threshold=0``.
        """
        if self.feature == 'n-gram':
            return self.vocab.character_n_gram_table(
                self.reviews, min_threshold=min_threshold)
        assert self.feature == 'word'
        return self.vocab.word_table(self.reviews, min_threshold=0)

    def feature_label(self):
        """Convert the reviews to index features and their labels."""
        loader = FeatureLoader(max_ngram_len=self.max_len,
                               user2idx=self.user2idx,
                               ngram2idx=self.feature2idx)
        features, labels = loader.load_n_gram_idx_feature_label(self.reviews)
        return features, labels

    def fake_feature_label(self):
        """Draw random integer features and labels, one row per review.

        Returns:
            ``(x, y)`` where ``x`` has shape ``(len(reviews), max_len)`` with
            values in ``[0, vocab_size)`` and ``y`` has shape
            ``(len(reviews),)`` with values in ``[0, num_classes)``.
        """
        sample_count = len(self.reviews)
        # x is drawn before y so a seeded global RNG yields the same values.
        random_x = np.random.randint(
            0, self.vocab_size, size=(sample_count, self.max_len))
        random_y = np.random.randint(
            0, self.num_classes, size=(sample_count,))
        return random_x, random_y
class SvmInfo(ReviewInfo):
    """``ReviewInfo`` variant that produces binary features and user labels.

    The feature kind is chosen by ``feature_name``: ``'n-gram'`` selects the
    character n-gram table, any other value selects the word table.
    """

    def __init__(self, reviews, feature_name='n-gram'):
        # Must be assigned before delegating: the parent constructor calls
        # the overridden feature2idx(), which reads self.feature_name.
        self.feature_name = feature_name
        super(SvmInfo, self).__init__(reviews)
        self.vocab = Vocabulary(ku.voca_root)
        self.x, self.users = self.feature_label()

    def feature2idx(self, min_threshold):
        """Build the feature lookup table.

        Args:
            min_threshold: threshold forwarded to the vocabulary; ``None``
                falls back to 6, the value this class previously hard-coded.
        """
        threshold = 6 if min_threshold is None else min_threshold
        if self.feature_name == 'n-gram':
            return self.vocab.character_n_gram_table(
                self.reviews, min_threshold=threshold)
        return self.vocab.word_table(self.reviews, min_threshold=threshold)

    def feature_label(self):
        """Convert the reviews to binary features and their user labels."""
        # The parent constructor rebinds ``feature2idx`` on the instance to
        # the built table. Reuse that table when the name is no longer
        # callable instead of trying to call it.
        table = self.feature2idx
        if callable(table):
            table = table(None)
        feature_loader = FeatureLoader(feature2idx=table,
                                       user2idx=self.user2idx)
        x, users = feature_loader.load_n_gram_binary_feature_label(
            self.reviews)
        return x, users
class NonPreTrainedInfo(ReviewInfo):
    """``ReviewInfo`` variant that produces padded index features.

    After construction the instance exposes ``feature2idx`` (the built
    table), ``x``, ``users``, ``vocab_size`` and ``products``.
    """

    def __init__(self, reviews, min_threshold=6, feature_name='n-gram',
                 max_seq_len=3500):
        # Assigned before delegating: the parent constructor calls the
        # overridden feature_label(), which reads self.max_seq_len.
        self.min_threshold = min_threshold
        self.feature = feature_name
        self.max_seq_len = max_seq_len
        super(NonPreTrainedInfo, self).__init__(reviews)

        # The parent constructor may overwrite these with its own defaults,
        # so restore the requested values before rebuilding the table.
        self.min_threshold = min_threshold
        self.feature = feature_name
        self.max_seq_len = max_seq_len
        self.vocab = Vocabulary(ku.voca_root)

        # The parent rebinds ``feature2idx`` on the instance to the built
        # table, so resolve the method through the class to call it again.
        self.feature2idx = type(self).feature2idx(self, min_threshold)
        self.x, self.users = self.feature_label()
        self.vocab_size = len(self.feature2idx)
        self.products = self.get_products()

    def feature2idx(self, min_threshold):
        """Build the feature lookup table for the configured feature kind.

        ``min_threshold`` is only forwarded for ``'n-gram'`` features; the
        word table is always built with ``min_threshold=0``.
        """
        assert self.feature == 'n-gram' or self.feature == 'word'
        if self.feature == 'n-gram':
            return self.vocab.character_n_gram_table(
                self.reviews, min_threshold=min_threshold)
        return self.vocab.word_table(self.reviews, min_threshold=0)

    def feature_label(self):
        """Convert the reviews to padded index features and their labels."""
        feature_loader = FeatureLoader(max_ngram_len=self.max_seq_len,
                                       user2idx=self.user2idx,
                                       feature2idx=self.feature2idx)
        x, y = feature_loader.load_n_gram_idx_feature_label(
            self.reviews, padding=True)
        return x, y

    def get_products(self):
        """Return the product index of every review, in review order."""
        return [self.product2idx[review[ku.asin]] for review in self.reviews]
class ReviewDataSet(Dataset):
    """Dataset serving the index features and labels of one review split.

    NOTE: ``user2idx`` is defined as a method, but ``__init__`` rebinds the
    name on the instance to the value it returns, so after construction it is
    a data attribute, not a callable.
    """

    def __init__(self, reviews, split, feature='n-gram', min_threshold=6,
                 max_len=3500):
        self.vocab = Vocabulary(ku.voca_root)
        self.reviews = reviews
        self.feature = feature
        self.min_threshold = min_threshold
        self.max_len = max_len
        self.user2idx = self.user2idx()

        loaded = self.load_feature_label(split)
        self.text, self.label, self.vocab_size = loaded

    def get_users(self):
        """Return the users that ``UserHelper`` extracts from the reviews."""
        return UserHelper().get_users(self.reviews)

    def user2idx(self):
        """Return the user lookup table built by ``UserHelper``."""
        helper = UserHelper()
        found_users = helper.get_users(self.reviews)
        return helper.user2idx(found_users)

    def feature2idx(self):
        """Build the feature lookup table for the configured feature kind."""
        if self.feature == 'n-gram':
            build_table = self.vocab.character_n_gram_table
        else:
            assert self.feature == 'word'
            build_table = self.vocab.word_table
        return build_table(self.reviews, min_threshold=self.min_threshold)

    def load_feature_label(self, split):
        """Load features and labels and keep only the requested split.

        The first 80% of the rows form the training portion, whose last 20%
        is held out for validation; the remaining rows form the test portion.

        Args:
            split: ``'train'`` or ``'valid'``; any other value selects the
                test portion.

        Returns:
            ``(text, label, vocab_size)``: two ``torch.long`` tensors and the
            size of the feature lookup table.
        """
        table = self.feature2idx()
        loader = FeatureLoader(max_ngram_len=self.max_len,
                               user2idx=self.user2idx,
                               ngram2idx=table)
        x, y = loader.load_n_gram_idx_feature_label(self.reviews)

        train_end = int(x.shape[0] * 0.8)
        valid_start = train_end - int(train_end * 0.2)
        if split == 'train':
            rows = slice(None, valid_start)
        elif split == 'valid':
            rows = slice(valid_start, train_end)
        else:
            rows = slice(train_end, None)
        x, y = x[rows, :], y[rows]

        text = torch.tensor(x, dtype=torch.long)
        label = torch.tensor(y, dtype=torch.long)
        return text, label, len(table)

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        return {'text': self.text[idx, :], 'label': self.label[idx]}