def load_ratnet(data_dir, model_dir):
    """Rebuild the character encoding and restore the pretrained 'ratnet' model.

    Args:
        data_dir: directory containing 'beer_core-train.pkl'; must support
            the ``/`` join operator.
        model_dir: directory containing 'ratnet1_2-1024.pkl'; must support
            the ``/`` join operator.

    Returns:
        A ``(ratnet, text_encoding)`` tuple: the network with its
        'generate_with_concat' and 'log_probability' methods compiled, and
        the one-hot encoding built from the review texts.
    """
    logging.info("Loading data from cache...")
    cache_path = data_dir / 'beer_core-train.pkl'
    with open(cache_path, 'rb') as cache_file:
        # The pickle must hold exactly two items; only the reviews are used.
        reviews, _beers = pickle.load(cache_file)

    # Reseeds the module-level RNG so the shuffle order is reproducible.
    # NOTE(review): presumably this must match the order used when the
    # saved parameters were trained -- confirm before changing the seed.
    random.seed(1337)
    random.shuffle(reviews)

    logging.info("Loading text sequences...")
    text_sequences = []
    for entry in reviews:
        text_sequences.append(CharacterSequence.from_string(entry.text))

    text_encoding = OneHotEncoding(include_start_token=True,
                                   include_stop_token=True)
    text_encoding.build_encoding(text_sequences)
    identity_encoding = IdentityEncoding(1)

    logging.info("Loading model...")
    n_output = len(text_encoding)
    # Input width is the character encoding plus the identity encoding.
    n_input = n_output + len(identity_encoding)
    ratnet = CharacterRNN('ratnet', n_input, n_output,
                          n_layers=2, n_hidden=1024)
    ratnet.load_parameters(model_dir / 'ratnet1_2-1024.pkl')
    for method_name in ('generate_with_concat', 'log_probability'):
        ratnet.compile_method(method_name)

    return ratnet, text_encoding
with open('data/beer/beer_top-train.pkl', 'rb') as fp: reviews, beers = pickle.load(fp) text_sequences = [CharacterSequence.from_string(review.text) for review in reviews] beer_cats = [SingletonSequence(review.beer.style) for review in reviews] review_num_seqs = [c.encode(text_encoding) for c in text_sequences] num_seq = NumberSequence(np.concatenate([c.seq for c in review_num_seqs])) beer_seq = NumberSequence(np.concatenate([c.encode(cat_encoding).replicate(len(r)).seq for c, r in zip(beer_cats, review_num_seqs)])) batcher = WindowedBatcher([num_seq, beer_seq], [text_encoding, cat_encoding], sequence_length=200, batch_size=256) catnet = CharacterRNN('2pac', len(text_encoding) + len(cat_encoding), len(text_encoding), n_layers=2, n_hidden=1024) catnet.compile_method("generate_with_concat") def load_charnet(): catnet.load_parameters('models/charnet-top_2-1024-2.pkl') layer = catnet.lstm.input_layer weights = { 'W_ix': layer.get_parameter_value("W_ix"), 'W_ox': layer.get_parameter_value("W_ox"), 'W_fx': layer.get_parameter_value("W_fx"), 'W_gx': layer.get_parameter_value("W_gx"), } for w, value in weights.items(): layer.set_parameter_value(w, np.vstack([value,
CharacterSequence.from_string(review.text) for review in reviews ] text_encoding = dataset.OneHotEncoding(include_start_token=True, include_stop_token=True) text_encoding.build_encoding(text_sequences) beer_cats = [SingletonSequence(review.beer.style) for review in reviews] cat_encoding = dataset.OneHotEncoding(include_start_token=False, include_stop_token=False) cat_encoding.build_encoding(beer_cats) catnet = CharacterRNN('2pac', len(text_encoding) + len(cat_encoding), len(text_encoding), n_layers=2, n_hidden=1024) catnet.load_parameters('models/catnet_2-1024-2.pkl') catnet.compile_method('generate_examples') with open(REAL_FILE, 'w') as fp: for review in reviews: print >> fp, "%u: %s" % (cat_encoding.encode( review.beer.style), review.text) with open(GEN_FILE, 'w') as fp: for i, beer in enumerate(cat_encoding.backward_mapping): logging.info("Generating %s reviews", beer) for _ in tqdm(xrange(300)): gen_reviews = generate(beer, 2000) for review in gen_reviews:
c.replicate(len(r)).seq for c, r in zip(beer_ratings, review_num_seqs) ])) # batcher = WindowedBatcher(num_seq, [text_encoding, style_encoding], sequence_length=200, batch_size=500) batcher = WindowedBatcher([num_seq, beer_seq], [text_encoding, identity_encoding], sequence_length=200, batch_size=500) # batcher = WindowedBatcher(num_seq, [text_encoding], sequence_length=200, batch_size=500) D = text_encoding.index # charrnn = CharacterRNN('2pac', len(text_encoding) + len(style_encoding), len(text_encoding), n_layers=2, n_hidden=512) # charrnn = CharacterRNN('2pac', len(text_encoding), len(text_encoding), n_layers=2, n_hidden=1024) charrnn = CharacterRNN('2pac', len(text_encoding) + len(identity_encoding), len(text_encoding), n_layers=2, n_hidden=512) # charrnn.compile_method('generate') # sgd = SGD(charrnn) # rmsprop = RMSProp(charrnn) # mom = Momentum(charrnn) def train(optimizer, n_iterations, *args): state = None for i in xrange(n_iterations): X, y = batcher.next_batch() if state is None: state = np.zeros((X.shape[1], charrnn.n_layers, charrnn.n_hidden))
class Beermind(object):
    """Bundle of pretrained character-level review models and their encodings.

    Construction loads two pickled review datasets ('beer-top' and
    'beer-core'), fits the encodings each model expects, and restores three
    networks:

    * ``catnet``      -- compiled for 'log_probability' (category scoring)
    * ``catratnet``   -- compiled for 'generate_with_concat'
    * ``useritemnet`` -- compiled for 'generate_with_concat'
    """

    def __init__(self, model_dir, data_dir):
        """Load datasets, fit encodings and restore all three networks.

        Args:
            model_dir: directory holding 'catnet.pkl', 'catratnet.pkl' and
                'useritemnet.pkl'; must support the ``/`` join operator.
            data_dir: directory whose 'beer' subdirectory holds
                'beer-top.pkl' and 'beer-core.pkl'; must support the ``/``
                join operator.
        """
        self.model_dir = model_dir
        self.data_dir = data_dir

        # SECURITY NOTE: unpickling executes arbitrary code; these files
        # must come from a trusted source.
        logging.info("Loading datasets[0]...")
        with open(self.data_dir / 'beer' / 'beer-top.pkl', 'rb') as fp:
            beer_top = pickle.load(fp)[0]
        logging.info("Loading datasets[1]...")
        with open(self.data_dir / 'beer' / 'beer-core.pkl', 'rb') as fp:
            beer_core = pickle.load(fp)[0]

        # Encodings fitted on the 'top' dataset (used by catnet / catratnet).
        self.top_encoding = OneHotEncoding(include_start_token=True,
                                           include_stop_token=True)
        top_sequences = [CharacterSequence.from_string(r.text)
                         for r in beer_top]
        self.top_encoding.build_encoding(top_sequences)

        top_cats = [SingletonSequence(r.beer.style) for r in beer_top]
        self.cat_encoding = OneHotEncoding(include_start_token=False,
                                           include_stop_token=False)
        self.cat_encoding.build_encoding(top_cats)

        self.rat_encoding = IdentityEncoding(1)

        # Encodings fitted on the 'core' dataset (used by useritemnet).
        self.core_encoding = OneHotEncoding(include_start_token=True,
                                            include_stop_token=True)
        core_sequences = [CharacterSequence.from_string(r.text)
                          for r in beer_core]
        self.core_encoding.build_encoding(core_sequences)

        core_users = [SingletonSequence(r.user) for r in beer_core]
        self.user_encoding = OneHotEncoding(include_start_token=False,
                                            include_stop_token=False)
        self.user_encoding.build_encoding(core_users)

        core_items = [SingletonSequence(r.beer.name) for r in beer_core]
        self.item_encoding = OneHotEncoding(include_start_token=False,
                                            include_stop_token=False)
        self.item_encoding.build_encoding(core_items)

        logging.info("Loading models[0]...")
        self.catnet = CharacterRNN(
            'catnet',
            len(self.top_encoding) + len(self.cat_encoding),
            len(self.top_encoding),
            n_layers=2, n_hidden=1024)
        self.catnet.load_parameters(self.model_dir / 'catnet.pkl')
        self.catnet.compile_method('log_probability')

        self.catratnet = CharacterRNN(
            'catratnet',
            (len(self.top_encoding) + len(self.rat_encoding)
             + len(self.cat_encoding)),
            len(self.top_encoding),
            n_layers=2, n_hidden=1024)
        self.catratnet.load_parameters(self.model_dir / 'catratnet.pkl')
        self.catratnet.compile_method('generate_with_concat')

        logging.info("Loading models[1]...")
        self.useritemnet = CharacterRNN(
            'useritemnet',
            (len(self.core_encoding) + len(self.user_encoding)
             + len(self.item_encoding)),
            len(self.core_encoding),
            n_layers=2, n_hidden=1024)
        self.useritemnet.load_parameters(self.model_dir / 'useritemnet.pkl')
        self.useritemnet.compile_method('generate_with_concat')

    def generate(self, category, rating, length, temperature=1.0):
        """Generate review text with catratnet for a category and rating.

        Args:
            category: category value known to ``cat_encoding``.
            rating: raw rating; rescaled with ``transform_rating`` first.
            length: forwarded unchanged to ``generate_with_concat``.
            temperature: forwarded unchanged to ``generate_with_concat``.

        Returns:
            The generated sequence decoded with ``top_encoding``, as ``str``.
        """
        rating = self.transform_rating(rating)
        start_token = np.eye(len(self.top_encoding))[
            self.top_encoding.encode("<STR>")]
        category_one_hot = np.eye(len(self.cat_encoding))[
            self.cat_encoding.encode(category)]
        # Conditioning vector: category one-hot followed by the scaled rating.
        condition = np.concatenate([category_one_hot, [rating]])
        results = self.catratnet.generate_with_concat(
            start_token, condition, length, temperature)
        return str(
            NumberSequence(results.argmax(axis=1)).decode(self.top_encoding))

    def category_probability(self, review):
        """Score a review under every category with catnet.

        Each input character is paired with every category one-hot, the
        network's log-probabilities are normalised across axis 1 with
        ``logsumexp``, and the result is exponentiated.

        Args:
            review: the review to score, passed to ``CharacterSequence``.

        Returns:
            A dict mapping each entry of ``cat_encoding.backward_mapping``
            to a list with one value per row of the network output.
        """
        # NOTE(review): other call sites build sequences with
        # CharacterSequence.from_string -- confirm the plain constructor is
        # intended here.
        num_review = CharacterSequence(review).encode(
            self.top_encoding).seq.ravel()
        # Inputs are all characters but the last; targets are shifted by one.
        Xs = num_review[:-1]
        idx = num_review[1:][:, None]

        # One copy of each input one-hot per category.
        X = np.tile(np.eye(len(self.top_encoding))[Xs, None],
                    (1, len(self.cat_encoding), 1))
        S = X.shape[0]
        # Every category one-hot, repeated for each input position.
        cats = np.tile(np.eye(len(self.cat_encoding))[None], (S, 1, 1))
        X, cats = X.astype(np.float32), cats.astype(np.float32)
        X = np.concatenate([X, cats], -1)

        result = self.catnet.log_probability(X, idx)
        # Normalise in log space so each row sums to one over axis 1.
        denom = logsumexp(result, axis=1)
        probs = np.exp(result - denom[:, None])

        data = {}
        for i, cat in enumerate(self.cat_encoding.backward_mapping):
            data[cat] = probs[:, i].tolist()
        return data

    def users(self):
        """Return ``user_encoding.backward_mapping``."""
        return self.user_encoding.backward_mapping

    def items(self):
        """Return ``item_encoding.backward_mapping``."""
        return self.item_encoding.backward_mapping

    def generate_useritemnet(self, user, item, length, temperature=1.0):
        """Generate review text with useritemnet for a user/item pair.

        Args:
            user: user value known to ``user_encoding``.
            item: item value known to ``item_encoding``.
            length: forwarded unchanged to ``generate_with_concat``.
            temperature: forwarded unchanged to ``generate_with_concat``.

        Returns:
            The generated sequence decoded with ``core_encoding``, as ``str``.
        """
        # The start-token index must come from core_encoding: the one-hot is
        # sized by core_encoding and useritemnet is built on it. Looking the
        # token up in top_encoding (fitted on a different dataset) could
        # select the wrong character or index out of range.
        start_token = np.eye(len(self.core_encoding))[
            self.core_encoding.encode("<STR>")]
        # Conditioning vector: user one-hot followed by item one-hot.
        condition = np.concatenate([
            np.eye(len(self.user_encoding))[self.user_encoding.encode(user)],
            np.eye(len(self.item_encoding))[self.item_encoding.encode(item)],
        ])
        results = self.useritemnet.generate_with_concat(
            start_token, condition, length, temperature)
        return str(
            NumberSequence(results.argmax(axis=1)).decode(self.core_encoding))

    def transform_rating(self, rating):
        """Rescale a rating as ``(rating - 3.0) / 2.0``."""
        return (rating - 3.0) / 2.0

    def inverse_transform_rating(self, rating):
        """Undo ``transform_rating``: ``rating * 2 + 3``."""
        return rating * 2 + 3
def __init__(self, model_dir, data_dir):
    """Load both review datasets, fit every encoding and restore three networks.

    Args:
        model_dir: directory holding 'catnet.pkl', 'catratnet.pkl' and
            'useritemnet.pkl'; must support the ``/`` join operator.
        data_dir: directory whose 'beer' subdirectory holds 'beer-top.pkl'
            and 'beer-core.pkl'; must support the ``/`` join operator.
    """
    self.model_dir = model_dir
    self.data_dir = data_dir

    def read_reviews(filename):
        # Only element 0 of the unpickled object is used.
        with open(self.data_dir / 'beer' / filename, 'rb') as handle:
            return pickle.load(handle)[0]

    def fit_one_hot(make_sequence, raw_values, with_tokens):
        # Start and stop tokens are always enabled or disabled together.
        encoding = OneHotEncoding(include_start_token=with_tokens,
                                  include_stop_token=with_tokens)
        encoding.build_encoding([make_sequence(v) for v in raw_values])
        return encoding

    def restore(name, filename, n_input, n_output, method):
        # All three networks share the same depth and hidden size.
        net = CharacterRNN(name, n_input, n_output,
                           n_layers=2, n_hidden=1024)
        net.load_parameters(self.model_dir / filename)
        net.compile_method(method)
        return net

    logging.info("Loading datasets[0]...")
    beer_top = read_reviews('beer-top.pkl')
    logging.info("Loading datasets[1]...")
    beer_core = read_reviews('beer-core.pkl')

    # Encodings derived from the 'top' dataset.
    self.top_encoding = fit_one_hot(
        CharacterSequence.from_string, [r.text for r in beer_top], True)
    self.cat_encoding = fit_one_hot(
        SingletonSequence, [r.beer.style for r in beer_top], False)
    self.rat_encoding = IdentityEncoding(1)

    # Encodings derived from the 'core' dataset.
    self.core_encoding = fit_one_hot(
        CharacterSequence.from_string, [r.text for r in beer_core], True)
    self.user_encoding = fit_one_hot(
        SingletonSequence, [r.user for r in beer_core], False)
    self.item_encoding = fit_one_hot(
        SingletonSequence, [r.beer.name for r in beer_core], False)

    n_top = len(self.top_encoding)
    n_cat = len(self.cat_encoding)
    n_core = len(self.core_encoding)

    logging.info("Loading models[0]...")
    self.catnet = restore(
        'catnet', 'catnet.pkl',
        n_top + n_cat, n_top, 'log_probability')
    self.catratnet = restore(
        'catratnet', 'catratnet.pkl',
        n_top + len(self.rat_encoding) + n_cat, n_top,
        'generate_with_concat')

    logging.info("Loading models[1]...")
    self.useritemnet = restore(
        'useritemnet', 'useritemnet.pkl',
        n_core + len(self.user_encoding) + len(self.item_encoding), n_core,
        'generate_with_concat')