示例#1
0
def load_ratnet(data_dir, model_dir):
    """Load the saved 'ratnet' character RNN together with its text encoding.

    The cached reviews are read from ``data_dir / 'beer_core-train.pkl'``,
    shuffled with a fixed seed and used to rebuild the one-hot character
    encoding. The network is then constructed, its parameters are read from
    ``model_dir / 'ratnet1_2-1024.pkl'`` and two of its methods are compiled.

    Args:
        data_dir: Directory containing the cached pickle. It is joined with
            ``/``, so it must be an object supporting that operator.
        model_dir: Directory containing the saved parameters; same
            requirement as ``data_dir``.

    Returns:
        A ``(ratnet, text_encoding)`` tuple.
    """
    logging.info("Loading data from cache...")
    with open(data_dir / 'beer_core-train.pkl', 'rb') as cache_file:
        # The pickle holds a pair; only its first element is used here.
        reviews, _ = pickle.load(cache_file)

    # Seeding the module-level generator makes the shuffle repeatable.
    random.seed(1337)
    random.shuffle(reviews)

    logging.info("Loading text sequences...")
    text_sequences = []
    for review in reviews:
        text_sequences.append(CharacterSequence.from_string(review.text))

    text_encoding = OneHotEncoding(include_start_token=True,
                                   include_stop_token=True)
    text_encoding.build_encoding(text_sequences)

    identity_encoding = IdentityEncoding(1)

    logging.info("Loading model...")
    # Input width is the text encoding plus the identity encoding; output
    # width is the text encoding alone.
    n_inputs = len(text_encoding) + len(identity_encoding)
    n_outputs = len(text_encoding)
    ratnet = CharacterRNN('ratnet', n_inputs, n_outputs,
                          n_layers=2, n_hidden=1024)
    ratnet.load_parameters(model_dir / 'ratnet1_2-1024.pkl')
    for method_name in ('generate_with_concat', 'log_probability'):
        ratnet.compile_method(method_name)
    return ratnet, text_encoding
示例#2
0
# Load the cached (reviews, beers) pair from the "top" training pickle.
with open('data/beer/beer_top-train.pkl', 'rb') as fp:
    reviews, beers = pickle.load(fp)

# One character sequence per review text, and one single-element sequence
# holding the style of the reviewed beer.
text_sequences = [CharacterSequence.from_string(review.text) for review in reviews]
beer_cats = [SingletonSequence(review.beer.style) for review in reviews]

# NOTE(review): text_encoding and cat_encoding are not defined in this part
# of the file; they must already exist when this line runs.
review_num_seqs = [c.encode(text_encoding) for c in text_sequences]

# All encoded reviews joined end to end into one long sequence.
num_seq = NumberSequence(np.concatenate([c.seq for c in review_num_seqs]))
# Companion sequence: each review's encoded style, replicated len(r) times
# (presumably one entry per element of the encoded review -- TODO confirm).
beer_seq = NumberSequence(np.concatenate([c.encode(cat_encoding).replicate(len(r)).seq for c, r in
                                          zip(beer_cats, review_num_seqs)]))
# Batches are drawn from both sequences together, each with its own encoding.
batcher = WindowedBatcher([num_seq, beer_seq], [text_encoding, cat_encoding],
                          sequence_length=200, batch_size=256)

# Network input width is text + category encoding; output width is the text
# encoding alone.
catnet = CharacterRNN('2pac', len(text_encoding) + len(cat_encoding),
                       len(text_encoding), n_layers=2, n_hidden=1024)
catnet.compile_method("generate_with_concat")

def load_charnet():
    catnet.load_parameters('models/charnet-top_2-1024-2.pkl')
    layer = catnet.lstm.input_layer

    weights = {
        'W_ix': layer.get_parameter_value("W_ix"),
        'W_ox': layer.get_parameter_value("W_ox"),
        'W_fx': layer.get_parameter_value("W_fx"),
        'W_gx': layer.get_parameter_value("W_gx"),
    }

    for w, value in weights.items():
        layer.set_parameter_value(w, np.vstack([value,
示例#3
0
        CharacterSequence.from_string(review.text) for review in reviews
    ]

    text_encoding = dataset.OneHotEncoding(include_start_token=True,
                                           include_stop_token=True)
    text_encoding.build_encoding(text_sequences)

    beer_cats = [SingletonSequence(review.beer.style) for review in reviews]

    cat_encoding = dataset.OneHotEncoding(include_start_token=False,
                                          include_stop_token=False)
    cat_encoding.build_encoding(beer_cats)

    catnet = CharacterRNN('2pac',
                          len(text_encoding) + len(cat_encoding),
                          len(text_encoding),
                          n_layers=2,
                          n_hidden=1024)
    catnet.load_parameters('models/catnet_2-1024-2.pkl')
    catnet.compile_method('generate_examples')

    with open(REAL_FILE, 'w') as fp:
        for review in reviews:
            print >> fp, "%u: %s" % (cat_encoding.encode(
                review.beer.style), review.text)
    with open(GEN_FILE, 'w') as fp:
        for i, beer in enumerate(cat_encoding.backward_mapping):
            logging.info("Generating %s reviews", beer)
            for _ in tqdm(xrange(300)):
                gen_reviews = generate(beer, 2000)
                for review in gen_reviews:
示例#4
0
        c.replicate(len(r)).seq for c, r in zip(beer_ratings, review_num_seqs)
    ]))
# Alternative batcher configurations are kept below as commented-out code;
# the active one pairs the text sequence with beer_seq and uses the text and
# identity encodings.
# batcher = WindowedBatcher(num_seq, [text_encoding, style_encoding], sequence_length=200, batch_size=500)
batcher = WindowedBatcher([num_seq, beer_seq],
                          [text_encoding, identity_encoding],
                          sequence_length=200,
                          batch_size=500)
# batcher = WindowedBatcher(num_seq, [text_encoding], sequence_length=200, batch_size=500)

# NOTE(review): D is whatever text_encoding.index holds; its meaning is not
# visible in this part of the file -- confirm before relying on it.
D = text_encoding.index

# Alternative network configurations are kept below as commented-out code;
# the active one takes text + identity encoding as input and emits the text
# encoding, with 2 layers of 512 hidden units.
# charrnn = CharacterRNN('2pac', len(text_encoding) + len(style_encoding), len(text_encoding), n_layers=2, n_hidden=512)
# charrnn = CharacterRNN('2pac', len(text_encoding), len(text_encoding), n_layers=2, n_hidden=1024)
charrnn = CharacterRNN('2pac',
                       len(text_encoding) + len(identity_encoding),
                       len(text_encoding),
                       n_layers=2,
                       n_hidden=512)
# charrnn.compile_method('generate')

# Optimizer construction is disabled here; train() below takes the optimizer
# as an argument.
# sgd = SGD(charrnn)
# rmsprop = RMSProp(charrnn)
# mom = Momentum(charrnn)


def train(optimizer, n_iterations, *args):
    state = None
    for i in xrange(n_iterations):
        X, y = batcher.next_batch()
        if state is None:
            state = np.zeros((X.shape[1], charrnn.n_layers, charrnn.n_hidden))
示例#5
0
class Beermind(object):
    """Bundle of saved character-level review models and their encodings.

    Two review datasets are loaded at construction time. The "top" dataset
    provides the character encoding (``top_encoding``) and the beer-style
    encoding (``cat_encoding``) used by ``catnet`` and ``catratnet``. The
    "core" dataset provides the character encoding (``core_encoding``) and
    the user / item encodings used by ``useritemnet``.

    Attributes:
        model_dir: Directory the model parameter files are read from.
        data_dir: Directory the dataset pickles are read from.
        top_encoding, core_encoding: One-hot character encodings (with start
            and stop tokens) built from the respective dataset's texts.
        cat_encoding: One-hot encoding of beer styles in the top dataset.
        rat_encoding: ``IdentityEncoding(1)`` used for the rating input.
        user_encoding, item_encoding: One-hot encodings of the users and
            beer names in the core dataset.
        catnet, catratnet, useritemnet: The loaded ``CharacterRNN`` models.
    """

    def __init__(self, model_dir, data_dir):
        """Load the datasets, rebuild every encoding and load the models.

        Args:
            model_dir: Directory holding ``catnet.pkl``, ``catratnet.pkl``
                and ``useritemnet.pkl``. Joined with ``/``, so it must be an
                object supporting that operator.
            data_dir: Directory whose ``beer`` subdirectory holds
                ``beer-top.pkl`` and ``beer-core.pkl``; same requirement.
        """
        self.model_dir = model_dir
        self.data_dir = data_dir

        logging.info("Loading datasets[0]...")

        # NOTE: pickle.load can execute arbitrary code from the file; only
        # point data_dir at trusted caches.
        with open(self.data_dir / 'beer' / 'beer-top.pkl', 'rb') as fp:
            # Only the first element of the pickled object is used.
            beer_top = pickle.load(fp)[0]

        logging.info("Loading datasets[1]...")
        with open(self.data_dir / 'beer' / 'beer-core.pkl', 'rb') as fp:
            beer_core = pickle.load(fp)[0]

        # --- Encodings derived from the "top" dataset -------------------
        self.top_encoding = OneHotEncoding(include_start_token=True,
                                           include_stop_token=True)
        top_sequences = [CharacterSequence.from_string(r.text) for r in beer_top]
        self.top_encoding.build_encoding(top_sequences)

        top_cats = [SingletonSequence(r.beer.style) for r in beer_top]

        self.cat_encoding = OneHotEncoding(include_start_token=False,
                                           include_stop_token=False)
        self.cat_encoding.build_encoding(top_cats)
        self.rat_encoding = IdentityEncoding(1)

        # --- Encodings derived from the "core" dataset ------------------
        self.core_encoding = OneHotEncoding(include_start_token=True,
                                            include_stop_token=True)
        core_sequences = [CharacterSequence.from_string(r.text) for r in beer_core]
        self.core_encoding.build_encoding(core_sequences)

        core_users = [SingletonSequence(r.user) for r in beer_core]
        self.user_encoding = OneHotEncoding(include_start_token=False,
                                            include_stop_token=False)
        self.user_encoding.build_encoding(core_users)

        core_items = [SingletonSequence(r.beer.name) for r in beer_core]
        self.item_encoding = OneHotEncoding(include_start_token=False,
                                            include_stop_token=False)
        self.item_encoding.build_encoding(core_items)

        logging.info("Loading models[0]...")

        # Each network's input width is the sum of the encodings that are
        # concatenated onto its character input; its output width is the
        # character encoding alone.
        self.catnet = CharacterRNN('catnet',
                                   len(self.top_encoding) + len(self.cat_encoding),
                                   len(self.top_encoding),
                                   n_layers=2,
                                   n_hidden=1024)
        self.catnet.load_parameters(self.model_dir / 'catnet.pkl')
        self.catnet.compile_method('log_probability')

        self.catratnet = CharacterRNN('catratnet',
                                      len(self.top_encoding) + len(self.rat_encoding) + len(self.cat_encoding),
                                      len(self.top_encoding),
                                      n_layers=2,
                                      n_hidden=1024)
        self.catratnet.load_parameters(self.model_dir / 'catratnet.pkl')
        self.catratnet.compile_method('generate_with_concat')

        logging.info("Loading models[1]...")
        self.useritemnet = CharacterRNN('useritemnet',
                                        len(self.core_encoding) + len(self.user_encoding) + len(self.item_encoding),
                                        len(self.core_encoding),
                                        n_layers=2,
                                        n_hidden=1024)
        self.useritemnet.load_parameters(self.model_dir / 'useritemnet.pkl')
        self.useritemnet.compile_method('generate_with_concat')

    @staticmethod
    def _one_hot(encoding, symbol):
        """Return the one-hot row for ``symbol``, sized and indexed by ``encoding``."""
        return np.eye(len(encoding))[encoding.encode(symbol)]

    def generate(self, category, rating, length, temperature=1.0):
        """Generate review text with ``catratnet`` for a category and rating.

        Args:
            category: Beer style; must be encodable by ``cat_encoding``.
            rating: Rating on the original scale; it is passed through
                ``transform_rating`` before being fed to the network.
            length: Forwarded unchanged to ``generate_with_concat``.
            temperature: Forwarded unchanged to ``generate_with_concat``.

        Returns:
            The generated sequence decoded with ``top_encoding``, as ``str``.
        """
        rating = self.transform_rating(rating)
        # Conditioning vector: one-hot category followed by the scaled rating.
        conditioning = np.concatenate([
            self._one_hot(self.cat_encoding, category),
            [rating],
        ])
        results = self.catratnet.generate_with_concat(
            self._one_hot(self.top_encoding, "<STR>"),
            conditioning,
            length,
            temperature
        )
        return str(NumberSequence(results.argmax(axis=1)).decode(self.top_encoding))

    def category_probability(self, review):
        """Score a review against every category with ``catnet``.

        The review is encoded with ``top_encoding``; every element except the
        last is used as input and every element except the first as the
        target index. The inputs are copied once per category, each copy
        tagged with that category's one-hot vector, and the values returned
        by ``log_probability`` are normalized across categories.

        Args:
            review: Review text handed to ``CharacterSequence``.

        Returns:
            A dict mapping each entry of ``cat_encoding.backward_mapping`` to
            a list of normalized values (presumably one per input position --
            TODO confirm against ``CharacterRNN.log_probability``).
        """
        num_review = CharacterSequence(review).encode(self.top_encoding).seq.ravel()
        Xs = num_review[:-1]
        idx = num_review[1:][:, None]
        # (S, 1, V) one-hot inputs tiled to (S, C, V): one copy per category.
        X = np.tile(np.eye(len(self.top_encoding))[Xs, None], (1, len(self.cat_encoding), 1))
        S = X.shape[0]
        # (S, C, C): row c of the identity tags copy c with category c.
        cats = np.tile(np.eye(len(self.cat_encoding))[None], (S, 1, 1))
        X, cats = X.astype(np.float32), cats.astype(np.float32)
        X = np.concatenate([X, cats], -1)
        result = self.catnet.log_probability(X, idx)
        # Normalize over axis 1 in log space so each row sums to one.
        denom = logsumexp(result, axis=1)
        probs = np.exp(result - denom[:, None])
        return {
            cat: probs[:, i].tolist()
            for i, cat in enumerate(self.cat_encoding.backward_mapping)
        }

    def users(self):
        """Return ``user_encoding.backward_mapping`` (the known users)."""
        return self.user_encoding.backward_mapping

    def items(self):
        """Return ``item_encoding.backward_mapping`` (the known items)."""
        return self.item_encoding.backward_mapping

    def generate_useritemnet(self, user, item, length, temperature=1.0):
        """Generate review text with ``useritemnet`` for a user and an item.

        Args:
            user: User; must be encodable by ``user_encoding``.
            item: Beer name; must be encodable by ``item_encoding``.
            length: Forwarded unchanged to ``generate_with_concat``.
            temperature: Forwarded unchanged to ``generate_with_concat``.

        Returns:
            The generated sequence decoded with ``core_encoding``, as ``str``.
        """
        # Conditioning vector: one-hot user followed by one-hot item.
        conditioning = np.concatenate([
            self._one_hot(self.user_encoding, user),
            self._one_hot(self.item_encoding, item),
        ])
        results = self.useritemnet.generate_with_concat(
            # The start token must be looked up in core_encoding, the same
            # encoding that sizes the vector and decodes the output; using
            # top_encoding here could select the wrong row or go out of
            # range when the two character sets differ.
            self._one_hot(self.core_encoding, "<STR>"),
            conditioning,
            length,
            temperature
        )
        return str(NumberSequence(results.argmax(axis=1)).decode(self.core_encoding))

    def transform_rating(self, rating):
        """Map a rating linearly so that 3 becomes 0 and 1 / 5 become -1 / +1."""
        return (rating - 3.0) / 2.0

    def inverse_transform_rating(self, rating):
        """Undo ``transform_rating``: map a scaled value back to the rating scale."""
        return rating * 2 + 3
示例#6
0
    def __init__(self, model_dir, data_dir):
        """Load two review datasets, build their encodings and load three models.

        Args:
            model_dir: Directory holding ``catnet.pkl``, ``catratnet.pkl`` and
                ``useritemnet.pkl``. Joined with ``/``, so it must be an
                object supporting that operator.
            data_dir: Directory whose ``beer`` subdirectory holds
                ``beer-top.pkl`` and ``beer-core.pkl``; same requirement.
        """
        self.model_dir = model_dir
        self.data_dir = data_dir

        logging.info("Loading datasets[0]...")

        # Only the first element of each pickled object is used.
        with open(self.data_dir / 'beer' / 'beer-top.pkl', 'rb') as fp:
            beer_top = pickle.load(fp)[0]

        logging.info("Loading datasets[1]...")
        with open(self.data_dir / 'beer' / 'beer-core.pkl', 'rb') as fp:
            beer_core = pickle.load(fp)[0]

        # Character encoding built from the texts of the "top" dataset.
        self.top_encoding = OneHotEncoding(include_start_token=True,
                                           include_stop_token=True)
        top_sequences = [CharacterSequence.from_string(r.text) for r in beer_top]
        self.top_encoding.build_encoding(top_sequences)

        top_cats = [SingletonSequence(r.beer.style) for r in beer_top]

        # Beer-style encoding (no start/stop tokens) and the rating encoding.
        self.cat_encoding = OneHotEncoding(include_start_token=False,
                                           include_stop_token=False)
        self.cat_encoding.build_encoding(top_cats)
        self.rat_encoding = IdentityEncoding(1)

        # Character encoding built from the texts of the "core" dataset.
        self.core_encoding = OneHotEncoding(include_start_token=True,
                                           include_stop_token=True)
        core_sequences = [CharacterSequence.from_string(r.text) for r in beer_core]
        self.core_encoding.build_encoding(core_sequences)

        # User and item (beer name) encodings from the "core" dataset.
        core_users = [SingletonSequence(r.user) for r in beer_core]
        self.user_encoding = OneHotEncoding(include_start_token=False,
                                           include_stop_token=False)
        self.user_encoding.build_encoding(core_users)

        core_items = [SingletonSequence(r.beer.name) for r in beer_core]
        self.item_encoding = OneHotEncoding(include_start_token=False,
                                           include_stop_token=False)
        self.item_encoding.build_encoding(core_items)

        logging.info("Loading models[0]...")

        # Input width: top characters + category; output width: top characters.
        self.catnet = CharacterRNN('catnet',
                                      len(self.top_encoding) + len(self.cat_encoding),
                                      len(self.top_encoding),
                                      n_layers=2,
                                      n_hidden=1024)
        self.catnet.load_parameters(self.model_dir / 'catnet.pkl')
        self.catnet.compile_method('log_probability')

        # Input width: top characters + rating + category.
        self.catratnet = CharacterRNN('catratnet',
                                      len(self.top_encoding) + len(self.rat_encoding) + len(self.cat_encoding),
                                      len(self.top_encoding),
                                      n_layers=2,
                                      n_hidden=1024)
        self.catratnet.load_parameters(self.model_dir / 'catratnet.pkl')
        self.catratnet.compile_method('generate_with_concat')

        logging.info("Loading models[1]...")
        # Input width: core characters + user + item; output: core characters.
        self.useritemnet = CharacterRNN('useritemnet',
                                      len(self.core_encoding) + len(self.user_encoding) + len(self.item_encoding),
                                      len(self.core_encoding),
                                      n_layers=2,
                                      n_hidden=1024)
        self.useritemnet.load_parameters(self.model_dir / 'useritemnet.pkl')
        self.useritemnet.compile_method('generate_with_concat')