import pickle
import sys

from nn.utils import Vocabulary

session = sys.argv[1]

with open("session/%s/vocab.pkl" % session) as vocab_file:
    vocab = pickle.load(vocab_file)

print(vocab.word_to_index)
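# Usage sketch (the script filename here is hypothetical; the session name is
# the one used elsewhere in this repo):
#
#   python inspect_vocab.py sp15_trial
#
# which prints the word -> index mapping stored in session/sp15_trial/vocab.pkl.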
import numpy
import theano
import theano.tensor as T

from nn.utils import Vocabulary


class TwitterReconstructionDatabase(object):

    def __init__(self, phase, batch_size, max_len=140, batches_per_epoch=1000, pad=True):
        self.phase = phase
        self.batch_size = batch_size
        self.batches_per_epoch = batches_per_epoch
        self.max_len = max_len

        self.vocab = Vocabulary()
        self.vocab.add('<pad>')
        self.vocab.add('<unk>')
        self.vocab.add('<end>')
        for i in range(256):
            ch = chr(i)
            self.vocab.add(ch)
        self.n_classes = len(self.vocab)
        self.pad = pad

        self.tweets = []
        with open("data/tweets.txt") as f:
            while True:
                s = f.readline()
                if s == "":
                    break
                s = s.strip().split(" ")
                for i in range(len(s)):
                    if s[i].startswith('http://'):
                        s[i] = "url"
                    if s[i].startswith('https://'):
                        s[i] = "url"
                    if s[i].startswith("@"):
                        s[i] = "@userid"
                    #if s[i].startswith("#"):
                    #    s[i] = "#hashtag"
                tweet = " ".join(s)
                if len(tweet) <= max_len - 1:
                    self.tweets.append(tweet)
                if len(self.tweets) >= 1000000:
                    break

        valid_size = 10000
        if self.phase == 'train':
            self.tweets = self.tweets[valid_size:]
        else:
            self.tweets = self.tweets[:valid_size]

        print("%s: %d tweets, max %d chars" % (phase, len(self.tweets), max_len))

        x = self.make_batch()
        self.shared_x = theano.shared(x)
        self.index = T.iscalar()

    def to_inputs(self, tweet):
        chars = [self.vocab.by_word(ch, oov_word='<unk>') for ch in tweet]
        chars.append(self.vocab.by_word('<end>'))
        for i in range(self.max_len - len(tweet) - 1):
            chars.append(self.vocab.by_word('<pad>'))
        return numpy.asarray(chars)

    def make_batch(self):
        batch = numpy.zeros((self.max_len, self.batch_size))
        if self.pad:
            for i in range(self.batch_size):
                idx = numpy.random.randint(len(self.tweets))
                batch[:, i] = self.to_inputs(self.tweets[idx])
        else:
            idx = numpy.random.randint(len(self.tweets))
            max_len = len(self.tweets[idx])
            target_len = len(self.tweets[idx])
            batch[:, 0] = self.to_inputs(self.tweets[idx])
            i = 1
            while i < self.batch_size:
                idx = numpy.random.randint(len(self.tweets))
                if abs(len(self.tweets[idx]) - target_len) > 3:
                    continue
                batch[:, i] = self.to_inputs(self.tweets[idx])
                max_len = max(max_len, len(self.tweets[idx]) + 1)
                i += 1
            batch = batch[0:max_len]
        return batch.astype('int32')

    def total_batches(self):
        return self.batches_per_epoch

    def givens(self, x, t):
        return {
            x: self.shared_x[:, self.index * self.batch_size:(self.index + 1) * self.batch_size],
        }

    def indices(self):
        for i in range(self.total_batches()):
            x = self.make_batch()
            self.shared_x.set_value(x)
            yield 0
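# A minimal usage sketch for the database above, not part of the original
# module. It assumes data/tweets.txt exists; the batch size and the symbolic
# variable x are illustrative stand-ins for whatever the training script uses.
if __name__ == '__main__':
    train_db = TwitterReconstructionDatabase('train', batch_size=128,
                                             max_len=140, batches_per_epoch=1000)
    x = T.imatrix()
    # givens() maps the symbolic input onto a slice of the shared batch;
    # the second argument (targets) is accepted but unused by this database.
    givens = train_db.givens(x, None)
    for _ in train_db.indices():
        # each step of indices() refreshes shared_x with a freshly sampled batch;
        # a compiled Theano training function would be called here.
        pass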
import os
import pickle
import time

from nn.layers import OneHot
from nn.utils import Vocabulary
import nn.utils

from lm_vae import Sampler
from lm_vae_sample import LNLSTMStep
from textproject_vae_charlevel import make_model

from wordfilter import Wordfilter

wordfilter = Wordfilter()

t1 = time.time()

session = "sp15_trial"

vocab = Vocabulary()

if os.path.exists("session/%s/vocab.pkl" % session):
    with open("session/%s/vocab.pkl" % session) as vocab_file:
        vocab = pickle.load(vocab_file)
    print("Loaded vocab with %i chars:" % len(vocab))
    #print(vocab.index_to_word)
else:
    print("Using default 256-char vocab")  # old-school
    vocab.add("<pad>")
    vocab.add("<unk>")
    vocab.add("<end>")
    for i in xrange(256):
        ch = chr(i)
        vocab.add(ch)
@app.route('/get_z', methods=['GET'])
def get_z():
    #json = request.get_json()
    s1 = request.args.get('s1')
    print(s1)
    z, text = serve_get_z(s1)
    return jsonify({"z": z, "text": text})


# The actual work

t1 = time.time()

print("It begins")

vocab = Vocabulary()

if os.path.exists("session/%s/vocab.pkl" % session):
    with open("session/%s/vocab.pkl" % session) as vocab_file:
        vocab = pickle.load(vocab_file)
    print("Loaded vocab with %i chars:" % len(vocab))
    print(vocab.index_to_word)
else:
    print("Using default 256-char vocab")
    # Should probably extract this into a little shared module with
    # textproject_reconstruction_database. Maybe later.
    vocab.add('<pad>')
    vocab.add('<unk>')
    vocab.add('<end>')
    for i in xrange(32, 128):
        ch = chr(i)
        vocab.add(ch)
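# A client-side sketch for the /get_z route above, not part of the app itself.
# The host and port are assumptions about wherever this Flask app is served.
import requests

resp = requests.get("http://localhost:5000/get_z",
                    params={"s1": "the map is not the territory"})
print(resp.json()["z"])     # latent vector for s1
print(resp.json()["text"])  # whatever text serve_get_z returns alongside it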
import os
import time

import numpy
import theano
import theano.tensor as T

from nn.utils import Vocabulary


class TextProjectReconstructionDatabase(object):

    def __init__(self, dataset, phase, batch_size, max_len=140, pad=True, sp_model=None):
        self.phase = phase
        self.batch_size = batch_size
        self.max_len = max_len
        self.sp_model = sp_model

        self.vocab = Vocabulary()

        self.using_sp = (self.sp_model is not None) and (len(self.sp_model) > 0)

        if self.using_sp:
            print("Using sentencepiece")
            import sentencepiece as spm  # https://github.com/google/sentencepiece
            self.sp = spm.SentencePieceProcessor()
            self.sp.Load(self.sp_model)
            sp_model_size = self.sp.GetPieceSize()
            print("Loaded SP model with %d tokens" % sp_model_size)
            self.vocab.add('<pad>')
            self.vocab.add('<unk>')
            self.vocab.add('<end>')
            for i in xrange(sp_model_size):
                self.vocab.add(self.sp.IdToPiece(i))
        else:
            print("Using default fixed vocab")
            self.vocab.add('<pad>')
            self.vocab.add('<unk>')
            self.vocab.add('<end>')
            for i in xrange(32, 128):
                ch = chr(i)
                self.vocab.add(ch)

        self.n_classes = len(self.vocab)
        self.pad = pad

        self.sentences = []

        # First, check the full path; if that doesn't work, guess that it's in data/
        if os.path.exists(dataset):
            dataset_path = dataset
        elif os.path.exists("data/%s" % dataset):
            dataset_path = "data/%s" % dataset
        else:
            raise Exception("Can't find any dataset named %s!" % dataset)

        with open(dataset_path) as f:
            while True:
                s = f.readline()
                if s == "":
                    break
                if self.using_sp:
                    chars = self.sp.EncodeAsIds(s)
                    if len(chars) <= max_len - 1:
                        self.sentences.append(s)
                elif len(s) <= max_len - 1:
                    self.sentences.append(s)
                #if len(self.sentences) >= 1000000:
                #    break

        self.shuffle_sentences()

        valid_size = int(len(self.sentences) * 0.1)
        if self.phase == 'train':
            self.sentences = self.sentences[valid_size:]
        else:
            self.sentences = self.sentences[:valid_size]

        print "%s data: %d sentences, max %d chars" % (
            phase, len(self.sentences), max_len)

        self.batches_per_epoch = int(len(self.sentences) / batch_size)

        # per the original textvae code, let's just keep this lean
        if self.phase == 'valid':
            print "Reducing valid set to 100 batches"
            self.batches_per_epoch = min(self.batches_per_epoch, 100)

        x = self.make_batch()
        self.shared_x = theano.shared(x)
        self.index = T.iscalar()

    def shuffle_sentences(self):
        # this all might be horribly inefficient but whatever
        t = time.time()
        print("Shuffling %s sentences..." % len(self.sentences))
        numpy.random.shuffle(self.sentences)
        print "...done. Took %s seconds" % round(time.time() - t)

    def to_inputs(self, sentence):
        sentence = sentence.replace("\n", "")
        if self.using_sp:
            #print(self.sp.EncodeAsPieces(sentence))
            chars = self.sp.EncodeAsIds(sentence)
        else:
            chars = [self.vocab.by_word(ch, oov_word='<unk>') for ch in sentence]
        chars.append(self.vocab.by_word('<end>'))
        for i in xrange(self.max_len - len(chars)):
            chars.append(self.vocab.by_word('<pad>'))
        return numpy.asarray(chars)

    # The original code drew random samples but didn't keep track of which had
    # already been drawn. This seems not ideal to me so I am rewriting to make
    # minibatches draw samples *without* replacement.
    # EDIT: Now back to doing it the original way, because speed!
    def make_batch(self):
        batch = numpy.zeros((self.max_len, self.batch_size))
        if self.pad:
            for i in xrange(self.batch_size):
                # draw a random sentence index (with replacement; see the note above)
                idx = numpy.random.randint(len(self.sentences))
                batch[:, i] = self.to_inputs(self.sentences[idx])
        else:
            idx = numpy.random.randint(len(self.sentences))
            max_len = len(self.sentences[idx])
            target_len = len(self.sentences[idx])
            batch[:, 0] = self.to_inputs(self.sentences[idx])
            i = 1
            while i < self.batch_size:
                idx = numpy.random.randint(len(self.sentences))
                if abs(len(self.sentences[idx]) - target_len) > 3:
                    continue
                batch[:, i] = self.to_inputs(self.sentences[idx])
                max_len = max(max_len, len(self.sentences[idx]) + 1)
                i += 1
            batch = batch[0:max_len]
        return batch.astype('int32')

    def givens(self, x, t):
        return {
            x: self.shared_x[:, self.index * self.batch_size:(self.index + 1) * self.batch_size],
        }

    def total_batches(self):
        return self.batches_per_epoch

    def indices(self):
        for i in xrange(self.batches_per_epoch):
            x = self.make_batch()
            self.shared_x.set_value(x)
            yield 0
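# A minimal sketch of how this database is typically instantiated, with and
# without a SentencePiece model; the dataset and model filenames below are
# illustrative, not taken from the training script.
if __name__ == '__main__':
    # character-level vocab: 3 special tokens + chr(32)..chr(127) = 99 classes
    char_db = TextProjectReconstructionDatabase('sentences.txt', 'train',
                                                batch_size=64, max_len=140)
    print(char_db.n_classes)           # 99
    print(char_db.make_batch().shape)  # (max_len, batch_size), dtype int32

    # SentencePiece vocab, if a trained .model file is available
    sp_db = TextProjectReconstructionDatabase('sentences.txt', 'train',
                                              batch_size=64, max_len=64,
                                              sp_model='sp15.model')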
def main(z, sample_size, p, lstm_size, mode, alpha):
    vocab = Vocabulary()
    vocab.add('<pad>')
    vocab.add('<unk>')
    vocab.add('<end>')
    for i in range(256):
        ch = chr(i)
        vocab.add(ch)
    n_classes = len(vocab)

    model = make_model(z, sample_size, p, n_classes, lstm_size, alpha)
    name = "twittervae.charlevel.z_%d.len_%d.p_%.2f.lstmsz_%d.alpha_%.2f" % (
        z, sample_size, p, lstm_size, alpha)
    model.load("exp/%s/model.flt" % name)
    model.set_phase(train=False)

    start_word = n_classes

    if mode == 'vary':
        n = 7
        sampled = numpy.random.normal(0, 1, (1, z))
        sampled = numpy.repeat(sampled, n * z, axis=0)
        for dim in range(z):
            eps = 0.01
            x = numpy.linspace(eps, 1 - eps, num=n)
            x = norm.ppf(x)
            sampled[dim * n:(dim + 1) * n, dim] = x
        n *= z
    elif mode == 'interpolatereal':
        valid_db = TwitterReconstructionDatabase("valid", 50, batches_per_epoch=100,
                                                 max_len=sample_size)
        s1 = numpy.random.randint(0, len(valid_db.tweets))
        s2 = numpy.random.randint(0, len(valid_db.tweets))
        encoder = model.layers[0].branches[0]
        sampler = encoder[-1]
        assert isinstance(sampler, Sampler)
        ins = numpy.zeros((sample_size, 2))
        ins[:, 0] = valid_db.to_inputs(valid_db.tweets[s1])
        ins[:, 1] = valid_db.to_inputs(valid_db.tweets[s2])
        x = T.imatrix()
        z = encoder(x)
        mu = sampler.mu
        f = theano.function([x], mu)
        z = f(ins.astype('int32'))
        s1_z = z[0]
        s2_z = z[1]
        n = 7
        s1_z = numpy.repeat(s1_z[None, :], n, axis=0)
        s2_z = numpy.repeat(s2_z[None, :], n, axis=0)
        steps = numpy.linspace(0, 1, n)[:, None]
        sampled = s1_z * (1 - steps) + s2_z * steps
    elif mode == 'arithm':
        valid_db = TwitterReconstructionDatabase("valid", 50, batches_per_epoch=100,
                                                 max_len=sample_size)
        s1 = numpy.random.randint(0, len(valid_db.tweets))
        s2 = numpy.random.randint(0, len(valid_db.tweets))
        s3 = numpy.random.randint(0, len(valid_db.tweets))
        print(valid_db.tweets[s1])
        print(valid_db.tweets[s2])
        print(valid_db.tweets[s3])
        encoder = model.layers[0].branches[0]
        sampler = encoder[-1]
        assert isinstance(sampler, Sampler)
        ins = numpy.zeros((sample_size, 3))
        ins[:, 0] = valid_db.to_inputs(valid_db.tweets[s1])
        ins[:, 1] = valid_db.to_inputs(valid_db.tweets[s2])
        ins[:, 2] = valid_db.to_inputs(valid_db.tweets[s3])
        x = T.imatrix()
        z = encoder(x)
        mu = sampler.mu
        f = theano.function([x], mu)
        z = f(ins.astype('int32'))
        s1_z = z[0]
        s2_z = z[1]
        s3_z = z[2]
        n = 1
        sampled = s1_z - s2_z + s3_z
        sampled = sampled[None, :]
    elif mode == 'interpolate':
        z = numpy.random.normal(0, 1, (2, z))
        s1_z = z[0]
        s2_z = z[1]
        n = 7
        s1_z = numpy.repeat(s1_z[None, :], n, axis=0)
        s2_z = numpy.repeat(s2_z[None, :], n, axis=0)
        steps = numpy.linspace(0, 1, n)[:, None]
        sampled = s1_z * (1 - steps) + s2_z * steps
    else:
        n = 100
        sampled = numpy.random.normal(0, 1, (n, z))

    start_words = numpy.ones(n) * start_word
    start_words = theano.shared(start_words.astype('int32'))
    sampled = theano.shared(sampled.astype(theano.config.floatX))

    decoder_from_z = model.layers[1].branches[0]
    from_z = decoder_from_z(sampled)

    layers = model.layers[-3:]
    layers[0] = LNLSTMStep(layers[0])
    step = Sequential(layers)

    embed = model.layers[1].branches[1].layers[-1]

    words = start_words
    generated = []
    for i in range(sample_size):
        ins = T.concatenate([from_z[i], embed(words)], axis=1)
        pred = step(ins)
        words = T.argmax(pred, axis=1)
        generated.append(words[None, :])

    generated = T.concatenate(generated, axis=0)

    import time
    t = time.time()
    print("compiling...", end=' ')
    f = theano.function([], outputs=generated)
    print("done, took %f secs" % (time.time() - t))

    w = f()

    results = []
    pad = vocab.by_word("<pad>")
    end = vocab.by_word("<end>")
    for i in range(w.shape[1]):
        s = []
        for idx in w[:, i]:
            if idx == end:
                break
            if idx == pad:
                break
            s.append(vocab.by_index(idx))
        r = ''.join(s)
        if mode == "vary":
            if i % n == 0:
                print("dimension %d" % (i / n))
        print(r.strip())
        results.append(r)
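# A hedged example of invoking main() directly. The hyperparameters must match
# a model that has actually been trained into exp/; the values below are
# illustrative only, and the original script presumably fills them in from the
# command line.
if __name__ == '__main__':
    main(z=100, sample_size=140, p=0.0, lstm_size=1000, mode='interpolate', alpha=0.2)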