def iter_valid_batches(self):
    """Yield batches over the validation split.

    Items are collected into chunks of batch_size * 20, ordered within
    each chunk by the length of their first field so that similarly
    sized items land in the same batch, then cut into batches.
    """
    chunk_size = self.batch_size * 20
    for chunk in grouper(self.data['valid'], chunk_size):
        lengths = [len(first) for first, _, _ in chunk]
        ordered = [chunk[i] for i in numpy.argsort(lengths)]
        for group in grouper(ordered, self.batch_size):
            yield self.batcher.batch(group)
def cmd_predict_v(dataset='coco', datapath='.', model_path='.', model_name='model.pkl.gz', batch_size=128, output_v='predict_v.npy', output_r='predict_r.npy'):
    """Encode the validation sentences of `dataset` with a stored model and
    save the visual-space predictions (output_v) and hidden
    representations (output_r) as .npy files under model_path.
    """
    M = load(model_path, model_name=model_name)
    model = M['model']
    batcher = M['batcher']
    mapper = M['batcher'].mapper
    predict_v = predictor_v(model)  # sentence batch -> visual feature vectors
    predict_r = predictor_r(model)  # sentence batch -> hidden representations
    prov = dp.getDataProvider(dataset, root=datapath)
    sents = list(prov.iterSentences(split='val'))
    # Map each sentence to model-internal token ids.
    inputs = list(
        mapper.transform(
            [tokens(sent, tokenizer=batcher.tokenizer) for sent in sents]))
    # NOTE(review): debug output — number of parameter arrays in the network.
    print len(model.network.params())
    preds_v = numpy.vstack([
        predict_v(batcher.batch_inp(batch))
        for batch in grouper(inputs, batch_size)
    ])
    numpy.save(os.path.join(model_path, output_v), preds_v)
    preds_r = numpy.vstack([
        predict_r(batcher.batch_inp(batch))
        for batch in grouper(inputs, batch_size)
    ])
    numpy.save(os.path.join(model_path, output_r), preds_r)
def cmd_predict_r(model_path='.', batch_size=128, split='train', output_premise='predict_premise_r.npy', output_hypo='predict_hypo_r.npy', output_labels='entailment_labels.npy'):
    """Predict hidden representations for the premises and hypotheses of
    an SNLI split and store them, together with the gold entailment
    labels, as split-prefixed .npy files under model_path.
    """
    def load(f):
        # Unpickle a gzip-compressed artefact stored next to the model.
        return pickle.load(gzip.open(os.path.join(model_path, f)))

    batcher, scaler, model = [load(f) for f in
                              ['batcher.pkl.gz', 'scaler.pkl.gz', 'model.pkl.gz']]
    predict_r = predictor_r(model)
    sents_premise, sents_hypo, labels = zip(*parse_snli(split=split))

    def save(name, data):
        numpy.save(os.path.join(model_path, split + '_' + name), data)

    def encode(sentences):
        encoded = list(batcher.mapper.transform(sentences))
        return numpy.vstack([predict_r(batcher.batch_inp(group))
                             for group in grouper(encoded, batch_size)])

    save(output_premise, encode(sents_premise))
    save(output_hypo, encode(sents_hypo))
    save(output_labels, labels)
def iter_train_batches(self):
    """Yield training batches, optionally in curriculum order.

    With curriculum enabled, the whole training set is first sorted by
    'tokens_in' length; in either case items are then chunked into
    groups of batch_size * 20, length-sorted within each chunk, and
    split into batches.
    """
    if self.curriculum:
        order = numpy.argsort(
            [len(item['tokens_in']) for item in self.data['train']])
        data = [self.data['train'][i] for i in order]
    else:
        data = self.data['train']
    for chunk in util.grouper(data, self.batch_size * 20):
        by_length = numpy.argsort([len(item['tokens_in']) for item in chunk])
        ordered = [chunk[i] for i in by_length]
        for group in util.grouper(ordered, self.batch_size):
            yield self.batcher.batch(group)
def encode_images(model, imgs, batch_size=128, task=None):
    """Project imgs to the joint space using model.

    task defaults to model.task when not given.
    """
    if task is None:
        task = model.task
    encoded = [task.encode_images(batch)
               for batch in util.grouper(imgs, batch_size)]
    return numpy.vstack(encoded)
def encode_sentences(model, audios, batch_size=128):
    """Project audios to the joint space using model.

    For each audio returns a vector.
    """
    batches = util.grouper(audios, batch_size)
    return numpy.vstack([model.task.predict(vector_padder(group))
                         for group in batches])
def predict_img(model, audios, batch_size=32):
    """Project sents to the visual space using model.

    For each sentence returns the predicted vector of visual features.
    """
    predictions = []
    for group in util.grouper(audios, batch_size):
        predictions.append(model.task.predict(vector_padder(group)))
    return numpy.vstack(predictions)
def predict_img(model, sents, batch_size=128):
    """Project sents to the visual space using model.

    For each sentence returns the predicted vector of visual features.
    """
    encoded = list(model.batcher.mapper.transform(sents))
    parts = []
    for group in util.grouper(encoded, batch_size):
        parts.append(model.visual.predict(model.batcher.batch_inp(group)))
    return numpy.vstack(parts)
def encode_sentences(model, sents, batch_size=128):
    """Project sents to the joint space using model.

    For each sentence returns a vector.
    """
    encoded = list(model.batcher.mapper.transform(sents))
    groups = util.grouper(encoded, batch_size)
    return numpy.vstack(
        [model.task.predict(model.batcher.batch_inp(g)) for g in groups])
def representation(model, sents, batch_size=128):
    """Project sents to hidden state space using model.

    For each sentence returns a vector corresponding to the activation
    of the hidden layer at the end-of-sentence symbol.
    """
    task = model.Visual
    encoded = list(model.batcher.mapper.transform(sents))
    # Keep only the activation at the final timestep of each sentence.
    last = [task.representation(model.batcher.batch_inp(g))[:, -1, :]
            for g in util.grouper(encoded, batch_size)]
    return numpy.vstack(last)
def predict_img(model, sents, batch_size=128):
    """Project sents to the visual space using model.

    For each sentence returns the predicted vector of visual features.
    """
    visual = model.Visual
    encoded = list(model.batcher.mapper.transform(sents))
    predictions = [visual.predict(model.batcher.batch_inp(group))
                   for group in util.grouper(encoded, batch_size)]
    return numpy.vstack(predictions)
def representation(model, sents, batch_size=128):
    """Project sents to hidden state space using model.

    For each sentence returns a vector corresponding to the activation
    of the hidden layer at the end-of-sentence symbol.
    """
    task = model.Visual
    encoded = list(model.batcher.mapper.transform(sents))
    final_states = []
    for group in util.grouper(encoded, batch_size):
        states = task.representation(model.batcher.batch_inp(group))
        # Last timestep only.
        final_states.append(states[:, -1, :])
    return numpy.vstack(final_states)
def cmd_train_rte(data_path='.', size=200, dropout=0.0, lr=0.0002, epochs=1, batch_size=64, model_path='.', seed=None):
    """Train an RTE (textual entailment) classifier on precomputed
    premise/hypothesis representations and pickle it under model_path.

    Loads train/dev representation and label arrays from data_path,
    trains for `epochs` epochs, and reports train/valid cross-entropy
    and validation accuracy after each epoch.
    """
    # Pickling the trained model can recurse deeply.
    sys.setrecursionlimit(50000)
    if seed is not None:
        random.seed(seed)
    classify_size = 3  # number of entailment classes
    premise_r = numpy.load(
        os.path.join(data_path, "train_predict_premise_r.npy"))
    hypo_r = numpy.load(os.path.join(data_path, "train_predict_hypo_r.npy"))
    labels = onehot(
        numpy.load(os.path.join(data_path, "train_entailment_labels.npy")),
        classify_size)
    val_premise_r = numpy.load(
        os.path.join(data_path, "dev_predict_premise_r.npy"))
    val_hypo_r = numpy.load(os.path.join(data_path, "dev_predict_hypo_r.npy"))
    val_labels = onehot(
        numpy.load(os.path.join(data_path, "dev_entailment_labels.npy")),
        classify_size)
    size_repr = premise_r.shape[1]
    model = RTE(size_repr=size_repr, size_hidden=size, dropout=dropout, lr=lr)
    start_epoch = 1
    for epoch in range(start_epoch, epochs + 1):
        costs = Counter()
        # NOTE(review): j is only bound inside this loop — the reports
        # below raise NameError if the training data yields no batches.
        for _j, item in enumerate(
                grouper(itertools.izip(premise_r, hypo_r, labels), batch_size)):
            j = _j + 1
            premise, hypo, label = zip(*item)
            cost = model.train(premise, hypo, label)
            costs += Counter({'cost': cost, 'N': 1})
        costs_valid = valid_loss(model, val_premise_r, val_hypo_r, val_labels)
        # Per-epoch report: mean train/valid cross-entropy, valid accuracy.
        print epoch, j, j * batch_size, "train", "ce", costs['cost'] / costs[
            'N']
        print epoch, j, j * batch_size, "valid", "ce", costs_valid[
            'cost'] / costs_valid['N']
        print epoch, j, j*batch_size, "valid", "ac", \
            metrics.accuracy_score(numpy.argmax(val_labels, axis=1),
                                   numpy.argmax(model.predict(val_premise_r, val_hypo_r), axis=1))
        # pickle.dump(model, gzip.open(os.path.join(model_path, "entailment_model.{}.pkl.gz".format(epoch)),'w'),
        #             protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump(model,
                    gzip.open(os.path.join(model_path, "entailment_model.pkl.gz"), 'w'),
                    protocol=pickle.HIGHEST_PROTOCOL)
def pile(model, sents, batch_size=128):
    """Project each symbol in each sentence in sents to hidden state
    spaces corresponding to layers using model.

    For each sentence returns a 3D tensor corresponding to the
    activations of the hidden layers at each position in the sentence.
    """
    task = model.Visual
    lens = map(len, sents)
    encoded = list(model.batcher.mapper.transform(sents))
    stacked = []
    for group in util.grouper(encoded, batch_size):
        stacked.extend(task.pile(model.batcher.batch_inp(group)))
    # Keep only the last len+1 timesteps of each (padded) activation.
    return [r[-(l + 1):, :, :] for (r, l) in zip(stacked, lens)]
def cmd_predict_r(model_path='.', batch_size=128, split='train', output_premise='predict_premise_r.npy', output_hypo='predict_hypo_r.npy', output_labels='entailment_labels.npy'):
    """Compute hidden-representation predictions for SNLI premise and
    hypothesis sentences of `split` and write them, along with the
    entailment labels, to split-prefixed .npy files in model_path.
    """
    def load(f):
        # Unpickle a gzip-compressed artefact stored next to the model.
        return pickle.load(gzip.open(os.path.join(model_path, f)))
    model_name = 'model.pkl.gz'
    batcher, scaler, model = map(load,
                                 ['batcher.pkl.gz', 'scaler.pkl.gz', model_name])
    mapper = batcher.mapper
    predict_r = predictor_r(model)
    sents_premise, sents_hypo, labels = zip(*parse_snli(split=split))
    inputs_premise = list(mapper.transform(sents_premise))
    inputs_hypo = list(mapper.transform(sents_hypo))
    prefix = split + '_'
    preds_premise_r = numpy.vstack(
        [predict_r(batcher.batch_inp(group))
         for group in grouper(inputs_premise, batch_size)])
    numpy.save(os.path.join(model_path, prefix + output_premise),
               preds_premise_r)
    preds_hypo_r = numpy.vstack(
        [predict_r(batcher.batch_inp(group))
         for group in grouper(inputs_hypo, batch_size)])
    numpy.save(os.path.join(model_path, prefix + output_hypo), preds_hypo_r)
    numpy.save(os.path.join(model_path, prefix + output_labels), labels)
def pile(model, sents, batch_size=128):
    """Project each symbol in each sentence in sents to hidden state
    spaces corresponding to layers using model.

    For each sentence returns a 3D tensor corresponding to the
    activations of the hidden layers at each position in the sentence.
    """
    task = model.Visual
    sentence_lengths = map(len, sents)
    encoded = list(model.batcher.mapper.transform(sents))
    activations = [act
                   for group in util.grouper(encoded, batch_size)
                   for act in task.pile(model.batcher.batch_inp(group))]
    # Trim padding: keep only the trailing len+1 positions per sentence.
    return [act[-n - 1:, :, :]
            for (act, n) in zip(activations, sentence_lengths)]
def cmd_predict_v(dataset='coco', datapath='.', model_path='.', model_name='model.pkl.gz', batch_size=128, output_v='predict_v.npy', output_r='predict_r.npy'):
    """Encode the validation sentences of `dataset` with a stored model and
    save the visual-space predictions (output_v) and hidden
    representations (output_r) as .npy files under model_path.
    """
    M = load(model_path, model_name=model_name)
    model = M['model']
    batcher = M['batcher']
    mapper = M['batcher'].mapper
    predict_v = predictor_v(model)  # sentence batch -> visual feature vectors
    predict_r = predictor_r(model)  # sentence batch -> hidden representations
    prov = dp.getDataProvider(dataset, root=datapath)
    sents = list(prov.iterSentences(split='val'))
    # Map each sentence to model-internal token ids.
    inputs = list(mapper.transform([tokens(sent, tokenizer=batcher.tokenizer)
                                    for sent in sents]))
    # NOTE(review): debug output — number of parameter arrays in the network.
    print len(model.network.params())
    preds_v = numpy.vstack([
        predict_v(batcher.batch_inp(batch))
        for batch in grouper(inputs, batch_size)
    ])
    numpy.save(os.path.join(model_path, output_v), preds_v)
    preds_r = numpy.vstack([
        predict_r(batcher.batch_inp(batch))
        for batch in grouper(inputs, batch_size)
    ])
    numpy.save(os.path.join(model_path, output_r), preds_r)
def featurefile(dataset='flickr8k', chunksize=1000, kind='fbank', noisy=False, root='/home/gchrupala/repos/reimaginet'):
    """Extract acoustic features for a dataset and save them in chunks.

    Reads base64-encoded mp3 speech from the dataset's gzipped jsonl
    file, extracts 'mfcc' or 'fbank' features per utterance, and writes
    one .npy file per chunk of `chunksize` utterances.

    root: base directory of the data tree (new parameter; the default
    preserves the previously hard-coded path, so existing callers are
    unaffected).

    Raises ValueError when `kind` is not 'mfcc' or 'fbank'.
    """
    if kind == 'mfcc':
        extract = extract_mfcc
    elif kind == 'fbank':
        extract = extract_fbank
    else:
        # FIX: `raise "Invalid kind"` raised a string, which is itself a
        # TypeError on Python >= 2.6 — raise a real exception instead.
        raise ValueError("Invalid kind: {}".format(kind))
    infix = ".noisy" if noisy else ""
    source = "{}/data/{}/dataset{}.mp3.jsonl.gz".format(root, dataset, infix)
    for i, chunk in enumerate(util.grouper(gzip.open(source), chunksize)):
        result = []
        for line in chunk:
            sent = json.loads(line)
            # Speech audio is stored base64-encoded mp3.
            sound = decodemp3(base64.b64decode(sent['speech']))
            result.append(extract(sound))
        numpy.save("{}/data/{}/dataset{}.{}.{}.npy".format(root, dataset, infix, kind, i),
                   result)
def cmd_train_rte(data_path='.', size=200, dropout=0.0, lr=0.0002, epochs=1, batch_size=64, model_path='.', seed=None): sys.setrecursionlimit(50000) if seed is not None: random.seed(seed) classify_size = 3 premise_r = numpy.load(os.path.join(data_path, "train_predict_premise_r.npy")) hypo_r = numpy.load(os.path.join(data_path, "train_predict_hypo_r.npy")) labels = onehot(numpy.load(os.path.join(data_path, "train_entailment_labels.npy")), classify_size) val_premise_r = numpy.load(os.path.join(data_path, "dev_predict_premise_r.npy")) val_hypo_r = numpy.load(os.path.join(data_path, "dev_predict_hypo_r.npy")) val_labels = onehot(numpy.load(os.path.join(data_path, "dev_entailment_labels.npy")), classify_size) size_repr = premise_r.shape[1] model = RTE(size_repr=size_repr, size_hidden=size, dropout=dropout, lr=lr) start_epoch=1 for epoch in range(start_epoch, epochs+1): costs = Counter() for _j,item in enumerate(grouper(itertools.izip(premise_r, hypo_r, labels), batch_size)): j = _j + 1 premise, hypo, label = zip(*item) cost = model.train(premise, hypo, label) costs += Counter({'cost':cost, 'N':1}) costs_valid = valid_loss(model, val_premise_r, val_hypo_r, val_labels) print epoch, j, j*batch_size, "train", "ce", costs['cost']/costs['N'] print epoch, j, j*batch_size, "valid", "ce", costs_valid['cost']/costs_valid['N'] print epoch, j, j*batch_size, "valid", "ac", \ metrics.accuracy_score(numpy.argmax(val_labels, axis=1), numpy.argmax(model.predict(val_premise_r, val_hypo_r), axis=1)) # pickle.dump(model, gzip.open(os.path.join(model_path, "entailment_model.{}.pkl.gz".format(epoch)),'w'), # protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(model, gzip.open(os.path.join(model_path, "entailment_model.pkl.gz"),'w'), protocol=pickle.HIGHEST_PROTOCOL)
def iter_train_batches(self):
    """Yield training batches.

    Items are chunked into groups of batch_size * 20, ordered within
    each chunk by the length of their first field, then split into
    batches of batch_size.
    """
    chunk_size = self.batch_size * 20
    for chunk in grouper(self.data['train'], chunk_size):
        order = numpy.argsort([len(seq) for seq, _, _ in chunk])
        ordered = [chunk[i] for i in order]
        for group in grouper(ordered, self.batch_size):
            yield self.batcher.batch(group)
def encode_images(model, imgs, batch_size=128, task=None):
    """Project imgs to the joint space using model.

    task: the task object whose encode_images is used; defaults to
    model.task. (Added for consistency with the task-aware variant of
    this helper elsewhere in the codebase; callers passing only
    (model, imgs) behave exactly as before.)
    """
    if task is None:
        task = model.task
    return numpy.vstack([task.encode_images(batch)
                         for batch in util.grouper(imgs, batch_size)])
def layer_states(model, audios, batch_size=128):
    """Pass audios through the model and for each audio return the state
    of each timestep and each layer."""
    stride = model.config['stride']
    filter_length = model.config['filter_length']
    # Number of model timesteps corresponding to each input audio.
    lens = (numpy.array(map(len, audios)) + filter_length) // stride
    states = []
    for group in util.grouper(audios, batch_size):
        states.extend(model.task.pile(vector_padder(group)))
    # Drop the padded prefix: keep the last `n` timesteps per audio.
    return [s[-n:, :, :] for (s, n) in zip(states, lens)]
def iter_valid_batches(self):
    """Yield validation batches; items within each chunk of
    batch_size * 20 are ordered by 'tokens_in' length before batching."""
    chunk_size = self.batch_size * 20
    for chunk in util.grouper(self.data['valid'], chunk_size):
        order = numpy.argsort([len(item['tokens_in']) for item in chunk])
        ordered = [chunk[i] for i in order]
        for group in util.grouper(ordered, self.batch_size):
            yield self.batcher.batch(group)