# Module-level imports assumed by the snippets below; the original source
# files carry their own import blocks. The `def` blocks with `self` are
# methods excerpted from their classes; helpers such as load_vectors,
# load_matrix, rankloss, avgprec, one_error, and emb_scatter are defined
# elsewhere in the project.
import os
import sys
import time
import pickle

import numpy as np
import matplotlib.pyplot as plt


def eval(self, cur_lang, x, y, wvec, labels, bs=16, av='micro', L=0, source=None):
    """Evaluate the model on the given validation or test set."""
    cur_lang = 0 if source is not None else cur_lang
    preds, real, watts, satts = [], [], [], []
    batch, curbatch = 0, 0
    while batch < len(x) / (1.0 * bs):
        cur_x = x[curbatch:curbatch + bs]
        cur_y = y[curbatch:curbatch + bs]
        x_vecs, y_vecs = load_vectors(wvec, labels, cur_x, cur_y,
                                      self.args['swpad'], self.args['spad'])
        if source is None and not self.single_language:
            # Multilingual model: replicate the input for every language
            # branch and keep the output head of the current language.
            pred = self.model.predict([np.array(x_vecs) for i in range(L)])[cur_lang]
        else:
            pred = self.model.predict(np.array(x_vecs))
        if self.args['store_test'] and not self.args['train']:
            watts.append(self.watts[cur_lang](x_vecs))
            satts.append(self.satts[cur_lang](x_vecs))
        preds.append(pred)
        real.append(y_vecs)
        sys.stdout.write("\t%d/%d\r" % ((batch + 1) * bs, len(x)))
        sys.stdout.flush()
        batch += 1
        curbatch += bs
    # Flatten the per-batch lists into single arrays.
    reals = np.array([rr for r in real for rr in r])
    preds = np.array([pp for p in preds for pp in p])
    if self.args['store_test'] and not self.args['train']:
        watts = np.array([ww for w in watts for ww in w])
        satts = np.array([ss for s in satts for ss in s])
        return reals, preds, watts, satts
    return reals, preds
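
# --- Hedged usage sketch (not part of the original class) ---
# A minimal example of how the (reals, preds) arrays returned by eval() are
# typically consumed: binarize the scores at a threshold (mirroring the 't'
# key used in fit() below) and compute micro-averaged precision/recall/F1
# with scikit-learn. The function name and the threshold default are
# illustrative assumptions, not part of the project.
from sklearn.metrics import precision_recall_fscore_support

def score_predictions(reals, preds, t=0.5, av='micro'):
    """Binarize sigmoid-like scores at threshold t and score them."""
    pred_bin = (preds > t).astype(int)
    p, r, f, _ = precision_recall_fscore_support(
        reals, pred_bin, average=av, zero_division=0)
    return p, r, f

# Example: p, r, f = score_predictions(reals, preds, t=args['t'])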
def plot_decomposition(args):
    print(f'Reading vectors from `{args.vec_path}`...')
    embeddings, w2i, i2w = load_vectors(args.vec_path, gensim=args.gensim_format)
    matrix_path = os.path.join(args.matrix_dir, f'{args.name}')
    logX = load_matrix(matrix_path + '.logx.npz')
    fX = load_matrix(matrix_path + '.fx.npz')
    # toarray() rather than todense(): with np.matrix, `*` below would be a
    # matrix product instead of the intended element-wise weighting.
    logX, fX = logX.toarray(), fX.toarray()
    plt.imshow(embeddings)
    plt.savefig(os.path.join('plots', 'emb.pdf'))
    plt.clf()
    plt.imshow(embeddings.T)
    plt.savefig(os.path.join('plots', 'emb.t.pdf'))
    plt.clf()
    plt.imshow(logX)
    plt.savefig(os.path.join('plots', 'logX.pdf'))
    plt.clf()
    plt.imshow(fX * logX)
    plt.savefig(os.path.join('plots', 'fX.logX.pdf'))
    plt.clf()
    # The trained factorization should reconstruct log X: E @ E.T ~= log X.
    plt.imshow(embeddings @ embeddings.T)
    plt.savefig(os.path.join('plots', 'logX_.pdf'))
    plt.clf()
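
# --- Hedged companion check for the plots above ---
# plot_decomposition() juxtaposes log X with E @ E.T because a GloVe-style
# factorization trains E so that E @ E.T approximates log X on entries
# weighted by f(X). A minimal numpy sketch of that reconstruction error
# (bias terms, if the model uses them, are ignored here; the function name
# is an assumption for illustration):
def reconstruction_error(embeddings, logX, fX):
    """Mean f(X)-weighted squared error of the log co-occurrence reconstruction."""
    diff = embeddings @ embeddings.T - logX
    return float((fX * diff ** 2).sum() / fX.sum())

# Example: print(reconstruction_error(embeddings, logX, fX))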
def fit(self, X_train, Y_train, X_val, Y_val, labels, wvecs, vocabs):
    """Trains the model using stochastic gradient descent. At each epoch, it
    stores the parameters of the model and its performance on the
    validation set."""
    resume_path, resume_epoch = self.find_checkpoint()
    errors, prs, recs, fs = [], [], [], []
    val_scores, train_scores = [], []
    if self.args['seed'] is not None:
        np.random.seed(self.args['seed'])
    for e in range(self.args['ep']):
        if resume_epoch > 1 and e < resume_epoch:
            continue  # skip epochs already covered by the checkpoint
        print("\nEpoch %d/%d" % (e + 1, self.args['ep']))
        batch, elapsed, curbatch = 0, 0, 0
        while batch < (self.args['ep_size'] // self.args['bs']):
            X_vecs, Y_vecs, start_time = [], [], time.time()
            # Draw a random batch for every language.
            for l in range(len(self.args['languages'])):
                idxs = np.random.randint(len(X_train[l]),
                                         size=self.args['bs']).tolist()
                cur_x = X_train[l][idxs]
                cur_y = Y_train[l][idxs]
                x_vecs, y_vecs = load_vectors(wvecs[l], labels[l], cur_x, cur_y,
                                              self.args['swpad'], self.args['spad'])
                X_vecs.append(np.array(x_vecs))
                Y_vecs.append(np.array(y_vecs))
            if self.single_language:
                err = self.model.train_on_batch(X_vecs[0], Y_vecs[0])[0]
                preds = self.model.predict(X_vecs[0], batch_size=self.args['bs'])
            else:
                err = self.model.train_on_batch(X_vecs, Y_vecs)[0]
                preds = self.model.predict(X_vecs, batch_size=self.args['bs'])
            pr, rec, f = self.get_avgresults(preds, Y_vecs)
            errors.append(err); prs.append(pr); recs.append(rec); fs.append(f)
            # Progress bar over the epoch.
            progress = ((batch + 1) * self.args['bs']) * 30. / self.args['ep_size']
            elapsed += time.time() - start_time
            stat_args = (("=" * int(progress)).ljust(30, '.'), round(elapsed),
                         sum(errors) / len(errors), sum(prs) / len(prs),
                         sum(recs) / len(recs), sum(fs) / len(fs))
            progress = ("%d/%d" % ((batch + 1) * self.args['bs'],
                                   self.args['ep_size'])).ljust(15)
            stats = "[%s] - %ds - loss: %.4f - p: %.4f - r: %.4f - f1: %.4f\r" % stat_args
            sys.stdout.write(progress + stats)
            sys.stdout.flush()
            batch += 1
            curbatch += self.args['bs']
        if resume_path is not None and e == 0:
            print("\n[*] Loading initial weights from %s" % resume_path)
            self.model.load_weights(resume_path)
        train_score = (sum(prs) / len(prs), sum(recs) / len(recs),
                       sum(fs) / len(fs))
        lang_scores = []
        for l in range(len(X_train)):
            print("\n[*] Validating on %s..." % self.args['languages'][l])
            reals, preds = self.eval(l, X_val[l], Y_val[l], wvecs[l], labels[l],
                                     L=len(X_train))
            val_score = self.get_results(reals, preds > self.args['t'])
            lang_scores.append(val_score)
        val_scores.append(val_score)
        train_scores.append(train_score)
        for l, language in enumerate(self.args['languages']):
            self.save_model(language, e, train_score, lang_scores[l])
    return train_scores, val_scores
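
# --- Hedged sketch of a helper used above ---
# get_avgresults() is called in fit() but not shown in this section. The
# following is a guess at its contract for the multilingual case: threshold
# each language's batch predictions and average micro precision/recall/F1
# over the languages. Treat this as a hypothetical stand-in, not the
# project's actual definition.
def get_avgresults_sketch(preds, Y_vecs, t=0.5):
    ps, rs, fs = [], [], []
    for pred, gold in zip(preds, Y_vecs):
        pred_bin = (np.asarray(pred) > t).astype(int)
        gold = np.asarray(gold)
        tp = float((pred_bin * gold).sum())
        p = tp / max(pred_bin.sum(), 1.0)
        r = tp / max(gold.sum(), 1.0)
        f = 2 * p * r / max(p + r, 1e-8)
        ps.append(p); rs.append(r); fs.append(f)
    return np.mean(ps), np.mean(rs), np.mean(fs)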
def __data_generation(self, chunk_id, idxs):
    """Load data for a particular chunk id, with or without label sampling."""
    if self.chunk_mode:
        x_vecs, y_vecs = load_vectors(self.X[chunk_id], self.Y[chunk_id], idxs,
                                      self.wpad, len(self.l_vecs), self.model)
    else:
        x_vecs, y_vecs = load_vectors(self.X, self.Y, idxs,
                                      self.wpad, len(self.l_vecs), self.model)
    if self.model.args['la']:
        if self.sampling < 1.0:
            # Subsample the label set: keep every positive label and fill the
            # remaining budget with negatives drawn from the complement.
            ls_vecs, ys_vecs = [], []
            num_sam = int(self.sampling * len(self.l_vecs))
            for i_idx, xv in enumerate(x_vecs):
                cur_pos_ids = y_vecs[i_idx].nonzero()[0].tolist()
                all_ids = list(set(self.init_ids) - set(cur_pos_ids))
                sample_ids = np.random.randint(len(all_ids),
                                               size=num_sam).tolist()
                merged_ids = cur_pos_ids + np.array(all_ids)[sample_ids].tolist()
                ls_vecs.append(self.l_vecs[merged_ids][:num_sam])
                ys_vecs.append(y_vecs[i_idx][merged_ids][:num_sam])
            return [np.array(x_vecs), np.array(ls_vecs)], np.array(ys_vecs)
        else:
            return [np.array(x_vecs), self.bsl_vecs], np.array(y_vecs)
    else:
        return np.array(x_vecs), np.array(y_vecs)
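
# --- Hedged standalone demo of the label-sampling scheme above ---
# For each example, the generator keeps the positive label ids and fills the
# remaining budget (sampling * num_labels) with negatives sampled (with
# replacement) from the complement set, then truncates to the budget. A
# compact numpy illustration with made-up sizes; the function name is an
# assumption for illustration:
def sample_label_ids(y_row, num_labels, sampling=0.5, rng=np.random):
    """Return label ids: all positives first, then sampled negatives."""
    budget = int(sampling * num_labels)
    pos = y_row.nonzero()[0].tolist()
    neg = list(set(range(num_labels)) - set(pos))
    picks = rng.randint(len(neg), size=budget).tolist()
    merged = pos + np.array(neg)[picks].tolist()
    return merged[:budget]

# Example: sample_label_ids(np.array([0, 1, 0, 0, 1, 0, 0, 0]), 8, 0.5)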
def tsne(args):
    """Plots t-SNE for the most frequent word vectors."""
    num_words = 1000
    print(f'Reading vectors from `{args.vec_path}`...')
    embeddings, w2i, i2w = load_vectors(args.vec_path, gensim=args.gensim_format)
    vocab_path = os.path.join(args.vocab_dir, f'{args.name}.vocab')
    embeddings = embeddings[:num_words, :]
    most_common_words = [i2w[i] for i in range(num_words)]
    print(f'Loaded {embeddings.shape[0]} vectors.')
    print(f'Plotting t-SNE for {num_words} vectors.')
    # Make bokeh plot.
    emb_scatter(embeddings, list(most_common_words), model_name=args.name)
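
# --- Hedged sketch of emb_scatter ---
# The real helper produces a bokeh plot; this dependency-light stand-in uses
# scikit-learn's t-SNE and matplotlib instead. The projection step is the
# same idea; the name and signature below are assumptions for illustration.
from sklearn.manifold import TSNE

def emb_scatter_sketch(embeddings, words, model_name='model'):
    points = TSNE(n_components=2, random_state=0).fit_transform(embeddings)
    plt.figure(figsize=(12, 12))
    plt.scatter(points[:, 0], points[:, 1], s=4)
    for (px, py), word in zip(points, words):
        plt.annotate(word, (px, py), fontsize=6)
    plt.savefig(os.path.join('plots', f'{model_name}.tsne.pdf'))
    plt.clf()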
def eval(self, cur_lang, x, y, label_vecs, bs=8, av='micro', L=0, source=None,
         avg=True, mode='none'):
    """Evaluate the model on the given validation or test set."""
    cur_lang = 0 if source is not None else cur_lang
    batch, curbatch, init = 0, 0, 0
    rls, aps, oes, elapsed = [], [], [], 0.0
    total = len(x)
    keys = list(x.keys())
    num_labels = label_vecs.shape[1]
    if mode and mode == 'seen':
        eval_ids = pickle.load(open(self.args['seen_ids'], 'rb'))
        eval_ids = self.revids[eval_ids]  # select evaluation ids
    elif mode and mode == 'unseen':
        eval_ids = pickle.load(open(self.args['unseen_ids'], 'rb'))
        eval_ids = self.revids[eval_ids]  # select evaluation ids
    else:  # validation
        eval_ids = np.arange(label_vecs.shape[1])
        total = 5000  # use a small sample for validation; otherwise too slow
    print()
    while batch < total / (1.0 * bs):
        start_time = time.time()
        init_ids = [init + curbatch + cur for cur in range(bs)
                    if init + curbatch + cur < len(keys)]
        idxs = np.array(keys)[init_ids]
        x_vecs, y_vecs = load_vectors(x, y, idxs, self.args['wpad'],
                                      num_labels, self)
        if self.args['la']:  # zero-shot models
            if self.args['train']:
                # Predictions for all the labels are built chunk by chunk,
                # because sampling fixes the label-input size at
                # sampling * num_labels.
                ll = int(self.args['sampling'] * num_labels)
                done, pred, pi = False, None, 0
                while not done:
                    if pi == 0:
                        totest = label_vecs[:, :ll]
                    else:
                        totest = label_vecs[:, pi * ll:ll + pi * ll]
                    if totest.shape[1] != ll:
                        # Zero-pad the last, shorter chunk up to size ll.
                        remained = totest.shape[1]
                        totest = np.hstack([totest,
                                            np.zeros((bs, ll - totest.shape[1],
                                                      totest.shape[2]))])
                        done = True
                    cur_pred = self.model.predict([np.array(x_vecs), totest],
                                                  batch_size=self.args['bs'])
                    if pred is None:
                        pred = cur_pred
                    elif done:
                        pred = np.hstack([pred, cur_pred[:, :remained]])
                    else:
                        pred = np.hstack([pred, cur_pred])
                    pi += 1
            else:
                pred = self.model.predict([np.array(x_vecs), label_vecs],
                                          batch_size=self.args['bs'])
        else:  # non-zero-shot models
            pred = self.model.predict(np.array(x_vecs),
                                      batch_size=self.args['bs'])
        real = np.array(y_vecs)
        pred = np.array(pred)
        rls.append(rankloss(real[:, eval_ids], pred[:, eval_ids]))
        aps.append(avgprec(real[:, eval_ids], pred[:, eval_ids]))
        cur_oes = [one_error(real[j][eval_ids], pred[j][eval_ids])
                   for j in range(len(pred))]
        oes.append(np.array(cur_oes).mean())
        elapsed += time.time() - start_time
        sys.stdout.write("\t%d/%d rls=%.5f - aps=%.5f - oe=%.5f \t %ds\r"
                         % ((batch + 1) * bs, len(x), np.array(rls).mean(),
                            np.array(aps).mean(), np.array(oes).mean(), elapsed))
        sys.stdout.flush()
        batch += 1
        curbatch += bs
    if avg:
        rls = np.array(rls).mean()
        aps = np.array(aps).mean()
        oes = np.array(oes).mean()
        print("rl: %.4f - ap: %.4f - oe: %.4f" % (rls, aps, oes))
    return rls, aps, oes
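
# --- Hedged reference implementations of the metrics used above ---
# rankloss, avgprec, and one_error are standard multi-label ranking metrics.
# scikit-learn ships the first two; one-error is the fraction of examples
# whose single top-scored label is not a true label. These stand-ins are for
# orientation only and may differ in detail from the project's own helpers.
from sklearn.metrics import (label_ranking_loss,
                             label_ranking_average_precision_score)

def one_error_sketch(y_true, y_score):
    """Per-example one-error: 0 if the top-ranked label is relevant, else 1."""
    return float(y_true[np.argmax(y_score)] == 0)

# Batch versions, as used in eval():
#   rankloss(R, P) ~ label_ranking_loss(R, P)
#   avgprec(R, P)  ~ label_ranking_average_precision_score(R, P)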
def eval(self, cur_lang, x, y, wvec, labels, bs=16, av='micro', L=0, source=None):
    """Evaluate the model on the given validation or test set."""
    cur_lang = 0 if source is not None else cur_lang
    preds, real, watts, satts = [], [], [], []
    batch, curbatch = 0, 0
    if self.args['la']:
        # Build one label-vector tensor per language. Only the current
        # language receives its real label vectors; the other branches get
        # random placeholders of the right shape, since their outputs are
        # discarded below.
        label_vecs = []
        for l, lang in enumerate(self.args['languages']):
            out_dim = self.model.layers[-1 * (len(self.args['languages']) - l)].output_shape
            vecs = [np.random.ranf(self.args['wdim'])
                    for lnum in range(out_dim[1])]
            l_vecs = [vecs for b in range(bs)]
            if l == cur_lang:
                vecs = self.load_vecs(np.array(wvec), labels)
                l_vecs = [vecs for b in range(bs)]
            label_vecs.append(np.array(l_vecs))
    while batch < len(x) / (1.0 * bs):
        cur_x = x[curbatch:curbatch + bs]
        cur_y = y[curbatch:curbatch + bs]
        x_vecs, y_vecs = load_vectors(wvec, labels, cur_x, cur_y,
                                      self.args['swpad'], self.args['spad'])
        if source is None and not self.single_language:
            if self.args['la']:
                inputs_all = self.get_inputs(
                    [np.array(x_vecs) for i in range(L)],
                    [label_vecs[i] for i in range(L)])
                pred = self.model.predict(inputs_all)[cur_lang]
            else:
                pred = self.model.predict(
                    [np.array(x_vecs) for i in range(L)])[cur_lang]
        else:
            if self.args['la']:
                pred = self.model.predict([np.array(x_vecs),
                                           label_vecs[cur_lang]])
            else:
                pred = self.model.predict(np.array(x_vecs))
        if self.args['store_test'] and not self.args['train']:
            watts.append(self.watts[cur_lang](x_vecs))
            satts.append(self.satts[cur_lang](x_vecs))
        preds.append(pred)
        real.append(y_vecs)
        sys.stdout.write("\t%d/%d\r" % ((batch + 1) * bs, len(x)))
        sys.stdout.flush()
        batch += 1
        curbatch += bs
    reals = np.array([rr for r in real for rr in r])
    preds = np.array([pp for p in preds for pp in p])
    if self.args['store_test'] and not self.args['train']:
        watts = np.array([ww for w in watts for ww in w])
        satts = np.array([ss for s in satts for ss in s])
        return reals, preds, watts, satts
    return reals, preds