def set_ensemble_train_func(self):
    """Compile the Theano training function for the ensemble (MoE) model."""
    write('Building an ensemble training function...')

    # Optimizer setup, optionally resuming from a saved optimizer state.
    self.optimizer = get_optimizer(self.argv)
    self.optimizer.set_params(self.model.params)
    if self.argv.load_opt_param:
        self.optimizer.load_params(self.argv.load_opt_param)

    # Gold spans; 1D: batch_size * n_spans, 2D: [batch index, label id, span index]
    span_true = T.imatrix('span_true')

    # Span representations; 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim
    span_reps = self.model.feat_layer.forward(self.model.inputs, self.experts)
    # Label scores; 1D: batch_size, 2D: n_labels, 3D: n_spans
    label_scores = self.model.feat_layer.calc_logit_scores(h=span_reps)
    # Best span index per label; 1D: batch_size, 2D: n_labels
    span_pred = self.model.argmax_span(label_scores)

    # Objective: negative log likelihood plus an L2 penalty.
    neg_log_lik = self.model.calc_loss(label_scores, span_true)
    penalty = L2Regularizer()(alpha=self.argv.reg, params=self.model.params)
    cost = neg_log_lik + penalty

    gradients = T.grad(cost=cost, wrt=self.model.params)
    param_updates = self.optimizer(grads=gradients, params=self.model.params)

    self.train_func = theano.function(
        inputs=self.model.inputs + [span_true],
        outputs=[cost, span_pred],
        updates=param_updates,
        mode='FAST_RUN')
def set_model(self, **kwargs):
    """Build and compile the single (non-ensemble) span model.

    Expected kwargs: vocab_word, use_elmo, vocab_label, vocab_label_valid,
    word_emb (pre-trained embedding matrix or None).
    """
    write('Setting a model...')
    argv = self.argv
    self.vocab_word = kwargs['vocab_word']
    self.use_elmo = kwargs['use_elmo']
    self.vocab_label = kwargs['vocab_label']
    self.vocab_label_valid = kwargs['vocab_label_valid']
    pretrained_emb = kwargs['word_emb']

    n_word_vocab = self.vocab_word.size() if self.vocab_word else 0
    # Input dim comes from the embedding matrix when one is provided.
    if pretrained_emb is None:
        self.input_dim = argv.emb_dim
    else:
        self.input_dim = pretrained_emb.shape[1]
    self.hidden_dim = argv.hidden_dim
    self.output_dim = -1
    self.decoder = Decoder(argv=argv, vocab_label=self.vocab_label)

    self.model = SpanModel()
    self.model.compile(inputs=self._set_inputs(),
                       vocab_word_size=n_word_vocab,
                       use_elmo=self.use_elmo,
                       word_emb=pretrained_emb,
                       input_dim=[self.input_dim, self.input_dim],
                       hidden_dim=self.hidden_dim,
                       feat_dim=2 * self.hidden_dim,
                       output_dim=self.vocab_label.size(),
                       n_layers=argv.n_layers,
                       drop_rate=argv.drop_rate)
    write('\t- {}'.format("\n\t- ".join(layer.name for layer in self.model.layers)))
    self._show_model_config()
def set_train_func(self):
    """Compile the Theano training function for the sequence labeling model."""
    write('Building a training function...')

    self.optimizer = get_optimizer(self.argv)
    self.optimizer.set_params(self.model.params)
    if self.argv.load_opt_param:
        write('\tLoading optimization params...')
        self.optimizer.load_params(self.argv.load_opt_param)

    # Gold label ids per word.
    gold_labels = T.imatrix('y')
    # Emission scores; 1D: batch_size, 2D: n_words, 3D: output_dim
    emissions = self.model.get_emit_scores()
    # Best label id per word; 1D: batch_size, 2D: n_words
    pred_labels = self.model.label_layer.get_y_pred(emissions)
    # Log probability of the gold label path; 1D: batch_size
    path_log_proba = self.model.label_layer.get_y_path_proba(emissions, gold_labels)

    # Loss: mean negative path log-likelihood plus an L2 penalty.
    regularizer = L2Regularizer()
    loss = -T.mean(path_log_proba) + regularizer(alpha=self.argv.reg,
                                                 params=self.model.params)
    gradients = T.grad(cost=loss, wrt=self.model.params)
    param_updates = self.optimizer(grads=gradients, params=self.model.params)

    self.train_func = theano.function(
        inputs=self.model.inputs + [gold_labels],
        outputs=[loss, pred_labels],
        updates=param_updates,
        on_unused_input='warn',
        mode='FAST_RUN')
def _show_model_config(self):
    """Print the model's dimensions and its total parameter count."""
    model = self.model
    # Total number of scalar parameters across all shared variables.
    n_params = sum(p.get_value(borrow=True).size for p in model.params)
    write('Model configuration')
    write('\t- Input Dim: {}'.format(self.input_dim))
    write('\t- Hidden Dim: {}'.format(self.hidden_dim))
    write('\t- Output Dim: {}'.format(self.output_dim))
    write('\t- Parameters: {}'.format(n_params))
def load_experts_params(self, path):
    """Load saved parameters into each expert model.

    :param path: directory holding pickled parameter files whose basenames
        start with 'param'; at most argv.n_experts files are used
    """
    write('Loading experts params...')
    param_files = glob.glob(path + '/*')
    param_files = [
        fn for fn in param_files if fn.split('/')[-1].startswith('param')
    ]
    write("\t - Param Files: %s" % str(param_files))
    # BUG FIX: the original loop reused the name `path` for each file,
    # shadowing the directory argument. Also sort the file list so the
    # file -> expert assignment is deterministic (glob order is arbitrary).
    for i, param_file in enumerate(sorted(param_files)[:self.argv.n_experts]):
        params = load_pickle(param_file)
        assert len(self.experts[i].params) == len(params)
        for p1, p2 in zip(self.experts[i].params, params):
            p1.set_value(p2)
def set_pred_func(self):
    """Compile the Theano prediction function (model inputs -> label ids)."""
    write('Building a predicting function...')
    # Emission scores; 1D: batch_size, 2D: n_words, 3D: output_dim
    emissions = self.model.get_emit_scores()
    # Best label id per word; 1D: batch_size, 2D: n_words
    pred_labels = self.model.label_layer.get_y_pred(emissions)
    self.pred_func = theano.function(
        inputs=self.model.inputs,
        outputs=pred_labels,
        on_unused_input='warn',
        mode='FAST_RUN')
def f_score(self, y_true, y_pred, vocab_label):
    """Compute, print, and return the span F1 score.

    :param y_true: 1D: n_batches, 2D: batch_size, 3D: n_spans,
        4D: [label_id, pre_index, post_index]
    :param y_pred: same layout as y_true
    :return: F1 score (float)
    """
    n_correct, n_pred, n_gold = self.metrics(y_true=y_true,
                                             y_pred=y_pred,
                                             vocab_label=vocab_label)
    precision, recall, f1 = f_score(n_correct, n_pred, n_gold)
    write('\tF:{:>7.2%} P:{:>7.2%} ({:>5}/{:>5}) R:{:>7.2%} ({:>5}/{:>5})'.format(
        f1, precision, int(n_correct), int(n_pred),
        recall, int(n_correct), int(n_gold))
    )
    return f1
def set_init_ensemble_param(self):
    """Initialize the ensemble's final layer with the experts' averaged weights."""
    write('Initializing params...')
    n_labels = self.vocab_label.size()
    # Accumulate the experts' last weight matrix and bias vector.
    W_sum = np.zeros(shape=(2 * self.hidden_dim, n_labels),
                     dtype=theano.config.floatX)
    b_sum = np.zeros(shape=n_labels, dtype=theano.config.floatX)
    for expert in self.experts:
        W_sum += expert.params[-2].get_value(borrow=True)
        b_sum += expert.params[-1].get_value(borrow=True)
    n_experts = len(self.experts)
    self.model.params[-2].set_value(W_sum / n_experts)
    self.model.params[-1].set_value(b_sum / n_experts)
def train(self, batches):
    """Run one training epoch of the ensemble model over the given batches.

    Accumulates self.correct / self.n_pred_spans and prints epoch statistics
    (average NLL and P/R/F over predicted spans).

    :param batches: iterable of input tuples for train_func; inputs[1] holds
        the predicate marks and inputs[-1] the gold spans
    """
    start = time.time()
    n_batches = 0.
    loss_total = 0.
    self.correct = 0.
    self.n_pred_spans = 0.

    # Switch the MoE model and all experts to training mode.
    self.model.feat_layer.is_train.set_value(1)
    if self.experts:
        for model in self.experts:
            model.feat_layer.is_train.set_value(1)

    for inputs in batches:
        self.n_total_batches += 1
        n_batches += 1
        if n_batches % 100 == 0:
            sys.stdout.write("%d " % n_batches)
            sys.stdout.flush()

        # Skip overly long sentences (> 100 words) to bound memory/compute.
        if len(inputs[0][0]) > 100:
            continue

        loss, span_pred = self.train_func(*inputs)

        if math.isnan(loss):
            write('\n\nNAN: Index: %d\n' % n_batches)
            # BUG FIX: use sys.exit() instead of the site-provided exit(),
            # which is not guaranteed to exist in non-interactive runs.
            sys.exit(1)

        loss_total += loss
        crr, p_total = calc_correct_and_pred_spans(span_true=inputs[-1],
                                                   span_pred=span_pred,
                                                   marks=inputs[1])
        self.correct += crr
        self.n_pred_spans += p_total

    # Back to inference mode.
    self.model.feat_layer.is_train.set_value(0)
    if self.experts:
        for model in self.experts:
            model.feat_layer.is_train.set_value(0)

    avg_loss = loss_total / n_batches
    p, r, f = calc_f_score(self.correct, self.n_pred_spans, self.n_true_spans)
    write('\n\tTime: %f seconds' % (time.time() - start))
    write('\tAverage Negative Log Likelihood: %f(%f/%d)' %
          (avg_loss, loss_total, n_batches))
    write(
        '\tF:{:>7.2%} P:{:>7.2%} ({:>5}/{:>5}) R:{:>7.2%} ({:>5}/{:>5})'.
        format(f, p, int(self.correct), int(self.n_pred_spans), r,
               int(self.correct), int(self.n_true_spans)))
def train(self, batches):
    """Run one training epoch of the BIO labeling model over the given batches.

    Prints epoch statistics (average NLL and P/R/F from BIO metrics).

    :param batches: iterable of input tuples for train_func; inputs[-1]
        holds the gold label id matrix
    """
    start = time.time()
    n_batches = 0.
    n_samples = 0.
    loss_total = 0.
    p_total = 0.
    r_total = 0.
    correct = 0.

    # Switch to training mode (e.g. enables dropout in the feature layer).
    self.model.feat_layer.is_train.set_value(1)

    for index, inputs in enumerate(batches):
        if (index + 1) % 100 == 0:
            sys.stdout.write('%d ' % (index + 1))
            sys.stdout.flush()

        batch_size = len(inputs[0])
        n_words = len(inputs[0][0])
        # Skip degenerate (< 2 words) and overly long (> 100 words) batches.
        if n_words < 2 or 100 < n_words:
            continue

        loss, y_pred = self.train_func(*inputs)

        if math.isnan(loss):
            write('\n\nNAN: Index: %d\n' % (index + 1))
            # BUG FIX: use sys.exit() instead of the site-provided exit(),
            # which is not guaranteed to exist in non-interactive runs.
            sys.exit(1)

        loss_total += loss
        n_batches += 1
        n_samples += batch_size * n_words
        correct_i, p_total_i, r_total_i = metrics_for_bio(
            y_true=inputs[-1], y_pred=y_pred, vocab_label=self.vocab_label)
        correct += correct_i
        p_total += p_total_i
        r_total += r_total_i

    self.model.feat_layer.is_train.set_value(0)

    # ROBUSTNESS: the length filter may skip every batch; avoid a
    # ZeroDivisionError in that case.
    avg_loss = loss_total / n_batches if n_batches else 0.
    p, r, f = f_score(correct, p_total, r_total)
    write('\n\tTime: %f seconds' % (time.time() - start))
    write('\tAverage Negative Log Likelihood: %f(%f/%d)' %
          (avg_loss, loss_total, n_batches))
    write(
        '\tF:{:>7.2%} P:{:>7.2%} ({:>5}/{:>5}) R:{:>7.2%} ({:>5}/{:>5})'.
        format(f, p, int(correct), int(p_total), r, int(correct),
               int(r_total)))
def predict_greedy(self, batches):
    """Predict span triples per sample with greedy decoding.

    :param batches: 1D: n_sents, 2D: n_prds, 3D: n_feats, 4D: n_words; elem=(x_w, x_m)
    :return: 1D: n_sents, 2D: n_prds, 3D: n_spans; elem=[label_id, pre_index, post_index]
    """
    start = time.time()
    results = []
    for index, inputs in enumerate(batches):
        if (index + 1) % 100 == 0:
            sys.stdout.write("%d " % (index + 1))
            sys.stdout.flush()
        if not inputs:
            triples = []
        else:
            scores = self.pred_func(*inputs)
            triples = self.decoder.greedy_span_triples(scores=scores,
                                                       marks=inputs[-1])
        results.append(triples)
    write('\n\tTime: %f seconds' % (time.time() - start))
    return results
def predict(self, batches):
    """Predict a label id sequence for every sample in every batch.

    :param batches: 1D: n_batches, 2D: n_words; elem=(x_w, x_m)
    :return: 1D: n_batches, 2D: batch_size; elem=predicted label ids
    """
    start = time.time()
    results = []
    for index, inputs in enumerate(batches):
        if (index + 1) % 100 == 0:
            sys.stdout.write("%d " % (index + 1))
            sys.stdout.flush()
        if not inputs:
            batch_pred = []
        elif len(inputs[0][0]) < 2:
            # Single-word sentences bypass the network: label id 0 per sample.
            batch_pred = [[0] for _ in range(len(inputs[0]))]
        else:
            batch_pred = self.pred_func(*inputs)
        results.append(batch_pred)
    write('\n\tTime: %f seconds' % (time.time() - start))
    return results
def set_ensemble_model(self, **kwargs):
    """Build the MoE ensemble model together with its expert span models.

    Expected kwargs: vocab_word, use_elmo, vocab_label, vocab_label_valid,
    word_emb (pre-trained embedding matrix or None).
    """
    write('Setting a model...')
    argv = self.argv
    self.vocab_word = kwargs['vocab_word']
    self.use_elmo = kwargs['use_elmo']
    self.vocab_label = kwargs['vocab_label']
    self.vocab_label_valid = kwargs['vocab_label_valid']
    pretrained_emb = kwargs['word_emb']

    n_word_vocab = self.vocab_word.size() if self.vocab_word else 0
    # Input dim comes from the embedding matrix when one is provided.
    if pretrained_emb is None:
        self.input_dim = argv.emb_dim
    else:
        self.input_dim = pretrained_emb.shape[1]
    self.hidden_dim = argv.hidden_dim
    self.output_dim = -1
    self.decoder = Decoder(argv=argv, vocab_label=self.vocab_label)

    # Mixture-of-experts model combining the experts' span features.
    shared_inputs = self._set_inputs()
    self.model = MoEModel()
    self.model.compile(inputs=shared_inputs,
                       feat_dim=2 * self.hidden_dim,
                       output_dim=self.vocab_label.size(),
                       drop_rate=argv.drop_rate,
                       n_experts=argv.n_experts)
    write('\t- {}\n'.format("\n\t- ".join(layer.name for layer in self.model.layers)))

    # Expert span models, all sharing the MoE model's symbolic inputs.
    experts = []
    for _ in range(argv.n_experts):
        expert = SpanModel()
        expert.compile(inputs=self.model.inputs,
                       vocab_word_size=n_word_vocab,
                       use_elmo=self.use_elmo,
                       input_dim=[self.input_dim, self.input_dim],
                       hidden_dim=self.hidden_dim,
                       feat_dim=2 * self.hidden_dim,
                       output_dim=self.vocab_label.size(),
                       n_layers=argv.n_layers,
                       word_emb=pretrained_emb,
                       drop_rate=argv.drop_rate)
        write('\t- {}\n'.format("\n\t- ".join(layer.name for layer in expert.layers)))
        experts.append(expert)
    self.experts = experts
def set_ensemble_pred_func(self):
    """Build the ensemble prediction function chosen by the search strategy."""
    write('Building an ensemble predicting function...')
    builder = (self.set_ensemble_pred_argmax_func
               if self.argv.search == 'argmax'
               else self.set_ensemble_pred_score_func)
    builder()
def _run_epochs(self, train_samples, valid_samples=None, init_epoch=0):
    """Run the main training loop, validating and checkpointing each epoch.

    :param train_samples: training samples (re-batched every epoch)
    :param valid_samples: optional validation samples; when given, F1 is
        evaluated after every epoch and the best model is tracked
    :param init_epoch: epoch index to resume from (0 for a fresh run)
    """
    write('\nTRAIN START')
    argv = self.argv
    pproc = self.preprocessor
    vocab_label_valid = self.model_api.vocab_label_valid

    # Validation batches are built once; training batches are rebuilt
    # per epoch (make_batches may shuffle / regroup).
    if valid_samples:
        valid_batches = pproc.make_batches(samples=valid_samples,
                                           is_valid_data=True)
        valid_batch_x, valid_batch_y = pproc.split_x_and_y(valid_batches)

    # Baseline score with pre-trained params, so a resumed run does not
    # overwrite a better checkpoint with a worse one.
    if (argv.load_param or argv.load_param_dir) and valid_samples:
        write('\nEpoch: 0 (Using the Pre-trained Params)')
        write('VALID')
        valid_batch_y_pred = self.model_api.predict(valid_batch_x)
        self.best_valid_f1 = self.evaluator.f_score(
            y_true=valid_batch_y,
            y_pred=valid_batch_y_pred,
            vocab_label=vocab_label_valid)

    for epoch in range(init_epoch, argv.epoch):
        write('\nEpoch: %d' % (epoch + 1))
        write('TRAIN')

        # Learning-rate schedule: halve every 25 epochs once past epoch 50.
        if argv.halve_lr and epoch > 49 and (epoch % 25) == 0:
            lr = self.model_api.optimizer.lr.get_value(borrow=True)
            self.model_api.optimizer.lr.set_value(lr * 0.5)
            write('### HALVE LEARNING RATE: %f -> %f' % (lr, lr * 0.5))

        # Training.
        train_batches = pproc.make_batches(train_samples)
        self.model_api.train(train_batches)

        # Validation + best-model tracking.
        if valid_samples:
            write('VALID')
            valid_batch_y_pred = self.model_api.predict(valid_batch_x)
            valid_f1 = self.evaluator.f_score(y_true=valid_batch_y,
                                              y_pred=valid_batch_y_pred,
                                              vocab_label=vocab_label_valid)
            if self.best_valid_f1 < valid_f1:
                self.best_valid_f1 = valid_f1
                self.best_epoch = epoch
                self.f1_history[self.best_epoch + 1] = [self.best_valid_f1]
                if argv.save:
                    # NOTE: the best checkpoint is always saved under
                    # epoch=0, overwriting the previous best.
                    self.model_api.save_params(epoch=0)
                    self.model_api.optimizer.save_params(epoch=0)

        # Per-epoch snapshot, independent of the best-model checkpoint.
        if argv.save_every_epoch:
            self.model_api.save_params(epoch=epoch)
            self.model_api.optimizer.save_params(epoch=epoch)

        show_score_history(self.f1_history)
def train(self):
    """End-to-end training entry point.

    Loads embeddings/corpora, builds vocabularies and samples, sets up the
    model API (single or ensemble), then runs the training epochs.
    """
    write('\nTRAINING START\n')
    argv = self.argv
    loader = self.loader
    pproc = self.preprocessor
    make_output_dir(self.argv)

    # Pre-trained word embeddings (optional).
    if argv.word_emb:
        write('Loading Word Embeddings...')
        word_list, word_emb = load_emb(argv.word_emb)
        vocab_word = pproc.make_vocab_word(word_list)
        write('\t- # Vocabs: %d' % vocab_word.size())
    else:
        vocab_word = word_emb = None

    # ELMo embeddings for train/dev data (optional, loaded from HDF5).
    if self.argv.train_elmo_emb:
        write('Loading ELMo Embeddings...')
        train_elmo_emb = loader.load_hdf5(self.argv.train_elmo_emb)
    else:
        train_elmo_emb = None
    if self.argv.dev_elmo_emb:
        valid_elmo_emb = loader.load_hdf5(self.argv.dev_elmo_emb)
    else:
        valid_elmo_emb = None

    # Corpora.
    write('Loading Corpus...')
    train_corpus = loader.load(path=argv.train_data,
                               data_size=argv.data_size,
                               is_test=False)
    valid_corpus = loader.load(path=argv.dev_data,
                               data_size=argv.data_size,
                               is_test=False)
    write('\t- # Sents: Train:%d Valid:%d' % (len(train_corpus),
                                              len(valid_corpus)))

    # Sentence objects.
    train_sents = pproc.make_sents(train_corpus)
    valid_sents = pproc.make_sents(valid_corpus)

    # Label vocabularies: the valid vocab is derived from the train vocab.
    write('Making Labels...')
    if argv.save or argv.save_every_epoch:
        save_label = True
    else:
        save_label = False
    vocab_label_train = pproc.make_and_save_vocab_label(sents=train_sents,
                                                        vocab_label_init=None,
                                                        save=save_label,
                                                        load=True)
    vocab_label_valid = pproc.make_and_save_vocab_label(
        sents=valid_sents,
        vocab_label_init=vocab_label_train,
        save=False,
        load=False)
    write('\t- # Labels: %d' % vocab_label_train.size())

    # Attach word/elmo/label ids to each sentence.
    train_sents = pproc.set_sent_params(sents=train_sents,
                                        elmo_emb=train_elmo_emb,
                                        vocab_word=vocab_word,
                                        vocab_label=vocab_label_train)
    valid_sents = pproc.set_sent_params(sents=valid_sents,
                                        elmo_emb=valid_elmo_emb,
                                        vocab_word=vocab_word,
                                        vocab_label=vocab_label_valid)

    # Samples.
    write('Making Samples...')
    train_samples = pproc.make_samples(sents=train_sents,
                                       is_valid_data=False)
    valid_samples = pproc.make_samples(sents=valid_sents,
                                       is_valid_data=True)
    write('\t- # Samples: Train:%d Valid:%d' % (len(train_samples),
                                                len(valid_samples)))

    # Model API setup: ensemble (n_experts > 0) or single model.
    if train_elmo_emb is not None:
        use_elmo = True
    else:
        use_elmo = False
    if argv.n_experts > 0:
        is_ensemble = True
    else:
        is_ensemble = False
    self.model_api.n_true_spans = calc_true_spans(train_sents)

    if is_ensemble:
        # Ensemble: load expert params, average them into the MoE head,
        # then compile train (and optionally predict) functions.
        self.model_api.set_ensemble_model(word_emb=word_emb,
                                          use_elmo=use_elmo,
                                          vocab_word=vocab_word,
                                          vocab_label=vocab_label_train,
                                          vocab_label_valid=vocab_label_valid)
        self.model_api.load_experts_params(argv.load_param_dir)
        self.model_api.set_init_ensemble_param()
        self.model_api.set_ensemble_train_func()
        if self.model_api.vocab_label_valid:
            self.model_api.set_ensemble_pred_func()
        init_epoch = 0
    else:
        self.model_api.set_model(word_emb=word_emb,
                                 use_elmo=use_elmo,
                                 vocab_word=vocab_word,
                                 vocab_label=vocab_label_train,
                                 vocab_label_valid=vocab_label_valid)
        # Resume options: latest checkpoint in the output dir, an explicit
        # param file, or a fresh start.
        if argv.load_param_latest:
            if argv.output_dir:
                dir_name = argv.output_dir
            else:
                dir_name = 'output'
            param_fns = get_file_names_in_dir(dir_path=dir_name,
                                              prefix='param')
            opt_param_fns = get_file_names_in_dir(dir_path=dir_name,
                                                  prefix='opt')
            param_fn, latest_epoch = get_latest_param_fn(file_names=param_fns)
            opt_param_fn, _ = get_latest_param_fn(file_names=opt_param_fns)
            self.model_api.argv.load_param = param_fn
            self.model_api.argv.load_opt_param = opt_param_fn
            self.model_api.load_params(param_fn)
            init_epoch = latest_epoch + 1
        elif argv.load_param:
            self.model_api.load_params(argv.load_param)
            init_epoch = 0
        else:
            init_epoch = 0
        self.model_api.set_train_func()
        if self.model_api.vocab_label_valid:
            self.model_api.set_pred_func()

    # Run training epochs.
    self._run_epochs(train_samples, valid_samples, init_epoch)
def predict(self):
    """Predict labels for the test data and save them in props and json formats."""
    argv = self.argv
    pproc = self.preprocessor
    loader = self.loader

    # Test corpus.
    write('Loading Dataset...')
    test_corpus = loader.load(path=argv.test_data,
                              data_size=argv.data_size,
                              is_test=True)
    test_sents = pproc.make_sents(test_corpus)

    # Pre-trained word embeddings (optional).
    if argv.word_emb:
        write('Loading Embeddings...')
        word_list, word_emb = load_emb(argv.word_emb)
        vocab_word = pproc.make_vocab_word(word_list)
        write('\t- # Embedding Words: %d' % vocab_word.size())
    else:
        vocab_word = word_emb = None

    # ELMo embeddings for the test data (optional).
    if argv.test_elmo_emb:
        write('Loading ELMo Embeddings...')
        test_elmo_emb = loader.load_hdf5(argv.test_elmo_emb)
    else:
        test_elmo_emb = None

    # Label vocabulary is restored from file so it matches training.
    label_key_value = loader.load_key_value_format(argv.load_label)
    vocab_label = make_vocab_from_ids(label_key_value)
    write('\t- # Labels: %d' % vocab_label.size())

    # Attach word/elmo ids to each sentence (no gold labels at test time).
    test_sents = pproc.set_sent_params(sents=test_sents,
                                       elmo_emb=test_elmo_emb,
                                       vocab_word=vocab_word,
                                       vocab_label=None)

    # One batch per sentence for prediction.
    write('Making Test Samples...')
    test_batches = pproc.make_batch_per_sent(sents=test_sents)
    write('\t- # Test Samples: %d' % len(test_batches))

    # Model API: ensemble (MoE head + experts) or single model.
    use_elmo = True if test_elmo_emb is not None else False
    if argv.n_experts > 0:
        self.model_api.set_ensemble_model(word_emb=word_emb,
                                          use_elmo=use_elmo,
                                          vocab_word=vocab_word,
                                          vocab_label=vocab_label,
                                          vocab_label_valid=None)
        self.model_api.load_params(argv.load_param)
        self.model_api.load_experts_params(argv.load_param_dir)
        self.model_api.set_ensemble_pred_func()
    else:
        self.model_api.set_model(word_emb=word_emb,
                                 use_elmo=use_elmo,
                                 vocab_word=vocab_word,
                                 vocab_label=vocab_label,
                                 vocab_label_valid=None)
        self.model_api.load_params(argv.load_param)
        self.model_api.set_pred_func()

    # Predict and persist results.
    write('\nPREDICTION START')
    test_y_pred = self.model_api.predict(test_batches)
    self.saver.save_props(corpus=test_sents,
                          labels=test_y_pred,
                          vocab_label=vocab_label)
    self.saver.save_json_format(corpus=test_sents,
                                labels=test_y_pred,
                                vocab_label=vocab_label)
def validate(self):
    """Evaluate every saved param file on the dev set and pickle the best one."""
    write('\nVALIDATING START\n')
    argv = self.argv
    loader = self.loader
    pproc = self.preprocessor
    make_output_dir(self.argv)

    # Dev corpus.
    write('Loading Dataset...')
    valid_corpus = loader.load(path=argv.dev_data,
                               data_size=argv.data_size,
                               is_test=False)
    valid_sents = pproc.make_sents(valid_corpus)

    # Pre-trained word embeddings (optional).
    if argv.word_emb:
        write('Loading Embeddings...')
        # NOTE(review): the guard checks argv.word_emb but loads
        # argv.init_emb, unlike train()/predict() which load
        # argv.word_emb — confirm this asymmetry is intended.
        word_list_emb, word_emb = load_emb(argv.init_emb)
        vocab_word = pproc.make_vocab_word(word_list_emb)
        write('\t- # Embedding Words: %d' % vocab_word.size())
    else:
        vocab_word = word_emb = None

    # ELMo embeddings for the dev data (optional).
    if argv.dev_elmo_emb:
        write('Loading ELMo Embeddings...')
        valid_elmo_emb = loader.load_hdf5(argv.dev_elmo_emb)
    else:
        valid_elmo_emb = None

    # Label vocabularies: train vocab from file, valid vocab derived from it.
    write('Loading Labels...')
    label_key_value = loader.load_key_value_format(argv.load_label)
    vocab_label_train = make_vocab_from_ids(label_key_value)
    vocab_label_valid = pproc.make_and_save_vocab_label(
        sents=valid_sents,
        vocab_label_init=vocab_label_train,
        save=False,
        load=False)
    write('\t- # Labels: %d' % vocab_label_train.size())

    # Attach word/elmo/label ids to each sentence.
    valid_sents = pproc.set_sent_params(sents=valid_sents,
                                        elmo_emb=valid_elmo_emb,
                                        vocab_word=vocab_word,
                                        vocab_label=vocab_label_valid)

    # Samples and batches; split off the gold labels for evaluation.
    write('Making Valid Samples...')
    valid_samples = pproc.make_samples(sents=valid_sents)
    valid_batches = pproc.make_batches(samples=valid_samples,
                                       is_valid_data=True)
    valid_batch_x, y_true = pproc.split_x_and_y(valid_batches)
    write('\t- # Samples: %d' % len(valid_samples))

    # Model API (single model only here).
    if valid_elmo_emb is not None:
        use_elmo = True
    else:
        use_elmo = False
    self.model_api.set_model(word_emb=word_emb,
                             use_elmo=use_elmo,
                             vocab_word=vocab_word,
                             vocab_label=vocab_label_train,
                             vocab_label_valid=vocab_label_valid)
    self.model_api.set_pred_func()

    # Score every saved param file and keep the best by dev F1.
    write('\nVALIDATION START')
    dir_name = argv.load_param_dir
    param_files = [fn for fn in get_file_names_in_dir(dir_name)
                   if fn.split('/')[-1].startswith('param')]
    write('\t- # Param Files: %d' % len(param_files))

    best_file = None
    best_param = None
    best_f1 = -1.0
    for param_file in param_files:
        write('\nFile Name: %s' % param_file)
        self.model_api.load_params(param_file)
        y_pred = self.model_api.predict(valid_batch_x)
        valid_f1 = self.evaluator.f_score(y_true=y_true,
                                          y_pred=y_pred,
                                          vocab_label=vocab_label_valid)
        if best_f1 < valid_f1:
            best_f1 = valid_f1
            best_file = param_file
            # Snapshot the winning parameter values.
            best_param = [p.get_value(borrow=True)
                          for p in self.model_api.model.params]

    write('Best Param=%s F1=%f' % (best_file, best_f1))

    # Persist the best parameters next to the candidates.
    fn = argv.load_param_dir
    if argv.output_fn:
        fn += '/param.%s' % argv.output_fn
    else:
        fn += '/param.%s.best' % argv.method
    save_pickle(fn=fn, data=best_param)