def batchify_training_data(self, xy_ids_and_targets, batch_size, is_dev_data):
    logger.debug('Batchifying data')

    # sort according to the src lengths (longest first) -> (x, y, form) triples
    sorted_data = sorted(zip(*xy_ids_and_targets),
                         key=lambda p: len(p[0]),
                         reverse=True)
    data_size = len(sorted_data)
    # ceiling division: the last, smaller batch is included without
    # emitting an empty extra batch when data_size divides evenly
    num_batches = (data_size + batch_size - 1) // batch_size
    data_indices = self.index_data(data_size, mode='no_shuffling')

    batch_pairs = []
    batch_forms_all = []
    for bi in range(num_batches):
        batch_data_tuple, batch_forms = self.make_one_batch(
            sorted_data, data_indices, bi, batch_size)
        batch_pairs.append(batch_data_tuple)
        batch_forms_all.extend(batch_forms)

    if is_dev_data:
        self.dev_references = batch_forms_all
        logger.info('Saving dev (training) references to --> %s',
                    self.fnames.dev_ref_fn)
        save_txt(self.dev_references, self.fnames.dev_ref_fn)

    assert data_size == len(batch_forms_all)
    return batch_pairs
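# The loop above delegates batch construction to `self.make_one_batch`, whose
# implementation is not shown here. The following is a minimal standalone
# sketch of its assumed contract: slice one batch of (x_ids, y_ids, form)
# triples, pad the id sequences to the batch maximum, and return
# ((x_tensor, y_tensor), forms). PAD_ID, the triple layout, and the padding
# scheme are assumptions, not the repo's verified code.
import torch

PAD_ID = 0  # assumed padding id


def make_one_batch_sketch(sorted_data, data_indices, bi, batch_size):
    rows = [sorted_data[i]
            for i in data_indices[bi * batch_size:(bi + 1) * batch_size]]
    xs, ys, forms = zip(*rows)  # assumes a non-empty batch
    max_x = max(len(x) for x in xs)
    max_y = max(len(y) for y in ys)
    # right-pad every sequence to the longest one in the batch
    x_padded = [list(x) + [PAD_ID] * (max_x - len(x)) for x in xs]
    y_padded = [list(y) + [PAD_ID] * (max_y - len(y)) for y in ys]
    return (torch.LongTensor(x_padded), torch.LongTensor(y_padded)), list(forms)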
def save_predictions(self, predictions, fname):
    assert isinstance(predictions[0], str), \
        'Predictions are not strings -- consider re-implementing the method!'
    logger.debug('Saving Morph predictions to --> %s', fname)
    save_txt(predictions, fname)
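# `save_txt` is called throughout with the signature (itemlist, fname) -- see
# the keyword call in `training_start` below. A minimal sketch of the assumed
# helper (one string per line, UTF-8); not the repo's verified implementation:
def save_txt(itemlist, fname):
    with open(fname, 'w', encoding='utf-8') as out:
        out.write('\n'.join(itemlist))
        out.write('\n')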
def batchify_training_data(self, xy_ids, batch_size, is_dev_data):
    logger.debug('Batchifying data')

    x_data_ids, y_data_ids, forms = xy_ids
    data_size = len(x_data_ids)
    # ceiling division: include the smaller final batch without
    # generating an empty one when data_size divides evenly
    num_batches = (data_size + batch_size - 1) // batch_size
    data_indices = self.index_data(data_size, mode='no_shuffling')

    batch_pairs = []
    for bi in range(num_batches):
        batch_x = []
        batch_y = []
        curr_batch_indices = data_indices[bi * batch_size:(bi + 1) * batch_size]
        for idx in curr_batch_indices:
            # deep-copy the id sequences so downstream in-place
            # modifications cannot corrupt the source data
            batch_x.append(copy.deepcopy(x_data_ids[idx]))
            batch_y.append(copy.deepcopy(y_data_ids[idx]))

        # LongTensor construction assumes the id sequences within
        # a batch are already padded to equal length
        batch_enc_x_var = cuda_if_gpu(Variable(torch.LongTensor(batch_x)))
        batch_dec_y_var = cuda_if_gpu(Variable(torch.LongTensor(batch_y)))
        batch_pairs.append((batch_enc_x_var, batch_dec_y_var))

    if is_dev_data:
        self.dev_references = forms
        logger.info('Saving dev (training) references to --> %s',
                    self.fnames.dev_ref_fn)
        save_txt(self.dev_references, self.fnames.dev_ref_fn)

    assert data_size == len(forms)
    return batch_pairs
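# `Variable` and `cuda_if_gpu` reflect the pre-0.4 PyTorch idiom used in this
# repo. A minimal sketch of the assumed `cuda_if_gpu` helper: move a tensor to
# the GPU when CUDA is available, otherwise return it unchanged. On
# PyTorch >= 0.4 the idiomatic replacement would be `tensor.to(device)`.
import torch

USE_CUDA = torch.cuda.is_available()  # assumed module-level flag


def cuda_if_gpu(t):
    return t.cuda() if USE_CUDA else t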
def save_predictions(self, depgraphs, fname):
    # decide once whether predicted forms are available,
    # using the first graph as a representative sample
    sample_dg = depgraphs[0]
    if 'PRED_FORM' not in sample_dg.node['1']:
        logger.debug('*** USING ORACLE FORMS ***')
        get_form = SynAlgo.get_node_gold_form
    else:
        logger.debug('*** USING PREDICTED FORMS ***')
        get_form = SynAlgo.get_node_pred_form

    predicted_snts = []
    for dg in depgraphs:
        snt_tokens = [get_form(dg, node_id)
                      for node_id in dg.graph['node_order']]
        predicted_snts.append(' '.join(snt_tokens))

    logger.debug('Saving Syn predictions to --> %s', fname)
    save_txt(predicted_snts, fname)
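# `SynAlgo.get_node_gold_form` / `get_node_pred_form` are assumed to read a
# surface form off a CoNLL-style node dict, matching the 'PRED_FORM' check
# above. A minimal sketch under that assumption (the 'FORM' field name follows
# CoNLL conventions; not the repo's verified code):
class SynAlgoSketch:
    @staticmethod
    def get_node_gold_form(dg, node_id):
        return dg.node[node_id]['FORM']

    @staticmethod
    def get_node_pred_form(dg, node_id):
        return dg.node[node_id]['PRED_FORM']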
def training_start(self, model, data, evaluator, nlgen):
    logger.debug('Preparing training data')
    dev_data_fname = data.fnames.dev_fn
    if not os.path.exists(dev_data_fname):
        logger.error('File %s does not exist', dev_data_fname)
        raise FileNotFoundError(dev_data_fname)

    # dev data for evaluation
    dev_data_ref_fname = data.fnames.dev_ref_fn
    dev_data_raw = read_conll_data_file(data.fnames.dev_fn)
    logger.info('Saving Syn reference --> %s', data.fnames.dev_ref_fn)
    save_txt(itemlist=conll2snt(dev_data_raw), fname=dev_data_ref_fname)

    train_batches = data.batchify_vectorized_data(
        data.train, self.batch_size)  # [(np_x, np_y_1hot), ...]
    dev_batches = data.batchify_vectorized_data(data.dev, self.batch_size)

    # need to move the model before setting the optimizer
    # see: http://pytorch.org/docs/stable/optim.html
    if self.use_cuda:
        model.cuda()

    self.set_optimizer(model, self.config['optimizer'])
    self.set_train_criterion(len(data.vocab.id2tok), PAD_ID)

    training_start_time = time.time()
    logger.info('Start training')
    best_score = 0
    best_model_fn = None
    best_weights = None

    for epoch_idx in range(1, self.n_epochs + 1):
        epoch_start = time.time()
        logger.info('Epoch %d/%d', epoch_idx, self.n_epochs)

        # compute loss on train and dev data
        train_loss = self.train_epoch(epoch_idx, model, train_batches)
        dev_loss = self.compute_val_loss(model, dev_batches)
        evaluator.record_loss(train_loss, dev_loss)

        # run on dev data in prediction mode (no oracle decoding)
        predictions_fname = self.get_predictions_fname(epoch_idx)
        depgraphs = nlgen.predict_from_raw_data(model, dev_data_raw, data.vocab)
        nlgen.save_predictions(depgraphs, predictions_fname)

        # evaluate using external metrics
        scores = evaluator.external_metric_eval(ref_fn=dev_data_ref_fname,
                                                pred_fn=predictions_fname)
        avg_score = (scores.bleu + scores.edist) / 2
        model_fn = os.path.join(
            self.model_dir,
            'weights.epoch%d_%0.3f_%0.3f' % (epoch_idx, scores.bleu, scores.edist))
        if avg_score > best_score:
            best_score = avg_score
            best_model_fn = model_fn
            # snapshot the weights: state_dict() returns live references,
            # which later epochs would otherwise overwrite
            best_weights = copy.deepcopy(model.state_dict())

        logger.debug('Time = %s', asMinutes(time.time() - epoch_start))

    logger.info('Total training time = %s',
                asMinutes(time.time() - training_start_time))

    self.best_model_fn = best_model_fn
    logger.debug('Saving model to --> %s', best_model_fn)
    torch.save(best_weights, best_model_fn)

    score_fname = os.path.join(self.model_dir, 'scores.csv')
    scores = evaluator.get_scores_to_save()
    evaluator.save_scores(scores, self.score_file_header, score_fname)
    evaluator.plot_lcurve(fname=os.path.join(self.model_dir, 'lcurve.pdf'),
                          title=self.model_type)
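# `set_train_criterion(vocab_size, pad_id)` is assumed to build a loss that
# ignores padding positions; the vocab-size argument suggests a per-class
# weight vector with the PAD weight zeroed out. A minimal sketch under those
# assumptions (NLLLoss pairs with a log-softmax output layer; the weight
# scheme and helper name are assumptions, not the repo's verified code):
import torch
import torch.nn as nn


def make_train_criterion(vocab_size, pad_id):
    weight = torch.ones(vocab_size)
    weight[pad_id] = 0.0  # PAD positions contribute nothing to the loss
    return nn.NLLLoss(weight=weight)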
def save_dev_references(self, fname):
    logger.debug('Saving Morph references to --> %s', fname)
    save_txt(self.targets, fname)