def add_parameters(self, dropout, lstm_size, optimizer, model_type, gru=True):
    if model_type == "gru":
        self.encoder_rnn = dy.GRUBuilder(NUM_LAYERS, EMBEDDING_SIZE, lstm_size, self.model)
        self.encoder_rnn.set_dropout(dropout)
        self.encoder_rnn2 = dy.GRUBuilder(NUM_LAYERS, EMBEDDING_SIZE, lstm_size, self.model)
        self.encoder_rnn2.set_dropout(dropout)
        self.decoder_rnn = dy.GRUBuilder(NUM_LAYERS, EMBEDDING_SIZE + lstm_size, lstm_size, self.model)
        self.decoder_rnn.set_dropout(dropout)
    else:
        self.encoder_rnn = dy.LSTMBuilder(NUM_LAYERS, EMBEDDING_SIZE, lstm_size, self.model)
        self.encoder_rnn.set_dropout(dropout)
        self.encoder_rnn2 = dy.LSTMBuilder(NUM_LAYERS, EMBEDDING_SIZE, lstm_size, self.model)
        self.encoder_rnn2.set_dropout(dropout)
        self.decoder_rnn = dy.LSTMBuilder(NUM_LAYERS, EMBEDDING_SIZE + lstm_size, lstm_size, self.model)
        self.decoder_rnn.set_dropout(dropout)

    global DROPOUT
    DROPOUT = dropout

    self.W1 = self.model.add_parameters((200, lstm_size))
    self.b1 = self.model.add_parameters((200, 1))
    self.W2 = self.model.add_parameters((100, 200))
    self.b2 = self.model.add_parameters((100, 1))
    self.W3 = self.model.add_parameters((len(self.C2I), 100))
    self.b3 = self.model.add_parameters((len(self.C2I), 1))
    self.W_query = self.model.add_parameters((lstm_size, lstm_size))
    self.W_key = self.model.add_parameters((lstm_size, lstm_size))
    self.W_val = self.model.add_parameters((lstm_size, lstm_size))
    self.W_att = self.model.add_parameters((1, EMBEDDING_SIZE))
    self.W_c_s = self.model.add_parameters((lstm_size, EMBEDDING_SIZE))
    self.W_direct = self.model.add_parameters((len(self.C2I), lstm_size))
    self.b_att = self.model.add_parameters((lstm_size, 1))
    self.b_direct = self.model.add_parameters((len(self.C2I), 1))
    self.E_lang = self.model.add_lookup_parameters((7, EMBEDDING_SIZE))

    if optimizer == "sgd":
        self.trainer = dy.SimpleSGDTrainer(self.model)
    elif optimizer == "rms":
        self.trainer = dy.RMSPropTrainer(self.model)
    elif optimizer == "cyclic":  # was a bare `if`, which let the trailing `else` overwrite the sgd/rms choice
        self.trainer = dy.CyclicalSGDTrainer(self.model)
    elif optimizer == "adam":
        self.trainer = dy.AdamTrainer(self.model)
    else:
        self.trainer = dy.AdagradTrainer(self.model)
def __init__(self, e0: numbers.Real = 0.1, eps: numbers.Real = 1e-20, skip_noisy: bool = False) -> None:
    super().__init__(optimizer=dy.AdagradTrainer(ParamManager.global_collection(), e0, eps=eps),
                     skip_noisy=skip_noisy)
def __init__(self, Cemb, character_idx_map, options):
    model = dy.Model()
    self.trainer = dy.AdagradTrainer(model, options['lr'])  # we use Adagrad
    self.params = self.initParams(model, Cemb, options)
    self.options = options
    self.model = model
    self.character_idx_map = character_idx_map
def __init__(self, model, num_input, num_hidden, num_out=2):
    self.model = model
    HIDDEN_DIM = 100
    MLP_DIM = 100
    self.trainer = dy.AdagradTrainer(model, 0.01)
    self.W1 = model.add_parameter((num_out, HIDDEN_DIM))
    self.W2 = model.add_parameter((MLP_DIM, num_hidden * 2))
    self.pT = model.add_lookup_parameter((num_out, MLP_DIM))
    self.activation_func = dy.tanh
    self.spec = (num_input, num_hidden, num_out, self.activation_func)
def __init__(self, params, vocab, label2tag, pretrained_embeddings=None):
    """
    :param params:
    :param vocab:
    :param label2tag:
    :param pretrained_embeddings:
    """
    self.dim_w = params.dim_w
    self.win = params.win
    self.vocab = vocab
    self.n_words = len(self.vocab)
    self.dim_asp = params.dim_asp
    self.dim_y_asp = params.n_asp_tags
    self.n_steps = params.n_steps
    self.asp_label2tag = label2tag
    self.dropout_asp = params.dropout_asp
    self.dropout = params.dropout
    self.ds_name = params.ds_name
    self.model_name = params.model_name
    self.attention_type = params.attention_type

    self.pc = dy.ParameterCollection()
    self.Emb = WDEmb(pc=self.pc, n_words=self.n_words, dim_w=self.dim_w,
                     pretrained_embeddings=pretrained_embeddings)
    self.DEP_RecNN = DTreeBuilder(pc=self.pc, n_in=self.win * self.dim_w, n_out=self.dim_asp,
                                  dropout_rate=self.dropout_asp)
    self.ASP_RNN = dy.LSTMBuilder(1, self.win * self.dim_w, self.dim_asp, self.pc)
    self.BiAttention_F = BiAttention(pc=self.pc, n_in=self.dim_asp, n_out=self.dim_asp,
                                     dropout_rate=self.dropout_asp)
    self.BiAttention_B = BiAttention(pc=self.pc, n_in=self.dim_asp, n_out=self.dim_asp,
                                     dropout_rate=self.dropout_asp)
    self.BiAttention_T = BiAttention(pc=self.pc, n_in=self.dim_asp, n_out=self.dim_asp,
                                     dropout_rate=self.dropout_asp)
    self.MultiWeightLayer = MultiWeightLayer(pc=self.pc, n_in=self.dim_asp, n_out=self.dim_asp,
                                             dropout_rate=self.dropout_asp)
    self.ASP_FC = Linear(pc=self.pc, n_in=self.dim_asp, n_out=self.dim_y_asp)
    self.layers = [self.ASP_FC, self.DEP_RecNN, self.BiAttention_F, self.BiAttention_B,
                   self.BiAttention_T, self.MultiWeightLayer]

    if params.optimizer == 'sgd':
        self.optimizer = dy.SimpleSGDTrainer(self.pc, params.sgd_lr)
    elif params.optimizer == 'momentum':
        self.optimizer = dy.MomentumSGDTrainer(self.pc, 0.01, 0.9)
    elif params.optimizer == 'adam':
        self.optimizer = dy.AdamTrainer(self.pc, 0.001, 0.9, 0.9)
    elif params.optimizer == 'adagrad':
        self.optimizer = dy.AdagradTrainer(self.pc)
    elif params.optimizer == 'adadelta':
        self.optimizer = dy.AdadeltaTrainer(self.pc)
    else:
        raise Exception("Invalid optimizer!!")
def __init__(self, model, num_layers, num_input, num_hidden, num_out):
    self.model = model
    HIDDEN_DIM = 100
    MLP_DIM = 100
    self.trainer = dy.AdagradTrainer(model, 0.01)
    self.pH = model.add_parameter((num_out, HIDDEN_DIM))
    self.pO = model.add_parameter((MLP_DIM, num_hidden * 2))
    self.pT = model.add_lookup_parameter((num_out, MLP_DIM))
    self.builders = [
        dy.LSTMBuilder(num_layers, num_input, num_hidden, model),
        dy.LSTMBuilder(num_layers, num_input, num_hidden, model)
    ]
    self.activation_func = dy.tanh
    self.spec = (num_input, num_hidden, num_out, self.activation_func)
def __init__(self, embedding_size, hidden_size, labels_size, embedding):
    self.embedding = embedding
    self.model = dy.Model()
    self.trainer = dy.AdagradTrainer(self.model, 0.05)
    self.linear = self.model.add_parameters((embedding_size, hidden_size))
    self.feed_F = FeedForward(self.model, (hidden_size, hidden_size), (hidden_size, hidden_size), 0.2)
    self.feed_G = FeedForward(self.model, (hidden_size, 2 * hidden_size), (hidden_size, hidden_size), 0.2)
    self.h_step_1 = self.model.add_parameters((2 * hidden_size, hidden_size))
    self.h_step_2 = self.model.add_parameters((hidden_size, hidden_size))
    self.linear2 = self.model.add_parameters((hidden_size, labels_size))
def __init__(self, exp_global=Ref(Path("exp_global")), e0=0.1, eps=1e-20):
    self.optimizer = dy.AdagradTrainer(exp_global.dynet_param_collection.param_col, e0, eps=eps)
                                       ds.dev.matrices])
    te_graphs = ds.dev.matrices if opts.eval_dev \
        else ds.test.matrices
    if opts.model is not None:
        # load and skip training (eval mode)
        timeprint('loading association model from file: {}'.format(opts.model))
        assoc_model = AssociationModel(tr_graphs, embs, opts.assoc_mode, model_path=opts.model)
    else:
        # training phase
        assoc_model = AssociationModel(tr_graphs, embs, opts.assoc_mode, opts.dropout)
        trainer = dy.AdagradTrainer(assoc_model.model, opts.learning_rate)
        with open('assoc-pred-train-log-{}_{}.txt'.format(start_time.date(), start_time.time()), 'a') as log_file:
            if opts.no_log:
                log_file = None
            else:
                log_file.write('====\n')
            iteration_losses = []  # will hold loss averages
            dev_mrrs = []
            saved_name = None
            N = assoc_model.vocab_size
            for ep in range(opts.epochs):
                # report
                if opts.v > 0:
def train_model(model, encoder, decoder, params, train_inputs, train_outputs, dev_inputs, dev_outputs,
                y2int, int2y, epochs, optimization, results_file_path, plot, batch_size, eval_after):
    print 'training...'

    np.random.seed(17)
    random.seed(17)

    # sort training sentences by length in descending order
    train_data = zip(train_inputs, train_outputs)
    train_data.sort(key=lambda t: -len(t[0]))
    train_order = [x * batch_size for x in range(len(train_data) / batch_size + 1)]

    # sort dev sentences by length in descending order
    dev_batch_size = 1
    dev_data = zip(dev_inputs, dev_outputs)
    dev_data.sort(key=lambda t: -len(t[0]))
    dev_order = [x * dev_batch_size for x in range(len(dev_data) / dev_batch_size + 1)]

    if optimization == 'ADAM':
        trainer = dn.AdamTrainer(model)  # lam=REGULARIZATION, alpha=LEARNING_RATE, beta_1=0.9, beta_2=0.999, eps=1e-8)
    elif optimization == 'MOMENTUM':
        trainer = dn.MomentumSGDTrainer(model)
    elif optimization == 'SGD':
        trainer = dn.SimpleSGDTrainer(model)
    elif optimization == 'ADAGRAD':
        trainer = dn.AdagradTrainer(model)
    elif optimization == 'ADADELTA':
        trainer = dn.AdadeltaTrainer(model)
    else:
        trainer = dn.SimpleSGDTrainer(model)

    trainer.set_clip_threshold(float(arguments['--grad-clip']))
    seen_examples_count = 0
    total_loss = 0
    best_dev_epoch = 0
    best_train_epoch = 0
    patience = 0
    train_len = len(train_outputs)
    dev_len = len(dev_inputs)
    avg_train_loss = -1
    train_loss_patience = 0
    train_loss_patience_threshold = 99999999
    max_patience = int(arguments['--max-patience'])
    log_path = results_file_path + '_log.txt'
    start_epoch, checkpoints_x, train_loss_y, dev_loss_y, dev_accuracy_y = read_from_log(log_path)

    if len(train_loss_y) > 0:
        total_batches = checkpoints_x[-1]
        best_avg_train_loss = max(train_loss_y)
        best_dev_accuracy = max(dev_accuracy_y)
        best_dev_loss = max(dev_loss_y)
    else:
        total_batches = 0
        best_avg_train_loss = 999999
        best_dev_loss = 999999
        best_dev_accuracy = 0

    # progress bar init
    # noinspection PyArgumentList
    widgets = [progressbar.Bar('>'), ' ', progressbar.ETA()]
    train_progress_bar = progressbar.ProgressBar(widgets=widgets, maxval=epochs).start()

    for e in xrange(start_epoch, epochs):

        # shuffle the batch start indices in each epoch
        random.shuffle(train_order)
        batches_per_epoch = len(train_order)
        start = time.time()

        # go through batches
        for i, batch_start_index in enumerate(train_order, start=1):
            total_batches += 1

            # get batch examples
            batch_inputs = [x[0] for x in train_data[batch_start_index:batch_start_index + batch_size]]
            batch_outputs = [x[1] for x in train_data[batch_start_index:batch_start_index + batch_size]]
            actual_batch_size = len(batch_inputs)

            # skip empty batches
            if actual_batch_size == 0 or len(batch_inputs[0]) == 0:
                continue

            # compute batch loss
            loss = compute_batch_loss(encoder, decoder, batch_inputs, batch_outputs, y2int)

            # forward pass
            total_loss += loss.scalar_value()
            loss.backward()

            # update parameters
            trainer.update()

            seen_examples_count += actual_batch_size

            # avg loss per sample
            avg_train_loss = total_loss / float(i * batch_size + e * train_len)

            # start patience counts only after 20 batches
            if avg_train_loss < best_avg_train_loss and total_batches > 20:
                best_avg_train_loss = avg_train_loss
                train_loss_patience = 0
            else:
                train_loss_patience += 1
                if train_loss_patience > train_loss_patience_threshold:
                    print 'train loss patience exceeded: {}'.format(train_loss_patience)
                    return model, params, e, best_train_epoch

            if total_batches % 100 == 0 and total_batches > 0:
                print 'epoch {}: {} batches out of {} ({} examples out of {}) total: {} batches, {} examples. avg \
loss per example: {}'.format(e, i, batches_per_epoch, i * batch_size, train_len, total_batches,
                             total_batches * batch_size, avg_train_loss)

                # print sentences per second
                end = time.time()
                elapsed_seconds = end - start
                print '{} sentences per second'.format(seen_examples_count / elapsed_seconds)
                seen_examples_count = 0
                start = time.time()

            # checkpoint
            if total_batches % eval_after == 0:
                print 'starting checkpoint evaluation'
                dev_bleu, dev_loss = checkpoint_eval(encoder, decoder, params, dev_batch_size, dev_data,
                                                     dev_inputs, dev_len, dev_order, dev_outputs, int2y, y2int,
                                                     results_file_path=results_file_path)

                log_to_file(log_path, e, total_batches, avg_train_loss, dev_loss, dev_bleu)
                save_model(model, results_file_path, total_batches,
                           models_to_save=int(arguments['--models-to-save']))

                if dev_bleu >= best_dev_accuracy:
                    best_dev_accuracy = dev_bleu
                    best_dev_epoch = e

                    # save best model to disk
                    save_best_model(model, results_file_path)
                    print 'saved new best model'
                    patience = 0
                else:
                    patience += 1

                if dev_loss < best_dev_loss:
                    best_dev_loss = dev_loss

                print 'epoch: {0} train loss: {1:.4f} dev loss: {2:.4f} dev bleu: {3:.4f} \
best dev bleu {4:.4f} (epoch {5}) patience = {6}'.format(e, avg_train_loss, dev_loss, dev_bleu,
                                                          best_dev_accuracy, best_dev_epoch, patience)

                if patience == max_patience:
                    print 'out of patience after {0} checkpoints'.format(str(e))
                    train_progress_bar.finish()
                    if plot:
                        plt.cla()
                    print 'checkpoint patience exceeded'
                    return model, params, e, best_train_epoch

                # plotting results from checkpoint evaluation
                if plot:
                    train_loss_y.append(avg_train_loss)
                    checkpoints_x.append(total_batches)
                    dev_accuracy_y.append(dev_bleu)
                    dev_loss_y.append(dev_loss)

                    y_vals = [('train_loss', train_loss_y), ('dev loss', dev_loss_y), ('dev_bleu', dev_accuracy_y)]
                    common.plot_to_file(y_vals, x_name='total batches', x_vals=checkpoints_x,
                                        file_path=results_file_path + '_learning_curve.png')

        # update progress bar after completing epoch
        train_progress_bar.update(e)

    # update progress bar after completing training
    train_progress_bar.finish()
    if plot:
        # clear plot when done
        plt.cla()

    print 'finished training. average loss: {} best epoch on dev: {} best epoch on train: {}'.format(
        str(avg_train_loss), best_dev_epoch, best_train_epoch)

    return model, params, e, best_train_epoch
BEGIN_TOKEN = '<s>'
END_TOKEN = '<e>'

# define model and obtain vocabulary
# (reload vocab files if saved model or create new vocab files if new model)
model = dynet.Model()

if not args.trainer or args.trainer == "simple_sgd":
    trainer = dynet.SimpleSGDTrainer(model)
elif args.trainer == "momentum_sgd":
    trainer = dynet.MomentumSGDTrainer(model)
elif args.trainer == "adadelta":
    trainer = dynet.AdadeltaTrainer(model)
elif args.trainer == "adagrad":
    trainer = dynet.AdagradTrainer(model)
elif args.trainer == "adam":
    trainer = dynet.AdamTrainer(model)
else:
    raise Exception("Trainer not recognized! Please use one of {simple_sgd, momentum_sgd, adadelta, adagrad, adam}")
trainer.set_clip_threshold(-1.0)
trainer.set_sparse_updates(True)

# load corpus
print "Loading corpus..."
train_data = list(util.get_reader(args.reader_mode)(args.train, mode=args.reader_mode,
                                                    begin=BEGIN_TOKEN, end=END_TOKEN))
if args.valid:
    valid_data = list(util.get_reader(args.reader_mode)(args.valid, mode=args.reader_mode,
                                                        begin=BEGIN_TOKEN, end=END_TOKEN))
else:
def main():
    parser = argparse.ArgumentParser(description='Train attention model')
    parser.add_argument('--model_path', default=None, type=str)
    parser.add_argument('--checkpoint_dir', default='./checkpoints', type=str)
    parser.add_argument('--train_set', default='./train_set', type=str)
    parser.add_argument('--train_set_dmp', default='./train_set.dmp', type=str)
    parser.add_argument('--valid_set', default='./valid_set', type=str)
    parser.add_argument('--valid_set_dmp', default='./valid_set_dmp', type=str)
    parser.add_argument('--vocab_path', default='./vocab.dmp', type=str)
    parser.add_argument('--unk_threshold', default=20, type=int)
    parser.add_argument('--batch_size', default=8, type=int)
    parser.add_argument('--trainer', default='adam', choices={'sgd', 'adam', 'adagrad'}, type=str)
    parser.add_argument('--type_embed_dim', default=128, type=int)
    parser.add_argument('--literal_embed_dim', default=128, type=int)
    parser.add_argument('--byte_embed_dim', default=64, type=int)
    parser.add_argument('--hash_dim', default=64, type=int)
    parser.add_argument('--att_dim', default=64, type=int)
    parser.add_argument('--num_layers', default=2, type=int)
    parser.add_argument('--hidden_dim', default=256, type=int)
    parser.add_argument('--dropout', default=None, type=float)
    parser.add_argument('--seed', default=11927, type=int)
    args, _ = parser.parse_known_args()

    if not os.path.exists(args.train_set_dmp):
        train_set = []
        for path in glob.glob('%s/*.py' % args.train_set):
            with codecs.open(path, 'r', 'utf-8') as f:
                train_set.append(tokenize_without_empty_tail(f.read()))
        with open(args.train_set_dmp, 'wb') as f:
            pickle.dump(train_set, f)
    else:
        with open(args.train_set_dmp, 'rb') as f:
            train_set = pickle.load(f)
    train_set = [tokens for tokens in train_set if len(tokens) < 4000]
    print('size of train_set:', len(train_set))

    token_literal_counters = defaultdict(lambda: defaultdict(int))
    for token_type, token_literal in chain(*map(set, train_set)):
        token_literal_counters[token_type][token_literal] += 1

    if not os.path.exists(args.vocab_path):
        type_vocabs = {
            token_type: {literal for literal, count in literal_counters.items() if count > args.unk_threshold}
            for token_type, literal_counters in token_literal_counters.items()
        }
        for token_type in tok_name:
            if token_type not in type_vocabs:
                type_vocabs[token_type] = set()
        with open(args.vocab_path, 'wb') as f:
            pickle.dump(type_vocabs, f)
    else:
        with open(args.vocab_path, 'rb') as f:
            type_vocabs = pickle.load(f)
    print('vocab_types:', {tok_name[token_type]: len(type_vocab)
                           for token_type, type_vocab in type_vocabs.items() if len(type_vocab) > 2})

    copyable_types = {STRING, NAME, NUMBER}
    print('copyable_types:', {tok_name[token_type] for token_type in copyable_types})

    if not os.path.exists(args.valid_set_dmp):
        valid_set = []
        for path in glob.glob('%s/*.py' % args.valid_set):
            with codecs.open(path, 'r', 'utf-8') as f:
                valid_set.append(tokenize_without_empty_tail(f.read()))
        with open(args.valid_set_dmp, 'wb') as f:
            pickle.dump(valid_set, f)
    else:
        with open(args.valid_set_dmp, 'rb') as f:
            valid_set = pickle.load(f)
    print('size of valid_set:', len(valid_set))

    random.seed(args.seed)
    model = dy.ParameterCollection()
    if args.trainer == 'sgd':
        trainer = dy.SimpleSGDTrainer(model)
    elif args.trainer == 'adam':
        trainer = dy.AdamTrainer(model)
    elif args.trainer == 'adagrad':
        trainer = dy.AdagradTrainer(model)
    decoder = Decoder(model, type_vocabs, copyable_types, args.type_embed_dim, args.literal_embed_dim,
                      args.byte_embed_dim, args.hash_dim, args.att_dim, args.num_layers, args.hidden_dim)

    if not os.path.exists(args.checkpoint_dir):
        os.makedirs(args.checkpoint_dir)
    if args.model_path is None:
        model.save('%s/init.dmp' % args.checkpoint_dir)
    else:
        model.populate(args.model_path)
    if args.dropout is not None:
        decoder.set_dropout(args.dropout)

    num_samples = len(train_set)
    for num_epoch in itertools.count(1):
        random.shuffle(train_set)
        epoch_loss = 0.0
        epoch_seq_length = 0
        batch_losses = []
        hash_cache = {}
        batch_seq_length = 0
        num_batch = 0
        dy.renew_cg()
        for i, (tokens) in enumerate(train_set, 1):
            print('batch', i, len(tokens))
            loss = cal_loss(decoder, hash_cache, tokens)
            batch_losses.append(loss)
            batch_seq_length += len(tokens)
            epoch_seq_length += len(tokens)
            if i % args.batch_size == 0 or i == num_samples:
                batch_loss = dy.esum(batch_losses) / len(batch_losses)
                batch_loss.backward()
                trainer.update()
                batch_loss_value = batch_loss.value()
                epoch_loss += batch_loss_value
                dy.renew_cg()
                num_batch += 1
                batch_losses = []
                hash_cache = {}
                if num_batch % 20 == 0:
                    batch_per_item_loss = batch_loss_value / batch_seq_length
                    epoch_perplexity = math.exp(epoch_loss / epoch_seq_length)
                    print('epoch %d, batch %d, batch_per_item_loss %f, epoch_perplexity %f' %
                          (num_epoch, num_batch, batch_per_item_loss, epoch_perplexity))
                    batch_seq_length = 0
        model.save('%s/epoch_%d.dmp' % (args.checkpoint_dir, num_epoch))
def main():
    parser = argparse.ArgumentParser(description='Train attention model')
    parser.add_argument('--model_path', default=None, type=str)
    parser.add_argument('--checkpoint_dir', default='./checkpoints', type=str)
    parser.add_argument('--vocab_file', default='./vocab.dmp', type=str)
    parser.add_argument('--train_set', default='./train_set.dmp', type=str)
    parser.add_argument('--valid_set', default='./valid_set.dmp', type=str)
    parser.add_argument('--batch_size', default=64, type=int)
    parser.add_argument('--trainer', default='adam', choices={'sgd', 'adam', 'adagrad'}, type=str)
    parser.add_argument('--word_embed_dim', default=256, type=int)
    parser.add_argument('--encoder_num_layers', default=2, type=int)
    parser.add_argument('--encoder_state_dim', default=256, type=int)
    parser.add_argument('--op_embed_dim', default=32, type=int)
    parser.add_argument('--num_embed_dim', default=256, type=int)
    parser.add_argument('--sign_embed_dim', default=64, type=int)
    parser.add_argument('--att_dim', default=128, type=int)
    parser.add_argument('--decoder_num_layers', default=2, type=int)
    parser.add_argument('--decoder_state_dim', default=256, type=int)
    parser.add_argument('--dropout', default=None, type=float)
    parser.add_argument('--seed', default=11747, type=int)
    parser.add_argument('--max_op_count', default=50, type=int)
    args, _ = parser.parse_known_args()

    with open(args.vocab_file, 'rb') as f:
        op_names, word2wid, wid2word, num2nid, nid2num = pickle.load(f)
    op_names = sorted(op_names)

    with open(args.train_set, 'rb') as f:
        train_set = pickle.load(f)
    if len(train_set) > 0 and len(train_set[0][2][0]) == 8:
        print('add expr values...')
        train_set = add_expr_val(train_set)
        with open(args.train_set, 'wb') as f:
            pickle.dump(train_set, f)
    if len(train_set) > 0 and type(train_set[0][0][0]) == str:
        print('add num values...')
        train_set = add_num_val(train_set)
        with open(args.train_set, 'wb') as f:
            pickle.dump(train_set, f)
    print('size of train_set:', len(train_set))

    random.seed(args.seed)
    model = dy.ParameterCollection()
    if args.trainer == 'sgd':
        trainer = dy.SimpleSGDTrainer(model)
    elif args.trainer == 'adam':
        trainer = dy.AdamTrainer(model)
    elif args.trainer == 'adagrad':
        trainer = dy.AdagradTrainer(model)
    encoder = Encoder(model, word2wid, args.word_embed_dim, args.encoder_num_layers, args.encoder_state_dim)
    decoder = Decoder(model, op_names, args.op_embed_dim, num2nid, args.num_embed_dim, args.sign_embed_dim,
                      args.encoder_state_dim, args.att_dim, args.decoder_num_layers, args.decoder_state_dim)

    if not os.path.exists(args.checkpoint_dir):
        os.makedirs(args.checkpoint_dir)
    if args.model_path is None:
        model.save('%s/init.dmp' % args.checkpoint_dir)
    else:
        model.populate(args.model_path)
    if args.dropout is not None:
        encoder.set_dropout(args.dropout)
        decoder.set_dropout(args.dropout)

    num_problems = len(train_set)
    for num_epoch in itertools.count(1):
        random.shuffle(train_set)
        epoch_loss = 0.0
        epoch_seq_length = 0
        batch_losses = []
        batch_seq_length = 0
        num_batch = 0
        dy.renew_cg()
        for i, (question, options, trace, input_num_indexes, answer) in enumerate(train_set, 1):
            problem_loss = cal_loss(encoder, decoder, question, options, input_num_indexes, trace, answer)
            batch_losses.append(problem_loss)
            batch_seq_length += len(trace)
            epoch_seq_length += len(trace)
            if i % args.batch_size == 0 or i == num_problems:
                batch_loss = dy.esum(batch_losses) / len(batch_losses)
                batch_loss.backward()
                trainer.update()
                batch_loss_value = batch_loss.value()
                batch_per_item_loss = batch_loss_value / batch_seq_length
                epoch_loss += batch_loss_value
                epoch_perplexity = math.exp(epoch_loss / epoch_seq_length)
                dy.renew_cg()
                num_batch += 1
                batch_losses = []
                batch_seq_length = 0
                if num_batch % 20 == 0:
                    print('epoch %d, batch %d, batch_per_item_loss %f, epoch_perplexity %f' %
                          (num_epoch, num_batch, batch_per_item_loss, epoch_perplexity))
        model.save('%s/epoch_%d.dmp' % (args.checkpoint_dir, num_epoch))
def __init__(self, params, vocab, label2tag, pretrained_embeddings=None):
    """
    :param params:
    :param vocab:
    :param label2tag:
    :param pretrained_embeddings:
    """
    self.dim_w = params.dim_w
    self.win = params.win
    self.vocab = vocab
    self.n_words = len(self.vocab)
    self.dim_asp = params.dim_asp
    self.dim_opi = params.dim_opi
    self.dim_y_asp = params.n_asp_tags
    self.dim_y_opi = params.n_opi_tags
    self.n_steps = params.n_steps
    self.asp_label2tag = label2tag
    self.opi_label2tag = {0: 'O', 1: 'T'}
    self.dropout_asp = params.dropout_asp
    self.dropout_opi = params.dropout_opi
    self.dropout = params.dropout
    self.rnn_type = params.rnn_type
    self.ds_name = params.ds_name
    self.model_name = params.model_name
    self.attention_type = params.attention_type

    self.pc = dy.ParameterCollection()
    self.Emb = WDEmb(pc=self.pc, n_words=self.n_words, dim_w=self.dim_w,
                     pretrained_embeddings=pretrained_embeddings)
    # self.ASP_RNN = LSTM(pc=self.pc, n_in=self.win*self.dim_w, n_out=self.dim_asp, dropout_rate=self.dropout_asp)
    # self.OPI_RNN = LSTM(pc=self.pc, n_in=self.win*self.dim_w, n_out=self.dim_opi, dropout_rate=self.dropout_opi)
    # use dynet RNNBuilder rather than the self-defined RNN classes
    if self.rnn_type == 'LSTM':
        self.ASP_RNN = dy.LSTMBuilder(1, self.win * self.dim_w, self.dim_asp, self.pc)
        self.OPI_RNN = dy.LSTMBuilder(1, self.win * self.dim_w, self.dim_opi, self.pc)
    elif self.rnn_type == 'GRU':
        # NOT TRIED!
        self.ASP_RNN = dy.GRUBuilder(1, self.win * self.dim_w, self.dim_asp, self.pc)
        self.OPI_RNN = dy.GRUBuilder(1, self.win * self.dim_w, self.dim_opi, self.pc)
    else:
        raise Exception("Invalid RNN type!!!")
    self.THA = THA(pc=self.pc, n_steps=self.n_steps, n_in=2 * self.dim_asp)
    if self.attention_type == 'bilinear':
        self.STN = ST_bilinear(pc=self.pc, dim_asp=self.dim_asp, dim_opi=self.dim_opi)
    # here dot attention is not applicable since the aspect representation and opinion representation
    # have different dimensions
    # elif self.attention_type == 'dot':
    #     self.STN = ST_dot(pc=self.pc, dim_asp=self.dim_asp, dim_opi=self.dim_opi)
    elif self.attention_type == 'concat':
        self.STN = ST_concat(pc=self.pc, dim_asp=self.dim_asp, dim_opi=self.dim_opi)
    else:
        raise Exception("Invalid attention type!!!")
    self.ASP_FC = Linear(pc=self.pc, n_in=2 * self.dim_asp + 2 * self.dim_opi, n_out=self.dim_y_asp)
    self.OPI_FC = Linear(pc=self.pc, n_in=2 * self.dim_opi, n_out=self.dim_y_opi)
    self.layers = [self.ASP_FC, self.OPI_FC, self.THA, self.STN]

    if params.optimizer == 'sgd':
        self.optimizer = dy.SimpleSGDTrainer(self.pc, params.sgd_lr)
    elif params.optimizer == 'momentum':
        self.optimizer = dy.MomentumSGDTrainer(self.pc, 0.01, 0.9)
    elif params.optimizer == 'adam':
        self.optimizer = dy.AdamTrainer(self.pc, 0.001, 0.9, 0.9)
    elif params.optimizer == 'adagrad':
        self.optimizer = dy.AdagradTrainer(self.pc)
    elif params.optimizer == 'adadelta':
        # use default value of adadelta
        self.optimizer = dy.AdadeltaTrainer(self.pc)
    else:
        raise Exception("Invalid optimizer!!")
def main():
    parser = argparse.ArgumentParser(description='Train attention model')
    parser.add_argument('--nl_embed_dim', default=256, type=int)
    parser.add_argument('--nl_rnn_layers', default=1, type=int)
    parser.add_argument('--nl_rnn_state_dim', default=256, type=int)
    parser.add_argument('--code_embed_dim', default=256, type=int)
    parser.add_argument('--code_rnn_layers', default=1, type=int)
    parser.add_argument('--code_rnn_state_dim', default=256, type=int)
    # parser.add_argument('--rnn_token_mlp_dim', default=128, type=int)
    # parser.add_argument('--rnn_type_mlp_dim', default=32, type=int)
    # parser.add_argument('--rnn_word_mlp_dim', default=128, type=int)
    parser.add_argument('--attention_dim', default=256, type=int)
    parser.add_argument('--dropout', default=0.5, type=float)
    parser.add_argument('--rnn_dropout', default=0.2, type=float)
    parser.add_argument('--nl_to_code', default=True, action='store_true')
    parser.add_argument('--code_to_nl', dest='nl_to_code', action='store_false')
    parser.add_argument('--vocab_file', default='./vocab.dmp', type=str)
    parser.add_argument('--batch_size', default=32, type=int)
    parser.add_argument('--train_set', default='./train.txt', type=str)
    parser.add_argument('--valid_set', default='./valid.txt', type=str)
    parser.add_argument('--trainer', default='adam', choices={'sgd', 'adam', 'adagrad'}, type=str)
    # TODO: Commented out for now, could implement the learning rate if necessary
    # parser.add_argument('--learning_rate', type=float)
    args, unknown = parser.parse_known_args()
    is_nl2code = args.nl_to_code

    nl_voc2wid, nl_wid2voc, code_voc2wid, code_wid2voc = load_vocabs(args.vocab_file)
    args.nl_vocab_size = len(nl_wid2voc)
    args.code_vocab_size = len(code_wid2voc)
    args.num_token_type = len(tok_type2id) + 1  # count the undifined_token for <S> and </S>

    if is_nl2code:
        model, translator = new_nl2code_model(args)
        config_name = 'nl2code'
    else:
        model, translator = new_code2nl_model(args)
        config_name = 'code2nl'
    config_name = datetime.now().strftime(config_name + '_%m%d%H%M%S')
    logging.basicConfig(filename=config_name + '.log', level=logging.DEBUG,
                        format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

    if args.trainer == 'sgd':
        trainer = dy.SimpleSGDTrainer(model)
    elif args.trainer == 'adam':
        trainer = dy.AdamTrainer(model)
    elif args.trainer == 'adagrad':
        trainer = dy.AdagradTrainer(model)
    # if args.learning_rate is None:
    #     args.learning_rate = learning_rate

    def lookup_nl(seqs):
        return [[START] + map(lambda w: nl_voc2wid[w], seq) + [END] for seq in seqs]

    def lookup_code(seqs):
        return [[START] + map(lambda w: code_voc2wid[w[1]], seq) + [END] for seq in seqs]

    tokenized_nl_train, tokenized_code_train = read_data(args.train_set)
    tokenized_nl_valid, tokenized_code_valid = read_data(args.valid_set)
    nl_train = lookup_nl(tokenized_nl_train)
    nl_valid = lookup_nl(tokenized_nl_valid)
    code_train = lookup_code(tokenized_code_train)
    code_valid = lookup_code(tokenized_code_valid)

    if is_nl2code:
        train_pairs = partition(zip(nl_train, code_train))
        valid_pairs = partition(zip(nl_valid, code_valid))
    else:
        train_pairs = partition(zip(code_train, nl_train))
        valid_pairs = partition(zip(code_valid, nl_valid))

    def validate_loss():
        cum_loss = 0.0
        cum_trg_item_count = 0
        for batch_pairs in batch_iter(valid_pairs, args.batch_size):
            src_seqs, trg_seqs = map(list, zip(*batch_pairs))
            dy.renew_cg()
            batch_loss = translator.calc_loss(src_seqs, trg_seqs, training=False)
            cum_loss += batch_loss.value()
            cum_trg_item_count += sum(map(len, trg_seqs))
        return cum_loss, cum_trg_item_count

    logging.info('config: %s', args)
    logging.info('nl vocab size: %d, code vocab size: %d' % (len(nl_voc2wid), len(code_voc2wid)))

    min_v_cum_loss = 1e20
    for epoch in count(1):
        epoch_cum_loss = 0.0
        epoch_cum_trg_item_count = 0
        for batch_id, batch_pairs in enumerate(batch_iter(train_pairs, args.batch_size), 1):
            src_seqs, trg_seqs = map(list, zip(*batch_pairs))
            dy.renew_cg()
            batch_loss = translator.calc_loss(src_seqs, trg_seqs, training=True)
            batch_loss.backward()
            trainer.update()
            batch_loss_value = batch_loss.value()
            batch_trg_item_count = sum(map(len, trg_seqs))
            batch_per_item_loss = batch_loss_value / batch_trg_item_count
            epoch_cum_loss += batch_loss_value
            epoch_cum_trg_item_count += batch_trg_item_count
            epoch_cum_perplexity = math.exp(epoch_cum_loss / epoch_cum_trg_item_count)
            if batch_id % 100 == 0:
                logging.info('epoch %d, batch %d, batch_per_item_loss %f, epoch_cum_perplexity %f' %
                             (epoch, batch_id, batch_per_item_loss, epoch_cum_perplexity))

        epoch_cum_perplexity = math.exp(epoch_cum_loss / epoch_cum_trg_item_count)
        logging.info('epoch %d, #training item count#\t%d' % (epoch, epoch_cum_trg_item_count))
        logging.info('epoch %d, #training total loss#\t%f' % (epoch, epoch_cum_loss))
        logging.info('epoch %d, #training per item loss#\t%f' % (epoch, epoch_cum_loss / epoch_cum_trg_item_count))
        logging.info('epoch %d, #training perplexity#\t%f' % (epoch, epoch_cum_perplexity))

        v_cum_loss, v_cum_trg_item_count = validate_loss()
        v_cum_perplexity = math.exp(v_cum_loss / v_cum_trg_item_count)
        logging.info('epoch %d, #validation item count#\t%d' % (epoch, v_cum_trg_item_count))
        logging.info('epoch %d, #validation total loss#\t%f' % (epoch, v_cum_loss))
        logging.info('epoch %d, #validation per item loss#\t%f' % (epoch, v_cum_loss / v_cum_trg_item_count))
        logging.info('epoch %d, #validation perplexity#\t%f' % (epoch, v_cum_perplexity))

        if v_cum_loss < min_v_cum_loss:
            min_v_cum_loss = v_cum_loss
            min_v_cum_perplexity = v_cum_perplexity
            dmp_name = config_name + '_model_dmp'
            model.save(dmp_name + '.data')
            with open(dmp_name + '.meta', 'wb') as f:
                for k, v in vars(args).items():
                    f.write('--{}\t{}\n'.format(k, v))
            logging.info('epoch %d, model saved to %s' % (epoch, dmp_name))
def __init__(self, e0=0.1, eps=1e-20):
    self.optimizer = dy.AdagradTrainer(ParamManager.global_collection(), e0, eps=eps)
def __init__(self, word_count, tag_count, word_dims, tag_dims, lstm_units, hidden_units,
             struct_out, label_out, droprate=0, struct_spans=4, label_spans=3, optimizer=1):
    self.word_count = word_count
    self.tag_count = tag_count
    self.word_dims = word_dims
    self.tag_dims = tag_dims
    self.lstm_units = lstm_units
    self.hidden_units = hidden_units
    self.struct_out = struct_out
    self.label_out = label_out
    self.droprate = droprate

    self.model = dynet.Model()

    if optimizer == 1:
        self.trainer = dynet.SimpleSGDTrainer(self.model)
    elif optimizer == 2:
        self.trainer = dynet.MomentumSGDTrainer(self.model)
    elif optimizer == 3:
        self.trainer = dynet.AdagradTrainer(self.model, learning_rate=0.01, eps=0.001)
    elif optimizer == 4:
        self.trainer = dynet.RMSPropTrainer(self.model)
    elif optimizer == 5:
        self.trainer = dynet.AdamTrainer(self.model)

    random.seed(1)

    self.activation = dynet.rectify

    self.word_embed = self.model.add_lookup_parameters((word_count, word_dims))
    self.tag_embed = self.model.add_lookup_parameters((tag_count, tag_dims))

    self.fwd_lstm1 = LSTM(word_dims + tag_dims, lstm_units, self.model)
    self.back_lstm1 = LSTM(word_dims + tag_dims, lstm_units, self.model)
    self.fwd_lstm2 = LSTM(2 * lstm_units, lstm_units, self.model)
    self.back_lstm2 = LSTM(2 * lstm_units, lstm_units, self.model)

    self.struct_hidden_W = self.model.add_parameters(
        (hidden_units, 4 * struct_spans * lstm_units),
        dynet.UniformInitializer(0.01),
    )
    self.struct_hidden_b = self.model.add_parameters(
        (hidden_units,),
        dynet.ConstInitializer(0),
    )
    self.struct_output_W = self.model.add_parameters(
        (struct_out, hidden_units),
        dynet.ConstInitializer(0),
    )
    self.struct_output_b = self.model.add_parameters(
        (struct_out,),
        dynet.ConstInitializer(0),
    )

    self.label_hidden_W = self.model.add_parameters(
        (hidden_units, 4 * label_spans * lstm_units),
        dynet.UniformInitializer(0.01),
    )
    self.label_hidden_b = self.model.add_parameters(
        (hidden_units,),
        dynet.ConstInitializer(0),
    )
    self.label_output_W = self.model.add_parameters(
        (label_out, hidden_units),
        dynet.ConstInitializer(0),
    )
    self.label_output_b = self.model.add_parameters(
        (label_out,),
        dynet.ConstInitializer(0),
    )
words.append("_UNK_") chars.add("<*>") vw = Vocab.from_corpus([words]) vt = Vocab.from_corpus([tags]) vc = Vocab.from_corpus([chars]) UNK = vw.w2i["_UNK_"] nwords = vw.size() ntags = vt.size() nchars = vc.size() # DyNet Starts model = dy.Model() trainer = dy.AdagradTrainer(model) NUM_LAYERS = 1 embeddings, emb_dim = load_embeddings_file(embedding) # init model parameters and initialize them WORDS_LOOKUP = model.add_lookup_parameters((nwords, emb_dim)) CHARS_LOOKUP = model.add_lookup_parameters((nchars, 20)) init = 0 UNK_vec = np.random.rand(emb_dim) notfound = found= 0.0 for word in vw.w2i.keys(): # for those words we have already in w2i, update vector, otherwise add to w2i (since we keep data as integers) if word in embeddings.keys(): found+=1
def run(filename):
    startTime = time.time()

    # read word embedding
    word_embedding_size = 300
    word_embedding_file = "small_glove.txt"
    word_embedding = []
    with open(word_embedding_file, 'r') as f:
        for (counter, line) in enumerate(f):
            if counter == 0:
                word_embedding_length = int(line)
            else:
                word_embedding.append(np.asarray([float(i) for i in line.split()]).reshape(1, -1))
    word_embedding = np.concatenate(word_embedding, axis=0)
    print(word_embedding.shape)
    print(word_embedding_length)

    # read tree_data
    tree_data_file = "array_tree.txt"
    scores = []
    words = []
    lchs = []
    rchs = []
    with open(tree_data_file, 'r') as f:
        for (counter, line) in enumerate(f):
            if counter == 0:
                tree_data_size = int(line)
            else:
                temp = np.asarray([int(i) for i in line.split()])
                if (counter - 1) % 5 == 1:
                    scores.append(temp)
                elif (counter - 1) % 5 == 2:
                    words.append(temp)
                elif (counter - 1) % 5 == 3:
                    lchs.append(temp)
                elif (counter - 1) % 5 == 4:
                    rchs.append(temp)
    print(len(scores))
    print(len(words))
    print(len(lchs))
    print(len(rchs))
    print(tree_data_size)

    # hyperparameters
    hidden_size = 150
    output_size = 5
    learning_rate = 0.05
    batch = 1  # using larger batch size actually hurt the performance

    # parameters
    # for leaf
    m = dy.ParameterCollection()
    # Wi = m.add_parameters((hidden_size, word_embedding_size), init='normal', std=0.01)
    # bi = m.add_parameters(hidden_size, init=0)
    # Wo = m.add_parameters((hidden_size, word_embedding_size), init='normal', std=0.01)
    # bo = m.add_parameters(hidden_size, init=0)
    Wu = m.add_parameters((hidden_size, word_embedding_size), init='normal', std=0.01)
    bu = m.add_parameters(hidden_size, init=0)
    # for non leaf
    # U0i = m.add_parameters((hidden_size, hidden_size), init='normal', std=0.01)
    # U1i = m.add_parameters((hidden_size, hidden_size), init='normal', std=0.01)
    # bbi = m.add_parameters(hidden_size, init=0)
    # U00f = m.add_parameters((hidden_size, hidden_size), init='normal', std=0.01)
    # U01f = m.add_parameters((hidden_size, hidden_size), init='normal', std=0.01)
    # U10f = m.add_parameters((hidden_size, hidden_size), init='normal', std=0.01)
    # U11f = m.add_parameters((hidden_size, hidden_size), init='normal', std=0.01)
    # bbf = m.add_parameters(hidden_size, init=0)
    # U0o = m.add_parameters((hidden_size, hidden_size), init='normal', std=0.01)
    # U1o = m.add_parameters((hidden_size, hidden_size), init='normal', std=0.01)
    # bbo = m.add_parameters(hidden_size, init=0)
    U0u = m.add_parameters((hidden_size, hidden_size), init='normal', std=0.01)
    U1u = m.add_parameters((hidden_size, hidden_size), init='normal', std=0.01)
    bbu = m.add_parameters(hidden_size, init=0)
    # for softmax
    Why = m.add_parameters((output_size, hidden_size), init='normal', std=0.01)
    by = m.add_parameters(output_size, init=0)

    trainer = dy.AdagradTrainer(m, learning_rate=learning_rate, eps=1e-8)

    # create a network for the xor problem given input and output
    def tree_lstm_network(scores, words, lchs, rchs):
        def rec(index):
            if (words[index] == -1):  # branch node
                (l_loss, l_hidden) = rec(lchs[index])
                (r_loss, r_hidden) = rec(rchs[index])
                # i_gate = dy.logistic(U0i * l_hidden + U1i * r_hidden + bbi)
                # fl_gate = dy.logistic(U00f * l_hidden + U01f * r_hidden + bbf)
                # fr_gate = dy.logistic(U10f * l_hidden + U11f * r_hidden + bbf)
                # o_gate = dy.logistic(U0o * l_hidden + U1o * r_hidden + bbo)
                hidden = dy.tanh(U0u * l_hidden + U1u * r_hidden + bbu)
                # cell = dy.cmult(i_gate, u_value) + dy.cmult(fl_gate, l_cell) + dy.cmult(fr_gate, r_cell)
                # hidden = dy.cmult(o_gate, dy.tanh(cell))
                pred1 = dy.log_softmax(Why * hidden + by)
                loss = l_loss + r_loss - pred1[int(scores[index])]
                return (loss, hidden)
            else:
                embedding_tensor = dy.inputTensor(word_embedding[words[index]])
                # i_gate = dy.logistic(Wi * embedding_tensor + bi)
                # o_gate = dy.logistic(Wo * embedding_tensor + bo)
                hidden = dy.tanh(Wu * embedding_tensor + bu)
                # cell = dy.cmult(i_gate, u_value)
                # hidden = dy.cmult(o_gate, dy.tanh(cell))
                pred1 = dy.log_softmax(Why * hidden + by)
                loss = -pred1[int(scores[index])]
                return (loss, hidden)
        return rec(0)[0]

    epocNum = 6
    loopStart = time.time()
    loss_save = []
    for epoc in range(epocNum):
        total_loss = 0
        for batch_n in range(int(tree_data_size // batch)):
            dy.renew_cg()  # new computation graph
            losses = []
            for n in range(batch):
                index = batch_n * batch + n
                losses.append(tree_lstm_network(scores[index], words[index], lchs[index], rchs[index]))
            batch_loss = dy.esum(losses)
            total_loss += batch_loss.value()
            batch_loss.backward()
            trainer.update()
        loss_save.append(total_loss / tree_data_size)
        print("epoc {}, average_loss {}".format(epoc, total_loss / tree_data_size))

    loopEnd = time.time()
    print('looptime is %s ' % (loopEnd - loopStart))

    prepareTime = loopStart - startTime
    loopTime = loopEnd - loopStart
    timePerEpoch = loopTime / epocNum
    with open(filename, "w") as f:
        f.write("unit: " + "1 epoch\n")
        for loss in loss_save:
            f.write(str(loss) + "\n")
        f.write("run time: " + str(prepareTime) + " " + str(timePerEpoch) + "\n")
    ergm = MultiGraphErgm(tr_graphs, embs, opts.assoc_mode, reg=opts.regularize, dropout=drop,
                          model_path=opts.model, path_only_init=True, ergm_path=opts.ergm_model)
else:
    dev_results = []
    # training phase
    if opts.model is not None:
        # there's a pretrained association model
        ergm = MultiGraphErgm(tr_graphs, embs, opts.assoc_mode, reg=opts.regularize, dropout=drop,
                              model_path=opts.model, path_only_init=True)
    else:
        ergm = MultiGraphErgm(tr_graphs, embs, opts.assoc_mode, reg=opts.regularize, dropout=drop)
    initial_weights = ergm.ergm_weights.as_array()
    trainer = dy.AdagradTrainer(ergm.model, opts.learning_rate)
    iteration_scores = []
    log_file_name = 'pred-train-log-{}_{}.txt'.format(start_time.date(), start_time.time())
    timeprint('starting training phase, writing to {}'.format(log_file_name))
    with open(log_file_name, 'a') as log_file:
        log_file.write('====\n')
        for ep in range(opts.epochs):
            iteration_scores.extend(macro_loops(opts, ep + 1, ergm, trainer, log_file, synsets))
            if opts.eval_dev and ep < opts.epochs - 1:
                dev_results.append(eval(tr_graphs, te_graphs, ergm, opts, N, log_file=None, rerank_file=None))
    if opts.model_out is not None:
        # save model
        timeprint('saving trained model to {}'.format(opts.model_out))
        ergm.save(opts.model_out, initial_weights)
print('scores:', '\t'.join([str(sc) for sc in iteration_scores[::100]]))
def __init__(self, yaml_context, e0=0.1, eps=1e-20):
    self.optimizer = dy.AdagradTrainer(yaml_context.dynet_param_collection.param_col, e0, eps=eps)
def train_model(model, char_lookup, feat_lookup, R, bias, encoder_frnn, encoder_rrnn, decoder_rnn, train_lemmas,
                train_feat_dicts, train_words, dev_lemmas, dev_feat_dicts, dev_words, alphabet_index,
                inverse_alphabet_index, epochs, optimization, results_file_path, train_aligned_pairs,
                dev_aligned_pairs, feat_index, feature_types, plot):
    print 'training...'

    np.random.seed(17)
    random.seed(17)

    if optimization == 'ADAM':
        trainer = pc.AdamTrainer(model, lam=REGULARIZATION, alpha=LEARNING_RATE, beta_1=0.9, beta_2=0.999, eps=1e-8)
    elif optimization == 'MOMENTUM':
        trainer = pc.MomentumSGDTrainer(model)
    elif optimization == 'SGD':
        trainer = pc.SimpleSGDTrainer(model)
    elif optimization == 'ADAGRAD':
        trainer = pc.AdagradTrainer(model)
    elif optimization == 'ADADELTA':
        trainer = pc.AdadeltaTrainer(model)
    else:
        trainer = pc.SimpleSGDTrainer(model)

    total_loss = 0
    best_avg_dev_loss = 999
    best_dev_accuracy = -1
    best_train_accuracy = -1
    patience = 0
    train_len = len(train_words)
    sanity_set_size = 100
    epochs_x = []
    train_loss_y = []
    dev_loss_y = []
    train_accuracy_y = []
    dev_accuracy_y = []
    e = -1

    # progress bar init
    widgets = [progressbar.Bar('>'), ' ', progressbar.ETA()]
    train_progress_bar = progressbar.ProgressBar(widgets=widgets, maxval=epochs).start()
    avg_loss = -1

    for e in xrange(epochs):

        # randomize the training set
        indices = range(train_len)
        random.shuffle(indices)
        train_set = zip(train_lemmas, train_feat_dicts, train_words, train_aligned_pairs)
        train_set = [train_set[i] for i in indices]

        # compute loss for each example and update
        for i, example in enumerate(train_set):
            lemma, feats, word, alignment = example
            loss = one_word_loss(model, char_lookup, feat_lookup, R, bias, encoder_frnn, encoder_rrnn, decoder_rnn,
                                 lemma, feats, word, alphabet_index, alignment, feat_index, feature_types)
            loss_value = loss.value()
            total_loss += loss_value
            loss.backward()
            trainer.update()
            if i > 0:
                avg_loss = total_loss / float(i + e * train_len)
            else:
                avg_loss = total_loss

        if EARLY_STOPPING:

            # get train accuracy
            print 'evaluating on train...'
            train_predictions = predict_sequences(model, char_lookup, feat_lookup, R, bias, encoder_frnn,
                                                  encoder_rrnn, decoder_rnn, alphabet_index,
                                                  inverse_alphabet_index, train_lemmas[:sanity_set_size],
                                                  train_feat_dicts[:sanity_set_size], feat_index, feature_types)
            train_accuracy = evaluate_model(train_predictions, train_lemmas[:sanity_set_size],
                                            train_feat_dicts[:sanity_set_size], train_words[:sanity_set_size],
                                            feature_types, print_results=False)[1]

            if train_accuracy > best_train_accuracy:
                best_train_accuracy = train_accuracy

            dev_accuracy = 0
            avg_dev_loss = 0

            if len(dev_lemmas) > 0:

                # get dev accuracy
                dev_predictions = predict_sequences(model, char_lookup, feat_lookup, R, bias, encoder_frnn,
                                                    encoder_rrnn, decoder_rnn, alphabet_index,
                                                    inverse_alphabet_index, dev_lemmas, dev_feat_dicts, feat_index,
                                                    feature_types)
                print 'evaluating on dev...'

                # get dev accuracy
                dev_accuracy = evaluate_model(dev_predictions, dev_lemmas, dev_feat_dicts, dev_words, feature_types,
                                              print_results=True)[1]

                if dev_accuracy > best_dev_accuracy:
                    best_dev_accuracy = dev_accuracy

                    # save best model to disk
                    save_pycnn_model(model, results_file_path)
                    print 'saved new best model'
                    patience = 0
                else:
                    patience += 1

                # found "perfect" model
                if dev_accuracy == 1:
                    train_progress_bar.finish()
                    if plot:
                        plt.cla()
                    return model, e

                # get dev loss
                total_dev_loss = 0
                for i in xrange(len(dev_lemmas)):
                    total_dev_loss += one_word_loss(model, char_lookup, feat_lookup, R, bias, encoder_frnn,
                                                    encoder_rrnn, decoder_rnn, dev_lemmas[i], dev_feat_dicts[i],
                                                    dev_words[i], alphabet_index, dev_aligned_pairs[i], feat_index,
                                                    feature_types).value()

                avg_dev_loss = total_dev_loss / float(len(dev_lemmas))
                if avg_dev_loss < best_avg_dev_loss:
                    best_avg_dev_loss = avg_dev_loss

                print 'epoch: {0} train loss: {1:.4f} dev loss: {2:.4f} dev accuracy: {3:.4f} train accuracy = {4:.4f} \
best dev accuracy {5:.4f} best train accuracy: {6:.4f} patience = {7}'.format(e, avg_loss, avg_dev_loss, dev_accuracy,
                                                                              train_accuracy, best_dev_accuracy,
                                                                              best_train_accuracy, patience)

                log_to_file(results_file_path + '_log.txt', e, avg_loss, train_accuracy, dev_accuracy)

                if patience == MAX_PATIENCE:
                    print 'out of patience after {0} epochs'.format(str(e))
                    # TODO: would like to return best model but pycnn has a bug with save and load. Maybe copy via code?
                    # return best_model[0]
                    train_progress_bar.finish()
                    if plot:
                        plt.cla()
                    return model, e
            else:

                # if no dev set is present, optimize on train set
                print 'no dev set for early stopping, running all epochs until perfectly fitting or patience was \
reached on the train set'

                if train_accuracy > best_train_accuracy:
                    best_train_accuracy = train_accuracy

                    # save best model to disk
                    save_pycnn_model(model, results_file_path)
                    print 'saved new best model'
                    patience = 0
                else:
                    patience += 1

                print 'epoch: {0} train loss: {1:.4f} train accuracy = {2:.4f} best train accuracy: {3:.4f} \
patience = {4}'.format(e, avg_loss, train_accuracy, best_train_accuracy, patience)

                # found "perfect" model on train set or patience has reached
                if train_accuracy == 1 or patience == MAX_PATIENCE:
                    train_progress_bar.finish()
                    if plot:
                        plt.cla()
                    return model, e

        # update lists for plotting
        train_accuracy_y.append(train_accuracy)
        epochs_x.append(e)
        train_loss_y.append(avg_loss)
        dev_loss_y.append(avg_dev_loss)
        dev_accuracy_y.append(dev_accuracy)

        # finished epoch
        train_progress_bar.update(e)
        if plot:
            with plt.style.context('fivethirtyeight'):
                p1, = plt.plot(epochs_x, dev_loss_y, label='dev loss')
                p2, = plt.plot(epochs_x, train_loss_y, label='train loss')
                p3, = plt.plot(epochs_x, dev_accuracy_y, label='dev acc.')
                p4, = plt.plot(epochs_x, train_accuracy_y, label='train acc.')
                plt.legend(loc='upper left', handles=[p1, p2, p3, p4])
                plt.savefig(results_file_path + '.png')

    train_progress_bar.finish()
    if plot:
        plt.cla()
    print 'finished training. average loss: ' + str(avg_loss)
    return model, e
meta.w2i = {}
for w in wvm.vocab:
    meta.w2i[w] = wvm.vocab[w].index

if args.save_model:
    pickle.dump(meta, open('%s.meta' % args.save_model, 'wb'))
if args.load_model:
    ontoparser = SubsumptionLearning(model=args.load_model)
else:
    ontoparser = SubsumptionLearning(meta=meta)

trainers = {
    'momsgd': dy.MomentumSGDTrainer(ontoparser.model, edecay=0.25),
    'adam': dy.AdamTrainer(ontoparser.model, edecay=0.25),
    'simsgd': dy.SimpleSGDTrainer(ontoparser.model, edecay=0.25),
    'adagrad': dy.AdagradTrainer(ontoparser.model, edecay=0.25),
    'adadelta': dy.AdadeltaTrainer(ontoparser.model, edecay=0.25)
}
trainer = trainers[args.trainer]

nntraining(train_sents)

if args.dev:
    accuracy = Test(inputGenDev)
    sys.stdout.write("Accuracy: {}%\n".format(accuracy))

if args.isDaemon and args.daemonPort:
    sys.stderr.write('Listening at port %d\n' % args.daemonPort)
    host = "0.0.0.0"  # Listen on all interfaces
    port = args.daemonPort  # Port number
    tcpsock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
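# Editor's sketch (not taken from any of the repositories above): a minimal, self-contained example of
# the pattern the snippets in this section share with dy.AdagradTrainer -- build a ParameterCollection,
# attach the trainer, then repeat forward pass -> loss.backward() -> trainer.update(). The toy linear
# model, data, and dimensions below are illustrative assumptions only; the learning_rate/eps defaults
# mirror the e0/eps wrappers shown earlier.
import dynet as dy

pc = dy.ParameterCollection()
trainer = dy.AdagradTrainer(pc, learning_rate=0.1, eps=1e-20)
W_p = pc.add_parameters((1, 2))   # toy weight matrix
b_p = pc.add_parameters((1,))     # toy bias

toy_data = [([0.0, 0.0], 0.0), ([0.0, 1.0], 1.0), ([1.0, 0.0], 1.0), ([1.0, 1.0], 2.0)]
for epoch in range(30):
    epoch_loss = 0.0
    for x, y in toy_data:
        dy.renew_cg()                                    # fresh computation graph per example
        W, b = dy.parameter(W_p), dy.parameter(b_p)      # older-API style; newer DyNet also accepts Parameters directly
        pred = W * dy.inputVector(x) + b                 # forward pass: linear prediction
        loss = dy.squared_distance(pred, dy.inputVector([y]))
        epoch_loss += loss.value()
        loss.backward()                                  # backward pass
        trainer.update()                                 # Adagrad parameter update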