def __init__(self, args):
    # set up output directory
    self.output_dir = os.path.join(args.experiment_dir, args.run_name)
    if not os.path.exists(args.experiment_dir):
        os.mkdir(args.experiment_dir)
    if not os.path.exists(self.output_dir):
        os.mkdir(self.output_dir)
    if not os.path.exists(os.path.join(args.experiment_dir, "runs/")):
        os.mkdir(os.path.join(args.experiment_dir, "runs/"))

    # initialize tensorboard writer
    self.runs_dir = os.path.join(args.experiment_dir, "runs/", args.run_name)
    self.writer = SummaryWriter(self.runs_dir)

    # initialize global steps
    self.train_gs = 0
    self.val_gs = 0

    # initialize model config
    self.config = ModelConfig(args)

    # check if there is a model to load
    if args.old_model_dir is not None:
        self.use_old_model = True
        self.load_dir = args.old_model_dir
        self.config.load_from_file(
            os.path.join(self.load_dir, "config.json"))

        # create vocab
        self.vocab = Vocab()
        self.vocab.load_from_dict(os.path.join(self.load_dir, "vocab.json"))
        self.update_vocab = False
        self.config.min_count = 1
    else:
        self.use_old_model = False
        self.vocab = None
        self.update_vocab = True

    # create datasets
    self.dataset_filename = args.dataset_filename

    # train
    self.train_dataset = DialogueDataset(
        os.path.join(self.dataset_filename, "train_data.json"),
        self.config.sentence_len,
        self.vocab,
        self.update_vocab)
    self.data_loader_train = torch.utils.data.DataLoader(
        self.train_dataset, self.config.train_batch_size, shuffle=True)
    self.config.train_len = len(self.train_dataset)
    self.vocab = self.train_dataset.vocab

    # eval
    self.val_dataset = DialogueDataset(
        os.path.join(self.dataset_filename, "val_data.json"),
        self.config.sentence_len,
        self.vocab,
        self.update_vocab)
    self.data_loader_val = torch.utils.data.DataLoader(
        self.val_dataset, self.config.val_batch_size, shuffle=True)
    self.config.val_len = len(self.val_dataset)

    # update and save vocab
    self.vocab = self.val_dataset.vocab
    self.train_dataset.vocab = self.vocab
    if self.config.min_count > 1:
        self.config.old_vocab_size = len(self.vocab)
        self.vocab.prune_vocab(self.config.min_count)
    self.vocab.save_to_dict(os.path.join(self.output_dir, "vocab.json"))
    self.vocab_size = len(self.vocab)
    self.config.vocab_size = self.vocab_size

    # load pretrained embeddings if a directory was given
    if self.config.pretrained_embeddings_dir is not None:
        pretrained_embeddings = get_pretrained_embeddings(
            self.config.pretrained_embeddings_dir, self.vocab)
    else:
        pretrained_embeddings = None

    # print and save the config file
    self.config.print_config(self.writer)
    self.config.save_config(os.path.join(self.output_dir, "config.json"))

    # set device
    self.device = torch.device('cuda')

    # create model
    self.model = Transformer(
        self.config.vocab_size,
        self.config.label_len,
        self.config.sentence_len,
        d_word_vec=self.config.embedding_dim,
        d_model=self.config.model_dim,
        d_inner=self.config.inner_dim,
        n_layers=self.config.num_layers,
        n_head=self.config.num_heads,
        d_k=self.config.dim_k,
        d_v=self.config.dim_v,
        dropout=self.config.dropout,
        pretrained_embeddings=pretrained_embeddings
    ).to(self.device)

    # create optimizer
    self.optimizer = torch.optim.Adam(
        filter(lambda x: x.requires_grad, self.model.parameters()),
        betas=(0.9, 0.98), eps=1e-09)

    # load old model and optimizer if there is one
    if self.use_old_model:
        self.model, self.optimizer = load_checkpoint(
            os.path.join(self.load_dir, "model.bin"),
            self.model, self.optimizer, self.device)

    # wrap the optimizer in a scheduled optimizer object
    self.optimizer = ScheduledOptim(
        self.optimizer, self.config.model_dim, self.config.warmup_steps)
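# Usage sketch (assumption, not part of the original script): how the
# __init__ above might be driven from the command line. Only the argument
# names that __init__ actually reads (experiment_dir, run_name,
# old_model_dir, dataset_filename) are shown; ModelConfig presumably
# consumes additional hyperparameter flags that are not listed here.
# The class name Trainer is hypothetical.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--experiment_dir", default="experiments")
    parser.add_argument("--run_name", default="run_0")
    parser.add_argument("--old_model_dir", default=None,
                        help="directory of a checkpoint to resume from")
    parser.add_argument("--dataset_filename", default="data",
                        help="directory containing train_data.json / val_data.json")
    args = parser.parse_args()

    trainer = Trainer(args)  # assumes the method above belongs to a Trainer class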
def main():
    nb_epochs = 30
    batch_size = 200
    hidden_size = 256
    embedding_dim = 300
    max_len = 20
    teacher_forcing = 0.6
    min_count = 2
    max_grad_norm = 5
    val_len = 5000
    weight_decay = 0.00001
    model_filename = '/home/mattd/pycharm/yelp/models' \
                     '/baseline_frozen_pretrained'
    eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'

    dataset = SentenceDataset(eng_fr_filename, max_len, min_count)
    print('Dataset: {}'.format(len(dataset)))

    train_len = len(dataset) - val_len
    dataset_train, dataset_val = torch.utils.data.dataset.random_split(
        dataset, [train_len, val_len])
    print('Train {}, val: {}'.format(len(dataset_train), len(dataset_val)))

    embeddings_dir = '/home/mattd/pycharm/yelp/embeddings.npy'
    embeddings = cuda(get_pretrained_embeddings(embeddings_dir, dataset))

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train, batch_size, shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val, batch_size, shuffle=False)

    vocab_size = len(dataset.vocab)
    padding_idx = dataset.vocab[SentenceDataset.PAD_TOKEN]
    init_idx = dataset.vocab[SentenceDataset.INIT_TOKEN]

    model = Seq2SeqModel(embeddings, hidden_size, padding_idx, init_idx,
                         max_len, teacher_forcing)
    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters, amsgrad=True,
                                 weight_decay=weight_decay)
    criterion = torch.nn.CrossEntropyLoss(
        ignore_index=dataset.vocab[SentenceDataset.PAD_TOKEN])

    phases = ['train', 'val', ]
    data_loaders = [data_loader_train, data_loader_val, ]

    lowest_loss = 500

    for epoch in range(nb_epochs):
        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = []
            for i, (inputs, targets) in enumerate(data_loader):
                optimizer.zero_grad()

                inputs = variable(inputs)
                targets = variable(targets)

                outputs = model(inputs, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)

                loss = criterion(outputs, targets)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters, max_grad_norm)
                    optimizer.step()

                epoch_loss.append(float(loss))

            epoch_loss = np.mean(epoch_loss)

            if epoch_loss < lowest_loss:
                save_checkpoint(model, epoch_loss, optimizer, model_filename)
                lowest_loss = epoch_loss

            if phase == 'train':
                print('Epoch {:03d} | {} loss: {:.3f}'.format(
                    epoch, phase, epoch_loss), end='')
            else:
                print(', {} loss: {:.3f}'.format(phase, epoch_loss), end='\n')

            # print a random validation sentence
            if phase == 'val':
                random_idx = np.random.randint(len(dataset_val))
                inputs, targets = dataset_val[random_idx]
                inputs_var = variable(inputs)

                # unsqueeze to get the batch dimension
                outputs_var = model(inputs_var.unsqueeze(0))
                outputs = argmax(outputs_var).squeeze(0).data.cpu().numpy()

                print(u'> {}'.format(get_sentence_from_indices(
                    inputs, dataset.vocab, SentenceDataset.EOS_TOKEN)))
                print(u'= {}'.format(get_sentence_from_indices(
                    targets, dataset.vocab, SentenceDataset.EOS_TOKEN)))
                print(u'< {}'.format(get_sentence_from_indices(
                    outputs, dataset.vocab, SentenceDataset.EOS_TOKEN)))
                print()
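# Sketch of the get_sentence_from_indices helper used above (assumption:
# its real implementation is defined elsewhere and not shown here). It maps
# token ids back to strings with the dataset vocab and stops at the
# end-of-sentence token. The reverse lookup attribute id2token is hypothetical.
def get_sentence_from_indices(indices, vocab, eos_token):
    tokens = []
    for idx in indices:
        token = vocab.id2token[int(idx)]  # assumed reverse id -> token mapping
        if token == eos_token:
            break
        tokens.append(token)
    return ' '.join(tokens)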
def build_graph(self):
    # Reset previous graph.
    reset_graph()

    # Placeholders.
    x_source = tf.placeholder(tf.int32, shape=[None, None], name="x_source")
    source_seq_length = tf.placeholder(tf.int32, shape=[None],
                                       name="source_seq_length")
    x_target = tf.placeholder(tf.int32, shape=[None, None], name="x_target")
    target_seq_length = tf.placeholder(tf.int32, shape=[None],
                                       name="target_seq_length")
    labels = tf.placeholder(tf.float32, shape=[None], name="labels")
    input_dropout = tf.placeholder_with_default(1.0, shape=[],
                                                name="input_dropout")
    output_dropout = tf.placeholder_with_default(1.0, shape=[],
                                                 name="output_dropout")
    decision_threshold = tf.placeholder_with_default(0.5, shape=[],
                                                     name="decision_threshold")

    # Embedding layer.
    with tf.variable_scope("embeddings"):
        if (self.config.source_embeddings_path is not None
                and self.config.target_embeddings_path is not None):
            # vocabularies are assumed to be stored on the model instance
            source_pretrained_embeddings, \
                target_pretrained_embeddings = get_pretrained_embeddings(
                    self.config.source_embeddings_path,
                    self.config.target_embeddings_path,
                    self.source_vocab,
                    self.target_vocab)
            assert (source_pretrained_embeddings.shape[1]
                    == target_pretrained_embeddings.shape[1])
            self.config.embedding_size = source_pretrained_embeddings.shape[1]

            if self.config.fix_pretrained:
                source_embeddings = tf.get_variable(
                    name="source_embeddings_matrix",
                    shape=[self.config.source_vocab_size,
                           self.config.embedding_size],
                    initializer=tf.constant_initializer(source_pretrained_embeddings),
                    trainable=False)
                target_embeddings = tf.get_variable(
                    name="target_embeddings_matrix",
                    shape=[self.config.target_vocab_size,
                           self.config.embedding_size],
                    initializer=tf.constant_initializer(target_pretrained_embeddings),
                    trainable=False)
            else:
                source_embeddings = tf.get_variable(
                    name="source_embeddings_matrix",
                    shape=[self.config.source_vocab_size,
                           self.config.embedding_size],
                    initializer=tf.constant_initializer(source_pretrained_embeddings))
                target_embeddings = tf.get_variable(
                    name="target_embeddings_matrix",
                    shape=[self.config.target_vocab_size,
                           self.config.embedding_size],
                    initializer=tf.constant_initializer(target_pretrained_embeddings))
        else:
            source_embeddings = tf.get_variable(
                name="source_embeddings_matrix",
                shape=[self.config.source_vocab_size,
                       self.config.embedding_size])
            target_embeddings = tf.get_variable(
                name="target_embeddings_matrix",
                shape=[self.config.target_vocab_size,
                       self.config.embedding_size])

        source_rnn_inputs = tf.nn.embedding_lookup(source_embeddings, x_source)
        target_rnn_inputs = tf.nn.embedding_lookup(target_embeddings, x_target)
        source_rnn_inputs = tf.nn.dropout(source_rnn_inputs,
                                          keep_prob=input_dropout,
                                          name="source_seq_embeddings")
        target_rnn_inputs = tf.nn.dropout(target_rnn_inputs,
                                          keep_prob=input_dropout,
                                          name="target_seq_embeddings")

    # BiRNN encoder.
with tf.variable_scope("birnn") as scope: if self.config.use_lstm: cell_fw = tf.nn.rnn_cell.LSTMCell(self.config.state_size, use_peepholes=True) cell_bw = tf.nn.rnn_cell.LSTMCell(self.config.state_size, use_peepholes=True) else: cell_fw = tf.nn.rnn_cell.GRUCell(self.config.state_size) cell_bw = tf.nn.rnn_cell.GRUCell(self.config.state_size) cell_fw = tf.nn.rnn_cell.DropoutWrapper(cell_fw, output_keep_prob=output_dropout) cell_bw = tf.nn.rnn_cell.DropoutWrapper(cell_bw, output_keep_prob=output_dropout) if self.config.num_layers > 1: if self.config.use_lstm: cell_fw = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.LSTMCell(self.config.state_size, use_peepholes=True) for _ in range(self.config.num_layers)]) cell_bw = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.LSTMCell(self.config.state_size, use_peepholes=True) for _ in range(self.config.num_layers)]) else: cell_fw = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.GRUCell(self.config.state_size) for _ in range(self.config.num_layers)]) cell_bw = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.GRUCell(self.config.state_size) for _ in range(self.config.num_layers)]) with tf.variable_scope(scope): source_rnn_outputs, source_final_state = tf.nn.bidirectional_dynamic_rnn( cell_fw=cell_fw, cell_bw=cell_bw, inputs=source_rnn_inputs, sequence_length=source_seq_length, dtype=tf.float32) with tf.variable_scope(scope, reuse=True): target_rnn_outputs, target_final_state = tf.nn.bidirectional_dynamic_rnn( cell_fw=cell_fw, cell_bw=cell_bw, inputs=target_rnn_inputs, sequence_length=target_seq_length, dtype=tf.float32) self.config.state_size *= 2 # Mean and max pooling only work for 1 layer BiRNN. if self.config.use_mean_pooling: source_final_state = self.average_pooling(source_rnn_outputs, source_seq_length) target_final_state = self.average_pooling(target_rnn_outputs, target_seq_length) elif self.config.use_max_pooling: source_final_state = self.max_pooling(source_rnn_outputs) target_final_state = self.max_pooling(target_rnn_outputs) else: source_final_state_fw, source_final_state_bw = source_final_state target_final_state_fw, target_final_state_bw = target_final_state if self.config.num_layers > 1: source_final_state_fw = source_final_state_fw[-1] source_final_state_bw = source_final_state_bw[-1] target_final_state_fw = target_final_state_fw[-1] target_final_state_bw = target_final_state_bw[-1] if self.config.use_lstm: source_final_state_fw = source_final_state_fw.h source_final_state_bw = source_final_state_bw.h target_final_state_fw = target_final_state_fw.h target_final_state_bw = target_final_state_bw.h source_final_state = tf.concat([source_final_state_fw, source_final_state_bw], axis=1, name="source_final_state_ph") target_final_state = tf.concat([target_final_state_fw, target_final_state_bw], axis=1) # Feed-forward neural network. 
with tf.variable_scope("feed_forward"): h_multiply = tf.multiply(source_final_state, target_final_state) h_abs_diff = tf.abs(tf.subtract(source_final_state, target_final_state)) W_1 = tf.get_variable(name="W_1", shape=[self.config.state_size, self.config.hidden_size]) W_2 = tf.get_variable(name="W_2", shape=[self.config.state_size, self.config.hidden_size]) b_1 = tf.get_variable(name="b_1", shape=[self.config.hidden_size], initializer=tf.constant_initializer(0.0)) h_semantic = tf.tanh(tf.matmul(h_multiply, W_1) + tf.matmul(h_abs_diff, W_2) + b_1) W_3 = tf.get_variable(name="W_3", shape=[self.config.hidden_size, 1]) b_2 = tf.get_variable(name="b_2", shape=[1], initializer=tf.constant_initializer(0.0)) logits = tf.matmul(h_semantic, W_3) + b_2 logits = tf.squeeze(logits, name="logits") # Sigmoid output layer. with tf.name_scope("output"): probs = tf.sigmoid(logits, name="probs") predicted_class = tf.cast(tf.greater(probs, decision_threshold), tf.float32, name="predicted_class") # Loss. with tf.name_scope("cross_entropy"): losses = tf.nn.sigmoid_cross_entropy_with_logits( logits=logits, labels=labels, name="cross_entropy_per_sequence") mean_loss = tf.reduce_mean(losses, name="cross_entropy_loss") # Optimization. with tf.name_scope("optimization"): global_step = tf.Variable(initial_value=0, trainable=False, name="global_step") optimizer = tf.train.AdamOptimizer(self.config.learning_rate) trainable_variables = tf.trainable_variables() gradients = tf.gradients(mean_loss, trainable_variables, name="gradients") clipped_gradients, global_norm = tf.clip_by_global_norm( gradients, clip_norm=self.config.max_gradient_norm, name="clipped_gradients") train_op = optimizer.apply_gradients(zip(clipped_gradients, trainable_variables), global_step=global_step) # Evaluation metrics. accuracy = tf.metrics.accuracy(labels, predicted_class, name="accuracy") precision = tf.metrics.precision(labels, predicted_class, name="precision") recall = tf.metrics.recall(labels, predicted_class, name="recall") # Add summaries. tf.summary.scalar("loss", mean_loss) tf.summary.scalar("global_norm", global_norm) tf.summary.scalar("accuracy", accuracy[0]) tf.summary.scalar("precision", precision[0]) tf.summary.scalar("recall", recall[0]) tf.summary.scalar("logits" + "/sparsity", tf.nn.zero_fraction(logits)) tf.summary.histogram("logits" + "/activations", logits) tf.summary.histogram("probs", probs) # Add histogram for trainable variables. for var in trainable_variables: tf.summary.histogram(var.op.name, var) # Add histogram for gradients. for grad, var in zip(clipped_gradients, trainable_variables): if grad is not None: tf.summary.histogram(var.op.name + "/gradients", grad) # Assign placeholders and operations. self.x_source = x_source self.x_target = x_target self.source_seq_length = source_seq_length self.target_seq_length = target_seq_length self.labels = labels self.input_dropout = input_dropout self.output_dropout = output_dropout self.decision_threshold = decision_threshold self.train_op = train_op self.probs = probs self.predicted_class = predicted_class self.mean_loss = mean_loss self.accuracy = accuracy self.precision = precision self.recall = recall self.summaries = tf.summary.merge_all() self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
def main():
    nb_epochs = 50
    batch_size = 500
    hidden_size = 256
    embedding_dim = 300
    pretrained_embeddings = "/embeddings_min2_max15.npy"
    max_grad_norm = 5
    max_len = 15
    min_count = 2
    weight_decay = 0.00001
    learning_rate = 0.001
    model_group = "/auto_encoder"
    autoencoder_name = "/auto_encoder_3"
    autoencoder_version = 1
    project_file = "/home/mattd/PycharmProjects/reddit"
    dataset_path = "/home/mattd/PycharmProjects/reddit/data/"

    string = 'nb_epochs: {}\nbatch_size: {}\nhidden_size: {}\nembedding_dim: ' \
             '{}\npretrained_embeddings: {}\nmax_len: {}\nmin_count: ' \
             '{}\nweight_decay: {}\nlearning_rate: {}\nmodel_group: ' \
             '{}\nautoencoder_name: {}\nautoencoder_version: {}\n'.format(
                 nb_epochs, batch_size, hidden_size, embedding_dim,
                 pretrained_embeddings, max_len, min_count, weight_decay,
                 learning_rate, model_group, autoencoder_name,
                 autoencoder_version)
    print(string)
    output = string + '\n'

    # embedding_filename = 'embeddings_20_1.npy'
    model_filename = '{}{}s{}'.format(
        project_file, model_group, autoencoder_name)
    new_model_filename = '{}_{}'.format(model_filename, autoencoder_version)
    output_file = '{}{}_outputs{}_{}'.format(
        project_file, model_group, autoencoder_name, autoencoder_version)
    description_filename = \
        '{}/description/description_1.txt'.format(project_file)

    # eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'
    dataset_train_filename = "{}train.csv".format(dataset_path)
    dataset_val_filename = "{}validation.csv".format(dataset_path)

    dataset_train = SentenceDataset(dataset_train_filename, max_len, min_count)
    dataset_val = SentenceDataset(dataset_val_filename, max_len, min_count,
                                  dataset_train.vocab)

    string = 'Train {}, val: {}'.format(len(dataset_train), len(dataset_val))
    print(string)
    output += string + '\n'

    # get pretrained embeddings
    if pretrained_embeddings is not None:
        embeddings_dir = '{}{}'.format(project_file, pretrained_embeddings)
        pretrained_embeddings = cuda(
            get_pretrained_embeddings(embeddings_dir))
        embedding_dim = pretrained_embeddings.shape[1]

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train, batch_size, shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val, batch_size, shuffle=False)

    vocab_size = len(dataset_val.vocab)
    padding_idx = dataset_val.vocab[SentenceDataset.PAD_TOKEN]
    init_idx = dataset_val.vocab[SentenceDataset.INIT_TOKEN]

    model = Seq2SeqModel(hidden_size, padding_idx, init_idx, max_len,
                         vocab_size, embedding_dim, pretrained_embeddings)
    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(
        parameters, amsgrad=True, weight_decay=weight_decay, lr=learning_rate)
    criterion = torch.nn.CrossEntropyLoss(
        ignore_index=dataset_val.vocab[SentenceDataset.PAD_TOKEN])

    model, optimizer, lowest_loss, description, last_epoch, \
        train_loss, val_loss, found_model = load_checkpoint(
            model_filename, model, optimizer)

    if found_model:
        string = 'Loaded Model:\nlowest_validation_loss: {}\ndescription: {}' \
                 '\nlast_epoch:{}\n'.format(lowest_loss, description, last_epoch)
    else:
        string = 'No model found at {}\n'.format(model_filename)
    print(string)
    output = output + string + '\n'

    outfile = open(output_file, 'w')
    outfile.write(output)
    outfile.close()

    phases = ['train', 'val', ]
    data_loaders = [data_loader_train, data_loader_val, ]
    intervals = 6

    for epoch in range(last_epoch, last_epoch + nb_epochs):
        start = time.clock()

        #if epoch == 6:
        #    model.unfreeze_embeddings()
        #    parameters = list(model.parameters())
        #    optimizer = torch.optim.Adam(
        #        parameters, amsgrad=True, weight_decay=weight_decay)

        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = []
            epoch_sentence_accuracy = []
            epoch_token_accuracy = []
            j = 1

            for i, inputs in tqdm(enumerate(data_loader)):
                optimizer.zero_grad()

                # autoencoder: the targets are the inputs themselves
                inputs = variable(inputs)
                targets = variable(inputs)

                outputs = model.auto_encoder(inputs, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)

                loss = criterion(outputs, targets)
                epoch_loss.append(float(loss))
                average_epoch_loss = np.mean(epoch_loss)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters, max_grad_norm)
                    optimizer.step()

                    if (len(data_loader) / intervals) * j <= i + 1:
                        train_loss.append(average_epoch_loss)
                        string = ('Epoch {:03d} Example {:03d} | {} loss: '
                                  '{:.3f}'.format(epoch, i, phase,
                                                  average_epoch_loss))
                        print(string, end='\n')
                        output = output + string + '\n'
                        j += 1
                else:
                    predicted = torch.argmax(
                        outputs.view(-1, max_len, vocab_size), -1)
                    batch_sentence_accuracy, batch_token_accuracy = encoder_accuracy(
                        targets.view(-1, max_len), predicted)
                    epoch_sentence_accuracy.append(batch_sentence_accuracy)
                    epoch_token_accuracy.append(batch_token_accuracy)

            if phase == 'val':
                average_epoch_sentence_accuracy = np.mean(epoch_sentence_accuracy)
                average_epoch_token_accuracy = np.mean(epoch_token_accuracy)
                time_taken = time.clock() - start

                val_loss.append(average_epoch_loss)
                string = ' {} loss: {:.3f} | time: {:.3f}'.format(
                    phase, average_epoch_loss, time_taken)
                print(string, end='')
                output = output + '\n' + string + '\n'

                string = '| sentence accuracy:{:.3f}| token accuracy:{:.3f}'.format(
                    average_epoch_sentence_accuracy, average_epoch_token_accuracy)
                print(string, end='\n')
                output = output + string + '\n'

                if average_epoch_loss < lowest_loss:
                    save_checkpoint(
                        model, average_epoch_loss, optimizer,
                        new_model_filename, description_filename, epoch,
                        train_loss, val_loss)
                    lowest_loss = average_epoch_loss

                # print a random reconstructed sentence
                random_idx = np.random.randint(len(dataset_val))
                inputs = dataset_val[random_idx]
                targets = inputs
                inputs_var = variable(inputs)

                # unsqueeze to get the batch dimension
                outputs_var = model.auto_encoder(inputs_var.unsqueeze(0))
                outputs = argmax(outputs_var).squeeze(0).data.cpu().numpy()

                string = '> {}\n'.format(get_sentence_from_indices(
                    inputs, dataset_val.vocab, SentenceDataset.EOS_TOKEN))
                string = string + u'= {}\n'.format(get_sentence_from_indices(
                    targets, dataset_val.vocab, SentenceDataset.EOS_TOKEN))
                string = string + u'< {}'.format(get_sentence_from_indices(
                    outputs, dataset_val.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                output = output + string + '\n' + '\n'

        outfile = open(output_file, 'w')
        outfile.write(output)
        outfile.close()
def main():
    nb_epochs = 100
    batch_size = 500
    hidden_size = 256
    embedding_dim = 300
    pretrained_embeddings = None
    max_len = 20
    min_count = 2
    max_grad_norm = 5
    val_len = 10000
    weight_decay = 0.00001
    model_filename = '/home/mattd/pycharm/encoder/models3' \
                     '/Baseline'
    description_filename = \
        '/home/mattd/pycharm/encoder/description/description2.txt'
    output_file = '/home/mattd/pycharm/encoder/model_outputs_3/baseline'
    outfile = open(output_file, 'w')

    eng_fr_filename = '/home/okovaleva/projects/forced_apart/autoencoder/data' \
                      '/train_1M.txt'
    dataset = SentenceDataset(eng_fr_filename, max_len, min_count)

    string = 'Dataset: {}'.format(len(dataset))
    print(string)
    outfile.write(string + '\n')

    train_len = len(dataset) - val_len
    dataset_train, dataset_val = torch.utils.data.dataset.random_split(
        dataset, [train_len, val_len])

    string = 'Train {}, val: {}'.format(len(dataset_train), len(dataset_val))
    print(string)
    outfile.write(string + '\n')

    embeddings_dir = '/home/mattd/pycharm/encoder' \
                     '/embeddings_3min.npy'
    pretrained_embeddings = cuda(
        get_pretrained_embeddings(embeddings_dir, dataset))
    embedding_dim = pretrained_embeddings.shape[1]

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train, batch_size, shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val, batch_size, shuffle=False)

    vocab_size = len(dataset.vocab)
    padding_idx = dataset.vocab[SentenceDataset.PAD_TOKEN]
    init_idx = dataset.vocab[SentenceDataset.INIT_TOKEN]

    model = Seq2SeqModel(
        pretrained_embeddings, hidden_size, padding_idx, init_idx,
        max_len, vocab_size, embedding_dim)
    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters, amsgrad=True,
                                 weight_decay=weight_decay)
    criterion = torch.nn.CrossEntropyLoss(
        ignore_index=dataset.vocab[SentenceDataset.PAD_TOKEN])

    model, optimizer, lowest_loss, description, last_epoch, \
        train_loss, val_loss = load_checkpoint(model_filename, model, optimizer)
    print(description)

    phases = ['train', 'val', ]
    data_loaders = [data_loader_train, data_loader_val, ]

    for epoch in range(last_epoch, last_epoch + nb_epochs):
        start = time.clock()

        #if epoch == 6:
        #    model.unfreeze_embeddings()
        #    parameters = list(model.parameters())
        #    optimizer = torch.optim.Adam(
        #        parameters, amsgrad=True, weight_decay=weight_decay)

        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = []
            epoch_sentence_accuracy = []
            epoch_token_accuracy = []

            for i, inputs in enumerate(data_loader):
                optimizer.zero_grad()

                # autoencoder: the targets are the inputs themselves
                inputs = variable(inputs)
                targets = variable(inputs)

                outputs = model(inputs, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)

                loss = criterion(outputs, targets)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters, max_grad_norm)
                    optimizer.step()

                if phase == 'val':
                    predicted = torch.argmax(
                        outputs.view(batch_size, max_len, -1), -1)
                    batch_sentence_accuracy, batch_token_accuracy = accuracy(
                        targets.view(batch_size, -1), predicted)
                    epoch_sentence_accuracy.append(batch_sentence_accuracy)
                    epoch_token_accuracy.append(batch_token_accuracy)

                epoch_loss.append(float(loss))

            epoch_loss = np.mean(epoch_loss)

            if phase == 'train':
                train_loss.append(epoch_loss)
                string = ('Epoch {:03d} | {} loss: {:.3f}'.format(
                    epoch, phase, epoch_loss))
                print(string, end='\n')
                outfile.write(string + '\n')
            else:
                average_epoch_sentence_accuracy = \
                    sum(epoch_sentence_accuracy) / len(epoch_sentence_accuracy)
                average_epoch_token_accuracy = \
                    sum(epoch_token_accuracy) / len(epoch_token_accuracy)
                time_taken = time.clock() - start

                val_loss.append(epoch_loss)
                string = ' {} loss: {:.3f} | time: {:.3f}'.format(
                    phase, epoch_loss, time_taken)
                print(string, end='')
                string = '| sentence accuracy:{:.3f}| token accuracy:{:.3f}'.format(
                    average_epoch_sentence_accuracy, average_epoch_token_accuracy)
                print(string, end='\n')
                outfile.write(string + '\n')

            if epoch_loss < lowest_loss:
                save_checkpoint(
                    model, epoch_loss, optimizer, model_filename,
                    description_filename, epoch, train_loss, val_loss)
                lowest_loss = epoch_loss

            # print a random reconstructed sentence
            if phase == 'val':
                random_idx = np.random.randint(len(dataset_val))
                inputs = dataset_val[random_idx]
                targets = inputs
                inputs_var = variable(inputs)

                # unsqueeze to get the batch dimension
                outputs_var = model(inputs_var.unsqueeze(0))
                outputs = argmax(outputs_var).squeeze(0).data.cpu().numpy()

                string = '> {}'.format(get_sentence_from_indices(
                    inputs, dataset.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                outfile.write(string + '\n')
                string = u'= {}'.format(get_sentence_from_indices(
                    targets, dataset.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                outfile.write(string + '\n')
                string = u'< {}'.format(get_sentence_from_indices(
                    outputs, dataset.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                outfile.write(string + '\n')
                print()

    outfile.close()
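# Sketch of the sentence/token accuracy helper called above (the original
# implementation of accuracy / encoder_accuracy is not shown, so this is an
# assumed version that ignores padding handling for simplicity). Token
# accuracy is the fraction of matching positions; sentence accuracy is the
# fraction of sequences that match at every position.
import torch

def encoder_accuracy(targets, predicted):
    # targets, predicted: LongTensors of shape (batch_size, seq_len)
    token_matches = (targets == predicted).float()
    token_accuracy = token_matches.mean().item()
    # a sentence counts as correct only if every token position matches
    sentence_accuracy = token_matches.min(dim=1)[0].mean().item()
    return sentence_accuracy, token_accuracy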
def main():
    file = {
        "model_group": "/seq_len_exp",
        "model_name": "/generation_6",
        "old_model_name": None,
        "model_version": 0,
        "project_file": "/home/mattd/PycharmProjects/reddit/generation"
    }
    file["dataset_path"] = "{}/data/".format(file["project_file"])
    file["model_filename"] = '{}{}s{}_{}'.format(
        file["project_file"], file["model_group"], file["model_name"],
        file["model_version"])
    file["output_file"] = '{}{}_outputs{}_{}'.format(
        file["project_file"], file["model_group"], file["model_name"],
        file["model_version"])
    # check_files(file)

    use_old_model = file["old_model_name"] is not None
    params = {}
    if use_old_model:
        file["old_model_filename"] = '{}{}s{}'.format(
            file["project_file"], file["model_group"], file["old_model_name"])
        params, old_files = load_params(file["old_model_filename"])
        use_old_model = old_files != {}

    if not use_old_model:
        params = {
            "batch_size": 1000,
            "hidden_size": 256,
            "embedding_dim": 300,
            "pretrained_embeddings": True,
            "max_grad_norm": 5,
            "max_len": 30,
            "min_count": 2,
            "weight_decay": 0.00001,
            "learning_rate": 0.005,
        }

    params["num_training_examples"] = 78260
    params["num_val_examples"] = -1
    params["nb_epochs"] = 40

    if params["pretrained_embeddings"]:
        file["pretrained_embeddings_file"] = \
            "/embeddings/embeddings_min{}_max{}.npy".format(
                params["min_count"], params["max_len"])

    string = ""
    for k, v in file.items():
        string += "{}: {}\n".format(k, v)
    for k, v in params.items():
        string += "{}: {}\n".format(k, v)
    print(string)
    output = string + '\n'

    # eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'
    dataset_train_filename = "{}train.csv".format(file["dataset_path"])
    dataset_val_filename = "{}validation.csv".format(file["dataset_path"])

    dataset_train = PairsDataset(dataset_train_filename, params["max_len"],
                                 params["min_count"])
    dataset_val = PairsDataset(dataset_val_filename, params["max_len"],
                               params["min_count"], dataset_train.vocab)

    string = 'Vocab size {}\n'.format(len(dataset_train.vocab))
    string += 'Train {} '.format(len(dataset_train))
    if params["num_training_examples"] != -1:
        dataset_train.prune_examples(params["num_training_examples"])
        string += '-> {}'.format(len(dataset_train))
    string += '\nVal: {}'.format(len(dataset_val))
    if params["num_val_examples"] != -1:
        dataset_val.prune_examples(params["num_val_examples"])
        string += '-> {}'.format(len(dataset_val))
    print(string)
    output += string + '\n'

    if params["pretrained_embeddings"]:
        embeddings_dir = '{}{}'.format(file["project_file"],
                                       file["pretrained_embeddings_file"])
        pretrained_embeddings = cuda(get_pretrained_embeddings(embeddings_dir))
        params["embedding_dim"] = pretrained_embeddings.shape[1]
    else:
        pretrained_embeddings = None

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train, params["batch_size"], shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val, params["batch_size"], shuffle=False)

    vocab_size = len(dataset_train.vocab)
    padding_idx = dataset_train.vocab[PairsDataset.PAD_TOKEN]
    init_idx = dataset_train.vocab[PairsDataset.INIT_TOKEN]

    model = Seq2SeqModel(params["hidden_size"], padding_idx, init_idx,
                         params["max_len"], vocab_size,
                         params["embedding_dim"], pretrained_embeddings)
    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters, amsgrad=True,
                                 weight_decay=params["weight_decay"],
                                 lr=params["learning_rate"])
    criterion = torch.nn.CrossEntropyLoss()

    if use_old_model:
        model, optimizer = load_checkpoint(file["old_model_filename"],
                                           model, optimizer)

    lowest_loss = 100
    train_loss = []
    val_loss = []
    best_model = model
    best_optimizer = optimizer
    best_epoch = 0
    average_epoch_loss = 0
    metrics = {"accuracy": [], "precision": [], "recall": [], "f1": []}

    outfile = open(file["output_file"], 'w')
    outfile.write(output)
    outfile.close()

    phases = ['train', 'val', ]
    data_loaders = [data_loader_train, data_loader_val, ]
    intervals = 2
    highest_acc = 0

    for epoch in range(0, params["nb_epochs"]):
        start = time.clock()

        string = 'Epoch: {}\n'.format(epoch)
        print(string, end='')
        output = output + '\n' + string

        #if epoch == 6:
        #    model.unfreeze_embeddings()
        #    parameters = list(model.parameters())
        #    optimizer = torch.optim.Adam(
        #        parameters, amsgrad=True, weight_decay=weight_decay)

        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
                string = 'Train: \n'
            else:
                model.eval()
                string = 'Validation \n'
            print(string, end='')
            output = output + '\n' + string

            epoch_loss = []
            epoch_accuracy = []
            epoch_precision = []
            epoch_recall = []
            epoch_f1 = []
            j = 1

            for i, (sentence_1, sentence_2, labels) in tqdm(enumerate(data_loader)):
                optimizer.zero_grad()

                sentence_1 = variable(sentence_1)
                sentence_2 = variable(sentence_2)
                targets = variable(labels)

                outputs = model(sentence_1, sentence_2, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)

                loss = criterion(outputs, targets)
                epoch_loss.append(float(loss))
                average_epoch_loss = np.mean(epoch_loss)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters,
                                                   params["max_grad_norm"])
                    optimizer.step()

                    if (len(data_loader) / intervals) * j <= i + 1:
                        string = ('Example {:03d} | {} loss: {:.3f}'.format(
                            i, phase, average_epoch_loss))
                        print(string, end='\n')
                        output = output + string + '\n'
                        j += 1
                else:
                    # get result metrics
                    accuracy, precision, recall, f1 = classifier_accuracy(
                        targets.cpu().numpy(),
                        torch.argmax(outputs.view(-1, 2), -1).cpu().numpy())
                    #print('{},{},{},{}'.format(accuracy, precision, recall, f1))
                    epoch_accuracy.append(accuracy)
                    epoch_precision.append(precision)
                    epoch_recall.append(recall)
                    epoch_f1.append(f1)

            if phase == 'val':
                time_taken = time.clock() - start

                val_loss.append(average_epoch_loss)
                string = ' {} loss: {:.3f} | time: {:.3f}'.format(
                    phase, average_epoch_loss, time_taken)
                string += ' | lowest loss: {:.3f} highest accuracy:' \
                          ' {:.3f}'.format(lowest_loss, highest_acc)
                print(string, end='\n')
                output = output + '\n' + string + '\n'

                average_epoch_accuracy = np.mean(epoch_accuracy)
                average_epoch_precision = np.mean(epoch_precision)
                average_epoch_recall = np.mean(epoch_recall)
                average_epoch_f1 = np.mean(epoch_f1)

                metrics["accuracy"].append(average_epoch_accuracy)
                metrics["precision"].append(average_epoch_precision)
                metrics["recall"].append(average_epoch_recall)
                metrics["f1"].append(average_epoch_f1)

                if average_epoch_loss < lowest_loss:
                    best_model = model
                    best_optimizer = optimizer
                    best_epoch = epoch
                    lowest_loss = average_epoch_loss

                save_checkpoint(best_epoch, best_model, best_optimizer, epoch,
                                model, optimizer, train_loss, val_loss,
                                metrics, params, file)

                if average_epoch_accuracy > highest_acc:
                    highest_acc = average_epoch_accuracy

                string = "Accuracy: {:.3f}\nPrecision: {:.3f}\nRecall: {:.3f}\n" \
                         "F1: {:.3f}\n".format(
                             average_epoch_accuracy, average_epoch_precision,
                             average_epoch_recall, average_epoch_f1)
                print(string, end='\n')
                output = output + string + '\n'

                # print a random sentence pair and its prediction
                random_idx = np.random.randint(len(dataset_val))
                sentence_1, sentence_2, labels = dataset_val[random_idx]
                targets = labels
                sentence_1_var = variable(sentence_1)
                sentence_2_var = variable(sentence_2)

                # unsqueeze to get the batch dimension
                outputs_var = model(sentence_1_var.unsqueeze(0),
                                    sentence_2_var.unsqueeze(0))
                outputs = outputs_var.squeeze(0).data.cpu().numpy()

                string = '> {}\n'.format(get_sentence_from_indices(
                    sentence_1, dataset_val.vocab, PairsDataset.EOS_TOKEN))
                string = string + u'> {}\n'.format(get_sentence_from_indices(
                    sentence_2, dataset_val.vocab, PairsDataset.EOS_TOKEN))
                string = string + u'target: {} | P false: {:.3f}, P true:' \
                                  u' {:.3f}'.format(targets, float(outputs[0]),
                                                    float(outputs[1]))
                print(string, end='\n\n')
                output = output + string + '\n' + '\n'
            else:
                train_loss.append(average_epoch_loss)

        outfile = open(file["output_file"], 'w')
        outfile.write(output)
        outfile.close()
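# Sketch of the classifier_accuracy helper used above (assumption: the real
# helper is defined elsewhere and not shown). It returns accuracy, precision,
# recall and F1 for the binary pair-classification targets/predictions, here
# computed with sklearn as one plausible implementation.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def classifier_accuracy(targets, predictions):
    # targets, predictions: 1-D numpy arrays of 0/1 labels
    accuracy = accuracy_score(targets, predictions)
    precision = precision_score(targets, predictions)
    recall = recall_score(targets, predictions)
    f1 = f1_score(targets, predictions)
    return accuracy, precision, recall, f1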