def word_assoc_score(self, source_idx, target_idx, relation):
    """
    NOTE THAT DROPOUT IS BEING APPLIED HERE
    :param source_idx: embedding index of source atom
    :param target_idx: embedding index of target atom
    :param relation: relation type
    :return: score
    """
    # prepare
    s = self.embeddings[source_idx]
    if self.no_assoc:
        A = dy.const_parameter(self.word_assoc_weights[relation])
    else:
        A = dy.parameter(self.word_assoc_weights[relation])
    # dropout returns a new expression, so it must be re-assigned to take effect
    A = dy.dropout(A, self.dropout)
    t = self.embeddings[target_idx]

    # compute
    if self.mode == BILINEAR_MODE:
        return dy.transpose(s) * A * t
    elif self.mode == DIAG_RANK1_MODE:
        diag_A = dyagonalize(A[0])
        rank1_BC = A[1] * dy.transpose(A[2])
        ABC = diag_A + rank1_BC
        return dy.transpose(s) * ABC * t
    elif self.mode == TRANSLATIONAL_EMBED_MODE:
        return -dy.l2_norm(s - t + A)
    elif self.mode == DISTMULT:
        return dy.sum_elems(dy.cmult(dy.cmult(s, A), t))
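# A minimal, self-contained sketch of the bilinear branch above (s^T A t), assuming
# toy sizes and names (VOCAB, DIM, emb, rel_weights) that are not part of the original.
import dynet as dy

VOCAB, DIM = 100, 8
m = dy.ParameterCollection()
emb = m.add_lookup_parameters((VOCAB, DIM))
rel_weights = {'hypernym': m.add_parameters((DIM, DIM))}

dy.renew_cg()
s = emb[3]
t = emb[7]
# trainable relation matrix (gradients flow) vs. frozen one (const_parameter)
A_train = dy.parameter(rel_weights['hypernym'])
A_fixed = dy.const_parameter(rel_weights['hypernym'])
A_train = dy.dropout(A_train, 0.2)        # dropout must be re-assigned to take effect
score = dy.transpose(s) * A_train * t     # 1x1 bilinear score
print(score.value())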
def __call__(self, word, alternative=None, const=False):
    idx = self.vocab.get(word, 0)
    if idx == 0 and alternative is not None:
        # back off to the first alternative word found in the vocabulary
        for word_i in alternative:
            idx = self.vocab.get(word_i, 0)
            if idx != 0:
                break
    return self.lookup[idx] if not const else dn.transpose(
        dn.const_parameter(self.lookup))[idx]
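# A minimal, hedged sketch of what the `const` flag above changes. The table size,
# index, and variable names (pc, lookup, idx) are illustrative, not from the original class.
import dynet as dn

pc = dn.ParameterCollection()
lookup = pc.add_lookup_parameters((3, 4))   # (vocab size, embedding dim)

dn.renew_cg()
idx = 2
updatable = lookup[idx]                                  # gradients can update this row
frozen = dn.transpose(dn.const_parameter(lookup))[idx]   # same row read through a constant view of the table, as in the const branch above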
import dynet as dy

# `sizes`, `number_of_nodes`, `maximum`, `matrices`, `layer_nodes`, and
# `input_layer` are defined elsewhere in the original script.
number_of_learning_cycles = 2
m = dy.Model()
input = dy.vecInput(sizes[0])
trainer = dy.SimpleSGDTrainer(m)
values = [0] * number_of_nodes
weight = [0] * maximum
bias = [0] * maximum
a = 0
for i in range(maximum):
    if i == 0:
        a = matrices[i]
        weight[i] = m.add_parameters(a.shape, init=a)
        bias[i] = m.add_parameters(len(layer_nodes[i + 1]))
        # constant copy of the same matrix, used to gate the trainable weight
        con = dy.const_parameter(m.add_parameters(a.shape, init=a))
        weight[i] = dy.cmult(weight[i], con)
        result = weight[i] * input
        result = dy.logistic(result + bias[i])
        for j in range(len(layer_nodes[i + 1])):
            values[layer_nodes[i + 1][j]] = result[j]
    else:
        inp = []
        for node in input_layer[i]:
            inp.append(values[node])
        a = matrices[i]
        weight[i] = m.add_parameters(a.shape, init=a)
        inp = dy.concatenate(inp)
        bias[i] = m.add_parameters(len(layer_nodes[i + 1]))
        con = dy.const_parameter(m.add_parameters(a.shape, init=a))
        weight[i] = dy.cmult(weight[i], con)
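# A minimal sketch of the gating pattern above: a trainable weight is multiplied
# element-wise by a constant matrix built from the same init array. The sizes and
# the 0/1 mask are illustrative, not taken from the original script.
import numpy as np
import dynet as dy

mask = np.array([[1., 0., 1.],
                 [0., 1., 1.]])                    # hypothetical connectivity pattern
m = dy.Model()
w_p = m.add_parameters(mask.shape, init=mask)      # trainable weight
mask_p = m.add_parameters(mask.shape, init=mask)   # holds the fixed pattern

dy.renew_cg()
w = dy.parameter(w_p)
frozen = dy.const_parameter(mask_p)   # loaded as a constant: never updated
gated = dy.cmult(w, frozen)           # masked weight; zero entries stay zero
x = dy.inputVector([1., 2., 3.])
y = dy.logistic(gated * x)
print(y.npvalue())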
def predict(self, features, task_name, train=False):
    """
    Steps through the computation graph and obtains predictions for the
    provided input features.
    :param features: a list of word embeddings for every word in the sequence
    :param task_name: the name of the task that should be predicted
    :param train: if the model is training; apply noise in this case
    :return: output: the output predictions
             penalty: the summed subspace penalty (0 if no constraint)
    """
    if train:  # noise is added only at training time
        features = [dynet.noise(fe, self.noise_sigma) for fe in features]

    num_layers = self.h_layers
    # only with cross-stitch units do we have a layer for each task;
    # otherwise we just have one layer for all tasks
    # (num_task_layers is not defined in this excerpt; inferred from the comment above)
    num_task_layers = len(self.task_names) if self.cross_stitch else 1
    inputs = [features] * len(self.task_names)
    inputs_rev = [features] * len(self.task_names)
    target_task_id = self.task_names.index(task_name) if self.cross_stitch else 0

    # collect the forward and backward sequences for each task at every
    # layer for the layer connection units
    layer_forward_sequences = []
    layer_backward_sequences = []
    penalty = dynet.const_parameter(self.subspace_penalty)
    for i in range(0, num_layers):
        forward_sequences = []
        backward_sequences = []
        for j in range(num_task_layers):
            predictor = self.predictors['inner'][i][j]
            forward_sequence, backward_sequence = predictor.predict_sequence(
                inputs[j], inputs_rev[j])
            if i > 0 and self.activation:
                # activation between LSTM layers
                forward_sequence = [self.activation(s) for s in forward_sequence]
                backward_sequence = [self.activation(s) for s in backward_sequence]
            forward_sequences.append(forward_sequence)
            backward_sequences.append(backward_sequence)
            if self.num_subspaces == 2 and self.constraint_weight != 0:
                # returns a list per layer, i.e. here a list with one item
                lstm_parameters = \
                    predictor.builder.get_parameter_expressions()[0]
                # lstm parameters consist of these weights:
                # Wix,Wih,Wic,bi,Wox,Woh,Woc,bo,Wcx,Wch,bc
                for param_idx in range(len(lstm_parameters)):
                    if param_idx in self.constrain_matrices:
                        W = lstm_parameters[param_idx]
                        W_shape = np.array(W.value()).shape
                        if len(W_shape) < 2:
                            W_shape = [W_shape[0], 1]
                        # split the matrix into its two subspaces
                        # (integer division; dynet.reshape expects int dimensions)
                        W_subspaces = dynet.reshape(
                            W, (self.num_subspaces,
                                W_shape[0] // self.num_subspaces,
                                W_shape[1]))
                        subspace_1, subspace_2 = W_subspaces[0], W_subspaces[1]
                        # calculate the matrix product of the two subspaces
                        matrix_product = dynet.transpose(subspace_1) * subspace_2
                        # take the squared Frobenius norm by squaring
                        # every element and then summing them
                        squared_frobenius_norm = dynet.sum_elems(
                            dynet.square(matrix_product))
                        penalty += squared_frobenius_norm
        if self.cross_stitch:
            # takes as input a list of input lists and produces a list of
            # outputs where the index indicates the task
            forward_sequences = self.predictors['cross_stitch'][i].stitch(
                forward_sequences)
            backward_sequences = self.predictors['cross_stitch'][i].stitch(
                backward_sequences)
        inputs = forward_sequences
        inputs_rev = backward_sequences
        layer_forward_sequences.append(forward_sequences)
        layer_backward_sequences.append(backward_sequences)

        if i == num_layers - 1:
            output_predictor = self.predictors['output_layers_dict'][task_name]
            # get the forward/backward states of all task layers
            task_forward_sequences = [
                layer_seq_list[target_task_id][-1]
                for layer_seq_list in layer_forward_sequences]
            task_backward_sequences = [
                layer_seq_list[target_task_id][0]
                for layer_seq_list in layer_backward_sequences]
            if num_layers > 1:
                forward_input = self.predictors['layer_stitch'][
                    target_task_id].stitch(task_forward_sequences)
                backward_input = self.predictors['layer_stitch'][
                    target_task_id].stitch(task_backward_sequences)
            else:
                forward_input = task_forward_sequences[0]
                backward_input = task_backward_sequences[0]
            concat_layer = dynet.concatenate([forward_input, backward_input])
            if train and self.noise_sigma > 0.0:
                concat_layer = dynet.noise(concat_layer, self.noise_sigma)
            output = []
            if 'sentiment' in task_name:
                # multi-label: one output layer per label
                for k in range(len(output_predictor)):
                    output.append(output_predictor[k](concat_layer))
            else:
                output.append(output_predictor(concat_layer))
            # output = output_predictor.predict_sequence(concat_layer)
            return output, penalty
    raise Exception('Error: This place should not be reached.')
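# An isolated, hedged sketch of the subspace penalty accumulated above: a weight
# matrix is split into two row-blocks and the squared Frobenius norm of their
# product is added to a penalty that starts from a constant zero parameter. The
# sizes and parameter names are illustrative, not from the original model.
import dynet

pc = dynet.ParameterCollection()
W_p = pc.add_parameters((8, 6))                          # stand-in LSTM weight
zero_p = pc.add_parameters(1, init=dynet.ConstInitializer(0))

dynet.renew_cg()
penalty = dynet.const_parameter(zero_p)                  # like self.subspace_penalty above
W = dynet.parameter(W_p)
num_subspaces, rows, cols = 2, 8, 6
W_subspaces = dynet.reshape(W, (num_subspaces, rows // num_subspaces, cols))
subspace_1, subspace_2 = W_subspaces[0], W_subspaces[1]
product = dynet.transpose(subspace_1) * subspace_2
penalty += dynet.sum_elems(dynet.square(product))        # squared Frobenius norm
print(penalty.value())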
def fit(self, train_languages, num_epochs, patience, optimizer, train_dir,
        dev_dir):
    """
    Train the model and return the train and dev scores.
    :param train_languages: the languages used for training
    :param num_epochs: the max number of epochs the model should be trained for
    :param patience: the patience to use for early stopping
    :param optimizer: the optimizer that should be used
    :param train_dir: the directory containing the training files
    :param dev_dir: the directory containing the development files
    """
    # check whether this is a freshly initialized or a loaded model
    first_train = self.best_epoch == -1

    print('Reading training data from %s...' % train_dir, flush=True)
    train_X, train_Y, word2id = get_data(train_languages, self.task_names,
                                         word2id=self.word2id,
                                         task2label2id=self.task2label2id,
                                         data_dir=train_dir,
                                         train=first_train)
    print('Finished reading training data')
    print('Reading development data from %s...' % dev_dir, flush=True)
    dev_X, dev_Y, _ = get_data(train_languages, self.task_names, word2id,
                               self.task2label2id, data_dir=dev_dir,
                               train=False)
    print('Finished reading development data')
    print('Length of training data:', len(train_X), flush=True)
    print('Length of development data:', len(dev_X), flush=True)

    if first_train:
        self.word2id = word2id
        print('Building the computation graph...', flush=True)
        self.predictors = self.build_computation_graph()

    if optimizer == SGD:
        trainer = dynet.SimpleSGDTrainer(self.model)
    elif optimizer == ADAM:
        trainer = dynet.AdamTrainer(self.model)
    else:
        raise ValueError('%s is not a valid optimizer.' % optimizer)

    train_data = list(zip(train_X, train_Y))
    num_iterations = 0
    num_epochs_no_improvement = 0
    train_score = {}
    dev_score = {}

    print('Training model with %s for %d epochs and patience of %d.' %
          (optimizer, num_epochs, patience))
    for epoch in range(self.best_epoch + 1, num_epochs):
        print('', flush=True)
        bar = Bar('Training epoch %d/%d...' % (epoch + 1, num_epochs),
                  max=len(train_data), flush=True)

        # keep track of the number of updates, the total loss, and the total
        # number of predicted instances per task
        task2num_updates = {task: 0 for task in self.task_names}
        task2total_loss = {task: 0.0 for task in self.task_names}
        task2total_predicted = {task: 0.0 for task in self.task_names}
        total_loss = 0.0
        total_penalty = 0.0
        total_predicted = 0.0
        random.shuffle(train_data)

        # for every instance, we optimize the loss of the corresponding task
        for word_indices, task2label_id_seq in train_data:
            # get the concatenated word and char-based features for every
            # word in the sequence
            features = self.get_word_features(word_indices)
            for task, y in task2label_id_seq.items():
                output, penalty = self.predict(features, task, train=True)
                if task not in TASK_NAMES:
                    raise NotImplementedError(
                        'Task %s has not been implemented yet.' % task)
                loss = dynet.esum([pick_neg_log(o, gold)
                                   for o, gold in zip(output, y)])
                lv = loss.value()
                # sum the loss and the subspace constraint penalty, scaled by
                # a constant (non-updatable) weight
                combined_loss = loss + dynet.const_parameter(
                    self.constraint_weight_param) * penalty
                total_loss += lv
                total_penalty += penalty.value()
                total_predicted += 1
                task2total_loss[task] += lv
                task2total_predicted[task] += 1
                task2num_updates[task] += 1

                # back-propagate through the combined loss
                combined_loss.backward()
                trainer.update()
            bar.next()
            num_iterations += 1

        print('\nEpoch %d. Loss per instance: %.3f. Penalty per instance: %.3f. '
              % (epoch + 1, total_loss / total_predicted,
                 total_penalty / total_predicted),
              end='', flush=True)
        print('Loss per instance by task: ')
        for task in task2total_loss.keys():
            print('%s: %.3f. '
                  % (task, task2total_loss[task] / task2total_predicted[task]),
                  end='', flush=True)
        print('', flush=True)

        # evaluate after every epoch; each list stores one value per training language
        avg_train_score_by_task_list = []  # average train score by task
        avg_dev_score_by_task_list = []    # average dev score by task
        train_data_size_list = []          # size of the language's train set
        dev_data_size_list = []            # size of the language's dev set
        for lang in train_languages:
            # `args` and `test_lang` come from the surrounding script in the original code
            train_eval_X, train_eval_Y, _ = utils.get_data(
                [lang], self.task_names, self.word2id, self.task2label2id,
                data_dir=train_dir, train=False)
            train_data_size_list.append(len(train_eval_Y))
            dev_eval_X, dev_eval_Y, _ = utils.get_data(
                [lang], self.task_names, self.word2id, self.task2label2id,
                data_dir=dev_dir, train=False)
            dev_data_size_list.append(len(dev_eval_Y))
            train_score = self.evaluate(train_eval_X, train_eval_Y, lang,
                                        args.threshold)
            dev_score = self.evaluate(dev_eval_X, dev_eval_Y, lang,
                                      args.threshold)
            avg_train_score_by_task_list.append(
                utils.average_by_task(train_score))
            avg_dev_score_by_task_list.append(
                utils.average_by_task(dev_score))
            print('=' * 50)
            print('\tStart logging for {} in epoch {}'.format(
                test_lang, epoch + 1))
            utils.log_fit(self.log_dir, epoch + 1, train_languages, test_lang,
                          self.task_names, train_score, dev_score)
            print('\tFinish logging for {} in epoch {}'.format(
                test_lang, epoch + 1))

        # compute the weighted average over all languages and use it to
        # determine the overall performance of training
        total_train_size = len(train_Y)
        total_dev_size = len(dev_Y)
        avg_train_score = utils.average_by_lang(
            avg_train_score_by_task_list, train_data_size_list,
            total_train_size)
        avg_dev_score = utils.average_by_lang(
            avg_dev_score_by_task_list, dev_data_size_list, total_dev_size)

        if avg_dev_score > self.avg_dev_score:
            self.avg_dev_score = avg_dev_score
            self.avg_train_score = avg_train_score
            self.best_train_dict = train_score
            self.best_dev_dict = dev_score
            self.best_epoch = epoch
            num_epochs_no_improvement = 0
            print('Saving model to directory %s...' % self.model_dir,
                  flush=True)
            self.save()
        else:
            num_epochs_no_improvement += 1
        if num_epochs_no_improvement == patience:
            # dynet.load(self.model_file, self.model)
            break

    print('Finished training', flush=True)
    print('Loading the best performing model from %s...' % self.model_dir,
          flush=True)
    self.model.populate(self.model_file)
    return (self.best_train_dict, self.best_dev_dict,
            self.avg_train_score, self.avg_dev_score)
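# A small, hedged sketch of the combined objective used above: the constraint
# weight is stored as a parameter but loaded with const_parameter, so it scales
# the penalty without ever being updated itself. All names, sizes, and the toy
# loss/penalty are illustrative, not from the original trainer.
import dynet

pc = dynet.ParameterCollection()
w_p = pc.add_parameters((3, 3))
constraint_weight_p = pc.add_parameters(1, init=dynet.ConstInitializer(0.1))
trainer = dynet.SimpleSGDTrainer(pc)

dynet.renew_cg()
x = dynet.inputVector([1.0, -2.0, 0.5])
W = dynet.parameter(w_p)
loss = dynet.pickneglogsoftmax(W * x, 1)        # toy task loss
penalty = dynet.sum_elems(dynet.square(W))      # toy penalty term
combined_loss = loss + dynet.const_parameter(constraint_weight_p) * penalty
combined_loss.backward()
trainer.update()                                # w_p changes; the constraint weight does not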
def parameters(*params, **kargs):
    # load each parameter into the graph; const_parameter blocks gradient updates
    trainable = 'trainable' not in kargs or kargs['trainable']
    yield tuple(dy.parameter(x) if trainable else dy.const_parameter(x)
                for x in params)
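# A hypothetical consumption pattern for the generator above: it yields a single
# tuple of expressions, so a one-pass for-loop (or next(...)) unpacks them. The
# parameter names and sizes below are illustrative only.
import dynet as dy

pc = dy.ParameterCollection()
W_p = pc.add_parameters((2, 3))
b_p = pc.add_parameters(2)

dy.renew_cg()
x = dy.inputVector([1.0, 2.0, 3.0])
for W, b in parameters(W_p, b_p, trainable=False):   # const_parameter path
    y = W * x + b                                     # no gradients reach W_p or b_p
    print(y.npvalue())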