Example #1
File: mnnl.py  Project: duffau/bilstm-aux
    def forward(self, observations):
        # calculate forward pass
        def log_sum_exp(scores):
            npval = scores.npvalue()
            argmax_score = np.argmax(npval)
            max_score_expr = dynet.pick(scores, argmax_score)
            max_score_expr_broadcast = dynet.concatenate([max_score_expr] *
                                                         self.num_tags)
            return max_score_expr + dynet.logsumexp_dim(
                (scores - max_score_expr_broadcast), 0)

        init_alphas = [-1e10] * self.num_tags
        init_alphas[START_TAG] = 0
        for_expr = dynet.inputVector(init_alphas)
        for obs in observations:
            alphas_t = []
            for next_tag in range(self.num_tags):
                obs_broadcast = dynet.concatenate([dynet.pick(obs, next_tag)] *
                                                  self.num_tags)
                next_tag_expr = for_expr + self.trans_mat[
                    next_tag] + obs_broadcast
                alphas_t.append(log_sum_exp(next_tag_expr))
            for_expr = dynet.concatenate(alphas_t)
        terminal_expr = for_expr + self.trans_mat[END_TAG]
        alpha = log_sum_exp(terminal_expr)
        return alpha
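
The forward pass above relies on the log-sum-exp trick: subtract the maximum score before exponentiating so the sum cannot overflow, then add the maximum back. A minimal NumPy sketch of the same trick (standalone, not tied to the CRF class above):

import numpy as np

def log_sum_exp_np(scores):
    # subtract the max before exponentiating so np.exp cannot overflow
    max_score = np.max(scores)
    return max_score + np.log(np.sum(np.exp(scores - max_score)))

# equivalent to np.log(np.sum(np.exp(scores))), but stable for large scores
scores = np.array([1000.0, 999.0, 998.0])
print(log_sum_exp_np(scores))  # ~1000.41; the naive formula would overflow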
Example #2
 def pick_neg_log(self, pred, gold):
     # TODO make this a static function in both classes
     if not isinstance(gold, int) and not isinstance(gold, np.int64):
         # calculate cross-entropy loss against the whole vector
         dy_gold = dynet.inputVector(gold)
         return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
     return -dynet.log(dynet.pick(pred, gold))
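
pick_neg_log returns the negative log-probability of the gold class when gold is an index, and a full cross-entropy when gold is a whole target distribution (soft labels). A plain NumPy sketch of the same two cases (hypothetical helper, for illustration only):

import numpy as np

def pick_neg_log_np(pred, gold):
    # pred: predicted probability vector (e.g. a softmax output)
    if not isinstance(gold, (int, np.integer)):
        # gold is a distribution (soft labels): full cross-entropy
        return -np.sum(np.asarray(gold) * np.log(pred))
    # gold is a class index: negative log-likelihood of that class
    return -np.log(pred[gold])

pred = np.array([0.1, 0.7, 0.2])
print(pick_neg_log_np(pred, 1))                # -log(0.7)
print(pick_neg_log_np(pred, [0.0, 1.0, 0.0]))  # same value for a one-hot target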
Example #3
    def get_top_k_paths(self, all_paths, k=None, threshold=None):
        """
        Get the top k scoring paths
        """
        cg = renew_cg()
        path_scores = []
        lemma_lookup = self.model_parameters["lemma_lookup"]
        pos_lookup = self.model_parameters["pos_lookup"]
        dep_lookup = self.model_parameters["dep_lookup"]
        dir_lookup = self.model_parameters["dir_lookup"]
        builder = self.builder
        W = parameter(self.model_parameters["W"])

        for path in all_paths:
            path_embedding = get_path_embedding(builder, lemma_lookup,
                                                pos_lookup, dep_lookup,
                                                dir_lookup, path)

            if self.use_xy_embeddings:
                zero_word = _dynet.inputVector([0.0] * self.lemma_dim)
                path_embedding = concatenate(
                    [zero_word, path_embedding, zero_word])

            path_scores.append(softmax(W * path_embedding).npvalue()[1])

        path_scores = np.array(path_scores)
        indices = np.argsort(-path_scores)

        if k is not None:
            indices = indices[:k]

        top_paths = [(all_paths[index], path_scores[index])
                     for index in indices
                     if threshold is None or path_scores[index] >= threshold]
        return top_paths
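
Each path is scored by the probability of the positive class, softmax(W * path_embedding)[1]; np.argsort(-path_scores) then ranks paths by descending score, k truncates the ranking, and threshold filters out low-scoring paths. A small NumPy sketch of that selection step (toy scores, hypothetical helper name):

import numpy as np

def top_k_with_threshold(items, scores, k=None, threshold=None):
    # sort indices by descending score, keep at most k, drop scores below threshold
    indices = np.argsort(-scores)
    if k is not None:
        indices = indices[:k]
    return [(items[i], float(scores[i])) for i in indices
            if threshold is None or scores[i] >= threshold]

scores = np.array([0.2, 0.9, 0.6, 0.4])
items = ["p0", "p1", "p2", "p3"]
print(top_k_with_threshold(items, scores, k=3, threshold=0.5))
# the two paths scoring at least 0.5, best first: ('p1', 0.9) then ('p2', 0.6)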
Example #4
 def get_summer(s, size):  # list of values (bidirection) => one value
     if s == "avg":
         return dy.average
     else:
         mask = [0. for _ in range(size // 2)
                 ] + [1. for _ in range(size // 2)]
         mask2 = [1. for _ in range(size // 2)
                  ] + [0. for _ in range(size // 2)]
         if s == "fend":
             return lambda x: dy.cmult(dy.inputVector(mask2), x[-1])
         elif s == "bend":
             return lambda x: dy.cmult(dy.inputVector(mask), x[0])
         elif s == "ends":
              return lambda x: (dy.cmult(dy.inputVector(mask2), x[-1]) +
                                dy.cmult(dy.inputVector(mask), x[0]))
         else:
             return None
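
get_summer builds 0/1 masks so that an element-wise product keeps only the forward or backward half of a concatenated BiLSTM state. A NumPy sketch of the masking idea, assuming the first half of the vector is the forward state and the second half the backward state:

import numpy as np

size = 6  # total state size (forward half + backward half)
mask_back = np.array([0.0] * (size // 2) + [1.0] * (size // 2))  # keeps the backward half
mask_fwd = np.array([1.0] * (size // 2) + [0.0] * (size // 2))   # keeps the forward half

state = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
print(mask_fwd * state)   # [1. 2. 3. 0. 0. 0.]
print(mask_back * state)  # [0. 0. 0. 4. 5. 6.]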
Example #5
    def predict(self,
                feature_vector,
                task_ids,
                train=False,
                soft_labels=False,
                temperature=None,
                dropout_rate=0.0,
                orthogonality_weight=0.0,
                domain_id=None):
        dynet.renew_cg()  # new graph

        feature_vector = feature_vector.toarray()
        feature_vector = np.squeeze(feature_vector, axis=0)

        # self.input = dynet.vecInput(self.vocab_size)
        # self.input.set(feature_vector)
        # TODO this takes too long; can we speed this up somehow?
        input = dynet.inputVector(feature_vector)
        for i in range(self.h_layers):
            if train:  # add some noise
                input = dynet.noise(input, self.noise_sigma)
                input = dynet.dropout(input, dropout_rate)
            input = self.layers[i](input)
        outputs = []
        for task_id in task_ids:
            output = self.output_layers_dict[task_id](input,
                                                      soft_labels=soft_labels,
                                                      temperature=temperature)
            outputs.append(output)

        constraint, adv_loss = 0, 0
        if orthogonality_weight != 0:
            # put the orthogonality constraint either directly on the
            # output layer or on the hidden layer if it's an MLP
            F0_layer = self.output_layers_dict["F0"]
            F1_layer = self.output_layers_dict["F1"]
            F0_param = F0_layer.W_mlp if self.add_hidden else F0_layer.W
            F1_param = F1_layer.W_mlp if self.add_hidden else F1_layer.W
            F0_W = dynet.parameter(F0_param)
            F1_W = dynet.parameter(F1_param)

            # calculate the matrix product of the task matrix with both others
            matrix_product = dynet.transpose(F0_W) * F1_W

            # take the squared Frobenius norm by squaring
            # every element and then summing them
            squared_frobenius_norm = dynet.sum_elems(
                dynet.square(matrix_product))
            constraint += squared_frobenius_norm
            # print('Constraint with first matrix:', squared_frobenius_norm.value())

        if domain_id is not None:
            # flip the gradient when back-propagating through here
            adv_input = dynet.flip_gradient(input)  # last state
            adv_output = self.adv_layer(adv_input)
            adv_loss = self.pick_neg_log(adv_output, domain_id)
            # print('Adversarial loss:', avg_adv_loss.value())
        return outputs, constraint, adv_loss
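
The orthogonality constraint computed above is the squared Frobenius norm of F0_W^T * F1_W, which is zero exactly when every column of one task matrix is orthogonal to every column of the other. A NumPy sketch of that penalty:

import numpy as np

def orthogonality_penalty(W0, W1):
    # squared Frobenius norm of W0^T W1: square every entry of the product and sum
    product = W0.T @ W1
    return np.sum(product ** 2)

W0 = np.array([[1.0], [0.0], [0.0]])                  # one column along x
W1 = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]])   # columns along y and z
print(orthogonality_penalty(W0, W1))  # 0.0: the two column spaces are orthogonal
print(orthogonality_penalty(W0, W0))  # 1.0: overlapping column spaces are penalized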
Example #6
    def get_top_k_paths(self, all_paths, relation_index, threshold):
        """
        Get the top k scoring paths
        """
        builder = self.builder
        model = self.model
        model_parameters = self.model_parameters
        lemma_lookup = model_parameters['lemma_lookup']
        pos_lookup = model_parameters['pos_lookup']
        dep_lookup = model_parameters['dep_lookup']
        dir_lookup = model_parameters['dir_lookup']

        path_scores = []

        for i, path in enumerate(all_paths):

            if i % 1000 == 0:
                cg = dy.renew_cg()
                W1 = dy.parameter(model_parameters['W1'])
                b1 = dy.parameter(model_parameters['b1'])
                W2 = None
                b2 = None

                if self.num_hidden_layers == 1:
                    W2 = dy.parameter(model_parameters['W2'])
                    b2 = dy.parameter(model_parameters['b2'])

            path_embedding = get_path_embedding(builder, lemma_lookup,
                                                pos_lookup, dep_lookup,
                                                dir_lookup, path)

            if self.use_xy_embeddings:
                zero_word = dy.inputVector([0.0] * self.lemma_embeddings_dim)
                path_embedding = dy.concatenate(
                    [zero_word, path_embedding, zero_word])

            h = W1 * path_embedding + b1

            if self.num_hidden_layers == 1:
                h = W2 * dy.tanh(h) + b2

            path_score = dy.softmax(h).npvalue().T
            path_scores.append(path_score)

        path_scores = np.vstack(path_scores)

        top_paths = []
        for i in range(len(relation_index)):
            indices = np.argsort(-path_scores[:, i])
            top_paths.append([
                (all_paths[index], path_scores[index, i]) for index in indices
                if threshold is None or path_scores[index, i] >= threshold
            ])

        return top_paths
Example #7
    def augment(scores, oracle_index, crossing=False):
        '''
            Add the hinge loss into scores.
        '''

        assert isinstance(scores, dy.Expression)
        shape = scores.dim()[0]
        assert len(shape) == 1
        increment = np.ones(shape)
        increment[oracle_index] = crossing
        return scores + dy.inputVector(increment)
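
augment implements the margin used in max-margin (hinge-loss) training: every action's score is raised by 1 except the oracle's, so the oracle only wins the subsequent argmax if it beats the alternatives by at least that margin. A NumPy sketch of the same augmentation:

import numpy as np

def augment_np(scores, oracle_index, crossing=False):
    # add a margin of 1 to every score except the oracle's
    # (the oracle only receives the margin if `crossing` is True)
    increment = np.ones_like(scores)
    increment[oracle_index] = float(crossing)
    return scores + increment

scores = np.array([2.0, 5.0, 1.0])
print(augment_np(scores, oracle_index=1))  # [3. 5. 2.]: the oracle must win by a margin of 1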
Example #8
 def get_w_repr(self, word, train=False, update=True):
     """
     Get representation of word (word embedding)
     """
     if train:
         if self.w_dropout_rate > 0.0:
             w_id = self.w2i[UNK] if drop(word, self.wcount, self.w_dropout_rate) else self.w2i.get(word, self.w2i[UNK])
         else:
             # no word dropout: fall back to the regular lookup
             w_id = self.w2i.get(word, self.w2i[UNK])
     else:
         if self.mimickx_model_path: # if given, use MIMICKX
             if word not in self.w2i:
                 #print("predict with MIMICKX for: ", word)
                 return dynet.inputVector(self.mimickx_model.predict(word).npvalue())
         w_id = self.w2i.get(word, self.w2i[UNK])
     if not update:
         return dynet.nobackprop(self.wembeds[w_id])
     else:
         return self.wembeds[w_id] 
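
During training, a word is replaced by UNK with a frequency-dependent probability so that the model sees UNK in context and learns a useful embedding for it. The drop() helper is not shown here; a hypothetical sketch, assuming the alpha / (count + alpha) form that the fit methods further down use explicitly:

from collections import Counter
import random

def drop(word, counts, dropout_rate):
    # hypothetical reconstruction of the drop() helper used above:
    # replace with UNK with probability rate / (count + rate),
    # so rare words are dropped far more often than frequent ones
    count = counts.get(word, 0)
    return random.random() > count / (dropout_rate + count)

counts = Counter({"the": 1000, "xylograph": 1})
print(drop("the", counts, 0.25))        # almost always False
print(drop("xylograph", counts, 0.25))  # True roughly 20% of the time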
Example #9
File: mnnl.py  Project: duffau/bilstm-aux
    def viterbi(self, observations, unk_tag=None, dictionary=None):
        #if dictionary:
        #    raise NotImplementedError("type constraints not yet implemented for CRF")
        backpointers = []
        init_vvars = [-1e10] * self.num_tags
        init_vvars[START_TAG] = 0  # <Start> has all the probability
        for_expr = dynet.inputVector(init_vvars)
        trans_exprs = [self.trans_mat[idx] for idx in range(self.num_tags)]
        for obs in observations:
            bptrs_t = []
            vvars_t = []
            for next_tag in range(self.num_tags):
                next_tag_expr = for_expr + trans_exprs[next_tag]
                next_tag_arr = next_tag_expr.npvalue()
                best_tag_id = np.argmax(next_tag_arr)
                if unk_tag:
                    best_tag = self.index2tag[best_tag_id]
                    if best_tag == unk_tag:
                        next_tag_arr[np.argmax(next_tag_arr)] = 0  # set to 0
                        best_tag_id = np.argmax(next_tag_arr)  # get second best

                bptrs_t.append(best_tag_id)
                vvars_t.append(dynet.pick(next_tag_expr, best_tag_id))
            for_expr = dynet.concatenate(vvars_t) + obs
            backpointers.append(bptrs_t)
        # Perform final transition to terminal
        terminal_expr = for_expr + trans_exprs[END_TAG]
        terminal_arr = terminal_expr.npvalue()
        best_tag_id = np.argmax(terminal_arr)
        path_score = dynet.pick(terminal_expr, best_tag_id)
        # Reverse over the backpointers to get the best path
        best_path = [best_tag_id]  # Start with the tag that was best for terminal
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        start = best_path.pop()  # Remove the start symbol
        best_path.reverse()
        assert start == START_TAG
        # Return best path and best path's score
        return best_path, path_score
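
The decoder above stores, for every tag at every step, the best previous tag (a backpointer), then walks the backpointers in reverse from the best terminal tag to recover the path. A compact NumPy sketch of the same idea over plain score matrices, assuming a simple emission-plus-transition scoring model without the START/END handling:

import numpy as np

def viterbi_np(emissions, transitions):
    # emissions: (T, K) per-step tag scores; transitions: (K, K) score of tag i -> tag j
    T, K = emissions.shape
    scores = emissions[0].copy()
    backpointers = []
    for t in range(1, T):
        # candidate[i, j]: score of the best path ending in tag i, then moving to tag j
        candidate = scores[:, None] + transitions + emissions[t][None, :]
        backpointers.append(np.argmax(candidate, axis=0))
        scores = np.max(candidate, axis=0)
    best_last = int(np.argmax(scores))
    path = [best_last]
    for bp in reversed(backpointers):   # walk the backpointers from the end
        path.append(int(bp[path[-1]]))
    path.reverse()
    return path, float(np.max(scores))

emissions = np.array([[1.0, 0.0], [0.0, 2.0], [1.5, 0.0]])
transitions = np.zeros((2, 2))
print(viterbi_np(emissions, transitions))  # ([0, 1, 0], 4.5)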
Example #10
    def evaluate_adversary(self, dataset):
        loss = 0
        acc = 0
        tot = len(dataset)
        
        predictions = []
        for i, ex in enumerate(dataset):
            
            dy.renew_cg()
            vec, labels = ex
            vec = dy.inputVector(vec)
            
            l, p = self.adversary_classifier.get_loss_and_prediction(vec, labels)
            
            predictions.append(p)
            if p == labels:
                acc += 1
            loss += l.value()

        return loss / tot, acc / tot * 100, predictions
Example #11
File: mnnl.py  Project: bplank/bilstm-aux
    def viterbi(self, observations, unk_tag=None, dictionary=None):
        #if dictionary:
        #    raise NotImplementedError("type constraints not yet implemented for CRF")
        backpointers = []
        init_vvars   = [-1e10] * self.num_tags
        init_vvars[START_TAG] = 0 # <Start> has all the probability
        for_expr     = dynet.inputVector(init_vvars)
        trans_exprs  = [self.trans_mat[idx] for idx in range(self.num_tags)]
        for obs in observations:
            bptrs_t = []
            vvars_t = []
            for next_tag in range(self.num_tags):
                next_tag_expr = for_expr + trans_exprs[next_tag]
                next_tag_arr = next_tag_expr.npvalue()
                best_tag_id  = np.argmax(next_tag_arr)
                if unk_tag:
                    best_tag = self.index2tag[best_tag_id]
                    if best_tag == unk_tag:
                        next_tag_arr[np.argmax(next_tag_arr)] = 0 # set to 0
                        best_tag_id = np.argmax(next_tag_arr) # get second best

                bptrs_t.append(best_tag_id)
                vvars_t.append(dynet.pick(next_tag_expr, best_tag_id))
            for_expr = dynet.concatenate(vvars_t) + obs
            backpointers.append(bptrs_t)
        # Perform final transition to terminal
        terminal_expr = for_expr + trans_exprs[END_TAG]
        terminal_arr  = terminal_expr.npvalue()
        best_tag_id   = np.argmax(terminal_arr)
        path_score    = dynet.pick(terminal_expr, best_tag_id)
        # Reverse over the backpointers to get the best path
        best_path = [best_tag_id] # Start with the tag that was best for terminal
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        start = best_path.pop() # Remove the start symbol
        best_path.reverse()
        assert start == START_TAG
        # Return best path and best path's score
        return best_path, path_score
Example #12
File: mnnl.py  Project: bplank/bilstm-aux
    def forward(self, observations):
        # calculate forward pass
        def log_sum_exp(scores):
            npval = scores.npvalue()
            argmax_score = np.argmax(npval)
            max_score_expr = dynet.pick(scores, argmax_score)
            max_score_expr_broadcast = dynet.concatenate([max_score_expr] * self.num_tags)
            return max_score_expr + dynet.logsumexp_dim((scores - max_score_expr_broadcast),0)

        init_alphas = [-1e10] * self.num_tags
        init_alphas[START_TAG] = 0
        for_expr = dynet.inputVector(init_alphas)
        for obs in observations:
            alphas_t = []
            for next_tag in range(self.num_tags):
                obs_broadcast = dynet.concatenate([dynet.pick(obs, next_tag)] * self.num_tags)
                next_tag_expr = for_expr + self.trans_mat[next_tag] + obs_broadcast
                alphas_t.append(log_sum_exp(next_tag_expr))
            for_expr = dynet.concatenate(alphas_t)
        terminal_expr = for_expr + self.trans_mat[END_TAG]
        alpha = log_sum_exp(terminal_expr)
        return alpha
Example #13
    def predict(self,
                feature_vector,
                train=False,
                soft_labels=False,
                temperature=None,
                dropout_rate=None):
        dynet.renew_cg()  # new graph

        feature_vector = feature_vector.toarray()
        feature_vector = np.squeeze(feature_vector, axis=0)

        # self.input = dynet.vecInput(self.vocab_size)
        # self.input.set(feature_vector)
        # TODO this takes too long; can we speed this up somehow?
        input = dynet.inputVector(feature_vector)
        for i in range(self.h_layers - 1):
            if train:  # add some noise
                input = dynet.noise(input, self.noise_sigma)
                input = dynet.dropout(input, dropout_rate)
            input = self.layers[i](input)
        output = self.layers[-1](input,
                                 soft_labels=soft_labels,
                                 temperature=temperature)
        return output
Example #14
    def fit(self,
            train_X,
            train_Y,
            num_epochs,
            val_X=None,
            val_Y=None,
            patience=2,
            model_path=None,
            seed=None,
            word_dropout_rate=0.25,
            trg_vectors=None,
            unsup_weight=1.0,
            variance_weights=None,
            labeled_weight_proportion=1.0):
        """
        train the tagger
        :param trg_vectors: the prediction targets used for the unsupervised loss
                            in temporal ensembling
        :param unsup_weight: weight for the unsupervised consistency loss
                                    used in temporal ensembling
        :param variance_weights: optional per-token weights applied to the
                                 unsupervised consistency loss
        :param labeled_weight_proportion: proportion of the unsupervised weight
                                          that should be assigned to labeled
                                          examples
        """
        print("read training data", file=sys.stderr)

        if variance_weights is not None:
            print('First 20 variance weights:', variance_weights[:20])

        if seed:
            print(">>> using seed: ", seed, file=sys.stderr)
            random.seed(seed)  #setting random seed

        # if we use word dropout keep track of counts
        if word_dropout_rate > 0.0:
            widCount = Counter()
            for sentence, _ in train_X:
                widCount.update([w for w in sentence])

        assert (len(train_X) == len(train_Y))
        train_data = list(zip(train_X, train_Y))

        # if we use target vectors, keep track of the targets per sentence
        if trg_vectors is not None:
            trg_start_id = 0
            sentence_trg_vectors = []
            sentence_var_weights = []
            for i, (example, y) in enumerate(train_data):
                sentence_trg_vectors.append(
                    trg_vectors[trg_start_id:trg_start_id +
                                len(example[0]), :])
                if variance_weights is not None:
                    sentence_var_weights.append(
                        variance_weights[trg_start_id:trg_start_id +
                                         len(example[0])])
                trg_start_id += len(example[0])
            assert trg_start_id == len(trg_vectors),\
                'Error: Idx {} is not at {}.'.format(trg_start_id, len(trg_vectors))
            assert len(sentence_trg_vectors) == len(train_X)
            if variance_weights is not None:
                assert trg_start_id == len(variance_weights)
                assert len(sentence_var_weights) == len(train_X)

        print('Starting training for {} epochs...'.format(num_epochs))
        best_val_acc, epochs_no_improvement = 0., 0
        if val_X is not None and val_Y is not None and model_path is not None:
            print(
                'Using early stopping with patience of {}...'.format(patience))

        for cur_iter in range(num_epochs):
            bar = Bar('Training epoch {}/{}...'.format(cur_iter + 1,
                                                       num_epochs),
                      max=len(train_data),
                      flush=True)
            total_loss = 0.0
            total_tagged = 0.0

            total_other_loss, total_other_loss_weighted = 0.0, 0.0

            random_indices = np.arange(len(train_data))
            random.shuffle(random_indices)

            for i, idx in enumerate(random_indices):
                (word_indices, char_indices), y = train_data[idx]

                if word_dropout_rate > 0.0:
                    word_indices = [
                        self.w2i["_UNK"] if
                        (random.random() >
                         (widCount.get(w) /
                          (word_dropout_rate + widCount.get(w)))) else w
                        for w in word_indices
                    ]
                output = self.predict(word_indices, char_indices, train=True)

                if len(y) == 1 and y[0] == 0:
                    # in temporal ensembling, we assign a dummy label of [0] for
                    # unlabeled sequences; we skip the supervised loss for these
                    loss = dynet.scalarInput(0)
                else:
                    loss = dynet.esum([
                        self.pick_neg_log(pred, gold)
                        for pred, gold in zip(output, y)
                    ])

                if trg_vectors is not None:
                    # the consistency loss in temporal ensembling is used for
                    # both supervised and unsupervised input
                    targets = sentence_trg_vectors[idx]
                    assert len(output) == len(targets)
                    if variance_weights is not None:
                        var_weights = sentence_var_weights[idx]
                        assert len(output) == len(var_weights)
                        # multiply the normalized mean variance with each loss
                        other_loss = dynet.esum([
                            v * dynet.squared_distance(o, dynet.inputVector(t))
                            for o, t, v in zip(output, targets, var_weights)
                        ])
                    else:
                        other_loss = dynet.esum([
                            dynet.squared_distance(o, dynet.inputVector(t))
                            for o, t in zip(output, targets)
                        ])

                    total_other_loss += other_loss.value()
                    if len(y) == 1 and y[0] == 0:  # unlabeled example
                        other_loss = other_loss * unsup_weight
                    else:  # labeled example
                        # assign the unsupervised weight for labeled examples
                        other_loss = other_loss * unsup_weight * labeled_weight_proportion
                    # keep track for logging
                    total_loss += loss.value()  # main loss
                    total_tagged += len(word_indices)
                    total_other_loss_weighted += other_loss.value()

                    # combine losses
                    loss += other_loss

                else:
                    # keep track for logging
                    total_loss += loss.value()
                    total_tagged += len(word_indices)

                loss.backward()
                self.trainer.update()
                bar.next()

            if trg_vectors is None:
                print("iter {2} {0:>12}: {1:.2f}".format(
                    "total loss", total_loss / total_tagged, cur_iter),
                      file=sys.stderr)
            else:
                print(
                    "iter {2} {0:>12}: {1:.2f} unsupervised loss: {3:.2f} (weighted: {4:.2f})"
                    .format("supervised loss", total_loss / total_tagged,
                            cur_iter, total_other_loss / total_tagged,
                            total_other_loss_weighted / total_tagged),
                    file=sys.stderr)

            if val_X is not None and val_Y is not None and model_path is not None:
                # get the best accuracy on the validation set
                val_correct, val_total = self.evaluate(val_X, val_Y)
                val_accuracy = val_correct / val_total

                if val_accuracy > best_val_acc:
                    print(
                        'Accuracy {:.4f} is better than best val accuracy {:.4f}'
                        .format(val_accuracy, best_val_acc))
                    best_val_acc = val_accuracy
                    epochs_no_improvement = 0
                    save_tagger(self, model_path)
                else:
                    print(
                        'Accuracy {:.4f} is worse than best val accuracy {:.4f}.'
                        .format(val_accuracy, best_val_acc))
                    epochs_no_improvement += 1
                if epochs_no_improvement == patience:
                    print('No improvement for {} epochs. Early stopping...'.
                          format(epochs_no_improvement))
                    break
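
The unsupervised consistency loss used for temporal ensembling is just the squared distance between each predicted distribution and its ensembled target, optionally scaled by a per-token variance weight and by unsup_weight. A NumPy sketch of the per-sentence term with toy values (hypothetical helper, for illustration):

import numpy as np

def consistency_loss(outputs, targets, var_weights=None):
    # squared distance between each predicted distribution and its ensembled target,
    # optionally scaled by a per-token weight, summed over the sentence
    losses = [np.sum((o - t) ** 2) for o, t in zip(outputs, targets)]
    if var_weights is not None:
        losses = [v * l for v, l in zip(var_weights, losses)]
    return sum(losses)

outputs = [np.array([0.8, 0.2]), np.array([0.4, 0.6])]
targets = [np.array([1.0, 0.0]), np.array([0.5, 0.5])]
print(consistency_loss(outputs, targets))              # ~0.1  (0.08 + 0.02)
print(consistency_loss(outputs, targets, [1.0, 0.5]))  # ~0.09 (0.08 + 0.5 * 0.02)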
Example #15
    def train_adversary(self, train, dev):
        lr = self.args.learning_rate
        dc = self.args.decay_constant
        
        random.shuffle(train)
        sample_train = train[:len(dev)]
        self.trainer.learning_rate = lr
        
        epochs = self.args.iterations_adversary
        
        n_updates = 0
        best = 0
        ibest=0
        
        for epoch in range(self.args.iterations_adversary):
            random.shuffle(train)
            
            for i, example in enumerate(train):
                
                dy.renew_cg()
                
                vec, label = example
                vec = dy.inputVector(vec)
                
                sys.stderr.write("\r{}%".format(i / len(train) * 100))
                
                loss = self.adversary_classifier.get_loss(vec, label)
                loss.backward()
                self.trainer.update()
                self.trainer.learning_rate = lr / (1 + n_updates * dc)
                
                n_updates += 1
            
            sys.stderr.write("\r")
            
            
            targets_t = [label for _, label in sample_train]
            targets_d = [label for _, label in dev]
            
            loss_t, acc_t, predictions_t = self.evaluate_adversary(sample_train)
            loss_d, acc_d, predictions_d = self.evaluate_adversary(dev)
            
            cmpare = acc_d
            
            ftrain = compute_eval_metrics(self.adversary_classifier.output_size(), targets_t, predictions_t)
            fdev = compute_eval_metrics(self.adversary_classifier.output_size(), targets_d, predictions_d)

            Fscore = "F: t = {} d = {}".format(ftrain, fdev)
            cmpare = fdev[2]
            
            if "tp" in self.args.dataset or "bl" in self.args.dataset:
                acc_all = fdev[3]
                cmpare = sum(acc_all) / len(acc_all)
            
            
            if cmpare >= best:
                best = cmpare
                ibest = epoch
                self.model.save("{}/adverse_model{}".format(self.output_folder, ibest))
            
            print("Epoch {} train: l={:.4f} acc={:.2f} dev: l={:.4f} acc={:.2f} {} ".format(epoch, loss_t, acc_t, loss_d, acc_d, Fscore), flush=True)
        
        if epochs > 0:
            self.model.populate("{}/adverse_model{}".format(self.output_folder, ibest))
        
        return best
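
The adversary trainer decays its learning rate with inverse-time decay, lr / (1 + n_updates * dc), so the step size shrinks smoothly as updates accumulate. A tiny sketch of that schedule with illustrative values:

def inverse_time_lr(base_lr, n_updates, decay_constant):
    # inverse-time decay: the learning rate shrinks as updates accumulate
    return base_lr / (1 + n_updates * decay_constant)

for n in (0, 100, 1000, 10000):
    print(n, inverse_time_lr(0.01, n, 1e-3))
# 0.01 at the start, ~0.0091 after 100 updates, 0.005 after 1000, ~0.0009 after 10000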
Example #16
 def pick_neg_log(self, pred, gold):
     if not isinstance(gold, int):
         # calculate cross-entropy loss against the whole vector
         dy_gold = dynet.inputVector(gold)
         return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
     return -dynet.log(dynet.pick(pred, gold))
Example #17
    def fit(self,
            train_dict,
            num_epochs,
            val_X=None,
            val_Y=None,
            patience=2,
            model_path=None,
            seed=None,
            word_dropout_rate=0.25,
            trg_vectors=None,
            unsup_weight=1.0,
            clip_threshold=5.0,
            orthogonality_weight=0.0,
            adversarial=False,
            adversarial_weight=1.0,
            ignore_src_Ft=False):
        """
        train the tagger
        :param trg_vectors: the prediction targets used for the unsupervised loss
                            in temporal ensembling
        :param unsup_weight: weight for the unsupervised consistency loss
                                    used in temporal ensembling
        :param adversarial: note: if we want to use adversarial, we have to
                            call add_adversarial_loss before;
        :param adversarial_weight: 1 by default (do not weigh adv loss)
        :param ignore_src_Ft: if asymm.tri. 2nd stage, do not further train Ft on 'src'
        :param train_dict: a dictionary mapping tasks ("F0", "F1", and "Ft")
                           to a dictionary
                           {"X": list of examples,
                            "Y": list of labels,
                            "domain": list of domain tag (0,1) of example}
        Three tasks are indexed as "F0", "F1" and "Ft"

        Note: if a task 'src' is given than a single model with three heads is trained where
        all data is given to all outputs
        """
        print("read training data")

        widCount = Counter()
        train_data = []
        for task, task_dict in train_dict.items():  #task: eg. "F0"
            for key in ["X", "Y", "domain"]:
                assert key in task_dict, "Error: %s is not available." % key
            examples, labels, domain_tags = task_dict["X"], task_dict[
                "Y"], task_dict["domain"]
            assert len(examples) == len(labels)
            if word_dropout_rate > 0.0:
                # keep track of the counts for word dropout
                for sentence, _ in examples:
                    widCount.update([w for w in sentence])

            # train data is a list of 4-tuples: (example, label, task_id, domain_id)
            train_data += list(
                zip(examples, labels, [[task] * len(labels)][0], domain_tags))

        # if we use target vectors, keep track of the targets per sentence
        if trg_vectors is not None:
            trg_start_id = 0
            sentence_trg_vectors = []
            for i, (example, y) in enumerate(train_data):
                sentence_trg_vectors.append(
                    trg_vectors[trg_start_id:trg_start_id +
                                len(example[0]), :])
                trg_start_id += len(example[0])
            assert trg_start_id == len(trg_vectors),\
                'Error: Idx {} is not at {}.'.format(trg_start_id, len(trg_vectors))

        print('Starting training for {} epochs...'.format(num_epochs))
        best_val_acc, epochs_no_improvement = 0., 0
        if val_X is not None and val_Y is not None and model_path is not None:
            print(
                'Using early stopping with patience of {}...'.format(patience))

        if seed:
            random.seed(seed)

        for cur_iter in range(num_epochs):
            bar = Bar('Training epoch {}/{}...'.format(cur_iter + 1,
                                                       num_epochs),
                      max=len(train_data),
                      flush=True)

            random_indices = np.arange(len(train_data))
            random.shuffle(random_indices)

            total_loss, total_tagged, total_constraint, total_adversarial = 0.0, 0.0, 0.0, 0.0
            total_orth_constr = 0  # count how many updates

            # log separate losses
            log_losses = {}
            log_total = {}
            for task_id in self.task_ids:
                log_losses[task_id] = 0.0
                log_total[task_id] = 0

            for i, idx in enumerate(random_indices):
                (word_indices,
                 char_indices), y, task_id, domain_id = train_data[idx]

                if word_dropout_rate > 0.0:
                    word_indices = [
                        self.w2i["_UNK"] if
                        (random.random() >
                         (widCount.get(w) /
                          (word_dropout_rate + widCount.get(w)))) else w
                        for w in word_indices
                    ]

                output, constraint, adv = self.predict(
                    word_indices,
                    char_indices,
                    task_id,
                    train=True,
                    orthogonality_weight=orthogonality_weight,
                    domain_id=domain_id if adversarial else None)

                if task_id not in ['src', 'trg']:

                    if len(y) == 1 and y[0] == 0:
                        # in temporal ensembling, we assign a dummy label of [0] for
                        # unlabeled sequences; we skip the supervised loss for these
                        loss = dynet.scalarInput(0)
                    else:
                        loss = dynet.esum([
                            self.pick_neg_log(pred, gold)
                            for pred, gold in zip(output, y)
                        ])

                    if trg_vectors is not None:
                        # the consistency loss in temporal ensembling is used for
                        # both supervised and unsupervised input
                        targets = sentence_trg_vectors[idx]
                        assert len(output) == len(targets)
                        other_loss = unsup_weight * dynet.average([
                            dynet.squared_distance(o, dynet.inputVector(t))
                            for o, t in zip(output, targets)
                        ])
                        loss += other_loss

                    if orthogonality_weight != 0.0 and task_id != 'Ft':
                        # add the orthogonality constraint to the loss
                        total_constraint += constraint.value() * orthogonality_weight
                        total_orth_constr += 1
                        loss += constraint * orthogonality_weight

                    if adversarial:
                        total_adversarial += adv.value() * adversarial_weight
                        loss += adv * adversarial_weight

                    total_loss += loss.value()  # for output

                    log_losses[task_id] += total_loss
                    total_tagged += len(word_indices)
                    log_total[task_id] += total_tagged

                    loss.backward()
                    self.trainer.update()
                    bar.next()
                else:
                    # bootstrap=False, the output contains list of outputs one for each task
                    assert trg_vectors is None, 'temporal ensembling not implemented for bootstrap=False'
                    loss = dynet.scalarInput(1)  #initialize
                    if ignore_src_Ft:
                        output = output[:-1]  # ignore last = Ft when training on 'src'

                    for t_i, output_t in enumerate(
                            output):  # get loss for each task
                        loss += dynet.esum([
                            self.pick_neg_log(pred, gold)
                            for pred, gold in zip(output_t, y)
                        ])
                        task_id = self.task_ids[t_i]
                        log_losses[task_id] += total_loss
                        log_total[task_id] += total_tagged

                    if orthogonality_weight != 0.0:
                        # add the orthogonality constraint to the loss
                        total_constraint += constraint.value() * orthogonality_weight
                        total_orth_constr += 1
                        loss += constraint * orthogonality_weight

                    if adversarial:
                        total_adversarial += adv.value() * adversarial_weight
                        loss += adv * adversarial_weight

                    total_loss += loss.value()  # for output
                    total_tagged += len(word_indices)

                    loss.backward()
                    self.trainer.update()
                    bar.next()

            if adversarial and orthogonality_weight:
                print(
                    "iter {}. Total loss: {:.3f}, total penalty: {:.3f}, total weighted adv loss: {:.3f}"
                    .format(cur_iter, total_loss / total_tagged,
                            total_constraint / total_orth_constr,
                            total_adversarial / total_tagged),
                    file=sys.stderr)
            elif orthogonality_weight:
                print("iter {}. Total loss: {:.3f}, total penalty: {:.3f}".
                      format(cur_iter, total_loss / total_tagged,
                             total_constraint / total_orth_constr),
                      file=sys.stderr)
            else:
                print("iter {}. Total loss: {:.3f} ".format(
                    cur_iter, total_loss / total_tagged),
                      file=sys.stderr)

            for task_id in self.task_ids:
                if log_total[task_id] > 0:
                    print("{0}: {1:.3f}".format(
                        task_id, log_losses[task_id] / log_total[task_id]))

            if val_X is not None and val_Y is not None and model_path is not None:
                # get the best accuracy on the validation set
                val_correct, val_total = self.evaluate(val_X, val_Y)
                val_accuracy = val_correct / val_total

                if val_accuracy > best_val_acc:
                    print(
                        'Accuracy {:.4f} is better than best val accuracy {:.4f}.'
                        .format(val_accuracy, best_val_acc))
                    best_val_acc = val_accuracy
                    epochs_no_improvement = 0
                    save_tagger(self, model_path)
                else:
                    print(
                        'Accuracy {:.4f} is worse than best val accuracy {:.4f}.'
                        .format(val_accuracy, best_val_acc))
                    epochs_no_improvement += 1
                if epochs_no_improvement == patience:
                    print('No improvement for {} epochs. Early stopping...'.
                          format(epochs_no_improvement))
                    break
Example #18
    def compute_decoder_batch_loss(self, encoded_inputs, input_masks,
                                   output_word_ids, output_masks, batch_size):
        self.readout = dn.parameter(self.params['readout'])
        self.bias = dn.parameter(self.params['bias'])
        self.w_c = dn.parameter(self.params['w_c'])
        self.u_a = dn.parameter(self.params['u_a'])
        self.v_a = dn.parameter(self.params['v_a'])
        self.w_a = dn.parameter(self.params['w_a'])

        # initialize the decoder rnn
        s_0 = self.decoder_rnn.initial_state()

        # initial "input feeding" vectors to feed decoder - 3*h
        init_input_feeding = dn.lookup_batch(self.init_lookup,
                                             [0] * batch_size)

        # initial feedback embeddings for the decoder, use begin seq symbol embedding
        init_feedback = dn.lookup_batch(
            self.output_lookup, [self.y2int[common.BEGIN_SEQ]] * batch_size)

        # init decoder rnn
        decoder_init = dn.concatenate([init_feedback, init_input_feeding])
        s = s_0.add_input(decoder_init)

        # loss per timestep
        losses = []

        # run the decoder through the output sequences and aggregate loss
        for i, step_word_ids in enumerate(output_word_ids):

            # returns h x batch size matrix
            decoder_rnn_output = s.output()

            # compute attention context vector for each sequence in the batch (returns 2h x batch size matrix)
            attention_output_vector, alphas = self.attend(
                encoded_inputs, decoder_rnn_output, input_masks)

            # compute output scores (returns vocab_size x batch size matrix)
            # h = readout * attention_output_vector + bias
            h = dn.affine_transform(
                [self.bias, self.readout, attention_output_vector])

            # encourage diversity by punishing highly confident predictions
            # TODO: support batching - esp. w.r.t. scalar inputs
            if self.diverse:
                soft = dn.softmax(dn.tanh(h))
                batch_loss = dn.pick_batch(-dn.log(soft), step_word_ids) \
                    - dn.log(dn.scalarInput(1) - dn.pick_batch(soft, step_word_ids)) - dn.log(dn.scalarInput(4))
            else:
                # get batch loss for this timestep
                batch_loss = dn.pickneglogsoftmax_batch(h, step_word_ids)

            # mask the loss if at least one sentence is shorter
            if output_masks and output_masks[i][-1] != 1:
                mask_expr = dn.inputVector(output_masks[i])
                # noinspection PyArgumentList
                mask_expr = dn.reshape(mask_expr, (1, ), batch_size)
                batch_loss = batch_loss * mask_expr

            # input feeding approach - input h (attention_output_vector) to the decoder
            # prepare for the next iteration - "feedback"
            feedback_embeddings = dn.lookup_batch(self.output_lookup,
                                                  step_word_ids)
            decoder_input = dn.concatenate(
                [feedback_embeddings, attention_output_vector])
            s = s.add_input(decoder_input)

            losses.append(batch_loss)

        # sum the loss over the time steps and batch
        total_batch_loss = dn.sum_batches(dn.esum(losses))

        return total_batch_loss
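
When sequences in a batch end at different timesteps, the per-timestep batch loss is multiplied by a 0/1 mask vector so finished (padded) positions contribute nothing to the total. A NumPy sketch of that masking over a toy batch (losses are illustrative values only):

import numpy as np

# per-timestep losses for a batch of 3 sequences over 4 timesteps
losses = np.array([[0.5, 0.2, 0.9],
                   [0.4, 0.3, 0.7],
                   [0.6, 0.1, 0.9],
                   [0.8, 0.2, 0.5]])
# masks[t, b] = 1 while sequence b is still running, 0 once it has ended
masks = np.array([[1, 1, 1],
                  [1, 1, 1],
                  [1, 1, 0],
                  [1, 1, 0]], dtype=float)

total_batch_loss = np.sum(losses * masks)  # padded positions are zeroed out
print(total_batch_loss)  # ~4.7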
Example #19
    def fit(self,
            train_X,
            train_Y,
            num_epochs,
            val_X=None,
            val_Y=None,
            patience=2,
            model_path=None,
            seed=None,
            word_dropout_rate=0.25,
            trg_vectors=None,
            unsup_weight=1.0,
            labeled_weight_proportion=1.0):
        """
        train the model
        :param trg_vectors: the prediction targets used for the unsupervised loss
                            in temporal ensembling
        :param unsup_weight: weight for the unsupervised consistency loss
                                    used in temporal ensembling
        """
        if seed:
            print(">>> using seed: ", seed, file=sys.stderr)
            random.seed(seed)  #setting random seed

        assert(train_X.shape[0] == len(train_Y)), \
            '# examples %d != # labels %d.' % (train_X.shape[0], len(train_Y))
        train_data = list(zip(train_X, train_Y))

        print('Starting training for %d epochs...' % num_epochs)
        best_val_f1, epochs_no_improvement = 0., 0
        if val_X is not None and val_Y is not None and model_path is not None:
            print('Using early stopping with patience of %d...' % patience)
        for cur_iter in range(num_epochs):
            bar = Bar('Training epoch %d/%d...' % (cur_iter + 1, num_epochs),
                      max=len(train_data),
                      flush=True)
            total_loss = 0.0

            random_indices = np.arange(len(train_data))
            random.shuffle(random_indices)

            for i, idx in enumerate(random_indices):

                x, y = train_data[idx]
                output = self.predict(x,
                                      train=True,
                                      dropout_rate=word_dropout_rate)
                # in temporal ensembling, we assign a dummy label of -1 for
                # unlabeled sequences; we skip the supervised loss for these
                loss = dynet.scalarInput(0) if y == -1 else self.pick_neg_log(
                    output, y)

                if trg_vectors is not None:
                    # the consistency loss in temporal ensembling is used for
                    # both supervised and unsupervised input
                    target = trg_vectors[idx]

                    other_loss = dynet.squared_distance(
                        output, dynet.inputVector(target))

                    if y != -1:
                        other_loss *= labeled_weight_proportion
                    loss += other_loss * unsup_weight
                total_loss += loss.value()

                loss.backward()
                self.trainer.update()
                bar.next()

            print(" iter {2} {0:>12}: {1:.2f}".format(
                "total loss", total_loss / len(train_data), cur_iter),
                  file=sys.stderr)

            if val_X is not None and val_Y is not None and model_path is not None:
                # get the best F1 score on the validation set
                val_f1 = self.evaluate(val_X, val_Y)

                if val_f1 > best_val_f1:
                    print('F1 %.4f is better than best val F1 %.4f.' %
                          (val_f1, best_val_f1))
                    best_val_f1 = val_f1
                    epochs_no_improvement = 0
                    save_model(self, model_path)
                else:
                    print('F1 %.4f is worse than best val F1 %.4f.' %
                          (val_f1, best_val_f1))
                    epochs_no_improvement += 1
                if epochs_no_improvement == patience:
                    print('No improvement for %d epochs. Early stopping...' %
                          epochs_no_improvement)
                    break
Example #20
 def pick_neg_log(self, pred, gold):
     if hasattr(gold, "__len__"):
         # calculate cross-entropy loss against the whole vector
         dy_gold = dynet.inputVector(gold)
         return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
     return -dynet.log(dynet.pick(pred, gold))
Example #21
    def fit(self,
            train_dict,
            num_epochs,
            val_X=None,
            val_Y=None,
            patience=2,
            model_path=None,
            seed=None,
            word_dropout_rate=0.25,
            trg_vectors=None,
            unsup_weight=1.0,
            orthogonality_weight=0.0,
            adversarial=False):
        """
        train the model
        :param trg_vectors: the prediction targets used for the unsupervised loss
                            in temporal ensembling
        :param unsup_weight: weight for the unsupervised consistency loss
                                    used in temporal ensembling
        """
        if seed:
            print(">>> using seed: ", seed, file=sys.stderr)
            random.seed(seed)  #setting random seed

        train_data = []
        for task, task_dict in train_dict.items():
            for key in ["X", "Y", "domain"]:
                assert key in task_dict, "Error: %s is not available." % key
            examples, labels, domain_tags = task_dict["X"], task_dict["Y"], \
                                            task_dict["domain"]
            assert examples.shape[0] == len(labels)

            # train data is a list of 4-tuples: (example, label, task_id, domain_id)
            train_data += list(
                zip(examples, labels, [[task] * len(labels)][0], domain_tags))

        print('Starting training for %d epochs...' % num_epochs)
        best_val_f1, epochs_no_improvement = 0., 0

        if val_X is not None and val_Y is not None and model_path is not None:
            print('Using early stopping with patience of %d...' % patience)

        for cur_iter in range(num_epochs):
            bar = Bar('Training epoch %d/%d...' % (cur_iter + 1, num_epochs),
                      max=len(train_data),
                      flush=True)
            total_loss, total_constraint, total_adversarial = 0.0, 0.0, 0.0

            random_indices = np.arange(len(train_data))
            random.shuffle(random_indices)

            for i, idx in enumerate(random_indices):

                x, y, task_id, domain_id = train_data[idx]
                task_ids = [task_id]

                if task_id == 'src':
                    # we train both F0 and F1 on source data
                    task_ids = ['F0', 'F1']
                elif task_id == 'src_all':
                    # we train F0, F1, and Ft on source data for base training
                    task_ids = ['F0', 'F1', 'Ft']

                loss = 0
                outputs, constraint, adv = self.predict(
                    x,
                    task_ids,
                    train=True,
                    dropout_rate=word_dropout_rate,
                    orthogonality_weight=orthogonality_weight,
                    domain_id=domain_id if adversarial else None)

                # in temporal ensembling, we assign a dummy label of -1 for
                # unlabeled sequences; we skip the supervised loss for these
                for output in outputs:
                    loss += dynet.scalarInput(0) if y == -1 else self.pick_neg_log(output, y)

                    if trg_vectors is not None:
                        # the consistency loss in temporal ensembling is used for
                        # both supervised and unsupervised input
                        target = trg_vectors[idx]

                        other_loss = dynet.squared_distance(
                            output, dynet.inputVector(target))
                        loss += other_loss * unsup_weight

                # the orthogonality weight is the same for every prediction,
                # so we can add it in the end
                if orthogonality_weight != 0.0:
                    # add the orthogonality constraint to the loss
                    loss += constraint * orthogonality_weight
                    total_constraint += constraint.value()
                if adversarial:
                    total_adversarial += adv.value()
                    loss += adv

                total_loss += loss.value()
                loss.backward()
                self.trainer.update()
                bar.next()

            print(
                "\niter {}. Total loss: {:.3f}, total penalty: {:.3f}, adv: {:.3f}"
                .format(cur_iter, total_loss / len(train_data),
                        total_constraint / len(train_data),
                        total_adversarial / len(train_data)),
                file=sys.stderr)

            if val_X is not None and val_Y is not None and model_path is not None:
                # get the best F1 score on the validation set
                val_f1 = self.evaluate(val_X, val_Y, 'F0')

                if val_f1 > best_val_f1:
                    print('F1 %.4f is better than best val F1 %.4f.' %
                          (val_f1, best_val_f1))
                    best_val_f1 = val_f1
                    epochs_no_improvement = 0
                    save_mttri_model(self, model_path)
                else:
                    print('F1 %.4f is worse than best val F1 %.4f.' %
                          (val_f1, best_val_f1))
                    epochs_no_improvement += 1
                if epochs_no_improvement == patience:
                    print('No improvement for %d epochs. Early stopping...' %
                          epochs_no_improvement)
                    break