Example #1
def build_model(first_level, model, emb_doc, doc_labels, w_param, b_param):
    """
    Runs the model for training, calculating the loss. 
    @params: first_level is I, O, or P,
             model is the LSTM model,
             emb_doc is a numpy array of embeddings for one document,
             doc_labels is a list of the labels associated with emb_doc,
             w_param is a Dynet parameter multiplied with the layer output,
             b_param is a Dynet parameter added to the product of output and w_param.
    @returns: the sum of the errors computed for the document
    """
    dy.renew_cg()
    s = model.initial_state()
    i = dy.vecInput(200)
    o = dy.vecInput(200)
    p = dy.vecInput(200)
    si = s.add_input(i)
    so = s.add_input(o)
    sp = s.add_input(p)
    loss = []

    for wdemb, label in zip(emb_doc, doc_labels):
        x = dy.inputVector(wdemb)
        x = dy.noise(x, 0.5)  # noise for student model (dy.noise returns a new expression)
        if first_level == 'I':
            s2 = si.add_input(x)
        elif first_level == 'O':
            s2 = so.add_input(x)
        else:
            s2 = sp.add_input(x)
        loss.append(
            dy.pickneglogsoftmax((w_param * s2.output()) + b_param, label))
    return dy.esum(loss)
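A side note on the fix in the loop above: dy.noise is purely functional and returns a new expression, so its result has to be rebound. A minimal standalone sketch (not from the repository above):

import dynet as dy

# Minimal sketch: dy.noise leaves its input unchanged and returns a new
# expression with Gaussian noise (stddev 0.5 here) added to every element.
dy.renew_cg()
x = dy.inputVector([1.0, 2.0, 3.0])
x_noisy = dy.noise(x, 0.5)
print(x.npvalue())        # original values
print(x_noisy.npvalue())  # perturbed values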
Example #2
    def predict(self, word_indices, char_indices, task_id, train=False):
        """
        predict tags for a sentence represented as char+word embeddings
        """

        # word embeddings
        wfeatures = [self.wembeds[w] for w in word_indices]

        # char embeddings
        if self.c_in_dim > 0:
            char_emb = []
            rev_char_emb = []
            # get representation for words
            for chars_of_token in char_indices:
                char_feats = [self.cembeds[c] for c in chars_of_token]
                # use last state as word representation
                f_char, b_char = self.char_rnn.predict_sequence(char_feats, char_feats)
                last_state = f_char[-1]
                rev_last_state = b_char[-1]
                char_emb.append(last_state)
                rev_char_emb.append(rev_last_state)

            features = [dynet.concatenate([w,c,rev_c]) for w,c,rev_c in zip(wfeatures,char_emb,rev_char_emb)]
        else:
            features = wfeatures
        
        if train: # only do at training time
            features = [dynet.noise(fe,self.noise_sigma) for fe in features]

        output_expected_at_layer = self.predictors["task_expected_at"][task_id]
        output_expected_at_layer -= 1

        # go through layers
        # input is now combination of w + char emb
        prev = features
        prev_rev = features
        num_layers = self.h_layers

        for i in range(0,num_layers):
            predictor = self.predictors["inner"][i]
            forward_sequence, backward_sequence = predictor.predict_sequence(prev, prev_rev)        
            if i > 0 and self.activation:
                # activation between LSTM layers
                forward_sequence = [self.activation(s) for s in forward_sequence]
                backward_sequence = [self.activation(s) for s in backward_sequence]

            if i == output_expected_at_layer:
                output_predictor = self.predictors["output_layers_dict"][task_id] 
                concat_layer = [dynet.concatenate([f, b]) for f, b in zip(forward_sequence,reversed(backward_sequence))]

                if train and self.noise_sigma > 0.0:
                    concat_layer = [dynet.noise(fe,self.noise_sigma) for fe in concat_layer]
                output = output_predictor.predict_sequence(concat_layer)
                return output

            prev = forward_sequence
            prev_rev = backward_sequence 

        raise Exception("oops should not be here")
Example #3
def build_tagging_graph_lvl1(words, tags, builders):
    dy.renew_cg()
    f_init, b_init = [b.initial_state() for b in builders]

    wembs = [E[w] for w in words]
    wembs = [dy.noise(we, 0.1) for we in wembs]

    fw = [x.output() for x in f_init.add_inputs(wembs)]
    bw = [x.output() for x in b_init.add_inputs(reversed(wembs))]

    # fw_rnn_hidden_outs = [x.value() for x in fw]
    # bw_rnn_hidden_outs = [x.value() for x in bw]

    # print ("Transducing")
    # fw_rnn_hidden_outs = f_init.transduce(wembs)
    # bw_rnn_hidden_outs = b_init.transduce(reversed(wembs))

    if MLP:
        H = dy.parameter(pH)
        O = dy.parameter(pO)
    else:
        O = dy.parameter(pO)
    errs = []
    for f, b, t in zip(fw, reversed(bw), tags):
        f_b = dy.concatenate([f, b])
        if MLP:
            r_t = O * (dy.tanh(H * f_b))
        else:
            r_t = O * f_b
        err = dy.pickneglogsoftmax(r_t, t)
        errs.append(err)

    return {'err': dy.esum(errs), 'fw': fw, 'bw': bw}
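This function relies on module-level names (E, pH, pO, MLP, builders) defined elsewhere in its repository. A hypothetical setup that would make it callable could look like the sketch below; every dimension is an illustrative placeholder, not a value from the original code.

import dynet as dy

# Hypothetical globals for build_tagging_graph_lvl1; sizes are placeholders.
model = dy.ParameterCollection()
VOCAB_SIZE, NUM_TAGS, EMB_DIM, HID_DIM, MLP_DIM = 1000, 17, 128, 64, 32

E = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))   # word embeddings
pH = model.add_parameters((MLP_DIM, 2 * HID_DIM))        # MLP hidden layer
pO = model.add_parameters((NUM_TAGS, MLP_DIM))           # output layer
MLP = True
builders = [dy.LSTMBuilder(1, EMB_DIM, HID_DIM, model),  # forward LSTM
            dy.LSTMBuilder(1, EMB_DIM, HID_DIM, model)]  # backward LSTM

# One training step on dummy word/tag indices.
trainer = dy.SimpleSGDTrainer(model)
result = build_tagging_graph_lvl1([4, 8, 15], [1, 0, 2], builders)
loss_value = result['err'].scalar_value()
result['err'].backward()
trainer.update()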
Example #4
    def build_tagging_graph(self, words, tags):
        dy.renew_cg()
        f_init, b_init = [b.initial_state() for b in self.first_layer_builders]

        wembs = [self.E[w] for w in words]
        wembs = [dy.noise(we, 0.1) for we in wembs]

        fw = [x.output() for x in f_init.add_inputs(wembs)]
        bw = [x.output() for x in b_init.add_inputs(reversed(wembs))]

        errs = []
        output_from_first_layer = [
            dy.concatenate([f, b]) for f, b in zip(fw, reversed(bw))
        ]

        f_init, b_init = [
            b.initial_state() for b in self.second_layer_builders
        ]
        fw = [x.output() for x in f_init.add_inputs(output_from_first_layer)]
        bw = [
            x.output()
            for x in b_init.add_inputs(reversed(output_from_first_layer))
        ]

        for f, b, t in zip(fw, reversed(bw), tags):
            f_b = dy.concatenate([f, b])
            r_t = self.pO * f_b

            err = dy.pickneglogsoftmax(r_t, t)
            errs.append(err)
        return dy.esum(errs)
Example #5
def build_tagging_graph(words, tags, builders):
    dy.renew_cg()
    f_init, b_init = [b.initial_state() for b in builders]

    wembs = [E[w] for w in words]
    wembs = [dy.noise(we,0.1) for we in wembs]

    fw = [x.output() for x in f_init.add_inputs(wembs)]
    bw = [x.output() for x in b_init.add_inputs(reversed(wembs))]

    if MLP:
        H = dy.parameter(pH)
        O = dy.parameter(pO)
    else:
        O = dy.parameter(pO)
    errs = []
    for f,b,t in zip(fw, reversed(bw), tags):
        f_b = dy.concatenate([f,b])
        if MLP:
            r_t = O*(dy.tanh(H * f_b))
        else:
            r_t = O * f_b
        err = dy.pickneglogsoftmax(r_t, t)
        errs.append(err)
    return dy.esum(errs)
Example #6
 def embed(self, x):
   if self.train and self.word_dropout > 0.0 and self.word_id_mask is None:
     batch_size = x.batch_size() if xnmt.batcher.is_batched(x) else 1
     self.word_id_mask = [set(np.random.choice(self.vocab_size, int(self.vocab_size * self.word_dropout), replace=False)) for _ in range(batch_size)]
   # single mode
   if not xnmt.batcher.is_batched(x):
     if self.train and self.word_id_mask and x in self.word_id_mask[0]:
       ret = dy.zeros((self.emb_dim,))
     else:
       ret = self.embeddings[x]
       if self.fix_norm is not None:
         ret = dy.cdiv(ret, dy.l2_norm(ret))
         if self.fix_norm != 1:
           ret *= self.fix_norm
   # minibatch mode
   else:
     ret = self.embeddings.batch(x)
     if self.fix_norm is not None:
       ret = dy.cdiv(ret, dy.l2_norm(ret))
       if self.fix_norm != 1:
         ret *= self.fix_norm
     if self.train and self.word_id_mask and any(x[i] in self.word_id_mask[i] for i in range(x.batch_size())):
       dropout_mask = dy.inputTensor(np.transpose([[0.0]*self.emb_dim if x[i] in self.word_id_mask[i] else [1.0]*self.emb_dim for i in range(x.batch_size())]), batched=True)
       ret = dy.cmult(ret, dropout_mask)
   if self.train and self.weight_noise > 0.0:
     ret = dy.noise(ret, self.weight_noise)
   return ret
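The word-dropout mask built at the top of embed samples, per batch element, a fixed fraction of vocabulary ids whose embeddings are zeroed out. A small numpy-only illustration of that sampling step (vocab_size, word_dropout, batch_size are made-up values):

import numpy as np

# Standalone illustration of the word-dropout mask construction above.
vocab_size, word_dropout, batch_size = 50, 0.1, 3
word_id_mask = [set(np.random.choice(vocab_size,
                                     int(vocab_size * word_dropout),
                                     replace=False))
                for _ in range(batch_size)]
# Each set holds the word ids whose embeddings are replaced by zero vectors
# for the corresponding batch element.
print(word_id_mask)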
Example #7
    def build_tagging_graph(self, words):
        dy.renew_cg()

        # Initialize the LSTMs
        f_init = self.fwdRNN.initial_state()
        b_init = self.bwdRNN.initial_state()

        cf_init = self.cFwdRNN.initial_state()
        cb_init = self.cBwdRNN.initial_state()

        # Get the word vectors, a 128-dim vector expression for each word.
        if self.hp.dynamic:
            wembs = [self.dynamic_rep(w, cf_init, cb_init) for w in words]
        else:
            wembs = [
                self.word_and_char_rep(w, cf_init, cb_init) for w in words
            ]

        if self.hp.noise > 0:
            wembs = [dy.noise(we, self.hp.noise) for we in wembs]

        # Feed word vectors into biLSTM
        fw_exps = f_init.transduce(wembs)
        bw_exps = b_init.transduce(reversed(wembs))

        # biLSTM states
        bi_exps = [
            dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))
        ]

        # Feed each biLSTM state to an MLP
        return [self.pO * (dy.tanh(self.pH * x)) for x in bi_exps]
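The expressions returned here are raw per-word tag scores; they still need to be turned into a loss at training time. A hedged sketch of that step, assuming a tagger object exposing the method above and gold tag ids (none of these names come from the original class):

import dynet as dy

# Hedged sketch: per-word negative log-likelihood from the score expressions
# returned by build_tagging_graph(words).
def sentence_loss(tagger, words, gold_tag_ids):
    scores = tagger.build_tagging_graph(words)   # one expression per word
    errs = [dy.pickneglogsoftmax(score, tag)
            for score, tag in zip(scores, gold_tag_ids)]
    return dy.esum(errs)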
Example #8
    def build_tagging_graph_for_chars(self, words):
        #self.lstm = dy.LSTMBuilder(NUM_LAYERS, INPUT_DIM, HIDDEN_DIM, model)

        wembs = self.convert_words_to_vecs(words)
        #wembs = [self.E[w] for w in words]
        wembs = [dy.noise(we, 0.1) for we in wembs]

        f_init, b_init = [
            b.initial_state() for b in self.char_flow_first_layer
        ]

        fw = [x.output() for x in f_init.add_inputs(wembs)]
        bw = [x.output() for x in b_init.add_inputs(reversed(wembs))]

        output_from_first_layer = [
            dy.concatenate([f, b]) for f, b in zip(fw, reversed(bw))
        ]

        f_init, b_init = [
            b.initial_state() for b in self.char_flow_second_layer
        ]

        fw = [x.output() for x in f_init.add_inputs(output_from_first_layer)]
        bw = [
            x.output()
            for x in b_init.add_inputs(reversed(output_from_first_layer))
        ]

        vector_result = []

        for f, b in zip(fw, reversed(bw)):
            f_b = dy.concatenate([f, b])
            vector_result.append(f_b)

        return vector_result
Example #9
 def embed(self, x: Union[batchers.Batch, numbers.Integral]) -> dy.Expression:
   if self.train and self.word_dropout > 0.0 and self.word_id_mask is None:
     batch_size = x.batch_size() if batchers.is_batched(x) else 1
     self.word_id_mask = [set(np.random.choice(self.vocab_size, int(self.vocab_size * self.word_dropout), replace=False)) for _ in range(batch_size)]
   emb_e = dy.parameter(self.embeddings)
   # single mode
   if not batchers.is_batched(x):
     if self.train and self.word_id_mask and x in self.word_id_mask[0]:
       ret = dy.zeros((self.emb_dim,))
     else:
       ret = dy.pick(emb_e, index=x)
       if self.fix_norm is not None:
         ret = dy.cdiv(ret, dy.l2_norm(ret))
         if self.fix_norm != 1:
           ret *= self.fix_norm
   # minibatch mode
   else:
     ret = dy.pick_batch(emb_e, x)
     if self.fix_norm is not None:
       ret = dy.cdiv(ret, dy.l2_norm(ret))
       if self.fix_norm != 1:
         ret *= self.fix_norm
     if self.train and self.word_id_mask and any(x[i] in self.word_id_mask[i] for i in range(x.batch_size())):
       dropout_mask = dy.inputTensor(np.transpose([[0.0]*self.emb_dim if x[i] in self.word_id_mask[i] else [1.0]*self.emb_dim for i in range(x.batch_size())]), batched=True)
       ret = dy.cmult(ret, dropout_mask)
   if self.train and self.weight_noise > 0.0:
     ret = dy.noise(ret, self.weight_noise)
   return ret
Example #10
    def get_embeddings(self, words, is_train):
        if is_train:
            self.char_lstm.set_dropout(self.dropout)
        else:
            self.char_lstm.disable_dropout()

        embeddings = []
        for pos, word in enumerate(words):
            count = self.word_vocab.count(word)
            if not count:
                word = UNK

            chars = list(word)
            char_lstm_outputs = self.char_lstm.transduce([
                self.char_embeddings[self.char_vocab.index_or_unk(char, UNK)]
                for char in chars
            ])
            char_embedding = dy.concatenate([
                char_lstm_outputs[-1][:self.char_lstm_dim],
                char_lstm_outputs[0][self.char_lstm_dim:]
            ])

            word_embedding = self.word_embeddings[self.word_vocab.index(word)]
            pos_embedding = self.pos_embeddings[pos]
            embeddings.append(
                dy.concatenate([char_embedding, word_embedding,
                                pos_embedding]))

        embeddings = [dy.noise(e, 0.1) for e in embeddings]
        return embeddings
Example #11
    def get_embeddings(self, words, is_train):
        if is_train:
            self.char_lstm.set_dropout(self.dropout)
        else:
            self.char_lstm.disable_dropout()

        embeddings = []
        for word in [START] + words + [STOP]:
            count = self.word_vocab.count(word)
            if not count or (is_train and np.random.rand() < 1 / (1 + count)):
                word = UNK

            chars = list(word) if word not in (START, STOP) else [word]
            char_lstm_outputs = self.char_lstm.transduce([
                self.char_embeddings[self.char_vocab.index_or_unk(char, UNK)]
                for char in [START] + chars + [STOP]
            ])
            char_embedding = dy.concatenate([
                char_lstm_outputs[-1][:self.char_lstm_dim],
                char_lstm_outputs[0][self.char_lstm_dim:]
            ])

            word_embedding = self.word_embeddings[self.word_vocab.index(word)]

            embeddings.append(dy.concatenate([char_embedding, word_embedding]))

        embeddings = [dy.noise(e, 0.1) for e in embeddings]
        return embeddings
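The UNK substitution above replaces a word with probability 1 / (1 + count) during training, so rare words fall back to UNK far more often than frequent ones. A tiny illustration of those probabilities (the counts are made up):

# Illustrative only: probability that a word with a given training-set count
# is replaced by UNK, following the 1 / (1 + count) rule used above.
for count in [0, 1, 5, 100]:
    p_unk = 1.0 if count == 0 else 1.0 / (1.0 + count)
    print(count, round(p_unk, 3))
# A word seen once is replaced half the time; a word seen 100 times almost never.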
Example #12
def build_tagging_graph(words, tags, builders, topic):
    dy.renew_cg()
    f_init, b_init = [b.initial_state() for b in builders]

    # embeddings
    wembs = [E[w] for w in words]
    wembs = [dy.noise(we,0.1) for we in wembs]

    # bilstm
    fw = [x.output() for x in f_init.add_inputs(wembs)]
    bw = [x.output() for x in b_init.add_inputs(reversed(wembs))]

    # MLP for tag prediction
    H = dy.parameter(pH)
    O = dy.parameter(pO)
    
    errs = []

    for f,b,t in zip(fw, reversed(bw), tags):
        f_b = dy.concatenate([f,b])
        r_t = O*(dy.tanh(H * f_b))
        # r_t = O * f_b
        err = dy.pickneglogsoftmax(r_t, t)
        errs.append(err)
    
    # add an extra layer with MLP to predict topic
    if TOPIC:
        # aux_layer = dy.reshape(dy.parameter(topic_olayer) * dy.parameter(topic_hlayer),(5000,1)) * f_b[-1]
        aux_layer = dy.parameter(topic_olayer) * (dy.tanh(dy.parameter(topic_hlayer)*f_b))
        aux_loss = dy.pickneglogsoftmax(aux_layer, topic)
        errs.append(aux_loss)
    return dy.esum(errs)
Example #13
    def build_tagging_graph(self, words, tags, builders):
        """
        Builds the graph for a single sentence.
        :param words:
        :param tags:
        :param builders:
        :return:
        """
        dy.renew_cg()
        f_init, b_init = [b.initial_state() for b in builders]

        wembs = [self.params["E"][w] for w in words]
        wembs = [dy.noise(we, 0.1) for we in wembs]

        fw = [x.output() for x in f_init.add_inputs(wembs)]
        bw = [x.output() for x in b_init.add_inputs(reversed(wembs))]

        if self.use_mlp:
            H = dy.parameter(self.params["H"])
            O = dy.parameter(self.params["O"])
        else:
            O = dy.parameter(self.params["O"])
        errs = []
        for f, b, t in zip(fw, reversed(bw), tags):
            f_b = dy.concatenate([f, b])
            if self.use_mlp:
                r_t = O * (dy.tanh(H * f_b))
            else:
                r_t = O * f_b
            err = dy.pickneglogsoftmax(r_t, t)
            errs.append(err)
        return dy.esum(errs)
Example #14
    def build_tagging_graph(self, words, tags):
        dy.renew_cg()
        #self.lstm = dy.LSTMBuilder(NUM_LAYERS, INPUT_DIM, HIDDEN_DIM, model)

        wembs = self.convert_words_to_vecs(words)
        wembs = [dy.noise(we, 0.1) for we in wembs]

        f_init, b_init = [b.initial_state() for b in self.first_layer]

        fw = [x.output() for x in f_init.add_inputs(wembs)]
        bw = [x.output() for x in b_init.add_inputs(reversed(wembs))]

        output_from_first_layer = [
            dy.concatenate([f, b]) for f, b in zip(fw, reversed(bw))
        ]

        f_init, b_init = [b.initial_state() for b in self.second_layer]

        fw = [x.output() for x in f_init.add_inputs(output_from_first_layer)]
        bw = [
            x.output()
            for x in b_init.add_inputs(reversed(output_from_first_layer))
        ]

        errs = []

        for f, b, t in zip(fw, reversed(bw), tags):
            f_b = dy.concatenate([f, b])
            r_t = self.pO * f_b

            err = dy.pickneglogsoftmax(r_t, t)
            errs.append(err)
        return dy.esum(errs)
Example #15
 def __call__(self, p, train=True):
   """
   Args:
      p: DyNet parameter (not expression)
     train: only apply noise if True
   Return:
     DyNet expression with weight noise applied if self.std > 0
   """
   p_expr = dy.parameter(p)
   if self.std > 0.0 and train:
     p_expr = dy.noise(p_expr, self.std)
   return p_expr
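For context, __call__ above is a method of a small weight-noise helper; a hypothetical minimal host class (only std and the method body are taken from the snippet itself) might look like this:

import dynet as dy

class WeightNoise(object):
    """Hypothetical minimal host class for the __call__ method shown above."""
    def __init__(self, std):
        self.std = std

    def __call__(self, p, train=True):
        p_expr = dy.parameter(p)                 # parameter -> expression
        if self.std > 0.0 and train:
            p_expr = dy.noise(p_expr, self.std)  # Gaussian weight noise
        return p_expr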
Example #16
def build_tagging_graph_old(words, tags, template, builders, train=True):
    dy.renew_cg()

    if train and args.lstm_dropout is not None and args.lstm_dropout > 0:
        for b in builders:
            b.set_dropouts(args.lstm_dropout, args.lstm_dropout)

    f_init, b_init = [b.initial_state() for b in builders]

    wembs = [dy.lookup(pEmbedding, w) for w in words]
    if train:  # Add noise in training as a regularizer
        wembs = [dy.noise(we, args.train_noise) for we in wembs]

    fw_states = [x for x in f_init.add_inputs(wembs)]
    bw_states = [x for x in b_init.add_inputs(reversed(wembs))]
    fw = [x.output() for x in fw_states]
    bw = [x.output() for x in bw_states]

    O = dy.parameter(pOutput)
    if args.mlp:
        H = dy.parameter(pHidden)
    errs = []
    pred_tags = []
    for f, b, t in zip(fw, reversed(bw), tags):
        f_b = dy.concatenate([f, b])
        if args.mlp:
            f_b = dy.tanh(H * f_b)
        r_t = O * f_b
        if train:
            err = dy.pickneglogsoftmax(r_t, t)
            errs.append(err)
        else:
            out = dy.softmax(r_t)
            chosen = np.argmax(out.npvalue())
            pred_tags.append(vocab_tags.i2w[chosen])

    O_template = dy.parameter(pOutputTemplate)
    H_template = dy.parameter(pHiddenTemplate)
    f_bt = dy.concatenate([fw_states[-1].s()[0], bw_states[-1].s()[0]])
    f_bt = dy.tanh(H_template * f_bt)
    r_tt = O_template * f_bt
    pred_template = None
    if train:
        err = dy.pickneglogsoftmax(r_tt, template)
        errs.append(err)
    else:
        out = dy.softmax(r_tt)
        chosen = np.argmax(out.npvalue())
        pred_template = vocab_templates.i2w[chosen]

    return pred_tags, pred_template, errs
Example #17
 def embed(self, x: Union[batchers.Batch,
                          numbers.Integral]) -> dy.Expression:
     """
 Embed a single word in a sentence.
 :param x: A word id.
 :return: Embedded word.
 """
     ret = self._embed_word(x, batchers.is_batched(x))
     ## Applying Fix normalization
     if self.fix_norm is not None:
         ret = dy.cdiv(ret, dy.l2_norm(ret)) * self.fix_norm
     ## Weight noise only when training
     if self.train and self.weight_noise > 0.0:
         ret = dy.noise(ret, self.weight_noise)
     return ret
Example #18
def get_base_embeddings(trainmode, unkdtokens, tg_start, sentence):
    sentlen = len(unkdtokens)

    if trainmode:
        emb_x = [dy.noise(v_x[tok], 0.1) for tok in unkdtokens]
    else:
        emb_x = [v_x[tok] for tok in unkdtokens]
    pos_x = [p_x[pos] for pos in sentence.postags]
    dist_x = [dy.scalarInput(i - tg_start + 1) for i in range(sentlen)]

    baseinp_x = [(w_i * dy.concatenate([emb_x[j], pos_x[j], dist_x[j]]) + b_i)
                 for j in range(sentlen)]

    if USE_WV:
        for j in range(sentlen):
            if unkdtokens[j] in wvs:
                nonupdatedwv = dy.nobackprop(e_x[unkdtokens[j]])
                baseinp_x[j] = baseinp_x[j] + w_e * nonupdatedwv + b_e

    embposdist_x = [dy.rectify(baseinp_x[j]) for j in range(sentlen)]

    if USE_DROPOUT:
        basefwdlstm.set_dropout(DROPOUT_RATE)
        baserevlstm.set_dropout(DROPOUT_RATE)
    bfinit = basefwdlstm.initial_state()
    basefwd = bfinit.transduce(embposdist_x)
    brinit = baserevlstm.initial_state()
    baserev = brinit.transduce(reversed(embposdist_x))
    basebi_x = [
        dy.rectify(
            w_bi *
            dy.concatenate([basefwd[eidx], baserev[sentlen - eidx - 1]]) +
            b_bi) for eidx in range(sentlen)
    ]

    if USE_DEPS:
        dhead_x = [embposdist_x[dephead] for dephead in sentence.depheads]
        dheadp_x = [pos_x[dephead] for dephead in sentence.depheads]
        drel_x = [dr_x[deprel] for deprel in sentence.deprels]
        baseinp_x = [
            dy.rectify(w_di * dy.concatenate(
                [dhead_x[j], dheadp_x[j], drel_x[j], basebi_x[j]]) + b_di)
            for j in range(sentlen)
        ]
        basebi_x = baseinp_x

    return basebi_x
Example #19
    def _build_computation_graph(self, words, train_mode=True):
        """
        Builds the computational graph.
        """
        dy.renew_cg()
        # turn parameters into expressions
        softmax_weight_exp = dy.parameter(self.softmax_weight)
        softmax_bias_exp = dy.parameter(self.softmax_bias)

        # initialize the RNNs
        f_init = self.fwd_word_rnn.initial_state()
        b_init = self.bwd_word_rnn.initial_state()

        # cf_init = self.fwd_char_rnn.initial_state()
        # cb_init = self.bwd_char_rnn.initial_state()

        # only use word-level for now
        word_reps = [self._word_rep(word) for word in words]

        if train_mode and self.add_word_noise:
            word_reps = [dy.noise(word_rep, 0.05) for word_rep in word_reps]

        # feed word vectors into biLSTM
        fw_exps = f_init.transduce(word_reps)
        bw_exps = b_init.transduce(reversed(word_reps))

        if self.pooling_method == "last":
            average_lstm = dy.concatenate([fw_exps[-1], bw_exps[-1]])
        else:
            bi_exps = [
                dy.concatenate([f, b])
                for f, b in zip(fw_exps, reversed(bw_exps))
            ]
            bi_exps = dy.concatenate(bi_exps, d=1)

            if self.pooling_method == "average":
                average_lstm = dy.mean_dim(bi_exps, d=1)
            elif self.pooling_method == "max":
                average_lstm = dy.max_dim(bi_exps, d=1)
            else:
                raise NotImplementedError

        if self.average_dropout is not None:
            average_lstm = dy.dropout(average_lstm, p=self.average_dropout)

        return softmax_weight_exp * average_lstm + softmax_bias_exp
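The pooling_method switch above reduces the per-token biLSTM states to a single sentence vector. A toy numpy illustration of the three reductions over a fake (hidden_dim, seq_len) matrix (shapes and values are arbitrary):

import numpy as np

# Toy illustration of "last" vs "average" vs "max" pooling over time.
states = np.random.randn(6, 4)       # (hidden_dim, seq_len)
last = states[:, -1]                 # final time step
average = states.mean(axis=1)        # mean over the time dimension
maximum = states.max(axis=1)         # elementwise max over time
print(last.shape, average.shape, maximum.shape)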
Example #20
def build_tagging_graph(words):
    dy.renew_cg()
    # parameters -> expressions
    H = dy.parameter(pH)
    O = dy.parameter(pO)

    # initialize the RNNs
    f_init = fwdRNN.initial_state()
    b_init = bwdRNN.initial_state()

    cf_init = cFwdRNN.initial_state()
    cb_init = cBwdRNN.initial_state()

    # get the word vectors. word_rep(...) returns a 128-dim vector expression for each word.
    wembs = [word_rep(w, cf_init, cb_init) for w in words]
    wembs = [dy.noise(we, 0.2) for we in wembs]  # optional

    # feed word vectors into biLSTM
    fw_exps = f_init.transduce(wembs)
    bw_exps = b_init.transduce(reversed(wembs))
    # OR
    #    fw_exps = []
    #    s = f_init
    #    for we in wembs:
    #        s = s.add_input(we)
    #        fw_exps.append(s.output())
    #    bw_exps = []
    #    s = b_init
    #    for we in reversed(wembs):
    #        s = s.add_input(we)
    #        bw_exps.append(s.output())

    # biLSTM states
    bi_exps = [
        dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))
    ]

    # feed each biLSTM state to an MLP
    exps = []
    for x in bi_exps:
        r_t = O * (dy.tanh(H * x))
        exps.append(r_t)

    return exps
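At prediction time the score expressions returned above are typically evaluated and argmax-ed per word. A hedged sketch, assuming an id-to-tag mapping i2w (analogous to the vocab objects used by other examples on this page):

import numpy as np

# Hedged sketch: decode predicted tag strings from the score expressions
# returned by build_tagging_graph(words). i2w maps tag id -> tag string.
def tag_sentence(words, i2w):
    scores = build_tagging_graph(words)
    return [i2w[int(np.argmax(score.npvalue()))] for score in scores]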
Example #21
def build_tagging_graph(words):
    dy.renew_cg()
    # parameters -> expressions
    H = dy.parameter(pH)
    O = dy.parameter(pO)

    # initialize the RNNs
    f_init = fwdRNN.initial_state()
    b_init = bwdRNN.initial_state()

    cf_init = cFwdRNN.initial_state()
    cb_init = cBwdRNN.initial_state()

    # get the word vectors. word_rep(...) returns a 128-dim vector expression for each word.
    wembs = [word_rep(w, cf_init, cb_init) for w in words]
    wembs = [dy.noise(we,0.2) for we in wembs] # optional

    # feed word vectors into biLSTM
    fw_exps = f_init.transduce(wembs)
    bw_exps = b_init.transduce(reversed(wembs))
# OR
#    fw_exps = []
#    s = f_init
#    for we in wembs:
#        s = s.add_input(we)
#        fw_exps.append(s.output())
#    bw_exps = []
#    s = b_init
#    for we in reversed(wembs):
#        s = s.add_input(we)
#        bw_exps.append(s.output())

    # biLSTM states
    bi_exps = [dy.concatenate([f,b]) for f,b in zip(fw_exps, reversed(bw_exps))]

    # feed each biLSTM state to an MLP
    exps = []
    for x in bi_exps:
        r_t = O*(dy.tanh(H * x))
        exps.append(r_t)

    return exps
Example #22
    def build_tagging_graph(self, words, tags):
        dy.renew_cg()
        prefix_indices = [p[0] for p in words]
        suffix_indices = [p[2] for p in words]
        words = [p[1] for p in words]
        wembs = []
        for w, p, s in zip(words, prefix_indices, suffix_indices):
            we = self.E[w]
            pe = self.preffix[p]
            se = self.suffix[s]
            wembs.append(dy.esum([we, pe, se]))

        f_init, b_init = [b.initial_state() for b in self.first_layer]

        #wembs = [self.Word_E[w] for w in words]
        wembs = [dy.noise(we, 0.1) for we in wembs]

        fw = [x.output() for x in f_init.add_inputs(wembs)]
        bw = [x.output() for x in b_init.add_inputs(reversed(wembs))]

        output_from_first_layer = [
            dy.concatenate([f, b]) for f, b in zip(fw, reversed(bw))
        ]

        f_init, b_init = [b.initial_state() for b in self.second_layer]

        fw = [x.output() for x in f_init.add_inputs(output_from_first_layer)]
        bw = [
            x.output()
            for x in b_init.add_inputs(reversed(output_from_first_layer))
        ]

        errs = []

        for f, b, t in zip(fw, reversed(bw), tags):
            f_b = dy.concatenate([f, b])
            r_t = self.pO * f_b

            err = dy.pickneglogsoftmax(r_t, t)
            errs.append(err)
        return dy.esum(errs)
Example #23
    def update_batch(self, words_batch, tags_batch):
        dynet.renew_cg()
        length = max(len(words) for words in words_batch)
        word_ids = np.zeros((length, len(words_batch)), dtype='int32')
        for j, words in enumerate(words_batch):
            for i, word in enumerate(words):
                word_ids[i, j] = self.vw.w2i.get(word, self.UNK)
        tag_ids = np.zeros((length, len(words_batch)), dtype='int32')
        for j, tags in enumerate(tags_batch):
            for i, tag in enumerate(tags):
                tag_ids[i, j] = self.vt.w2i.get(tag, self.UNK)
        wembs = [
            dynet.lookup_batch(self._E, word_ids[i]) for i in range(length)
        ]
        wembs = [dynet.noise(we, 0.1) for we in wembs]

        f_state = self._fwd_lstm.initial_state()
        b_state = self._bwd_lstm.initial_state()

        fw = [x.output() for x in f_state.add_inputs(wembs)]
        bw = [x.output() for x in b_state.add_inputs(reversed(wembs))]

        H = dynet.parameter(self._pH)
        O = dynet.parameter(self._pO)

        errs = []
        for i, (f, b) in enumerate(zip(fw, reversed(bw))):
            f_b = dynet.concatenate([f, b])
            r_t = O * (dynet.tanh(H * f_b))
            err = dynet.pickneglogsoftmax_batch(r_t, tag_ids[i])
            errs.append(dynet.sum_batches(err))
        sum_errs = dynet.esum(errs)
        losses = sum_errs.scalar_value()
        sum_errs.backward()
        self._sgd.update()
        return losses
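update_batch leans on DyNet's implicit minibatching: lookup_batch, pickneglogsoftmax_batch and sum_batches all operate on the batch dimension. A small self-contained sketch of those primitives (all sizes are placeholders):

import dynet as dy

# Standalone sketch of the batched primitives used above; sizes are arbitrary.
model = dy.ParameterCollection()
E = model.add_lookup_parameters((100, 16))   # vocab x embedding dim
W = model.add_parameters((5, 16))            # tags x embedding dim

dy.renew_cg()
word_ids = [3, 7, 42]                        # same time step, batch of 3
tag_ids = [0, 2, 1]
emb = dy.lookup_batch(E, word_ids)           # one expression with batch dim 3
scores = dy.parameter(W) * emb               # tag scores per batch element
loss = dy.sum_batches(dy.pickneglogsoftmax_batch(scores, tag_ids))
print(loss.scalar_value())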
Example #24
def build_tagging_graph(words):
    lm_wembs = []
    if HASLM:
        lm_wembs = calc_lm_embdding(words)

    dy.renew_cg()
    H = dy.parameter(pH)
    O = dy.parameter(pO)

    f_init = fwdRNN.initial_state()
    b_init = bwdRNN.initial_state()
    cf_init = cFwdRNN.initial_state()
    cb_init = cBwdRNN.initial_state()

    wembs = [word_rep(w, cf_init, cb_init) for w in words]
    if HASLM:
        wembs1 = []
        for lmw, w in zip(lm_wembs, wembs):
            wv = w.value()
            wv.extend(lmw)
            wembs1.append(wv)
        wembs = [dy.inputTensor(w) for w in wembs1]
    wembs = [dy.noise(we, 0.1) for we in wembs]

    fw_exps = f_init.transduce(wembs)
    bw_exps = b_init.transduce(reversed(wembs))
    bi_exps = [
        dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))
    ]

    exps = []
    for x in bi_exps:
        r_t = O * (dy.tanh(H * x))
        exps.append(r_t)

    return exps
Example #25
    def build_tagging_graph(self, words, tags):
        dy.renew_cg()
        words_for_char = [w[1] for w in words]
        words = [w[0] for w in words]
        f_init, b_init = [b.initial_state() for b in self.word_first_layer]

        wembs = [self.Word_E[w] for w in words]
        wembs = [dy.noise(we, 0.1) for we in wembs]

        fw = [x.output() for x in f_init.add_inputs(wembs)]
        bw = [x.output() for x in b_init.add_inputs(reversed(wembs))]

        output_from_first_layer = [
            dy.concatenate([f, b]) for f, b in zip(fw, reversed(bw))
        ]

        f_init, b_init = [b.initial_state() for b in self.word_second_layer]

        fw = [x.output() for x in f_init.add_inputs(output_from_first_layer)]
        bw = [
            x.output()
            for x in b_init.add_inputs(reversed(output_from_first_layer))
        ]

        errs = []
        char_lstm_vectors = self.build_tagging_graph_for_chars(words_for_char)

        for f, b, chars_vec, t in zip(fw, reversed(bw), char_lstm_vectors,
                                      tags):
            f_b = dy.concatenate([f, b])
            con_cat = dy.concatenate([f_b, chars_vec])
            r_t = self.pO * con_cat

            err = dy.pickneglogsoftmax(r_t, t)
            errs.append(err)
        return dy.esum(errs)
Example #26
# Picking values from vector expressions
e = dy.pick(e1, k)  # k is unsigned integer, e1 is vector. return e1[k]
e = e1[k]  # same

e = dy.pickrange(
    e1, k,
    v)  # like python's e1[k:v] for lists. e1 is an Expression, k,v integers.
e = e1[k:v]  # same

e = dy.pickneglogsoftmax(
    e1, k)  # k is unsigned integer; equivalent to dy.pick(-dy.log(dy.softmax(e1)), k)

# Neural net stuff
dy.noise(
    e1, stddev
)  # add Gaussian noise (standard deviation stddev) to each element
dy.dropout(e1, p)  # apply dropout with probability p

# functions over lists of expressions
e = dy.esum([e1, e2, ...])  # sum
e = dy.average([e1, e2, ...])  # average
e = dy.concatenate_cols(
    [e1, e2, ...]
)  # e1, e2, ... are column vectors; returns a matrix (similar to np.hstack([e1, e2, ...]))
e = dy.concatenate([e1, e2, ...])  # concatenate

e = dy.affine_transform([e0, e1, e2, ...])  # e = e0 + ((e1*e2) + (e3*e4) ...)

## Loss functions
e = dy.squared_distance(e1, e2)
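A short runnable sketch exercising a few of the operations from the cheat sheet above (all values are arbitrary):

import dynet as dy

dy.renew_cg()
e1 = dy.inputVector([0.1, 0.2, 0.3, 0.4])
e2 = dy.inputVector([1.0, 1.0, 1.0, 1.0])

print(dy.pick(e1, 2).value())             # e1[2]
print(dy.pickrange(e1, 1, 3).npvalue())   # e1[1:3]
print(dy.esum([e1, e2]).npvalue())        # elementwise sum
print(dy.concatenate([e1, e2]).dim())     # ((8,), 1)
print(dy.noise(e1, 0.1).npvalue())        # Gaussian noise, stddev 0.1
print(dy.squared_distance(e1, e2).value())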
def build_tagging_graph(words, tags, template, builders, train=True, k=1):
    dy.renew_cg()

    if train and args.lstm_dropout is not None and args.lstm_dropout > 0:
        for b in builders:
            b.set_dropouts(args.lstm_dropout, args.lstm_dropout)

    f_init, b_init = [b.initial_state() for b in builders]

    wembs = [dy.lookup(pEmbedding, w) for w in words]
    if train:  # Add noise in training as a regularizer
        wembs = [dy.noise(we, args.train_noise) for we in wembs]

    fw_states = [x for x in f_init.add_inputs(wembs)]
    bw_states = [x for x in b_init.add_inputs(reversed(wembs))]
    fw = [x.output() for x in fw_states]
    bw = [x.output() for x in bw_states]

    O = dy.parameter(pOutput)
    if args.mlp:
        H = dy.parameter(pHidden)
    errs = []
    pred_tags = []
    sorted_arg_topk = []
    final_topk = []
    sequences_topk = [(0.0, list())]
    for f, b, t in zip(fw, reversed(bw), tags):
        f_b = dy.concatenate([f, b])
        if args.mlp:
            f_b = dy.tanh(H * f_b)
        r_t = O * f_b
        if train:
            err = dy.pickneglogsoftmax(r_t, t)
            errs.append(err)
        else:
            out = dy.log_softmax(r_t)
            chosen = np.argmax(out.npvalue())
            pred_tags.append(vocab_tags.i2w[chosen])
            all_sequences = list()
            for seq in sequences_topk:
                seq_score, seq_list = seq
                _scores = -out.npvalue()
                arg_topk = np.argsort(_scores)[:k]
                score_topk = _scores[arg_topk]
                for i in range(min(k, len(arg_topk))):
                    _list = list(seq_list)
                    _list.append(vocab_tags.i2w[arg_topk[i]])
                    score = seq_score + score_topk[i]
                    all_sequences.append((score, _list))
            sequences_topk = sorted(all_sequences)[:k]

    O_template = dy.parameter(pOutputTemplate)
    H_template = dy.parameter(pHiddenTemplate)
    f_bt = dy.concatenate([fw_states[-1].s()[0], bw_states[-1].s()[0]])
    f_bt = dy.tanh(H_template * f_bt)
    r_tt = O_template * f_bt
    pred_template = None
    if train:
        err = dy.pickneglogsoftmax(r_tt, template)
        errs.append(err)
    else:
        out = dy.log_softmax(r_tt)
        _scores = -out.npvalue()
        chosen = np.argmin(_scores)
        pred_template = vocab_templates.i2w[chosen]
        sorted_arg_topk = np.argsort(_scores)[:k]

        all_sequences_and_templates = []
        for template_id in sorted_arg_topk:
            _score = _scores[template_id]
            _template = vocab_templates.i2w[template_id]

            for seq_score, seq_list in sequences_topk:
                all_sequences_and_templates.append(
                    (_score + seq_score, seq_list, _template))
        final_topk = sorted(all_sequences_and_templates)[:k]

    return pred_tags, pred_template, errs, final_topk
    def Predict(self, conll_path):
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence in enumerate(read_conll(conllFP, self.c2i)):
                conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]

                for entry in conll_sentence:
                    wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None
                    evec = self.elookup[int(self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0)))] if self.external_embedding is not None else None
                    
                    last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in entry.idChars])[-1]
                    rev_last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in reversed(entry.idChars)])[-1]

                    # char_state = dynet.noise(concatenate([last_state, rev_last_state]), 0.2)
                    # morph_logit = self.charSeqPredictor.predict_sequence(char_state)
                    # morphID = self.morphs.get(entry.feats)
                    # morphErrs.append(self.pick_neg_log(morph_logit, morphID))
                    # morph_emb = None
                    # for i in morph_logit:
                    #     morph_emb += i * self.mlookup(i)
                      
                    entry.vec = concatenate(filter(None, [wordvec, evec, last_state, rev_last_state]))
                    entry.ch_vec = concatenate([dynet.noise(fe,0.2) for fe in filter(None, [last_state, rev_last_state])])
                    entry.lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                if self.blstmFlag:

                    morcat_layer = [entry.ch_vec for entry in conll_sentence]
                    morph_logits = self.charSeqPredictor.predict_sequence(morcat_layer)
                    predicted_morph_idx = [np.argmax(o.value()) for o in morph_logits]
                    predicted_morphs = [self.id2morph[idx] for idx in predicted_morph_idx]

                    for builder in self.pos_builder:
                        builder.disable_dropout()
                    lstm_forward = self.pos_builder[0].initial_state()
                    lstm_backward = self.pos_builder[1].initial_state()

                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    pos_embed = []
                    concat_layer = [concatenate(entry.lstms) for entry in conll_sentence]
                    outputFFlayer = self.ffSeqPredictor.predict_sequence(concat_layer)
                    predicted_posIDs = [np.argmax(o.value()) for o in outputFFlayer]  
                    predicted_postags = [self.id2pos[idx] for idx in predicted_posIDs]
                    for predID, pred in zip(predicted_posIDs, outputFFlayer):
                        if self.gold_pos:
                            pos_embed.append(self.plookup[predID])
                        else:
                            pos_embed.append(soft_embed(pred.value(), self.plookup))
                            
                    for entry in conll_sentence:
                        entry.vec = concatenate(entry.lstms)
                    for builder in self.dep_builders:
                        builder.disable_dropout()
                    blstm_forward = self.dep_builders[0].initial_state()
                    blstm_backward = self.dep_builders[1].initial_state()

                    for entry, rentry, pembed, revpembed in zip(conll_sentence, reversed(conll_sentence),
                                                                pos_embed, reversed(pos_embed)):
                        blstm_forward = blstm_forward.add_input(concatenate([entry.vec, pembed]))
                        blstm_backward = blstm_backward.add_input(concatenate([rentry.vec, revpembed]))

                        entry.lstms[1] = blstm_forward.output()
                        rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(conll_sentence, True)
                heads = decoder.parse_proj(scores)
                
                #Multiple roots: heading to the previous "rooted" one
                rootCount = 0
                rootWid = -1
                for index, head in enumerate(heads):
                    if head == 0:
                        rootCount += 1
                        if rootCount == 1:
                            rootWid = index
                        if rootCount > 1:    
                            heads[index] = rootWid
                            rootWid = index
                        
                
                for entry, head, pos, feats in zip(conll_sentence, heads, predicted_postags, predicted_morphs):
                    entry.pred_parent_id = head
                    entry.pred_relation = '_'
                    entry.pred_pos = pos
                    entry.pred_feats = feats

                dump = False

                if self.labelsFlag:
                    for modifier, head in enumerate(heads[1:]):
                        scores, exprs = self.__evaluateLabel(conll_sentence, head, modifier+1)
                        conll_sentence[modifier+1].pred_relation = self.irels[max(enumerate(scores), key=itemgetter(1))[0]]

                renew_cg()
                if not dump:
                    yield sentence
Example #29
    def __applyNoise(self, exp, train):
        if self.__noise == None or not train:
            return exp

        return dynet.noise(exp, self.__noise)
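A hedged usage sketch of the same gating pattern as a free function (names are illustrative, not from the original class):

import dynet as dy

# Apply Gaussian noise only at training time; otherwise pass through.
def apply_noise(exp, noise_std, train):
    if noise_std is None or not train:
        return exp
    return dy.noise(exp, noise_std)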
Example #30
    def Train(self, conll_path):
        errors = 0
        batch = 0
        eloss = 0.0
        mloss = 0.0
        eerrors = 0
        etotal = 0
        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP, self.c2i))
            random.shuffle(shuffledData)

            errs = []
            lerrs = []
            posErrs = []
            eeloss = 0.0

            for iSentence, sentence in enumerate(shuffledData):
                if iSentence % 500 == 0 and iSentence != 0:
                    print "Processing sentence number: %d" % iSentence, ", Loss: %.2f" % (
                        eloss / etotal), ", Time: %.2f" % (time.time() - start)
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    lerrors = 0
                    ltotal = 0

                conll_sentence = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]

                for entry in conll_sentence:
                    c = float(self.wordsCount.get(entry.norm, 0))
                    dropFlag = (random.random() < (c / (0.25 + c)))
                    wordvec = self.wlookup[
                        int(self.vocab.get(entry.norm, 0)
                            ) if dropFlag else 0] if self.wdims > 0 else None
                    evec = None

                    if self.external_embedding is not None:
                        evec = self.elookup[self.extrnd.get(
                            entry.form, self.extrnd.get(entry.norm, 0)) if
                                            (dropFlag or
                                             (random.random() < 0.5)) else 0]
                    #entry.vec = concatenate(filter(None, [wordvec, evec]))

                    last_state = self.char_rnn.predict_sequence(
                        [self.clookup[c] for c in entry.idChars])[-1]
                    rev_last_state = self.char_rnn.predict_sequence(
                        [self.clookup[c] for c in reversed(entry.idChars)])[-1]

                    entry.vec = concatenate([
                        dynet.noise(fe, 0.2) for fe in filter(
                            None, [wordvec, evec, last_state, rev_last_state])
                    ])

                    entry.lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()

                    for entry, rentry in zip(conll_sentence,
                                             reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    if self.bibiFlag:
                        for entry in conll_sentence:
                            entry.vec = concatenate(entry.lstms)

                        blstm_forward = self.bbuilders[0].initial_state()
                        blstm_backward = self.bbuilders[1].initial_state()

                        for entry, rentry in zip(conll_sentence,
                                                 reversed(conll_sentence)):
                            blstm_forward = blstm_forward.add_input(entry.vec)
                            blstm_backward = blstm_backward.add_input(
                                rentry.vec)

                            entry.lstms[1] = blstm_forward.output()
                            rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(conll_sentence, True)
                gold = [entry.parent_id for entry in conll_sentence]
                heads = decoder.parse_proj(scores,
                                           gold if self.costaugFlag else None)

                if self.labelsFlag:
                    for modifier, head in enumerate(gold[1:]):
                        rscores, rexprs = self.__evaluateLabel(
                            conll_sentence, head, modifier + 1)
                        goldLabelInd = self.rels[conll_sentence[modifier +
                                                                1].relation]
                        wrongLabelInd = max(((l, scr)
                                             for l, scr in enumerate(rscores)
                                             if l != goldLabelInd),
                                            key=itemgetter(1))[0]
                        if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                            lerrs.append(rexprs[wrongLabelInd] -
                                         rexprs[goldLabelInd])

                e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
                eerrors += e
                if e > 0:
                    loss = [(exprs[h][i] - exprs[g][i])
                            for i, (h, g) in enumerate(zip(heads, gold))
                            if h != g]  # * (1.0/float(e))
                    eloss += (e)
                    mloss += (e)
                    errs.extend(loss)

                etotal += len(conll_sentence)

                concat_layer = [
                    concatenate(entry.lstms) for entry in conll_sentence
                ]
                concat_layer = [dynet.noise(fe, 0.2) for fe in concat_layer]
                outputFFlayer = self.ffSeqPredictor.predict_sequence(
                    concat_layer)
                posIDs = [self.pos.get(entry.pos) for entry in conll_sentence]
                for pred, gold in zip(outputFFlayer, posIDs):
                    posErrs.append(self.pick_neg_log(pred, gold))

                if iSentence % 1 == 0 or len(errs) > 0 or len(
                        lerrs) > 0 or len(posErrs) > 0:
                    eeloss = 0.0

                    if len(errs) > 0 or len(lerrs) > 0 or len(posErrs) > 0:
                        eerrs = (esum(errs + lerrs + posErrs)
                                 )  #* (1.0/(float(len(errs))))
                        eerrs.scalar_value()
                        eerrs.backward()
                        self.trainer.update()
                        errs = []
                        lerrs = []
                        posErrs = []

                    renew_cg()

        if len(errs) > 0:
            eerrs = (esum(errs + lerrs + posErrs))  #* (1.0/(float(len(errs))))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []
            posErrs = []
            eeloss = 0.0

            renew_cg()

        self.trainer.update()
        print "Loss: %.2f" % (mloss / iSentence)
Example #31
    def predict(self, features, task_name, train=False):
        """
        Steps through the computation graph and obtains predictions for the
        provided input features.
        :param features: a list of word  embeddings for every word in the sequence
        :param task_name: the name of the task that should be predicted
        :param train: if the model is training; apply noise in this case
        :return output: the output predictions
                penalty: the summed subspace penalty (0 if no constraint)
        """
        if train:  # noise is added only at training time

            features = [dynet.noise(fe, self.noise_sigma) for fe in features]

        # only if we use cross-stitch we have a layer for each task;
        # otherwise we just have one layer for all tasks
        num_layers = self.h_layers
        inputs = [features] * len(self.task_names)
        inputs_rev = [features] * len(self.task_names)

        target_task_id = self.task_names.index(
            task_name) if self.cross_stitch else 0

        # collect the forward and backward sequences for each task at every
        # layer for the layer connection units
        layer_forward_sequences = []
        layer_backward_sequences = []

        penalty = dynet.const_parameter(self.subspace_penalty)

        for i in range(0, num_layers):
            forward_sequences = []
            backward_sequences = []
            for j in range(num_task_layers):
                predictor = self.predictors['inner'][i][j]
                forward_sequence, backward_sequence = predictor.predict_sequence(
                    inputs[j], inputs_rev[j])
                if i > 0 and self.activation:
                    # activation between LSTM layers
                    forward_sequence = [
                        self.activation(s) for s in forward_sequence
                    ]
                    backward_sequence = [
                        self.activation(s) for s in backward_sequence
                    ]
                forward_sequences.append(forward_sequence)
                backward_sequences.append(backward_sequence)

                if self.num_subspaces == 2 and self.constraint_weight != 0:
                    # returns a list per layer, i.e. here a list with one item
                    lstm_parameters = \
                        predictor.builder.get_parameter_expressions()[0]

                    # lstm parameters consists of these weights:
                    # Wix,Wih,Wic,bi,Wox,Woh,Woc,bo,Wcx,Wch,bc
                    for param_idx in range(len(lstm_parameters)):
                        if param_idx in self.constrain_matrices:
                            W = lstm_parameters[param_idx]
                            W_shape = np.array(W.value()).shape

                            if (len(W_shape) < 2):
                                W_shape = [W_shape[0], 1]

                            # split matrix into its two subspaces
                            W_subspaces = dynet.reshape(
                                W, (self.num_subspaces, W_shape[0] /
                                    float(self.num_subspaces), W_shape[1]))
                            subspace_1, subspace_2 = W_subspaces[
                                0], W_subspaces[1]

                            # calculate the matrix product of the two matrices
                            matrix_product = dynet.transpose(
                                subspace_1) * subspace_2

                            # take the squared Frobenius norm by squaring
                            # every element and then summing them
                            squared_frobenius_norm = dynet.sum_elems(
                                dynet.square(matrix_product))
                            penalty += squared_frobenius_norm

            if self.cross_stitch:
                # takes as input a list of input lists and produces a list of
                # outputs where the index indicates the task
                forward_sequences = self.predictors['cross_stitch'][i].stitch(
                    forward_sequences)
                backward_sequences = self.predictors['cross_stitch'][i].stitch(
                    backward_sequences)

            inputs = forward_sequences
            inputs_rev = backward_sequences
            layer_forward_sequences.append(forward_sequences)
            layer_backward_sequences.append(backward_sequences)

            if i == num_layers - 1:
                output_predictor = \
                    self.predictors['output_layers_dict'][task_name]

                # get the forward/backward states of all task layers
                task_forward_sequences = [
                    layer_seq_list[target_task_id][-1]
                    for layer_seq_list in layer_forward_sequences
                ]

                task_backward_sequences = [
                    layer_seq_list[target_task_id][0]
                    for layer_seq_list in layer_backward_sequences
                ]

                if (num_layers > 1):
                    forward_input = \
                        self.predictors['layer_stitch'][
                            target_task_id].stitch(task_forward_sequences)
                    backward_input = \
                        self.predictors['layer_stitch'][
                            target_task_id].stitch(task_backward_sequences)

                else:
                    forward_input = task_forward_sequences[0]
                    backward_input = task_backward_sequences[0]

                concat_layer = dynet.concatenate(
                    [forward_input, backward_input])

                if train and self.noise_sigma > 0.0:
                    concat_layer = dynet.noise(concat_layer, self.noise_sigma)

                output = []

                if ('sentiment' in task_name):  #Multi-label

                    for i in range(len(output_predictor)):

                        output.append(output_predictor[i](concat_layer))

                else:
                    output.append(output_predictor(concat_layer))

                #output = output_predictor.predict_sequence(concat_layer)

                return output, penalty
        raise Exception('Error: This place should not be reached.')
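The subspace constraint above penalizes the squared Frobenius norm of the product of the two subspaces of each constrained LSTM weight matrix. A small standalone check of that computation (the 4x6 matrix and the split into two subspaces are illustrative):

import numpy as np
import dynet as dy

# Standalone sketch of the squared-Frobenius-norm subspace penalty above.
dy.renew_cg()
W = dy.inputTensor(np.random.randn(4, 6))
num_subspaces = 2
W_subspaces = dy.reshape(W, (num_subspaces, 2, 6))   # mirror the reshape above
subspace_1, subspace_2 = W_subspaces[0], W_subspaces[1]
matrix_product = dy.transpose(subspace_1) * subspace_2
penalty = dy.sum_elems(dy.square(matrix_product))    # squared Frobenius norm
print(penalty.value())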