Example #1
    def forward(self, sentences, errs, lerrs):
        tmp = time.time()
        self.getWordEmbeddings(sentences, True)
        self.ebd += time.time() - tmp

        total_errors = 0
        for sentence in sentences:
            tmp = time.time()
            scores, exprs = self.__evaluate(sentence, True)
            self.evl += time.time() - tmp
            gold = [entry.parent_id for entry in sentence]
            heads = decoder.parse_proj(scores, gold)

            for modifier, head in enumerate(gold[1:]):
                tmp = time.time()
                rscores, rexprs = self.__evaluateLabel(sentence, head,
                                                       modifier + 1)
                self.evl += time.time() - tmp
                goldLabelInd = self.rels[sentence[modifier + 1].relation]
                wrongLabelInd = \
                    max(((l, scr) for l, scr in enumerate(rscores)
                         if l != goldLabelInd), key=itemgetter(1))[0]
                if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                    lerrs += [rexprs[wrongLabelInd] - rexprs[goldLabelInd]]

            e = sum(1 for h, g in zip(heads[1:], gold[1:]) if h != g)
            if e > 0:
                errs += [(exprs[h][i] - exprs[g][i])[0]
                         for i, (h, g) in enumerate(zip(heads, gold)) if h != g]
            total_errors += e

        return total_errors
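The error terms collected above form a structured hinge loss: whenever a decoded head or label outscores the gold one, the score difference is kept as a trainable expression. A minimal standalone sketch of the head-error collection using plain floats (the real code keeps autograd expressions from `exprs`):

    def head_hinge_terms(scores, heads, gold):
        """Margin terms for every token whose predicted head differs from gold.

        scores[h][m] is the score of head h for modifier m (plain floats here).
        """
        return [scores[h][m] - scores[g][m]
                for m, (h, g) in enumerate(zip(heads, gold)) if h != g]

    # Token 2's predicted head (0) beats its gold head (1) by 0.5:
    scores = [[0.0, 1.2, 0.75],
              [0.0, 0.0, 0.25],
              [0.0, 0.3, 0.0]]
    assert head_hinge_terms(scores, [0, 0, 0], [0, 0, 1]) == [0.5]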
Example #2
    def predict(self, sentence):
        self.process_sentence_embeddings(sentence)

        num_vec = len(sentence)
        vec_for = torch.cat([entry.vec
                             for entry in sentence]).view(num_vec, 1, -1)
        vec_back = torch.cat([entry.vec for entry in reversed(sentence)
                              ]).view(num_vec, 1, -1)
        res_for_1, self.hid_for_1 = self.lstm_for_1(vec_for, self.hid_for_1)
        res_back_1, self.hid_back_1 = self.lstm_back_1(vec_back,
                                                       self.hid_back_1)

        vec_cat = [
            concatenate_tensors([res_for_1[i], res_back_1[num_vec - i - 1]])
            for i in range(num_vec)
        ]

        vec_for_2 = torch.cat(vec_cat).view(num_vec, 1, -1)
        vec_back_2 = torch.cat(list(reversed(vec_cat))).view(num_vec, 1, -1)
        res_for_2, self.hid_for_2 = self.lstm_for_2(vec_for_2, self.hid_for_2)
        res_back_2, self.hid_back_2 = self.lstm_back_2(vec_back_2,
                                                       self.hid_back_2)

        for i in range(num_vec):
            sentence[i].lstms[0] = res_for_2[i]
            sentence[i].lstms[1] = res_back_2[num_vec - i - 1]

        scores, exprs = self.__evaluate(sentence, True)
        heads = decoder.parse_proj(scores)

        for entry, head in zip(sentence, heads):
            entry.pred_parent_id = head
            entry.pred_relation = '_'
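A note on the `.view(num_vec, 1, -1)` reshapes above: with PyTorch's default `batch_first=False`, `nn.LSTM` expects input shaped `(seq_len, batch, features)`, so each sentence is fed as a length-`num_vec` sequence with batch size 1. A self-contained illustration:

    import torch
    import torch.nn as nn

    lstm = nn.LSTM(input_size=8, hidden_size=4)
    x = torch.randn(5, 1, 8)       # 5 tokens, batch of 1, 8 features per token
    out, (h, c) = lstm(x)
    assert out.shape == (5, 1, 4)  # one hidden state per token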
Example #3
    def Predict(self, conll_path):
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence in enumerate(read_conll(conllFP)):
                conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]

                for entry in conll_sentence:
                    wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None
                    posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
                    evec = self.elookup[int(self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0)))] if self.external_embedding is not None else None
                    entry.vec = concatenate(filter(None, [wordvec, posvec, evec]))

                    entry.lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()

                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    if self.bibiFlag:
                        for entry in conll_sentence:
                            entry.vec = concatenate(entry.lstms)

                        blstm_forward = self.bbuilders[0].initial_state()
                        blstm_backward = self.bbuilders[1].initial_state()

                        for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                            blstm_forward = blstm_forward.add_input(entry.vec)
                            blstm_backward = blstm_backward.add_input(rentry.vec)

                            entry.lstms[1] = blstm_forward.output()
                            rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(conll_sentence, True)
                heads = decoder.parse_proj(scores)

                for entry, head in zip(conll_sentence, heads):
                    entry.pred_parent_id = head
                    entry.pred_relation = '_'

                dump = False

                if self.labelsFlag:
                    for modifier, head in enumerate(heads[1:]):
                        scores, exprs = self.__evaluateLabel(conll_sentence, head, modifier+1)
                        conll_sentence[modifier+1].pred_relation = self.irels[max(enumerate(scores), key=itemgetter(1))[0]]

                renew_cg()
                if not dump:
                    yield sentence
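The `zip(conll_sentence, reversed(conll_sentence))` idiom used throughout these examples advances the forward and backward LSTMs in a single loop: at step t the forward state has read tokens 0..t while the backward state has read the last t+1 tokens, so assigning `entry.lstms[1]` and `rentry.lstms[0]` fills both halves of every token's BiLSTM representation by the time the loop ends. A toy trace of the pairing:

    tokens = ["We", "parse", "text"]
    for fwd, bwd in zip(tokens, reversed(tokens)):
        print("fwd reads %s | bwd reads %s" % (fwd, bwd))
    # fwd reads We | bwd reads text
    # fwd reads parse | bwd reads parse
    # fwd reads text | bwd reads We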
Example #4
    def predict(self, sentence):
        for entry in sentence:
            wordvec = self.wlookup(scalar(int(self.vocab.get(
                entry.norm, 0)))) if self.wdims > 0 else None
            posvec = self.plookup(scalar(int(
                self.pos[entry.pos]))) if self.pdims > 0 else None
            evec = self.elookup(
                scalar(
                    int(
                        self.extrnd.get(entry.form,
                                        self.extrnd.get(entry.norm, 0))))
            ) if self.external_embedding is not None else None
            entry.vec = cat([wordvec, posvec, evec])

            entry.lstms = [entry.vec, entry.vec]
            entry.headfov = None
            entry.modfov = None

            entry.rheadfov = None
            entry.rmodfov = None

        if self.blstmFlag:
            lstm_forward = RNNState(self.builders[0])
            lstm_backward = RNNState(self.builders[1])

            for entry, rentry in zip(sentence, reversed(sentence)):
                lstm_forward = lstm_forward.next(entry.vec)
                lstm_backward = lstm_backward.next(rentry.vec)

                entry.lstms[1] = lstm_forward()
                rentry.lstms[0] = lstm_backward()

            if self.bibiFlag:
                for entry in sentence:
                    entry.vec = cat(entry.lstms)

                blstm_forward = RNNState(self.bbuilders[0])
                blstm_backward = RNNState(self.bbuilders[1])

                for entry, rentry in zip(sentence, reversed(sentence)):
                    blstm_forward = blstm_forward.next(entry.vec)
                    blstm_backward = blstm_backward.next(rentry.vec)

                    entry.lstms[1] = blstm_forward()
                    rentry.lstms[0] = blstm_backward()

        scores, exprs = self.__evaluate(sentence, True)
        heads = decoder.parse_proj(scores)

        for entry, head in zip(sentence, heads):
            entry.pred_parent_id = head
            entry.pred_relation = '_'

        if self.labelsFlag:
            for modifier, head in enumerate(heads[1:]):
                scores, exprs = self.__evaluateLabel(sentence, head,
                                                     modifier + 1)
                sentence[modifier + 1].pred_relation = self.irels[max(
                    enumerate(scores), key=itemgetter(1))[0]]
Example #5
    def arc_loss(self, gold_arcs, arc_scores):
        errors = []
        arc_scores_values = np.array([[j.value() for j in i] for i in arc_scores])
        arcs = parse_proj(arc_scores_values, gold_arcs)
        
        for i in range(len(gold_arcs)):
            if gold_arcs[i] != arcs[i]:
                error = arc_scores[arcs[i]][i] - arc_scores[gold_arcs[i]][i]
                errors.append(error)

        return errors
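`arc_loss` relies on cost-augmented decoding: `parse_proj` receives the gold arcs so it can favor wrong-but-high-scoring trees during training, and each disagreement contributes a score-difference expression to be minimized. A hypothetical DyNet-style training step (the `model`, `trainer`, and `dy` names are assumptions, not from the source):

    # errors = model.arc_loss(gold_arcs, arc_scores)
    # if errors:
    #     loss = dy.esum(errors)  # sum the margin-violation expressions
    #     loss.backward()
    #     trainer.update()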
Example #6
    def predict(self, sentences):
        self.getWordEmbeddings(sentences, False)

        for sentence in sentences:
            scores, exprs = self.__evaluate(sentence, True)
            heads = decoder.parse_proj(scores)

            for entry, head in zip(sentence, heads):
                entry.pred_parent_id = head
                entry.pred_relation = '_'

            head_list = list(heads)
            for modifier, head in enumerate(head_list[1:]):
                scores, exprs = self.__evaluateLabel(sentence, head,
                                                     modifier + 1)
                sentence[modifier + 1].pred_relation = self.irels[max(
                    enumerate(scores), key=itemgetter(1))[0]]
Example #7
    def Predict(self, conll_path):
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence in enumerate(read_conll_predict(conllFP, self.c2i, self.wordsCount)):
                conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]

                for entry in conll_sentence:
                    wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None

                    last_state = self.char_rnn.predict_sequence(
                        [self.clookup[c] for c in entry.idChars])[-1]
                    rev_last_state = self.char_rnn.predict_sequence(
                        [self.clookup[c] for c in reversed(entry.idChars)])[-1]

                    entry.vec = concatenate(filter(None, [wordvec, last_state, rev_last_state]))

                    entry.pos_lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                #Predicted pos tags
                lstm_forward = self.pos_builders[0].initial_state()
                lstm_backward = self.pos_builders[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    lstm_forward = lstm_forward.add_input(entry.vec)
                    lstm_backward = lstm_backward.add_input(rentry.vec)

                    entry.pos_lstms[1] = lstm_forward.output()
                    rentry.pos_lstms[0] = lstm_backward.output()

                for entry in conll_sentence:
                    entry.pos_vec = concatenate(entry.pos_lstms)

                blstm_forward = self.pos_bbuilders[0].initial_state()
                blstm_backward = self.pos_bbuilders[1].initial_state()

                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    blstm_forward = blstm_forward.add_input(entry.pos_vec)
                    blstm_backward = blstm_backward.add_input(rentry.pos_vec)
                    entry.pos_lstms[1] = blstm_forward.output()
                    rentry.pos_lstms[0] = blstm_backward.output()

                concat_layer = [concatenate(entry.pos_lstms) for entry in conll_sentence]
                outputFFlayer = self.ffSeqPredictor.predict_sequence(concat_layer)
                predicted_pos_indices = [np.argmax(o.value()) for o in outputFFlayer]
                predicted_postags = [self.id2pos[idx] for idx in predicted_pos_indices]

                # Add predicted pos tags for parsing prediction
                for entry, posid in zip(conll_sentence, predicted_pos_indices):
                    entry.vec = concatenate([entry.vec, self.plookup[posid]])
                    entry.lstms = [entry.vec, entry.vec]

                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()

                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    if self.bibiFlag:
                        for entry in conll_sentence:
                            entry.vec = concatenate(entry.lstms)

                        blstm_forward = self.bbuilders[0].initial_state()
                        blstm_backward = self.bbuilders[1].initial_state()

                        for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                            blstm_forward = blstm_forward.add_input(entry.vec)
                            blstm_backward = blstm_backward.add_input(rentry.vec)

                            entry.lstms[1] = blstm_forward.output()
                            rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(conll_sentence)
                heads = decoder.parse_proj(scores)

                # Multiple roots: reattach each extra root to the previous root
                rootCount = 0
                rootWid = -1
                for index, head in enumerate(heads):
                    if head == 0:
                        rootCount += 1
                        if rootCount == 1:
                            rootWid = index
                        if rootCount > 1:
                            heads[index] = rootWid
                            rootWid = index

                for entry, head, pos in zip(conll_sentence, heads, predicted_postags):
                    entry.pred_parent_id = head
                    entry.pred_relation = '_'
                    entry.pred_pos = pos

                dump = False

                if self.labelsFlag:
                    concat_layer = [self.__getRelVector(conll_sentence, head, modifier + 1) for modifier, head in
                                    enumerate(heads[1:])]
                    outputFFlayer = self.ffRelPredictor.predict_sequence(concat_layer)
                    predicted_rel_indices = [np.argmax(o.value()) for o in outputFFlayer]
                    predicted_rels = [self.irels[idx] for idx in predicted_rel_indices]
                    for modifier, head in enumerate(heads[1:]):
                        conll_sentence[modifier + 1].pred_relation = predicted_rels[modifier]

                renew_cg()
                if not dump:
                    yield sentence
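The multiple-root loop in Example #7 (repeated in Example #9) enforces a single root: every extra token that attaches to the artificial root is reattached to the previously seen root instead. A pure-Python illustration:

    heads = [-1, 0, 2, 0, 3]   # tokens 1 and 3 both attach to the root (head 0)
    rootCount, rootWid = 0, -1
    for index, head in enumerate(heads):
        if head == 0:
            rootCount += 1
            if rootCount == 1:
                rootWid = index
            else:
                heads[index] = rootWid  # reattach to the previous root
                rootWid = index
    print(heads)  # [-1, 0, 2, 1, 3] -- token 3 now hangs off token 1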
Example #8
    def Train(self, conll_path):
        errors = 0
        batch = 0
        eloss = 0.0
        mloss = 0.0
        eerrors = 0
        etotal = 0
        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP, self.c2i))
            random.shuffle(shuffledData)

            errs = []
            lerrs = []
            posErrs = []
            eeloss = 0.0

            for iSentence, sentence in enumerate(shuffledData):
                if iSentence % 500 == 0 and iSentence != 0:
                    print "Processing sentence number: %d" % iSentence, ", Loss: %.2f" % (
                        eloss / etotal), ", Time: %.2f" % (time.time() - start)
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    lerrors = 0
                    ltotal = 0

                conll_sentence = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]

                for entry in conll_sentence:
                    c = float(self.wordsCount.get(entry.norm, 0))
                    dropFlag = (random.random() < (c / (0.25 + c)))
                    wordvec = self.wlookup[
                        int(self.vocab.get(entry.norm, 0)
                            ) if dropFlag else 0] if self.wdims > 0 else None
                    evec = None

                    if self.external_embedding is not None:
                        evec = self.elookup[self.extrnd.get(
                            entry.form, self.extrnd.get(entry.norm, 0)) if
                                            (dropFlag or
                                             (random.random() < 0.5)) else 0]
                    #entry.vec = concatenate(filter(None, [wordvec, evec]))

                    last_state = self.char_rnn.predict_sequence(
                        [self.clookup[c] for c in entry.idChars])[-1]
                    rev_last_state = self.char_rnn.predict_sequence(
                        [self.clookup[c] for c in reversed(entry.idChars)])[-1]

                    entry.vec = concatenate([
                        dynet.noise(fe, 0.2) for fe in filter(
                            None, [wordvec, evec, last_state, rev_last_state])
                    ])

                    entry.lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()

                    for entry, rentry in zip(conll_sentence,
                                             reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    if self.bibiFlag:
                        for entry in conll_sentence:
                            entry.vec = concatenate(entry.lstms)

                        blstm_forward = self.bbuilders[0].initial_state()
                        blstm_backward = self.bbuilders[1].initial_state()

                        for entry, rentry in zip(conll_sentence,
                                                 reversed(conll_sentence)):
                            blstm_forward = blstm_forward.add_input(entry.vec)
                            blstm_backward = blstm_backward.add_input(
                                rentry.vec)

                            entry.lstms[1] = blstm_forward.output()
                            rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(conll_sentence, True)
                gold = [entry.parent_id for entry in conll_sentence]
                heads = decoder.parse_proj(scores,
                                           gold if self.costaugFlag else None)

                if self.labelsFlag:
                    for modifier, head in enumerate(gold[1:]):
                        rscores, rexprs = self.__evaluateLabel(
                            conll_sentence, head, modifier + 1)
                        goldLabelInd = self.rels[conll_sentence[modifier +
                                                                1].relation]
                        wrongLabelInd = max(((l, scr)
                                             for l, scr in enumerate(rscores)
                                             if l != goldLabelInd),
                                            key=itemgetter(1))[0]
                        if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                            lerrs.append(rexprs[wrongLabelInd] -
                                         rexprs[goldLabelInd])

                e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
                eerrors += e
                if e > 0:
                    loss = [(exprs[h][i] - exprs[g][i])
                            for i, (h, g) in enumerate(zip(heads, gold))
                            if h != g]  # * (1.0/float(e))
                    eloss += (e)
                    mloss += (e)
                    errs.extend(loss)

                etotal += len(conll_sentence)

                concat_layer = [
                    concatenate(entry.lstms) for entry in conll_sentence
                ]
                concat_layer = [dynet.noise(fe, 0.2) for fe in concat_layer]
                outputFFlayer = self.ffSeqPredictor.predict_sequence(
                    concat_layer)
                posIDs = [self.pos.get(entry.pos) for entry in conll_sentence]
                for pred, gold in zip(outputFFlayer, posIDs):
                    posErrs.append(self.pick_neg_log(pred, gold))

                if iSentence % 1 == 0 or len(errs) > 0 or len(
                        lerrs) > 0 or len(posErrs) > 0:
                    eeloss = 0.0

                    if len(errs) > 0 or len(lerrs) > 0 or len(posErrs) > 0:
                        eerrs = (esum(errs + lerrs + posErrs)
                                 )  #* (1.0/(float(len(errs))))
                        eerrs.scalar_value()
                        eerrs.backward()
                        self.trainer.update()
                        errs = []
                        lerrs = []
                        posErrs = []

                    renew_cg()

        if len(errs) > 0:
            eerrs = (esum(errs + lerrs + posErrs))  #* (1.0/(float(len(errs))))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []
            posErrs = []
            eeloss = 0.0

            renew_cg()

        self.trainer.update()
        print "Loss: %.2f" % (mloss / iSentence)
Example #9
    def Predict(self, conll_path):
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence in enumerate(read_conll(conllFP)):
                self.hid2Layer = parameter(self.model["hidden2-layer"])
                self.hid2Bias = parameter(self.model["hidden2-bias"])

                self.hidLayerFOM = parameter(self.model["hidden-layer-fom"])
                self.hidLayerFOH = parameter(self.model["hidden-layer-foh"])
                self.hidBias = parameter(self.model["hidden-bias"])

                self.outLayer = parameter(self.model["output-layer"])

                if self.labelsFlag:
                    self.rhid2Layer = parameter(self.model["rhidden2-layer"])
                    self.rhid2Bias = parameter(self.model["rhidden2-bias"])

                    self.rhidLayerFOM = parameter(self.model["rhidden-layer-fom"])
                    self.rhidLayerFOH = parameter(self.model["rhidden-layer-foh"])
                    self.rhidBias = parameter(self.model["rhidden-bias"])

                    self.routLayer = parameter(self.model["routput-layer"])
                    self.routBias = parameter(self.model["routput-bias"])


                for entry in sentence:
                    wordvec = lookup(self.model["word-lookup"], int(self.vocab.get(entry.norm, 0))) if self.wdims > 0 else None
                    posvec = lookup(self.model["pos-lookup"], int(self.pos[entry.pos])) if self.pdims > 0 else None
                    evec = lookup(self.model["extrn-lookup"], int(self.vocab.get(entry.norm, 0))) if self.external_embedding is not None else None
                    entry.vec = concatenate(filter(None, [wordvec, posvec, evec]))

                    entry.lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()

                    for entry, rentry in zip(sentence, reversed(sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    if self.bibiFlag:
                        for entry in sentence:
                            entry.vec = concatenate(entry.lstms)

                        blstm_forward = self.bbuilders[0].initial_state()
                        blstm_backward = self.bbuilders[1].initial_state()

                        for entry, rentry in zip(sentence, reversed(sentence)):
                            blstm_forward = blstm_forward.add_input(entry.vec)
                            blstm_backward = blstm_backward.add_input(rentry.vec)

                            entry.lstms[1] = blstm_forward.output()
                            rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(sentence, True)
                heads = decoder.parse_proj(scores) 

                for entry, head in zip(sentence, heads):
                    entry.pred_parent_id = head
                    entry.pred_relation = '_'

                dump = False

                if self.labelsFlag:
                    for modifier, head in enumerate(heads[1:]):
                        scores, exprs = self.__evaluateLabel(sentence, head, modifier+1)
                        sentence[modifier+1].pred_relation = self.irels[max(enumerate(scores), key=itemgetter(1))[0]]

                renew_cg()
                if not dump:
                    yield sentence
    def Train(self, conll_path, dep_epoch=0, ner_epoch=0):
        eloss = 0.0
        mloss = 0.0
        eerrors = 0
        etotal = 0
        start = time.time()
        with open(conll_path, 'r') as conllFP:
            if ner_epoch == 0:
                read_conll_nerdep = read_conll(conllFP, self.c2i)
            else:
                read_conll_nerdep = read_conll_ner(conllFP, self.c2i)
            shuffledData = list(read_conll_nerdep)
            random.shuffle(shuffledData)

            errs = []
            lerrs = []
            posErrs = 0
            postrErrs = []
            nertr2Errs = []
            ner2Errs = dynet.inputVector([0])
            startind = 0
            e = 0
            for iSentence, sentence in enumerate(shuffledData):
                if iSentence % 500 == 0 and iSentence != 0:
                    print "Processing sentence number: %d" % iSentence, ", Loss: %.4f" % (
                        eloss / etotal), ", Time: %.2f" % (time.time() - start)
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0

                conll_sentence = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]

                for entry in conll_sentence:
                    c = float(self.wordsCount.get(entry.norm, 0))
                    dropFlag = (random.random() < (c / (0.25 + c)))
                    capvec = self.caps_lookup[entry.capInfo]
                    wordvec = self.wlookup[
                        int(self.vocab.get(entry.norm, 0)
                            ) if dropFlag else 0] if self.wdims > 0 else None

                    last_state = self.char_rnn.predict_sequence(
                        [self.clookup[c] for c in entry.idChars])[-1]
                    rev_last_state = self.char_rnn.predict_sequence(
                        [self.clookup[c] for c in reversed(entry.idChars)])[-1]

                    entry.vec = dynet.dropout(
                        concatenate(
                            filter(
                                None,
                                [wordvec, last_state, rev_last_state, capvec
                                 ])), 0.33)
                    entry.vec2 = entry.vec
                    entry.pos_lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                if not self.depFlag:

                    #NER tagging loss
                    lstm_forward = self.pos_builders[0].initial_state()
                    lstm_backward = self.pos_builders[1].initial_state()
                    for entry, rentry in zip(conll_sentence,
                                             reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.pos_lstms[1] = lstm_forward.output()
                        rentry.pos_lstms[0] = lstm_backward.output()

                    for entry in conll_sentence:
                        entry.pos_vec = concatenate(entry.pos_lstms)

                    blstm_forward = self.pos_bbuilders[0].initial_state()
                    blstm_backward = self.pos_bbuilders[1].initial_state()

                    for entry, rentry in zip(conll_sentence,
                                             reversed(conll_sentence)):
                        blstm_forward = blstm_forward.add_input(entry.pos_vec)
                        blstm_backward = blstm_backward.add_input(
                            rentry.pos_vec)

                        entry.pos_lstms[1] = blstm_forward.output()
                        rentry.pos_lstms[0] = blstm_backward.output()

                    concat_layer = [
                        dynet.dropout(concatenate(entry.pos_lstms), 0.33)
                        for entry in conll_sentence
                    ]
                    cap_info_sentence = [
                        self.caps_lookup[entry.capInfo]
                        for entry in conll_sentence
                    ]
                    outputFFlayer = self.ffSeqPredictor.predict_sequence(
                        concat_layer)
                    posIDs = [
                        self.pos.get(entry.pos) for entry in conll_sentence
                    ]
                    posErrs = (self.forward_score(outputFFlayer) -
                               self.pick_gold_score(outputFFlayer, posIDs))

                ##dependency Flag
                if self.depFlag:
                    # Add predicted ner tags
                    #for entry, poses in zip(conll_sentence, outputFFlayer):
                    #    entry.vec = concatenate([entry.vec, dynet.dropout(self.plookup[np.argmax(poses.value())], 0.33)])
                    for entry in conll_sentence:
                        entry.lstms = [entry.vec, entry.vec]

                    #Parsing losses
                    if self.blstmFlag:
                        lstm_forward = self.builders[0].initial_state()
                        lstm_backward = self.builders[1].initial_state()

                        for entry, rentry in zip(conll_sentence,
                                                 reversed(conll_sentence)):
                            lstm_forward = lstm_forward.add_input(entry.vec)
                            lstm_backward = lstm_backward.add_input(rentry.vec)

                            entry.lstms[1] = lstm_forward.output()
                            rentry.lstms[0] = lstm_backward.output()

                        if self.bibiFlag:
                            for entry in conll_sentence:
                                entry.vec = concatenate(entry.lstms)

                            blstm_forward = self.bbuilders[0].initial_state()
                            blstm_backward = self.bbuilders[1].initial_state()

                            for entry, rentry in zip(conll_sentence,
                                                     reversed(conll_sentence)):
                                blstm_forward = blstm_forward.add_input(
                                    entry.vec)
                                blstm_backward = blstm_backward.add_input(
                                    rentry.vec)

                                entry.lstms[1] = blstm_forward.output()
                                rentry.lstms[0] = blstm_backward.output()

                    scores, exprs = self.__evaluate(conll_sentence)
                    gold = [entry.parent_id for entry in conll_sentence]
                    heads = decoder.parse_proj(
                        scores, gold if self.costaugFlag else None)

                    if self.labelsFlag:

                        concat_layer = [
                            dynet.dropout(
                                self.__getRelVector(conll_sentence, head,
                                                    modifier + 1), 0.33)
                            for modifier, head in enumerate(gold[1:])
                        ]
                        outputFFlayer = self.ffRelPredictor.predict_sequence(
                            concat_layer)
                        if dep_epoch == 1:
                            relIDs = [
                                self.rels[conll_sentence[modifier +
                                                         1].relation]
                                for modifier, _ in enumerate(gold[1:])
                            ]
                            for pred, goldid in zip(outputFFlayer, relIDs):
                                lerrs.append(self.pick_neg_log(pred, goldid))
                    if dep_epoch == 1:
                        e = sum(
                            [1 for h, g in zip(heads[1:], gold[1:]) if h != g])

                    if self.sNerFlag and ner_epoch == 1:

                        conll_sentence[0].vec = concatenate([
                            conll_sentence[0].vec2,
                            self.rellookup[self.rels["rroot"]]
                        ])
                        for entry, pred in zip(conll_sentence[1:],
                                               outputFFlayer):
                            relvec = self.rellookup[np.argmax(pred.value())]
                            entry.vec = concatenate(
                                [entry.vec2,
                                 dynet.dropout(relvec, 0.33)])

                        for entry in conll_sentence:
                            entry.ner2_lstms = [entry.vec, entry.vec]

                        slstm_forward = self.sner_builders[0].initial_state()
                        slstm_backward = self.sner_builders[1].initial_state()

                        for entry, rentry in zip(conll_sentence,
                                                 reversed(conll_sentence)):
                            slstm_forward = slstm_forward.add_input(entry.vec)
                            slstm_backward = slstm_backward.add_input(
                                rentry.vec)

                            entry.ner2_lstms[1] = slstm_forward.output()
                            rentry.ner2_lstms[0] = slstm_backward.output()

                        for entry in conll_sentence:
                            entry.ner2_vec = concatenate(entry.ner2_lstms)

                        sblstm_forward = self.sner_bbuilders[0].initial_state()
                        sblstm_backward = self.sner_bbuilders[1].initial_state()

                        for entry, rentry in zip(conll_sentence,
                                                 reversed(conll_sentence)):
                            sblstm_forward = sblstm_forward.add_input(
                                entry.ner2_vec)
                            sblstm_backward = sblstm_backward.add_input(
                                rentry.ner2_vec)

                            entry.ner2_lstms[1] = sblstm_forward.output()
                            rentry.ner2_lstms[0] = sblstm_backward.output()

                        concat_layer = [
                            dynet.dropout(concatenate(entry.ner2_lstms), 0.33)
                            for entry in conll_sentence
                        ]
                        outputFFlayer = self.ffSeqPredictor.predict_sequence(
                            concat_layer)
                        posIDs = [
                            self.pos.get(entry.pos) for entry in conll_sentence
                        ]
                        gold_score = self.pick_gold_score(
                            outputFFlayer, posIDs)
                        ner2Errs = (self.forward_score(outputFFlayer) -
                                    gold_score)

                    if iSentence < 5:
                        print("ner and dep loss")
                        if ner2Errs != 0:
                            print(ner2Errs.value())
                        else:
                            print(0)
                        if dep_epoch != 0:
                            print(esum(lerrs).value())
                        else:
                            print(0)

                eerrors += e
                if e > 0:
                    loss = [(exprs[h][i] - exprs[g][i])
                            for i, (h, g) in enumerate(zip(heads, gold))
                            if h != g]  # * (1.0/float(e))
                    eloss += (e)
                    mloss += (e)
                    errs.extend(loss)

                etotal += len(conll_sentence)

                if iSentence % 1 == 0:
                    if len(errs) > 0 or len(lerrs) > 0 or posErrs > 0 or len(
                            postrErrs) > 0 or ner2Errs > 0 or len(
                                nertr2Errs) > 0:
                        eerrs = 0
                        if len(errs + lerrs + postrErrs + nertr2Errs) > 0:
                            eerrs = esum(errs + lerrs + postrErrs + nertr2Errs)
                        eerrs += (posErrs + ner2Errs)
                        #print(eerrs.value())
                        eerrs.scalar_value()
                        eerrs.backward()
                        self.trainer.update()
                        errs = []
                        e = 0
                        lerrs = []
                        postrErrs = []
                        nertr2Errs = []
                        posErrs = 0
                        ner2Errs = 0

                    renew_cg()

        print "Loss: %.4f" % (mloss / iSentence)
    def Predict(self, conll_path, dep_epoch=1, ner_epoch=1):
        with open(conll_path, 'r') as conllFP:
            if ner_epoch == 0:
                read_conll_nerdep = read_conll_predict(conllFP, self.c2i,
                                                       self.wordsCount)
            else:
                read_conll_nerdep = read_conll_predict_ner(
                    conllFP, self.c2i, self.wordsCount)
            for iSentence, sentence in enumerate(read_conll_nerdep):
                conll_sentence = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]

                for entry in conll_sentence:
                    capvec = self.caps_lookup[entry.capInfo]
                    wordvec = self.wlookup[int(self.vocab.get(
                        entry.norm, 0))] if self.wdims > 0 else None

                    last_state = self.char_rnn.predict_sequence(
                        [self.clookup[c] for c in entry.idChars])[-1]
                    rev_last_state = self.char_rnn.predict_sequence(
                        [self.clookup[c] for c in reversed(entry.idChars)])[-1]

                    entry.vec = concatenate(
                        filter(None,
                               [wordvec, last_state, rev_last_state, capvec]))
                    entry.vec2 = concatenate(
                        filter(None,
                               [wordvec, last_state, rev_last_state, capvec]))

                    entry.pos_lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                if not self.depFlag:

                    #Predicted pos tags
                    lstm_forward = self.pos_builders[0].initial_state()
                    lstm_backward = self.pos_builders[1].initial_state()
                    for entry, rentry in zip(conll_sentence,
                                             reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.pos_lstms[1] = lstm_forward.output()
                        rentry.pos_lstms[0] = lstm_backward.output()

                    for entry in conll_sentence:
                        entry.pos_vec = concatenate(entry.pos_lstms)

                    blstm_forward = self.pos_bbuilders[0].initial_state()
                    blstm_backward = self.pos_bbuilders[1].initial_state()

                    for entry, rentry in zip(conll_sentence,
                                             reversed(conll_sentence)):
                        blstm_forward = blstm_forward.add_input(entry.pos_vec)
                        blstm_backward = blstm_backward.add_input(
                            rentry.pos_vec)
                        entry.pos_lstms[1] = blstm_forward.output()
                        rentry.pos_lstms[0] = blstm_backward.output()

                    concat_layer = [
                        concatenate(entry.pos_lstms)
                        for entry in conll_sentence
                    ]
                    #cap_info_sentence=[self.caplookup[entry.capInfo] for entry in conll_sentence]
                    outputFFlayer = self.ffSeqPredictor.predict_sequence(
                        concat_layer)
                    best_parentids, bestscores = self.ffSeqPredictor.viterbi_sequence(
                        outputFFlayer, self.nertrans_lookup)
                    predicted_pos_indices = [
                        np.argmax(o.value()) for o in outputFFlayer
                    ]
                    root_predicted_postags = ["O"]
                    predicted_postags = [
                        self.id2pos[idx] for idx in best_parentids
                    ]
                    for pos in predicted_postags:
                        root_predicted_postags.append(pos)
                    if iSentence < 5:
                        for word, tag in zip(conll_sentence,
                                             root_predicted_postags):
                            print("word : {}  gold : {} pred : {}".format(
                                word.form, word.pos, tag))
                    for entry, pos in zip(conll_sentence,
                                          root_predicted_postags):
                        entry.pred_pos = pos
                    dump = False

                if self.depFlag:

                    # Add predicted pos tags for parsing prediction
                    #for entry, posid in zip(conll_sentence, viterbi_pred_tagids):
                    #    entry.vec = concatenate([entry.vec, self.plookup[posid]])
                    #    entry.lstms = [entry.vec, entry.vec]
                    for entry in conll_sentence:

                        entry.lstms = [entry.vec, entry.vec]

                    if self.blstmFlag:
                        lstm_forward = self.builders[0].initial_state()
                        lstm_backward = self.builders[1].initial_state()

                        for entry, rentry in zip(conll_sentence,
                                                 reversed(conll_sentence)):
                            lstm_forward = lstm_forward.add_input(entry.vec)
                            lstm_backward = lstm_backward.add_input(rentry.vec)
                            entry.lstms[1] = lstm_forward.output()
                            rentry.lstms[0] = lstm_backward.output()

                        if self.bibiFlag:
                            for entry in conll_sentence:
                                entry.vec = concatenate(entry.lstms)

                            blstm_forward = self.bbuilders[0].initial_state()
                            blstm_backward = self.bbuilders[1].initial_state()

                            for entry, rentry in zip(conll_sentence,
                                                     reversed(conll_sentence)):
                                blstm_forward = blstm_forward.add_input(
                                    entry.vec)
                                blstm_backward = blstm_backward.add_input(
                                    rentry.vec)

                                entry.lstms[1] = blstm_forward.output()
                                rentry.lstms[0] = blstm_backward.output()

                    scores, exprs = self.__evaluate(conll_sentence)
                    heads = decoder.parse_proj(scores)

                    # Multiple roots: reattach each extra root to the previous root
                    rootCount = 0
                    rootWid = -1
                    for index, head in enumerate(heads):
                        if head == 0:
                            rootCount += 1
                            if rootCount == 1:
                                rootWid = index
                            if rootCount > 1:
                                heads[index] = rootWid
                                rootWid = index

                    for entry, head in zip(conll_sentence, heads):
                        entry.pred_parent_id = head
                        entry.pred_relation = '_'
                        #entry.pred_pos = pos

                    if self.labelsFlag:
                        concat_layer = [
                            self.__getRelVector(conll_sentence, head,
                                                modifier + 1)
                            for modifier, head in enumerate(heads[1:])
                        ]
                        outputFFlayer = self.ffRelPredictor.predict_sequence(
                            concat_layer)
                        predicted_rel_indices = [
                            np.argmax(o.value()) for o in outputFFlayer
                        ]
                        predicted_rels = [
                            self.irels[idx] for idx in predicted_rel_indices
                        ]
                        for modifier, head in enumerate(heads[1:]):
                            conll_sentence[
                                modifier +
                                1].pred_relation = predicted_rels[modifier]

                    if self.sNerFlag and ner_epoch == 1:

                        conll_sentence[0].vec = concatenate([
                            conll_sentence[0].vec2,
                            self.rellookup[self.rels["rroot"]]
                        ])
                        for entry, pred in zip(conll_sentence[1:],
                                               predicted_rel_indices):
                            relvec = self.rellookup[pred]
                            # for entry, posid in zip(conll_sentence, viterbi_pred_tagids):
                            entry.vec = concatenate([entry.vec2, relvec])
                        for entry in conll_sentence:
                            entry.ner2_lstms = [entry.vec, entry.vec]

                        slstm_forward = self.sner_builders[0].initial_state()
                        slstm_backward = self.sner_builders[1].initial_state()

                        for entry, rentry in zip(conll_sentence,
                                                 reversed(conll_sentence)):
                            slstm_forward = slstm_forward.add_input(entry.vec)
                            slstm_backward = slstm_backward.add_input(
                                rentry.vec)

                            entry.ner2_lstms[1] = slstm_forward.output()
                            rentry.ner2_lstms[0] = slstm_backward.output()

                        for entry in conll_sentence:
                            entry.ner2_vec = concatenate(entry.ner2_lstms)

                        sblstm_forward = self.sner_bbuilders[0].initial_state()
                        sblstm_backward = self.sner_bbuilders[1].initial_state()

                        for entry, rentry in zip(conll_sentence,
                                                 reversed(conll_sentence)):
                            sblstm_forward = sblstm_forward.add_input(
                                entry.ner2_vec)
                            sblstm_backward = sblstm_backward.add_input(
                                rentry.ner2_vec)

                            entry.ner2_lstms[1] = sblstm_forward.output()
                            rentry.ner2_lstms[0] = sblstm_backward.output()

                        concat_layer = [
                            dynet.dropout(concatenate(entry.ner2_lstms), 0.33)
                            for entry in conll_sentence
                        ]
                        outputFFlayer = self.ffSeqPredictor.predict_sequence(
                            concat_layer)
                        best_parentids, bestscores = self.ffSeqPredictor.viterbi_sequence(
                            outputFFlayer, self.nertrans_lookup)
                        predicted_pos_indices = [
                            np.argmax(o.value()) for o in outputFFlayer
                        ]
                        root_predicted_postags = ["O"]
                        predicted_postags = [
                            self.id2pos[idx] for idx in best_parentids
                        ]
                        for pos in predicted_postags:
                            root_predicted_postags.append(pos)
                        if iSentence < 1:
                            for word, tag in zip(conll_sentence,
                                                 root_predicted_postags):
                                print("word : {}  gold : {} pred : {}".format(
                                    word.form, word.pos, tag))
                        for entry, pos in zip(conll_sentence,
                                              root_predicted_postags):
                            entry.pred_pos = pos

                    dump = False

                renew_cg()
                if not dump:
                    yield sentence
Example #12
    def predict(self, sentence):

        for entry in sentence:
            wordvec = self.wlookup(scalar(int(self.vocab.get(
                entry.norm, 0)))) if self.wdims > 0 else None
            posvec = self.plookup(scalar(int(
                self.pos[entry.pos]))) if self.pdims > 0 else None

            evec = None
            if self.ExtnrEmbPath is not None:
                evec = self.elookup(
                    scalar(
                        int(
                            self.vocab.get(entry.form,
                                           self.vocab.get(entry.norm, 0)))))
            # combine the three embeddings
            if self.ExtnrEmbPath is not None:
                entry.wordvec = wordvec
                entry.ewordvec = evec
                if self.pdims > 0:
                    entry.posvec = posvec
            else:
                entry.wordvec = wordvec
                if self.pdims > 0:
                    entry.posvec = posvec
        # run each embedding sequence through its LSTM
        word_lstm_input = torch.stack([entry.wordvec
                                       for entry in sentence])  #len * dim
        if self.pdims > 0:
            pos_lstm_input = torch.stack([entry.posvec
                                          for entry in sentence])  #len * dim
        if self.ExtnrEmbPath is not None:
            eword_lstm_input = torch.stack(
                [entry.ewordvec for entry in sentence])

        word_lstm_out, _ = self.word_lstm(word_lstm_input)
        if self.pdims > 0:
            pos_lstm_out, _ = self.pos_lstm(pos_lstm_input)
        if self.ExtnrEmbPath is not None:
            eword_lstm_out, _ = self.eword_lstm(eword_lstm_input)

        kl_temp = 1.0
        lp_eword_mean, lp_eword_logvar, lp_word_mean, lp_word_logvar, lp_pos_mean, lp_pos_logvar = None, None, None, None, None, None
        # pass each sequence through its stochastic (variational) layer
        word_z = self.word_vgl.predict(word_lstm_out)

        if self.pdims > 0:
            pos_z = self.pos_vgl.predict(pos_lstm_out)

        if self.ExtnrEmbPath is not None:
            eword_z = self.eword_vgl.predict(eword_lstm_out)

        if self.ExtnrEmbPath is not None:
            if self.pdims > 0:
                scoring_input = cat([word_z, eword_z, pos_z])
            else:
                scoring_input = cat([word_z, eword_z])
        else:
            if self.pdims > 0:
                scoring_input = cat([word_z, pos_z])
            else:
                scoring_input = word_z

        scoring_input = scoring_input.squeeze(1)

        scores, exprs = self.cal_scores(scoring_input)
        heads = decoder.parse_proj(scores)

        for entry, head in zip(sentence, heads):
            entry.pred_parent_id = head
            entry.pred_relation = '_'
Example #13
    def Predict(self, conll_path, BATCH_SIZE=1):
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence_batch in enumerate(
                    stream_to_batch(read_conll(conllFP), BATCH_SIZE)):

                batch_exprs = []
                sents = []
                labels = []
                for sentence in sentence_batch:
                    conll_sentence = [
                        entry for entry in sentence
                        if isinstance(entry, utils.ConllEntry)
                    ]

                    for entry in conll_sentence:
                        wordvec = self.wlookup[int(
                            self.vocab.get(entry.norm,
                                           0))] if self.wdims > 0 else None
                        posvec = self.plookup[int(
                            self.pos[entry.pos])] if self.pdims > 0 else None
                        evec = self.elookup[int(
                            self.extrnd.get(entry.form,
                                            self.extrnd.get(entry.norm, 0))
                        )] if self.external_embedding is not None else None
                        entry.vec = concatenate(
                            filter(None, [wordvec, posvec, evec]))

                        entry.lstms = [entry.vec, entry.vec]
                        entry.headfov = None
                        entry.modfov = None

                        entry.rheadfov = None
                        entry.rmodfov = None

                    if self.blstmFlag:
                        lstm_forward = self.builders[0].initial_state()
                        lstm_backward = self.builders[1].initial_state()

                        for entry, rentry in zip(conll_sentence,
                                                 reversed(conll_sentence)):
                            lstm_forward = lstm_forward.add_input(entry.vec)
                            lstm_backward = lstm_backward.add_input(rentry.vec)

                            entry.lstms[1] = lstm_forward.output()
                            rentry.lstms[0] = lstm_backward.output()

                        if self.bibiFlag:
                            for entry in conll_sentence:
                                entry.vec = concatenate(entry.lstms)

                            blstm_forward = self.bbuilders[0].initial_state()
                            blstm_backward = self.bbuilders[1].initial_state()

                            for entry, rentry in zip(conll_sentence,
                                                     reversed(conll_sentence)):
                                blstm_forward = blstm_forward.add_input(
                                    entry.vec)
                                blstm_backward = blstm_backward.add_input(
                                    rentry.vec)

                                entry.lstms[1] = blstm_forward.output()
                                rentry.lstms[0] = blstm_backward.output()
                    batch_exprs.append(self.__evaluate(conll_sentence, True))
                    sents.append(conll_sentence)

                _s = time.time()
                forward(batch_exprs[-1][-1])
                print "fw1:", time.time() - _s
                batch_heads = []
                _s = time.time()
                for _i, (exprs,
                         conll_sentence) in enumerate(zip(batch_exprs, sents)):
                    scores = np.array(
                        [[output.scalar_value() for output in exprsRow]
                         for exprsRow in exprs])
                    heads = decoder.parse_proj(scores)

                    for entry, head in zip(conll_sentence, heads):
                        entry.pred_parent_id = head
                        entry.pred_relation = '_'
                    batch_heads.append(heads)
                print "decode:", time.time() - _s

                if self.labelsFlag:  # TODO this is currently not batched..
                    labels = []
                    _exps = []
                    for (heads, conll_sentence) in zip(batch_heads, sents):
                        labels_exprs = []
                        for modifier, head in enumerate(heads[1:]):
                            exprs = self.__evaluateLabel(
                                conll_sentence, head, modifier + 1)
                            _exps.append(exprs)
                            labels_exprs.append((head, modifier, exprs))
                        labels.append(labels_exprs)

                    _s = time.time()
                    forward(_exps)
                    print "fw-L:", time.time() - _s
                    for lbls, conll_sentence in zip(labels, sents):
                        for (head, modifier, exprs) in lbls:
                            scores = exprs.value()
                            conll_sentence[modifier +
                                           1].pred_relation = self.irels[max(
                                               enumerate(scores),
                                               key=itemgetter(1))[0]]

                renew_cg()
                if not dump:
                    for sentence in sentence_batch:
                        yield sentence
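
Exemplo 13 relies on two helpers that are not shown here: stream_to_batch, which groups the CoNLL sentence stream into fixed-size batches, and a free function forward, which forces evaluation of a list of DyNet expressions in one pass. The sketch below is a plausible minimal implementation under those assumptions, not the original code.

def stream_to_batch(stream, batch_size):
    # hypothetical helper: group an iterable of sentences into lists of at
    # most batch_size items, yielding the final partial batch as well
    batch = []
    for item in stream:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch

def forward(exprs):
    # hypothetical helper: DyNet builds its graph incrementally, so asking
    # for the value of the most recently created expression also computes
    # every expression created before it
    if exprs:
        exprs[-1].value()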
Exemplo n.º 14
0
    def Predict(self, treebanks, datasplit, options):
        char_map = {}
        if options.char_map_file:
            char_map_fh = codecs.open(options.char_map_file,encoding='utf-8')
            char_map = json.loads(char_map_fh.read())
        # should probably use a namedtuple in get_vocab to make this prettier
        _, test_words, test_chars, _, _, _, test_treebanks, test_langs = utils.get_vocab(treebanks,datasplit,char_map)

        # get external embeddings for the set of words and chars in the
        # test vocab but not in the training vocab
        test_embeddings = defaultdict(lambda: {})
        if options.word_emb_size > 0 and options.ext_word_emb_file:
            new_test_words = \
                    set(test_words) - self.feature_extractor.words.viewkeys()

            print "Number of OOV word types at test time: %i (out of %i)" % (
                len(new_test_words), len(test_words))

            if len(new_test_words) > 0:
                # no point loading embeddings if there are no words to look for
                for lang in test_langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_word_emb_file,
                        lang=lang,
                        words=new_test_words
                    )
                    test_embeddings["words"].update(embeddings)
                    if len(test_langs) > 1 and test_embeddings["words"]:
                        print "External embeddings found for %i words "\
                                "(out of %i)" % \
                                (len(test_embeddings["words"]), len(new_test_words))

        if options.char_emb_size > 0:
            new_test_chars = \
                    set(test_chars) - self.feature_extractor.chars.viewkeys()
            print "Number of OOV char types at test time: %i (out of %i)" % (
                len(new_test_chars), len(test_chars))

            if len(new_test_chars) > 0:
                for lang in test_langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_char_emb_file,
                        lang=lang,
                        words=new_test_chars,
                        chars=True
                    )
                    test_embeddings["chars"].update(embeddings)
                    if len(test_langs) > 1 and test_embeddings["chars"]:
                        print "External embeddings found for %i chars "\
                                "(out of %i)" % \
                                (len(test_embeddings["chars"]), len(new_test_chars))

        data = utils.read_conll_dir(treebanks,datasplit,char_map=char_map)
        for iSentence, osentence in enumerate(data,1):
            sentence = deepcopy(osentence)
            self.feature_extractor.Init(options)
            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
            self.feature_extractor.getWordEmbeddings(conll_sentence, False, options, test_embeddings)

            scores, exprs = self.__evaluate(conll_sentence, True)
            if self.proj:
                heads = decoder.parse_proj(scores)
                #LATTICE solution to multiple roots
                # see https://github.com/jujbob/multilingual-bist-parser/blob/master/bist-parser/bmstparser/src/mstlstm.py
                ## handle the multiple-roots problem: keep the first root
                ## and attach every other root-attached token to it
                root_children = [seq for seq, head in enumerate(heads) if head == 0]
                if len(root_children) > 1:
                    print "sentence has multiple roots; attaching the extra roots to the first one"
                    for seq in root_children[1:]:
                        heads[seq] = root_children[0]
                ## end multiple-roots handling

            else:
                heads = chuliu_edmonds_one_root(scores.T)

            for entry, head in zip(conll_sentence, heads):
                entry.pred_parent_id = head
                entry.pred_relation = '_'

            if self.labelsFlag:
                for modifier, head in enumerate(heads[1:]):
                    scores, exprs = self.__evaluateLabel(conll_sentence, head, modifier+1)
                    conll_sentence[modifier+1].pred_relation = self.feature_extractor.irels[max(enumerate(scores), key=itemgetter(1))[0]]

            dy.renew_cg()

            #keep in memory the information we need, not all the vectors
            oconll_sentence = [entry for entry in osentence if isinstance(entry, utils.ConllEntry)]
            for tok_o, tok in zip(oconll_sentence, conll_sentence):
                tok_o.pred_relation = tok.pred_relation
                tok_o.pred_parent_id = tok.pred_parent_id
            yield osentence
Exemplo n.º 15
0
    def Train(self, conll_path):
        errors = 0
        batch = 0
        eloss = 0.0
        mloss = 0.0
        eerrors = 0
        etotal = 0
        start = time.time()

        shuffledData = []
        if self.train_multilingual:
            file_list = utils.readFileList(conll_path)
            for fileAndLang in file_list:
                print "Reading Training file:", fileAndLang[
                    0], " language: ", fileAndLang[1]
                with open(fileAndLang[0], "r") as conllFP:
                    shuffledData = shuffledData + list(
                        read_conll(conllFP, fileAndLang[1]))
        else:
            with open(conll_path, 'r') as conllFP:
                shuffledData = list(
                    read_conll(conllFP, self.conll_train_language))
        random.shuffle(shuffledData)

        errs = []
        lerrs = []
        eeloss = 0.0
        undefined_term = 0

        for iSentence, sentence in enumerate(shuffledData):
            if iSentence % 100 == 0 and iSentence != 0:
                print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (
                    float(eerrors)) / etotal, 'Time', time.time() - start
                start = time.time()
                eerrors = 0
                eloss = 0.0
                etotal = 0
                lerrors = 0
                ltotal = 0

            conll_sentence = [
                entry for entry in sentence
                if isinstance(entry, utils.ConllEntry)
            ]

            for entry in conll_sentence:
                c = float(self.wordsCount.get(entry.norm, 0))
                dropFlag = (random.random() < (c / (0.25 + c)))
                posvec = self.plookup[int(
                    self.pos[entry.pos])] if self.pdims > 0 else None
                xposvec = self.xplookup[int(
                    self.xpos[entry.xpos])] if self.xpdims > 0 else None

                evec = None
                ecevc = None
                lang_code = entry.language.split(
                    '_')[0] + ':' if self.multilingual_emb else ""

                # Add word and external embedding
                if self.external_embedding is not None:
                    if self.extConcateFlag:
                        wordvec = self.wlookup[int(
                            self.vocab.get(entry.norm.lower(), 0)
                        ) if dropFlag else 0] if self.wdims > 0 else None
                        evec = self.elookup[self.extrnd.get(
                            lang_code + entry.form.lower(),
                            self.extrnd.get(lang_code +
                                            entry.norm.lower(), 0)) if
                                            (dropFlag or
                                             (random.random() < 0.5)) else 0]
                    else:
                        wordvec = self.elookup[self.extrnd.get(
                            lang_code + entry.form.lower(),
                            self.extrnd.get(lang_code +
                                            entry.norm.lower(), 0)) if
                                               (dropFlag or
                                                (random.random() < 0.5)
                                                ) else 0]
                        if self.extrnd.get(
                                lang_code + entry.form.lower(),
                                self.extrnd.get(lang_code + entry.norm.lower(),
                                                0)) == 0:
                            undefined_term = undefined_term + 1
                else:
                    wordvec = self.wlookup[
                        int(self.vocab.get(entry.norm.lower(), 0)
                            ) if dropFlag else 0] if self.wdims > 0 else None

                # Add external cluster embedding
                if self.external_cluster_embedding is not None:
                    ecevc = self.eclookup[self.exctrnd.get(
                        lang_code + entry.form.lower(
                        ), self.exctrnd.get(lang_code +
                                            entry.norm.lower(), 0)) if
                                          (dropFlag or
                                           (random.random() < 0.5)) else 0]

                # Add language embedding
                langvec = self.llookup[self.languageVec_dic[
                    entry.language].lang_num] if self.add_lang_vec else None

                #                    print langvec.value()

                entry.vec = concatenate(
                    filter(None,
                           [wordvec, posvec, xposvec, evec, ecevc, langvec]))

                entry.lstms = [entry.vec, entry.vec]
                entry.headfov = None
                entry.modfov = None

                entry.rheadfov = None
                entry.rmodfov = None

            if self.blstmFlag:
                lstm_forward = self.builders[0].initial_state()
                lstm_backward = self.builders[1].initial_state()
                for entry, rentry in zip(conll_sentence,
                                         reversed(conll_sentence)):
                    lstm_forward = lstm_forward.add_input(entry.vec)
                    lstm_backward = lstm_backward.add_input(rentry.vec)

                    entry.lstms[1] = lstm_forward.output()
                    rentry.lstms[0] = lstm_backward.output()
                if self.bibiFlag:
                    for entry in conll_sentence:
                        entry.vec = concatenate(entry.lstms)
                    blstm_forward = self.bbuilders[0].initial_state()
                    blstm_backward = self.bbuilders[1].initial_state()
                    for entry, rentry in zip(conll_sentence,
                                             reversed(conll_sentence)):
                        blstm_forward = blstm_forward.add_input(entry.vec)
                        blstm_backward = blstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = blstm_forward.output()
                        rentry.lstms[0] = blstm_backward.output()

            scores, exprs = self.__evaluate(conll_sentence, True)
            gold = [entry.parent_id for entry in conll_sentence]
            heads = decoder.parse_proj(scores,
                                       gold if self.costaugFlag else None)

            if self.labelsFlag:
                for modifier, head in enumerate(gold[1:]):
                    rscores, rexprs = self.__evaluateLabel(
                        conll_sentence, head, modifier + 1)
                    goldLabelInd = self.rels[conll_sentence[modifier +
                                                            1].relation]
                    wrongLabelInd = max(((l, scr)
                                         for l, scr in enumerate(rscores)
                                         if l != goldLabelInd),
                                        key=itemgetter(1))[0]
                    if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                        lerrs.append(rexprs[wrongLabelInd] -
                                     rexprs[goldLabelInd])

            e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
            eerrors += e
            if e > 0:
                loss = [(exprs[h][i] - exprs[g][i])
                        for i, (h, g) in enumerate(zip(heads, gold))
                        if h != g]  # * (1.0/float(e))
                eloss += (e)
                mloss += (e)
                errs.extend(loss)

            etotal += len(conll_sentence)

            # "% 1" is always 0: the model updates after every sentence
            if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0:
                eeloss = 0.0

                if len(errs) > 0 or len(lerrs) > 0:
                    eerrs = (esum(errs + lerrs))  #* (1.0/(float(len(errs))))
                    eerrs.scalar_value()
                    eerrs.backward()
                    self.trainer.update()
                    errs = []
                    lerrs = []

                renew_cg()

        if len(errs) > 0 or len(lerrs) > 0:
            eerrs = (esum(errs + lerrs))  #* (1.0/(float(len(errs))))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []
            eeloss = 0.0

            renew_cg()
        print " # of uddefined_term= ", undefined_term
        self.trainer.update_epoch()
        print "Loss: ", mloss / iSentence
Exemplo n.º 16
0
    def Train(self, conll_path):
        errors = 0
        batch = 0
        eloss = 0.0
        mloss = 0.0
        eerrors = 0
        etotal = 0
        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP))
            random.shuffle(shuffledData)

            errs = []
            lerrs = []
            eeloss = 0.0

            for iSentence, sentence in enumerate(shuffledData):
                if iSentence % 100 == 0 and iSentence != 0:
                    print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Time', time.time()-start
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    lerrors = 0
                    ltotal = 0

                conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]

                for entry in conll_sentence:
                    c = float(self.wordsCount.get(entry.norm, 0))
                    dropFlag = (random.random() < (c/(0.25+c)))
                    wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0)) if dropFlag else 0] if self.wdims > 0 else None
                    posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
                    evec = None

                    if self.external_embedding is not None:
                        evec = self.elookup[self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0)) if (dropFlag or (random.random() < 0.5)) else 0]
                    entry.vec = concatenate(filter(None, [wordvec, posvec, evec]))

                    entry.lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()

                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    if self.bibiFlag:
                        for entry in conll_sentence:
                            entry.vec = concatenate(entry.lstms)

                        blstm_forward = self.bbuilders[0].initial_state()
                        blstm_backward = self.bbuilders[1].initial_state()

                        for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                            blstm_forward = blstm_forward.add_input(entry.vec)
                            blstm_backward = blstm_backward.add_input(rentry.vec)

                            entry.lstms[1] = blstm_forward.output()
                            rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(conll_sentence, True)
                gold = [entry.parent_id for entry in conll_sentence]
                heads = decoder.parse_proj(scores, gold if self.costaugFlag else None)

                if self.labelsFlag:
                    for modifier, head in enumerate(gold[1:]):
                        rscores, rexprs = self.__evaluateLabel(conll_sentence, head, modifier+1)
                        goldLabelInd = self.rels[conll_sentence[modifier+1].relation]
                        wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores) if l != goldLabelInd), key=itemgetter(1))[0]
                        if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                            lerrs.append(rexprs[wrongLabelInd] - rexprs[goldLabelInd])

                e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
                eerrors += e
                if e > 0:
                    loss = [(exprs[h][i] - exprs[g][i]) for i, (h,g) in enumerate(zip(heads, gold)) if h != g] # * (1.0/float(e))
                    eloss += (e)
                    mloss += (e)
                    errs.extend(loss)

                etotal += len(conll_sentence)

                if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0:
                    eeloss = 0.0

                    if len(errs) > 0 or len(lerrs) > 0:
                        eerrs = (esum(errs + lerrs)) #* (1.0/(float(len(errs))))
                        eerrs.scalar_value()
                        eerrs.backward()
                        self.trainer.update()
                        errs = []
                        lerrs = []

                    renew_cg()

        if len(errs) > 0 or len(lerrs) > 0:
            eerrs = (esum(errs + lerrs)) #* (1.0/(float(len(errs))))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []
            eeloss = 0.0

            renew_cg()

        self.trainer.update_epoch()
        print "Loss: ", mloss/iSentence
Exemplo n.º 17
0
    def predict(self, sentence):
        for entry in sentence:
            wordvec = self.wlookup(scalar(int(self.vocab.get(
                entry.norm, 0)))) if self.wdims > 0 else None
            posvec = self.plookup(scalar(int(
                self.pos[entry.pos]))) if self.pdims > 0 else None
            '''ontovec = self.olookup(
                scalar(int(self.onto[entry.onto]))) if self.odims > 0 else None
            cposvec = self.clookup(
                scalar(int(self.cpos[entry.cpos]))) if self.cdims > 0 else None
            evec = self.elookup(scalar(int(self.extrnd.get(entry.form,
                                                           self.extrnd.get(entry.norm, 0))))) if self.external_embedding is not None else None'''
            #entry.vec = cat([wordvec, posvec, ontovec, cposvec, evec])
            gaze_feats = Variable(
                torch.unsqueeze(torch.Tensor(entry.gaze_feats), 0))
            entry.vec = cat([wordvec, posvec, gaze_feats])
            #entry.vec = posvec

            entry.lstms = [entry.vec, entry.vec]
            entry.headfov = None
            entry.modfov = None

            entry.rheadfov = None
            entry.rmodfov = None

        num_vec = len(sentence)
        vec_for = torch.cat([entry.vec
                             for entry in sentence]).view(num_vec, 1, -1)
        vec_back = torch.cat([entry.vec for entry in reversed(sentence)
                              ]).view(num_vec, 1, -1)
        res_for_1, self.hid_for_1 = self.lstm_for_1(vec_for, self.hid_for_1)
        res_back_1, self.hid_back_1 = self.lstm_back_1(vec_back,
                                                       self.hid_back_1)

        vec_cat = [
            cat([res_for_1[i], res_back_1[num_vec - i - 1]])
            for i in range(num_vec)
        ]

        vec_for_2 = torch.cat(vec_cat).view(num_vec, 1, -1)
        vec_back_2 = torch.cat(list(reversed(vec_cat))).view(num_vec, 1, -1)
        res_for_2, self.hid_for_2 = self.lstm_for_2(vec_for_2, self.hid_for_2)
        res_back_2, self.hid_back_2 = self.lstm_back_2(vec_back_2,
                                                       self.hid_back_2)

        for i in range(num_vec):
            sentence[i].lstms[0] = res_for_2[i]
            sentence[i].lstms[1] = res_back_2[num_vec - i - 1]

        scores, exprs = self.__evaluate(sentence, True)
        heads = decoder.parse_proj(scores)

        for entry, head in zip(sentence, heads):
            entry.pred_parent_id = head
            entry.pred_relation = '_'

        head_list = list(heads)
        for modifier, head in enumerate(head_list[1:]):
            scores, exprs = self.__evaluateLabel(sentence, head, modifier + 1)
            sentence[modifier + 1].pred_relation = self.rel_list[max(
                enumerate(scores), key=itemgetter(1))[0]]
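
The PyTorch examples (17, 18, and 22) index their embedding tables through a scalar helper whose definition is omitted. A minimal sketch, assuming it lifts a Python int into a one-element LongTensor Variable suitable for an nn.Embedding lookup:

import torch
from torch.autograd import Variable

def scalar(i):
    # hypothetical helper: wrap an int as a 1-element LongTensor Variable
    return Variable(torch.LongTensor([i]))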
Exemplo n.º 18
0
    def forward(self, sentence, errs, lerrs):

        for entry in sentence:
            c = float(self.wordsCount.get(entry.norm, 0))
            dropFlag = (random.random() < (c / (0.25 + c)))
            wordvec = self.wlookup(
                scalar(int(self.vocab.get(entry.norm, 0)) if dropFlag else 0)
            ) if self.wdims > 0 else None
            posvec = self.plookup(scalar(int(
                self.pos[entry.pos]))) if self.pdims > 0 else None
            evec = None
            if self.external_embedding is not None:
                evec = self.elookup(
                    scalar(
                        self.extrnd.
                        get(entry.form, self.extrnd.get(entry.norm, 0)) if (
                            dropFlag or (random.random() < 0.5)) else 0))
            entry.vec = cat([wordvec, posvec, evec])

            entry.lstms = [entry.vec, entry.vec]
            entry.headfov = None
            entry.modfov = None

            entry.rheadfov = None
            entry.rmodfov = None

        if self.blstmFlag:
            lstm_forward = RNNState(self.builders[0])
            lstm_backward = RNNState(self.builders[1])

            for entry, rentry in zip(sentence, reversed(sentence)):
                lstm_forward = lstm_forward.next(entry.vec)
                lstm_backward = lstm_backward.next(rentry.vec)

                entry.lstms[1] = lstm_forward()
                rentry.lstms[0] = lstm_backward()

            if self.bibiFlag:
                for entry in sentence:
                    entry.vec = cat(entry.lstms)

                blstm_forward = RNNState(self.bbuilders[0])
                blstm_backward = RNNState(self.bbuilders[1])

                for entry, rentry in zip(sentence, reversed(sentence)):
                    blstm_forward = blstm_forward.next(entry.vec)
                    blstm_backward = blstm_backward.next(rentry.vec)

                    entry.lstms[1] = blstm_forward()
                    rentry.lstms[0] = blstm_backward()

        scores, exprs = self.__evaluate(sentence, True)
        gold = [entry.parent_id for entry in sentence]
        heads = decoder.parse_proj(scores, gold if self.costaugFlag else None)

        if self.labelsFlag:
            for modifier, head in enumerate(gold[1:]):
                rscores, rexprs = self.__evaluateLabel(sentence, head,
                                                       modifier + 1)
                goldLabelInd = self.rels[sentence[modifier + 1].relation]
                wrongLabelInd = \
                    max(((l, scr) for l, scr in enumerate(rscores) if l != goldLabelInd), key=itemgetter(1))[0]
                if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                    lerrs += [rexprs[wrongLabelInd] - rexprs[goldLabelInd]]

        e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
        if e > 0:
            errs += [(exprs[h][i] - exprs[g][i])[0]
                     for i, (h, g) in enumerate(zip(heads, gold)) if h != g]
        return e
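
Exemplo 18 threads its BiLSTM through an RNNState wrapper that is not defined here. A plausible minimal sketch, assuming states are immutable, next(x) consumes one input and returns the successor state, and calling a state returns its output:

class RNNState(object):
    # hypothetical helper: functional wrapper around a step-wise RNN cell
    def __init__(self, cell, hidden=None):
        self.cell = cell
        self.hidden = hidden

    def next(self, x):
        # feed one input and return the successor state
        return RNNState(self.cell, self.cell(x, self.hidden))

    def __call__(self):
        # the output carried by this state
        return self.hidden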
Exemplo n.º 19
0
    def forward(self, sentence):

        self.process_sentence_embeddings(sentence)

        num_vec = len(sentence)
        features_for = [entry.vec for entry in sentence]
        features_back = [entry.vec for entry in reversed(sentence)]
        vec_for = torch.cat(features_for).view(num_vec, 1, -1)
        vec_back = torch.cat(features_back).view(num_vec, 1, -1)
        res_for_1, self.hid_for_1 = self.lstm_for_1(vec_for, self.hid_for_1)
        res_back_1, self.hid_back_1 = self.lstm_back_1(vec_back,
                                                       self.hid_back_1)

        vec_cat = [
            concatenate_tensors([res_for_1[i], res_back_1[num_vec - i - 1]])
            for i in range(num_vec)
        ]

        vec_for_2 = torch.cat(vec_cat).view(num_vec, 1, -1)
        vec_back_2 = torch.cat(list(reversed(vec_cat))).view(num_vec, 1, -1)
        res_for_2, self.hid_for_2 = self.lstm_for_2(vec_for_2, self.hid_for_2)
        res_back_2, self.hid_back_2 = self.lstm_back_2(vec_back_2,
                                                       self.hid_back_2)

        for i in range(num_vec):
            sentence[i].lstms[0] = res_for_2[i]
            sentence[i].lstms[1] = res_back_2[num_vec - i - 1]

        scores, exprs = self.__evaluate(sentence)
        gold = [entry.parent_id for entry in sentence]
        heads = decoder.parse_proj(scores, gold)

        lerrs = []
        for modifier, head in enumerate(gold[1:]):

            if sentence[head].rheadfov is None:
                sentence[head].rheadfov = torch.mm(
                    concatenate_tensors(
                        [sentence[head].lstms[0], sentence[head].lstms[1]]),
                    self.rhidLayerFOH)

            if sentence[modifier + 1].rmodfov is None:
                sentence[modifier + 1].rmodfov = torch.mm(
                    concatenate_tensors([
                        sentence[modifier + 1].lstms[0],
                        sentence[modifier + 1].lstms[1]
                    ]), self.rhidLayerFOM)

            rscores, rexprs = self.__evaluateLabel(
                sentence[head].rheadfov, sentence[modifier + 1].rmodfov)
            goldLabelInd = self.rels[sentence[modifier + 1].relation]
            wrongLabelInd = max(
                ((l, scr)
                 for l, scr in enumerate(rscores) if l != goldLabelInd),
                key=itemgetter(1))[0]
            if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                lerrs += [rexprs[wrongLabelInd] - rexprs[goldLabelInd]]

        e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
        errs = []
        if e > 0:
            errs += [(exprs[h][i] - exprs[g][i])[0]
                     for i, (h, g) in enumerate(zip(heads, gold)) if h != g]
        return e, errs, lerrs
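
The PyTorch snippets join per-token features with cat / concatenate_tensors, neither of which is shown. A minimal sketch, assuming 1 x d row tensors concatenated along the feature axis, with None placeholders for disabled feature types skipped:

import torch

def concatenate_tensors(tensors, dim=1):
    # hypothetical helper: join 1 x d row tensors along the feature axis,
    # ignoring None entries for feature types that are switched off
    return torch.cat([t for t in tensors if t is not None], dim)

cat = concatenate_tensors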
Exemplo n.º 20
0
    def Train(self, conll_path):
        eloss = 0.0
        mloss = 0.0
        eerrors = 0
        etotal = 0
        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP, self.c2i))
            random.shuffle(shuffledData)

            errs = []
            lerrs = []
            posErrs = []

            for iSentence, sentence in enumerate(shuffledData):
                if iSentence % 500 == 0 and iSentence != 0:
                    print "Processing sentence number: %d" % iSentence, ", Loss: %.4f" % (
                                eloss / etotal), ", Time: %.2f" % (time.time() - start)
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0

                conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]

                for entry in conll_sentence:
                    c = float(self.wordsCount.get(entry.norm, 0))
                    dropFlag = (random.random() < (c / (0.25 + c)))
                    wordvec = self.wlookup[
                        int(self.vocab.get(entry.norm, 0)) if dropFlag else 0] if self.wdims > 0 else None

                    last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in entry.idChars])[-1]
                    rev_last_state = self.char_rnn.predict_sequence(
                        [self.clookup[c] for c in reversed(entry.idChars)])[-1]

                    entry.vec = dynet.dropout(concatenate(filter(None, [wordvec, last_state, rev_last_state])), 0.33)

                    entry.pos_lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                #POS tagging loss
                lstm_forward = self.pos_builders[0].initial_state()
                lstm_backward = self.pos_builders[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    lstm_forward = lstm_forward.add_input(entry.vec)
                    lstm_backward = lstm_backward.add_input(rentry.vec)

                    entry.pos_lstms[1] = lstm_forward.output()
                    rentry.pos_lstms[0] = lstm_backward.output()

                for entry in conll_sentence:
                    entry.pos_vec = concatenate(entry.pos_lstms)

                blstm_forward = self.pos_bbuilders[0].initial_state()
                blstm_backward = self.pos_bbuilders[1].initial_state()

                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    blstm_forward = blstm_forward.add_input(entry.pos_vec)
                    blstm_backward = blstm_backward.add_input(rentry.pos_vec)
                    entry.pos_lstms[1] = blstm_forward.output()
                    rentry.pos_lstms[0] = blstm_backward.output()

                concat_layer = [dynet.dropout(concatenate(entry.pos_lstms), 0.33) for entry in conll_sentence]
                outputFFlayer = self.ffSeqPredictor.predict_sequence(concat_layer)
                posIDs = [self.pos.get(entry.pos) for entry in conll_sentence]
                for pred, gold in zip(outputFFlayer, posIDs):
                    posErrs.append(self.pick_neg_log(pred, gold))

                # Add predicted pos tags
                for entry, poses in zip(conll_sentence, outputFFlayer):
                    entry.vec = concatenate([entry.vec, dynet.dropout(self.plookup[np.argmax(poses.value())], 0.33)])
                    entry.lstms = [entry.vec, entry.vec]

                #Parsing losses
                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()

                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    if self.bibiFlag:
                        for entry in conll_sentence:
                            entry.vec = concatenate(entry.lstms)

                        blstm_forward = self.bbuilders[0].initial_state()
                        blstm_backward = self.bbuilders[1].initial_state()

                        for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                            blstm_forward = blstm_forward.add_input(entry.vec)
                            blstm_backward = blstm_backward.add_input(rentry.vec)

                            entry.lstms[1] = blstm_forward.output()
                            rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(conll_sentence)
                gold = [entry.parent_id for entry in conll_sentence]
                heads = decoder.parse_proj(scores, gold if self.costaugFlag else None)

                if self.labelsFlag:

                    concat_layer = [dynet.dropout(self.__getRelVector(conll_sentence, head, modifier + 1), 0.33) for
                                    modifier, head in enumerate(gold[1:])]
                    outputFFlayer = self.ffRelPredictor.predict_sequence(concat_layer)
                    relIDs = [self.rels[conll_sentence[modifier + 1].relation] for modifier, _ in enumerate(gold[1:])]
                    for pred, goldid in zip(outputFFlayer, relIDs):
                        lerrs.append(self.pick_neg_log(pred, goldid))

                e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
                eerrors += e
                if e > 0:
                    loss = [(exprs[h][i] - exprs[g][i]) for i, (h, g) in enumerate(zip(heads, gold)) if h != g]  # * (1.0/float(e))
                    eloss += (e)
                    mloss += (e)
                    errs.extend(loss)

                etotal += len(conll_sentence)

                if iSentence % 1 == 0:
                    if len(errs) > 0 or len(lerrs) > 0 or len(posErrs) > 0:
                        eerrs = (esum(errs + lerrs + posErrs))
                        eerrs.scalar_value()
                        eerrs.backward()
                        self.trainer.update()
                        errs = []
                        lerrs = []
                        posErrs = []

                    renew_cg()

        print "Loss: %.4f" % (mloss / iSentence)
Exemplo n.º 21
0
    def Train(self, conll_path):
        errors = 0
        batch = 0
        eloss = 0.0
        mloss = 0.0
        eerrors = 0
        etotal = 0
        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP))
            random.shuffle(shuffledData)

            errs = []
            lerrs = []
            eeloss = 0.0

            for iSentence, sentence in enumerate(shuffledData):
                if iSentence % 100 == 0 and iSentence != 0:
                    print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (
                        float(eerrors)) / etotal, 'Time', time.time() - start
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    lerrors = 0
                    ltotal = 0

                conll_sentence = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]

                for entry in conll_sentence:
                    c = float(self.wordsCount.get(entry.norm, 0))
                    dropFlag = (random.random() < (c / (0.25 + c)))
                    wordvec = self.wlookup[
                        int(self.vocab.get(entry.norm, 0)
                            ) if dropFlag else 0] if self.wdims > 0 else None
                    posvec = self.plookup[int(
                        self.pos[entry.pos])] if self.pdims > 0 else None
                    evec = None

                    if self.external_embedding is not None:
                        evec = self.elookup[self.extrnd.get(
                            entry.form, self.extrnd.get(entry.norm, 0)) if
                                            (dropFlag or
                                             (random.random() < 0.5)) else 0]
                    entry.vec = concatenate(
                        filter(None, [wordvec, posvec, evec]))

                    entry.lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()

                    for entry, rentry in zip(conll_sentence,
                                             reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    if self.bibiFlag:
                        for entry in conll_sentence:
                            entry.vec = concatenate(entry.lstms)

                        blstm_forward = self.bbuilders[0].initial_state()
                        blstm_backward = self.bbuilders[1].initial_state()

                        for entry, rentry in zip(conll_sentence,
                                                 reversed(conll_sentence)):
                            blstm_forward = blstm_forward.add_input(entry.vec)
                            blstm_backward = blstm_backward.add_input(
                                rentry.vec)

                            entry.lstms[1] = blstm_forward.output()
                            rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(conll_sentence, True)
                gold = [entry.parent_id for entry in conll_sentence]
                heads = decoder.parse_proj(scores,
                                           gold if self.costaugFlag else None)

                if self.labelsFlag:
                    for modifier, head in enumerate(gold[1:]):
                        rscores, rexprs = self.__evaluateLabel(
                            conll_sentence, head, modifier + 1)
                        goldLabelInd = self.rels[conll_sentence[modifier +
                                                                1].relation]
                        wrongLabelInd = max(((l, scr)
                                             for l, scr in enumerate(rscores)
                                             if l != goldLabelInd),
                                            key=itemgetter(1))[0]
                        if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                            lerrs.append(rexprs[wrongLabelInd] -
                                         rexprs[goldLabelInd])

                e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
                eerrors += e
                if e > 0:
                    loss = [(exprs[h][i] - exprs[g][i])
                            for i, (h, g) in enumerate(zip(heads, gold))
                            if h != g]  # * (1.0/float(e))
                    eloss += (e)
                    mloss += (e)
                    errs.extend(loss)

                etotal += len(conll_sentence)

                if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0:
                    eeloss = 0.0

                    if len(errs) > 0 or len(lerrs) > 0:
                        eerrs = (esum(errs + lerrs)
                                 )  #* (1.0/(float(len(errs))))
                        eerrs.scalar_value()
                        eerrs.backward()
                        self.trainer.update()
                        errs = []
                        lerrs = []

                    renew_cg()

        if len(errs) > 0 or len(lerrs) > 0:
            eerrs = (esum(errs + lerrs))  #* (1.0/(float(len(errs))))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []
            eeloss = 0.0

            renew_cg()

        self.trainer.update_epoch()
        print "Loss: ", mloss / iSentence
Exemplo n.º 22
0
    def forward(self, sentence, errs, lerrs):

        for entry in sentence:
            c = float(self.wordsCount.get(entry.norm, 0))
            # dropFlag = (random.random() < (c / (0.33 + c)))
            dropFlag = (random.random() < (c / (0.25 + c)))
            wordvec = self.wlookup(
                scalar(int(self.vocab.get(entry.norm, 0)) if dropFlag else 0)
            ) if self.wdims > 0 else None
            '''ontovec = self.olookup(scalar(int(self.onto[entry.onto]) if random.random(
            ) < 0.9 else 0)) if self.odims > 0 else None
            cposvec = self.clookup(scalar(int(self.cpos[entry.cpos]) if random.random(
            ) < 0.9 else 0)) if self.cdims > 0 else None'''
            posvec = self.plookup(scalar(int(
                self.pos[entry.pos]))) if self.pdims > 0 else None
            # posvec = self.plookup(
            #     scalar(0 if dropFlag and random.random() < 0.1 else int(self.pos[entry.pos]))) if self.pdims > 0 else None
            evec = None
            if self.external_embedding is not None:
                evec = self.elookup(
                    scalar(
                        self.extrnd.
                        get(entry.form, self.extrnd.get(entry.norm, 0)) if (
                            dropFlag or (random.random() < 0.5)) else 0))

            #entry.vec = cat([wordvec, posvec, ontovec, cposvec, evec])
            gaze_feats = Variable(
                torch.unsqueeze(torch.Tensor(entry.gaze_feats), 0))
            entry.vec = cat([wordvec, posvec, gaze_feats])
            #entry.vec = posvec
            entry.lstms = [entry.vec, entry.vec]
            entry.headfov = None
            entry.modfov = None

            entry.rheadfov = None
            entry.rmodfov = None

        num_vec = len(sentence)
        vec_for = torch.cat([entry.vec
                             for entry in sentence]).view(num_vec, 1, -1)
        vec_back = torch.cat([entry.vec for entry in reversed(sentence)
                              ]).view(num_vec, 1, -1)
        res_for_1, self.hid_for_1 = self.lstm_for_1(vec_for, self.hid_for_1)
        res_back_1, self.hid_back_1 = self.lstm_back_1(vec_back,
                                                       self.hid_back_1)

        vec_cat = [
            cat([res_for_1[i], res_back_1[num_vec - i - 1]])
            for i in range(num_vec)
        ]

        vec_for_2 = torch.cat(vec_cat).view(num_vec, 1, -1)
        vec_back_2 = torch.cat(list(reversed(vec_cat))).view(num_vec, 1, -1)
        res_for_2, self.hid_for_2 = self.lstm_for_2(vec_for_2, self.hid_for_2)
        res_back_2, self.hid_back_2 = self.lstm_back_2(vec_back_2,
                                                       self.hid_back_2)

        for i in range(num_vec):
            sentence[i].lstms[0] = res_for_2[i]
            sentence[i].lstms[1] = res_back_2[num_vec - i - 1]

        scores, exprs = self.__evaluate(sentence, True)
        gold = [entry.parent_id for entry in sentence]
        heads = decoder.parse_proj(scores, gold)

        for modifier, head in enumerate(gold[1:]):
            rscores, rexprs = self.__evaluateLabel(sentence, head,
                                                   modifier + 1)
            goldLabelInd = self.rels[sentence[modifier + 1].relation]
            wrongLabelInd = \
                max(((l, scr) for l, scr in enumerate(rscores)
                     if l != goldLabelInd), key=itemgetter(1))[0]
            if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                lerrs += [rexprs[wrongLabelInd] - rexprs[goldLabelInd]]

        e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
        if e > 0:
            errs += [(exprs[h][i] - exprs[g][i])[0]
                     for i, (h, g) in enumerate(zip(heads, gold)) if h != g]
        return e
Exemplo n.º 23
0
    def Train(self, conll_path, BATCH_SIZE=1):
        errors = 0
        batch = 0
        eloss = 0.0
        mloss = 0.0
        eerrors = 0
        etotal = 0
        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP))
            random.shuffle(shuffledData)

            errs = []
            lerrs = []
            eeloss = 0.0

            for iSentence, sentence_batch in enumerate(
                    stream_to_batch(shuffledData, BATCH_SIZE)):
                if iSentence % 100 == 0 and iSentence != 0:
                    print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (
                        float(eerrors)) / etotal, 'Time', time.time(
                        ) - start, (100 * BATCH_SIZE) / (time.time() - start)
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    lerrors = 0
                    ltotal = 0

                batch_exprs = []
                sents = []
                golds = []
                labels = []
                _exps = []  # label expressions collected for the whole batch
                for sentence in sentence_batch:
                    conll_sentence = [
                        entry for entry in sentence
                        if isinstance(entry, utils.ConllEntry)
                    ]
                    sents.append(conll_sentence)

                    gold = [entry.parent_id for entry in conll_sentence]
                    golds.append(gold)

                    # initialize sentence
                    for entry in conll_sentence:
                        c = float(self.wordsCount.get(entry.norm, 0))
                        dropFlag = (random.random() < (c / (0.25 + c)))
                        wordvec = self.wlookup[int(
                            self.vocab.get(entry.norm, 0)
                        ) if dropFlag else 0] if self.wdims > 0 else None
                        posvec = self.plookup[int(
                            self.pos[entry.pos])] if self.pdims > 0 else None
                        evec = None

                        if self.external_embedding is not None:
                            evec = self.elookup[self.extrnd.get(
                                entry.form, self.extrnd.get(entry.norm, 0)) if
                                                (dropFlag or
                                                 (random.random() < 0.5)
                                                 ) else 0]
                        entry.vec = concatenate(
                            filter(None, [wordvec, posvec, evec]))

                        entry.lstms = [entry.vec, entry.vec]
                        entry.headfov = None
                        entry.modfov = None

                        entry.rheadfov = None
                        entry.rmodfov = None

                    # bilstm encode
                    if self.blstmFlag:
                        lstm_forward = self.builders[0].initial_state()
                        lstm_backward = self.builders[1].initial_state()

                        for entry, rentry in zip(conll_sentence,
                                                 reversed(conll_sentence)):
                            lstm_forward = lstm_forward.add_input(entry.vec)
                            lstm_backward = lstm_backward.add_input(rentry.vec)

                            entry.lstms[1] = lstm_forward.output()
                            rentry.lstms[0] = lstm_backward.output()

                        if self.bibiFlag:
                            for entry in conll_sentence:
                                entry.vec = concatenate(entry.lstms)

                            blstm_forward = self.bbuilders[0].initial_state()
                            blstm_backward = self.bbuilders[1].initial_state()

                            for entry, rentry in zip(conll_sentence,
                                                     reversed(conll_sentence)):
                                blstm_forward = blstm_forward.add_input(
                                    entry.vec)
                                blstm_backward = blstm_backward.add_input(
                                    rentry.vec)

                                entry.lstms[1] = blstm_forward.output()
                                rentry.lstms[0] = blstm_backward.output()

                    # compute all arc score-expressions
                    batch_exprs.append(self.__evaluate(conll_sentence, True))

                    # labeling? (_exps accumulates over the whole batch so
                    # the single forward pass below covers every sentence)
                    if self.labelsFlag:
                        labels_exprs = []
                        for modifier, head in enumerate(gold[1:]):
                            rexprs = self.__evaluateLabel(
                                conll_sentence, head, modifier + 1)
                            labels_exprs.append((rexprs, head, modifier))
                            _exps.append(rexprs)
                        labels.append(labels_exprs)

                # now do the actual scoring
                _s = time.time()
                forward(batch_exprs[-1][-1] + _exps)
                print "fw1t:", time.time() - _s
                for _i, (exprs,
                         conll_sentence) in enumerate(zip(batch_exprs, sents)):
                    scores = np.array(
                        [[output.scalar_value() for output in exprsRow]
                         for exprsRow in exprs])
                    gold = golds[_i]
                    heads = decoder.parse_proj(
                        scores, gold if self.costaugFlag else None)

                    # TODO labeling is not yet batched
                    if self.labelsFlag:
                        for rexprs, head, modifier in labels[_i]:
                            rscores = rexprs.value()
                            goldLabelInd = self.rels[conll_sentence[
                                modifier + 1].relation]
                            wrongLabelInd = max(
                                ((l, scr) for l, scr in enumerate(rscores)
                                 if l != goldLabelInd),
                                key=itemgetter(1))[0]
                            if rscores[
                                    goldLabelInd] < rscores[wrongLabelInd] + 1:
                                lerrs.append(rexprs[wrongLabelInd] -
                                             rexprs[goldLabelInd])

                    e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
                    eerrors += e
                    if e > 0:
                        loss = [(exprs[h][i] - exprs[g][i])
                                for i, (h, g) in enumerate(zip(heads, gold))
                                if h != g]  # * (1.0/float(e))
                        eloss += (e)
                        mloss += (e)
                        errs.extend(loss)

                    etotal += len(conll_sentence)

                    if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0:
                        eeloss = 0.0

                if len(errs) > 0 or len(lerrs) > 0:
                    eerrs = (esum(errs + lerrs))  #* (1.0/(float(len(errs))))
                    _s = time.time()
                    eerrs.scalar_value()
                    print "fw2t", time.time() - _s
                    _s = time.time()
                    eerrs.backward()
                    print "bw2t", time.time() - _s
                    self.trainer.update()
                    errs = []
                    lerrs = []

                renew_cg()

        if len(errs) > 0 or len(lerrs) > 0:
            eerrs = (esum(errs + lerrs))  #* (1.0/(float(len(errs))))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []
            eeloss = 0.0

            renew_cg()

        self.trainer.update_epoch()
        print "Loss: ", mloss / iSentence
Exemplo n.º 24
0
    def Predict(self, conll_path):
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence in enumerate(
                    read_conll(conllFP, self.conll_test_language)):
                conll_sentence = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]

                for entry in conll_sentence:
                    posID = self.pos.get(entry.pos, 0)
                    posvec = self.plookup[int(
                        posID)] if self.pdims > 0 else None
                    xposID = self.xpos.get(entry.xpos, 0)
                    xposvec = self.xplookup[int(
                        xposID)] if self.xpdims > 0 else None
                    evec = None
                    ecevc = None
                    lang_code = entry.language.split(
                        '_')[0] + ':' if self.multilingual_emb else ""

                    if self.external_embedding is not None:
                        if self.extConcateFlag:
                            wordvec = self.wlookup[int(
                                self.vocab.get(entry.norm.lower(),
                                               0))] if self.wdims > 0 else None
                            evec = self.elookup[self.extrnd.get(
                                lang_code + entry.form.lower(),
                                self.extrnd.get(lang_code + entry.norm.lower(),
                                                0))]
                        else:
                            wordvec = self.elookup[self.extrnd.get(
                                lang_code + entry.form.lower(),
                                self.extrnd.get(lang_code + entry.norm.lower(),
                                                0))]
                    else:
                        wordvec = self.wlookup[int(
                            self.vocab.get(entry.norm.lower(),
                                           0))] if self.wdims > 0 else None

                    if self.external_cluster_embedding is not None:
                        ecevc = self.eclookup[self.exctrnd.get(
                            lang_code + entry.form.lower(),
                            self.exctrnd.get(lang_code + entry.norm.lower(),
                                             0))]

                    # Add language embedding
                    langvec = self.llookup[
                        self.languageVec_dic[entry.language].
                        lang_num] if self.add_lang_vec else None

                    entry.vec = concatenate(
                        filter(
                            None,
                            [wordvec, posvec, xposvec, evec, ecevc, langvec]))
                    entry.lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()

                    for entry, rentry in zip(conll_sentence,
                                             reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    if self.bibiFlag:
                        for entry in conll_sentence:
                            entry.vec = concatenate(entry.lstms)

                        blstm_forward = self.bbuilders[0].initial_state()
                        blstm_backward = self.bbuilders[1].initial_state()

                        for entry, rentry in zip(conll_sentence,
                                                 reversed(conll_sentence)):
                            blstm_forward = blstm_forward.add_input(entry.vec)
                            blstm_backward = blstm_backward.add_input(
                                rentry.vec)

                            entry.lstms[1] = blstm_forward.output()
                            rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(conll_sentence, True)
                heads = decoder.parse_proj(scores)

                ## Handle multiple roots: keep the first root and reattach
                ## every other root to it
                rootHead = [head for head in heads if head == 0]
                if len(rootHead) != 1:
                    print "sentence has multiple roots; reattaching extra roots to the first root"
                    rootHead = [
                        seq for seq, head in enumerate(heads) if head == 0
                    ]
                    for seq in rootHead[1:]:
                        heads[seq] = rootHead[0]
                ## end multi-root handling

                for entry, head in zip(conll_sentence, heads):
                    entry.pred_parent_id = head
                    entry.pred_relation = '_'

                dump = False  # leftover debug flag; sentences are always yielded

                if self.labelsFlag:
                    for modifier, head in enumerate(heads[1:]):
                        scores, exprs = self.__evaluateLabel(
                            conll_sentence, head, modifier + 1)
                        conll_sentence[modifier +
                                       1].pred_relation = self.irels[max(
                                           enumerate(scores),
                                           key=itemgetter(1))[0]]

                renew_cg()
                if not dump: yield sentence
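
The multi-root repair above can be isolated into a small helper; a sketch (the name attach_extra_roots is illustrative, not from the snippet):

    def attach_extra_roots(heads):
        # Keep the first token whose head is 0 (ROOT) and reattach every
        # other root to it, so the result is a single-rooted tree.
        roots = [i for i, h in enumerate(heads) if h == 0]
        for i in roots[1:]:
            heads[i] = roots[0]
        return heads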
Example no. 25
    def Predict(self, conll_path):
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence in enumerate(read_conll(conllFP, self.c2i, self.m2i, self.t2i, self.morph_dict)):
                conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]

                if self.morphTagFlag:
                    sentence_context = []
                    last_state_char = self.char_rnn.predict_sequence([self.clookup[self.c2i["<start>"]]])[-1]
                    rev_last_state_char = self.char_rnn.predict_sequence([self.clookup[self.c2i["<start>"]]])[-1]
                    sentence_context.append(concatenate([last_state_char, rev_last_state_char]))
                    for entry in conll_sentence:
                        last_state_char = self.char_rnn.predict_sequence([self.clookup[c] for c in entry.idChars])
                        rev_last_state_char = self.char_rnn.predict_sequence([self.clookup[c] for c in reversed(entry.idChars)])
                        entry.char_rnn_states = [concatenate([f,b]) for f,b in zip(last_state_char, rev_last_state_char)]
                        sentence_context.append(entry.char_rnn_states[-1])

                for idx, entry in enumerate(conll_sentence):
                    wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None

                    if self.morphTagFlag:
                        entry.vec = concatenate([wordvec, entry.char_rnn_states[-1]])
                    else:
                        last_state_char = self.char_rnn.predict_sequence([self.clookup[c] for c in entry.idChars])[-1]
                        rev_last_state_char = self.char_rnn.predict_sequence([self.clookup[c] for c in reversed(entry.idChars)])[-1]
                        entry.vec = concatenate([wordvec, last_state_char, rev_last_state_char])
                
                for idx, entry in enumerate(conll_sentence):
                    if self.morphFlag:
                        if len(entry.norm) > 2:
                            if self.goldMorphFlag:
                                seg_vec = self.__getSegmentationVector(entry.norm)
                                seg_vec = dynet.vecInput(seg_vec.dim()[0][0])
                                seg_vec.set(entry.idMorphs)
                                morph_seg = utils.generate_morphs(entry.norm, seg_vec.vec_value())
                                entry.pred_seg = morph_seg
                            else:
                                seg_vec = self.__getSegmentationVector(entry.norm)
                                morph_seg = utils.generate_morphs(entry.norm, seg_vec.vec_value())
                                entry.pred_seg = seg_vec.vec_value()
                        else:
                            morph_seg = [entry.norm]
                            entry.pred_seg = entry.idMorphs

                        entry.seg = entry.idMorphs

                        last_state_morph = self.morph_rnn.predict_sequence([self.__getMorphVector(morph) for morph in morph_seg])[-1]
                        rev_last_state_morph = self.morph_rnn.predict_sequence([self.__getMorphVector(morph) for morph in reversed(morph_seg)])[-1]

                        entry.vec = concatenate([entry.vec, last_state_morph, rev_last_state_morph])
                
                morphtag_encodings = []
                for idx, entry in enumerate(conll_sentence):
                    if self.morphTagFlag:
                        if self.goldMorphTagFlag:
                            morph_tags = entry.idMorphTags
                            entry.pred_tags = entry.idMorphTags
                            entry.pred_tags_tokens = [self.i2t[m_tag_id] for m_tag_id in entry.pred_tags]
                        else:                                                    
                            word_context = [c for i, c in enumerate(sentence_context) if i - 1 != idx]
                            entry.pred_tags = self.generate(entry.char_rnn_states, word_context)
                            morph_tags = entry.pred_tags
                            entry.tags = entry.idMorphTags
                            entry.pred_tags_tokens = [self.i2t[m_tag_id] for m_tag_id in entry.pred_tags]

                        last_state_mtag = self.mtag_rnn.predict_sequence([self.tlookup[t] for t in morph_tags])[-1]
                        rev_last_state_mtag = self.mtag_rnn.predict_sequence([self.tlookup[t] for t in reversed(morph_tags)])[-1]
                        current_encoding_mtag = concatenate([last_state_mtag, rev_last_state_mtag])  
                        morphtag_encodings.append(current_encoding_mtag)

                if self.morphTagFlag:
                    forward = []
                    for idx, encoding in enumerate(morphtag_encodings):
                        if idx == 0:
                            forward.append(encoding)
                        else:
                            updated = morphtag_encodings[idx-1]*self.mtag_encoding_composition_alpha \
                                    + encoding*(1-self.mtag_encoding_composition_alpha)
                            forward.append(updated)
                    if self.mtag_encoding_composition_type == "w_sum":
                        upper_morphtag_encodings = forward
                    elif self.mtag_encoding_composition_type == "bi_w_sum":
                        backward = []
                        for idx, r_encoding in enumerate(morphtag_encodings):
                            if idx == len(morphtag_encodings) - 1:
                                backward.append(r_encoding)
                            else:
                                updated = morphtag_encodings[idx+1]*self.mtag_encoding_composition_alpha \
                                        + r_encoding*(1-self.mtag_encoding_composition_alpha)
                                backward.append(updated)
                        upper_morphtag_encodings = [f+b for f,b in zip(forward, backward)]
                    elif self.mtag_encoding_composition_type == "bi_mlp":
                        forward = []
                        backward = []
                        for idx, encoding in enumerate(morphtag_encodings):
                            if idx != 0:
                                f = self.mtag_encoding_f_w * concatenate([encoding, morphtag_encodings[idx-1]]) \
                                            + self.mtag_encoding_f_b
                                forward.append(f)
                            else:
                                forward.append(encoding)
                            if idx != len(morphtag_encodings) - 1:
                                b = self.mtag_encoding_b_w * concatenate([encoding, morphtag_encodings[idx+1]]) \
                                            + self.mtag_encoding_b_b
                                backward.append(b)
                            else:
                                backward.append(encoding)
                        upper_morphtag_encodings = [f+b for f,b in zip(forward, backward)]
                    else:
                        upper_morphtag_encodings = morphtag_encodings

                    for entry, mtag in zip(conll_sentence, upper_morphtag_encodings):
                        entry.vec = concatenate([entry.vec, mtag])


                for idx, entry in enumerate(conll_sentence):
                    entry.pos_lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                # Predict POS tags (used below as features for parsing)
                lstm_forward = self.pos_builders[0].initial_state()
                lstm_backward = self.pos_builders[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    lstm_forward = lstm_forward.add_input(entry.vec)
                    lstm_backward = lstm_backward.add_input(rentry.vec)

                    entry.pos_lstms[1] = lstm_forward.output()
                    rentry.pos_lstms[0] = lstm_backward.output()

                for entry in conll_sentence:
                    entry.pos_vec = concatenate(entry.pos_lstms)

                blstm_forward = self.pos_bbuilders[0].initial_state()
                blstm_backward = self.pos_bbuilders[1].initial_state()

                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    blstm_forward = blstm_forward.add_input(entry.pos_vec)
                    blstm_backward = blstm_backward.add_input(rentry.pos_vec)
                    entry.pos_lstms[1] = blstm_forward.output()
                    rentry.pos_lstms[0] = blstm_backward.output()

                concat_layer = [concatenate(entry.pos_lstms) for entry in conll_sentence]
                outputFFlayer = self.ffSeqPredictor.predict_sequence(concat_layer)
                predicted_pos_indices = [np.argmax(o.value()) for o in outputFFlayer]
                predicted_postags = [self.id2pos[idx] for idx in predicted_pos_indices]

                # Add predicted pos tags for parsing prediction
                for entry, posid in zip(conll_sentence, predicted_pos_indices):
                    entry.vec = concatenate([entry.vec, self.plookup[posid]])
                    entry.lstms = [entry.vec, entry.vec]

                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()

                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    if self.bibiFlag:
                        for entry in conll_sentence:
                            entry.vec = concatenate(entry.lstms)

                        blstm_forward = self.bbuilders[0].initial_state()
                        blstm_backward = self.bbuilders[1].initial_state()

                        for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                            blstm_forward = blstm_forward.add_input(entry.vec)
                            blstm_backward = blstm_backward.add_input(rentry.vec)

                            entry.lstms[1] = blstm_forward.output()
                            rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(conll_sentence)
                heads = decoder.parse_proj(scores)

                # Multiple roots: reattach each extra root to the previous root
                rootCount = 0
                rootWid = -1
                for index, head in enumerate(heads):
                    if head == 0:
                        rootCount += 1
                        if rootCount == 1:
                            rootWid = index
                        if rootCount > 1:
                            heads[index] = rootWid
                            rootWid = index

                for entry, head, pos in zip(conll_sentence, heads, predicted_postags):
                    entry.pred_parent_id = head
                    entry.pred_relation = '_'
                    entry.pred_pos = pos

                dump = False

                if self.labelsFlag:
                    concat_layer = [self.__getRelVector(conll_sentence, head, modifier + 1) for modifier, head in
                                    enumerate(heads[1:])]
                    outputFFlayer = self.ffRelPredictor.predict_sequence(concat_layer)
                    predicted_rel_indices = [np.argmax(o.value()) for o in outputFFlayer]
                    predicted_rels = [self.irels[idx] for idx in predicted_rel_indices]
                    for modifier, head in enumerate(heads[1:]):
                        conll_sentence[modifier + 1].pred_relation = predicted_rels[modifier]

                renew_cg()
                if not dump:
                    yield sentence
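
The "w_sum" composition in this example interpolates each morph-tag encoding with its left neighbour using a fixed weight mtag_encoding_composition_alpha; "bi_w_sum" repeats the pass from the right and adds the two results. A sketch of the forward pass as a standalone function (hypothetical helper; in the snippet the encodings are DyNet expressions, but the arithmetic is identical):

    def w_sum(encodings, alpha):
        # Position 0 is kept as-is; every later position mixes its raw left
        # neighbour (weight alpha) with itself (weight 1 - alpha).
        out = [encodings[0]]
        for prev, cur in zip(encodings, encodings[1:]):
            out.append(prev * alpha + cur * (1 - alpha))
        return out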
Example no. 26
    def Train(self, trainData, options):
        errors = 0
        batch = 0
        eloss = 0.0
        mloss = 0.0
        eerrors = 0
        lerrors = 0
        etotal = 0
        beg = start = time.time()

        random.shuffle(trainData) # in certain cases the data will already have been shuffled after being read from file or while creating dev data

        errs = []
        lerrs = []
        eeloss = 0.0
        self.feature_extractor.Init(options)

        for iSentence, sentence in enumerate(trainData, 1):
            if iSentence % 100 == 0:
                loss_message = 'Processing sentence number: %d'%iSentence + \
                        ' Loss: %.3f'%(eloss / etotal)+ \
                        ' Errors: %.3f'%((float(eerrors)) / etotal)+\
                        ' Labeled Errors: %.3f'%(float(lerrors) / etotal)+\
                        ' Time: %.2gs'%(time.time()-start)
                print loss_message
                start = time.time()
                eerrors = 0
                eloss = 0.0
                etotal = 0
                lerrors = 0
                ltotal = 0

            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
            self.feature_extractor.getWordEmbeddings(conll_sentence, True, options)

            scores, exprs = self.__evaluate(conll_sentence, True)
            gold = [entry.parent_id for entry in conll_sentence]
            if self.proj:
                heads = decoder.parse_proj(scores, gold if self.costaugFlag else None)
            else:
                if self.costaugFlag:
                    #augment the score of non-gold arcs
                    for i in range(len(scores)):
                        for j in range(len(scores)):
                            if gold[j] != i:
                                scores[i][j] += 1.
                heads = chuliu_edmonds_one_root(scores.T)
                heads[0] = -1

            if self.labelsFlag:
                for modifier, head in enumerate(gold[1:]):
                    rscores, rexprs = self.__evaluateLabel(conll_sentence, head, modifier+1)
                    goldLabelInd = self.feature_extractor.rels[conll_sentence[modifier+1].relation]
                    wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores) if l != goldLabelInd), key=itemgetter(1))[0]
                    if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                        lerrs.append(rexprs[wrongLabelInd] - rexprs[goldLabelInd])
                        lerrors += 1  # not an exact count, but gives some indication

            e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
            eerrors += e
            if e > 0:
                loss = [(exprs[h][i] - exprs[g][i]) for i, (h,g) in enumerate(zip(heads, gold)) if h != g]
                loss_value = dy.esum(loss).scalar_value()  # evaluate the sum once
                eloss += loss_value
                mloss += loss_value
                errs.extend(loss)

            etotal += len(conll_sentence)

            if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0:  # batch size 1, so this always fires
                eeloss = 0.0

                if len(errs) > 0 or len(lerrs) > 0:
                    eerrs = dy.esum(errs + lerrs)
                    eerrs.scalar_value()
                    eerrs.backward()
                    self.trainer.update()
                    errs = []
                    lerrs = []

                dy.renew_cg()

        if len(errs) > 0:
            eerrs = dy.esum(errs + lerrs)
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []
            eeloss = 0.0

            dy.renew_cg()

        self.trainer.update()
        print "Loss: ", mloss/iSentence
        print "Total Training Time: %.2gs"%(time.time()-beg)
Example no. 27
    def Train(self, conll_path):
        self.trainer.set_sparse_updates(True)
        eloss = 0.0
        mloss = 0.0
        eerrors = 0
        etotal = 0
        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP, self.c2i, self.m2i, self.t2i, self.morph_dict))
            random.shuffle(shuffledData)

            errs = []
            lerrs = []
            posErrs = []
            segErrs = []
            mTagErrs = []

            for iSentence, sentence in enumerate(shuffledData):
                if iSentence % 500 == 0 and iSentence != 0:
                    print("Processing sentence number: %d" % iSentence, ", Loss: %.4f" % (
                                eloss / etotal), ", Time: %.2f" % (time.time() - start))
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0

                conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]

                if self.morphTagFlag:
                    sentence_context = []
                    last_state_char = self.char_rnn.predict_sequence([self.clookup[self.c2i["<start>"]]])[-1]
                    rev_last_state_char = self.char_rnn.predict_sequence([self.clookup[self.c2i["<start>"]]])[-1]
                    sentence_context.append(concatenate([last_state_char, rev_last_state_char]))
                    for entry in conll_sentence:
                        last_state_char = self.char_rnn.predict_sequence([self.clookup[c] for c in entry.idChars])
                        rev_last_state_char = self.char_rnn.predict_sequence([self.clookup[c] for c in reversed(entry.idChars)])
                        entry.char_rnn_states = [concatenate([f,b]) for f,b in zip(last_state_char, rev_last_state_char)]
                        sentence_context.append(entry.char_rnn_states[-1])

                for idx, entry in enumerate(conll_sentence):
                    c = float(self.wordsCount.get(entry.norm, 0))
                    dropFlag = (random.random() < (c / (0.25 + c)))
                    wordvec = self.wlookup[
                        int(self.vocab.get(entry.norm, 0)) if dropFlag else 0] if self.wdims > 0 else None
                    if self.morphTagFlag :
                        entry.vec = dynet.dropout(concatenate([wordvec, entry.char_rnn_states[-1]]), 0.33)
                    else:
                        last_state_char = self.char_rnn.predict_sequence([self.clookup[c] for c in entry.idChars])[-1]
                        rev_last_state_char = self.char_rnn.predict_sequence([self.clookup[c] for c in reversed(entry.idChars)])[-1]
                        entry.vec = dynet.dropout(concatenate([wordvec, last_state_char, rev_last_state_char]), 0.33)

                for idx, entry in enumerate(conll_sentence):
                    if self.morphFlag:
                        if len(entry.norm) > 2:
                            if self.goldMorphFlag:
                                seg_vec = self.__getSegmentationVector(entry.norm)
                                seg_vec = dynet.vecInput(seg_vec.dim()[0][0])
                                seg_vec.set(entry.idMorphs)
                                morph_seg = utils.generate_morphs(entry.norm, seg_vec.vec_value())
                            else:
                                seg_vec = self.__getSegmentationVector(entry.norm)
                                morph_seg = utils.generate_morphs(entry.norm, seg_vec.vec_value())
                                vec_gold = dynet.vecInput(seg_vec.dim()[0][0])
                                vec_gold.set(entry.idMorphs)
                                segErrs.append(self.binary_crossentropy(seg_vec,vec_gold))
                        else:
                            morph_seg = [entry.norm]

                        last_state_morph = self.morph_rnn.predict_sequence([self.__getMorphVector(morph) for morph in morph_seg])[-1]
                        rev_last_state_morph = self.morph_rnn.predict_sequence([self.__getMorphVector(morph) for morph in reversed(morph_seg)])[
                            -1]
                        encoding_morph = concatenate([last_state_morph, rev_last_state_morph])
                        entry.vec = concatenate([entry.vec, dynet.dropout(encoding_morph, 0.33)])

                morphtag_encodings = []
                for idx, entry in enumerate(conll_sentence):
                    if self.morphTagFlag:
                        if self.goldMorphTagFlag:	
                            morph_tags = entry.idMorphTags
                        else:
                            word_context = [c for i, c in enumerate(sentence_context) if i-1 != idx]
                            mTagErrs.append(
                                self.__getLossMorphTagging(entry.char_rnn_states, entry.idMorphTags, word_context))
                            predicted_sequence = self.generate(entry.char_rnn_states, word_context)
                            morph_tags = predicted_sequence

                        last_state_mtag = self.mtag_rnn.predict_sequence([self.tlookup[t] for t in morph_tags])[-1]
                        rev_last_state_mtag = self.mtag_rnn.predict_sequence([self.tlookup[t] for t in reversed(morph_tags)])[-1]
                        current_encoding_mtag = concatenate([last_state_mtag, rev_last_state_mtag])
                        morphtag_encodings.append(current_encoding_mtag)
        
                if self.morphTagFlag:
                    forward = []
                    for idx, encoding in enumerate(morphtag_encodings):
                        if idx == 0:
                            forward.append(encoding)
                        else:
                            updated = morphtag_encodings[idx-1]*self.mtag_encoding_composition_alpha \
                                    + encoding*(1-self.mtag_encoding_composition_alpha)
                            forward.append(updated)
                    if self.mtag_encoding_composition_type == "w_sum":
                        upper_morphtag_encodings = forward
                    elif self.mtag_encoding_composition_type == "bi_w_sum":
                        backward = []
                        for idx, r_encoding in enumerate(morphtag_encodings):
                            if idx == len(morphtag_encodings) - 1:
                                backward.append(r_encoding)
                            else:
                                updated = morphtag_encodings[idx+1]*self.mtag_encoding_composition_alpha \
                                        + r_encoding*(1-self.mtag_encoding_composition_alpha)
                                backward.append(updated)
                        upper_morphtag_encodings = [f+b for f,b in zip(forward, backward)]   
                    elif self.mtag_encoding_composition_type == "bi_mlp":
                        forward = []
                        backward = []
                        for idx, encoding in enumerate(morphtag_encodings):
                            if idx != 0:
                                f = self.mtag_encoding_f_w * concatenate([encoding, morphtag_encodings[idx-1]]) \
                                            + self.mtag_encoding_f_b
                                forward.append(f)
                            else:
                                forward.append(encoding)
                            if idx != len(morphtag_encodings) - 1:
                                b = self.mtag_encoding_b_w * concatenate([encoding, morphtag_encodings[idx+1]]) \
                                            + self.mtag_encoding_b_b
                                backward.append(b)
                            else:
                                backward.append(encoding)
                        upper_morphtag_encodings = [f+b for f,b in zip(forward, backward)]
                    else:
                        upper_morphtag_encodings = morphtag_encodings
                    for entry, mtag in zip(conll_sentence, upper_morphtag_encodings):
                        entry.vec = concatenate([entry.vec, dynet.dropout(mtag, 0.33)])

                for idx, entry in enumerate(conll_sentence):
                    entry.pos_lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                #POS tagging loss
                lstm_forward = self.pos_builders[0].initial_state()
                lstm_backward = self.pos_builders[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    lstm_forward = lstm_forward.add_input(entry.vec)
                    lstm_backward = lstm_backward.add_input(rentry.vec)

                    entry.pos_lstms[1] = lstm_forward.output()
                    rentry.pos_lstms[0] = lstm_backward.output()

                for entry in conll_sentence:
                    entry.pos_vec = concatenate(entry.pos_lstms)

                blstm_forward = self.pos_bbuilders[0].initial_state()
                blstm_backward = self.pos_bbuilders[1].initial_state()

                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    blstm_forward = blstm_forward.add_input(entry.pos_vec)
                    blstm_backward = blstm_backward.add_input(rentry.pos_vec)
                    entry.pos_lstms[1] = blstm_forward.output()
                    rentry.pos_lstms[0] = blstm_backward.output()

                concat_layer = [dynet.dropout(concatenate(entry.pos_lstms), 0.33) for entry in conll_sentence]
                outputFFlayer = self.ffSeqPredictor.predict_sequence(concat_layer)
                posIDs = [self.pos.get(entry.pos) for entry in conll_sentence]
                for pred, gold in zip(outputFFlayer, posIDs):
                    posErrs.append(self.pick_neg_log(pred, gold))

                # Add predicted pos tags
                for entry, poses in zip(conll_sentence, outputFFlayer):
                    entry.vec = concatenate([entry.vec, dynet.dropout(self.plookup[np.argmax(poses.value())], 0.33)])
                    entry.lstms = [entry.vec, entry.vec]

                #Parsing losses
                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()

                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    if self.bibiFlag:
                        for entry in conll_sentence:
                            entry.vec = concatenate(entry.lstms)

                        blstm_forward = self.bbuilders[0].initial_state()
                        blstm_backward = self.bbuilders[1].initial_state()

                        for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                            blstm_forward = blstm_forward.add_input(entry.vec)
                            blstm_backward = blstm_backward.add_input(rentry.vec)

                            entry.lstms[1] = blstm_forward.output()
                            rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(conll_sentence)
                gold = [entry.parent_id for entry in conll_sentence]
                heads = decoder.parse_proj(scores, gold if self.costaugFlag else None)

                if self.labelsFlag:

                    concat_layer = [dynet.dropout(self.__getRelVector(conll_sentence, head, modifier + 1), 0.33) for
                                    modifier, head in enumerate(gold[1:])]
                    outputFFlayer = self.ffRelPredictor.predict_sequence(concat_layer)
                    relIDs = [self.rels[conll_sentence[modifier + 1].relation] for modifier, _ in enumerate(gold[1:])]
                    for pred, goldid in zip(outputFFlayer, relIDs):
                        lerrs.append(self.pick_neg_log(pred, goldid))

                e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
                eerrors += e
                if e > 0:
                    loss = [(exprs[h][i] - exprs[g][i]) for i, (h, g) in enumerate(zip(heads, gold)) if h != g]  # * (1.0/float(e))
                    eloss += (e)
                    mloss += (e)
                    errs.extend(loss)

                etotal += len(conll_sentence)

                if iSentence % 1 == 0:  # batch size 1, so this always fires
                    if len(errs) > 0 or len(lerrs) > 0 or len(posErrs) > 0 or len(segErrs) > 0 or len(mTagErrs) > 0:
                        eerrs = (esum(errs + lerrs + posErrs + segErrs + mTagErrs))
                        eerrs.scalar_value()
                        eerrs.backward()
                        self.trainer.update()
                        errs = []
                        lerrs = []
                        posErrs = []
                        segErrs = []
                        mTagErrs = []

                    renew_cg()

        print("Loss: %.4f" % (mloss / iSentence))
Example no. 28
    def Predict(self, conll_path):
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence in enumerate(read_conll(conllFP)):
                conll_sentence = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]

                for entry in conll_sentence:
                    wordvec = self.wlookup[int(self.vocab.get(
                        entry.norm, 0))] if self.wdims > 0 else None
                    posvec = self.plookup[int(
                        self.pos[entry.pos])] if self.pdims > 0 else None
                    evec = self.elookup[int(
                        self.extrnd.get(entry.form,
                                        self.extrnd.get(entry.norm, 0))
                    )] if self.external_embedding is not None else None
                    entry.vec = concatenate(
                        filter(None, [wordvec, posvec, evec]))

                    entry.lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()

                    for entry, rentry in zip(conll_sentence,
                                             reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    if self.bibiFlag:
                        for entry in conll_sentence:
                            entry.vec = concatenate(entry.lstms)

                        blstm_forward = self.bbuilders[0].initial_state()
                        blstm_backward = self.bbuilders[1].initial_state()

                        for entry, rentry in zip(conll_sentence,
                                                 reversed(conll_sentence)):
                            blstm_forward = blstm_forward.add_input(entry.vec)
                            blstm_backward = blstm_backward.add_input(
                                rentry.vec)

                            entry.lstms[1] = blstm_forward.output()
                            rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(conll_sentence, True)
                heads = decoder.parse_proj(scores)

                for entry, head in zip(conll_sentence, heads):
                    entry.pred_parent_id = head
                    entry.pred_relation = '_'

                dump = False

                if self.labelsFlag:
                    for modifier, head in enumerate(heads[1:]):
                        scores, exprs = self.__evaluateLabel(
                            conll_sentence, head, modifier + 1)
                        conll_sentence[modifier +
                                       1].pred_relation = self.irels[max(
                                           enumerate(scores),
                                           key=itemgetter(1))[0]]

                renew_cg()
                if not dump:
                    yield sentence
Example no. 29
    def Predict(self, conll_path):
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence in enumerate(read_conll(conllFP, self.c2i)):
                conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]

                for entry in conll_sentence:
                    wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None
                    evec = self.elookup[int(self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0)))] if self.external_embedding is not None else None
                    
                    last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in entry.idChars])[-1]
                    rev_last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in reversed(entry.idChars)])[-1]

                    # char_state = dynet.noise(concatenate([last_state, rev_last_state]), 0.2)
                    # morph_logit = self.charSeqPredictor.predict_sequence(char_state)
                    # morphID = self.morphs.get(entry.feats)
                    # morphErrs.append(self.pick_neg_log(morph_logit, morphID))
                    # morph_emb = None
                    # for i in morph_logit:
                    #     morph_emb += i * self.mlookup(i)
                      
                    entry.vec = concatenate(filter(None, [wordvec, evec, last_state, rev_last_state]))
                    entry.ch_vec = concatenate([dynet.noise(fe,0.2) for fe in filter(None, [last_state, rev_last_state])])
                    entry.lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                if self.blstmFlag:

                    morcat_layer = [entry.ch_vec for entry in conll_sentence]
                    morph_logits = self.charSeqPredictor.predict_sequence(morcat_layer)
                    predicted_morph_idx = [np.argmax(o.value()) for o in morph_logits]
                    predicted_morphs = [self.id2morph[idx] for idx in predicted_morph_idx]

                    for builder in self.pos_builder:
                        builder.disable_dropout()
                    lstm_forward = self.pos_builder[0].initial_state()
                    lstm_backward = self.pos_builder[1].initial_state()

                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    pos_embed = []
                    concat_layer = [concatenate(entry.lstms) for entry in conll_sentence]
                    outputFFlayer = self.ffSeqPredictor.predict_sequence(concat_layer)
                    predicted_posIDs = [np.argmax(o.value()) for o in outputFFlayer]  
                    predicted_postags = [self.id2pos[idx] for idx in predicted_posIDs]
                    for predID, pred in zip(predicted_posIDs, outputFFlayer):
                        if self.gold_pos:
                            pos_embed.append(self.plookup[predID])
                        else:
                            pos_embed.append(soft_embed(pred.value(), self.plookup))
                            
                    for entry in conll_sentence:
                        entry.vec = concatenate(entry.lstms)
                    for builder in self.dep_builders:
                        builder.disable_dropout()
                    blstm_forward = self.dep_builders[0].initial_state()
                    blstm_backward = self.dep_builders[1].initial_state()

                    for entry, rentry, pembed, revpembed in zip(conll_sentence, reversed(conll_sentence),
                                                                pos_embed, reversed(pos_embed)):
                        blstm_forward = blstm_forward.add_input(concatenate([entry.vec, pembed]))
                        blstm_backward = blstm_backward.add_input(concatenate([rentry.vec, revpembed]))

                        entry.lstms[1] = blstm_forward.output()
                        rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(conll_sentence, True)
                heads = decoder.parse_proj(scores)
                
                # Multiple roots: reattach each extra root to the previous root
                rootCount = 0
                rootWid = -1
                for index, head in enumerate(heads):
                    if head == 0:
                        rootCount += 1
                        if rootCount == 1:
                            rootWid = index
                        if rootCount > 1:    
                            heads[index] = rootWid
                            rootWid = index
                        
                
                for entry, head, pos, feats in zip(conll_sentence, heads, predicted_postags, predicted_morphs):
                    entry.pred_parent_id = head
                    entry.pred_relation = '_'
                    entry.pred_pos = pos
                    entry.pred_feats = feats

                dump = False

                if self.labelsFlag:
                    for modifier, head in enumerate(heads[1:]):
                        scores, exprs = self.__evaluateLabel(conll_sentence, head, modifier+1)
                        conll_sentence[modifier+1].pred_relation = self.irels[max(enumerate(scores), key=itemgetter(1))[0]]

                renew_cg()
                if not dump:
                    yield sentence
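
When gold_pos is disabled, this example feeds the dependency BiLSTM a soft POS embedding instead of the argmax tag. Assuming soft_embed computes the expected embedding under the tagger's posterior, a sketch:

    import dynet

    def soft_embed(probs, lookup):
        # Expected embedding sum_i p_i * E[i], with probs a plain list of
        # floats (the tagger posterior) and lookup a LookupParameters table.
        return dynet.esum([p * lookup[i] for i, p in enumerate(probs)])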
Example no. 30
    def parse(self, indices, arcs=None, pos_indices=None):
        states = self.states(indices, pos_indices)
        scores = np.array(self.score_arcs(states))

        return parse_proj(scores, arcs)
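
A usage sketch for this wrapper (names assumed): passing gold arcs enables cost-augmented decoding during training, matching how parse_proj is used in the other examples.

    heads = model.parse(word_ids)                   # plain prediction
    heads = model.parse(word_ids, arcs=gold_heads)  # cost-augmented (training)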
Example no. 31
    def Predict(self, conll_path):
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence in enumerate(read_conll(conllFP)):
                self.hid2Layer = parameter(self.model["hidden2-layer"])
                self.hid2Bias = parameter(self.model["hidden2-bias"])

                self.hidLayerFOM = parameter(self.model["hidden-layer-fom"])
                self.hidLayerFOH = parameter(self.model["hidden-layer-foh"])
                self.hidBias = parameter(self.model["hidden-bias"])

                self.outLayer = parameter(self.model["output-layer"])

                if self.labelsFlag:
                    self.rhid2Layer = parameter(self.model["rhidden2-layer"])
                    self.rhid2Bias = parameter(self.model["rhidden2-bias"])

                    self.rhidLayerFOM = parameter(self.model["rhidden-layer-fom"])
                    self.rhidLayerFOH = parameter(self.model["rhidden-layer-foh"])
                    self.rhidBias = parameter(self.model["rhidden-bias"])

                    self.routLayer = parameter(self.model["routput-layer"])
                    self.routBias = parameter(self.model["routput-bias"])


                for entry in sentence:
                    wordvec = lookup(self.model["word-lookup"], int(self.vocab.get(entry.norm, 0))) if self.wdims > 0 else None
                    posvec = lookup(self.model["pos-lookup"], int(self.pos[entry.pos])) if self.pdims > 0 else None
                    evec = lookup(self.model["extrn-lookup"], int(self.vocab.get(entry.norm, 0))) if self.external_embedding is not None else None
                    entry.vec = concatenate(filter(None, [wordvec, posvec, evec]))

                    entry.lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()

                    for entry, rentry in zip(sentence, reversed(sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    if self.bibiFlag:
                        for entry in sentence:
                            entry.vec = concatenate(entry.lstms)

                        blstm_forward = self.bbuilders[0].initial_state()
                        blstm_backward = self.bbuilders[1].initial_state()

                        for entry, rentry in zip(sentence, reversed(sentence)):
                            blstm_forward = blstm_forward.add_input(entry.vec)
                            blstm_backward = blstm_backward.add_input(rentry.vec)

                            entry.lstms[1] = blstm_forward.output()
                            rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(sentence, True)
                heads = decoder.parse_proj(scores) 

                for entry, head in zip(sentence, heads):
                    entry.pred_parent_id = head
                    entry.pred_relation = '_'

                dump = False

                if self.labelsFlag:
                    for modifier, head in enumerate(heads[1:]):
                        scores, exprs = self.__evaluateLabel(sentence, head, modifier+1)
                        sentence[modifier+1].pred_relation = self.irels[max(enumerate(scores), key=itemgetter(1))[0]]

                renew_cg()
                if not dump:
                    yield sentence
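
The parameter names in this last example ("hidden-layer-foh", "hidden-layer-fom", "output-layer") suggest the familiar first-order MLP arc scorer: separate projections of the head and modifier BiLSTM states, a nonlinearity, then a scalar output (the hidden2 parameters hint at an optional second layer, omitted here). A sketch of what __evaluate plausibly computes per arc (assumed; the method body is not shown in the snippet):

    from dynet import tanh

    def score_arc(head_vec, mod_vec, hidLayerFOH, hidLayerFOM, hidBias, outLayer):
        # score(h, m) = outLayer * tanh(FOH*h + FOM*m + bias)
        return outLayer * tanh(hidLayerFOH * head_vec + hidLayerFOM * mod_vec + hidBias)

This is the Kiperwasser and Goldberg style scorer whose score matrix decoder.parse_proj consumes.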