def forward(self, sentences, errs, lerrs):
    tmp = time.time()
    self.getWordEmbeddings(sentences, True)
    self.ebd += time.time() - tmp
    for sentence in sentences:
        tmp = time.time()
        scores, exprs = self.__evaluate(sentence, True)
        self.evl += time.time() - tmp
        gold = [entry.parent_id for entry in sentence]
        heads = decoder.parse_proj(scores, gold)
        for modifier, head in enumerate(gold[1:]):
            tmp = time.time()
            rscores, rexprs = self.__evaluateLabel(sentence, head, modifier + 1)
            self.evl += time.time() - tmp
            goldLabelInd = self.rels[sentence[modifier + 1].relation]
            wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores) if l != goldLabelInd),
                                key=itemgetter(1))[0]
            # margin violation between gold label and best wrong label
            if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                lerrs += [rexprs[wrongLabelInd] - rexprs[goldLabelInd]]
        e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
        if e > 0:
            errs += [(exprs[h][i] - exprs[g][i])[0]
                     for i, (h, g) in enumerate(zip(heads, gold)) if h != g]
    return e
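# A minimal driver sketch for the batched forward() above (hypothetical names,
# not the repository's actual loop). It assumes errs/lerrs collect
# differentiable hinge terms (torch-style tensors) that can be summed and
# backpropagated with an optimizer.
def train_step(model, optimizer, sentences):
    errs, lerrs = [], []
    head_errors = model.forward(sentences, errs, lerrs)  # fills errs/lerrs in place
    if errs or lerrs:
        loss = sum(errs + lerrs)  # arc hinge terms + label hinge terms
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return head_errors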
def predict(self, sentence):
    self.process_sentence_embeddings(sentence)
    num_vec = len(sentence)
    vec_for = torch.cat([entry.vec for entry in sentence]).view(num_vec, 1, -1)
    vec_back = torch.cat([entry.vec for entry in reversed(sentence)]).view(num_vec, 1, -1)
    res_for_1, self.hid_for_1 = self.lstm_for_1(vec_for, self.hid_for_1)
    res_back_1, self.hid_back_1 = self.lstm_back_1(vec_back, self.hid_back_1)
    vec_cat = [concatenate_tensors([res_for_1[i], res_back_1[num_vec - i - 1]])
               for i in range(num_vec)]
    vec_for_2 = torch.cat(vec_cat).view(num_vec, 1, -1)
    vec_back_2 = torch.cat(list(reversed(vec_cat))).view(num_vec, 1, -1)
    res_for_2, self.hid_for_2 = self.lstm_for_2(vec_for_2, self.hid_for_2)
    res_back_2, self.hid_back_2 = self.lstm_back_2(vec_back_2, self.hid_back_2)
    for i in range(num_vec):
        sentence[i].lstms[0] = res_for_2[i]
        sentence[i].lstms[1] = res_back_2[num_vec - i - 1]
    scores, exprs = self.__evaluate(sentence, True)
    heads = decoder.parse_proj(scores)
    for entry, head in zip(sentence, heads):
        entry.pred_parent_id = head
        entry.pred_relation = '_'
def Predict(self, conll_path):
    with open(conll_path, 'r') as conllFP:
        for iSentence, sentence in enumerate(read_conll(conllFP)):
            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
            for entry in conll_sentence:
                wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None
                posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
                evec = self.elookup[int(self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0)))] \
                    if self.external_embedding is not None else None
                entry.vec = concatenate(filter(None, [wordvec, posvec, evec]))
                entry.lstms = [entry.vec, entry.vec]
                entry.headfov = None
                entry.modfov = None
                entry.rheadfov = None
                entry.rmodfov = None
            if self.blstmFlag:
                lstm_forward = self.builders[0].initial_state()
                lstm_backward = self.builders[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    lstm_forward = lstm_forward.add_input(entry.vec)
                    lstm_backward = lstm_backward.add_input(rentry.vec)
                    entry.lstms[1] = lstm_forward.output()
                    rentry.lstms[0] = lstm_backward.output()
            if self.bibiFlag:
                for entry in conll_sentence:
                    entry.vec = concatenate(entry.lstms)
                blstm_forward = self.bbuilders[0].initial_state()
                blstm_backward = self.bbuilders[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    blstm_forward = blstm_forward.add_input(entry.vec)
                    blstm_backward = blstm_backward.add_input(rentry.vec)
                    entry.lstms[1] = blstm_forward.output()
                    rentry.lstms[0] = blstm_backward.output()
            scores, exprs = self.__evaluate(conll_sentence, True)
            heads = decoder.parse_proj(scores)
            for entry, head in zip(conll_sentence, heads):
                entry.pred_parent_id = head
                entry.pred_relation = '_'
            dump = False
            if self.labelsFlag:
                for modifier, head in enumerate(heads[1:]):
                    scores, exprs = self.__evaluateLabel(conll_sentence, head, modifier + 1)
                    conll_sentence[modifier + 1].pred_relation = \
                        self.irels[max(enumerate(scores), key=itemgetter(1))[0]]
            renew_cg()
            if not dump:
                yield sentence
def predict(self, sentence):
    for entry in sentence:
        wordvec = self.wlookup(scalar(int(self.vocab.get(entry.norm, 0)))) if self.wdims > 0 else None
        posvec = self.plookup(scalar(int(self.pos[entry.pos]))) if self.pdims > 0 else None
        evec = self.elookup(scalar(int(self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0))))) \
            if self.external_embedding is not None else None
        entry.vec = cat([wordvec, posvec, evec])
        entry.lstms = [entry.vec, entry.vec]
        entry.headfov = None
        entry.modfov = None
        entry.rheadfov = None
        entry.rmodfov = None
    if self.blstmFlag:
        lstm_forward = RNNState(self.builders[0])
        lstm_backward = RNNState(self.builders[1])
        for entry, rentry in zip(sentence, reversed(sentence)):
            lstm_forward = lstm_forward.next(entry.vec)
            lstm_backward = lstm_backward.next(rentry.vec)
            entry.lstms[1] = lstm_forward()
            rentry.lstms[0] = lstm_backward()
    if self.bibiFlag:
        for entry in sentence:
            entry.vec = cat(entry.lstms)
        blstm_forward = RNNState(self.bbuilders[0])
        blstm_backward = RNNState(self.bbuilders[1])
        for entry, rentry in zip(sentence, reversed(sentence)):
            blstm_forward = blstm_forward.next(entry.vec)
            blstm_backward = blstm_backward.next(rentry.vec)
            entry.lstms[1] = blstm_forward()
            rentry.lstms[0] = blstm_backward()
    scores, exprs = self.__evaluate(sentence, True)
    heads = decoder.parse_proj(scores)
    for entry, head in zip(sentence, heads):
        entry.pred_parent_id = head
        entry.pred_relation = '_'
    if self.labelsFlag:
        for modifier, head in enumerate(heads[1:]):
            scores, exprs = self.__evaluateLabel(sentence, head, modifier + 1)
            sentence[modifier + 1].pred_relation = \
                self.irels[max(enumerate(scores), key=itemgetter(1))[0]]
def arc_loss(self, gold_arcs, arc_scores):
    errors = []
    arc_scores_values = np.array([[j.value() for j in i] for i in arc_scores])
    arcs = parse_proj(arc_scores_values, gold_arcs)
    for i in range(len(gold_arcs)):
        if gold_arcs[i] != arcs[i]:
            error = arc_scores[arcs[i]][i] - arc_scores[gold_arcs[i]][i]
            errors.append(error)
    return errors
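# Usage sketch for arc_loss above (a sketch, assuming the DyNet-style esum and
# trainer objects used elsewhere in this file): each returned element is the
# margin between a wrongly predicted arc and the corresponding gold arc.
def train_on_arcs(model, trainer, gold_arcs, arc_scores):
    errors = model.arc_loss(gold_arcs, arc_scores)
    if errors:
        loss = esum(errors)   # sum the per-arc hinge terms
        loss.scalar_value()   # force evaluation of the expression
        loss.backward()
        trainer.update()
    return len(errors)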
def predict(self, sentences):
    self.getWordEmbeddings(sentences, False)
    for sentence in sentences:
        scores, exprs = self.__evaluate(sentence, True)
        heads = decoder.parse_proj(scores)
        for entry, head in zip(sentence, heads):
            entry.pred_parent_id = head
            entry.pred_relation = '_'
        head_list = list(heads)
        for modifier, head in enumerate(head_list[1:]):
            scores, exprs = self.__evaluateLabel(sentence, head, modifier + 1)
            sentence[modifier + 1].pred_relation = \
                self.irels[max(enumerate(scores), key=itemgetter(1))[0]]
def Predict(self, conll_path):
    with open(conll_path, 'r') as conllFP:
        for iSentence, sentence in enumerate(read_conll_predict(conllFP, self.c2i, self.wordsCount)):
            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
            for entry in conll_sentence:
                wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None
                last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in entry.idChars])[-1]
                rev_last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in reversed(entry.idChars)])[-1]
                entry.vec = concatenate(filter(None, [wordvec, last_state, rev_last_state]))
                entry.pos_lstms = [entry.vec, entry.vec]
                entry.headfov = None
                entry.modfov = None
                entry.rheadfov = None
                entry.rmodfov = None

            # Predicted pos tags
            lstm_forward = self.pos_builders[0].initial_state()
            lstm_backward = self.pos_builders[1].initial_state()
            for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                lstm_forward = lstm_forward.add_input(entry.vec)
                lstm_backward = lstm_backward.add_input(rentry.vec)
                entry.pos_lstms[1] = lstm_forward.output()
                rentry.pos_lstms[0] = lstm_backward.output()
            for entry in conll_sentence:
                entry.pos_vec = concatenate(entry.pos_lstms)
            blstm_forward = self.pos_bbuilders[0].initial_state()
            blstm_backward = self.pos_bbuilders[1].initial_state()
            for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                blstm_forward = blstm_forward.add_input(entry.pos_vec)
                blstm_backward = blstm_backward.add_input(rentry.pos_vec)
                entry.pos_lstms[1] = blstm_forward.output()
                rentry.pos_lstms[0] = blstm_backward.output()
            concat_layer = [concatenate(entry.pos_lstms) for entry in conll_sentence]
            outputFFlayer = self.ffSeqPredictor.predict_sequence(concat_layer)
            predicted_pos_indices = [np.argmax(o.value()) for o in outputFFlayer]
            predicted_postags = [self.id2pos[idx] for idx in predicted_pos_indices]

            # Add predicted pos tags for parsing prediction
            for entry, posid in zip(conll_sentence, predicted_pos_indices):
                entry.vec = concatenate([entry.vec, self.plookup[posid]])
                entry.lstms = [entry.vec, entry.vec]
            if self.blstmFlag:
                lstm_forward = self.builders[0].initial_state()
                lstm_backward = self.builders[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    lstm_forward = lstm_forward.add_input(entry.vec)
                    lstm_backward = lstm_backward.add_input(rentry.vec)
                    entry.lstms[1] = lstm_forward.output()
                    rentry.lstms[0] = lstm_backward.output()
            if self.bibiFlag:
                for entry in conll_sentence:
                    entry.vec = concatenate(entry.lstms)
                blstm_forward = self.bbuilders[0].initial_state()
                blstm_backward = self.bbuilders[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    blstm_forward = blstm_forward.add_input(entry.vec)
                    blstm_backward = blstm_backward.add_input(rentry.vec)
                    entry.lstms[1] = blstm_forward.output()
                    rentry.lstms[0] = blstm_backward.output()
            scores, exprs = self.__evaluate(conll_sentence)
            heads = decoder.parse_proj(scores)

            # Multiple roots: heading to the previous "rooted" one
            rootCount = 0
            rootWid = -1
            for index, head in enumerate(heads):
                if head == 0:
                    rootCount += 1
                    if rootCount == 1:
                        rootWid = index
                    if rootCount > 1:
                        heads[index] = rootWid
                        rootWid = index
            for entry, head, pos in zip(conll_sentence, heads, predicted_postags):
                entry.pred_parent_id = head
                entry.pred_relation = '_'
                entry.pred_pos = pos
            dump = False
            if self.labelsFlag:
                concat_layer = [self.__getRelVector(conll_sentence, head, modifier + 1)
                                for modifier, head in enumerate(heads[1:])]
                outputFFlayer = self.ffRelPredictor.predict_sequence(concat_layer)
                predicted_rel_indices = [np.argmax(o.value()) for o in outputFFlayer]
                predicted_rels = [self.irels[idx] for idx in predicted_rel_indices]
                for modifier, head in enumerate(heads[1:]):
                    conll_sentence[modifier + 1].pred_relation = predicted_rels[modifier]
            renew_cg()
            if not dump:
                yield sentence
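# The multiple-root repair above, factored into a standalone sketch for
# clarity (hypothetical helper; the parser applies the same logic inline):
# every extra root is re-attached to the previous root, so exactly one token
# keeps head 0.
def chain_extra_roots(heads):
    prev_root = -1
    for index, head in enumerate(heads):
        if head == 0:
            if prev_root >= 0:
                heads[index] = prev_root
            prev_root = index
    return heads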
def Train(self, conll_path):
    errors = 0
    batch = 0
    eloss = 0.0
    mloss = 0.0
    eerrors = 0
    etotal = 0
    start = time.time()
    with open(conll_path, 'r') as conllFP:
        shuffledData = list(read_conll(conllFP, self.c2i))
        random.shuffle(shuffledData)
        errs = []
        lerrs = []
        posErrs = []
        eeloss = 0.0
        for iSentence, sentence in enumerate(shuffledData):
            if iSentence % 500 == 0 and iSentence != 0:
                print "Processing sentence number: %d" % iSentence, \
                    ", Loss: %.2f" % (eloss / etotal), \
                    ", Time: %.2f" % (time.time() - start)
                start = time.time()
                eerrors = 0
                eloss = 0.0
                etotal = 0
                lerrors = 0
                ltotal = 0
            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
            for entry in conll_sentence:
                # frequency-based word dropout: keep with probability c / (0.25 + c)
                c = float(self.wordsCount.get(entry.norm, 0))
                dropFlag = (random.random() < (c / (0.25 + c)))
                wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0)) if dropFlag else 0] \
                    if self.wdims > 0 else None
                evec = None
                if self.external_embedding is not None:
                    evec = self.elookup[self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0))
                                        if (dropFlag or (random.random() < 0.5)) else 0]
                # entry.vec = concatenate(filter(None, [wordvec, evec]))
                last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in entry.idChars])[-1]
                rev_last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in reversed(entry.idChars)])[-1]
                entry.vec = concatenate([dynet.noise(fe, 0.2) for fe in
                                         filter(None, [wordvec, evec, last_state, rev_last_state])])
                entry.lstms = [entry.vec, entry.vec]
                entry.headfov = None
                entry.modfov = None
                entry.rheadfov = None
                entry.rmodfov = None
            if self.blstmFlag:
                lstm_forward = self.builders[0].initial_state()
                lstm_backward = self.builders[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    lstm_forward = lstm_forward.add_input(entry.vec)
                    lstm_backward = lstm_backward.add_input(rentry.vec)
                    entry.lstms[1] = lstm_forward.output()
                    rentry.lstms[0] = lstm_backward.output()
            if self.bibiFlag:
                for entry in conll_sentence:
                    entry.vec = concatenate(entry.lstms)
                blstm_forward = self.bbuilders[0].initial_state()
                blstm_backward = self.bbuilders[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    blstm_forward = blstm_forward.add_input(entry.vec)
                    blstm_backward = blstm_backward.add_input(rentry.vec)
                    entry.lstms[1] = blstm_forward.output()
                    rentry.lstms[0] = blstm_backward.output()
            scores, exprs = self.__evaluate(conll_sentence, True)
            gold = [entry.parent_id for entry in conll_sentence]
            heads = decoder.parse_proj(scores, gold if self.costaugFlag else None)
            if self.labelsFlag:
                for modifier, head in enumerate(gold[1:]):
                    rscores, rexprs = self.__evaluateLabel(conll_sentence, head, modifier + 1)
                    goldLabelInd = self.rels[conll_sentence[modifier + 1].relation]
                    wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores)
                                         if l != goldLabelInd), key=itemgetter(1))[0]
                    if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                        lerrs.append(rexprs[wrongLabelInd] - rexprs[goldLabelInd])
            e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
            eerrors += e
            if e > 0:
                loss = [(exprs[h][i] - exprs[g][i])
                        for i, (h, g) in enumerate(zip(heads, gold)) if h != g]  # * (1.0/float(e))
                eloss += (e)
                mloss += (e)
                errs.extend(loss)
            etotal += len(conll_sentence)

            # POS tagging loss
            concat_layer = [concatenate(entry.lstms) for entry in conll_sentence]
            concat_layer = [dynet.noise(fe, 0.2) for fe in concat_layer]
            outputFFlayer = self.ffSeqPredictor.predict_sequence(concat_layer)
            posIDs = [self.pos.get(entry.pos) for entry in conll_sentence]
            for pred, goldid in zip(outputFFlayer, posIDs):
                posErrs.append(self.pick_neg_log(pred, goldid))

            if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0 or len(posErrs) > 0:
                eeloss = 0.0
                if len(errs) > 0 or len(lerrs) > 0 or len(posErrs) > 0:
                    eerrs = (esum(errs + lerrs + posErrs))  # * (1.0/(float(len(errs))))
                    eerrs.scalar_value()
                    eerrs.backward()
                    self.trainer.update()
                    errs = []
                    lerrs = []
                    posErrs = []
                renew_cg()
        if len(errs) > 0:
            eerrs = (esum(errs + lerrs + posErrs))  # * (1.0/(float(len(errs))))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()
            errs = []
            lerrs = []
            posErrs = []
            eeloss = 0.0
            renew_cg()
        self.trainer.update()
    print "Loss: %.2f" % (mloss / iSentence)
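# The frequency-based word dropout used in Train above, isolated as a sketch
# (hypothetical helper): a form with training count c is kept with probability
# c / (0.25 + c), so rare words are often replaced by the UNK vector (index 0).
def keep_word(count, rng=random):
    c = float(count)
    return rng.random() < (c / (0.25 + c))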
def Predict(self, conll_path):
    with open(conll_path, 'r') as conllFP:
        for iSentence, sentence in enumerate(read_conll(conllFP)):
            self.hid2Layer = parameter(self.model["hidden2-layer"])
            self.hid2Bias = parameter(self.model["hidden2-bias"])
            self.hidLayerFOM = parameter(self.model["hidden-layer-fom"])
            self.hidLayerFOH = parameter(self.model["hidden-layer-foh"])
            self.hidBias = parameter(self.model["hidden-bias"])
            self.outLayer = parameter(self.model["output-layer"])
            if self.labelsFlag:
                self.rhid2Layer = parameter(self.model["rhidden2-layer"])
                self.rhid2Bias = parameter(self.model["rhidden2-bias"])
                self.rhidLayerFOM = parameter(self.model["rhidden-layer-fom"])
                self.rhidLayerFOH = parameter(self.model["rhidden-layer-foh"])
                self.rhidBias = parameter(self.model["rhidden-bias"])
                self.routLayer = parameter(self.model["routput-layer"])
                self.routBias = parameter(self.model["routput-bias"])
            for entry in sentence:
                wordvec = lookup(self.model["word-lookup"], int(self.vocab.get(entry.norm, 0))) \
                    if self.wdims > 0 else None
                posvec = lookup(self.model["pos-lookup"], int(self.pos[entry.pos])) \
                    if self.pdims > 0 else None
                evec = lookup(self.model["extrn-lookup"], int(self.vocab.get(entry.norm, 0))) \
                    if self.external_embedding is not None else None
                entry.vec = concatenate(filter(None, [wordvec, posvec, evec]))
                entry.lstms = [entry.vec, entry.vec]
                entry.headfov = None
                entry.modfov = None
                entry.rheadfov = None
                entry.rmodfov = None
            if self.blstmFlag:
                lstm_forward = self.builders[0].initial_state()
                lstm_backward = self.builders[1].initial_state()
                for entry, rentry in zip(sentence, reversed(sentence)):
                    lstm_forward = lstm_forward.add_input(entry.vec)
                    lstm_backward = lstm_backward.add_input(rentry.vec)
                    entry.lstms[1] = lstm_forward.output()
                    rentry.lstms[0] = lstm_backward.output()
            if self.bibiFlag:
                for entry in sentence:
                    entry.vec = concatenate(entry.lstms)
                blstm_forward = self.bbuilders[0].initial_state()
                blstm_backward = self.bbuilders[1].initial_state()
                for entry, rentry in zip(sentence, reversed(sentence)):
                    blstm_forward = blstm_forward.add_input(entry.vec)
                    blstm_backward = blstm_backward.add_input(rentry.vec)
                    entry.lstms[1] = blstm_forward.output()
                    rentry.lstms[0] = blstm_backward.output()
            scores, exprs = self.__evaluate(sentence, True)
            heads = decoder.parse_proj(scores)
            for entry, head in zip(sentence, heads):
                entry.pred_parent_id = head
                entry.pred_relation = '_'
            dump = False
            if self.labelsFlag:
                for modifier, head in enumerate(heads[1:]):
                    scores, exprs = self.__evaluateLabel(sentence, head, modifier + 1)
                    sentence[modifier + 1].pred_relation = \
                        self.irels[max(enumerate(scores), key=itemgetter(1))[0]]
            renew_cg()
            if not dump:
                yield sentence
def Train(self, conll_path, dep_epoch=0, ner_epoch=0):
    eloss = 0.0
    mloss = 0.0
    eerrors = 0
    etotal = 0
    start = time.time()
    with open(conll_path, 'r') as conllFP:
        if ner_epoch == 0:
            read_conll_nerdep = read_conll(conllFP, self.c2i)
        else:
            read_conll_nerdep = read_conll_ner(conllFP, self.c2i)
        shuffledData = list(read_conll_nerdep)
        random.shuffle(shuffledData)
        errs = []
        lerrs = []
        posErrs = 0
        postrErrs = []
        nertr2Errs = []
        ner2Errs = dynet.inputVector([0])
        startind = 0
        e = 0
        for iSentence, sentence in enumerate(shuffledData):
            if iSentence % 500 == 0 and iSentence != 0:
                print "Processing sentence number: %d" % iSentence, \
                    ", Loss: %.4f" % (eloss / etotal), \
                    ", Time: %.2f" % (time.time() - start)
                start = time.time()
                eerrors = 0
                eloss = 0.0
                etotal = 0
            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
            for entry in conll_sentence:
                c = float(self.wordsCount.get(entry.norm, 0))
                dropFlag = (random.random() < (c / (0.25 + c)))
                capvec = self.caps_lookup[entry.capInfo]
                wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0)) if dropFlag else 0] \
                    if self.wdims > 0 else None
                last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in entry.idChars])[-1]
                rev_last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in reversed(entry.idChars)])[-1]
                entry.vec = dynet.dropout(
                    concatenate(filter(None, [wordvec, last_state, rev_last_state, capvec])), 0.33)
                entry.vec2 = entry.vec
                entry.pos_lstms = [entry.vec, entry.vec]
                entry.headfov = None
                entry.modfov = None
                entry.rheadfov = None
                entry.rmodfov = None
            if not self.depFlag:
                # NER tagging loss
                lstm_forward = self.pos_builders[0].initial_state()
                lstm_backward = self.pos_builders[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    lstm_forward = lstm_forward.add_input(entry.vec)
                    lstm_backward = lstm_backward.add_input(rentry.vec)
                    entry.pos_lstms[1] = lstm_forward.output()
                    rentry.pos_lstms[0] = lstm_backward.output()
                for entry in conll_sentence:
                    entry.pos_vec = concatenate(entry.pos_lstms)
                blstm_forward = self.pos_bbuilders[0].initial_state()
                blstm_backward = self.pos_bbuilders[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    blstm_forward = blstm_forward.add_input(entry.pos_vec)
                    blstm_backward = blstm_backward.add_input(rentry.pos_vec)
                    entry.pos_lstms[1] = blstm_forward.output()
                    rentry.pos_lstms[0] = blstm_backward.output()
                concat_layer = [dynet.dropout(concatenate(entry.pos_lstms), 0.33)
                                for entry in conll_sentence]
                cap_info_sentence = [self.caps_lookup[entry.capInfo] for entry in conll_sentence]
                outputFFlayer = self.ffSeqPredictor.predict_sequence(concat_layer)
                posIDs = [self.pos.get(entry.pos) for entry in conll_sentence]
                # CRF-style loss: partition score minus gold-path score
                posErrs = (self.forward_score(outputFFlayer) -
                           self.pick_gold_score(outputFFlayer, posIDs))
            # dependency flag
            if self.depFlag:
                # Add predicted ner tags
                # for entry, poses in zip(conll_sentence, outputFFlayer):
                #     entry.vec = concatenate([entry.vec, dynet.dropout(self.plookup[np.argmax(poses.value())], 0.33)])
                for entry in conll_sentence:
                    entry.lstms = [entry.vec, entry.vec]
                # Parsing losses
                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()
                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)
                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()
                if self.bibiFlag:
                    for entry in conll_sentence:
                        entry.vec = concatenate(entry.lstms)
                    blstm_forward = self.bbuilders[0].initial_state()
                    blstm_backward = self.bbuilders[1].initial_state()
                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        blstm_forward = blstm_forward.add_input(entry.vec)
                        blstm_backward = blstm_backward.add_input(rentry.vec)
                        entry.lstms[1] = blstm_forward.output()
                        rentry.lstms[0] = blstm_backward.output()
                scores, exprs = self.__evaluate(conll_sentence)
                gold = [entry.parent_id for entry in conll_sentence]
                heads = decoder.parse_proj(scores, gold if self.costaugFlag else None)
                if self.labelsFlag:
                    concat_layer = [dynet.dropout(self.__getRelVector(conll_sentence, head, modifier + 1), 0.33)
                                    for modifier, head in enumerate(gold[1:])]
                    outputFFlayer = self.ffRelPredictor.predict_sequence(concat_layer)
                    if dep_epoch == 1:
                        relIDs = [self.rels[conll_sentence[modifier + 1].relation]
                                  for modifier, _ in enumerate(gold[1:])]
                        for pred, goldid in zip(outputFFlayer, relIDs):
                            lerrs.append(self.pick_neg_log(pred, goldid))
                if dep_epoch == 1:
                    e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
                if self.sNerFlag and ner_epoch == 1:
                    conll_sentence[0].vec = concatenate([conll_sentence[0].vec2,
                                                         self.rellookup[self.rels["rroot"]]])
                    for entry, pred in zip(conll_sentence[1:], outputFFlayer):
                        relvec = self.rellookup[np.argmax(pred.value())]
                        entry.vec = concatenate([entry.vec2, dynet.dropout(relvec, 0.33)])
                    for entry in conll_sentence:
                        entry.ner2_lstms = [entry.vec, entry.vec]
                    slstm_forward = self.sner_builders[0].initial_state()
                    slstm_backward = self.sner_builders[1].initial_state()
                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        lstm_forward = slstm_forward.add_input(entry.vec)
                        lstm_backward = slstm_backward.add_input(rentry.vec)
                        entry.ner2_lstms[1] = lstm_forward.output()
                        rentry.ner2_lstms[0] = lstm_backward.output()
                    for entry in conll_sentence:
                        entry.ner2_vec = concatenate(entry.ner2_lstms)
                    sblstm_forward = self.sner_bbuilders[0].initial_state()
                    sblstm_backward = self.sner_bbuilders[1].initial_state()
                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        blstm_forward = sblstm_forward.add_input(entry.ner2_vec)
                        blstm_backward = sblstm_backward.add_input(rentry.ner2_vec)
                        entry.ner2_lstms[1] = blstm_forward.output()
                        rentry.ner2_lstms[0] = blstm_backward.output()
                    concat_layer = [dynet.dropout(concatenate(entry.ner2_lstms), 0.33)
                                    for entry in conll_sentence]
                    outputFFlayer = self.ffSeqPredictor.predict_sequence(concat_layer)
                    posIDs = [self.pos.get(entry.pos) for entry in conll_sentence]
                    gold_score = self.pick_gold_score(outputFFlayer, posIDs)
                    ner2Errs = (self.forward_score(outputFFlayer) - gold_score)
                if iSentence < 5:
                    print("ner and dep loss")
                    if ner2Errs != 0:
                        print(ner2Errs.value())
                    else:
                        print(0)
                    if dep_epoch != 0:
                        print(esum(lerrs).value())
                    else:
                        print(0)
                eerrors += e
                if e > 0:
                    loss = [(exprs[h][i] - exprs[g][i])
                            for i, (h, g) in enumerate(zip(heads, gold)) if h != g]  # * (1.0/float(e))
                    eloss += (e)
                    mloss += (e)
                    errs.extend(loss)
            etotal += len(conll_sentence)
            if iSentence % 1 == 0:
                if len(errs) > 0 or len(lerrs) > 0 or posErrs > 0 or len(postrErrs) > 0 \
                        or ner2Errs > 0 or len(nertr2Errs) > 0:
                    eerrs = 0
                    if len(errs + lerrs + postrErrs + nertr2Errs) > 0:
                        eerrs = esum(errs + lerrs + postrErrs + nertr2Errs)
                    eerrs += (posErrs + ner2Errs)
                    # print(eerrs.value())
                    eerrs.scalar_value()
                    eerrs.backward()
                    self.trainer.update()
                errs = []
                e = 0
                lerrs = []
                postrErrs = []
                nertr2Errs = []
                posErrs = 0
                ner2Errs = 0
                renew_cg()
    print "Loss: %.4f" % (mloss / iSentence)
def Predict(self, conll_path, dep_epoch=1, ner_epoch=1):
    with open(conll_path, 'r') as conllFP:
        if ner_epoch == 0:
            read_conll_nerdep = read_conll_predict(conllFP, self.c2i, self.wordsCount)
        else:
            read_conll_nerdep = read_conll_predict_ner(conllFP, self.c2i, self.wordsCount)
        for iSentence, sentence in enumerate(read_conll_nerdep):
            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
            for entry in conll_sentence:
                capvec = self.caps_lookup[entry.capInfo]
                wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None
                last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in entry.idChars])[-1]
                rev_last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in reversed(entry.idChars)])[-1]
                entry.vec = concatenate(filter(None, [wordvec, last_state, rev_last_state, capvec]))
                entry.vec2 = concatenate(filter(None, [wordvec, last_state, rev_last_state, capvec]))
                entry.pos_lstms = [entry.vec, entry.vec]
                entry.headfov = None
                entry.modfov = None
                entry.rheadfov = None
                entry.rmodfov = None
            if not self.depFlag:
                # Predicted pos tags
                lstm_forward = self.pos_builders[0].initial_state()
                lstm_backward = self.pos_builders[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    lstm_forward = lstm_forward.add_input(entry.vec)
                    lstm_backward = lstm_backward.add_input(rentry.vec)
                    entry.pos_lstms[1] = lstm_forward.output()
                    rentry.pos_lstms[0] = lstm_backward.output()
                for entry in conll_sentence:
                    entry.pos_vec = concatenate(entry.pos_lstms)
                blstm_forward = self.pos_bbuilders[0].initial_state()
                blstm_backward = self.pos_bbuilders[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    blstm_forward = blstm_forward.add_input(entry.pos_vec)
                    blstm_backward = blstm_backward.add_input(rentry.pos_vec)
                    entry.pos_lstms[1] = blstm_forward.output()
                    rentry.pos_lstms[0] = blstm_backward.output()
                concat_layer = [concatenate(entry.pos_lstms) for entry in conll_sentence]
                # cap_info_sentence = [self.caplookup[entry.capInfo] for entry in conll_sentence]
                outputFFlayer = self.ffSeqPredictor.predict_sequence(concat_layer)
                best_parentids, bestscores = self.ffSeqPredictor.viterbi_sequence(
                    outputFFlayer, self.nertrans_lookup)
                predicted_pos_indices = [np.argmax(o.value()) for o in outputFFlayer]
                root_predicted_postags = ["O"]
                predicted_postags = [self.id2pos[idx] for idx in best_parentids]
                for pos in predicted_postags:
                    root_predicted_postags.append(pos)
                if iSentence < 5:
                    for word, tag in zip(conll_sentence, root_predicted_postags):
                        print("word : {} gold : {} pred : {}".format(word.form, word.pos, tag))
                for entry, pos in zip(conll_sentence, root_predicted_postags):
                    entry.pred_pos = pos
                dump = False
            if self.depFlag:
                # Add predicted pos tags for parsing prediction
                # for entry, posid in zip(conll_sentence, viterbi_pred_tagids):
                #     entry.vec = concatenate([entry.vec, self.plookup[posid]])
                #     entry.lstms = [entry.vec, entry.vec]
                for entry in conll_sentence:
                    entry.lstms = [entry.vec, entry.vec]
                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()
                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)
                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()
                if self.bibiFlag:
                    for entry in conll_sentence:
                        entry.vec = concatenate(entry.lstms)
                    blstm_forward = self.bbuilders[0].initial_state()
                    blstm_backward = self.bbuilders[1].initial_state()
                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        blstm_forward = blstm_forward.add_input(entry.vec)
                        blstm_backward = blstm_backward.add_input(rentry.vec)
                        entry.lstms[1] = blstm_forward.output()
                        rentry.lstms[0] = blstm_backward.output()
                scores, exprs = self.__evaluate(conll_sentence)
                heads = decoder.parse_proj(scores)
                # Multiple roots: heading to the previous "rooted" one
                rootCount = 0
                rootWid = -1
                for index, head in enumerate(heads):
                    if head == 0:
                        rootCount += 1
                        if rootCount == 1:
                            rootWid = index
                        if rootCount > 1:
                            heads[index] = rootWid
                            rootWid = index
                for entry, head in zip(conll_sentence, heads):
                    entry.pred_parent_id = head
                    entry.pred_relation = '_'
                    # entry.pred_pos = pos
                if self.labelsFlag:
                    concat_layer = [self.__getRelVector(conll_sentence, head, modifier + 1)
                                    for modifier, head in enumerate(heads[1:])]
                    outputFFlayer = self.ffRelPredictor.predict_sequence(concat_layer)
                    predicted_rel_indices = [np.argmax(o.value()) for o in outputFFlayer]
                    predicted_rels = [self.irels[idx] for idx in predicted_rel_indices]
                    for modifier, head in enumerate(heads[1:]):
                        conll_sentence[modifier + 1].pred_relation = predicted_rels[modifier]
                if self.sNerFlag and ner_epoch == 1:
                    conll_sentence[0].vec = concatenate([conll_sentence[0].vec2,
                                                         self.rellookup[self.rels["rroot"]]])
                    for entry, pred in zip(conll_sentence[1:], predicted_rel_indices):
                        relvec = self.rellookup[pred]
                        # for entry, posid in zip(conll_sentence, viterbi_pred_tagids):
                        entry.vec = concatenate([entry.vec2, relvec])
                    for entry in conll_sentence:
                        entry.ner2_lstms = [entry.vec, entry.vec]
                    slstm_forward = self.sner_builders[0].initial_state()
                    slstm_backward = self.sner_builders[1].initial_state()
                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        lstm_forward = slstm_forward.add_input(entry.vec)
                        lstm_backward = slstm_backward.add_input(rentry.vec)
                        entry.ner2_lstms[1] = lstm_forward.output()
                        rentry.ner2_lstms[0] = lstm_backward.output()
                    for entry in conll_sentence:
                        entry.ner2_vec = concatenate(entry.ner2_lstms)
                    sblstm_forward = self.sner_bbuilders[0].initial_state()
                    sblstm_backward = self.sner_bbuilders[1].initial_state()
                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        blstm_forward = sblstm_forward.add_input(entry.ner2_vec)
                        blstm_backward = sblstm_backward.add_input(rentry.ner2_vec)
                        entry.ner2_lstms[1] = blstm_forward.output()
                        rentry.ner2_lstms[0] = blstm_backward.output()
                    concat_layer = [dynet.dropout(concatenate(entry.ner2_lstms), 0.33)
                                    for entry in conll_sentence]
                    outputFFlayer = self.ffSeqPredictor.predict_sequence(concat_layer)
                    best_parentids, bestscores = self.ffSeqPredictor.viterbi_sequence(
                        outputFFlayer, self.nertrans_lookup)
                    predicted_pos_indices = [np.argmax(o.value()) for o in outputFFlayer]
                    root_predicted_postags = ["O"]
                    predicted_postags = [self.id2pos[idx] for idx in best_parentids]
                    for pos in predicted_postags:
                        root_predicted_postags.append(pos)
                    if iSentence < 1:
                        for word, tag in zip(conll_sentence, root_predicted_postags):
                            print("word : {} gold : {} pred : {}".format(word.form, word.pos, tag))
                    for entry, pos in zip(conll_sentence, root_predicted_postags):
                        entry.pred_pos = pos
                dump = False
            renew_cg()
            if not dump:
                yield sentence
def predict(self, sentence):
    for entry in sentence:
        wordvec = self.wlookup(scalar(int(self.vocab.get(entry.norm, 0)))) if self.wdims > 0 else None
        # if entry.pos == '$':
        #     pdb.set_trace()
        posvec = self.plookup(scalar(int(self.pos[entry.pos]))) if self.pdims > 0 else None
        evec = None
        if self.ExtnrEmbPath is not None:
            evec = self.elookup(scalar(int(self.vocab.get(entry.form, self.vocab.get(entry.norm, 0)))))
        # combine the three embeddings
        entry.wordvec = wordvec
        if self.ExtnrEmbPath is not None:
            entry.ewordvec = evec
        if self.pdims > 0:
            entry.posvec = posvec

    # run the LSTMs
    word_lstm_input = torch.stack([entry.wordvec for entry in sentence])  # len * dim
    if self.pdims > 0:
        pos_lstm_input = torch.stack([entry.posvec for entry in sentence])  # len * dim
    if self.ExtnrEmbPath is not None:
        eword_lstm_input = torch.stack([entry.ewordvec for entry in sentence])
    word_lstm_out, _ = self.word_lstm.forward(word_lstm_input)
    if self.pdims > 0:
        pos_lstm_out, _ = self.pos_lstm.forward(pos_lstm_input)
    if self.ExtnrEmbPath is not None:
        eword_lstm_out, _ = self.eword_lstm.forward(eword_lstm_input)
    kl_temp = 1.0
    lp_eword_mean, lp_eword_logvar, lp_word_mean, lp_word_logvar, lp_pos_mean, lp_pos_logvar = \
        None, None, None, None, None, None

    # pass through the stochastic layer
    word_z = self.word_vgl.predict(word_lstm_out)
    if self.pdims > 0:
        pos_z = self.pos_vgl.predict(pos_lstm_out)
    if self.ExtnrEmbPath is not None:
        eword_z = self.eword_vgl.predict(eword_lstm_out)
    if self.ExtnrEmbPath is not None:
        if self.pdims > 0:
            scoring_input = cat([word_z, eword_z, pos_z])
        else:
            scoring_input = cat([word_z, eword_z])
    else:
        if self.pdims > 0:
            scoring_input = cat([word_z, pos_z])
        else:
            scoring_input = word_z
    scoring_input = scoring_input.squeeze(1)
    scores, exprs = self.cal_scores(scoring_input)
    heads = decoder.parse_proj(scores)
    for entry, head in zip(sentence, heads):
        entry.pred_parent_id = head
        entry.pred_relation = '_'
def Predict(self, conll_path, BATCH_SIZE=1):
    with open(conll_path, 'r') as conllFP:
        for iSentence, sentence_batch in enumerate(stream_to_batch(read_conll(conllFP), BATCH_SIZE)):
            batch_exprs = []
            sents = []
            labels = []
            for sentence in sentence_batch:
                conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
                for entry in conll_sentence:
                    wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None
                    posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
                    evec = self.elookup[int(self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0)))] \
                        if self.external_embedding is not None else None
                    entry.vec = concatenate(filter(None, [wordvec, posvec, evec]))
                    entry.lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None
                    entry.rheadfov = None
                    entry.rmodfov = None
                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()
                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)
                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()
                if self.bibiFlag:
                    for entry in conll_sentence:
                        entry.vec = concatenate(entry.lstms)
                    blstm_forward = self.bbuilders[0].initial_state()
                    blstm_backward = self.bbuilders[1].initial_state()
                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        blstm_forward = blstm_forward.add_input(entry.vec)
                        blstm_backward = blstm_backward.add_input(rentry.vec)
                        entry.lstms[1] = blstm_forward.output()
                        rentry.lstms[0] = blstm_backward.output()
                batch_exprs.append(self.__evaluate(conll_sentence, True))
                sents.append(conll_sentence)
            _s = time.time()
            forward(batch_exprs[-1][-1])
            print "fw1:", time.time() - _s
            batch_heads = []
            _s = time.time()
            for _i, (exprs, conll_sentence) in enumerate(zip(batch_exprs, sents)):
                scores = np.array([[output.scalar_value() for output in exprsRow]
                                   for exprsRow in exprs])
                heads = decoder.parse_proj(scores)
                for entry, head in zip(conll_sentence, heads):
                    entry.pred_parent_id = head
                    entry.pred_relation = '_'
                batch_heads.append(heads)
            dump = False
            print "decode:", time.time() - _s
            if self.labelsFlag:
                # TODO this is currently not batched..
                labels = []
                _exps = []
                for (heads, conll_sentence) in zip(batch_heads, sents):
                    labels_exprs = []
                    for modifier, head in enumerate(heads[1:]):
                        exprs = self.__evaluateLabel(conll_sentence, head, modifier + 1)
                        _exps.append(exprs)
                        labels_exprs.append((head, modifier, exprs))
                    labels.append(labels_exprs)
                _s = time.time()
                forward(_exps)
                print "fw-L:", time.time() - _s
                for lbls, conll_sentence in zip(labels, sents):
                    for (head, modifier, exprs) in lbls:
                        scores = exprs.value()
                        conll_sentence[modifier + 1].pred_relation = \
                            self.irels[max(enumerate(scores), key=itemgetter(1))[0]]
            renew_cg()
            if not dump:
                for sentence in sentence_batch:
                    yield sentence
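# stream_to_batch is not shown in this file; a minimal sketch of such a
# helper, assuming it groups an iterator of sentences into fixed-size lists:
def stream_to_batch(stream, batch_size):
    batch = []
    for sentence in stream:
        batch.append(sentence)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:  # flush the final, possibly smaller batch
        yield batch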
def Predict(self, treebanks, datasplit, options):
    char_map = {}
    if options.char_map_file:
        char_map_fh = codecs.open(options.char_map_file, encoding='utf-8')
        char_map = json.loads(char_map_fh.read())
    # should probably use a namedtuple in get_vocab to make this prettier
    _, test_words, test_chars, _, _, _, test_treebanks, test_langs = \
        utils.get_vocab(treebanks, datasplit, char_map)

    # get external embeddings for the set of words and chars in the
    # test vocab but not in the training vocab
    test_embeddings = defaultdict(lambda: {})
    if options.word_emb_size > 0 and options.ext_word_emb_file:
        new_test_words = set(test_words) - self.feature_extractor.words.viewkeys()
        print "Number of OOV word types at test time: %i (out of %i)" % (
            len(new_test_words), len(test_words))
        if len(new_test_words) > 0:
            # no point loading embeddings if there are no words to look for
            for lang in test_langs:
                embeddings = utils.get_external_embeddings(
                    options, emb_file=options.ext_word_emb_file, lang=lang, words=new_test_words)
                test_embeddings["words"].update(embeddings)
            if len(test_langs) > 1 and test_embeddings["words"]:
                print "External embeddings found for %i words (out of %i)" % (
                    len(test_embeddings["words"]), len(new_test_words))
    if options.char_emb_size > 0:
        new_test_chars = set(test_chars) - self.feature_extractor.chars.viewkeys()
        print "Number of OOV char types at test time: %i (out of %i)" % (
            len(new_test_chars), len(test_chars))
        if len(new_test_chars) > 0:
            for lang in test_langs:
                embeddings = utils.get_external_embeddings(
                    options, emb_file=options.ext_char_emb_file, lang=lang,
                    words=new_test_chars, chars=True)
                test_embeddings["chars"].update(embeddings)
            if len(test_langs) > 1 and test_embeddings["chars"]:
                print "External embeddings found for %i chars (out of %i)" % (
                    len(test_embeddings["chars"]), len(new_test_chars))

    data = utils.read_conll_dir(treebanks, datasplit, char_map=char_map)
    for iSentence, osentence in enumerate(data, 1):
        sentence = deepcopy(osentence)
        self.feature_extractor.Init(options)
        conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
        self.feature_extractor.getWordEmbeddings(conll_sentence, False, options, test_embeddings)
        scores, exprs = self.__evaluate(conll_sentence, True)
        if self.proj:
            heads = decoder.parse_proj(scores)
            # LATTICE solution to multiple roots
            # see https://github.com/jujbob/multilingual-bist-parser/blob/master/bist-parser/bmstparser/src/mstlstm.py
            rootHead = [head for head in heads if head == 0]
            if len(rootHead) != 1:
                print "sentence has multiple roots; re-attaching extra roots to the first root"
                rootHead = [seq for seq, head in enumerate(heads) if head == 0]
                for seq in rootHead[1:]:
                    heads[seq] = rootHead[0]
        else:
            heads = chuliu_edmonds_one_root(scores.T)
        for entry, head in zip(conll_sentence, heads):
            entry.pred_parent_id = head
            entry.pred_relation = '_'
        if self.labelsFlag:
            for modifier, head in enumerate(heads[1:]):
                scores, exprs = self.__evaluateLabel(conll_sentence, head, modifier + 1)
                conll_sentence[modifier + 1].pred_relation = \
                    self.feature_extractor.irels[max(enumerate(scores), key=itemgetter(1))[0]]
        dy.renew_cg()
        # keep in memory the information we need, not all the vectors
        oconll_sentence = [entry for entry in osentence if isinstance(entry, utils.ConllEntry)]
        for tok_o, tok in zip(oconll_sentence, conll_sentence):
            tok_o.pred_relation = tok.pred_relation
            tok_o.pred_parent_id = tok.pred_parent_id
        yield osentence
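# The multi-root repair in Predict above, as a standalone sketch (hypothetical
# helper): unlike the chained variant used elsewhere in this file, every extra
# root is re-attached to the *first* root.
def attach_to_first_root(heads):
    roots = [seq for seq, head in enumerate(heads) if head == 0]
    for seq in roots[1:]:
        heads[seq] = roots[0]
    return heads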
def Train(self, conll_path):
    errors = 0
    batch = 0
    eloss = 0.0
    mloss = 0.0
    eerrors = 0
    etotal = 0
    start = time.time()

    shuffledData = []
    if self.train_multilingual:
        file_list = utils.readFileList(conll_path)
        for fileAndLang in file_list:
            print "Reading Training file:", fileAndLang[0], " language: ", fileAndLang[1]
            with open(fileAndLang[0], "r") as conllFP:
                shuffledData = shuffledData + list(read_conll(conllFP, fileAndLang[1]))
    else:
        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP, self.conll_train_language))
    random.shuffle(shuffledData)

    errs = []
    lerrs = []
    eeloss = 0.0
    undefined_term = 0
    for iSentence, sentence in enumerate(shuffledData):
        if iSentence % 100 == 0 and iSentence != 0:
            print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, \
                'Errors:', (float(eerrors)) / etotal, 'Time', time.time() - start
            start = time.time()
            eerrors = 0
            eloss = 0.0
            etotal = 0
            lerrors = 0
            ltotal = 0
        conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
        for entry in conll_sentence:
            c = float(self.wordsCount.get(entry.norm, 0))
            dropFlag = (random.random() < (c / (0.25 + c)))
            posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
            xposvec = self.xplookup[int(self.xpos[entry.xpos])] if self.xpdims > 0 else None
            evec = None
            ecevc = None
            lang_code = entry.language.split('_')[0] + ':' if self.multilingual_emb else ""
            # Add word and external embedding
            if self.external_embedding is not None:
                if self.extConcateFlag:
                    wordvec = self.wlookup[int(self.vocab.get(entry.norm.lower(), 0))
                                           if dropFlag else 0] if self.wdims > 0 else None
                    evec = self.elookup[self.extrnd.get(lang_code + entry.form.lower(),
                                                        self.extrnd.get(lang_code + entry.norm.lower(), 0))
                                        if (dropFlag or (random.random() < 0.5)) else 0]
                else:
                    wordvec = self.elookup[self.extrnd.get(lang_code + entry.form.lower(),
                                                           self.extrnd.get(lang_code + entry.norm.lower(), 0))
                                           if (dropFlag or (random.random() < 0.5)) else 0]
                    if self.extrnd.get(lang_code + entry.form.lower(),
                                       self.extrnd.get(lang_code + entry.norm.lower(), 0)) == 0:
                        undefined_term = undefined_term + 1
            else:
                wordvec = self.wlookup[int(self.vocab.get(entry.norm.lower(), 0))
                                       if dropFlag else 0] if self.wdims > 0 else None
            # Add external cluster embedding
            if self.external_cluster_embedding is not None:
                ecevc = self.eclookup[self.exctrnd.get(lang_code + entry.form.lower(),
                                                       self.exctrnd.get(lang_code + entry.norm.lower(), 0))
                                      if (dropFlag or (random.random() < 0.5)) else 0]
            # Add language embedding
            langvec = self.llookup[self.languageVec_dic[entry.language].lang_num] \
                if self.add_lang_vec else None
            # print langvec.value()
            entry.vec = concatenate(filter(None, [wordvec, posvec, xposvec, evec, ecevc, langvec]))
            entry.lstms = [entry.vec, entry.vec]
            entry.headfov = None
            entry.modfov = None
            entry.rheadfov = None
            entry.rmodfov = None
        if self.blstmFlag:
            lstm_forward = self.builders[0].initial_state()
            lstm_backward = self.builders[1].initial_state()
            for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                lstm_forward = lstm_forward.add_input(entry.vec)
                lstm_backward = lstm_backward.add_input(rentry.vec)
                entry.lstms[1] = lstm_forward.output()
                rentry.lstms[0] = lstm_backward.output()
        if self.bibiFlag:
            for entry in conll_sentence:
                entry.vec = concatenate(entry.lstms)
            blstm_forward = self.bbuilders[0].initial_state()
            blstm_backward = self.bbuilders[1].initial_state()
            for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                blstm_forward = blstm_forward.add_input(entry.vec)
                blstm_backward = blstm_backward.add_input(rentry.vec)
                entry.lstms[1] = blstm_forward.output()
                rentry.lstms[0] = blstm_backward.output()
        scores, exprs = self.__evaluate(conll_sentence, True)
        gold = [entry.parent_id for entry in conll_sentence]
        heads = decoder.parse_proj(scores, gold if self.costaugFlag else None)
        if self.labelsFlag:
            for modifier, head in enumerate(gold[1:]):
                rscores, rexprs = self.__evaluateLabel(conll_sentence, head, modifier + 1)
                goldLabelInd = self.rels[conll_sentence[modifier + 1].relation]
                wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores)
                                     if l != goldLabelInd), key=itemgetter(1))[0]
                if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                    lerrs.append(rexprs[wrongLabelInd] - rexprs[goldLabelInd])
        e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
        eerrors += e
        if e > 0:
            loss = [(exprs[h][i] - exprs[g][i])
                    for i, (h, g) in enumerate(zip(heads, gold)) if h != g]  # * (1.0/float(e))
            eloss += (e)
            mloss += (e)
            errs.extend(loss)
        etotal += len(conll_sentence)
        if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0:
            eeloss = 0.0
            if len(errs) > 0 or len(lerrs) > 0:
                eerrs = (esum(errs + lerrs))  # * (1.0/(float(len(errs))))
                eerrs.scalar_value()
                eerrs.backward()
                self.trainer.update()
                errs = []
                lerrs = []
            renew_cg()
    if len(errs) > 0:
        eerrs = (esum(errs + lerrs))  # * (1.0/(float(len(errs))))
        eerrs.scalar_value()
        eerrs.backward()
        self.trainer.update()
        errs = []
        lerrs = []
        eeloss = 0.0
        renew_cg()
    print " # of undefined_term = ", undefined_term
    self.trainer.update_epoch()
    print "Loss: ", mloss / iSentence
def Train(self, conll_path):
    errors = 0
    batch = 0
    eloss = 0.0
    mloss = 0.0
    eerrors = 0
    etotal = 0
    start = time.time()
    with open(conll_path, 'r') as conllFP:
        shuffledData = list(read_conll(conllFP))
        random.shuffle(shuffledData)
        errs = []
        lerrs = []
        eeloss = 0.0
        for iSentence, sentence in enumerate(shuffledData):
            if iSentence % 100 == 0 and iSentence != 0:
                print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, \
                    'Errors:', (float(eerrors)) / etotal, 'Time', time.time() - start
                start = time.time()
                eerrors = 0
                eloss = 0.0
                etotal = 0
                lerrors = 0
                ltotal = 0
            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
            for entry in conll_sentence:
                c = float(self.wordsCount.get(entry.norm, 0))
                dropFlag = (random.random() < (c / (0.25 + c)))
                wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0)) if dropFlag else 0] \
                    if self.wdims > 0 else None
                posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
                evec = None
                if self.external_embedding is not None:
                    evec = self.elookup[self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0))
                                        if (dropFlag or (random.random() < 0.5)) else 0]
                entry.vec = concatenate(filter(None, [wordvec, posvec, evec]))
                entry.lstms = [entry.vec, entry.vec]
                entry.headfov = None
                entry.modfov = None
                entry.rheadfov = None
                entry.rmodfov = None
            if self.blstmFlag:
                lstm_forward = self.builders[0].initial_state()
                lstm_backward = self.builders[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    lstm_forward = lstm_forward.add_input(entry.vec)
                    lstm_backward = lstm_backward.add_input(rentry.vec)
                    entry.lstms[1] = lstm_forward.output()
                    rentry.lstms[0] = lstm_backward.output()
            if self.bibiFlag:
                for entry in conll_sentence:
                    entry.vec = concatenate(entry.lstms)
                blstm_forward = self.bbuilders[0].initial_state()
                blstm_backward = self.bbuilders[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    blstm_forward = blstm_forward.add_input(entry.vec)
                    blstm_backward = blstm_backward.add_input(rentry.vec)
                    entry.lstms[1] = blstm_forward.output()
                    rentry.lstms[0] = blstm_backward.output()
            scores, exprs = self.__evaluate(conll_sentence, True)
            gold = [entry.parent_id for entry in conll_sentence]
            heads = decoder.parse_proj(scores, gold if self.costaugFlag else None)
            if self.labelsFlag:
                for modifier, head in enumerate(gold[1:]):
                    rscores, rexprs = self.__evaluateLabel(conll_sentence, head, modifier + 1)
                    goldLabelInd = self.rels[conll_sentence[modifier + 1].relation]
                    wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores)
                                         if l != goldLabelInd), key=itemgetter(1))[0]
                    if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                        lerrs.append(rexprs[wrongLabelInd] - rexprs[goldLabelInd])
            e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
            eerrors += e
            if e > 0:
                loss = [(exprs[h][i] - exprs[g][i])
                        for i, (h, g) in enumerate(zip(heads, gold)) if h != g]  # * (1.0/float(e))
                eloss += (e)
                mloss += (e)
                errs.extend(loss)
            etotal += len(conll_sentence)
            if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0:
                eeloss = 0.0
                if len(errs) > 0 or len(lerrs) > 0:
                    eerrs = (esum(errs + lerrs))  # * (1.0/(float(len(errs))))
                    eerrs.scalar_value()
                    eerrs.backward()
                    self.trainer.update()
                    errs = []
                    lerrs = []
                renew_cg()
        if len(errs) > 0:
            eerrs = (esum(errs + lerrs))  # * (1.0/(float(len(errs))))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()
            errs = []
            lerrs = []
            eeloss = 0.0
            renew_cg()
        self.trainer.update_epoch()
        print "Loss: ", mloss / iSentence
def predict(self, sentence):
    for entry in sentence:
        wordvec = self.wlookup(scalar(int(self.vocab.get(entry.norm, 0)))) if self.wdims > 0 else None
        posvec = self.plookup(scalar(int(self.pos[entry.pos]))) if self.pdims > 0 else None
        # ontovec = self.olookup(scalar(int(self.onto[entry.onto]))) if self.odims > 0 else None
        # cposvec = self.clookup(scalar(int(self.cpos[entry.cpos]))) if self.cdims > 0 else None
        # evec = self.elookup(scalar(int(self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0))))) \
        #     if self.external_embedding is not None else None
        # entry.vec = cat([wordvec, posvec, ontovec, cposvec, evec])
        gaze_feats = Variable(torch.unsqueeze(torch.Tensor(entry.gaze_feats), 0))
        entry.vec = cat([wordvec, posvec, gaze_feats])
        # entry.vec = posvec
        entry.lstms = [entry.vec, entry.vec]
        entry.headfov = None
        entry.modfov = None
        entry.rheadfov = None
        entry.rmodfov = None
    num_vec = len(sentence)
    vec_for = torch.cat([entry.vec for entry in sentence]).view(num_vec, 1, -1)
    vec_back = torch.cat([entry.vec for entry in reversed(sentence)]).view(num_vec, 1, -1)
    res_for_1, self.hid_for_1 = self.lstm_for_1(vec_for, self.hid_for_1)
    res_back_1, self.hid_back_1 = self.lstm_back_1(vec_back, self.hid_back_1)
    vec_cat = [cat([res_for_1[i], res_back_1[num_vec - i - 1]]) for i in range(num_vec)]
    vec_for_2 = torch.cat(vec_cat).view(num_vec, 1, -1)
    vec_back_2 = torch.cat(list(reversed(vec_cat))).view(num_vec, 1, -1)
    res_for_2, self.hid_for_2 = self.lstm_for_2(vec_for_2, self.hid_for_2)
    res_back_2, self.hid_back_2 = self.lstm_back_2(vec_back_2, self.hid_back_2)
    for i in range(num_vec):
        sentence[i].lstms[0] = res_for_2[i]
        sentence[i].lstms[1] = res_back_2[num_vec - i - 1]
    scores, exprs = self.__evaluate(sentence, True)
    heads = decoder.parse_proj(scores)
    for entry, head in zip(sentence, heads):
        entry.pred_parent_id = head
        entry.pred_relation = '_'
    head_list = list(heads)
    for modifier, head in enumerate(head_list[1:]):
        scores, exprs = self.__evaluateLabel(sentence, head, modifier + 1)
        sentence[modifier + 1].pred_relation = \
            self.rel_list[max(enumerate(scores), key=itemgetter(1))[0]]
def forward(self, sentence, errs, lerrs):
    for entry in sentence:
        c = float(self.wordsCount.get(entry.norm, 0))
        dropFlag = (random.random() < (c / (0.25 + c)))
        wordvec = self.wlookup(scalar(int(self.vocab.get(entry.norm, 0)) if dropFlag else 0)) \
            if self.wdims > 0 else None
        posvec = self.plookup(scalar(int(self.pos[entry.pos]))) if self.pdims > 0 else None
        evec = None
        if self.external_embedding is not None:
            evec = self.elookup(scalar(self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0))
                                       if (dropFlag or (random.random() < 0.5)) else 0))
        entry.vec = cat([wordvec, posvec, evec])
        entry.lstms = [entry.vec, entry.vec]
        entry.headfov = None
        entry.modfov = None
        entry.rheadfov = None
        entry.rmodfov = None
    if self.blstmFlag:
        lstm_forward = RNNState(self.builders[0])
        lstm_backward = RNNState(self.builders[1])
        for entry, rentry in zip(sentence, reversed(sentence)):
            lstm_forward = lstm_forward.next(entry.vec)
            lstm_backward = lstm_backward.next(rentry.vec)
            entry.lstms[1] = lstm_forward()
            rentry.lstms[0] = lstm_backward()
    if self.bibiFlag:
        for entry in sentence:
            entry.vec = cat(entry.lstms)
        blstm_forward = RNNState(self.bbuilders[0])
        blstm_backward = RNNState(self.bbuilders[1])
        for entry, rentry in zip(sentence, reversed(sentence)):
            blstm_forward = blstm_forward.next(entry.vec)
            blstm_backward = blstm_backward.next(rentry.vec)
            entry.lstms[1] = blstm_forward()
            rentry.lstms[0] = blstm_backward()
    scores, exprs = self.__evaluate(sentence, True)
    gold = [entry.parent_id for entry in sentence]
    heads = decoder.parse_proj(scores, gold if self.costaugFlag else None)
    if self.labelsFlag:
        for modifier, head in enumerate(gold[1:]):
            rscores, rexprs = self.__evaluateLabel(sentence, head, modifier + 1)
            goldLabelInd = self.rels[sentence[modifier + 1].relation]
            wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores)
                                 if l != goldLabelInd), key=itemgetter(1))[0]
            if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                lerrs += [rexprs[wrongLabelInd] - rexprs[goldLabelInd]]
    e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
    if e > 0:
        errs += [(exprs[h][i] - exprs[g][i])[0]
                 for i, (h, g) in enumerate(zip(heads, gold)) if h != g]
    return e
def forward(self, sentence):
    self.process_sentence_embeddings(sentence)
    num_vec = len(sentence)
    features_for = [entry.vec for entry in sentence]
    features_back = [entry.vec for entry in reversed(sentence)]
    vec_for = torch.cat(features_for).view(num_vec, 1, -1)
    vec_back = torch.cat(features_back).view(num_vec, 1, -1)
    res_for_1, self.hid_for_1 = self.lstm_for_1(vec_for, self.hid_for_1)
    res_back_1, self.hid_back_1 = self.lstm_back_1(vec_back, self.hid_back_1)
    vec_cat = [concatenate_tensors([res_for_1[i], res_back_1[num_vec - i - 1]])
               for i in range(num_vec)]
    vec_for_2 = torch.cat(vec_cat).view(num_vec, 1, -1)
    vec_back_2 = torch.cat(list(reversed(vec_cat))).view(num_vec, 1, -1)
    res_for_2, self.hid_for_2 = self.lstm_for_2(vec_for_2, self.hid_for_2)
    res_back_2, self.hid_back_2 = self.lstm_back_2(vec_back_2, self.hid_back_2)
    for i in range(num_vec):
        sentence[i].lstms[0] = res_for_2[i]
        sentence[i].lstms[1] = res_back_2[num_vec - i - 1]
    scores, exprs = self.__evaluate(sentence)
    gold = [entry.parent_id for entry in sentence]
    heads = decoder.parse_proj(scores, gold)
    lerrs = []
    for modifier, head in enumerate(gold[1:]):
        if sentence[head].rheadfov is None:
            sentence[head].rheadfov = torch.mm(
                concatenate_tensors([sentence[head].lstms[0], sentence[head].lstms[1]]),
                self.rhidLayerFOH)
        if sentence[modifier + 1].rmodfov is None:
            sentence[modifier + 1].rmodfov = torch.mm(
                concatenate_tensors([sentence[modifier + 1].lstms[0],
                                     sentence[modifier + 1].lstms[1]]),
                self.rhidLayerFOM)
        rscores, rexprs = self.__evaluateLabel(sentence[head].rheadfov,
                                               sentence[modifier + 1].rmodfov)
        goldLabelInd = self.rels[sentence[modifier + 1].relation]
        wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores)
                             if l != goldLabelInd), key=itemgetter(1))[0]
        if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
            lerrs += [rexprs[wrongLabelInd] - rexprs[goldLabelInd]]
    e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
    errs = []
    if e > 0:
        errs += [(exprs[h][i] - exprs[g][i])[0]
                 for i, (h, g) in enumerate(zip(heads, gold)) if h != g]
    return e, errs, lerrs
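# Sketch of consuming forward()'s return values above (assumed torch tensors;
# hypothetical names): e counts head mistakes for logging, while errs and
# lerrs carry the differentiable arc and label margin terms.
def sentence_loss(model, sentence):
    e, errs, lerrs = model.forward(sentence)
    terms = errs + lerrs
    return e, (sum(terms) if terms else None)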
def Train(self, conll_path):
    eloss = 0.0
    mloss = 0.0
    eerrors = 0
    etotal = 0
    start = time.time()
    with open(conll_path, 'r') as conllFP:
        shuffledData = list(read_conll(conllFP, self.c2i))
        random.shuffle(shuffledData)
        errs = []
        lerrs = []
        posErrs = []
        for iSentence, sentence in enumerate(shuffledData):
            if iSentence % 500 == 0 and iSentence != 0:
                print "Processing sentence number: %d" % iSentence, \
                    ", Loss: %.4f" % (eloss / etotal), \
                    ", Time: %.2f" % (time.time() - start)
                start = time.time()
                eerrors = 0
                eloss = 0.0
                etotal = 0
            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
            for entry in conll_sentence:
                c = float(self.wordsCount.get(entry.norm, 0))
                dropFlag = (random.random() < (c / (0.25 + c)))
                wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0)) if dropFlag else 0] \
                    if self.wdims > 0 else None
                last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in entry.idChars])[-1]
                rev_last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in reversed(entry.idChars)])[-1]
                entry.vec = dynet.dropout(
                    concatenate(filter(None, [wordvec, last_state, rev_last_state])), 0.33)
                entry.pos_lstms = [entry.vec, entry.vec]
                entry.headfov = None
                entry.modfov = None
                entry.rheadfov = None
                entry.rmodfov = None

            # POS tagging loss
            lstm_forward = self.pos_builders[0].initial_state()
            lstm_backward = self.pos_builders[1].initial_state()
            for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                lstm_forward = lstm_forward.add_input(entry.vec)
                lstm_backward = lstm_backward.add_input(rentry.vec)
                entry.pos_lstms[1] = lstm_forward.output()
                rentry.pos_lstms[0] = lstm_backward.output()
            for entry in conll_sentence:
                entry.pos_vec = concatenate(entry.pos_lstms)
            blstm_forward = self.pos_bbuilders[0].initial_state()
            blstm_backward = self.pos_bbuilders[1].initial_state()
            for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                blstm_forward = blstm_forward.add_input(entry.pos_vec)
                blstm_backward = blstm_backward.add_input(rentry.pos_vec)
                entry.pos_lstms[1] = blstm_forward.output()
                rentry.pos_lstms[0] = blstm_backward.output()
            concat_layer = [dynet.dropout(concatenate(entry.pos_lstms), 0.33)
                            for entry in conll_sentence]
            outputFFlayer = self.ffSeqPredictor.predict_sequence(concat_layer)
            posIDs = [self.pos.get(entry.pos) for entry in conll_sentence]
            for pred, gold in zip(outputFFlayer, posIDs):
                posErrs.append(self.pick_neg_log(pred, gold))

            # Add predicted pos tags
            for entry, poses in zip(conll_sentence, outputFFlayer):
                entry.vec = concatenate([entry.vec,
                                         dynet.dropout(self.plookup[np.argmax(poses.value())], 0.33)])
                entry.lstms = [entry.vec, entry.vec]

            # Parsing losses
            if self.blstmFlag:
                lstm_forward = self.builders[0].initial_state()
                lstm_backward = self.builders[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    lstm_forward = lstm_forward.add_input(entry.vec)
                    lstm_backward = lstm_backward.add_input(rentry.vec)
                    entry.lstms[1] = lstm_forward.output()
                    rentry.lstms[0] = lstm_backward.output()
            if self.bibiFlag:
                for entry in conll_sentence:
                    entry.vec = concatenate(entry.lstms)
                blstm_forward = self.bbuilders[0].initial_state()
                blstm_backward = self.bbuilders[1].initial_state()
                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    blstm_forward = blstm_forward.add_input(entry.vec)
                    blstm_backward = blstm_backward.add_input(rentry.vec)
                    entry.lstms[1] = blstm_forward.output()
                    rentry.lstms[0] = blstm_backward.output()
            scores, exprs = self.__evaluate(conll_sentence)
            gold = [entry.parent_id for entry in conll_sentence]
            heads = decoder.parse_proj(scores, gold if self.costaugFlag else None)
            if self.labelsFlag:
                concat_layer = [dynet.dropout(self.__getRelVector(conll_sentence, head, modifier + 1), 0.33)
                                for modifier, head in enumerate(gold[1:])]
                outputFFlayer = self.ffRelPredictor.predict_sequence(concat_layer)
                relIDs = [self.rels[conll_sentence[modifier + 1].relation]
                          for modifier, _ in enumerate(gold[1:])]
                for pred, goldid in zip(outputFFlayer, relIDs):
                    lerrs.append(self.pick_neg_log(pred, goldid))
            e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
            eerrors += e
            if e > 0:
                loss = [(exprs[h][i] - exprs[g][i])
                        for i, (h, g) in enumerate(zip(heads, gold)) if h != g]  # * (1.0/float(e))
                eloss += (e)
                mloss += (e)
                errs.extend(loss)
            etotal += len(conll_sentence)
            if iSentence % 1 == 0:
                if len(errs) > 0 or len(lerrs) > 0 or len(posErrs) > 0:
                    eerrs = (esum(errs + lerrs + posErrs))
                    eerrs.scalar_value()
                    eerrs.backward()
                    self.trainer.update()
                errs = []
                lerrs = []
                posErrs = []
                renew_cg()
    print "Loss: %.4f" % (mloss / iSentence)
def Train(self, conll_path): errors = 0 batch = 0 eloss = 0.0 mloss = 0.0 eerrors = 0 etotal = 0 start = time.time() with open(conll_path, 'r') as conllFP: shuffledData = list(read_conll(conllFP)) random.shuffle(shuffledData) errs = [] lerrs = [] eeloss = 0.0 for iSentence, sentence in enumerate(shuffledData): if iSentence % 100 == 0 and iSentence != 0: print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', ( float(eerrors)) / etotal, 'Time', time.time() - start start = time.time() eerrors = 0 eloss = 0.0 etotal = 0 lerrors = 0 ltotal = 0 conll_sentence = [ entry for entry in sentence if isinstance(entry, utils.ConllEntry) ] for entry in conll_sentence: c = float(self.wordsCount.get(entry.norm, 0)) dropFlag = (random.random() < (c / (0.25 + c))) wordvec = self.wlookup[ int(self.vocab.get(entry.norm, 0) ) if dropFlag else 0] if self.wdims > 0 else None posvec = self.plookup[int( self.pos[entry.pos])] if self.pdims > 0 else None evec = None if self.external_embedding is not None: evec = self.elookup[self.extrnd.get( entry.form, self.extrnd.get(entry.norm, 0)) if (dropFlag or (random.random() < 0.5)) else 0] entry.vec = concatenate( filter(None, [wordvec, posvec, evec])) entry.lstms = [entry.vec, entry.vec] entry.headfov = None entry.modfov = None entry.rheadfov = None entry.rmodfov = None if self.blstmFlag: lstm_forward = self.builders[0].initial_state() lstm_backward = self.builders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): lstm_forward = lstm_forward.add_input(entry.vec) lstm_backward = lstm_backward.add_input(rentry.vec) entry.lstms[1] = lstm_forward.output() rentry.lstms[0] = lstm_backward.output() if self.bibiFlag: for entry in conll_sentence: entry.vec = concatenate(entry.lstms) blstm_forward = self.bbuilders[0].initial_state() blstm_backward = self.bbuilders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): blstm_forward = blstm_forward.add_input(entry.vec) blstm_backward = blstm_backward.add_input( rentry.vec) entry.lstms[1] = blstm_forward.output() rentry.lstms[0] = blstm_backward.output() scores, exprs = self.__evaluate(conll_sentence, True) gold = [entry.parent_id for entry in conll_sentence] heads = decoder.parse_proj(scores, gold if self.costaugFlag else None) if self.labelsFlag: for modifier, head in enumerate(gold[1:]): rscores, rexprs = self.__evaluateLabel( conll_sentence, head, modifier + 1) goldLabelInd = self.rels[conll_sentence[modifier + 1].relation] wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores) if l != goldLabelInd), key=itemgetter(1))[0] if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1: lerrs.append(rexprs[wrongLabelInd] - rexprs[goldLabelInd]) e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g]) eerrors += e if e > 0: loss = [(exprs[h][i] - exprs[g][i]) for i, (h, g) in enumerate(zip(heads, gold)) if h != g] # * (1.0/float(e)) eloss += (e) mloss += (e) errs.extend(loss) etotal += len(conll_sentence) if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0: eeloss = 0.0 if len(errs) > 0 or len(lerrs) > 0: eerrs = (esum(errs + lerrs) ) #* (1.0/(float(len(errs)))) eerrs.scalar_value() eerrs.backward() self.trainer.update() errs = [] lerrs = [] renew_cg() if len(errs) > 0: eerrs = (esum(errs + lerrs)) #* (1.0/(float(len(errs)))) eerrs.scalar_value() eerrs.backward() self.trainer.update() errs = [] lerrs = [] eeloss = 0.0 renew_cg() self.trainer.update_epoch() print "Loss: ", mloss / iSentence
def forward(self, sentence, errs, lerrs): for entry in sentence: c = float(self.wordsCount.get(entry.norm, 0)) # dropFlag = (random.random() < (c / (0.33 + c))) dropFlag = (random.random() < (c / (0.25 + c))) wordvec = self.wlookup( scalar(int(self.vocab.get(entry.norm, 0)) if dropFlag else 0) ) if self.wdims > 0 else None '''ontovec = self.olookup(scalar(int(self.onto[entry.onto]) if random.random( ) < 0.9 else 0)) if self.odims > 0 else None cposvec = self.clookup(scalar(int(self.cpos[entry.cpos]) if random.random( ) < 0.9 else 0)) if self.cdims > 0 else None''' posvec = self.plookup(scalar(int( self.pos[entry.pos]))) if self.pdims > 0 else None # posvec = self.plookup( # scalar(0 if dropFlag and random.random() < 0.1 else int(self.pos[entry.pos]))) if self.pdims > 0 else None evec = None if self.external_embedding is not None: evec = self.elookup( scalar( self.extrnd. get(entry.form, self.extrnd.get(entry.norm, 0)) if ( dropFlag or (random.random() < 0.5)) else 0)) #entry.vec = cat([wordvec, posvec, ontovec, cposvec, evec]) gaze_feats = Variable( torch.unsqueeze(torch.Tensor(entry.gaze_feats), 0)) entry.vec = cat([wordvec, posvec, gaze_feats]) #entry.vec = posvec entry.lstms = [entry.vec, entry.vec] entry.headfov = None entry.modfov = None entry.rheadfov = None entry.rmodfov = None num_vec = len(sentence) vec_for = torch.cat([entry.vec for entry in sentence]).view(num_vec, 1, -1) vec_back = torch.cat([entry.vec for entry in reversed(sentence) ]).view(num_vec, 1, -1) res_for_1, self.hid_for_1 = self.lstm_for_1(vec_for, self.hid_for_1) res_back_1, self.hid_back_1 = self.lstm_back_1(vec_back, self.hid_back_1) vec_cat = [ cat([res_for_1[i], res_back_1[num_vec - i - 1]]) for i in range(num_vec) ] vec_for_2 = torch.cat(vec_cat).view(num_vec, 1, -1) vec_back_2 = torch.cat(list(reversed(vec_cat))).view(num_vec, 1, -1) res_for_2, self.hid_for_2 = self.lstm_for_2(vec_for_2, self.hid_for_2) res_back_2, self.hid_back_2 = self.lstm_back_2(vec_back_2, self.hid_back_2) for i in range(num_vec): sentence[i].lstms[0] = res_for_2[i] sentence[i].lstms[1] = res_back_2[num_vec - i - 1] scores, exprs = self.__evaluate(sentence, True) gold = [entry.parent_id for entry in sentence] heads = decoder.parse_proj(scores, gold) for modifier, head in enumerate(gold[1:]): rscores, rexprs = self.__evaluateLabel(sentence, head, modifier + 1) goldLabelInd = self.rels[sentence[modifier + 1].relation] wrongLabelInd = \ max(((l, scr) for l, scr in enumerate(rscores) if l != goldLabelInd), key=itemgetter(1))[0] if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1: lerrs += [rexprs[wrongLabelInd] - rexprs[goldLabelInd]] e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g]) if e > 0: errs += [(exprs[h][i] - exprs[g][i])[0] for i, (h, g) in enumerate(zip(heads, gold)) if h != g] return e
def Train(self, conll_path, BATCH_SIZE=1): errors = 0 batch = 0 eloss = 0.0 mloss = 0.0 eerrors = 0 etotal = 0 start = time.time() with open(conll_path, 'r') as conllFP: shuffledData = list(read_conll(conllFP)) random.shuffle(shuffledData) errs = [] lerrs = [] eeloss = 0.0 for iSentence, sentence_batch in enumerate( stream_to_batch(shuffledData, BATCH_SIZE)): if iSentence % 100 == 0 and iSentence != 0: print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', ( float(eerrors)) / etotal, 'Time', time.time( ) - start, (100 * BATCH_SIZE) / (time.time() - start) start = time.time() eerrors = 0 eloss = 0.0 etotal = 0 lerrors = 0 ltotal = 0 batch_exprs = [] sents = [] golds = [] labels = [] for sentence in sentence_batch: conll_sentence = [ entry for entry in sentence if isinstance(entry, utils.ConllEntry) ] sents.append(conll_sentence) gold = [entry.parent_id for entry in conll_sentence] golds.append(gold) # initialize sentence for entry in conll_sentence: c = float(self.wordsCount.get(entry.norm, 0)) dropFlag = (random.random() < (c / (0.25 + c))) wordvec = self.wlookup[int( self.vocab.get(entry.norm, 0) ) if dropFlag else 0] if self.wdims > 0 else None posvec = self.plookup[int( self.pos[entry.pos])] if self.pdims > 0 else None evec = None if self.external_embedding is not None: evec = self.elookup[self.extrnd.get( entry.form, self.extrnd.get(entry.norm, 0)) if (dropFlag or (random.random() < 0.5) ) else 0] entry.vec = concatenate( filter(None, [wordvec, posvec, evec])) entry.lstms = [entry.vec, entry.vec] entry.headfov = None entry.modfov = None entry.rheadfov = None entry.rmodfov = None # bilstm encode if self.blstmFlag: lstm_forward = self.builders[0].initial_state() lstm_backward = self.builders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): lstm_forward = lstm_forward.add_input(entry.vec) lstm_backward = lstm_backward.add_input(rentry.vec) entry.lstms[1] = lstm_forward.output() rentry.lstms[0] = lstm_backward.output() if self.bibiFlag: for entry in conll_sentence: entry.vec = concatenate(entry.lstms) blstm_forward = self.bbuilders[0].initial_state() blstm_backward = self.bbuilders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): blstm_forward = blstm_forward.add_input( entry.vec) blstm_backward = blstm_backward.add_input( rentry.vec) entry.lstms[1] = blstm_forward.output() rentry.lstms[0] = blstm_backward.output() # compute all arc score-expressions batch_exprs.append(self.__evaluate(conll_sentence, True)) # labeling? 
_exps = [] if self.labelsFlag: labels_exprs = [] for modifier, head in enumerate(gold[1:]): rexprs = self.__evaluateLabel( conll_sentence, head, modifier + 1) labels_exprs.append((rexprs, head, modifier)) _exps.append(rexprs) labels.append(labels_exprs) # now do the actual scoring _s = time.time() forward(batch_exprs[-1][-1] + _exps) print "fw1t:", time.time() - _s for _i, (exprs, conll_sentence) in enumerate(zip(batch_exprs, sents)): scores = np.array( [[output.scalar_value() for output in exprsRow] for exprsRow in exprs]) gold = golds[_i] heads = decoder.parse_proj( scores, gold if self.costaugFlag else None) # TODO labeling is inot batched if self.labelsFlag: for rexprs, head, modifier in labels[_i]: rscores = rexprs.value() goldLabelInd = self.rels[conll_sentence[ modifier + 1].relation] wrongLabelInd = max( ((l, scr) for l, scr in enumerate(rscores) if l != goldLabelInd), key=itemgetter(1))[0] if rscores[ goldLabelInd] < rscores[wrongLabelInd] + 1: lerrs.append(rexprs[wrongLabelInd] - rexprs[goldLabelInd]) e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g]) eerrors += e if e > 0: loss = [(exprs[h][i] - exprs[g][i]) for i, (h, g) in enumerate(zip(heads, gold)) if h != g] # * (1.0/float(e)) eloss += (e) mloss += (e) errs.extend(loss) etotal += len(conll_sentence) if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0: eeloss = 0.0 if len(errs) > 0 or len(lerrs) > 0: eerrs = (esum(errs + lerrs)) #* (1.0/(float(len(errs)))) _s = time.time() eerrs.scalar_value() print "fw2t", time.time() - _s _s = time.time() eerrs.backward() print "bw2t", time.time() - _s self.trainer.update() errs = [] lerrs = [] renew_cg() if len(errs) > 0: eerrs = (esum(errs + lerrs)) #* (1.0/(float(len(errs)))) eerrs.scalar_value() eerrs.backward() self.trainer.update() errs = [] lerrs = [] eeloss = 0.0 renew_cg() self.trainer.update_epoch() print "Loss: ", mloss / iSentence
def Predict(self, conll_path): with open(conll_path, 'r') as conllFP: for iSentence, sentence in enumerate( read_conll(conllFP, self.conll_test_language)): conll_sentence = [ entry for entry in sentence if isinstance(entry, utils.ConllEntry) ] for entry in conll_sentence: posID = self.pos[entry.pos] if self.pos.has_key( entry.pos) else 0 posvec = self.plookup[int( posID)] if self.pdims > 0 else None xposID = self.xpos[entry.xpos] if self.xpos.has_key( entry.xpos) else 0 xposvec = self.xplookup[int( xposID)] if self.xpdims > 0 else None evec = None ecevc = None lang_code = entry.language.split( '_')[0] + ':' if self.multilingual_emb else "" if self.external_embedding is not None: if self.extConcateFlag: wordvec = self.wlookup[int( self.vocab.get(entry.norm.lower(), 0))] if self.wdims > 0 else None evec = self.elookup[self.extrnd.get( lang_code + entry.form.lower(), self.extrnd.get(lang_code + entry.norm.lower(), 0))] else: wordvec = self.elookup[self.extrnd.get( lang_code + entry.form.lower(), self.extrnd.get(lang_code + entry.norm.lower(), 0))] else: wordvec = self.wlookup[int( self.vocab.get(entry.norm.lower(), 0))] if self.wdims > 0 else None if self.external_cluster_embedding is not None: ecevc = self.eclookup[self.exctrnd.get( lang_code + entry.form.lower(), self.exctrnd.get(lang_code + entry.norm.lower(), 0))] # Add language embedding langvec = self.llookup[ self.languageVec_dic[entry.language]. lang_num] if self.add_lang_vec else None entry.vec = concatenate( filter( None, [wordvec, posvec, xposvec, evec, ecevc, langvec])) entry.lstms = [entry.vec, entry.vec] entry.headfov = None entry.modfov = None entry.rheadfov = None entry.rmodfov = None if self.blstmFlag: lstm_forward = self.builders[0].initial_state() lstm_backward = self.builders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): lstm_forward = lstm_forward.add_input(entry.vec) lstm_backward = lstm_backward.add_input(rentry.vec) entry.lstms[1] = lstm_forward.output() rentry.lstms[0] = lstm_backward.output() if self.bibiFlag: for entry in conll_sentence: entry.vec = concatenate(entry.lstms) blstm_forward = self.bbuilders[0].initial_state() blstm_backward = self.bbuilders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): blstm_forward = blstm_forward.add_input(entry.vec) blstm_backward = blstm_backward.add_input( rentry.vec) entry.lstms[1] = blstm_forward.output() rentry.lstms[0] = blstm_backward.output() scores, exprs = self.__evaluate(conll_sentence, True) heads = decoder.parse_proj(scores) ## ADD for handling multi-roots problem rootHead = [head for head in heads if head == 0] if len(rootHead) != 1: print "it has multi-root, changing it for heading first root for other roots" rootHead = [ seq for seq, head in enumerate(heads) if head == 0 ] for seq in rootHead[1:]: heads[seq] = rootHead[0] ## finish to multi-roots for entry, head in zip(conll_sentence, heads): entry.pred_parent_id = head entry.pred_relation = '_' dump = False if self.labelsFlag: for modifier, head in enumerate(heads[1:]): scores, exprs = self.__evaluateLabel( conll_sentence, head, modifier + 1) conll_sentence[modifier + 1].pred_relation = self.irels[max( enumerate(scores), key=itemgetter(1))[0]] renew_cg() if not dump: yield sentence
def Predict(self, conll_path): with open(conll_path, 'r') as conllFP: for iSentence, sentence in enumerate(read_conll(conllFP, self.c2i, self.m2i, self.t2i, self.morph_dict)): conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)] if self.morphTagFlag: sentence_context = [] last_state_char = self.char_rnn.predict_sequence([self.clookup[self.c2i["<start>"]]])[-1] rev_last_state_char = self.char_rnn.predict_sequence([self.clookup[self.c2i["<start>"]]])[-1] sentence_context.append(concatenate([last_state_char, rev_last_state_char])) for entry in conll_sentence: last_state_char = self.char_rnn.predict_sequence([self.clookup[c] for c in entry.idChars]) rev_last_state_char = self.char_rnn.predict_sequence([self.clookup[c] for c in reversed(entry.idChars)]) entry.char_rnn_states = [concatenate([f,b]) for f,b in zip(last_state_char, rev_last_state_char)] sentence_context.append(entry.char_rnn_states[-1]) for idx, entry in enumerate(conll_sentence): wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None if self.morphTagFlag: entry.vec = concatenate([wordvec, entry.char_rnn_states[-1]]) else: last_state_char = self.char_rnn.predict_sequence([self.clookup[c] for c in entry.idChars])[-1] rev_last_state_char = self.char_rnn.predict_sequence([self.clookup[c] for c in reversed(entry.idChars)])[-1] entry.vec = concatenate([wordvec, last_state_char, rev_last_state_char]) for idx, entry in enumerate(conll_sentence): if self.morphFlag: if len(entry.norm) > 2: if self.goldMorphFlag: seg_vec = self.__getSegmentationVector(entry.norm) seg_vec = dynet.vecInput(seg_vec.dim()[0][0]) seg_vec.set(entry.idMorphs) morph_seg = utils.generate_morphs(entry.norm, seg_vec.vec_value()) entry.pred_seg = morph_seg else: seg_vec = self.__getSegmentationVector(entry.norm) morph_seg = utils.generate_morphs(entry.norm, seg_vec.vec_value()) entry.pred_seg = seg_vec.vec_value() else: morph_seg = [entry.norm] entry.pred_seg = entry.idMorphs entry.seg = entry.idMorphs last_state_morph = self.morph_rnn.predict_sequence([self.__getMorphVector(morph) for morph in morph_seg])[-1] rev_last_state_morph = self.morph_rnn.predict_sequence([self.__getMorphVector(morph) for morph in reversed(morph_seg)])[ -1] entry.vec = concatenate([entry.vec, last_state_morph, rev_last_state_morph]) morphtag_encodings = [] for idx, entry in enumerate(conll_sentence): if self.morphTagFlag: if self.goldMorphTagFlag: morph_tags = entry.idMorphTags entry.pred_tags = entry.idMorphTags entry.pred_tags_tokens = [self.i2t[m_tag_id] for m_tag_id in entry.pred_tags] else: word_context = [c for i, c in enumerate(sentence_context) if i - 1 != idx] entry.pred_tags = self.generate(entry.char_rnn_states, word_context) morph_tags = entry.pred_tags entry.tags = entry.idMorphTags entry.pred_tags_tokens = [self.i2t[m_tag_id] for m_tag_id in entry.pred_tags] last_state_mtag = self.mtag_rnn.predict_sequence([self.tlookup[t] for t in morph_tags])[-1] rev_last_state_mtag = self.mtag_rnn.predict_sequence([self.tlookup[t] for t in reversed(morph_tags)])[-1] current_encoding_mtag = concatenate([last_state_mtag, rev_last_state_mtag]) morphtag_encodings.append(current_encoding_mtag) if self.morphTagFlag: forward = [] for idx, encoding in enumerate(morphtag_encodings): if idx == 0: forward.append(encoding) else: updated = morphtag_encodings[idx-1]*self.mtag_encoding_composition_alpha \ + encoding*(1-self.mtag_encoding_composition_alpha) forward.append(updated) if self.mtag_encoding_composition_type == "w_sum": 
upper_morphtag_encodings = forward elif self.mtag_encoding_composition_type == "bi_w_sum": backward = [] for idx, r_encoding in enumerate(morphtag_encodings): if idx == len(morphtag_encodings) - 1: backward.append(r_encoding) else: updated = morphtag_encodings[idx+1]*self.mtag_encoding_composition_alpha \ + r_encoding*(1-self.mtag_encoding_composition_alpha) backward.append(updated) upper_morphtag_encodings = [f+b for f,b in zip(forward, backward)] elif self.mtag_encoding_composition_type == "bi_mlp": forward = [] backward = [] for idx, encoding in enumerate(morphtag_encodings): if idx != 0: f = self.mtag_encoding_f_w * concatenate([encoding, morphtag_encodings[idx-1]]) \ + self.mtag_encoding_f_b forward.append(f) else: forward.append(encoding) if idx != len(morphtag_encodings) - 1: b = self.mtag_encoding_b_w * concatenate([encoding, morphtag_encodings[idx+1]]) \ + self.mtag_encoding_b_b backward.append(b) else: backward.append(encoding) upper_morphtag_encodings = [f+b for f,b in zip(forward, backward)] else: upper_morphtag_encodings = morphtag_encodings for entry, mtag in zip(conll_sentence, upper_morphtag_encodings): entry.vec = concatenate([entry.vec, mtag]) for idx, entry in enumerate(conll_sentence): entry.pos_lstms = [entry.vec, entry.vec] entry.headfov = None entry.modfov = None entry.rheadfov = None entry.rmodfov = None #Predicted pos tags lstm_forward = self.pos_builders[0].initial_state() lstm_backward = self.pos_builders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): lstm_forward = lstm_forward.add_input(entry.vec) lstm_backward = lstm_backward.add_input(rentry.vec) entry.pos_lstms[1] = lstm_forward.output() rentry.pos_lstms[0] = lstm_backward.output() for entry in conll_sentence: entry.pos_vec = concatenate(entry.pos_lstms) blstm_forward = self.pos_bbuilders[0].initial_state() blstm_backward = self.pos_bbuilders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): blstm_forward = blstm_forward.add_input(entry.pos_vec) blstm_backward = blstm_backward.add_input(rentry.pos_vec) entry.pos_lstms[1] = blstm_forward.output() rentry.pos_lstms[0] = blstm_backward.output() concat_layer = [concatenate(entry.pos_lstms) for entry in conll_sentence] outputFFlayer = self.ffSeqPredictor.predict_sequence(concat_layer) predicted_pos_indices = [np.argmax(o.value()) for o in outputFFlayer] predicted_postags = [self.id2pos[idx] for idx in predicted_pos_indices] # Add predicted pos tags for parsing prediction for entry, posid in zip(conll_sentence, predicted_pos_indices): entry.vec = concatenate([entry.vec, self.plookup[posid]]) entry.lstms = [entry.vec, entry.vec] if self.blstmFlag: lstm_forward = self.builders[0].initial_state() lstm_backward = self.builders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): lstm_forward = lstm_forward.add_input(entry.vec) lstm_backward = lstm_backward.add_input(rentry.vec) entry.lstms[1] = lstm_forward.output() rentry.lstms[0] = lstm_backward.output() if self.bibiFlag: for entry in conll_sentence: entry.vec = concatenate(entry.lstms) blstm_forward = self.bbuilders[0].initial_state() blstm_backward = self.bbuilders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): blstm_forward = blstm_forward.add_input(entry.vec) blstm_backward = blstm_backward.add_input(rentry.vec) entry.lstms[1] = blstm_forward.output() rentry.lstms[0] = blstm_backward.output() scores, exprs = self.__evaluate(conll_sentence) heads = 
decoder.parse_proj(scores) # Multiple roots: heading to the previous "rooted" one rootCount = 0 rootWid = -1 for index, head in enumerate(heads): if head == 0: rootCount += 1 if rootCount == 1: rootWid = index if rootCount > 1: heads[index] = rootWid rootWid = index for entry, head, pos in zip(conll_sentence, heads, predicted_postags): entry.pred_parent_id = head entry.pred_relation = '_' entry.pred_pos = pos dump = False if self.labelsFlag: concat_layer = [self.__getRelVector(conll_sentence, head, modifier + 1) for modifier, head in enumerate(heads[1:])] outputFFlayer = self.ffRelPredictor.predict_sequence(concat_layer) predicted_rel_indices = [np.argmax(o.value()) for o in outputFFlayer] predicted_rels = [self.irels[idx] for idx in predicted_rel_indices] for modifier, head in enumerate(heads[1:]): conll_sentence[modifier + 1].pred_relation = predicted_rels[modifier] renew_cg() if not dump: yield sentence
def Train(self, trainData, options): errors = 0 batch = 0 eloss = 0.0 mloss = 0.0 eerrors = 0 lerrors = 0 etotal = 0 beg = start = time.time() random.shuffle(trainData) # in certain cases the data will already have been shuffled after being read from file or while creating dev data errs = [] lerrs = [] eeloss = 0.0 self.feature_extractor.Init(options) for iSentence, sentence in enumerate(trainData,1): if iSentence % 100 == 0 and iSentence != 0: loss_message = 'Processing sentence number: %d'%iSentence + \ ' Loss: %.3f'%(eloss / etotal)+ \ ' Errors: %.3f'%((float(eerrors)) / etotal)+\ ' Labeled Errors: %.3f'%(float(lerrors) / etotal)+\ ' Time: %.2gs'%(time.time()-start) print loss_message start = time.time() eerrors = 0 eloss = 0.0 etotal = 0 lerrors = 0 ltotal = 0 conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)] self.feature_extractor.getWordEmbeddings(conll_sentence, True, options) scores, exprs = self.__evaluate(conll_sentence, True) gold = [entry.parent_id for entry in conll_sentence] if self.proj: heads = decoder.parse_proj(scores, gold if self.costaugFlag else None) else: if self.costaugFlag: #augment the score of non-gold arcs for i in range(len(scores)): for j in range(len(scores)): if gold[j] != i: scores[i][j] += 1. heads = chuliu_edmonds_one_root(scores.T) heads[0] = -1 if self.labelsFlag: for modifier, head in enumerate(gold[1:]): rscores, rexprs = self.__evaluateLabel(conll_sentence, head, modifier+1) goldLabelInd = self.feature_extractor.rels[conll_sentence[modifier+1].relation] wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores) if l != goldLabelInd), key=itemgetter(1))[0] if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1: lerrs.append(rexprs[wrongLabelInd] - rexprs[goldLabelInd]) lerrors += 1 #not quite right but gives some indication e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g]) eerrors += e if e > 0: loss = [(exprs[h][i] - exprs[g][i]) for i, (h,g) in enumerate(zip(heads, gold)) if h != g] eloss += dy.esum(loss).scalar_value() mloss += dy.esum(loss).scalar_value() errs.extend(loss) etotal += len(conll_sentence) if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0: eeloss = 0.0 if len(errs) > 0 or len(lerrs) > 0: eerrs = (dy.esum(errs + lerrs)) eerrs.scalar_value() eerrs.backward() self.trainer.update() errs = [] lerrs = [] dy.renew_cg() if len(errs) > 0: eerrs = (dy.esum(errs + lerrs)) eerrs.scalar_value() eerrs.backward() self.trainer.update() errs = [] lerrs = [] eeloss = 0.0 dy.renew_cg() self.trainer.update() print "Loss: ", mloss/iSentence print "Total Training Time: %.2gs"%(time.time()-beg)
def Train(self, conll_path): self.trainer.set_sparse_updates(True) eloss = 0.0 mloss = 0.0 eerrors = 0 etotal = 0 start = time.time() with open(conll_path, 'r') as conllFP: shuffledData = list(read_conll(conllFP, self.c2i, self.m2i, self.t2i, self.morph_dict)) random.shuffle(shuffledData) errs = [] lerrs = [] posErrs = [] segErrs = [] mTagErrs = [] for iSentence, sentence in enumerate(shuffledData): if iSentence % 500 == 0 and iSentence != 0: print("Processing sentence number: %d" % iSentence, ", Loss: %.4f" % ( eloss / etotal), ", Time: %.2f" % (time.time() - start)) start = time.time() eerrors = 0 eloss = 0.0 etotal = 0 conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)] if self.morphTagFlag: sentence_context = [] last_state_char = self.char_rnn.predict_sequence([self.clookup[self.c2i["<start>"]]])[-1] rev_last_state_char = self.char_rnn.predict_sequence([self.clookup[self.c2i["<start>"]]])[-1] sentence_context.append(concatenate([last_state_char, rev_last_state_char])) for entry in conll_sentence: last_state_char = self.char_rnn.predict_sequence([self.clookup[c] for c in entry.idChars]) rev_last_state_char = self.char_rnn.predict_sequence([self.clookup[c] for c in reversed(entry.idChars)]) entry.char_rnn_states = [concatenate([f,b]) for f,b in zip(last_state_char, rev_last_state_char)] sentence_context.append(entry.char_rnn_states[-1]) for idx, entry in enumerate(conll_sentence): c = float(self.wordsCount.get(entry.norm, 0)) dropFlag = (random.random() < (c / (0.25 + c))) wordvec = self.wlookup[ int(self.vocab.get(entry.norm, 0)) if dropFlag else 0] if self.wdims > 0 else None if self.morphTagFlag : entry.vec = dynet.dropout(concatenate([wordvec, entry.char_rnn_states[-1]]), 0.33) else: last_state_char = self.char_rnn.predict_sequence([self.clookup[c] for c in entry.idChars])[-1] rev_last_state_char = self.char_rnn.predict_sequence([self.clookup[c] for c in reversed(entry.idChars)])[-1] entry.vec = dynet.dropout(concatenate([wordvec, last_state_char, rev_last_state_char]), 0.33) for idx, entry in enumerate(conll_sentence): if self.morphFlag: if len(entry.norm) > 2: if self.goldMorphFlag: seg_vec = self.__getSegmentationVector(entry.norm) seg_vec = dynet.vecInput(seg_vec.dim()[0][0]) seg_vec.set(entry.idMorphs) morph_seg = utils.generate_morphs(entry.norm, seg_vec.vec_value()) else: seg_vec = self.__getSegmentationVector(entry.norm) morph_seg = utils.generate_morphs(entry.norm, seg_vec.vec_value()) vec_gold = dynet.vecInput(seg_vec.dim()[0][0]) vec_gold.set(entry.idMorphs) segErrs.append(self.binary_crossentropy(seg_vec,vec_gold)) else: morph_seg = [entry.norm] last_state_morph = self.morph_rnn.predict_sequence([self.__getMorphVector(morph) for morph in morph_seg])[-1] rev_last_state_morph = self.morph_rnn.predict_sequence([self.__getMorphVector(morph) for morph in reversed(morph_seg)])[ -1] encoding_morph = concatenate([last_state_morph, rev_last_state_morph]) entry.vec = concatenate([entry.vec, dynet.dropout(encoding_morph, 0.33)]) morphtag_encodings = [] for idx, entry in enumerate(conll_sentence): if self.morphTagFlag: if self.goldMorphTagFlag: morph_tags = entry.idMorphTags else: word_context = [c for i, c in enumerate(sentence_context) if i-1 != idx] mTagErrs.append( self.__getLossMorphTagging(entry.char_rnn_states, entry.idMorphTags, word_context)) predicted_sequence = self.generate(entry.char_rnn_states, word_context) morph_tags = predicted_sequence last_state_mtag = self.mtag_rnn.predict_sequence([self.tlookup[t] for t in morph_tags])[-1] 
rev_last_state_mtag = \ self.mtag_rnn.predict_sequence([self.tlookup[t] for t in reversed(morph_tags)])[ -1] current_encoding_mtag = concatenate([last_state_mtag, rev_last_state_mtag]) morphtag_encodings.append(current_encoding_mtag) if self.morphTagFlag: forward = [] for idx, encoding in enumerate(morphtag_encodings): if idx == 0: forward.append(encoding) else: updated = morphtag_encodings[idx-1]*self.mtag_encoding_composition_alpha \ + encoding*(1-self.mtag_encoding_composition_alpha) forward.append(updated) if self.mtag_encoding_composition_type == "w_sum": upper_morphtag_encodings = forward elif self.mtag_encoding_composition_type == "bi_w_sum": backward = [] for idx, r_encoding in enumerate(morphtag_encodings): if idx == len(morphtag_encodings) - 1: backward.append(r_encoding) else: updated = morphtag_encodings[idx+1]*self.mtag_encoding_composition_alpha \ + r_encoding*(1-self.mtag_encoding_composition_alpha) backward.append(updated) upper_morphtag_encodings = [f+b for f,b in zip(forward, backward)] elif self.mtag_encoding_composition_type == "bi_mlp": forward = [] backward = [] for idx, encoding in enumerate(morphtag_encodings): if idx != 0: f = self.mtag_encoding_f_w * concatenate([encoding, morphtag_encodings[idx-1]]) \ + self.mtag_encoding_f_b forward.append(f) else: forward.append(encoding) if idx != len(morphtag_encodings) - 1: b = self.mtag_encoding_b_w * concatenate([encoding, morphtag_encodings[idx+1]]) \ + self.mtag_encoding_b_b backward.append(b) else: backward.append(encoding) upper_morphtag_encodings = [f+b for f,b in zip(forward, backward)] else: upper_morphtag_encodings = morphtag_encodings for entry, mtag in zip(conll_sentence, upper_morphtag_encodings): entry.vec = concatenate([entry.vec, dynet.dropout(mtag, 0.33)]) for idx, entry in enumerate(conll_sentence): entry.pos_lstms = [entry.vec, entry.vec] entry.headfov = None entry.modfov = None entry.rheadfov = None entry.rmodfov = None #POS tagging loss lstm_forward = self.pos_builders[0].initial_state() lstm_backward = self.pos_builders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): lstm_forward = lstm_forward.add_input(entry.vec) lstm_backward = lstm_backward.add_input(rentry.vec) entry.pos_lstms[1] = lstm_forward.output() rentry.pos_lstms[0] = lstm_backward.output() for entry in conll_sentence: entry.pos_vec = concatenate(entry.pos_lstms) blstm_forward = self.pos_bbuilders[0].initial_state() blstm_backward = self.pos_bbuilders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): blstm_forward = blstm_forward.add_input(entry.pos_vec) blstm_backward = blstm_backward.add_input(rentry.pos_vec) entry.pos_lstms[1] = blstm_forward.output() rentry.pos_lstms[0] = blstm_backward.output() concat_layer = [dynet.dropout(concatenate(entry.pos_lstms), 0.33) for entry in conll_sentence] outputFFlayer = self.ffSeqPredictor.predict_sequence(concat_layer) posIDs = [self.pos.get(entry.pos) for entry in conll_sentence] for pred, gold in zip(outputFFlayer, posIDs): posErrs.append(self.pick_neg_log(pred, gold)) # Add predicted pos tags for entry, poses in zip(conll_sentence, outputFFlayer): entry.vec = concatenate([entry.vec, dynet.dropout(self.plookup[np.argmax(poses.value())], 0.33)]) entry.lstms = [entry.vec, entry.vec] #Parsing losses if self.blstmFlag: lstm_forward = self.builders[0].initial_state() lstm_backward = self.builders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): lstm_forward = lstm_forward.add_input(entry.vec) 
lstm_backward = lstm_backward.add_input(rentry.vec) entry.lstms[1] = lstm_forward.output() rentry.lstms[0] = lstm_backward.output() if self.bibiFlag: for entry in conll_sentence: entry.vec = concatenate(entry.lstms) blstm_forward = self.bbuilders[0].initial_state() blstm_backward = self.bbuilders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): blstm_forward = blstm_forward.add_input(entry.vec) blstm_backward = blstm_backward.add_input(rentry.vec) entry.lstms[1] = blstm_forward.output() rentry.lstms[0] = blstm_backward.output() scores, exprs = self.__evaluate(conll_sentence) gold = [entry.parent_id for entry in conll_sentence] heads = decoder.parse_proj(scores, gold if self.costaugFlag else None) if self.labelsFlag: concat_layer = [dynet.dropout(self.__getRelVector(conll_sentence, head, modifier + 1), 0.33) for modifier, head in enumerate(gold[1:])] outputFFlayer = self.ffRelPredictor.predict_sequence(concat_layer) relIDs = [self.rels[conll_sentence[modifier + 1].relation] for modifier, _ in enumerate(gold[1:])] for pred, goldid in zip(outputFFlayer, relIDs): lerrs.append(self.pick_neg_log(pred, goldid)) e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g]) eerrors += e if e > 0: loss = [(exprs[h][i] - exprs[g][i]) for i, (h, g) in enumerate(zip(heads, gold)) if h != g] # * (1.0/float(e)) eloss += (e) mloss += (e) errs.extend(loss) etotal += len(conll_sentence) if iSentence % 1 == 0: if len(errs) > 0 or len(lerrs) > 0 or len(posErrs) > 0 or len(segErrs) > 0 or len(mTagErrs) > 0: eerrs = (esum(errs + lerrs + posErrs + segErrs + mTagErrs)) eerrs.scalar_value() eerrs.backward() self.trainer.update() errs = [] lerrs = [] posErrs = [] segErrs = [] mTagErrs = [] renew_cg() print("Loss: %.4f" % (mloss / iSentence))
def Predict(self, conll_path): with open(conll_path, 'r') as conllFP: for iSentence, sentence in enumerate(read_conll(conllFP)): conll_sentence = [ entry for entry in sentence if isinstance(entry, utils.ConllEntry) ] for entry in conll_sentence: wordvec = self.wlookup[int(self.vocab.get( entry.norm, 0))] if self.wdims > 0 else None posvec = self.plookup[int( self.pos[entry.pos])] if self.pdims > 0 else None evec = self.elookup[int( self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0)) )] if self.external_embedding is not None else None entry.vec = concatenate( filter(None, [wordvec, posvec, evec])) entry.lstms = [entry.vec, entry.vec] entry.headfov = None entry.modfov = None entry.rheadfov = None entry.rmodfov = None if self.blstmFlag: lstm_forward = self.builders[0].initial_state() lstm_backward = self.builders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): lstm_forward = lstm_forward.add_input(entry.vec) lstm_backward = lstm_backward.add_input(rentry.vec) entry.lstms[1] = lstm_forward.output() rentry.lstms[0] = lstm_backward.output() if self.bibiFlag: for entry in conll_sentence: entry.vec = concatenate(entry.lstms) blstm_forward = self.bbuilders[0].initial_state() blstm_backward = self.bbuilders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): blstm_forward = blstm_forward.add_input(entry.vec) blstm_backward = blstm_backward.add_input( rentry.vec) entry.lstms[1] = blstm_forward.output() rentry.lstms[0] = blstm_backward.output() scores, exprs = self.__evaluate(conll_sentence, True) heads = decoder.parse_proj(scores) for entry, head in zip(conll_sentence, heads): entry.pred_parent_id = head entry.pred_relation = '_' dump = False if self.labelsFlag: for modifier, head in enumerate(heads[1:]): scores, exprs = self.__evaluateLabel( conll_sentence, head, modifier + 1) conll_sentence[modifier + 1].pred_relation = self.irels[max( enumerate(scores), key=itemgetter(1))[0]] renew_cg() if not dump: yield sentence
def Predict(self, conll_path): with open(conll_path, 'r') as conllFP: for iSentence, sentence in enumerate(read_conll(conllFP, self.c2i)): conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)] for entry in conll_sentence: wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None evec = self.elookup[int(self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0)))] if self.external_embedding is not None else None last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in entry.idChars])[-1] rev_last_state = self.char_rnn.predict_sequence([self.clookup[c] for c in reversed(entry.idChars)])[-1] # char_state = dynet.noise(concatenate([last_state, rev_last_state]), 0.2) # morph_logit = self.charSeqPredictor.predict_sequence(char_state) # morphID = self.morphs.get(entry.feats) # morphErrs.append(self.pick_neg_log(morph_logit, morphID)) # morph_emb = None # for i in morph_logit: # morph_emb += i * self.mlookup(i) entry.vec = concatenate(filter(None, [wordvec, evec, last_state, rev_last_state])) entry.ch_vec = concatenate([dynet.noise(fe,0.2) for fe in filter(None, [last_state, rev_last_state])]) entry.lstms = [entry.vec, entry.vec] entry.headfov = None entry.modfov = None entry.rheadfov = None entry.rmodfov = None if self.blstmFlag: morcat_layer = [entry.ch_vec for entry in conll_sentence] morph_logits = self.charSeqPredictor.predict_sequence(morcat_layer) predicted_morph_idx = [np.argmax(o.value()) for o in morph_logits] predicted_morphs = [self.id2morph[idx] for idx in predicted_morph_idx] for builder in self.pos_builder: builder.disable_dropout() lstm_forward = self.pos_builder[0].initial_state() lstm_backward = self.pos_builder[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): lstm_forward = lstm_forward.add_input(entry.vec) lstm_backward = lstm_backward.add_input(rentry.vec) entry.lstms[1] = lstm_forward.output() rentry.lstms[0] = lstm_backward.output() pos_embed = [] concat_layer = [concatenate(entry.lstms) for entry in conll_sentence] outputFFlayer = self.ffSeqPredictor.predict_sequence(concat_layer) predicted_posIDs = [np.argmax(o.value()) for o in outputFFlayer] predicted_postags = [self.id2pos[idx] for idx in predicted_posIDs] for predID, pred in zip(predicted_posIDs, outputFFlayer): if self.gold_pos: pos_embed.append(self.plookup[predID]) else: pos_embed.append(soft_embed(pred.value(), self.plookup)) for entry in conll_sentence: entry.vec = concatenate(entry.lstms) for builder in self.dep_builders: builder.disable_dropout() blstm_forward = self.dep_builders[0].initial_state() blstm_backward = self.dep_builders[1].initial_state() for entry, rentry, pembed, revpembed in zip(conll_sentence, reversed(conll_sentence), pos_embed, reversed(pos_embed)): blstm_forward = blstm_forward.add_input(concatenate([entry.vec, pembed])) blstm_backward = blstm_backward.add_input(concatenate([rentry.vec, revpembed])) entry.lstms[1] = blstm_forward.output() rentry.lstms[0] = blstm_backward.output() scores, exprs = self.__evaluate(conll_sentence, True) heads = decoder.parse_proj(scores) #Multiple roots: heading to the previous "rooted" one rootCount = 0 rootWid = -1 for index, head in enumerate(heads): if head == 0: rootCount += 1 if rootCount == 1: rootWid = index if rootCount > 1: heads[index] = rootWid rootWid = index for entry, head, pos, feats in zip(conll_sentence, heads, predicted_postags, predicted_morphs): entry.pred_parent_id = head entry.pred_relation = '_' entry.pred_pos = pos 
entry.pred_feats = feats dump = False if self.labelsFlag: for modifier, head in enumerate(heads[1:]): scores, exprs = self.__evaluateLabel(conll_sentence, head, modifier+1) conll_sentence[modifier+1].pred_relation = self.irels[max(enumerate(scores), key=itemgetter(1))[0]] renew_cg() if not dump: yield sentence
def parse(self, indices, arcs=None, pos_indices=None): states = self.states(indices, pos_indices) scores = np.array(self.score_arcs(states)) return parse_proj(scores, arcs)