def embed(self, x):
    if self.word_dropout > 0.0 and self.word_id_mask is None:
        batch_size = len(x) if xnmt.batcher.is_batched(x) else 1
        self.word_id_mask = [set(np.random.choice(self.vocab_size, int(self.vocab_size * self.word_dropout), replace=False))
                             for _ in range(batch_size)]
    emb_e = dy.parameter(self.embeddings)
    # single mode
    if not xnmt.batcher.is_batched(x):
        if self.train and self.word_id_mask and x in self.word_id_mask[0]:
            ret = dy.zeros((self.emb_dim,))
        else:
            ret = dy.pick(emb_e, index=x)
            if self.fix_norm != None:
                ret = dy.cdiv(ret, dy.l2_norm(ret))
                if self.fix_norm != 1:
                    ret *= self.fix_norm
    # minibatch mode
    else:
        ret = dy.concatenate_to_batch([dy.pick(emb_e, index=xi) for xi in x])
        if self.fix_norm != None:
            ret = dy.cdiv(ret, dy.l2_norm(ret))
            if self.fix_norm != 1:
                ret *= self.fix_norm
        if self.train and self.word_id_mask and any(x[i] in self.word_id_mask[i] for i in range(len(x))):
            dropout_mask = dy.inputTensor(np.transpose([[0.0] * self.emb_dim if x[i] in self.word_id_mask[i] else [1.0] * self.emb_dim
                                                        for i in range(len(x))]), batched=True)
            ret = dy.cmult(ret, dropout_mask)
    if self.train and self.weight_noise > 0.0:
        ret = dy.noise(ret, self.weight_noise)
    return ret
def forward_unlabeled(self, features, correct_output, pred_output):
    init_alphas = [-1e10] * self.num_labels
    init_alphas[self.label2idx[START]] = 0

    for_expr = dy.inputVector(init_alphas)
    for pos, obs in enumerate(features):
        alphas_t = []
        if correct_output[pos] != self.o_id:
            for next_tag in range(self.num_labels):
                obs_broadcast = dy.concatenate([dy.pick(obs, next_tag)] * self.num_labels)
                next_tag_expr = for_expr + self.transition[next_tag] + obs_broadcast + dy.inputVector([self.lambda_l] * self.num_labels) \
                    if next_tag != pred_output[pos] else for_expr + self.transition[next_tag] + obs_broadcast
                alphas_t.append(max_score(next_tag_expr))
        else:
            for next_tag in range(self.num_labels):
                obs_broadcast = dy.concatenate([dy.pick(obs, next_tag)] * self.num_labels)
                next_tag_expr = for_expr + self.transition[next_tag] + obs_broadcast + dy.inputVector([self.lambda_u] * self.num_labels) \
                    if next_tag != pred_output[pos] else for_expr + self.transition[next_tag] + obs_broadcast
                alphas_t.append(max_score(next_tag_expr))
        for_expr = dy.concatenate(alphas_t)
        # for_expr = dy.max_dim(alphas_t)  # dy.emax()
    terminal_expr = for_expr + self.transition[self.label2idx[STOP]]
    alpha = max_score(terminal_expr)
    return alpha
def viterbi(self, observations):
    backpointers = []
    init_pis = [0, 0]
    forward_mess = dy.inputVector(init_pis)
    transitions = [self.transitions[idx] for idx in range(2)]
    for i in range(len(observations) - 1):
        bp_t = []
        pi_t = []
        for next_tag in range(2):
            next_tag_expr = forward_mess + transitions[next_tag]
            next_tag_arr = next_tag_expr.npvalue()
            best_tag_id = np.argmax(next_tag_arr)
            bp_t.append(best_tag_id)
            pi_t.append(dy.pick(next_tag_expr, best_tag_id))
        forward_mess = dy.concatenate(pi_t) + observations[i]
        backpointers.append(bp_t)
    # find the highest scoring final state and the corresponding score
    best_tag_id = np.argmax(forward_mess.npvalue())
    path_score = dy.pick(forward_mess, best_tag_id)
    # backtracking
    best_path = [best_tag_id]
    for bp_t in reversed(backpointers):
        best_tag_id = bp_t[best_tag_id]
        best_path.append(best_tag_id)
    best_path.pop()
    best_path.reverse()
    return best_path, path_score
def learn(self, seq):
    for entry in seq:
        if entry.upos != 'NUM' and entry.upos != 'PROPN':
            losses = []
            unilemma = unicode(entry.lemma, 'utf-8')
            n_chars = len(unilemma)
            softmax_output_list = self._predict(entry.word, entry.upos, entry.xpos, entry.attrs,
                                                num_chars=n_chars + 1, gs_chars=unilemma)
            # print unilemma.encode('utf-8')  # , softmax_output_list
            for softmax, char in zip(softmax_output_list[:-1], unilemma):
                char_index = -1
                if char.lower() == char:
                    casing = 0
                else:
                    casing = 1
                char = char.lower()
                if char in self.encodings.char2int:
                    char_index = self.encodings.char2int[char]
                if char_index != -1:
                    losses.append(-dy.log(dy.pick(softmax[0], char_index)))
                losses.append(-dy.log(dy.pick(softmax[1], casing)))
                # print np.argmax(softmax[0].npvalue()), char_index, softmax
            losses.append(-dy.log(dy.pick(softmax_output_list[-1][0], len(self.encodings.char2int))))
            loss = dy.esum(losses)
            self.losses.append(loss)
def viterbi_decoding(self, observations):
    backpointers = []
    init_vvars = [-1e10] * self.dim_output
    init_vvars[self.sp_s] = 0
    for_expr = dy.inputVector(init_vvars)
    trans_exprs = [self.trans[idx] for idx in range(self.dim_output)]
    for obs in observations:
        bptrs_t = []
        vvars_t = []
        for next_tag in range(self.dim_output):
            next_tag_expr = for_expr + trans_exprs[next_tag]
            next_tag_arr = next_tag_expr.npvalue()
            best_tag_id = np.argmax(next_tag_arr)
            bptrs_t.append(best_tag_id)
            vvars_t.append(dy.pick(next_tag_expr, best_tag_id))
        for_expr = dy.concatenate(vvars_t) + obs
        backpointers.append(bptrs_t)
    terminal_expr = for_expr + trans_exprs[self.sp_e]
    terminal_arr = terminal_expr.npvalue()
    best_tag_id = np.argmax(terminal_arr)
    path_score = dy.pick(terminal_expr, best_tag_id)
    best_path = [best_tag_id]
    for bptrs_t in reversed(backpointers):
        best_tag_id = bptrs_t[best_tag_id]
        best_path.append(best_tag_id)
    start = best_path.pop()
    best_path.reverse()
    assert start == self.sp_s
    return best_path, path_score
def forward_backward(self, observations):
    init_alphas = [0, 0]
    forward_mess = dy.inputVector(init_alphas)
    alpha = []
    for i in range(len(observations) - 1):
        alphas_t = []
        for next_tag in range(2):
            obs_broadcast = dy.concatenate([dy.pick(observations[i], next_tag)] * 2)
            next_tag_expr = forward_mess + self.transitions[next_tag] + obs_broadcast
            alphas_t.append(self.log_sum_exp(next_tag_expr))
        forward_mess = dy.concatenate(alphas_t)
        alpha.append(forward_mess)

    init_betas = [0, 0]
    backward_mess = dy.inputVector(init_betas)
    beta = []
    for i in range(len(observations) - 1):
        beta_t = []
        for next_tag in range(2):
            obs = observations[len(observations) - i - 1]
            next_tag_expr = backward_mess + self.transitions[next_tag] + obs
            beta_t.append(self.log_sum_exp(next_tag_expr))
        backward_mess = dy.concatenate(beta_t)
        beta.append(backward_mess)

    mu = [x + y for x, y in zip(alpha, beta[::-1])]
    # compute marginal probabilities
    prob = [dy.pick(dy.softmax(w), 1) for w in mu]
    return prob
def viterbi(emissions, transition, start_idx, end_idx, norm=False):
    n_tags = emissions[0].dim()[0][0]
    backpointers = []

    inits = [-1e4] * n_tags
    inits[start_idx] = 0
    alphas = dy.inputVector(inits)
    alphas = dy.log_softmax(alphas) if norm else alphas

    for emission in emissions:
        next_vars = dy.colwise_add(dy.transpose(transition), alphas)
        best_tags = np.argmax(next_vars.npvalue(), 0)
        v_t = dy.max_dim(next_vars, 0)
        alphas = v_t + emission
        backpointers.append(best_tags)

    terminal_expr = alphas + dy.pick(transition, end_idx)
    best_tag = np.argmax(terminal_expr.npvalue())
    path_score = dy.pick(terminal_expr, best_tag)

    best_path = [best_tag]
    for bp_t in reversed(backpointers):
        best_tag = bp_t[best_tag]
        best_path.append(best_tag)
    _ = best_path.pop()
    best_path.reverse()
    return best_path, path_score
def sample_token(self, sentence):
    """
    Samples a token from the conditional distribution.
    @param sentence: a list of token indexes
    @return the index of the sampled token
    """
    ctxt = sentence[-3:]
    if self.tied:
        dy.renew_cg()
        W = dy.parameter(self.hidden_weights)
        E = dy.parameter(self.embedding_matrix)
        embeddings = [dy.pick(E, x) for x in ctxt]
        xdense = dy.concatenate(embeddings)
        ypred = dy.softmax(E * dy.tanh(W * xdense))
        Ypred = np.array(ypred.value())
        Ypred /= Ypred.sum()  # fixes numerical instabilities
        return choice(self.lexicon_size, p=Ypred)
    else:
        dy.renew_cg()
        O = dy.parameter(self.output_weights)
        W = dy.parameter(self.hidden_weights)
        E = dy.parameter(self.embedding_matrix)
        embeddings = [dy.pick(E, x) for x in ctxt]
        xdense = dy.concatenate(embeddings)
        ypred = dy.softmax(O * dy.tanh(W * xdense))
        Ypred = np.array(ypred.value())
        Ypred /= Ypred.sum()  # fixes numerical instabilities
        return choice(self.lexicon_size, p=Ypred)
def decode(self, emissions):
    """Viterbi decode to find the best sequence.

    :param emissions: List[dy.Expression]

    Returns:
        List[int], dy.Expression ((1,), B)
    """
    if self.add_ends:
        emissions = CRF._prep_input(emissions)
    backpointers = []
    transitions = self.transitions

    inits = [-1e4] * self.n_tags
    inits[self.start_idx] = 0
    alphas = dy.inputVector(inits)

    for emission in emissions:
        next_vars = dy.colwise_add(dy.transpose(transitions), alphas)
        best_tags = np.argmax(next_vars.npvalue(), 0)
        v_t = dy.max_dim(next_vars, 0)
        alphas = v_t + emission
        backpointers.append(best_tags)

    terminal_expr = alphas + dy.pick(transitions, self.end_idx)
    best_tag = np.argmax(terminal_expr.npvalue())
    path_score = dy.pick(terminal_expr, best_tag)

    best_path = [best_tag]
    for bp_t in reversed(backpointers):
        best_tag = bp_t[best_tag]
        best_path.append(best_tag)
    _ = best_path.pop()
    best_path.reverse()
    return best_path, path_score
def viterbi_decoding(self, observations):
    backpointers = []
    init_vvars = [-1e10] * (self.n_tags + 2)
    init_vvars[self.b_id] = 0  # <Start> has all the probability
    for_expr = dynet.inputVector(init_vvars)
    trans_exprs = [self.transitions[idx] for idx in range(self.n_tags + 2)]
    for obs in observations:
        bptrs_t = []
        vvars_t = []
        for next_tag in range(self.n_tags + 2):
            next_tag_expr = for_expr + trans_exprs[next_tag]
            next_tag_arr = next_tag_expr.npvalue()
            best_tag_id = np.argmax(next_tag_arr)
            bptrs_t.append(best_tag_id)
            vvars_t.append(dynet.pick(next_tag_expr, best_tag_id))
        for_expr = dynet.concatenate(vvars_t) + obs
        backpointers.append(bptrs_t)
    # Perform final transition to terminal
    terminal_expr = for_expr + trans_exprs[self.e_id]
    terminal_arr = terminal_expr.npvalue()
    best_tag_id = np.argmax(terminal_arr)
    path_score = dynet.pick(terminal_expr, best_tag_id)
    # Reverse over the backpointers to get the best path
    best_path = [best_tag_id]  # Start with the tag that was best for terminal
    for bptrs_t in reversed(backpointers):
        best_tag_id = bptrs_t[best_tag_id]
        best_path.append(best_tag_id)
    start = best_path.pop()  # Remove the start symbol
    best_path.reverse()
    assert start == self.b_id
    # Return best path and best path's score
    return best_path, path_score
def pick_gold_score(self, preds, golds):
    score = 0
    prev_tag = len(self.pos)
    for pred, gold in zip(preds, golds):
        score += dynet.pick(pred, gold) + dynet.pick(self.nertrans_lookup[gold], prev_tag)
        prev_tag = gold
    score += dynet.pick(self.nertrans_lookup[len(self.pos) + 1], prev_tag)
    return score
def learn(self, labels, f_cont, f_disc):
    # trim the pauses:
    f_cont = f_cont[labels[0].stop * self.dataset.sample_rate / 1000 - 80:
                    labels[-1].start * self.dataset.sample_rate / 1000 + 80]
    f_disc = f_disc[labels[0].stop * self.dataset.sample_rate / 1000 - 80:
                    labels[-1].start * self.dataset.sample_rate / 1000 + 80]
    clipped = False
    if self.clip != 0 and len(f_cont) > self.clip:
        clipped = True
        f_cont = f_cont[:self.clip]
        f_disc = f_disc[:self.clip]

    num_batches = (len(f_cont) + 1) / self.config.batch_size
    if (len(f_cont) + 1) % self.config.batch_size != 0:
        num_batches += 1

    att = None
    total_loss = 0
    last_proc = 0
    last_decoder_state = None
    for iBatch in range(num_batches):
        proc = iBatch * 100 / num_batches
        while last_proc + 10 < proc:
            last_proc += 10
            sys.stdout.write(" " + str(last_proc))
            sys.stdout.flush()
        dy.renew_cg()
        start_sample = iBatch * self.config.batch_size
        stop_sample = start_sample + self.config.batch_size
        if stop_sample > len(f_disc) + 1:
            stop_sample = len(f_disc) + 1
        pred_samples, att, samples_cont, rnn = self._predict(labels, start_sample, stop_sample,
                                                             f_cont, att, last_decoder_state, False)
        last_decoder_state = [s.value() for s in rnn.s()]
        losses = []
        for iSample in range(stop_sample - start_sample):
            if iSample + start_sample != len(f_cont):
                losses.append(-dy.log(dy.pick(pred_samples[iSample], f_disc[iSample + start_sample])))
            elif not clipped:
                losses.append(-dy.log(dy.pick(pred_samples[-1], 256)))  # special end of sequence
        loss = dy.esum(losses)
        total_loss += loss.value()
        loss.backward()
        self.trainer.update()

    p_one = 1
    if clipped:
        p_one = 0
    return total_loss, len(f_disc) + p_one
def train(self, trainning_set):
    losses = []
    for datapoint in trainning_set:
        sentence = datapoint[0]
        chars = datapoint[6]
        pos = datapoint[5]
        entity = datapoint[2]
        triggers = datapoint[3]
        rules = datapoint[-1]
        features = self.encode_sentence(sentence, pos, chars)
        labels = datapoint[4]

        entity_vec = features[entity]
        contexts = self.entity_attend(features, entity_vec)
        for i, c in enumerate(contexts):
            if i != entity:
                h_t = dy.concatenate([c, entity_vec])
                hidden = dy.tanh(self.lb.expr() * h_t + self.lb_bias.expr())
                out_vector = dy.softmax(self.lb2.expr() * hidden + self.lb2_bias.expr())
                if i in triggers:
                    label = labels[triggers.index(i)]
                else:
                    label = 0
                losses.append(-dy.log(dy.pick(out_vector, label)))

                if i in triggers and len(rules[triggers.index(i)]) > 1:
                    # Get decoding losses
                    last_output_embeddings = self.pattern_embeddings[0]
                    context = c
                    s = self.decoder_lstm.initial_state().add_input(
                        dy.concatenate([context, last_output_embeddings]))
                    for pattern in rules[triggers.index(i)]:
                        h_t = s.output()
                        context, A = self.attend(contexts, h_t)
                        # p_gen = dy.logistic(self.gen_c * context + self.gen_h * h_t + self.gen_i *
                        #                     dy.concatenate([context, last_output_embeddings]) + self.gen_bias)
                        out_vector = self.pt.expr() * dy.concatenate([context, h_t]) + self.pt_bias.expr()
                        probs = dy.softmax(out_vector)
                        losses.append(-dy.log(dy.pick(probs, pattern)))
                        last_output_embeddings = self.pattern_embeddings[pattern]
                        s = s.add_input(dy.concatenate([context, last_output_embeddings]))
        try:
            loss = dy.esum(losses)
            loss.backward()
            self.trainer.update()
            dy.renew_cg()
            losses = []
        except:
            pass
def score_sentence(self, observations, tags):
    assert len(observations) == len(tags)
    score_seq = [0]
    score = dy.scalarInput(0)
    tags = [t2i["<START>"]] + tags
    for i, obs in enumerate(observations):
        score = score + dy.pick(self.transitions[tags[i + 1]], tags[i]) + dy.pick(obs, tags[i + 1])
        score_seq.append(score.value())
    score = score + dy.pick(self.transitions[t2i["<STOP>"]], tags[-1])
    return score
def learn(self, seq):
    # remove compound words
    tmp = []
    for ss in seq:
        if not ss.is_compound_entry:
            tmp.append(ss)
    seq = tmp

    arc_matrix, aux_arc_matrix, proj_labels, softmax_morphology = self._predict_arc(seq, runtime=False)
    gold_heads = [entry.head for entry in seq]
    gold_labels = [entry.label for entry in seq]
    softmax_labels = self._predict_label(gold_heads, proj_labels, runtime=False)

    losses = []
    for gold_head, gold_label, arc_probs, softmax_label, entry in zip(gold_heads, gold_labels,
                                                                      arc_matrix[1:], softmax_labels, seq):
        label_index = self.encodings.label2int[gold_label]
        losses.append(-dy.log(arc_probs[gold_head]))
        losses.append(-dy.log(dy.pick(softmax_label, label_index)))

    if not self.config.predict_morphology:
        for gold_head, aux_probs, entry in zip(gold_heads, aux_arc_matrix[1:], seq):
            losses.append(-dy.log(aux_probs[gold_head]) * self.aux_softmax_weight)
    else:
        for softmax_morph, entry in zip(softmax_morphology, seq):
            loss_upos = -dy.log(dy.pick(softmax_morph[0], self.encodings.upos2int[entry.upos]))
            losses.append(loss_upos * (self.aux_softmax_weight / 3))
            if len(self.encodings.xpos2int) > 1:
                # stability check (some languages are missing attributes or XPOS,
                # resulting in numerical overflow during backpropagation)
                loss_xpos = -dy.log(dy.pick(softmax_morph[1], self.encodings.xpos2int[entry.xpos]))
                losses.append(loss_xpos * (self.aux_softmax_weight / 3))
            if len(self.encodings.attrs2int) > 1:
                # stability check (some languages are missing attributes or XPOS,
                # resulting in numerical overflow during backpropagation)
                loss_attrs = -dy.log(dy.pick(softmax_morph[2], self.encodings.attrs2int[entry.attrs]))
                losses.append(loss_attrs * (self.aux_softmax_weight / 3))

    loss = dy.esum(losses)
    self.batch_loss.append(loss)
def score_sentence(self, score_vecs, tags):
    assert(len(score_vecs) == len(tags))
    tags.insert(0, START_TAG)  # add start
    total = dy.scalarInput(.0)
    for i, obs in enumerate(score_vecs):
        # transition to next from i and emission
        next_tag = tags[i + 1]
        total += dy.pick(self.trans_mat[next_tag], tags[i]) + dy.pick(obs, next_tag)
    total += dy.pick(self.trans_mat[END_TAG], tags[-1])
    return total
def score_sentence(self, features, t_features, tags):
    score = dy.scalarInput(0)
    tags = [self.begin_tag] + tags
    for i, feat in enumerate(features):
        score = (score + dy.pick(t_features[i][tags[i + 1]], tags[i])
                 + dy.pick(feat, tags[i + 1]))
    # Last transition to end tag from last tag
    score = score + dy.pick(t_features[-1][self.end_tag], tags[-1])
    return score
def forward_labeled(self, features, tags):
    score = dy.scalarInput(0)
    tags = [self.label2idx[w] for w in tags]
    tags = [self.label2idx[START]] + tags
    for i, obs in enumerate(features):
        score = score + dy.pick(self.transition[tags[i + 1]], tags[i]) + dy.pick(obs, tags[i + 1])
    labeled_score = score + dy.pick(self.transition[self.label2idx[STOP]], tags[-1])
    return labeled_score
def score_sentence(self, observations, tags):
    assert len(observations) == len(tags)
    # score_seq = [0]
    score = dy.scalarInput(0)
    tags = [self.t2i["<SOS>"]] + [self.t2i[t] for t in tags]
    for i, obs in enumerate(observations):
        # + dy.pick(dy.lookup(self.transitions, tags[i+1]), tags[i])
        score = score + dy.pick(self.transitions[tags[i + 1]], tags[i]) + dy.pick(obs, tags[i + 1])
        # score_seq.append(score.value())
    score = score + dy.pick(self.transitions[self.t2i["<EOS>"]], tags[-1])
    return score
def score_sentence(self, observations, tags):
    if len(tags) == 0:
        tags = [-1] * len(observations)
    assert len(observations) == len(tags)
    score_seq = [0]
    score = dy.scalarInput(0)
    tags = [self.vocab.START_ID] + tags
    for i, obs in enumerate(observations):
        score = score + dy.pick(self.transitions[tags[i + 1]], tags[i]) + dy.pick(obs, tags[i + 1])
        score_seq.append(score.value())
    score = score + dy.pick(self.transitions[self.vocab.END_ID], tags[-1])
    return score
def forced_decoding(vecs, tags):
    # Initialize
    for_expr = dy.scalarInput(0)
    for_tag = S_T
    # Perform the forward pass through the sentence
    for i, vec in enumerate(vecs):
        my_tag = vt.w2i[tags[i]]
        for_expr = for_expr + dy.pick(TRANS_LOOKUP[my_tag], for_tag) + vec[my_tag]
        for_tag = my_tag
    for_expr = for_expr + dy.pick(TRANS_LOOKUP[S_T], for_tag)
    return for_expr
def train(inputs, targets, encoder, decoder, trainer, max_length=MAX_LENGTH):
    dy.renew_cg()

    encoder_hidden = encoder.initHidden()
    input_length = len(inputs)
    target_length = len(targets)
    encoder_outputs = [dy.zeros(hidden_dim) for _ in range(max_length)]

    losses = []
    for i in range(input_length):
        encoder_output, encoder_hidden = encoder(inputs[i], encoder_hidden)
        encoder_outputs[i] = encoder_output
    encoder_outputs = dy.concatenate(encoder_outputs, 1)

    decoder_input = SOS_token
    decoder_hidden = encoder_hidden

    if r.random() < teacher_forcing_ratio:
        use_teacher_forcing = True
    else:
        use_teacher_forcing = False

    if use_teacher_forcing:
        for i in range(target_length):
            decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden,
                                                        encoder_outputs, dropout=True)
            losses.append(-dy.log(dy.pick(decoder_output, targets[i])))
            decoder_input = targets[i]
    else:
        for i in range(target_length):
            decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden,
                                                        encoder_outputs, dropout=True)
            losses.append(-dy.log(dy.pick(decoder_output, targets[i])))
            probs = decoder_output.vec_value()
            decoder_input = probs.index(max(probs))
            if decoder_input == EOS_token:
                break

    loss = dy.esum(losses) / len(losses)
    loss.backward()
    trainer.update()

    return loss.value()
def __getitem__(self, key):
    """Get a single item.

    Returns:
        sequence item (expression); does not result in explicit conversion to list
    """
    if self.expr_list:
        return self.expr_list[key]
    else:
        if key < 0:
            key += len(self)
        if self.expr_tensor:
            return dy.pick(self.expr_tensor, key, dim=len(self.expr_tensor.dim()[0]) - 1)
        else:
            return dy.pick(self.expr_transposed_tensor, key, dim=0)
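# Illustrative sketch, not taken from the snippet above: dy.pick also accepts a
# `dim` argument, which is what __getitem__ relies on to slice one position out of
# a packed tensor. The 2x3 matrix here is invented purely for the demonstration.
import dynet as dy
import numpy as np

dy.renew_cg()
M = dy.inputTensor(np.arange(6.0).reshape(2, 3))  # shape ((2, 3), batch 1)
row = dy.pick(M, 1, dim=0)   # picks the second row    -> shape ((3,), 1)
col = dy.pick(M, 2, dim=1)   # picks the third column  -> shape ((2,), 1)
print(row.npvalue(), col.npvalue())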
def score_sentence(self, observations, tags):
    if not len(observations) == len(tags):
        raise AssertionError("len(observations) != len(tags)")
    score_seq = [0]
    score = dy.scalarInput(0)
    tags = [self.sp_s] + tags
    for i, obs in enumerate(observations):
        score = score + dy.pick(self.trans[tags[i + 1]], tags[i]) + dy.pick(obs, tags[i + 1])
        score_seq.append(score.value())
    score = score + dy.pick(self.trans[self.sp_e], tags[-1])
    return score
def train(self, rnnlm, train_quatrains, dev_quatrains):
    min_dev_loss = sys.maxsize
    for i in tqdm(range(self.epochs), desc='Training'):
        losses = []
        tqdm.write('Epoch {}'.format(i))
        total_loss = 0
        state = rnnlm.initialize()
        for count, quatrain in enumerate(train_quatrains):
            for token, (next_word, _, _, _) in zip(quatrain, quatrain[1:]):
                state, probs = rnnlm.add_input(state, token)
                loss = -dy.log(dy.pick(probs, next_word))
                losses.append(loss)
            if count % self.BATCH_SIZE == 0:
                loss = dy.esum(losses)
                total_loss += loss.value()
                loss.backward()
                self.trainer.update()
                losses = []
                dy.renew_cg()
                state = rnnlm.initialize()
            # if (count + 1) % 4 == 0:
            #     dy.renew_cg()
            #     state = rnnlm.initialize()

        dev_loss = 0
        state = rnnlm.initialize()
        for count, quatrain in enumerate(dev_quatrains):
            for token, (next_word, _, _, _) in zip(quatrain, quatrain[1:]):
                state, probs = rnnlm.add_input(state, token)
                loss = -dy.log(dy.pick(probs, next_word))
                dev_loss += loss.value()
            if (count + 1) % 4 == 0:
                dy.renew_cg()
                state = rnnlm.initialize()
        tqdm.write('Dev loss: {}'.format(dev_loss))
        if dev_loss < min_dev_loss:
            tqdm.write('Best dev loss. Saving parameters...')
            self.pc.save('model.pt')
            min_dev_loss = dev_loss
        else:
            tqdm.write('Not best dev loss. Restarting with smaller...')
            self.lr = self.lr * .5
            self.trainer.restart(self.lr)
        tqdm.write('Training Loss: {}'.format(total_loss))
        rnnlm.generate(rnnlm.initialize())
def predict_next_best_action(self, config, prev_action, sentence):
    """
    Predicts the next best couple (configuration, action)
    @param config: the current configuration
    @param sentence: the sentence to parse
    @return a couple (next_config, action_taken)
    """
    S, F, B, A, prefix_score = config
    if F is None and len(B) > 0:  # lexical action
        unk_token = self.word_codes[ArcEagerGenerativeParser.UNKNOWN_TOKEN]
        next_word = self.word_codes.get(sentence[B[0]], unk_token)
        X = self.make_representation(config, None, sentence, structural=False)
        if self.tied:
            dy.renew_cg()
            W = dy.parameter(self.hidden_weights)
            E = dy.parameter(self.input_embeddings)
            embeddings = [dy.pick(E, xidx) for xidx in X]
            xdense = dy.concatenate(embeddings)
            pred = dy.pickneglogsoftmax(E * dy.tanh(W * xdense), next_word)
            C = self.generate(config, local_score=-pred.value())
            action = (ArcEagerGenerativeParser.GENERATE, sentence[B[0]])
            return (C, action)
        else:
            dy.renew_cg()
            W = dy.parameter(self.hidden_weights)
            E = dy.parameter(self.input_embeddings)
            O = dy.parameter(self.output_embeddings)
            embeddings = [dy.pick(E, xidx) for xidx in X]
            xdense = dy.concatenate(embeddings)
            pred = dy.pickneglogsoftmax(O * dy.tanh(W * xdense), next_word)
            C = self.generate(config, local_score=-pred.value())
            action = (ArcEagerGenerativeParser.GENERATE, sentence[B[0]])
            return (C, action)
    else:  # structural action
        X = self.make_representation(config, None, sentence, structural=True)
        dy.renew_cg()
        W = dy.parameter(self.hidden_weights)
        E = dy.parameter(self.input_embeddings)
        A = dy.parameter(self.action_weights)
        embeddings = [dy.pick(E, xidx) for xidx in X]
        xdense = dy.concatenate(embeddings)
        preds = dy.softmax(A * dy.tanh(W * xdense)).npvalue()
        action_mask = self.mask_actions(config, prev_action, len(sentence))
        max_idx = np.argmax(preds * action_mask)
        score = log(preds[max_idx])
        C = self.actions[max_idx](config, local_score=score)  # this just execs the predicted action
        action = self.rev_action_codes[max_idx]
        return (C, action)
def attend(blstm_outputs, h_t, W_c, v_a, W__a, U__a):
    # iterate through input states to compute alphas
    # print 'computing scores...'
    # scores = [W_a * pc.concatenate([h_t, h_input]) for h_input in blstm_outputs]
    scores = [v_a * pc.tanh(W__a * h_t + U__a * h_input) for h_input in blstm_outputs]
    # print 'computed scores'

    # normalize to alphas using softmax
    # print 'computing alphas...'
    alphas = pc.softmax(pc.concatenate(scores))
    # print 'computed alphas...'

    # compute c using alphas
    # print 'computing c...'
    # import time
    # s = time.time()
    # dim = len(blstm_outputs[0].vec_value())
    # stacked_alphas = pc.concatenate_cols([alphas for j in xrange(dim)])
    # stacked_vecs = pc.concatenate_cols([h_input for h_input in blstm_outputs])
    # c = pc.esum(pc.cwise_multiply(stacked_vecs, stacked_alphas))
    # print "stack time:", time.time() - s
    # s = time.time()
    c = pc.esum([h_input * pc.pick(alphas, j) for j, h_input in enumerate(blstm_outputs)])
    # print "pick time:", time.time() - s
    # print 'computed c'
    # print 'c len is {}'.format(len(c.vec_value()))

    # compute output state h~ using c and the decoder's h
    # (global attention variation from Luong and Manning 2015)
    # print 'computing h~...'
    h_output = pc.tanh(W_c * pc.concatenate([h_t, c]))
    # print 'len of h_output is {}'.format(len(h_output.vec_value()))
    # print 'computed h~'

    return h_output, alphas, W__a.value()
def train(self, words, lemmas, gold, bad):
    dy.renew_cg()
    W = dy.parameter(self.pW)
    b = dy.parameter(self.pb)
    losses = []
    gold_scores = []
    bad_scores = []
    for item in gold:
        lf, denotation = item[0], item[1]
        feature = self.extract_feature(words, lemmas, lf, denotation)
        feature_vec = dy.vecInput(self.nfeatures)
        feature_vec.set(feature)
        gold_scores.append(W * feature_vec + b)
    for item in bad:
        lf, denotation = item[0], item[1]
        feature = self.extract_feature(words, lemmas, lf, denotation)
        feature_vec = dy.vecInput(self.nfeatures)
        feature_vec.set(feature)
        bad_scores.append(W * feature_vec + b)
    log_prob = dy.log_softmax(dy.concatenate(gold_scores + bad_scores))
    for i in range(len(gold_scores)):
        losses.append(dy.pick(log_prob, i))
    return -dy.esum(losses)
def get_loss(self, input_sentence, label):
    dy.renew_cg()
    w = dy.parameter(self.w)
    b1 = dy.parameter(self.b1)
    u = dy.parameter(self.u)
    b2 = dy.parameter(self.b2)
    embedded = self.embed_sentence(input_sentence)
    encoded = self.encoded_sentence(embedded)
    acc_lstm = self.run_lstm(self.accecptor_lstm.initial_state(), encoded)
    mlp_input = acc_lstm[-1]
    h = dy.tanh((w * mlp_input) + b1)
    y_pred = dy.softmax((u * h) + b2)
    loss = -dy.log(dy.pick(y_pred, label))
    return loss
def decode(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)
    w1 = dy.parameter(attention_w1)
    input_mat = dy.concatenate_cols(vectors)
    w1dt = None

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))
    loss = []

    for char in output:
        # w1dt can be computed and cached once for the entire decoding phase
        w1dt = w1dt or w1 * input_mat
        vector = dy.concatenate([attend(input_mat, s, w1dt), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        last_output_embeddings = output_lookup[char]
        loss.append(-dy.log(dy.pick(probs, char)))
    loss = dy.esum(loss)
    return loss
def forward(self, observations):
    def log_sum_exp(scores):
        npval = scores.npvalue()
        argmax_score = np.argmax(npval)
        max_score_expr = dy.pick(scores, argmax_score)
        max_score_expr_broadcast = dy.concatenate([max_score_expr] * self.tagset_size)
        return max_score_expr + dy.log(
            dy.sum_cols(dy.transpose(dy.exp(scores - max_score_expr_broadcast))))

    init_alphas = [-1e10] * self.tagset_size
    init_alphas[t2i[START_TAG]] = 0
    for_expr = dy.inputVector(init_alphas)
    for obs in observations:
        alphas_t = []
        for next_tag in range(self.tagset_size):
            obs_broadcast = dy.concatenate([dy.pick(obs, next_tag)] * self.tagset_size)
            next_tag_expr = for_expr + self.transitions[next_tag] + obs_broadcast
            alphas_t.append(log_sum_exp(next_tag_expr))
        for_expr = dy.concatenate(alphas_t)
    terminal_expr = for_expr + self.transitions[t2i["<STOP>"]]
    alpha = log_sum_exp(terminal_expr)
    return alpha
def _forward(self, emissions):
    """Viterbi forward to calculate all path scores.

    :param emissions: List[dy.Expression]

    Returns:
        dy.Expression ((1,), B)
    """
    init_alphas = [-1e4] * self.n_tags
    init_alphas[self.start_idx] = 0

    alphas = dy.inputVector(init_alphas)
    transitions = self.transitions
    # len(emissions) == T
    for emission in emissions:
        add_emission = dy.colwise_add(transitions, emission)
        scores = dy.colwise_add(dy.transpose(add_emission), alphas)
        # dy.logsumexp takes a list of dy.Expression and computes logsumexp
        # elementwise across the list, i.e. over element [0] of each expression,
        # then [1], and so on. This means the transition scores for a given tag
        # need to be in the columns.
        alphas = dy.logsumexp([x for x in scores])
    last_alpha = alphas + dy.pick(transitions, self.end_idx)
    alpha = dy.logsumexp([x for x in last_alpha])
    return alpha
def score_sentence(self, emissions, tags):
    """Get the score of a given sentence.

    :param emissions: List[dy.Expression ((H,), B)]
    :param tags: List[int]

    Returns:
        dy.Expression ((1,), B)
    """
    tags = np.concatenate((np.array([self.start_idx], dtype=int), tags))
    score = dy.scalarInput(0)
    transitions = self.transitions
    for i, e in enumerate(emissions):
        # Due to Dynet being column based it is best to use the transition
        # matrix so that x -> y is T[y, x].
        score += dy.pick(dy.pick(transitions, tags[i + 1]), tags[i]) + dy.pick(e, tags[i + 1])
    score += dy.pick(dy.pick(transitions, self.end_idx), tags[-1])
    return score
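# Small sketch (not from the function above): with a transition matrix laid out as
# T[y, x] = score of moving from tag x to tag y, the nested dy.pick call above reads
# a single cell. The 3x3 matrix here is invented for the illustration.
import dynet as dy
import numpy as np

dy.renew_cg()
T = dy.inputTensor(np.arange(9.0).reshape(3, 3))  # T[y, x]
y, x = 2, 1
cell = dy.pick(dy.pick(T, y), x)  # row y first, then element x of that row
print(cell.value(), T.npvalue()[y, x])  # both print 7.0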
def attend2(blstm_outputs, s_prev, y_feedback, v_a, W_a, U_a, U_o, V_o, C_o):
    # attention mechanism - Bahdanau style
    # iterate through input states to compute alphas
    # print 'computing scores...'
    # W_a: hidden x hidden, U_a: hidden x 2 hidden, v_a: hidden, each score: scalar
    scores = [v_a * pc.tanh(W_a * s_prev + U_a * h_j) for h_j in blstm_outputs]
    alphas = pc.softmax(pc.concatenate(scores))

    # c_i: 2 hidden
    c_i = pc.esum([h_input * pc.pick(alphas, j) for j, h_input in enumerate(blstm_outputs)])

    # U_o = 2l x hidden, V_o = 2l x input, C_o = 2l x 2 hidden
    attention_output_vector = U_o * s_prev + V_o * y_feedback + C_o * c_i

    return attention_output_vector, alphas
def transduce(self, inputs, train):
    xs = inputs[:self.max_length]
    if not xs:
        return []
    for i in range(self.lstm_layers):
        for n, d in ("f", 1), ("b", -1):
            Wr, br, Wh = [self.params["%s%d%s" % (p, i, n)] for p in ("Wr", "br", "Wh")]
            hs_ = self.params["rnn%d%s" % (i, n)].initial_state().transduce(xs[::d])
            hs = [hs_[0]]
            for t in range(1, len(hs_)):
                r = dy.logistic(Wr * dy.concatenate([hs[t - 1], xs[t]]) + br)
                hs.append(dy.cmult(r, hs_[t]) + dy.cmult(1 - r, Wh * xs[t]))
            xs = hs
    if train:
        x = dy.dropout_dim(dy.concatenate(xs, 1), 1, self.dropout)
        xs = [dy.pick(x, i, 1) for i in range(len(xs))]
    return xs
def decode(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))
    loss = []
    for char in output:
        vector = dy.concatenate([attend(vectors, s), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        last_output_embeddings = output_lookup[char]
        loss.append(-dy.log(dy.pick(probs, char)))
    loss = dy.esum(loss)
    return loss
def create_network_return_loss(inputs, expected_output):
    '''
    inputs is a list of numbers
    '''
    dy.renew_cg()
    W = dy.parameter(pW)  # from parameters to expressions
    b = dy.parameter(pB)
    if len(inputs) > documentLength:
        inputs = inputs[0:documentLength]
    emb_vectors = [lookup[i] for i in inputs]
    while len(emb_vectors) < documentLength:
        pad = dy.vecInput(embDimension)
        pad.set(np.zeros(embDimension))
        emb_vectors.append(pad)
    net_input = dy.concatenate(emb_vectors)
    net_output = dy.softmax((W * net_input) + b)
    loss = -dy.log(dy.pick(net_output, expected_output))
    return loss
def CalculateLossForWord(word_obj, fValidation=False, fRunning=False):
    dy.renew_cg()

    if not fRunning:
        gold_lang = word_obj['tag']

    # add a bos before and after
    seq = ['*BOS*'] + list(word_obj['word']) + ['*BOS*']

    # get all the char encodings for the daf
    char_embeds = [let_enc(let) for let in seq]

    # run it through the bilstm
    char_bilstm_outputs = bilstm(char_embeds)
    bilstm_output = dy.concatenate([char_bilstm_outputs[0], char_bilstm_outputs[-1]])

    mlp_input = bilstm_output
    mlp_out = lang_mlp(mlp_input)
    predicted_lang = lang_tags[np.argmax(mlp_out.npvalue())]
    confidence = (mlp_out.npvalue()[:2] / np.sum(mlp_out.npvalue()[:2])).tolist()  # skip ambiguous

    # if we aren't doing validation, calculate the loss
    if not fValidation and not fRunning:
        loss = -dy.log(dy.pick(mlp_out, gold_lang))
    # otherwise, set the answer to be the argmax
    elif not fRunning and fValidation:
        loss = None
        lang_conf_matrix(np.argmax(mlp_out.npvalue()), gold_lang)
    else:
        return predicted_lang, confidence

    pos_prec = 1 if predicted_lang == lang_tags[gold_lang] else 0

    tagged_word = {'word': word_obj['word'], 'tag': predicted_lang, 'confidence': confidence,
                   'gold_tag': lang_tags[gold_lang]}

    if fValidation:
        return pos_prec, tagged_word

    return loss, pos_prec
# regular lookup
a = lp[1].npvalue()
b = lp[2].npvalue()
c = lp[3].npvalue()

# batch lookup instead of single elements.
# two ways of doing this.
abc1 = dy.lookup_batch(lp, [1, 2, 3])
print(abc1.npvalue())

abc2 = lp.batch([1, 2, 3])
print(abc2.npvalue())

print(np.hstack([a, b, c]))

# use pick and pickneglogsoftmax in batch mode
# (must be used in conjunction with lookup_batch):
print("\nPick")
W = dy.parameter(m.add_parameters((5, 10)))
h = W * lp.batch([1, 2, 3])
print(h.npvalue())
print(dy.pick_batch(h, [1, 2, 3]).npvalue())
print(dy.pick(W * lp[1], 1).value(), dy.pick(W * lp[2], 2).value(), dy.pick(W * lp[3], 3).value())

# using pickneglogsoftmax_batch
print("\nPick neg log softmax")
print((-dy.log(dy.softmax(h))).npvalue())
print(dy.pickneglogsoftmax_batch(h, [1, 2, 3]).npvalue())
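# Illustrative sketch, not taken from the examples above: the per-token loss pattern
# -dy.log(dy.pick(dy.softmax(scores), label)) that appears throughout this page is
# mathematically equivalent to dy.pickneglogsoftmax(scores, label), which fuses both
# steps and is the more numerically stable form. The sizes below are made up.
import dynet as dy

m2 = dy.ParameterCollection()
pW2 = m2.add_parameters((5, 10))

dy.renew_cg()
W2 = dy.parameter(pW2)
scores = W2 * dy.inputVector([0.1 * i for i in range(10)])  # unnormalized scores for 5 classes
label = 3
loss_a = -dy.log(dy.pick(dy.softmax(scores), label))
loss_b = dy.pickneglogsoftmax(scores, label)
print(loss_a.value(), loss_b.value())  # the two values agree up to floating point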
def parse(self, t, oracle_actions=None):
    dy.renew_cg()
    self.NULL_REP = self.WORDS_LOOKUP[self.nwords - 1]
    if oracle_actions:
        oracle_actions = list(oracle_actions)
        oracle_actions.reverse()
    toks = list(t)
    toks.reverse()
    stack = []
    buffer = []
    W1 = dy.parameter(self.pW1)
    b1 = dy.parameter(self.pb1)
    W_act = dy.parameter(self.pW_act)
    b_act = dy.parameter(self.pb_act)
    losses = []
    for tok in toks:
        tok_embedding = self.WORDS_LOOKUP[tok]
        buffer.append(Head(self.vocab.i2w[tok], tok_embedding))

    while not (len(stack) == 1 and len(buffer) == 0):
        # based on parser state, get valid actions
        valid_actions = []
        if len(buffer) > 0:  # can only shift if there are elements in the buffer
            valid_actions += [SHIFT]
        if len(stack) >= 2:  # can only reduce if there are at least 2 elements on the stack
            valid_actions += [REDUCE_L, REDUCE_R]

        # compute probability of each of the actions and choose an action
        # either from the oracle or, if there is no oracle, based on the model
        action = valid_actions[0]
        log_probs = None
        if len(valid_actions) > 1:
            representations = self.extract_features(stack, buffer)
            h = dy.cube(W1 * dy.concatenate(representations) + b1)
            logits = W_act * h + b_act
            log_probs = dy.log_softmax(logits, valid_actions)
            if oracle_actions is None:
                action = max(enumerate(log_probs.vec_value()), key=itemgetter(1))[0]
        if oracle_actions is not None:
            action = oracle_actions.pop()
        if log_probs is not None:
            # append the action-specific loss
            losses.append(dy.pick(log_probs, action))

        # execute the action to update the parser state
        if action == SHIFT:
            token = buffer.pop()
            stack.append(token)
        else:  # one of the reduce actions
            right = stack.pop()
            left = stack.pop()
            head, modifier = (left, right) if action == REDUCE_R else (right, left)
            # add the tokens and their embeddings into the children list
            if action == REDUCE_R:
                head.add_child(modifier, 'right')
            else:
                head.add_child(modifier, 'left')
            stack.append(head)
            if oracle_actions is None:
                print('{0} --> {1}'.format(head.word, modifier.word))

    # the head of the tree that remains at the top of the stack is now the root
    if oracle_actions is None:
        head = stack.pop().word
        print('ROOT --> {0}'.format(head))
    return -dy.esum(losses) if losses else None
def CalculateLossForDaf(daf, fValidation=False, fRunning=False):
    dy.renew_cg()
    tagged_daf = {"words": [], "file": daf["file"]}
    daf = daf["words"]

    # add a bos before and after
    seq = ['*BOS*'] + list(' '.join([word for word, _, _, _ in daf])) + ['*BOS*']

    # get all the char encodings for the daf
    char_embeds = [let_enc(let) for let in seq]

    # run it through the bilstm
    char_bilstm_outputs = bilstm(char_embeds)

    # now iterate and get all the separate word representations by concatenating the bilstm output
    # before and after the word
    word_bilstm_outputs = []
    iLet_start = 0
    for iLet, char in enumerate(seq):
        # if it is a bos, check if it's at the end of the sequence
        if char == '*BOS*':
            if iLet + 1 == len(seq):
                char = ' '
            else:
                continue
        # if we are at a space, take this bilstm output and the one at the letter start
        if char == ' ':
            cur_word_bilstm_output = dy.concatenate([char_bilstm_outputs[iLet_start], char_bilstm_outputs[iLet]])
            # add it in
            word_bilstm_outputs.append(cur_word_bilstm_output)
            # set the iLet_start counter to here
            iLet_start = iLet

    # safe-check, make sure word bilstm outputs length is the same as the daf
    if len(word_bilstm_outputs) != len(daf):
        log_message('Size mismatch!! word_bilstm_outputs: ' + str(len(word_bilstm_outputs)) + ', daf: ' + str(len(daf)))

    prev_pos_lstm_state = prev_pos_lstm.initial_state().add_input(pos_enc('*BOS*'))

    all_losses = []
    pos_prec = 0.0
    rough_pos_prec = 0.0
    pos_items = 0
    class_prec = 0.0
    class_items = 0.0

    # now iterate through the bilstm outputs, and each word in the daf
    for (word, gold_word_class, gold_word_pos, gold_word_lang), bilstm_output in zip(daf, word_bilstm_outputs):
        should_backprop = gold_word_class == 1

        # create the mlp input, a concatenate of the bilstm output and of the prev pos output
        mlp_input = dy.concatenate([bilstm_output, prev_pos_lstm_state.output()])

        # run through the class mlp
        class_mlp_output = class_mlp(mlp_input)

        predicted_word_class = np.argmax(class_mlp_output.npvalue())
        confidence = np.max(class_mlp_output.npvalue()) / np.sum(class_mlp_output.npvalue())

        # prec
        if should_backprop:
            class_prec += 1 if predicted_word_class == gold_word_class else 0
            class_items += 1

        # if we aren't doing validation, calculate the loss
        if not fValidation and not fRunning:
            if should_backprop:
                all_losses.append(-dy.log(dy.pick(class_mlp_output, gold_word_class)))
            word_class_ans = gold_word_class
        # otherwise, set the answer to be the argmax
        else:
            word_class_ans = predicted_word_class

        # if the word_class answer is 1, do the pos!
        # alternatively, if validating and it's aramaic, do the pos!
        if word_class_ans or (fValidation and gold_word_lang) or (fRunning and gold_word_lang):
            # run the pos mlp output
            pos_mlp_output = pos_mlp(mlp_input)
            try:
                temp_pos_array = pos_mlp_output.npvalue()
                possible_pos_array = np.zeros(temp_pos_array.shape)
                pos_list = pos_hashtable[word]
                # pos_list.add('')  # concat 'unknown' as possible pos
                possible_pos_indices = [pos_vocab[temp_pos] for temp_pos in pos_list]
                possible_pos_array[possible_pos_indices] = temp_pos_array[possible_pos_indices]
            except KeyError:
                possible_pos_array = pos_mlp_output.npvalue()
                # if fValidation:
                #     possible_pos_array[pos_vocab['']] = 0.0  # don't allow validation to guess UNK
                #     # b/c it never trained against that TODO this makes sense, right?

            predicted_word_pos = pos_vocab.getItem(np.argmax(possible_pos_array))
            confidence = np.max(possible_pos_array) / np.sum(possible_pos_array)

            # prec
            if should_backprop:
                pos_prec += 1 if predicted_word_pos == gold_word_pos else 0
                rough_pos_prec += 1 if predicted_word_pos[0] == gold_word_pos[0] else 0  # you got at least the rough pos right
                pos_items += 1

            # if we aren't doing validation, calculate the loss
            if not fValidation and not fRunning:
                if should_backprop:
                    all_losses.append(-dy.log(dy.pick(pos_mlp_output, pos_vocab[gold_word_pos])))
                word_pos_ans = gold_word_pos
            # otherwise, set the answer to be the argmax
            elif not fRunning and fValidation:
                if should_backprop:
                    pos_conf_matrix(pos_vocab[predicted_word_pos], pos_vocab[gold_word_pos])
                word_pos_ans = predicted_word_pos
            else:
                word_pos_ans = predicted_word_pos

            # run through the prev-pos lstm
            predicted = predicted_word_pos
            prev_pos_lstm_state = prev_pos_lstm_state.add_input(pos_enc(word_pos_ans))
        # if the answer is 0, put a '' through the prev-pos lstm
        else:
            predicted = 'UNK'
            prev_pos_lstm_state = prev_pos_lstm_state.add_input(pos_enc(''))

        tagged_daf["words"].append({"word": word, "gold_pos": gold_word_pos, "gold_class": gold_word_class,
                                    "predicted": predicted, "confidence": confidence, "lang": gold_word_lang})

    if fRunning:
        return tagged_daf

    pos_prec = pos_prec / pos_items if pos_items > 0 else None
    rough_pos_prec = rough_pos_prec / pos_items if pos_items > 0 else None
    class_prec = class_prec / class_items if class_items > 0 else None

    if fValidation:
        return class_prec, pos_prec, tagged_daf, rough_pos_prec

    total_loss = dy.esum(all_losses) if len(all_losses) > 0 else None
    return total_loss, class_prec, pos_prec, rough_pos_prec
def CalculateLossForDaf(daf, fValidation=False, fRunning=False):
    dy.renew_cg()
    tagged_daf = {"words": []}

    # add a bos before and after
    seq = ['*BOS*'] + list(' '.join([word for word, _ in daf])) + ['*BOS*']

    # get all the char encodings for the daf
    char_embeds = [let_enc(let) for let in seq]

    # run it through the bilstm
    char_bilstm_outputs = bilstm(char_embeds)

    # now iterate and get all the separate word representations by concatenating the bilstm output
    # before and after the word
    word_bilstm_outputs = []
    iLet_start = 0
    for iLet, char in enumerate(seq):
        # if it is a bos, check if it's at the end of the sequence
        if char == '*BOS*':
            if iLet + 1 == len(seq):
                char = ' '
            else:
                continue
        # if we are at a space, take this bilstm output and the one at the letter start
        if char == ' ':
            cur_word_bilstm_output = dy.concatenate([char_bilstm_outputs[iLet_start], char_bilstm_outputs[iLet]])
            # add it in
            word_bilstm_outputs.append(cur_word_bilstm_output)
            # set the iLet_start counter to here
            iLet_start = iLet

    # safe-check, make sure word bilstm outputs length is the same as the daf
    if len(word_bilstm_outputs) != len(daf):
        log_message('Size mismatch!! word_bilstm_outputs: ' + str(len(word_bilstm_outputs)) + ', daf: ' + str(len(daf)))

    prev_lang_lstm_state = prev_lang_lstm.initial_state().add_input(lang_enc('*BOS*'))

    all_losses = []
    lang_prec = 0.0
    lang_items = 0

    # now iterate through the bilstm outputs, and each word in the daf
    for (word, gold_word_lang), bilstm_output in zip(daf, word_bilstm_outputs):
        # create the mlp input, a concatenate of the bilstm output and of the prev lang output
        mlp_input = dy.concatenate([bilstm_output, prev_lang_lstm_state.output()])

        # run through the class mlp
        lang_mlp_output = lang_mlp(mlp_input)

        predicted_word_lang = lang_vocab.getItem(np.argmax(lang_mlp_output.npvalue()))
        confidence = np.max(lang_mlp_output.npvalue()) / np.sum(lang_mlp_output.npvalue())

        lang_prec += 1 if predicted_word_lang == gold_word_lang else 0
        lang_items += 1

        tagged_daf["words"].append({"word": word, "predicted_lang": predicted_word_lang, "confidence": confidence})

        # if we aren't doing validation, calculate the loss
        if not fValidation and not fRunning:
            all_losses.append(-dy.log(dy.pick(lang_mlp_output, lang_vocab[gold_word_lang])))
            word_pos_ans = gold_word_lang
        # otherwise, set the answer to be the argmax
        elif not fRunning and fValidation:
            lang_conf_matrix(lang_vocab[predicted_word_lang], lang_vocab[gold_word_lang])
            word_pos_ans = predicted_word_lang
        else:
            continue

        # run through the prev-lang lstm
        prev_lang_lstm_state = prev_lang_lstm_state.add_input(lang_enc(word_pos_ans))
        # prev_pos_lstm_state = prev_pos_lstm_state.add_input(pos_enc(''))

    lang_prec = lang_prec / lang_items if lang_items > 0 else None
    # class_prec = class_prec / class_items if class_items > 0 else None

    if fValidation:
        return lang_prec, tagged_daf
    if fRunning:
        return tagged_daf

    total_loss = dy.esum(all_losses) if len(all_losses) > 0 else None
    return total_loss, lang_prec