def evaluate():
    wordss, tagss, lengths = next(test_helper.gen_batch())
    sentence_in = prepare_sequence(wordss, word_to_ix)
    target_tag_seqs = prepare_sequence(tagss, tag_to_ix)
    predict_scores, predict_tag_seqs = model(sentence_in, lengths)
    for tag in ['a', 'b', 'c']:
        f1_score(target_tag_seqs, predict_tag_seqs, tag, tag_to_ix, lengths)
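# NOTE: evaluate() above scores one tag at a time with
# f1_score(target_tag_seqs, predict_tag_seqs, tag, tag_to_ix, lengths).
# That helper is not included in this collection; the sketch below is a
# hypothetical, minimal token-level per-tag F1 over padded batches, written
# only to illustrate the assumed interface (per_tag_f1 is an invented name).
def per_tag_f1(target_seqs, predict_seqs, tag, tag_to_ix, lengths):
    """Token-level precision/recall/F1 for a single tag over a padded batch."""
    tag_id = tag_to_ix[tag]
    tp = fp = fn = 0
    for gold, pred, length in zip(target_seqs, predict_seqs, lengths):
        # only the first `length` positions of each sequence are valid
        for g, p in zip(gold[:length], pred[:length]):
            if p == tag_id and g == tag_id:
                tp += 1
            elif p == tag_id:
                fp += 1
            elif g == tag_id:
                fn += 1
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    print("%s: precision %.4f, recall %.4f, f1 %.4f" % (tag, precision, recall, f1))
    return f1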
def pre_losses(self):
    if not self.all_pre_preds:
        return None
    pre_preds = np.concatenate(self.all_pre_preds)
    labels = np.concatenate(self.all_labels)
    micro = util.f1_score(pre_preds, labels, 0.5)
    macro = util.f1_score(pre_preds, labels, 0.5, average='macro')
    return micro, macro
def find_context_index(question, answer, contexts):
    context_list = splitSentence(contexts)
    print(context_list)
    print("the length of the list is " + str(len(context_list)))
    sim = 0.0
    index = 0
    qas = answer + " " + question
    for i in range(len(context_list)):
        print("processing the " + str(i) + "-th sentence")
        score = f1_score(qas, context_list[i])
        if sim < score:
            sim = score
            index = i
            print("got a new best index: " + str(index))
    print("the best index is: " + str(index))
    print("the best context is: " + context_list[index])
    return index + 1
def calc_f1score():
    prs = [0.529, 0.5892]
    rrs = [0.6609, 0.6548]
    idx = [200, 800]
    for (i, pr, rr) in zip(idx, prs, rrs):
        print(i, pr, rr, round(f1_score(pr, rr), 4))
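# NOTE: calc_f1score() above and group_eval() below call f1_score(pr, rr) on a
# precision/recall pair. The helper they rely on is not shown here; a minimal
# sketch of such a harmonic-mean helper (an assumption, with an invented name,
# not the original implementation) is:
def f1_from_pr(precision, recall):
    """Harmonic mean of precision and recall; returns 0.0 when both are 0."""
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

# Example: f1_from_pr(0.529, 0.6609) is roughly 0.588.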
def decode_validate(model, sess, q_valid, reverse_src_vocab, reverse_tgt_vocab,
                    save_dir, epoch, sample=5, print_decode=False):
    print_decode = print_decode if print_decode else FLAGS.print_decode
    num_decoded = 0
    # add f1, em measure on this decoding
    f1 = 0.
    em = 0.
    saved_list = []
    # since we did beam-decode, I can measure EM on the top-5 result
    with open(pjoin(save_dir, "valid_decode_e" + str(epoch) + ".txt"), "wb") as f:
        for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
                q_valid, 1, FLAGS.input_len, FLAGS.query_len):
            # transpose them because of how this model is set up
            source_tokens, source_mask, target_tokens, target_mask = \
                source_tokens.T, source_mask.T, target_tokens.T, target_mask.T
            # detokenize can handle a batch
            src_sent = detokenize(source_tokens, reverse_src_vocab)
            tgt_sent = detokenize(target_tokens, reverse_tgt_vocab)
            # Encode
            encoder_output = model.encode(sess, source_tokens, source_mask)
            # Decode
            beam_toks, probs = decode_beam(model, sess, encoder_output, FLAGS.beam_size)
            # De-tokenize
            beam_strs = detokenize(beam_toks, reverse_tgt_vocab, decode=True)
            best_str = beam_strs[0]
            # we can also get probability on them
            num_decoded += 1
            # tgt_sent's first array element is always [""]
            f1 += f1_score(best_str, " ".join(tgt_sent[1:]))
            em += exact_match_score(best_str, " ".join(tgt_sent[1:]))
            if num_decoded <= sample:
                logging.info("input: {}".format(" ".join(src_sent)))
                logging.info("truth: {}".format(" ".join(tgt_sent[1:])))
                logging.info("decoded: {}".format(best_str))
                logging.info("")
            saved_list.append({
                "input": src_sent,
                "truth": tgt_sent[1:],
                "decoded": best_str
            })
    return float(f1) / float(num_decoded), float(em) / float(num_decoded), saved_list
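# NOTE: decode_validate() above and evaluate_answer() below score strings with
# f1_score / exact_match_score. Those helpers are not defined in this file; the
# sketch below follows the usual SQuAD-style token-overlap convention and is an
# assumption about their behavior (token_f1_score / exact_match are invented names).
from collections import Counter

def token_f1_score(prediction, ground_truth):
    """Token-overlap F1 between two whitespace-tokenized strings."""
    pred_tokens = prediction.split()
    gt_tokens = ground_truth.split()
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)

def exact_match(prediction, ground_truth):
    """1.0 if the two strings match exactly after stripping whitespace, else 0.0."""
    return float(prediction.strip() == ground_truth.strip())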
def output(self, step, train=True):
    p, r, f = util.f1_score(self.probs, self.labels, 0.5)
    ap = util.auc_pr(self.probs, self.labels)
    try:
        auc = util.auc_roc(self.probs, self.labels)
    except ValueError:
        auc = float('nan')
    print("S:%d. Precision: %.4f, Recall: %.4f, F-score: %.4f, AUC(PR): %.4f, AUC(ROC): %.4f, "
          % (step, p, r, f, ap, auc))
def losses(self, perclass=False, train=False):
    if not self.all_probs:
        return None
    probs = np.concatenate(self.all_probs)
    labels = np.concatenate(self.all_labels)
    # micro-averaged stats
    p, r, f = util.f1_score(probs, labels, 0.5)
    ap = util.auc_pr(probs, labels)
    try:
        auc = util.auc_roc(probs, labels)
    except ValueError:
        auc = float('nan')
    micro = [p, r, f, ap, auc]
    # macro-averaged stats
    p, r, f = util.f1_score(probs, labels, 0.5, average='macro')
    ap = util.auc_pr(probs, labels, average='macro')
    try:
        auc = util.auc_roc(probs, labels, average='macro')
    except ValueError:
        auc = float('nan')
    macro = [p, r, f, ap, auc]
    return micro, macro
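# NOTE: several snippets in this collection call util.f1_score(probs, labels, 0.5,
# average=...), i.e. they threshold per-label probabilities and expect a
# (precision, recall, f1) tuple back. util is not included here; the sketch below
# is an assumption about that behavior, written with scikit-learn for brevity
# (threshold_f1_score is an invented name).
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

def threshold_f1_score(probs, labels, thres=0.5, average='micro'):
    """Binarize probabilities at `thres`, then return (precision, recall, f1),
    micro-, macro-, or per-class-averaged (average=None) over label columns."""
    preds = (np.asarray(probs) >= thres).astype(int)
    p, r, f, _ = precision_recall_fscore_support(
        np.asarray(labels).astype(int), preds, average=average, zero_division=0)
    return p, r, f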
def evaluate_answer(self, session, q, rev_src_vocab, rev_tgt_vocab, sample=100, print_every=100):
    # this is teacher-forcing evaluation, not even greedy decode
    f1 = 0.
    em = 0.
    size = 0.
    # python list: outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    # but must make sure EOS_ID is in there otherwise this throws an error
    for inp_tokens, inp_mask, query_tokens, query_mask in pair_iter(
            q, self.batch_size, self.inp_len, self.query_len):
        # decoder_output = self.decode(session, inp_tokens, inp_mask, query_tokens, query_mask)
        encoder_output = self.get_encode(session, inp_tokens, inp_mask)
        decoder_output = self.decode_greedy_batch(session, encoder_output, self.batch_size)
        print(decoder_output)
        # decoder_tokens = np.argmax(decoder_output, axis=-1)
        # those are batched right now
        # decoder_tokens = np.squeeze(decoder_tokens) * query_mask
        # query_tokens = query_tokens * query_mask
        batch_size = inp_tokens.shape[0]
        # query_len = np.sum(query_mask, axis=1)
        for i in range(batch_size):
            decoder_token = self.detokenize(decoder_output[i, :], rev_tgt_vocab)
            query_token = self.detokenize(query_tokens[i, :], rev_tgt_vocab)
            f1 += f1_score(decoder_token, query_token)
            em += exact_match_score(decoder_token, query_token)
            size += 1
            if size % print_every == 0:
                decoded_parse = decoder_token
                true_parse = query_token
                decoded_input = [rev_src_vocab[j] for j in inp_tokens[i, :] if j != data_util.PAD_ID]
                print("input: {}".format(" ".join(decoded_input)))
                print("decoded result: {}".format(decoded_parse))
                print("ground truth result: {}".format(true_parse))
        if size >= sample:
            break
    f1 /= size
    em /= size
    return f1, em
def output(self, step, train=True):
    p, r, f = util.f1_score(self.probs, self.labels, 0.5)
    ap = util.auc_pr(self.probs, self.labels)
    try:
        auc = util.auc_roc(self.probs, self.labels)
    except ValueError:
        auc = float('nan')
    loss_str = "GS:%d, S:%d. Loss: %.4f, Precision: %.4f, Recall: %.4f, F-score: %.4f, " \
               "AUC(PR): %.4f, AUC(ROC): %.4f" % (self.global_step, step, self.loss, p, r, f, ap, auc)
    pr_strs = []
    for k in self.config.pr_at_k:
        pk = util.precision_at_k(self.probs, self.labels, k)
        rk = util.recall_at_k(self.probs, self.labels, k)
        pr_strs.append("Precision@%d: %.4f, Recall@%d: %.4f" % (k, pk, k, rk))
    pr_str = ', '.join(pr_strs)
    wps_str = "WPS: %.2f" % self.wps
    print(', '.join([loss_str, pr_str, wps_str]))
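# NOTE: output() above (and losses() further below) also report Precision@k and
# Recall@k via util.precision_at_k / util.recall_at_k. Those helpers are not part
# of this collection; the sketch below uses the common multi-label definition and
# is only an assumption about what they compute.
import numpy as np

def precision_at_k(probs, labels, k):
    """Average fraction of the top-k scored labels per sample that are true."""
    probs, labels = np.asarray(probs), np.asarray(labels)
    topk = np.argsort(-probs, axis=1)[:, :k]
    hits = np.take_along_axis(labels, topk, axis=1)
    return float(hits.sum(axis=1).mean()) / k

def recall_at_k(probs, labels, k):
    """Average fraction of each sample's true labels found among its top-k scores."""
    probs, labels = np.asarray(probs), np.asarray(labels)
    topk = np.argsort(-probs, axis=1)[:, :k]
    hits = np.take_along_axis(labels, topk, axis=1).sum(axis=1)
    totals = np.maximum(labels.sum(axis=1), 1)  # guard against samples with no labels
    return float((hits / totals).mean())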
def run_session(self, notes, lengths, labels, train=True):
    n_words = lengths.sum()
    start = time.time()
    notes = notes.tolist()
    lengths = lengths.tolist()
    X_raw = []
    for note, length in zip(notes, lengths):
        if not length:
            break
        note = note[1:length - 1]
        out_note = []
        for word in note:
            out_note.append(self.vocab.vocab[word])
        X_raw.append(' '.join(out_note))
    data = self.model.vectorizer.transform(X_raw, copy=False).toarray()
    labels = labels[:len(X_raw)]
    ops = [self.model.loss, self.model.probs, self.model.global_step]
    if train:
        ops.append(self.model.train_op)
    ret = self.session.run(ops, feed_dict={self.model.data: data,
                                           self.model.labels: labels})
    self.loss, self.probs, self.global_step = ret[:3]
    self.labels = labels
    # TODO remove this and use AUC(PR) to determine best hyperparameters:
    if self.config.bow_search and not train:
        prf = {}
        for thres in np.arange(0.1, 0.75, 0.1):
            prf[int(thres * 10)] = util.f1_score(self.probs, labels, thres, average=None)[-1]
        self.current_stats.append(prf)
    end = time.time()
    self.wps = n_words / (end - start)
    self.accumulate()
def group_eval(pred='./performance/result_val_sort.csv', label='../Data/val.csv'):
    pred_df = pd.read_csv(pred)
    label_df = pd.read_csv(label)
    pred_v = pred_df.values
    label_v = label_df.values
    print(pred_v.shape, label_v.shape)
    row, col = pred_v.shape
    prs = []
    rrs = []
    cnts = []  # per-row true-positive counts, used below to sort the rows
    sum_preds = np.sum(pred_v, axis=1)
    sum_labels = np.sum(label_v, axis=1)
    for i in range(row):
        cnt = 0
        for j in range(col):
            if pred_v[i][j] and label_v[i][j]:
                cnt += 1
        pr = cnt / sum_preds[i] if sum_preds[i] > 0 else 0
        rr = cnt / sum_labels[i] if sum_labels[i] > 0 else 0
        prs.append(pr)
        rrs.append(rr)
        cnts.append(cnt)
    idxs = np.argsort(cnts)[::-1]
    f1s = []
    for i, pr, rr in zip(idxs, np.array(prs)[idxs], np.array(rrs)[idxs]):
        f1 = f1_score(pr, rr)
        f1s.append(f1)
        print(i, round(100 * pr, 2), round(100 * rr, 2), round(100 * f1, 2))
    print('Macro rr: {}, pr: {}, f1: {}.'.format(
        np.round(100 * np.average(rrs), 2),
        np.round(100 * np.average(prs), 2),
        np.round(100 * np.average(f1s), 2)))
    cnt = 0
    for i in range(row):
        for j in range(col):
            if pred_v[i][j] and label_v[i][j]:
                cnt += 1
    pr = cnt / np.sum(pred_v)
    rr = cnt / np.sum(label_v)
    print('Micro rr: {}, pr: {}, f1: {}.'.format(
        round(100 * rr, 2), round(100 * pr, 2), round(100 * f1_score(rr, pr), 2)))
    arr_perts = sum_labels[idxs] / col
    # for i in range(len(idxs)):
    #     print(idxs[i], sum_labels[idxs][i], round(100 * arr_perts[i], 2))
    arr_f1s = np.array(f1s)
    arr_rrs = np.array(rrs)[idxs]
    arr_prs = np.array(prs)[idxs]
    plt.figure()
    plt.title(pred.split('/')[-1])
    plt.plot(arr_prs, 'g-', label='pr')
    plt.plot(arr_rrs, 'b-', label='rr')
    plt.plot(arr_f1s, 'r-', label='f1')
    plt.plot(arr_perts, 'k.', label='percentage')
    plt.xticks(range(28), idxs)
    plt.legend()
    plt.grid()
    plt.xlim([0, 30])
    plt.ylim([0, 1])
    plt.show()
def pre_output(self, step, train=True):
    p, r, f = util.f1_score(self.pre_preds, self.labels, 0.5)
    print("GS:%d, S:%d. Precision: %.4f, Recall: %.4f, F-score: %.4f" %
          (self.global_step, step, p, r, f))
def losses(self, perclass=False, train=False, max_samples_in_chunk=(30000, 50000)):
    '''Return the accumulated losses'''
    if not self.all_losses:
        return None
    if train:
        max_samples_in_chunk = max_samples_in_chunk[0]
    else:
        max_samples_in_chunk = max_samples_in_chunk[1]
    max_batches_in_chunk = max_samples_in_chunk / self.config.batch_size
    loss = np.mean(self.all_losses)
    splits = int(0.999 + (len(self.all_probs) / max_batches_in_chunk))
    chunk_size = int(0.999 + (len(self.all_probs) / splits))
    ret_micro = []
    ret_macro = []
    ret_perclass = []
    for i in range(0, len(self.all_probs), chunk_size):
        all_probs = self.all_probs[i:i + chunk_size]
        all_labels = self.all_labels[i:i + chunk_size]
        probs = np.concatenate(all_probs)
        labels = np.concatenate(all_labels)
        if self.config.test_labels > 0:
            probs = probs[:, :self.config.test_labels]
            labels = labels[:, :self.config.test_labels]
        # micro-averaged stats
        p, r, f = util.f1_score(probs, labels, 0.5)
        ap = util.auc_pr(probs, labels)
        try:
            auc = util.auc_roc(probs, labels)
        except ValueError:
            auc = float('nan')
        micro = [p, r, f, ap, auc]
        for k in self.config.pr_at_k:
            if train:
                # don't spend time on this for the train set
                pk = float('nan')
                rk = float('nan')
            else:
                pk = util.precision_at_k(probs, labels, k)
                rk = util.recall_at_k(probs, labels, k)
            micro.extend([pk, rk])
        # macro-averaged stats
        p, r, f = util.f1_score(probs, labels, 0.5, average='macro')
        if self.config.macro_auc:
            ap = util.auc_pr(probs, labels, average='macro')
            try:
                auc = util.auc_roc(probs, labels, average='macro')
            except ValueError:
                auc = float('nan')
        else:
            ap, auc = float('nan'), float('nan')
        macro = [p, r, f, ap, auc]
        # non-averaged (per-class) stats for plotting
        if perclass:
            p, r, f = util.f1_score(probs, labels, 0.5, average=None)
            ap = util.auc_pr(probs, labels, average=None)
            try:
                auc = util.auc_roc(probs, labels, average=None)
            except ValueError:
                auc = float('nan')
            perclass = [p, r, f, ap, auc]
        else:
            perclass = float('nan')
        ret_micro.append(micro)
        ret_macro.append(macro)
        ret_perclass.append(perclass)
        if train:
            break
    return (loss, np.mean(ret_micro, 0), np.mean(ret_macro, 0), np.mean(ret_perclass, 0))
def train(net, trainIter, validIter, config):
    DEVICE = config['DEVICE']
    modelSavePath = config['modelSavePath']
    epochNum = config['model']['epochNum']
    learningRate = config['model']['learningRate']
    earlyStop = config['model']['earlyStop']

    # weight initialization
    for name, value in net.named_parameters():
        if 'pretrainedModel' not in name:
            if value.dim() > 1:
                nn.init.xavier_uniform_(value)

    # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # bert_param_no = [value for name, value in net.named_parameters() if name in no_decay and 'bertModel' in name]
    # bert_param_yes = [value for name, value in net.named_parameters() if name not in no_decay and 'bertModel' in name]
    # other_param_no = [value for name, value in net.named_parameters() if name in no_decay and 'bertModel' not in name]
    # other_param_yes = [value for name, value in net.named_parameters() if name not in no_decay and 'bertModel' not in name]
    # optimizer_grouped_parameters = [
    #     {'params': bert_param_yes, 'weight_decay': 0.01, 'lr': learningRate},
    #     {'params': bert_param_no, 'weight_decay': 0.0, 'lr': learningRate},
    #     {'params': other_param_yes, 'weight_decay': 0.01, 'lr': 0.001},
    #     {'params': other_param_no, 'weight_decay': 0.0, 'lr': 0.001}]

    bert_params = [value for name, value in net.named_parameters() if 'pretrainedModel' in name]
    other_params = [value for name, value in net.named_parameters() if 'pretrainedModel' not in name]
    params = [{'params': bert_params, 'lr': 5e-5},
              {'params': other_params, 'lr': learningRate}]
    optimizer = AdamW(params, eps=1e-8)

    earlyNumber, beforeLoss = 0, sys.maxsize
    trainLossSave, validLossSave, f1ScoreSave, accurateSave, recallSave = 0, 0, 0, 0, 0

    for epoch in range(epochNum):
        print('Epoch %d\n' % (epoch + 1))

        # training
        net.train()
        trainLoss, number = 0, 0
        for batchSentence, batchTag, _, _ in tqdm(trainIter):
            batchSentence = batchSentence.to(DEVICE)
            batchTag = batchTag.to(DEVICE)
            net.zero_grad()
            loss = net(batchSentence, batchTag)
            # multi-GPU training
            if torch.cuda.device_count() > 1:
                loss = loss.mean()
            loss.backward()
            # gradient clipping
            nn.utils.clip_grad_norm_(net.parameters(), 1.0)
            optimizer.step()
            trainLoss += loss.item()
            number += 1
        trainLoss = trainLoss / number

        # validation
        net.eval()
        validLoss, number = 0, 0
        yTrue, yPre, ySentence, probArr = [], [], [], []
        with torch.no_grad():
            for batchSentence, batchTag, lenList, originSentence in tqdm(validIter):
                batchSentence = batchSentence.to(DEVICE)
                batchTag = batchTag.to(DEVICE)
                loss = net(batchSentence, batchTag)
                # multi-GPU training
                if torch.cuda.device_count() > 1:
                    loss = loss.mean()
                    tagPre, prob = net.module.decode(batchSentence)
                else:
                    tagPre, prob = net.decode(batchSentence)
                tagTrue = [element[:length] for element, length in
                           zip(batchTag.cpu().numpy(), lenList)]
                yTrue.extend(tagTrue)
                yPre.extend(tagPre)
                ySentence.extend(originSentence)
                probArr.extend(prob)
                validLoss += loss.item()
                number += 1

        yTrue2tag = [[id2tag[element2] for element2 in element1] for element1 in yTrue]
        yPre2tag = [[id2tag[element2] for element2 in element1] for element1 in yPre]
        assert len(yTrue2tag) == len(yPre2tag)
        assert len(ySentence) == len(yTrue2tag)
        f1Score, accurate, recall = f1_score(y_true=yTrue2tag, y_pred=yPre2tag)
        validLoss = validLoss / number

        print('Training loss: %f\n' % trainLoss)
        print('Validation loss: %f / %f\n' % (validLoss, beforeLoss))
        print('f1_score, accurate, recall: %f, %f, %f\n' % (f1Score, accurate, recall))

        if validLoss < beforeLoss:
            beforeLoss = validLoss
            if torch.cuda.device_count() > 1:
                torch.save(net.module.state_dict(), modelSavePath)
            else:
                torch.save(net.state_dict(), modelSavePath)
            trainLossSave, validLossSave = trainLoss, validLoss
            f1ScoreSave, accurateSave, recallSave = f1Score, accurate, recall
            if 'validResultPath' in config.keys():
                path = config['validResultPath']
                f = open(path, 'w', encoding='utf-8', errors='ignore')
                for sentence, prob in zip(ySentence, probArr):
                    for sentenceEle, probEle in zip(sentence, prob):
                        probEle = '\t'.join([str(element) for element in probEle])
                        f.write('%s\t%s\n' % (sentenceEle, probEle))
                    f.write('\n')
                f.close()

        # early stopping
        if validLoss > beforeLoss:
            earlyNumber += 1
            print('earlyStop: %d / %d\n' % (earlyNumber, earlyStop))
        else:
            earlyNumber = 0
        if earlyNumber >= earlyStop:
            break

    # inspect the actual behaviour on the validation set
    # temporary
    f = open('temp.txt', 'w', encoding='utf-8', errors='ignore')
    for sentence, trueTag, preTag in zip(ySentence, yTrue2tag, yPre2tag):
        trueEntity = '@'.join(acquireEntity([sentence], [trueTag], method='BIOES'))
        preEntity = '@'.join(acquireEntity([sentence], [preTag], method='BIOES'))
        if trueEntity != preEntity:
            f.write(''.join(sentence) + '\n')
            f.write('True:' + trueEntity + '\n')
            f.write('Pre:' + preEntity + '\n')
    f.close()

    return trainLossSave, validLossSave, f1ScoreSave, accurateSave, recallSave
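# NOTE: train() above expects f1_score(y_true=..., y_pred=...) to return three
# values (f1, a precision-like "accurate" score, recall), which differs from e.g.
# seqeval.metrics.f1_score. The actual helper is not included here; the sketch
# below is a simplified, token-level stand-in that only matches the assumed
# return shape (it ignores the 'O' tag and does not do true entity-span matching).
def sequence_f1_score(y_true, y_pred):
    """Micro precision/recall/F1 over the non-'O' tags of aligned tag sequences."""
    tp = pred_total = true_total = 0
    for true_seq, pred_seq in zip(y_true, y_pred):
        for t, p in zip(true_seq, pred_seq):
            if p != 'O':
                pred_total += 1
                if p == t:
                    tp += 1
            if t != 'O':
                true_total += 1
    precision = tp / pred_total if pred_total else 0.0
    recall = tp / true_total if true_total else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return f1, precision, recall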