def test_f1_score(self):
    prediction = 'rock'
    ground_truth = 'rock n roll'
    precision = 1. * 1 / 1
    recall = 1. * 1 / 3
    f1 = (2 * precision * recall) / (precision + recall)
    self.assertEqual(f1_score(prediction, ground_truth), f1)
    self.assertEqual(f1_score(ground_truth, ground_truth), 1)
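# A minimal sketch of the token-overlap F1 the test above assumes (SQuAD-style):
# precision = shared tokens / predicted tokens, recall = shared tokens /
# ground-truth tokens. The whitespace tokenization and function body are an
# illustration consistent with the test, not necessarily the project's code.
import collections


def f1_score(prediction, ground_truth):
    pred_tokens = prediction.split()
    gt_tokens = ground_truth.split()
    common = collections.Counter(pred_tokens) & collections.Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)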
def evaluate(dev_loader, model, mode='dev'):
    # set model to evaluation mode
    model.eval()
    id2label = config.id2label
    true_tags = []
    pred_tags = []
    sent_data = []
    dev_losses = 0
    with torch.no_grad():
        for idx, batch_samples in enumerate(dev_loader):
            batch_data, batch_token_starts, batch_tags, ori_data = batch_samples
            # shift tensors to GPU if available
            batch_data = batch_data.to(config.device)
            batch_token_starts = batch_token_starts.to(config.device)
            batch_tags = batch_tags.to(config.device)
            sent_data.extend(ori_data)
            batch_masks = batch_data.gt(0)  # get padding mask
            label_masks = batch_tags.gt(-1)
            # compute model output and loss
            loss = model((batch_data, batch_token_starts), token_type_ids=None,
                         attention_mask=batch_masks, labels=batch_tags)[0]
            dev_losses += loss.item()
            # shape: (batch_size, max_len, num_labels)
            batch_output = model((batch_data, batch_token_starts), token_type_ids=None,
                                 attention_mask=batch_masks)[0]
            if mode == 'dev':
                batch_output = model.module.crf.decode(batch_output, mask=label_masks)
            else:
                # (batch_size, max_len - padding_label_len)
                batch_output = model.crf.decode(batch_output, mask=label_masks)
            batch_tags = batch_tags.to('cpu').numpy()
            pred_tags.extend([[id2label.get(idx) for idx in indices]
                              for indices in batch_output])
            # (batch_size, max_len - padding_label_len)
            true_tags.extend([[id2label.get(idx) for idx in indices if idx > -1]
                              for indices in batch_tags])
    assert len(pred_tags) == len(true_tags)
    assert len(sent_data) == len(true_tags)

    # logging loss, f1 and report
    metrics = {}
    f1, p, r = f1_score(true_tags, pred_tags)
    metrics['f1'] = f1
    metrics['p'] = p
    metrics['r'] = r
    if mode != 'dev':
        bad_case(sent_data, pred_tags, true_tags)
        output_write(sent_data, pred_tags)
        output2res()
    metrics['loss'] = float(dev_losses) / len(dev_loader)
    return metrics
def for_each_fold(fold, folds, data, labels, model, error_function):
    (x_train, y_train), (x_test, y_test) = partition_data(data, labels, fold, folds)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    # Pick the metric based on the error_function passed; default to precision
    # when none is given.
    if error_function is None or error_function == 'precision':
        error = precision_score(y_test, y_pred)
    elif error_function == 'accuracy':
        error = accuracy_score(y_test, y_pred)
    elif error_function == 'recall':
        error = recall_score(y_test, y_pred)
    elif error_function == 'f1':
        error = f1_score(y_test, y_pred)
    else:
        raise ValueError('%s error function is not defined.' % error_function)
    return {'expected labels': y_test,
            'predicted labels': y_pred,
            'errors': [error]}
def classification_report(y_true, y_pred):
    print('--------------------------------')
    print('Accuracy -', metrics.accuracy(y_true, y_pred))
    print('Recall -', metrics.recall(y_true, y_pred))
    print('Precision -', metrics.precision(y_true, y_pred))
    print('F1 score -', metrics.f1_score(y_true, y_pred))
    print('--------------------------------')
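# Minimal sketches of the metric helpers the report above calls, assuming a
# custom `metrics` module over binary 0/1 labels. The function names match the
# calls above; the bodies are illustrative assumptions, not the project's code.
def _counts(y_true, y_pred):
    tp = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 1)
    fp = sum(1 for t, p in zip(y_true, y_pred) if t == 0 and p == 1)
    fn = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 0)
    return tp, fp, fn


def accuracy(y_true, y_pred):
    return sum(1 for t, p in zip(y_true, y_pred) if t == p) / len(y_true)


def precision(y_true, y_pred):
    tp, fp, _ = _counts(y_true, y_pred)
    return tp / (tp + fp) if tp + fp else 0.0


def recall(y_true, y_pred):
    tp, _, fn = _counts(y_true, y_pred)
    return tp / (tp + fn) if tp + fn else 0.0


def f1_score(y_true, y_pred):
    p, r = precision(y_true, y_pred), recall(y_true, y_pred)
    return 2 * p * r / (p + r) if p + r else 0.0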
def evaluate(model, data_iterator, params, mark='Eval', verbose=True):
    """Evaluate the model on `steps` batches."""
    # set model to evaluation mode
    model.eval()
    # id2tag dict
    idx2tag = {idx: tag for idx, tag in enumerate(params.tags)}
    true_tags = []
    pred_tags = []
    # a running average object for loss
    loss_avg = utils.RunningAverage()
    for input_ids, input_mask, labels in data_iterator:
        # to device
        input_ids = input_ids.to(params.device)
        input_mask = input_mask.to(params.device)
        labels = labels.to(params.device)
        batch_size, max_len = labels.size()
        # get loss
        loss = model(input_ids, attention_mask=input_mask.bool(), labels=labels)
        loss /= batch_size
        # update the average loss
        loss_avg.update(loss.item())
        # inference
        with torch.no_grad():
            batch_output = model(input_ids, attention_mask=input_mask.bool())
        # recover the real (unpadded) length of each label sequence
        real_batch_tags = []
        for i in range(batch_size):
            real_len = int(input_mask[i].sum())
            real_batch_tags.append(labels[i][:real_len].to('cpu').numpy())  # List[int]
        pred_tags.extend([idx2tag.get(idx) for indices in batch_output for idx in indices])
        true_tags.extend([idx2tag.get(idx) for indices in real_batch_tags for idx in indices])

    # sanity check
    assert len(pred_tags) == len(true_tags), 'len(pred_tags) is not equal to len(true_tags)!'

    # logging loss, f1 and report
    metrics = {}
    f1 = f1_score(true_tags, pred_tags)
    accuracy = accuracy_score(true_tags, pred_tags)
    metrics['loss'] = loss_avg()
    metrics['f1'] = f1
    metrics['accuracy'] = accuracy
    metrics_str = "; ".join("{}: {:05.2f}".format(k, v) for k, v in metrics.items())
    logging.info("- {} metrics: ".format(mark) + metrics_str)

    # f1 classification report
    if verbose:
        report = classification_report(true_tags, pred_tags)
        logging.info(report)
    return metrics
def evaluate(model, data_iterator, params, mark='Test', verbose=False): """Evaluate the model on `steps` batches.""" # set model to evaluation mode model.eval() idx2tag = params.idx2tag true_tags = [] pred_tags = [] # a running average object for loss loss_avg = utils.RunningAverage() for _ in range(params.eval_steps): # fetch the next evaluation batch batch_data, batch_tags = next(data_iterator) batch_masks = batch_data.gt(0) loss = model(batch_data, token_type_ids=None, attention_mask=batch_masks, labels=batch_tags) batch_output = model(batch_data, token_type_ids=None, attention_mask=batch_masks ) # shape: (batch_size, max_len, num_labels) loss = loss[0] batch_output = batch_output[0] if params.n_gpu > 1 and params.multi_gpu: loss = loss.mean() loss_avg.update(loss.item()) batch_output = batch_output.detach().cpu().numpy() batch_tags = batch_tags.to('cpu').numpy() pred_tags.extend([ idx2tag.get(idx) for indices in np.argmax(batch_output, axis=2) for idx in indices ]) true_tags.extend( [idx2tag.get(idx) for indices in batch_tags for idx in indices]) assert len(pred_tags) == len(true_tags) # logging loss, f1 and report metrics = {} f1 = f1_score(true_tags, pred_tags) metrics['loss'] = loss_avg() metrics['f1'] = f1 metrics_str = "; ".join("{}: {:05.2f}".format(k, v) for k, v in metrics.items()) logging.info("- {} metrics: ".format(mark) + metrics_str) if verbose: report = classification_report(true_tags, pred_tags) logging.info(report) return metrics
def validate(self):
    self.train = False
    a = self.activation()
    preds = probability_to_preds(a, self.threshold)
    acc = accuracy(a, self.valid_y)
    f1 = f1_score(preds, self.valid_y)
    print(f'f1 score {f1}')
    print(f'test accuracy {acc}')
def validation_end(self, outputs):
    # OPTIONAL
    avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
    tb_logs = {'val_loss': avg_loss, 'ppl': math.exp(avg_loss)}
    tb_logs['acc'] = torch.stack([x['val_acc'] for x in outputs]).mean()
    tb_logs['bleu'] = np.mean([x['bleu'] for x in outputs])
    total = {}
    for metric_name in ['tp', 'fp', 'fn']:
        metric_value = torch.stack([x[metric_name] for x in outputs]).sum()
        total[metric_name] = metric_value
    prec_rec_f1 = metrics.f1_score(total['tp'], total['fp'], total['fn'])
    tb_logs.update(prec_rec_f1)
    return {'avg_val_loss': avg_loss, 'log': tb_logs}
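# A hedged sketch of the count-based helper assumed above: it must return a
# dict so that `tb_logs.update(prec_rec_f1)` works. The key names are an
# assumption; only the (tp, fp, fn) -> precision/recall/F1 arithmetic is fixed.
def f1_score(tp, fp, fn):
    tp, fp, fn = float(tp), float(fp), float(fn)
    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    return {'precision': precision, 'recall': recall, 'f1': f1}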
def kfold_average_score(learner, files, dirs, k=5, min_word_len=2, min_freq=10,
                        feature_size=160, weight='tfidf', sw=True):
    scores = [0] * k
    kfold = KFold(files, dirs, k)
    for ii in range(k):
        train_X, train_Y, test_X, test_Y = kfold.kth(ii)
        features = Features(train_X, train_Y, min_word_len, min_freq, feature_size, sw)
        x = []
        for f_name in train_X:
            x.append(features.get_x_vector(f_name, weight))
        learner.fit(x, features.y_transform(train_Y))
        x = []
        for f_name in test_X:
            x.append(features.get_x_vector(f_name, weight))
        scores[ii] = f1_score(features.y_transform(test_Y), learner.predict(x).tolist())
    return mean(scores)
def evaluate(self, darray, thr): batch_index = 0 X_batch, P_batch, y_batch = self.get_batch(darray, self.batch_size, batch_index) y_pred = None y_label = None while len(X_batch) > 0: num_batch = len(y_batch) feed_dict = { self.vocab_index: X_batch, self.props: P_batch, self.label: y_batch, self.first_level_lstm_dropout_p: [1.0] * len(self.first_level_lstm_dropout), self.deep_dropout_p: [1.0] * len(self.deep_dropout), self.conv_pool_dropout_p: [1.0] * len(self.conv_pool_dropout), self.second_level_lstm_dropout_p: [1.0] * len(self.second_level_lstm_dropout), self.train_phase: False } batch_out = self.sess.run(self.out, feed_dict=feed_dict) if batch_index == 0: y_pred = np.reshape(batch_out, (num_batch, )) y_label = np.reshape(y_batch, (num_batch, )) else: y_pred = np.concatenate( (y_pred, np.reshape(batch_out, (num_batch, )))) y_label = np.concatenate( (y_label, np.reshape(y_batch, (num_batch, )))) batch_index += 1 X_batch, P_batch, y_batch = self.get_batch(darray, self.batch_size, batch_index) pred = [1 if y_pred[i] > thr else 0 for i in range(len(y_pred))] accuracy = metrics.accuracy_score(y_label, pred) precision = metrics.precision_score(y_label, pred) recall = metrics.recall_score(y_label, pred) f1 = metrics.f1_score(y_label, pred) return accuracy, precision, recall, f1
def __predict_names(self, model): x = [] y_true = [] y_pred = [] for training_pair in self.pairs: input_tensor = training_pair[0] output_tensor = model(input_tensor) x.append(input_tensor) y_true.append(training_pair[1]) y_pred.append(output_tensor) # Convert numbers to words and remove SOS and EOS tokens x = [[ self.input_lang.index2word[index.item()] for index in sent if index.item() not in [SOS_TOKEN, EOS_TOKEN] ] for sent in x] y_true = [[ self.output_lang.index2word[index.item()] for index in sent if index.item() not in [SOS_TOKEN, EOS_TOKEN] ] for sent in y_true] y_pred = [[ self.output_lang.index2word[index] for index in sent if index not in [SOS_TOKEN, EOS_TOKEN] ] for sent in y_pred] names = pd.DataFrame( OrderedDict([ ('Source', [' '.join(sent) for sent in x]), ('True Name', [' '.join(sent) for sent in y_true]), ('Our Name', [' '.join(sent) for sent in y_pred]), ('BLEU', [bleu(y_true[i], y_pred[i]) for i in range(len(y_true))]), ('ROUGE', [rouge(y_true[i], y_pred[i]) for i in range(len(y_true))]), ('F1', [f1_score(y_true[i], y_pred[i]) for i in range(len(y_true))]) ])) return names
def forward(self, epoch=10000):
    for i in range(epoch):
        print(f'epoch {i}')
        a = self.activation()
        preds = probability_to_preds(a, self.threshold)
        dz = a - self.train_y
        dw = np.dot(self.train_x, dz.T) / self.no_of_samples
        db = np.sum(dz) / self.no_of_samples
        self.optimizer(dw, db, self.learning_rate)
        acc = accuracy(a, self.train_y)
        f1 = f1_score(preds, self.train_y)
        print(f'f1_score {f1}')
        print(f'train accuracy {acc}')
        print(f'train loss {self.cost(self.train_y, a)}')
        self.validate()
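# Minimal sketches of the helpers assumed by forward()/validate() above:
# thresholding probabilities into 0/1 predictions and computing accuracy and a
# binary F1 on NumPy arrays. The names match the calls above; the bodies and
# the default threshold are illustrative assumptions.
import numpy as np


def probability_to_preds(probs, threshold=0.5):
    return (np.asarray(probs) >= threshold).astype(int)


def accuracy(probs, y_true, threshold=0.5):
    preds = probability_to_preds(probs, threshold)
    return float(np.mean(preds == np.asarray(y_true)))


def f1_score(preds, y_true):
    preds, y_true = np.asarray(preds), np.asarray(y_true)
    tp = np.sum((preds == 1) & (y_true == 1))
    fp = np.sum((preds == 1) & (y_true == 0))
    fn = np.sum((preds == 0) & (y_true == 1))
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0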
def main():
    """ Main. """
    # Load features
    data = sio.loadmat('data/LightenedCNN_C_lfw.mat')
    features = data['features']
    labels = data['labels_original'][0]
    label_lookup = {}
    for idx, label in enumerate(labels):
        label_lookup[idx] = int(label[0][:])
    print('Features shape: ', features.shape)

    start_time = time.time()
    clusters = aroc(features, 200, 1.1, 12)
    print('Time taken for clustering: {:.3f} seconds'.format(
        time.time() - start_time))

    _, _, _, precision, recall, score = f1_score(clusters, label_lookup)
    print('Clusters: {} Precision: {:.3f} Recall: {:.3f} F1: {:.3f}'.format(
        len(clusters), precision, recall, score))
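# A hedged sketch of a pairwise clustering F1 with the six-value return shape
# unpacked above (here: tp, fp, fn, precision, recall, f1). `clusters` is taken
# to be a list of lists of sample indices and `label_lookup` a dict mapping
# index -> true label; the meaning of the first three return values is an
# assumption.
from collections import Counter
from itertools import combinations


def f1_score(clusters, label_lookup):
    tp = fp = 0
    for cluster in clusters:
        for i, j in combinations(cluster, 2):
            if label_lookup[i] == label_lookup[j]:
                tp += 1
            else:
                fp += 1
    # All same-label pairs in the ground truth; those not recovered are false negatives.
    label_counts = Counter(label_lookup.values())
    total_positive_pairs = sum(c * (c - 1) // 2 for c in label_counts.values())
    fn = total_positive_pairs - tp
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return tp, fp, fn, precision, recall, f1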
def generate_classification_perf(truths, pred_probs, multiclass=False):
    """Given truths, and predicted probabilities, generate ModelPerf object"""
    pred_classes = np.round(pred_probs).astype(int)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        retval = ClassificationModelPerf(
            auroc=metrics.roc_auc_score(truths, pred_probs),
            auroc_curve=metrics.roc_curve(truths, pred_probs) if not multiclass else None,
            auprc=metrics.average_precision_score(truths, pred_probs),
            accuracy=metrics.accuracy_score(truths, pred_classes) if not multiclass else None,
            recall=metrics.recall_score(truths, pred_classes) if not multiclass else None,
            precision=metrics.precision_score(truths, pred_classes) if not multiclass else None,
            f1=metrics.f1_score(truths, pred_classes) if not multiclass else None,
            ce_loss=metrics.log_loss(truths, pred_probs, normalize=False) / np.prod(truths.shape),
        )
    return retval
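# Hypothetical usage of generate_classification_perf above on synthetic binary
# labels. ClassificationModelPerf is assumed to be a plain container; a
# namedtuple stands in for it here, and `metrics` is sklearn.metrics imported
# under that name.
import collections
import warnings

import numpy as np
from sklearn import metrics

ClassificationModelPerf = collections.namedtuple(
    "ClassificationModelPerf",
    ["auroc", "auroc_curve", "auprc", "accuracy", "recall", "precision", "f1", "ce_loss"],
)

truths = np.array([0, 1, 1, 0, 1, 0])
pred_probs = np.array([0.2, 0.8, 0.6, 0.3, 0.9, 0.4])
perf = generate_classification_perf(truths, pred_probs)
print(perf.auroc, perf.f1)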
def main(): best_val_acc = -1.0 start_epoch = 1 trn_ds = loaders.SatClassificationDataset(LBL_DATA_DIR, SPLIT_CSV, POSITIVE_CLASS, False, trn_tfms) print('Train Samples:', len(trn_ds)) trn_dl = DataLoader(trn_ds, BATCH_SIZE, shuffle=True, num_workers=WORKERS) unlbl_ds = loaders.UnlabeledDataset(UNLBL_DATA_DIR, IMAGE_SIZE) print('Unlabeled:', len(unlbl_ds)) unlbl_dl = DataLoader(unlbl_ds, BATCH_SIZE, shuffle=True, num_workers=WORKERS) val_ds = loaders.SatClassificationDataset(LBL_DATA_DIR, SPLIT_CSV, POSITIVE_CLASS, True, val_tfms) print('Val Samples:', len(val_ds)) val_dl = DataLoader(val_ds, BATCH_SIZE, shuffle=False, num_workers=WORKERS) model = models.Resnet(visionmodels.resnet50, 2) model.to(DEVICE) ce_loss_fn = nn.CrossEntropyLoss().to(DEVICE) vat_loss_fn = vat.VATLoss(IP, EPSILON, XI).to(DEVICE) optimizer = optim.Adam(model.parameters(), lr=LR) lr_sched = optim.lr_scheduler.StepLR(optimizer, LR_STEP, gamma=LR_DECAY) trn_metrics = BookKeeping(TENSORBOARD_LOGDIR, 'trn') val_metrics = BookKeeping(TENSORBOARD_LOGDIR, 'val') if not os.path.exists(WEIGHTS_SAVE_PATH): os.mkdir(WEIGHTS_SAVE_PATH) if LOAD_CHECKPOINT is not None: checkpoint = torch.load(LOAD_CHECKPOINT, pickle_module=dill) start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer = checkpoint['optimizer'] lr_sched = checkpoint['lr_scheduler'] best_val_acc = checkpoint['best_metrics'] for epoch in range(start_epoch, EPOCHS + 1): # Train t_pbar = tqdm(trn_dl, desc=pbar_desc('train', epoch, EPOCHS, 0.0, -1.0, -1.0)) ul_iter = iter(unlbl_dl) model.train() for (xs, ys) in t_pbar: try: xs_ul, ys_ul = next(ul_iter) except StopIteration: # Reset the iterator in case we've used # up all of the images ul_iter = iter(unlbl_dl) xs_ul, ys_ul = next(ul_iter) xs = xs.to(DEVICE) ys = ys.to(DEVICE) y_pred1 = model(xs) ce_loss = ce_loss_fn(y_pred1, ys) xs_ul = xs_ul.to(DEVICE) vat_loss = vat_loss_fn(xs_ul, model, logits=True) total_loss = ce_loss + vat_loss acc = metrics.accuracy(y_pred1, ys) f1 = metrics.f1_score(y_pred1, ys) trn_metrics.update(ce=ce_loss.item(), vat=vat_loss.item(), total=total_loss.item(), f1=f1.item(), accuracy=acc.item()) optimizer.zero_grad() total_loss.backward() optimizer.step() t_pbar.set_description( pbar_desc('train', epoch, EPOCHS, total_loss.item(), acc, f1)) # Final update to training bar avg_trn_metrics = trn_metrics.get_avg_losses() t_pbar.set_description( pbar_desc('train', epoch, EPOCHS, avg_trn_metrics['total'], avg_trn_metrics['accuracy'], avg_trn_metrics['f1'])) trn_metrics.update_tensorboard(epoch) # Validate v_pbar = tqdm(val_dl, desc=pbar_desc('valid', epoch, EPOCHS, 0.0, -1.0, -1.0)) model.eval() for xs, ys in v_pbar: xs = xs.to(DEVICE) ys = ys.to(DEVICE) y_pred1 = model(xs) ce_loss = ce_loss_fn(y_pred1, ys) acc = metrics.accuracy(y_pred1, ys) f1 = metrics.f1_score(y_pred1, ys) val_metrics.update(ce=ce_loss.item(), vat=0, total=ce_loss.item(), f1=f1.item(), accuracy=acc.item()) v_pbar.set_description( pbar_desc('valid', epoch, EPOCHS, ce_loss.item(), acc, f1)) avg_val_metrics = val_metrics.get_avg_losses() avg_acc = avg_val_metrics['accuracy'] if avg_acc > best_val_acc: best_val_acc = avg_acc torch.save( model.state_dict(), f'{WEIGHTS_SAVE_PATH}/{EXP_NO:02d}-{MODEL_NAME}_epoch-{epoch:04d}_acc-{avg_acc:.3f}.pth' ) # Final update to validation bar v_pbar.set_description( pbar_desc('train', epoch, EPOCHS, avg_val_metrics['total'], avg_val_metrics['accuracy'], avg_val_metrics['f1'])) val_metrics.update_tensorboard(epoch) # Update scheduler and save checkpoint 
        lr_sched.step(epoch=epoch)
        save_checkpoint(epoch, model, best_val_acc, optimizer, lr_sched)
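# A hedged sketch of the save_checkpoint helper called above, writing the same
# keys that the LOAD_CHECKPOINT branch in main() reads back ('epoch',
# 'state_dict', 'optimizer', 'lr_scheduler', 'best_metrics'). The file naming
# and the epoch offset are assumptions.
import os

import dill
import torch


def save_checkpoint(epoch, model, best_metrics, optimizer, lr_scheduler,
                    path=WEIGHTS_SAVE_PATH):
    checkpoint = {
        'epoch': epoch + 1,  # resume from the next epoch
        'state_dict': model.state_dict(),
        'optimizer': optimizer,
        'lr_scheduler': lr_scheduler,
        'best_metrics': best_metrics,
    }
    # dill lets the optimizer/scheduler objects be pickled whole, matching
    # torch.load(..., pickle_module=dill) in main().
    torch.save(checkpoint, os.path.join(path, f'checkpoint_{epoch:04d}.pth'),
               pickle_module=dill)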
def evaluate(dataloader, model, word_vocab, label_vocab, output_path, prefix, use_gpu=False): model.eval() prediction = [] trues_list = [] preds_list = [] for batch in dataloader: batch_text, seq_length, word_perm_idx = batch['text'] batch_label, _, _ = batch['label'] char_inputs = batch['char'] char_inputs = char_inputs[word_perm_idx] char_dim = char_inputs.size(-1) char_inputs = char_inputs.contiguous().view(-1, char_dim) if use_gpu: batch_text = batch_text.cuda() batch_label = batch_label.cuda() char_inputs = char_inputs.cuda() mask = get_mask(batch_text) with torch.no_grad(): tag_seq = model(batch_text, seq_length, char_inputs, batch_label, mask) for line_tesor, labels_tensor, predicts_tensor in zip( batch_text, batch_label, tag_seq): for word_tensor, label_tensor, predict_tensor in zip( line_tesor, labels_tensor, predicts_tensor): if word_tensor.item() == 0: break line = [ word_vocab.id_to_word(word_tensor.item()), label_vocab.id_to_label(label_tensor.item()), label_vocab.id_to_label(predict_tensor.item()) ] trues_list.append(line[1]) preds_list.append(line[2]) prediction.append(' '.join(line)) prediction.append('') true_entities = get_entities_bio(trues_list) pred_entities = get_entities_bio(preds_list) print(len(trues_list), len(preds_list), len(prediction)) results = { "f1": f1_score(true_entities, pred_entities), 'report': classification_report(true_entities, pred_entities) } with open(os.path.join(output_path, '%s_pred.txt' % prefix), 'w', encoding='utf-8') as f: f.write('\n'.join(prediction)) with open(os.path.join(output_path, '%s_score.txt' % prefix), "a") as writer: writer.write("***** Eval results {} *****\n".format(prefix)) for key in sorted(results.keys()): if key == 'report_dict': continue writer.write("{} = {}\n".format(key, str(results[key]))) return results["f1"]
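# A hedged sketch of the entity-level F1 used above: get_entities_bio is
# assumed to return a collection of (entity_type, start, end) tuples, and F1 is
# computed over exact span matches. This mirrors the snippet's usage, not its
# exact implementation.
def f1_score(true_entities, pred_entities):
    true_set, pred_set = set(true_entities), set(pred_entities)
    correct = len(true_set & pred_set)
    precision = correct / len(pred_set) if pred_set else 0.0
    recall = correct / len(true_set) if true_set else 0.0
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0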
def main(): os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id cudnn.benchmark = True if args.model == 'res18': net = resnet.ResNet18(num_classes=40).cuda() elif args.model == 'resnext': net = resnext.ResNeXt(cardinality=args.cardinality, depth=args.depth, nlabels=40, base_width=args.base_width, widen_factor=args.widen_factor).cuda() elif args.model == 'res_cifar': net = resnet_cifar.resnet20(num_classes=40).cuda() state_dict = torch.load(f'{args.model_path}/model_200.pth') net.load_state_dict(state_dict) criterion = nn.CrossEntropyLoss().cuda() metric_logger = utils.Logger(os.path.join(args.save_path, 'metric.log')) ''' Open Set Recognition ''' ''' validation ''' print('') print('Open Set Recognition/Out of Distribution Detection-Validation') print('known data: CIFAR40') print('unknown data: new-TinyImageNet158') print('') train_loader = dataloader.train_loader(args.data_root, args.data, args.batch_size) in_valid_loader = dataloader.in_dist_loader(args.data_root, args.in_data, args.batch_size, 'valid') ood_valid_loader = dataloader.out_dist_loader(args.data_root, 'new-tinyimagenet158', args.batch_size, 'valid') alpha_list = [40] eta_list = [5, 10, 20, 30, 40] for alpha in alpha_list: for eta in eta_list: args.weibull_alpha = alpha args.weibull_tail = eta in_softmax, in_openmax, in_softlogit, in_openlogit,\ _, _, _ = test(net, train_loader, in_valid_loader) out_softmax, out_openmax, out_softlogit, out_openlogit,\ _, _, _ = test(net, train_loader, ood_valid_loader) f1, li_f1, li_thresholds, \ li_precision, li_recall = metrics.f1_score(1-np.array(in_openmax), 1-np.array(out_openmax), pos_label=0) ood_scores = metrics.ood_metrics(1 - np.array(in_openmax), 1 - np.array(out_openmax)) if not os.path.exists(args.save_path): os.makedirs(args.save_path) metric_logger.write([ 'VAL CIFAR40-Tiny158', '\t', 'FPR@95%TPR', '\t', 'DET ERR', '\t', 'AUROC', '\t\t', 'AUPR-IN', '\t', 'AUPR-OUT', '\t', 'F1 SCORE', '\t', '' ]) metric_logger.write([ '', '\t\t\t', 100 * ood_scores['FPR95'], '\t', 100 * ood_scores['DTERR'], '\t', 100 * ood_scores['AUROC'], '\t', 100 * ood_scores['AUIN'], '\t', 100 * ood_scores['AUOUT'], '\t', f1, '\t', '' ]) # save to .csv with open(f'{args.save_path}/openmax-scores.csv', 'a', newline='') as f: columns = [ "", "FPR@95%TPR", "DET ERR", "AUROC", "AUPR-IN", "AUPR-OUT", "F1 SCORE" "alpha" "eta" ] writer = csv.writer(f) if args.weibull_alpha == 40 and args.weibull_tail == 5: writer.writerow([ '* Open Set Recognition/Out of Distribution Detection Validation-new-TinyImageNet158' ]) writer.writerow(columns) writer.writerow([ '', 100 * ood_scores['FPR95'], 100 * ood_scores['DTERR'], 100 * ood_scores['AUROC'], 100 * ood_scores['AUIN'], 100 * ood_scores['AUOUT'], f1, args.weibull_alpha, args.weibull_tail ]) # writer.writerow(['']) f.close()
def evaluate(model, iterator, f, ner_label, verbose = False): """Evaluate the model on `steps` batches.""" # set model to evaluation mode model.eval() y_true = [] y_pred = [] Words, Is_heads, Tags, Y, Y_hat = [], [], [], [], [] with torch.no_grad(): for i, batch in enumerate(iterator): words, input_ids, is_heads, tags, input_tags, entity_label, seqlens = batch _, _, y_hat = model(input_ids, input_tags, entity_label) # y_hat: (N, T) Words.extend(words) Is_heads.extend(is_heads) Tags.extend(tags) Y.extend(input_tags.numpy().tolist()) Y_hat.extend(y_hat.cpu().numpy().tolist()) ## gets results and save with open("temp", 'w') as fout: for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat): y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1] preds = [ner_label.idx2tag[hat] for hat in y_hat] if len(preds[1:-1]) > 0: y_pred.append(preds[1:-1]) if len(tags.split()[1:-1]) > 0: y_true.append(tags.split()[1:-1]) assert len(preds) == len(words.split()) == len(tags.split()) for w, t, p in zip(words.split()[1:-1], tags.split()[1:-1], preds[1:-1]): fout.write(f"{w} {t} {p}\n") fout.write("\n") assert len(y_pred) == len(y_true) # logging loss, f1 and report p, r, f1 = f1_score(y_true, y_pred) # metrics_str = "; ".join("{}: {:05.2f}".format(k, v) for k, v in metrics.items()) # logging.info("- {} metrics: ".format(mark) + metrics_str) # # if verbose: # report = classification_report(true_tags, pred_tags) # logging.info(report) final = f + ".P%.4f_R%.4f_F%.4f" %(p, r, f1) with open(final, 'w') as fout: result = open("temp", "r").read() fout.write(f"{result}\n") fout.write(f"precision={p}\n") fout.write(f"recall={r}\n") fout.write(f"f1={f1}\n") if verbose: report = classification_report(y_true, y_pred) print(report) os.remove("temp") print("precision=%.2f"%p) print("recall=%.2f"%r) print("f1=%.2f"%f1) return p, r, f1
def evaluate(args, model, eval_dataloader, params):
    model.eval()
    # running average of the loss
    loss_avg = utils.RunningAverage()

    # init
    pre_result = []
    gold_result = []

    # get data
    for batch in tqdm(eval_dataloader, unit='Batch'):
        # to device
        batch = tuple(t.to(params.device) for t in batch)
        input_ids, input_mask, segment_ids, start_pos, end_pos, ne_cate = batch

        with torch.no_grad():
            # get loss
            loss = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask,
                         start_positions=start_pos, end_positions=end_pos)
            if params.n_gpu > 1 and args.multi_gpu:
                loss = loss.mean()  # mean() to average on multi-gpu.
            # update the average loss
            loss_avg.update(loss.item())

            # inference
            start_logits, end_logits = model(input_ids=input_ids, token_type_ids=segment_ids,
                                             attention_mask=input_mask)

        # gold label
        start_pos = start_pos.to("cpu").numpy().tolist()
        end_pos = end_pos.to("cpu").numpy().tolist()
        input_mask = input_mask.to('cpu').numpy().tolist()
        ne_cate = ne_cate.to("cpu").numpy().tolist()

        # predict label
        start_label = start_logits.detach().cpu().numpy().tolist()
        end_label = end_logits.detach().cpu().numpy().tolist()

        # idx to label
        cate_idx2label = {idx: value for idx, value in enumerate(params.label_list)}

        # get bio result
        for start_p, end_p, start_g, end_g, input_mask_s, ne_cate_s in zip(
                start_label, end_label, start_pos, end_pos, input_mask, ne_cate):
            ne_cate_str = cate_idx2label[ne_cate_s]
            # length of the query
            q_len = len(IO2QUERY[ne_cate_str])
            # effective (unpadded) length
            act_len = sum(input_mask_s[q_len + 2:-1])
            # get BIO labels
            pre_bio_labels = pointer2bio(start_p[q_len + 2:q_len + 2 + act_len],
                                         end_p[q_len + 2:q_len + 2 + act_len],
                                         ne_cate=ne_cate_str)
            gold_bio_labels = pointer2bio(start_g[q_len + 2:q_len + 2 + act_len],
                                          end_g[q_len + 2:q_len + 2 + act_len],
                                          ne_cate=ne_cate_str)
            pre_result.append(pre_bio_labels)
            gold_result.append(gold_bio_labels)

    # metrics
    f1 = f1_score(y_true=gold_result, y_pred=pre_result)
    acc = accuracy_score(y_true=gold_result, y_pred=pre_result)

    # f1, acc
    metrics = {'loss': loss_avg(), 'f1': f1, 'acc': acc}
    metrics_str = "; ".join("{}: {:05.2f}".format(k, v) for k, v in metrics.items())
    logging.info("- {} metrics: ".format('Val') + metrics_str)

    # f1 classification report
    report = classification_report(y_true=gold_result, y_pred=pre_result)
    logging.info(report)

    return metrics
pools = []
for i in range(5):
    pools.append(FeaturePooling(ims[i]))
pred_points = model_gcn(graph, pools)

# Compute eval metrics
_, loss_norm = chamfer_loss(pred_points[-1], gt_points_list[0].squeeze(), normalized=True)
_, loss_unorm = chamfer_loss(pred_points[-1], gt_points_list[0].squeeze(), normalized=False)
tot_loss_norm += loss_norm.item()
tot_loss_unorm += loss_unorm.item()
tot_f1_1 += f1_score(pred_points[-1], gt_points_list[0].squeeze(), threshold=tau)
tot_f1_2 += f1_score(pred_points[-1], gt_points_list[0].squeeze(), threshold=2 * tau)

# Logs
if n % log_step == 0:
    print("Batch", n)
    print("Normalized Chamfer loss so far", tot_loss_norm / (n + 1))
    print("Unormalized Chamfer loss so far", tot_loss_unorm / (n + 1))
    print("F1 score (tau=1e-4)", tot_f1_1 / (n + 1))
    print("F1 score (tau=2e-4)", tot_f1_2 / (n + 1))

# Generate meshes
if args.output is not None:
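# A hedged sketch of the threshold-based F1 used above for point clouds:
# precision is the fraction of predicted points within `threshold` (squared
# distance) of some ground-truth point, recall the converse, in the spirit of
# the Pixel2Mesh-style metric. The (N, 3)/(M, 3) tensor shapes and the exact
# thresholding convention are assumptions.
import torch


def f1_score(pred_points, gt_points, threshold=1e-4):
    # Pairwise squared distances between predicted and ground-truth points.
    dists = torch.cdist(pred_points, gt_points) ** 2
    precision = (dists.min(dim=1).values < threshold).float().mean()
    recall = (dists.min(dim=0).values < threshold).float().mean()
    if precision + recall == 0:
        return torch.tensor(0.0)
    return 2 * precision * recall / (precision + recall)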
all_ids_conc = np.concatenate(all_ids, axis=0)

# Argmax to get predicted label
arg_type = np.argmax(all_preds_conc, 1)
arg_sp = np.argmax(all_sp_conc, 1)
arg_mt = np.argmax(all_mt_conc, 1)
arg_ch = np.argmax(all_ch_conc, 1)
arg_th = np.argmax(all_th_conc, 1)
arg_y = np.argmax(all_y_conc, 1)
all_tp = np.concatenate(all_tp, axis=0)

# Calculate precision and recall for the cleavage site
prec_sp, prec_mt, prec_ch, prec_th = precision_cs(arg_sp, arg_mt, arg_ch, arg_th,
                                                  arg_type, all_tp, arg_y)
recall_sp, recall_mt, recall_ch, recall_th = recall_cs(arg_sp, arg_mt, arg_ch, arg_th,
                                                       arg_type, all_tp, arg_y)

# Calculate f1 score
f1_type = metrics.f1_score(all_tp, arg_type, average=None)

# Summary
print('========== Final results ==========')
print('Signal\tF1 score\tPrec. CS\tRec. CS')
print('noTP\t%.6f\t%.6f\t%.6f' % (f1_type[0], 0.0, 0.0))
print('SP\t%.6f\t%.6f\t%.6f' % (f1_type[1], prec_sp, recall_sp))
print('mTP\t%.6f\t%.6f\t%.6f' % (f1_type[2], prec_mt, recall_mt))
print('cTP\t%.6f\t%.6f\t%.6f' % (f1_type[3], prec_ch, recall_ch))
print('luTP\t%.6f\t%.6f\t%.6f' % (f1_type[4], prec_th, recall_th))
def run(self): self.model.eval() total_bleu = 0 total_f1 = 0 total_dist1 = 0 total_dist2 = 0 total_loss = 0 print('Run eval...') with torch.no_grad(): for batch_idx, feature in enumerate(self.test_iter): utils.feature_to_device(feature, self.device) out, out_lm = self.model(feature) print(self.vocab.itos(out[3, 0].argmax(dim=0).item()), self.vocab.itos(out_lm[3, 0].argmax(dim=0).item())) loss, loss_lm = models.AR.loss(self.out_loss_fn, out, out_lm, feature.resp, feature.lm.y) print(loss, loss_lm) loss = loss + self.model_config.alpha * loss_lm total_loss += loss.item() # target include w1, w2...[EOS], len: max_seq_length + 1 target = copy.deepcopy(feature.resp[1:]) # feature will be changed pred, pred_padded = utils.sample_sequence( feature, self.vocab, self.model, self.args) pred_tokens = [[self.vocab.itos(k) for k in ks] for ks in pred] target_tokens = [[[self.vocab.itos(k) for k in ks]] for ks in target.T.tolist()] print('----------------------------------') print( 'Context: ', ''.join([ self.vocab.itos(k) for k in feature.context.T.tolist()[0] ])) print( 'LM x: ', ''.join([ self.vocab.itos(k) for k in feature.lm.x.T.tolist()[0] ])) print( 'LM y: ', ''.join([ self.vocab.itos(k) for k in feature.lm.y.T.tolist()[0] ])) print( 'Pred: ', ''.join([ self.vocab.itos(k) for k in pred_padded.T.tolist()[0] ])) print('Target: ', ''.join(target_tokens[0][0])) print( 'Pred: ', ''.join([ self.vocab.itos(k) for k in pred_padded.T.tolist()[-1] ])) print('Target: ', ''.join(target_tokens[-1][0])) print('----------------------------------') bleu = metrics.bleu_score(pred_tokens, target_tokens) f1 = metrics.f1_score(pred_padded.T.to('cpu'), target.T.to('cpu')) # dist1 = metrics.distinct_score([v[:-1] for v in pred]) dist1 = metrics.distinct_score(pred_tokens) dist2 = metrics.distinct_score(pred_tokens, 2) total_bleu += bleu total_f1 += f1 total_dist1 += dist1 total_dist2 += dist2 l = len(self.test_iter) bleu = total_bleu / l f1 = total_f1 / l dist1 = total_dist1 / l dist2 = total_dist2 / l # https://stackoverflow.com/questions/59209086/calculate-perplexity-in-pytorch # see per-word perplexity: # https://github.com/huggingface/transfer-learning-conv-ai/blob/master/convai_evaluation.py#L161 # https://github.com/facebookresearch/ParlAI/blob/56d46551190a7ffaedccd13534412d43bc7076e5/parlai/scripts/eval_ppl.py ppl = math.exp(total_loss / l) print(f'\tBleu: {bleu:.8f} | F1: {f1:.8f} | ' f'Dist1: {dist1:.3f} | Dist2: {dist2:.3f} | PPL: {ppl:7.3f}')
                continue

            recommended_items = recommender_training.recommend(user_id, at)
            relevant_arrays.append(list(relevant_item_ids))
            real_arrays.append(list(recommended_items))

        relevant_arrays = np.array(relevant_arrays)
        real_arrays = np.array(real_arrays)

        #Return the IR results.
        if metric in ['precision', 'recall', 'f1score']:
            eval_function = evaluation_metrics[metric]
            permutation_scores_ir.append({metric: eval_function(real_arrays, relevant_arrays)})
        elif metric is None:
            f = f1_score(real_arrays, relevant_arrays)
            r = recall_score(real_arrays, relevant_arrays)
            p = precision_score(real_arrays, relevant_arrays)
            permutation_scores_ir.append({'precision': p, 'recall': r, 'f1score': f})

    #Compute the final score for Error Statistics
    for result in permutation_scores_error:
        for key in result:
            final_score_error['avg'].setdefault(key, [])
            final_score_error['avg'][key].append(result[key])

    for key in final_score_error['avg']:
        final_score_error['stdev'][key] = np.std(final_score_error['avg'][key])
        final_score_error['avg'][key] = np.average(final_score_error['avg'][key])

    #Compute the final score for IR statistics
    for result in permutation_scores_ir:
print('Reading Data Path')
files, dirs = fe.get_file_name_and_path(parser.parse_args().data_dir)

print('Spliting Train-Test Set ')
train_x, train_y, test_x, test_y = train_test_split(files, dirs, 0.25)

learner = svm.SVC(kernel='rbf', C=1)
features = Features(train_x, train_y, 3, 0, 160)
x = []
for f_name in train_x:
    x.append(features.get_x_vector(f_name, 'tfidf'))
learner.fit(x, features.y_transform(train_y))

x = []
for f_name in test_x:
    x.append(features.get_x_vector(f_name, 'tfidf'))
print('Score:', f1_score(features.y_transform(test_y), learner.predict(x).tolist()))

# print('Test if "TFIDF" is better than "TF"')
# print('TF:', kfold_average_score(learner, train_x, train_y, weight='tf', feature_size=2))
# print('TFIDF:', kfold_average_score(learner, train_x, train_y, weight='tfidf', feature_size=2))
#
# print('Test which min length of word is best')
# lst = []
# for i in range(10):
#     lst.append(kfold_average_score(learner, files, dirs, k=5, min_word_len=i))
#     print(i, lst[-1])
# do_plot(1, lst)
#
# print('Test which min freq of word is best')
def evaluate_on_split(self, recommender, metric=None, cv=None, **kwargs): """ Evaluate on the folds of a dataset split Parameters ---------- recommender: The BaseRecommender instance The recommender instance to be evaluated. metric: [None|'rmse'|'f1score'|'precision'|'recall'|'nmae'|'mae'] If metrics is None, all metrics available will be evaluated. Otherwise it will return the specified metric evaluated. sampling_users: float or sampling, optional, default = None If an float is passed, it is the percentage of evaluated users. If sampling_users is None, all users are used in the evaluation. Specific sampling objects can be passed, see scikits.crab.metrics.sampling module for the list of possible objects. cv: integer or crossvalidation, optional, default = None If an integer is passed, it is the number of fold (default 3). Specific sampling objects can be passed, see scikits.crab.metrics.cross_validation module for the list of possible objects. at: integer, optional, default = None This number at is the 'at' value, as in 'precision at 5'. For example this would mean precision or recall evaluated by removing the top 5 preferences for a user and then finding the percentage of those 5 items included in the top 5 recommendations for that user. If at is None, it will consider all the top 3 elements. Returns ------- score: dict a dictionary containing the average results over the different permutations on the split. permutation_scores : array, shape = [n_permutations] The scores obtained for each permutations. """ sampling_users = kwargs.pop('sampling_users', 0.7) permutation = kwargs.pop('permutation', True) at = kwargs.pop('at', 3) if metric not in evaluation_metrics and metric is not None: raise ValueError('metric %s is not recognized. valid keywords \ are %s' % (metric, evaluation_metrics.keys())) permutation_scores_error = [] permutation_scores_ir = [] final_score_error = {'avg': {}, 'stdev': {}} final_score_ir = {'avg': {}, 'stdev': {}} n_users = recommender.model.users_count() sampling_users = check_sampling(sampling_users, n_users) users_set, _ = sampling_users.split(permutation=permutation) total_ratings = [] #Select the users to be evaluated. user_ids = recommender.model.user_ids() for user_id in user_ids[users_set]: #Select the ratings to be evaluated. preferences = recommender.model.preferences_from_user(user_id) preferences = list(preferences) total_ratings.extend([(user_id, preference) for preference in preferences]) n_ratings = len(total_ratings) cross_val = check_cv(cv, n_ratings) #Defining the splits and run on the splits. for train_set, test_set in cross_val: training_set = {} testing_set = {} for idx in train_set: user_id, pref = total_ratings[idx] if recommender.model.has_preference_values(): training_set.setdefault(user_id, {}) training_set[user_id][pref[0]] = pref[1] else: training_set.setdefault(user_id, {}) training_set[user_id][pref] = 1.0 for idx in test_set: user_id, pref = total_ratings[idx] if recommender.model.has_preference_values(): testing_set.setdefault(user_id, []) testing_set[user_id].append(pref) else: testing_set.setdefault(user_id, []) testing_set[user_id].append((pref, 1.0)) #Evaluate the recommender. 
recommender_training = self._build_recommender(training_set, \ recommender) real_preferences = [] estimated_preferences = [] for user_id, preferences in testing_set.iteritems(): for item_id, preference in preferences: #Estimate the preferences try: estimated = recommender_training.estimate_preference( user_id, item_id) real_preferences.append(preference) except: # It is possible that an item exists #in the test data but # not training data in which case #an exception will be # throw. Just ignore it and move on continue estimated_preferences.append(estimated) #Return the error results. if metric in ['rmse', 'mae', 'nmae']: eval_function = evaluation_metrics[metric] if metric == 'nmae': permutation_scores_error.append({ metric: eval_function(real_preferences, estimated_preferences, recommender.model.maximum_preference_value(), recommender.model.minimum_preference_value())}) else: permutation_scores_error.append( {metric: eval_function(real_preferences, estimated_preferences)}) elif metric is None: #Return all mae, nmae, rmse = evaluation_error(real_preferences, estimated_preferences, recommender.model.maximum_preference_value(), recommender.model.minimum_preference_value()) permutation_scores_error.append({'mae': mae, 'nmae': nmae, 'rmse': rmse}) #IR_Statistics (Precision, Recall and F1-Score) n_users = recommender.model.users_count() cross_val = check_cv(cv, n_users) for train_idx, test_idx in cross_val: relevant_arrays = [] real_arrays = [] for user_id in user_ids[train_idx]: preferences = recommender.model.preferences_from_user(user_id) preferences = list(preferences) if len(preferences) < 2 * at: # Really not enough prefs to meaningfully evaluate the user continue # List some most-preferred items that would count as most if not recommender.model.has_preference_values(): preferences = [(preference, 1.0) for preference in preferences] preferences = sorted(preferences, key=lambda x: x[1], reverse=True) relevant_item_ids = [item_id for item_id, preference in preferences[:at]] if len(relevant_item_ids) == 0: continue #Build the training set. training_set = {} for other_user_id in recommender.model.user_ids(): preferences_other_user = \ recommender.model.preferences_from_user(other_user_id) if not recommender.model.has_preference_values(): preferences_other_user = [(preference, 1.0) for preference in preferences_other_user] if other_user_id == user_id: preferences_other_user = \ [pref for pref in preferences_other_user \ if pref[0] not in relevant_item_ids] if preferences_other_user: training_set[other_user_id] = \ dict(preferences_other_user) else: training_set[other_user_id] = dict(preferences_other_user) #Evaluate the recommender recommender_training = self._build_recommender(training_set, \ recommender) try: preferences = \ recommender_training.model.preferences_from_user(user_id) preferences = list(preferences) if not preferences: continue except: #Excluded all prefs for the user. move on. continue recommended_items = recommender_training.recommend(user_id, at) relevant_arrays.append(list(relevant_item_ids)) real_arrays.append(list(recommended_items)) relevant_arrays = np.array(relevant_arrays) real_arrays = np.array(real_arrays) #Return the IR results. 
if metric in ['precision', 'recall', 'f1score']: eval_function = evaluation_metrics[metric] permutation_scores_ir.append({metric: eval_function(real_arrays, relevant_arrays)}) elif metric is None: f = f1_score(real_arrays, relevant_arrays) r = recall_score(real_arrays, relevant_arrays) p = precision_score(real_arrays, relevant_arrays) permutation_scores_ir.append({'precision': p, 'recall': r, 'f1score': f}) #Compute the final score for Error Statistics for result in permutation_scores_error: for key in result: final_score_error['avg'].setdefault(key, []) final_score_error['avg'][key].append(result[key]) for key in final_score_error['avg']: final_score_error['stdev'][key] = np.std(final_score_error['avg'][key]) final_score_error['avg'][key] = np.average(final_score_error['avg'][key]) #Compute the final score for IR statistics for result in permutation_scores_ir: for key in result: final_score_ir['avg'].setdefault(key, []) final_score_ir['avg'][key].append(result[key]) for key in final_score_ir['avg']: final_score_ir['stdev'][key] = np.std(final_score_ir['avg'][key]) final_score_ir['avg'][key] = np.average(final_score_ir['avg'][key]) permutation_scores = {} scores = {} if permutation_scores_error: permutation_scores['error'] = permutation_scores_error scores['final_error'] = final_score_error if permutation_scores_ir: permutation_scores['ir'] = permutation_scores_ir scores.setdefault('final_error', {}) scores['final_error'].setdefault('avg', {}) scores['final_error'].setdefault('stdev', {}) scores['final_error']['avg'].update(final_score_ir['avg']) scores['final_error']['stdev'].update(final_score_ir['stdev']) return permutation_scores, scores
def evaluate(self, recommender, metric=None, **kwargs): """ Evaluates the predictor Parameters ---------- recommender: The BaseRecommender instance The recommender instance to be evaluated. metric: [None|'rmse'|'f1score'|'precision'|'recall'|'nmae'|'mae'] If metrics is None, all metrics available will be evaluated. Otherwise it will return the specified metric evaluated. sampling_users: float or sampling, optional, default = None If an float is passed, it is the percentage of evaluated users. If sampling_users is None, all users are used in the evaluation. Specific sampling objects can be passed, see scikits.crab.metrics.sampling module for the list of possible objects. sampling_ratings: float or sampling, optional, default = None If an float is passed, it is the percentage of evaluated ratings. If sampling_ratings is None, 70% will be used in the training set and 30% in the test set. Specific sampling objects can be passed, see scikits.crab.metrics.sampling module for the list of possible objects. at: integer, optional, default = None This number at is the 'at' value, as in 'precision at 5'. For example this would mean precision or recall evaluated by removing the top 5 preferences for a user and then finding the percentage of those 5 items included in the top 5 recommendations for that user. If at is None, it will consider all the top 3 elements. Returns ------- Returns a dictionary containing the evaluation results: (NMAE, MAE, RMSE, Precision, Recall, F1-Score) """ sampling_users = kwargs.pop('sampling_users', None) sampling_ratings = kwargs.pop('sampling_ratings', 0.7) permutation = kwargs.pop('permutation', True) at = kwargs.pop('at', 3) if metric not in evaluation_metrics and metric is not None: raise ValueError('metric %s is not recognized. valid keywords \ are %s' % (metric, evaluation_metrics.keys())) n_users = recommender.model.users_count() sampling_users = check_sampling(sampling_users, n_users) users_set, _ = sampling_users.split(permutation=permutation) training_set = {} testing_set = {} #Select the users to be evaluated. user_ids = recommender.model.user_ids() for user_id in user_ids[users_set]: #Select the ratings to be evaluated. preferences = recommender.model.preferences_from_user(user_id) sampling_eval = check_sampling(sampling_ratings, \ len(preferences)) train_set, test_set = sampling_eval.split(indices=True, permutation=permutation) preferences = list(preferences) if recommender.model.has_preference_values(): training_set[user_id] = dict((preferences[idx] for idx in train_set)) if preferences else {} testing_set[user_id] = [preferences[idx] for idx in test_set] if preferences else [] else: training_set[user_id] = dict(((preferences[idx], 1.0) for idx in train_set)) if preferences else {} testing_set[user_id] = [(preferences[idx], 1.0) for idx in test_set] if preferences else [] #Evaluate the recommender. recommender_training = self._build_recommender(training_set, \ recommender) real_preferences = [] estimated_preferences = [] for user_id, preferences in testing_set.iteritems(): for item_id, preference in preferences: #Estimate the preferences try: estimated = recommender_training.estimate_preference( user_id, item_id) real_preferences.append(preference) except ItemNotFoundError: # It is possible that an item exists in the test data but # not training data in which case an exception will be # throw. Just ignore it and move on continue estimated_preferences.append(estimated) #Return the error results. 
if metric in ['rmse', 'mae', 'nmae']: eval_function = evaluation_metrics[metric] if metric == 'nmae': return {metric: eval_function(real_preferences, estimated_preferences, recommender.model.maximum_preference_value(), recommender.model.minimum_preference_value())} return {metric: eval_function(real_preferences, estimated_preferences)} #IR_Statistics relevant_arrays = [] real_arrays = [] #Select the users to be evaluated. user_ids = recommender.model.user_ids() for user_id in user_ids[users_set]: preferences = recommender.model.preferences_from_user(user_id) preferences = list(preferences) if len(preferences) < 2 * at: # Really not enough prefs to meaningfully evaluate the user continue # List some most-preferred items that would count as most if not recommender.model.has_preference_values(): preferences = [(preference, 1.0) for preference in preferences] preferences = sorted(preferences, key=lambda x: x[1], reverse=True) relevant_item_ids = [item_id for item_id, preference in preferences[:at]] if len(relevant_item_ids) == 0: continue training_set = {} for other_user_id in recommender.model.user_ids(): preferences_other_user = \ recommender.model.preferences_from_user(other_user_id) if not recommender.model.has_preference_values(): preferences_other_user = [(preference, 1.0) for preference in preferences_other_user] if other_user_id == user_id: preferences_other_user = \ [pref for pref in preferences_other_user \ if pref[0] not in relevant_item_ids] if preferences_other_user: training_set[other_user_id] = \ dict(preferences_other_user) else: training_set[other_user_id] = dict(preferences_other_user) #Evaluate the recommender recommender_training = self._build_recommender(training_set, \ recommender) try: preferences = \ recommender_training.model.preferences_from_user(user_id) preferences = list(preferences) if not preferences: continue except: #Excluded all prefs for the user. move on. continue recommended_items = recommender_training.recommend(user_id, at) relevant_arrays.append(list(relevant_item_ids)) real_arrays.append(list(recommended_items)) relevant_arrays = np.array(relevant_arrays) real_arrays = np.array(real_arrays) #Return the IR results. if metric in ['precision', 'recall', 'f1score']: eval_function = evaluation_metrics[metric] return {metric: eval_function(real_arrays, relevant_arrays)} if metric is None: #Return all mae, nmae, rmse = evaluation_error(real_preferences, estimated_preferences, recommender.model.maximum_preference_value(), recommender.model.minimum_preference_value()) f = f1_score(real_arrays, relevant_arrays) r = recall_score(real_arrays, relevant_arrays) p = precision_score(real_arrays, relevant_arrays) return {'mae': mae, 'nmae': nmae, 'rmse': rmse, 'precision': p, 'recall': r, 'f1score': f}
# print(os.path.dirname(os.path.realpath(__file__)))
parser = argparse.ArgumentParser("Read in data directory")
parser.add_argument('data_dir')
files, cls = fe.get_file_name_and_path(parser.parse_args().data_dir)
train_X, train_Y, test_X, test_Y = train_test_split(files, cls, 0.25)

nb = NaiveBayes()
nb.fit(train_X, train_Y, 40)

# lst = []
# for i in range(1, 100):
#     nb.fit(train_X, train_Y, i)
#     y_pred = nb.predict_list(test_X)
#     lst.append(f1_score(test_Y, y_pred))
#     print(i, lst[-1])
# do_plot(0, lst)

save_learner = open('naive_bayes.pkl', 'wb')
pickle.dump(nb, save_learner)

# load_learner = open('naive_bayes.pkl', 'rb')
# nb = pickle.load(load_learner)
# incorrect = 0

y_pred = nb.predict_list(test_X)
error = 0
for i in range(len(y_pred)):
    if y_pred[i] != test_Y[i]:
        error += 1
print('Errors/Total:', error, '/', len(y_pred), 'F1: ', f1_score(test_Y, y_pred))
# print("InCorrect:", incorrect, "Present:", incorrect / len(test_X))
def evaluate(self, recommender, metric=None, **kwargs): """ Evaluates the predictor Parameters ---------- recommender: The BaseRecommender instance The recommender instance to be evaluated. metric: [None|'rmse'|'f1score'|'precision'|'recall'|'nmae'|'mae'] If metrics is None, all metrics available will be evaluated. Otherwise it will return the specified metric evaluated. sampling_users: float or sampling, optional, default = None If an float is passed, it is the percentage of evaluated users. If sampling_users is None, all users are used in the evaluation. Specific sampling objects can be passed, see scikits.crab.metrics.sampling module for the list of possible objects. sampling_ratings: float or sampling, optional, default = None If an float is passed, it is the percentage of evaluated ratings. If sampling_ratings is None, 70% will be used in the training set and 30% in the test set. Specific sampling objects can be passed, see scikits.crab.metrics.sampling module for the list of possible objects. at: integer, optional, default = None This number at is the 'at' value, as in 'precision at 5'. For example this would mean precision or recall evaluated by removing the top 5 preferences for a user and then finding the percentage of those 5 items included in the top 5 recommendations for that user. If at is None, it will consider all the top 3 elements. Returns ------- Returns a dictionary containing the evaluation results: (NMAE, MAE, RMSE, Precision, Recall, F1-Score) """ sampling_users = kwargs.pop('sampling_users', None) sampling_ratings = kwargs.pop('sampling_ratings', 0.7) permutation = kwargs.pop('permutation', True) at = kwargs.pop('at', 3) if metric not in evaluation_metrics and metric is not None: raise ValueError('metric %s is not recognized. valid keywords \ are %s' % (metric, evaluation_metrics.keys())) n_users = recommender.model.users_count() sampling_users = check_sampling(sampling_users, n_users) users_set, _ = sampling_users.split(permutation=permutation) training_set = {} testing_set = {} #Select the users to be evaluated. user_ids = recommender.model.user_ids() for user_id in user_ids[users_set]: #Select the ratings to be evaluated. preferences = recommender.model.preferences_from_user(user_id) sampling_eval = check_sampling(sampling_ratings, \ len(preferences)) train_set, test_set = sampling_eval.split(indices=True, permutation=permutation) preferences = list(preferences) if recommender.model.has_preference_values(): training_set[user_id] = dict( (preferences[idx] for idx in train_set)) if preferences else {} testing_set[user_id] = [preferences[idx] for idx in test_set ] if preferences else [] else: training_set[user_id] = dict( ((preferences[idx], 1.0) for idx in train_set)) if preferences else {} testing_set[user_id] = [ (preferences[idx], 1.0) for idx in test_set ] if preferences else [] #Evaluate the recommender. recommender_training = self._build_recommender(training_set, \ recommender) real_preferences = [] estimated_preferences = [] for user_id, preferences in testing_set.iteritems(): for item_id, preference in preferences: #Estimate the preferences try: estimated = recommender_training.estimate_preference( user_id, item_id) real_preferences.append(preference) except ItemNotFoundError: # It is possible that an item exists in the test data but # not training data in which case an exception will be # throw. Just ignore it and move on continue estimated_preferences.append(estimated) #Return the error results. 
if metric in ['rmse', 'mae', 'nmae']: eval_function = evaluation_metrics[metric] if metric == 'nmae': return { metric: eval_function(real_preferences, estimated_preferences, recommender.model.maximum_preference_value(), recommender.model.minimum_preference_value()) } return { metric: eval_function(real_preferences, estimated_preferences) } #IR_Statistics relevant_arrays = [] real_arrays = [] #Select the users to be evaluated. user_ids = recommender.model.user_ids() for user_id in user_ids[users_set]: preferences = recommender.model.preferences_from_user(user_id) preferences = list(preferences) if len(preferences) < 2 * at: # Really not enough prefs to meaningfully evaluate the user continue # List some most-preferred items that would count as most if not recommender.model.has_preference_values(): preferences = [(preference, 1.0) for preference in preferences] preferences = sorted(preferences, key=lambda x: x[1], reverse=True) relevant_item_ids = [ item_id for item_id, preference in preferences[:at] ] if len(relevant_item_ids) == 0: continue training_set = {} for other_user_id in recommender.model.user_ids(): preferences_other_user = \ recommender.model.preferences_from_user(other_user_id) if not recommender.model.has_preference_values(): preferences_other_user = [ (preference, 1.0) for preference in preferences_other_user ] if other_user_id == user_id: preferences_other_user = \ [pref for pref in preferences_other_user \ if pref[0] not in relevant_item_ids] if preferences_other_user: training_set[other_user_id] = \ dict(preferences_other_user) else: training_set[other_user_id] = dict(preferences_other_user) #Evaluate the recommender recommender_training = self._build_recommender(training_set, \ recommender) try: preferences = \ recommender_training.model.preferences_from_user(user_id) preferences = list(preferences) if not preferences: continue except: #Excluded all prefs for the user. move on. continue recommended_items = recommender_training.recommend(user_id, at) relevant_arrays.append(list(relevant_item_ids)) real_arrays.append(list(recommended_items)) relevant_arrays = np.array(relevant_arrays) real_arrays = np.array(real_arrays) #Return the IR results. if metric in ['precision', 'recall', 'f1score']: eval_function = evaluation_metrics[metric] return {metric: eval_function(real_arrays, relevant_arrays)} if metric is None: #Return all mae, nmae, rmse = evaluation_error( real_preferences, estimated_preferences, recommender.model.maximum_preference_value(), recommender.model.minimum_preference_value()) f = f1_score(real_arrays, relevant_arrays) r = recall_score(real_arrays, relevant_arrays) p = precision_score(real_arrays, relevant_arrays) return { 'mae': mae, 'nmae': nmae, 'rmse': rmse, 'precision': p, 'recall': r, 'f1score': f }
def evaluate_on_split(self, recommender, metric=None, cv=None, **kwargs): """ Evaluate on the folds of a dataset split Parameters ---------- recommender: The BaseRecommender instance The recommender instance to be evaluated. metric: [None|'rmse'|'f1score'|'precision'|'recall'|'nmae'|'mae'] If metrics is None, all metrics available will be evaluated. Otherwise it will return the specified metric evaluated. sampling_users: float or sampling, optional, default = None If an float is passed, it is the percentage of evaluated users. If sampling_users is None, all users are used in the evaluation. Specific sampling objects can be passed, see scikits.crab.metrics.sampling module for the list of possible objects. cv: integer or crossvalidation, optional, default = None If an integer is passed, it is the number of fold (default 3). Specific sampling objects can be passed, see scikits.crab.metrics.cross_validation module for the list of possible objects. at: integer, optional, default = None This number at is the 'at' value, as in 'precision at 5'. For example this would mean precision or recall evaluated by removing the top 5 preferences for a user and then finding the percentage of those 5 items included in the top 5 recommendations for that user. If at is None, it will consider all the top 3 elements. Returns ------- score: dict a dictionary containing the average results over the different permutations on the split. permutation_scores : array, shape = [n_permutations] The scores obtained for each permutations. """ sampling_users = kwargs.pop('sampling_users', 0.7) permutation = kwargs.pop('permutation', True) at = kwargs.pop('at', 3) if metric not in evaluation_metrics and metric is not None: raise ValueError('metric %s is not recognized. valid keywords \ are %s' % (metric, evaluation_metrics.keys())) permutation_scores_error = [] permutation_scores_ir = [] final_score_error = {'avg': {}, 'stdev': {}} final_score_ir = {'avg': {}, 'stdev': {}} n_users = recommender.model.users_count() sampling_users = check_sampling(sampling_users, n_users) users_set, _ = sampling_users.split(permutation=permutation) total_ratings = [] #Select the users to be evaluated. user_ids = recommender.model.user_ids() for user_id in user_ids[users_set]: #Select the ratings to be evaluated. preferences = recommender.model.preferences_from_user(user_id) preferences = list(preferences) total_ratings.extend([(user_id, preference) for preference in preferences]) n_ratings = len(total_ratings) cross_val = check_cv(cv, n_ratings) #Defining the splits and run on the splits. for train_set, test_set in cross_val: training_set = {} testing_set = {} for idx in train_set: user_id, pref = total_ratings[idx] if recommender.model.has_preference_values(): training_set.setdefault(user_id, {}) training_set[user_id][pref[0]] = pref[1] else: training_set.setdefault(user_id, {}) training_set[user_id][pref] = 1.0 for idx in test_set: user_id, pref = total_ratings[idx] if recommender.model.has_preference_values(): testing_set.setdefault(user_id, []) testing_set[user_id].append(pref) else: testing_set.setdefault(user_id, []) testing_set[user_id].append((pref, 1.0)) #Evaluate the recommender. 
recommender_training = self._build_recommender(training_set, \ recommender) real_preferences = [] estimated_preferences = [] for user_id, preferences in testing_set.iteritems(): for item_id, preference in preferences: #Estimate the preferences try: estimated = recommender_training.estimate_preference( user_id, item_id) real_preferences.append(preference) except: # It is possible that an item exists #in the test data but # not training data in which case #an exception will be # throw. Just ignore it and move on continue estimated_preferences.append(estimated) #Return the error results. if metric in ['rmse', 'mae', 'nmae']: eval_function = evaluation_metrics[metric] if metric == 'nmae': permutation_scores_error.append({ metric: eval_function( real_preferences, estimated_preferences, recommender.model.maximum_preference_value(), recommender.model.minimum_preference_value()) }) else: permutation_scores_error.append({ metric: eval_function(real_preferences, estimated_preferences) }) elif metric is None: #Return all mae, nmae, rmse = evaluation_error( real_preferences, estimated_preferences, recommender.model.maximum_preference_value(), recommender.model.minimum_preference_value()) permutation_scores_error.append({ 'mae': mae, 'nmae': nmae, 'rmse': rmse }) #IR_Statistics (Precision, Recall and F1-Score) n_users = recommender.model.users_count() cross_val = check_cv(cv, n_users) for train_idx, test_idx in cross_val: relevant_arrays = [] real_arrays = [] for user_id in user_ids[train_idx]: preferences = recommender.model.preferences_from_user(user_id) preferences = list(preferences) if len(preferences) < 2 * at: # Really not enough prefs to meaningfully evaluate the user continue # List some most-preferred items that would count as most if not recommender.model.has_preference_values(): preferences = [(preference, 1.0) for preference in preferences] preferences = sorted(preferences, key=lambda x: x[1], reverse=True) relevant_item_ids = [ item_id for item_id, preference in preferences[:at] ] if len(relevant_item_ids) == 0: continue #Build the training set. training_set = {} for other_user_id in recommender.model.user_ids(): preferences_other_user = \ recommender.model.preferences_from_user(other_user_id) if not recommender.model.has_preference_values(): preferences_other_user = [ (preference, 1.0) for preference in preferences_other_user ] if other_user_id == user_id: preferences_other_user = \ [pref for pref in preferences_other_user \ if pref[0] not in relevant_item_ids] if preferences_other_user: training_set[other_user_id] = \ dict(preferences_other_user) else: training_set[other_user_id] = dict( preferences_other_user) #Evaluate the recommender recommender_training = self._build_recommender(training_set, \ recommender) try: preferences = \ recommender_training.model.preferences_from_user(user_id) preferences = list(preferences) if not preferences: continue except: #Excluded all prefs for the user. move on. continue recommended_items = recommender_training.recommend(user_id, at) relevant_arrays.append(list(relevant_item_ids)) real_arrays.append(list(recommended_items)) relevant_arrays = np.array(relevant_arrays) real_arrays = np.array(real_arrays) #Return the IR results. 
        if metric in ['precision', 'recall', 'f1score']:
            eval_function = evaluation_metrics[metric]
            permutation_scores_ir.append(
                {metric: eval_function(real_arrays, relevant_arrays)})
        elif metric is None:
            f = f1_score(real_arrays, relevant_arrays)
            r = recall_score(real_arrays, relevant_arrays)
            p = precision_score(real_arrays, relevant_arrays)
            permutation_scores_ir.append({
                'precision': p, 'recall': r, 'f1score': f
            })

    # Compute the final score for the error statistics.
    for result in permutation_scores_error:
        for key in result:
            final_score_error['avg'].setdefault(key, [])
            final_score_error['avg'][key].append(result[key])
    for key in final_score_error['avg']:
        final_score_error['stdev'][key] = np.std(
            final_score_error['avg'][key])
        final_score_error['avg'][key] = np.average(
            final_score_error['avg'][key])

    # Compute the final score for the IR statistics.
    for result in permutation_scores_ir:
        for key in result:
            final_score_ir['avg'].setdefault(key, [])
            final_score_ir['avg'][key].append(result[key])
    for key in final_score_ir['avg']:
        final_score_ir['stdev'][key] = np.std(final_score_ir['avg'][key])
        final_score_ir['avg'][key] = np.average(final_score_ir['avg'][key])

    permutation_scores = {}
    scores = {}
    if permutation_scores_error:
        permutation_scores['error'] = permutation_scores_error
        scores['final_error'] = final_score_error
    if permutation_scores_ir:
        permutation_scores['ir'] = permutation_scores_ir
        scores.setdefault('final_error', {})
        scores['final_error'].setdefault('avg', {})
        scores['final_error'].setdefault('stdev', {})
        scores['final_error']['avg'].update(final_score_ir['avg'])
        scores['final_error']['stdev'].update(final_score_ir['stdev'])

    return permutation_scores, scores
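# A minimal usage sketch for the cross-validated evaluator above. The data
# model, similarity, recommender and CfEvaluator names and import paths are
# assumptions based on the scikits.crab library this code appears to come
# from; treat this as an illustration, not confirmed API.
from scikits.crab.models import MatrixPreferenceDataModel
from scikits.crab.metrics import pearson_correlation
from scikits.crab.similarities import UserSimilarity
from scikits.crab.recommenders.knn import UserBasedRecommender
from scikits.crab.metrics.classes import CfEvaluator

ratings = {
    'alice': {'item1': 4.0, 'item2': 3.0, 'item3': 5.0},
    'bob': {'item1': 5.0, 'item2': 2.0, 'item4': 4.5},
    'carol': {'item2': 3.5, 'item3': 4.0, 'item4': 4.0},
}
model = MatrixPreferenceDataModel(ratings)
similarity = UserSimilarity(model, pearson_correlation)
recommender = UserBasedRecommender(model, similarity, with_preference=True)

evaluator = CfEvaluator()
# 3-fold cross-validated RMSE; metric=None would compute all metrics instead.
permutation_scores, scores = evaluator.evaluate_on_split(
    recommender, metric='rmse', cv=3)
print(scores['final_error']['avg'])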
def evaluate(self, recommender, metric=None, **kwargs):
    sampling_users = kwargs.pop('sampling_users', None)
    sampling_ratings = kwargs.pop('sampling_ratings', 0.7)
    permutation = kwargs.pop('permutation', True)
    at = kwargs.pop('at', 3)

    if metric not in evaluation_metrics and metric is not None:
        raise ValueError('metric %s is not recognized. valid keywords '
                         'are %s' % (metric, evaluation_metrics.keys()))

    n_users = recommender.model.users_count()
    sampling_users = check_sampling(sampling_users, n_users)
    users_set, _ = sampling_users.split(permutation=permutation)

    training_set = {}
    testing_set = {}

    # Select the users to be evaluated.
    user_ids = recommender.model.user_ids()
    for user_id in user_ids[users_set]:
        # Select the ratings to be evaluated.
        preferences = recommender.model.preferences_from_user(user_id)
        sampling_eval = check_sampling(sampling_ratings, len(preferences))
        train_set, test_set = sampling_eval.split(indices=True,
                                                  permutation=permutation)

        preferences = list(preferences)
        if recommender.model.has_preference_values():
            training_set[user_id] = dict(
                preferences[idx] for idx in train_set) if preferences else {}
            testing_set[user_id] = [
                preferences[idx] for idx in test_set] if preferences else []
        else:
            training_set[user_id] = dict(
                (preferences[idx], 1.0)
                for idx in train_set) if preferences else {}
            testing_set[user_id] = [
                (preferences[idx], 1.0)
                for idx in test_set] if preferences else []

    # Evaluate the recommender.
    recommender_training = self._build_recommender(training_set, recommender)

    real_preferences = []
    estimated_preferences = []

    for user_id, preferences in testing_set.iteritems():
        for item_id, preference in preferences:
            # Estimate the preference.
            try:
                estimated = recommender_training.estimate_preference(
                    user_id, item_id)
                real_preferences.append(preference)
            except ItemNotFoundError:
                # It is possible that an item exists in the test data but
                # not in the training data, in which case an exception is
                # thrown. Just ignore it and move on.
                continue
            estimated_preferences.append(estimated)

    # Return the error results.
    if metric in ['rmse', 'mae', 'nmae']:
        eval_function = evaluation_metrics[metric]
        if metric == 'nmae':
            return {metric: eval_function(
                real_preferences, estimated_preferences,
                recommender.model.maximum_preference_value(),
                recommender.model.minimum_preference_value())}
        return {metric: eval_function(real_preferences,
                                      estimated_preferences)}

    # IR statistics.
    relevant_arrays = []
    real_arrays = []

    # Select the users to be evaluated.
    user_ids = recommender.model.user_ids()
    for user_id in user_ids[users_set]:
        preferences = recommender.model.preferences_from_user(user_id)
        preferences = list(preferences)
        if len(preferences) < 2 * at:
            # Not enough preferences to meaningfully evaluate this user.
            continue

        # Take the most-preferred items; these count as the relevant items.
        if not recommender.model.has_preference_values():
            preferences = [(preference, 1.0) for preference in preferences]
        preferences = sorted(preferences, key=lambda x: x[1], reverse=True)
        relevant_item_ids = [item_id
                             for item_id, preference in preferences[:at]]
        if len(relevant_item_ids) == 0:
            continue

        # Build the training set.
        training_set = {}
        for other_user_id in recommender.model.user_ids():
            preferences_other_user = \
                recommender.model.preferences_from_user(other_user_id)
            if not recommender.model.has_preference_values():
                preferences_other_user = [
                    (preference, 1.0)
                    for preference in preferences_other_user]
            if other_user_id == user_id:
                preferences_other_user = [
                    pref for pref in preferences_other_user
                    if pref[0] not in relevant_item_ids]
                if preferences_other_user:
                    training_set[other_user_id] = \
                        dict(preferences_other_user)
            else:
                training_set[other_user_id] = dict(preferences_other_user)

        # Evaluate the recommender.
        recommender_training = self._build_recommender(training_set,
                                                       recommender)
        try:
            preferences = \
                recommender_training.model.preferences_from_user(user_id)
            preferences = list(preferences)
            if not preferences:
                continue
        except Exception:
            # All preferences for this user were excluded; move on.
            continue

        recommended_items = recommender_training.recommend(user_id, at)
        relevant_arrays.append(list(relevant_item_ids))
        real_arrays.append(list(recommended_items))

    relevant_arrays = np.array(relevant_arrays)
    real_arrays = np.array(real_arrays)

    # Return the IR results.
    if metric in ['precision', 'recall', 'f1score']:
        eval_function = evaluation_metrics[metric]
        return {metric: eval_function(real_arrays, relevant_arrays)}

    if metric is None:
        # Return all metrics.
        mae, nmae, rmse = evaluation_error(
            real_preferences, estimated_preferences,
            recommender.model.maximum_preference_value(),
            recommender.model.minimum_preference_value())
        f = f1_score(real_arrays, relevant_arrays)
        r = recall_score(real_arrays, relevant_arrays)
        p = precision_score(real_arrays, relevant_arrays)
        return {'mae': mae, 'nmae': nmae, 'rmse': rmse,
                'precision': p, 'recall': r, 'f1score': f}
def predict(loss_fn, model, data_set, data_loader, counting=False):
    """Validate after training an epoch."""
    model.eval()

    true_positives = []
    predicted_positives = []
    possible_positives = []
    union_areas = []
    loss = []

    for bc_cnt, bc_data in enumerate(data_loader):
        if counting:
            print('%d/%d' % (bc_cnt, len(data_set) // data_loader.batch_size))
        imgs, masks, _ = bc_data
        imgs = Variable(imgs).cuda()
        masks = Variable(masks).cuda()

        outputs = model(imgs)

        # Sum the loss over every output head of the model.
        mask_loss = torch.zeros(1).cuda()
        for o in outputs:
            o = o.view(-1, o.size()[2], o.size()[3])
            mask_loss = mask_loss + float(loss_fn(o, masks))
        loss.append(mask_loss)

        # Use the last output head for the metric computation.
        output = outputs[-1]
        output = output.view(-1, output.size()[2], output.size()[3])
        output = output.cpu().data.numpy()
        masks = masks.cpu().data.numpy()
        imgs = imgs.cpu().data.numpy()

        true_positive, predicted_positive, possible_positive, union_area = \
            metrics_pred(output, imgs, masks)
        true_positives += true_positive
        predicted_positives += predicted_positive
        possible_positives += possible_positive
        union_areas += union_area

    precisions = precision(true_positives, predicted_positives)
    recalls = recall(true_positives, possible_positives)
    f1_scores = f1_score(recalls, precisions)
    loss = torch.tensor(loss)
    return precisions, recalls, f1_scores, loss.mean()
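# The helpers above aggregate raw counts over the whole validation set and
# only then turn them into precision and recall, with f1_score taking the
# recall and precision values directly. A minimal sketch of count-based
# helpers with that calling convention; these are illustrative stand-ins,
# not the project's actual precision/recall/f1_score implementations.
def precision_from_counts(true_positives, predicted_positives):
    tp, pp = sum(true_positives), sum(predicted_positives)
    return tp / pp if pp > 0 else 0.0

def recall_from_counts(true_positives, possible_positives):
    tp, ap = sum(true_positives), sum(possible_positives)
    return tp / ap if ap > 0 else 0.0

def f1_from_precision_recall(recall_value, precision_value):
    if recall_value + precision_value == 0:
        return 0.0
    return 2 * precision_value * recall_value / (precision_value + recall_value)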
        sents = sents[idx]
        labs = labs[idx]

        loss = model.neg_log_likelihood(sents, labs, lens)
        loss.backward()
        optimizer.step()

        score, preds = model(sents, lens)
        true_labs = [seqid2text(labs[i, :l], ix_to_lab)
                     for i, l in enumerate(lens)]
        pred_labs = [seqid2text(preds[i, :l], ix_to_lab)
                     for i, l in enumerate(lens)]
        acc = accuracy_score(true_labs, pred_labs)
        f1 = f1_score(true_labs, pred_labs)
        print("Epoch {}, batch {}, train loss {:.4f}, train acc {:.4f}, "
              "train f1 {:.4f}".format(epoch, i, loss.item(), acc, f1))

        if (i + 1) % 50 == 0:
            with torch.no_grad():
                model.eval()
                print("Evaluation on validation set")
                true_labels = []
                pred_labels = []
                for batch in val_data_loader:
                    sents, labs, lens = batch
                    sents = pad_sequence(sents, batch_first=True).to(device)
                    labs = pad_sequence(labs, batch_first=True).to(device)
    descriptions, split_sentences=False, transform_labels=False)
model = 'sbw'

# idf = create_decs_embeddings()  # Run this if you don't have the decs_mix,
# decs_sbw and idf json files yet.
with open('../embeddings/idf.json') as f:
    idf = json.load(f)

dev_sbw_similarity = similarity(x_dev, model, idf)
result = np.apply_along_axis(top_k_values, 1, dev_sbw_similarity, 100)
create_json(dev['id'], result, descriptions, model)

with open(f'../embeddings/{model}_predictions.json') as json_file:
    data = json.load(json_file)

pred = []
for doc in data['documents']:
    pred.append(doc['labels'])
real = dev["decsCodes"]
assert len(real) == len(pred)

tp, fn, fp, p, r, f1 = f1_score(real, pred)
print(f'TP: {tp}')
print(f'FN: {fn}')
print(f'FP: {fp}')
print(f'Precision: {p}')
print(f'Recall: {r}')
print(f'F1-Score: {f1}')
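# The f1_score used above returns raw TP/FN/FP counts together with
# micro-averaged precision, recall and F1 over the predicted and gold DeCS
# code lists. A minimal set-based sketch with the same return signature;
# this is an assumption about its behaviour, not the project's own helper.
def multilabel_f1_score(real, pred):
    tp = fn = fp = 0
    for gold_labels, predicted_labels in zip(real, pred):
        gold, predicted = set(gold_labels), set(predicted_labels)
        tp += len(gold & predicted)
        fn += len(gold - predicted)
        fp += len(predicted - gold)
    p = tp / (tp + fp) if tp + fp else 0.0
    r = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * p * r / (p + r) if p + r else 0.0
    return tp, fn, fp, p, r, f1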
def test(self):
    self.model.eval()
    batch_loss_history = []
    n_total_words = 0
    n_sentences = 0
    f1_total = []

    for batch_i, (conversations, conversation_length, sentence_length) \
            in enumerate(tqdm(self.test_data_loader, ncols=80)):
        # conversations: (batch_size) list of conversations
        #   conversation: list of sentences
        #   sentence: list of tokens
        # conversation_length: list of int
        # sentence_length: (batch_size) list of conversation list of sentence_lengths
        input_conversations = [conv[:-1] for conv in conversations]
        target_conversations = [conv[1:] for conv in conversations]

        # flatten input and target conversations
        input_sentences = [sent for conv in input_conversations
                           for sent in conv]
        target_sentences = [sent for conv in target_conversations
                            for sent in conv]
        input_sentence_length = [l for len_list in sentence_length
                                 for l in len_list[:-1]]
        target_sentence_length = [l for len_list in sentence_length
                                  for l in len_list[1:]]
        input_conversation_length = [l - 1 for l in conversation_length]

        with torch.no_grad():
            input_sentences = to_var(torch.LongTensor(input_sentences))
            target_sentences = to_var(torch.LongTensor(target_sentences))
            input_sentence_length = to_var(
                torch.LongTensor(input_sentence_length))
            target_sentence_length = to_var(
                torch.LongTensor(target_sentence_length))
            input_conversation_length = to_var(
                torch.LongTensor(input_conversation_length))

            if batch_i == 0:
                self.generate_sentence(input_sentences,
                                       input_sentence_length,
                                       input_conversation_length,
                                       target_sentences)

            generated_sentences = \
                self.generate_conversations_with_gold_responses(
                    input_sentences,
                    input_sentence_length,
                    input_conversation_length,
                    target_sentences)

            # Sum the per-sentence F1 for this batch; the average is taken
            # over all sentences after the loop.
            conv_f1 = 0
            for target_sent, output_sent in zip(target_sentences,
                                                generated_sentences):
                target_sent = self.vocab.decode(target_sent)
                output_sent = self.vocab.decode(output_sent)
                conv_f1 += metrics.f1_score(output_sent, target_sent)

            sentence_logits = self.model(input_sentences,
                                         input_sentence_length,
                                         input_conversation_length,
                                         target_sentences)

            batch_loss, n_words = masked_cross_entropy(
                sentence_logits, target_sentences, target_sentence_length)

        assert not isnan(batch_loss.item())
        batch_loss_history.append(batch_loss.item())
        n_total_words += n_words.item()
        f1_total.append(conv_f1)
        n_sentences += target_sentences.shape[0]

    epoch_loss = np.sum(batch_loss_history) / n_total_words
    f1_average = np.sum(f1_total) / n_sentences

    print(f'Number of words: {n_total_words}')
    print(f'Bits per word: {epoch_loss:.3f}')
    word_perplexity = np.exp(epoch_loss)

    return word_perplexity, f1_average
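# metrics.f1_score above compares a decoded generated sentence with its gold
# response; in dialogue evaluation this is commonly a bag-of-tokens overlap
# F1. A minimal sketch under that assumption (the project's actual
# implementation is not shown here).
from collections import Counter

def token_overlap_f1(prediction, ground_truth):
    pred_tokens = prediction.split()
    gold_tokens = ground_truth.split()
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    p = num_same / len(pred_tokens)
    r = num_same / len(gold_tokens)
    return 2 * p * r / (p + r)

# e.g. token_overlap_f1('the cat sat', 'the cat sat down') ≈ 0.857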
# AUC on the test set
probs_test = lr_model.predict_proba(X_test)
predict_test = lr_model.predict(X_test)
AUC2 = metrics.roc_auc_score(Y_test, probs_test[:, 1])
print("Test Auc: %s" % AUC2)

# Accuracy
accuracy = metrics.accuracy_score(Y_test, predict_test)
print("Test Accuracy: %s" % accuracy)

# Recall
recall = metrics.recall_score(Y_test, predict_test)
print("Test Recall: %s" % recall)

# F1 score
f1 = metrics.f1_score(Y_test, predict_test)
print("Test F1: %s" % f1)

# 3.4 Print the model parameters
w = lr_model.coef_
print("Shape of the coefficients:")
print(w.shape)
print("First 10 coefficients:")
print(lr_model.coef_[:, 0:10])
print("Intercept:")
print(lr_model.intercept_)
print("Fraction of zeroed (sparse) coefficients: %.2f%%"
      % (np.mean(lr_model.coef_.ravel() == 0) * 100))
print("Sigmoid outputs, i.e. the probabilities p:")
print(lr_model.predict_proba(X_test[0:5]))
def evaluating(self, model, dataset, split):
    """
    input:
        model: (object) pytorch model
        dataset: (object) dataset
        split: (str) split of the dataset in ['train', 'val', 'test']
    return:
        [overall_accuracy, precision, recall, f1-score, jaccard, kappa]
    """
    args = self.args
    oa, precision, recall, f1, jac, kappa = 0, 0, 0, 0, 0, 0
    model.eval()

    data_loader = DataLoader(dataset, args.batch_size, num_workers=4,
                             shuffle=False)
    batch_iterator = iter(data_loader)
    steps = len(dataset) // args.batch_size

    start = time.time()
    for step in range(steps):
        x, y = next(batch_iterator)
        x = Variable(x, volatile=True)
        y = Variable(y, volatile=True)
        if args.cuda:
            x = x.cuda()
            y = y.cuda()
        # calculate pixel accuracy of the generator
        gen_y = model(x)
        if self.is_multi:
            gen_y = gen_y[0]

        oa += metrics.overall_accuracy(gen_y.data, y.data)
        precision += metrics.precision(gen_y.data, y.data)
        recall += metrics.recall(gen_y.data, y.data)
        f1 += metrics.f1_score(gen_y.data, y.data)
        jac += metrics.jaccard(gen_y.data, y.data)
        kappa += metrics.kappa(gen_y.data, y.data)

    _time = time.time() - start

    if not os.path.exists(os.path.join(Logs_DIR, 'statistic')):
        os.makedirs(os.path.join(Logs_DIR, 'statistic'))

    # record the performance of the model
    nb_samples = steps * args.batch_size
    basic_info = [self.date, self.method, self.epoch, self.iter,
                  nb_samples, _time]
    basic_info_names = ['date', 'method', 'epochs', 'iters',
                        'nb_samples', 'time(sec)']

    perform = [round(idx / steps, 3)
               for idx in [oa, precision, recall, f1, jac, kappa]]
    perform_names = ["overall_accuracy", "precision", "recall",
                     "f1-score", "jaccard", "kappa"]

    cur_log = pd.DataFrame([basic_info + perform],
                           columns=basic_info_names + perform_names)
    # save the performance log
    if os.path.exists(os.path.join(Logs_DIR, 'statistic',
                                   "{}.csv".format(split))):
        logs = pd.read_csv(os.path.join(Logs_DIR, 'statistic',
                                        "{}.csv".format(split)))
    else:
        logs = pd.DataFrame([])
    logs = logs.append(cur_log, ignore_index=True)
    logs.to_csv(os.path.join(Logs_DIR, 'statistic',
                             "{}.csv".format(split)),
                index=False, float_format='%.3f')
def evaluate(args, model, eval_dataloader, params):
    model.eval()
    # running average of the loss
    loss_avg = utils.RunningAverage()

    # init
    pre_result = []
    gold_result = []

    # get data
    for batch in tqdm(eval_dataloader, unit='Batch', ascii=True):
        # fetch the next evaluation batch
        batch = tuple(t.to(params.device) for t in batch)
        input_ids, input_mask, start_pos, end_pos, _, _ = batch

        with torch.no_grad():
            # get loss
            loss = model(input_ids, attention_mask=input_mask,
                         start_positions=start_pos, end_positions=end_pos)
            if params.n_gpu > 1 and args.multi_gpu:
                loss = loss.mean()  # mean() to average on multi-gpu
            # update the average loss
            loss_avg.update(loss.item())

            # inference
            start_pre, end_pre = model(input_ids=input_ids,
                                       attention_mask=input_mask)

        # gold labels, shape: (batch_size, tag_size, seq_len)
        start_pos = start_pos.to("cpu").numpy().transpose((0, 2, 1)).tolist()
        end_pos = end_pos.to("cpu").numpy().transpose((0, 2, 1)).tolist()
        input_mask = input_mask.to('cpu').numpy().tolist()

        # predicted labels
        start_label = start_pre.detach().cpu().numpy().transpose((0, 2, 1)).tolist()
        end_label = end_pre.detach().cpu().numpy().transpose((0, 2, 1)).tolist()

        # idx to label
        cate_idx2label = {idx: str(idx + 1)
                          for idx, _ in enumerate(params.label_list)}

        # get BIO results
        for start_p_s, end_p_s, start_g_s, end_g_s, input_mask_s in zip(
                start_label, end_label, start_pos, end_pos, input_mask):
            # effective (non-padding) length
            act_len = sum(input_mask_s)
            for idx, (start_p, end_p, start_g, end_g) in enumerate(
                    zip(start_p_s, end_p_s, start_g_s, end_g_s)):
                pre_bio_labels = pointer2bio(start_p[:act_len],
                                             end_p[:act_len],
                                             ne_cate=cate_idx2label[idx])
                gold_bio_labels = pointer2bio(start_g[:act_len],
                                              end_g[:act_len],
                                              ne_cate=cate_idx2label[idx])
                pre_result.append(pre_bio_labels)
                gold_result.append(gold_bio_labels)

    # metrics
    f1 = f1_score(y_true=gold_result, y_pred=pre_result)
    acc = accuracy_score(y_true=gold_result, y_pred=pre_result)

    # f1, acc
    metrics = {'loss': loss_avg(), 'f1': f1, 'acc': acc}
    metrics_str = "; ".join("{}: {:05.2f}".format(k, v)
                            for k, v in metrics.items())
    logging.info("- {} metrics: ".format('Val') + metrics_str)

    # f1 classification report
    report = classification_report(y_true=gold_result, y_pred=pre_result)
    logging.info(report)

    return metrics
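# pointer2bio above turns per-category start/end pointer sequences into a BIO
# tag sequence before the span-level F1 is computed. A minimal sketch of such
# a conversion with the same signature; this is an assumed behaviour, not the
# project's actual pointer2bio.
def pointer_to_bio(start_preds, end_preds, ne_cate):
    tags = ['O'] * len(start_preds)
    i = 0
    while i < len(start_preds):
        if start_preds[i] == 1:
            # pair the start with the first end pointer at or after it
            for j in range(i, len(end_preds)):
                if end_preds[j] == 1:
                    tags[i] = 'B-' + ne_cate
                    for k in range(i + 1, j + 1):
                        tags[k] = 'I-' + ne_cate
                    i = j
                    break
        i += 1
    return tags

# e.g. pointer_to_bio([0, 1, 0, 0], [0, 0, 1, 0], ne_cate='1')
#      -> ['O', 'B-1', 'I-1', 'O']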