def compute_overlap(pd_arg, t_arg):
    """Return the F1 overlap between a predicted span and a true span.

    Both arguments are (start, end) offsets.
    """
    start, end = pd_arg
    t_start, t_end = t_arg
    tp = max(0, min(end, t_end) - max(start, t_start))
    prec = tp / (end - start)
    recall = tp / (t_end - t_start)
    return evaluate.f1(recall, prec)

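
# compute_overlap above delegates the final score to evaluate.f1(recall, prec).
# That helper is not shown in this section; a minimal sketch of what such a
# two-argument F1 helper typically looks like (an assumption, not the project's
# verified implementation) follows. The name f1_sketch is illustrative only.
def f1_sketch(recall, prec):
    # Harmonic mean of precision and recall; return 0 when both are 0 to avoid
    # division by zero.
    if prec + recall == 0:
        return 0.0
    return 2 * prec * recall / (prec + recall)
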
def get_CV_results():
    folder = './type_f1_file/'
    model_string = 'attention_'
    flag_string = '0_0_1_1_'
    folder_log = './log_files/'
    with open('data/test_type_count.pkl', 'r') as f:
        b = pickle.load(f)
    final_test_dict = {}
    final_auc_dict = {}
    total_F1_num = np.zeros(3)
    example_f1s = []
    for i in range(20, 30):
        cv_id = i % 10
        cv = str(i)
        test_dict, test_F1_num, auc_dict = get_one_CV_results_test_F1s(
            folder, folder_log, model_string, flag_string, cv)
        final_test_dict.update(test_dict)
        final_auc_dict.update(auc_dict)
        total_F1_num += test_F1_num
        log_file_path = folder_log + model_string + flag_string + cv + '.txt'
        example_f1s.append(parse_results.tail(log_file_path, 2)[0][0])
    count_sum = 0
    for k in final_test_dict:
        count_sum += b[int(k)][1]
    type_F1_weighted = 0.0
    for k in final_test_dict:
        type_F1_weighted += (b[int(k)][1] / count_sum) * final_test_dict[k]
    print 'OpenNET type weighted average Macro {}'.format(type_F1_weighted)
    precision = total_F1_num[0] / total_F1_num[1]
    recall = total_F1_num[0] / total_F1_num[2]
    print 'OpenNET type micro {}'.format(evaluate.f1(precision, recall))
    auc_array = []
    for k in final_auc_dict:
        auc_array.append((b[int(k)][1] / count_sum) * final_auc_dict[k])
    print 'average auc = {}'.format(np.sum(auc_array))
    print '---popularity---'
    print 'id---name---freq---auc---'
    dicts = joblib.load(
        '/home/zys133/knowledge_base/NFGEC/data/Wiki/dicts_figer.pkl')
    for k in final_auc_dict:
        print '{}\t{}\t{}\t{:.4f}'.format(k, dicts['id2label'][k], b[k][1],
                                          final_auc_dict[k])

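
# get_CV_results above accumulates a 3-element counter total_F1_num across folds
# and derives micro precision/recall from it. Judging from the divisions, the
# layout is assumed to be [true positives, predicted count, gold count]; a minimal
# sketch of that micro-averaged aggregation (micro_f1_sketch and per_fold_counts
# are illustrative names, not from the project):
import numpy as np

def micro_f1_sketch(per_fold_counts):
    """per_fold_counts: iterable of [tp, n_predicted, n_gold] arrays, one per fold."""
    total = np.sum(per_fold_counts, axis=0)
    precision = total[0] / total[1]
    recall = total[0] / total[2]
    return 2 * precision * recall / (precision + recall)
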
def eval_f1(self):
    ''' Get test set F1 score:
        2 * (precision * recall) / (precision + recall)
        where an F1 score is calculated for each reply and the mean is returned
        Note: this can take some time

        Returns
        -------
        float
            mean F1 score
    '''
    get_reply = (lambda persona, msg: generate_reply_transformer(
        persona, msg, self.tokenizer, self.transformer,
        self.out_seq_length)[0])
    return evaluate.f1(get_reply)

def eval_f1(self):
    ''' Get test set F1 score:
        2 * (precision * recall) / (precision + recall)
        where an F1 score is calculated for each reply and the mean is returned
        Note: this can take some time

        Returns
        -------
        float
            mean F1 score
    '''
    get_reply = (lambda persona, msg: generate_reply_seq2seq(
        self.encoder, self.decoder, self.tokenizer,
        (pre.START_SEQ_TOKEN + ' ' + persona + ' ' + pre.SEP_SEQ_TOKEN
         + ' ' + msg + ' ' + pre.END_SEQ_TOKEN),
        self.persona_length + self.msg_length,
        self.reply_length)[0])
    return evaluate.f1(get_reply)

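
# Both eval_f1 methods above pass a reply-generation callback to evaluate.f1,
# which, per their docstrings, scores each reply and returns the mean. The scoring
# code itself is not shown here; a common choice for dialogue evaluation is
# token-overlap F1 between the generated and reference reply. A minimal sketch
# under that assumption (reply_token_f1_sketch is an illustrative name, and
# whitespace tokenization is a placeholder for the project's tokenizer):
from collections import Counter

def reply_token_f1_sketch(generated, reference):
    gen_tokens = generated.split()
    ref_tokens = reference.split()
    if not gen_tokens or not ref_tokens:
        return 0.0
    # Count tokens shared by the two replies, clipped to the smaller count.
    common = Counter(gen_tokens) & Counter(ref_tokens)
    overlap = sum(common.values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(gen_tokens)
    recall = overlap / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)
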
pool = Pool(args.ap_workers)
jobs = []
for iou_idx, min_overlap in enumerate(iou_range):
    for cls in range(num_class):
        jobs.append(
            pool.apply_async(eval_ap,
                             args=(
                                 [min_overlap],
                                 iou_idx,
                                 cls,
                                 gt_by_class[cls],
                                 prediction_by_class[cls],
                             ),
                             callback=callback))
    f1 = evaluate.f1(prediction, min_overlap, gt)
    f1_values[iou_idx] = f1
pool.close()
pool.join()
print("Evaluation done.\n\n")

map_iou = ap_values.mean(axis=0)
mar = ar_values.mean(axis=0)
display_title = "Detection Performance on {}".format(args.dataset)
display_data = [["IoU thresh"], ["mean AP"], ["mean AR"], ["F1 criterion"]]
for i in range(len(iou_range)):
    display_data[0].append("{:.02f}".format(iou_range[i]))
    display_data[1].append("{:.04f}".format(map_iou[i]))
    display_data[2].append("{:.04f}".format(mar[i]))
    display_data[3].append("{:.04f}".format(f1_values[i]))

# calculate
ious = np.arange(0.1, 1.0, 0.1)
aps = np.zeros((len(ious), label_count))
ars = np.zeros((len(ious), label_count))
f1s = np.zeros((len(ious), ))
miou = evaluate.miou_per_v(all_prediction, all_groundtruth)
for i, iou in enumerate(ious):
    for cls in range(label_count):
        ap, ar = evaluate.ap(prediction_by_cls[cls], iou,
                             groundtruth_by_cls[cls])
        aps[i][cls] = ap
        ars[i][cls] = ar
    f1 = evaluate.f1(all_prediction, iou, all_groundtruth)
    f1s[i] = f1
map_ = np.mean(aps, axis=1)
mar = np.mean(ars, axis=1)
print("Criteria solved.")

# print
title = "C3D Detection Performance"
datas = [["IoU threshold"], ["mean AP"], ["mean AR"], ["F1 criterion"]]
for i, iou in enumerate(ious):
    datas[0].append("{:.2f}".format(iou))
    datas[1].append("{:.4f}".format(map_[i]))
    datas[2].append("{:.4f}".format(mar[i]))
    datas[3].append("{:.4f}".format(f1s[i]))

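
# In the detection snippets above, evaluate.f1(predictions, iou, groundtruth)
# returns a single F1 value per IoU threshold. Its internals are not shown; a
# minimal sketch of one standard formulation (greedy one-to-one matching of
# predicted segments to ground-truth segments at the given IoU), offered as an
# assumption rather than the project's code, with illustrative names:
def detection_f1_sketch(predictions, iou_threshold, groundtruth):
    """predictions / groundtruth: lists of (start, end) segments."""
    def temporal_iou(a, b):
        inter = max(0.0, min(a[1], b[1]) - max(a[0], b[0]))
        union = (a[1] - a[0]) + (b[1] - b[0]) - inter
        return inter / union if union > 0 else 0.0

    matched_gt = set()
    tp = 0
    for pred in predictions:
        # Greedily match each prediction to its best still-unmatched ground truth.
        best_iou, best_idx = 0.0, None
        for idx, gt in enumerate(groundtruth):
            if idx in matched_gt:
                continue
            overlap = temporal_iou(pred, gt)
            if overlap > best_iou:
                best_iou, best_idx = overlap, idx
        if best_idx is not None and best_iou >= iou_threshold:
            matched_gt.add(best_idx)
            tp += 1
    precision = tp / len(predictions) if predictions else 0.0
    recall = tp / len(groundtruth) if groundtruth else 0.0
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)
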
for iou_idx, min_overlap in enumerate(iou_range):
    # for iou_idx, min_overlap in enumerate([0.6]):
    for cls in range(num_class):
        # for cls in [304]:
        # jobs.append(pool.apply_async(eval_ap, args=([min_overlap], iou_idx, cls, gt_by_cls[cls], plain_detections[cls],), callback=callback))
        jobs.append(
            pool.apply_async(eval_ap,
                             args=(
                                 [min_overlap],
                                 iou_idx,
                                 cls,
                                 pku_gt_by_class[cls],
                                 pku_prediction_by_class[cls],
                             ),
                             callback=callback))
    f1 = evaluate.f1(pku_prediction, min_overlap, pku_gt)
    f1_values[iou_idx] = f1
pool.close()
pool.join()
print("Evaluation done.\n\n")

"""for zdy_i, zdy_iou in enumerate(iou_range):
    with open("accuracy_per_cls/cls_pku{:f}.txt".format(zdy_iou), "w") as zdy_f:
        for zdy_cls in range(num_class):
            zdy_f.write("{:d}\t{:.04f}\n".format(zdy_cls, ap_values[zdy_cls][zdy_i]))"""
# map_iou = ap_values[1:, :].mean(axis=0)
# mar = ar_values[1:, :].mean(axis=0)
map_iou = ap_values.mean(axis=0)
mar = ar_values.mean(axis=0)
display_title = "Detection Performance on {}".format(args.dataset)

def train_network():
    init_epoch = 0
    best_f1 = 0
    total_steps = 0
    train_dir = ct.TRAIN_TXT
    val_dir = ct.VAL_TXT
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.backends.cudnn.benchmark = True

    train_data = OSCD_TRAIN(train_dir)
    train_dataloader = DataLoader(train_data, batch_size=ct.BATCH_SIZE,
                                  shuffle=True)
    val_data = OSCD_TEST(val_dir)
    val_dataloader = DataLoader(val_data, batch_size=1, shuffle=False)

    netg = NetG(ct.ISIZE, ct.NC * 2, ct.NZ, ct.NDF,
                ct.EXTRALAYERS).to(device=device)
    netd = NetD(ct.ISIZE, ct.GT_C, 1, ct.NGF, ct.EXTRALAYERS).to(device=device)
    netg.apply(weights_init)
    netd.apply(weights_init)

    if ct.RESUME:
        # Resume both generator and discriminator from the latest checkpoints.
        assert os.path.exists(os.path.join(ct.WEIGHTS_SAVE_DIR, 'current_netG.pth')) \
            and os.path.exists(os.path.join(ct.WEIGHTS_SAVE_DIR, 'current_netD.pth')), \
            'No saved generator/discriminator weights found'
        print("\nLoading pre-trained networks.")
        init_epoch = torch.load(
            os.path.join(ct.WEIGHTS_SAVE_DIR, 'current_netG.pth'))['epoch']
        netg.load_state_dict(
            torch.load(os.path.join(ct.WEIGHTS_SAVE_DIR,
                                    'current_netG.pth'))['model_state_dict'])
        netd.load_state_dict(
            torch.load(os.path.join(ct.WEIGHTS_SAVE_DIR,
                                    'current_netD.pth'))['model_state_dict'])
        with open(os.path.join(ct.OUTPUTS_DIR, 'f1_score.txt')) as f:
            lines = f.readlines()
            best_f1 = float(lines[-2].strip().split(':')[-1])
        print("\tDone.\n")

    l_adv = l2_loss
    l_con = nn.L1Loss()
    l_enc = l2_loss
    l_bce = nn.BCELoss()
    l_cos = cos_loss
    dice = DiceLoss()
    optimizer_d = optim.Adam(netd.parameters(), lr=ct.LR, betas=(0.5, 0.999))
    optimizer_g = optim.Adam(netg.parameters(), lr=ct.LR, betas=(0.5, 0.999))

    start_time = time.time()
    for epoch in range(init_epoch + 1, ct.EPOCH):
        loss_g = []
        loss_d = []
        netg.train()
        netd.train()
        epoch_iter = 0
        for i, data in enumerate(train_dataloader):
            INPUT_SIZE = [ct.ISIZE, ct.ISIZE]
            x1, x2, gt = data
            x1 = x1.to(device, dtype=torch.float)
            x2 = x2.to(device, dtype=torch.float)
            gt = gt.to(device, dtype=torch.float)
            gt = gt[:, 0, :, :].unsqueeze(1)
            x = torch.cat((x1, x2), 1)
            epoch_iter += ct.BATCH_SIZE
            total_steps += ct.BATCH_SIZE
            real_label = torch.ones(size=(x1.shape[0], ),
                                    dtype=torch.float32, device=device)
            fake_label = torch.zeros(size=(x1.shape[0], ),
                                     dtype=torch.float32, device=device)

            # forward
            fake = netg(x)
            pred_real = netd(gt)
            pred_fake = netd(fake).detach()
            err_d_fake = l_bce(pred_fake, fake_label)
            err_g = l_con(fake, gt)
            err_g_total = ct.G_WEIGHT * err_g + ct.D_WEIGHT * err_d_fake

            pred_fake_ = netd(fake.detach())
            err_d_real = l_bce(pred_real, real_label)
            err_d_fake_ = l_bce(pred_fake_, fake_label)
            err_d_total = (err_d_real + err_d_fake_) * 0.5

            # backward
            optimizer_g.zero_grad()
            err_g_total.backward(retain_graph=True)
            optimizer_g.step()

            optimizer_d.zero_grad()
            err_d_total.backward()
            optimizer_d.step()

            errors = utils.get_errors(err_d_total, err_g_total)
            loss_g.append(err_g_total.item())
            loss_d.append(err_d_total.item())

            counter_ratio = float(epoch_iter) / len(train_dataloader.dataset)
            if (i % ct.DISPOLAY_STEP == 0 and i > 0):
                print('epoch:', epoch, 'iteration:', i,
                      ' G|D loss is {}|{}'.format(np.mean(loss_g[-51:]),
                                                  np.mean(loss_d[-51:])))
                if ct.DISPLAY:
                    utils.plot_current_errors(epoch, counter_ratio, errors, vis)
                    utils.display_current_images(gt.data, fake.data, vis)
                utils.save_current_images(epoch, gt.data, fake.data,
                                          ct.IM_SAVE_DIR,
                                          'training_output_images')

        with open(os.path.join(ct.OUTPUTS_DIR, 'train_loss.txt'), 'a') as f:
            f.write(
                'after %s epoch, loss is %g,loss1 is %g,loss2 is %g,loss3 is %g'
                % (epoch, np.mean(loss_g), np.mean(loss_d), np.mean(loss_g),
                   np.mean(loss_d)))
            f.write('\n')

        if not os.path.exists(ct.WEIGHTS_SAVE_DIR):
            os.makedirs(ct.WEIGHTS_SAVE_DIR)
        utils.save_weights(epoch, netg, optimizer_g, ct.WEIGHTS_SAVE_DIR,
                           'netG')
        utils.save_weights(epoch, netd, optimizer_d, ct.WEIGHTS_SAVE_DIR,
                           'netD')
        duration = time.time() - start_time
        print('training duration is %g' % duration)

        # val phase
        print('Validating.................')
        pretrained_dict = torch.load(
            os.path.join(ct.WEIGHTS_SAVE_DIR,
                         'current_netG.pth'))['model_state_dict']
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        net = NetG(ct.ISIZE, ct.NC * 2, ct.NZ, ct.NDF,
                   ct.EXTRALAYERS).to(device=device)
        net.load_state_dict(pretrained_dict, False)
        # Switch to eval mode and disable gradient tracking for validation.
        net.eval()
        with torch.no_grad():
            TP = 0
            FN = 0
            FP = 0
            TN = 0
            for k, data in enumerate(val_dataloader):
                x1, x2, label = data
                x1 = x1.to(device, dtype=torch.float)
                x2 = x2.to(device, dtype=torch.float)
                label = label.to(device, dtype=torch.float)
                label = label[:, 0, :, :].unsqueeze(1)
                x = torch.cat((x1, x2), 1)
                time_i = time.time()
                v_fake = net(x)

                tp, fp, tn, fn = eva.f1(v_fake, label)
                TP += tp
                FN += fn
                TN += tn
                FP += fp

        precision = TP / (TP + FP + 1e-8)
        oa = (TP + TN) / (TP + FN + TN + FP + 1e-8)
        recall = TP / (TP + FN + 1e-8)
        f1 = 2 * precision * recall / (precision + recall + 1e-8)

        if not os.path.exists(ct.BEST_WEIGHT_SAVE_DIR):
            os.makedirs(ct.BEST_WEIGHT_SAVE_DIR)
        if f1 > best_f1:
            best_f1 = f1
            shutil.copy(
                os.path.join(ct.WEIGHTS_SAVE_DIR, 'current_netG.pth'),
                os.path.join(ct.BEST_WEIGHT_SAVE_DIR, 'netG.pth'))
        print('current F1: {}'.format(f1))
        print('best f1: {}'.format(best_f1))
        with open(os.path.join(ct.OUTPUTS_DIR, 'f1_score.txt'), 'a') as f:
            f.write('current epoch:{},current f1:{},best f1:{}'.format(
                epoch, f1, best_f1))
            f.write('\n')

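
# The validation loop above expects eva.f1(prediction, label) to return raw
# confusion counts (tp, fp, tn, fn) for one change map, which are summed over the
# whole set before precision/recall/F1 are computed. That helper is not shown
# here; a minimal pixel-wise sketch, assuming the prediction is binarized at 0.5
# (the threshold and the name confusion_counts_sketch are assumptions, not
# project code):
import torch

def confusion_counts_sketch(prediction, label, thresh=0.5):
    pred_bin = (prediction >= thresh)
    label_bin = (label >= 0.5)
    tp = torch.sum(pred_bin & label_bin).item()
    fp = torch.sum(pred_bin & ~label_bin).item()
    tn = torch.sum(~pred_bin & ~label_bin).item()
    fn = torch.sum(~pred_bin & label_bin).item()
    return tp, fp, tn, fn
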
def test_network():
    threshold = ct.THRESHOLD
    test_dir = ct.TEST_TXT
    path = os.path.join(ct.BEST_WEIGHT_SAVE_DIR, 'netG.pth')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    pretrained_dict = torch.load(
        path, map_location=torch.device(device))['model_state_dict']

    test_data = OSCD_TEST(test_dir)
    test_dataloader = DataLoader(test_data, batch_size=1, shuffle=False)

    net = NetG(ct.ISIZE, ct.NC * 2, ct.NZ, ct.NDF, ct.EXTRALAYERS).to(device)
    # net = nn.DataParallel(net)
    net.load_state_dict(pretrained_dict, False)
    net.eval()

    i = 0
    TP = 0
    FN = 0
    FP = 0
    TN = 0
    # A bare torch.no_grad() call has no effect; use it as a context manager.
    with torch.no_grad():
        for i, data in enumerate(test_dataloader):
            INPUT_SIZE = [ct.ISIZE, ct.ISIZE]
            x1, x2, gt = data
            x1 = x1.to(device, dtype=torch.float)
            x2 = x2.to(device, dtype=torch.float)
            gt = gt.to(device, dtype=torch.float)
            gt = gt[:, 0, :, :].unsqueeze(1)
            x = torch.cat((x1, x2), 1)
            fake = net(x)

            save_path = os.path.join(ct.IM_SAVE_DIR, 'test_output_images')
            if not os.path.isdir(save_path):
                os.makedirs(save_path)
            if ct.SAVE_TEST_IAMGES:
                vutils.save_image(x1.data,
                                  os.path.join(save_path, '%d_x1.png' % i),
                                  normalize=True)
                vutils.save_image(x2.data,
                                  os.path.join(save_path, '%d_x2.png' % i),
                                  normalize=True)
                vutils.save_image(fake.data,
                                  os.path.join(save_path, '%d_gt_fake.png' % i),
                                  normalize=True)
                vutils.save_image(gt,
                                  os.path.join(save_path, '%d_gt.png' % i),
                                  normalize=True)

            tp, fp, tn, fn = eva.f1(fake, gt)
            TP += tp
            FN += fn
            TN += tn
            FP += fp
            i += 1
            print('testing {}th images'.format(i))

    iou = TP / (FN + TP + FP + 1e-8)
    precision = TP / (TP + FP + 1e-8)
    oa = (TP + TN) / (TP + FN + TN + FP + 1e-8)
    recall = TP / (TP + FN + 1e-8)
    f1 = 2 * precision * recall / (precision + recall + 1e-8)
    P = ((TP + FP) * (TP + FN) + (FN + TN) * (FP + TN)) / \
        ((TP + TN + FP + FN)**2 + 1e-8)
    Kappa = (oa - P) / (1 - P + 1e-8)
    results = {
        'iou': iou,
        'precision': precision,
        'oa': oa,
        'recall': recall,
        'f1': f1,
        'kappa': Kappa
    }
    with open(os.path.join(ct.OUTPUTS_DIR, 'test_score.txt'), 'a') as f:
        f.write('-----test results on the best model {}-----'.format(
            time.strftime('%Y-%m-%d %H:%M:%S')))
        f.write('\n')
        for key, value in results.items():
            print(key, value)
            f.write('{}: {}'.format(key, value))
            f.write('\n')

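
# A tiny worked example of the metric formulas used in test_network above
# (the counts are made up purely for illustration): with TP=50, FP=10, TN=30,
# FN=10, precision = recall = 50/60 ~ 0.833, overall accuracy = 80/100 = 0.8,
# expected agreement P = (60*60 + 40*40) / 100**2 = 0.52, and
# Kappa = (0.8 - 0.52) / (1 - 0.52) ~ 0.583.
TP, FP, TN, FN = 50, 10, 30, 10
precision = TP / (TP + FP)                              # 0.833...
recall = TP / (TP + FN)                                 # 0.833...
oa = (TP + TN) / (TP + TN + FP + FN)                    # 0.8
f1 = 2 * precision * recall / (precision + recall)      # 0.833...
P = ((TP + FP) * (TP + FN) + (FN + TN) * (FP + TN)) / (TP + TN + FP + FN) ** 2  # 0.52
kappa = (oa - P) / (1 - P)                              # 0.583...
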
def test(fhelper, train_args, test_args, corpus_file, linkings, train_path,
         test_path, model_path, crf, log_path, keep_boundary=False,
         use_baseline=False, use_feature=None, reverse_select=False,
         rstats=False, threshold=[0.7]):
    train_args.init_truth(corpus_file)
    data_set, test_set = extract_features(corpus_file, linkings, train_args,
                                          test_args,
                                          use_feature=use_feature,
                                          reverse_select=reverse_select)

    predictor = Predictor(crf, model_path, train_path, test_path)

    processes = []
    test_crf_data = []
    for i in fhelper.folds():
        crf_data = extract_crf_data(fhelper.train_set(i), data_set)
        if keep_boundary:
            crf_ranges = get_ranges(crf_data)
        else:
            crf_ranges = None
        test_crf_data.append(extract_crf_data(fhelper.test_set(i), test_set))
        processes.append(predictor.train(i, crf_data, crf_ranges))

    print('training...', end='', flush=True)
    for i, p in enumerate(processes):
        p.wait()
        print(i, end='', flush=True)
    print()

    print('start testing')
    processes = []
    if keep_boundary:
        test_crf_ranges = [get_ranges(crf_data) for crf_data in test_crf_data]
    else:
        test_crf_ranges = [None for crf_data in test_crf_data]
    for i in fhelper.folds():
        processes.append(
            predictor.test(i, test_crf_data[i], test_crf_ranges[i]))

    preds = []
    pred_probs = []
    wow_count = 0
    for crf_data, ranges, p in zip(test_crf_data, test_crf_ranges, processes):
        p.wait()
        arg_spans, probs = load_predict(p.stdout, ranges, use_baseline,
                                        crf_data)
        assert (len(arg_spans) == len(crf_data))
        preds.append(arg_spans)
        pred_probs.append(probs)

    # evaluation
    cv_stats = defaultdict(list)
    log_stats = defaultdict(int)
    if rstats:
        r_stats = defaultdict(int)
    with open(log_path, 'w') as log_out:
        for i, crf_data, pds in zip(fhelper.folds(), test_crf_data, preds):
            correct = 0
            tp = fp = total = 0
            i_tp = i_fp = i_total = 0
            p_i_tp = [0] * len(threshold)
            p_i_fp = [0] * len(threshold)
            for arg_span, item in zip(pds, crf_data):
                label, cindices, *_ = item

                s = set()
                if cindices in train_args.argument[label]:
                    rtype = train_args.argument[label][cindices][1]
                    pd_rtype = test_args.argument[label][cindices][1]
                    if rtype == pd_rtype:
                        s = train_args.edu_truth[label][cindices]

                log_error(log_out, s, arg_span, item, corpus_file,
                          stats=log_stats)

                truth_boundaries = set()
                for start, end in s:
                    truth_boundaries.add(start)
                    truth_boundaries.add(end)
                assert (len(truth_boundaries) == len(s) + 1
                        or len(s) == len(truth_boundaries) == 0)
                pd_boundaries = set()
                for start, end in arg_span:
                    pd_boundaries.add(start)
                    pd_boundaries.add(end)
                assert (len(pd_boundaries) == len(arg_span) + 1
                        or len(arg_span) == len(pd_boundaries) == 0)

                tp += len(truth_boundaries & pd_boundaries)
                fp += len(pd_boundaries - truth_boundaries)

                if s == arg_span:
                    correct += 1
                    if len(s) > 0:
                        i_tp += 1
                elif len(arg_span) > 0:
                    i_fp += 1

                # if predicted
                if len(arg_span) > 0:
                    partial = [False] * len(threshold)
                    # if num of args the same
                    if len(s) == len(arg_span):
                        EDU_offsets = corpus_file.edu_corpus[label]
                        overlap_scores = []
                        for pd_arg, t_arg in zip(sorted(arg_span), sorted(s)):
                            start = EDU_offsets[pd_arg[0]][0]
                            end = EDU_offsets[pd_arg[-1] - 1][-1]
                            t_start = EDU_offsets[t_arg[0]][0]
                            t_end = EDU_offsets[t_arg[-1] - 1][-1]
                            overlap_scores.append(
                                compute_overlap((start, end),
                                                (t_start, t_end)))
                        for j, th in enumerate(threshold):
                            partial[j] = sum(overlap_scores) / len(
                                overlap_scores) >= th
                    for j in range(len(threshold)):
                        if partial[j]:
                            p_i_tp[j] += 1
                        else:
                            p_i_fp[j] += 1

                if rstats and len(s) > 0:
                    def count_rstats(item, is_correct):
                        r_stats[item] += 1
                        if is_correct:
                            r_stats[item + '-correct'] += 1

                    # count connective length
                    count_rstats('clen-{}'.format(len(cindices)),
                                 s == arg_span)
                    # count argument length
                    count_rstats('alen-{}'.format(len(s)), s == arg_span)
                    # count front
                    count_rstats('front',
                                 min(pd_boundaries) == min(truth_boundaries))
                    # count back
                    count_rstats('back',
                                 max(pd_boundaries) == max(truth_boundaries))
                    # count middle
                    middle_truth = set(truth_boundaries)
                    middle_truth.remove(min(truth_boundaries))
                    middle_truth.remove(max(truth_boundaries))
                    middle_pd = set(pd_boundaries)
                    middle_pd.remove(min(pd_boundaries))
                    middle_pd.remove(max(pd_boundaries))
                    count_rstats('middle', middle_pd == middle_truth)
                    # count over/underpredict
                    count_rstats('over', len(s) < len(arg_span))
                    count_rstats('under', len(s) > len(arg_span))
                    # count match or not
                    if len(s) == len(cindices):
                        count_rstats('match', s == arg_span)
                    else:
                        count_rstats('notmatch', s == arg_span)
                    # types [ <]>
                    # types [ <> ]
                    # types < [] >
                    if s != arg_span:
                        tl, tr = min(truth_boundaries), max(truth_boundaries)
                        pl, pr = min(pd_boundaries), max(pd_boundaries)
                        if pl == tl and pr == tr:
                            count_rstats('exact', False)
                        else:
                            if pl >= tl and pr <= tr:
                                count_rstats('toosmall', False)
                            elif pl <= tl and pr >= tr:
                                count_rstats('toobig', False)
                            else:
                                count_rstats('cross', False)
                            if pl == tl or pr == tr:
                                count_rstats('at least one', False)
                            else:
                                count_rstats('both incorrect', False)

            for l in fhelper.test_set(i):
                d = train_args.edu_truth[l]
                for s in d.values():
                    if len(s) == 0:
                        continue
                    total += len(s) + 1
                    i_total += 1
            assert (sum(len(pd) + 1 if len(pd) != 0 else 0
                        for pd in pds) == tp + fp)

            cv_stats['Total'].append(total)
            cv_stats['iTotal'].append(i_total)
            cv_stats['Propose'].append(tp + fp)
            cv_stats['iPropose'].append(i_tp + i_fp)
            for j in range(len(threshold)):
                cv_stats['piPropose{}'.format(j)].append(p_i_tp[j] + p_i_fp[j])

            accuracy = correct / len(pds) if len(pds) > 0 else 1
            recall = tp / total
            prec = tp / (tp + fp) if (tp + fp) > 0 else 1
            f1 = evaluate.f1(recall, prec)
            cv_stats['Accuracy'].append(accuracy)
            cv_stats['Recall'].append(recall)
            cv_stats['Prec'].append(prec)
            cv_stats['F1'].append(f1)

            recall = i_tp / i_total
            prec = i_tp / (i_tp + i_fp) if (i_tp + i_fp) > 0 else 1
            f1 = evaluate.f1(recall, prec)
            cv_stats['iRecall'].append(recall)
            cv_stats['iPrec'].append(prec)
            cv_stats['iF1'].append(f1)

            for j in range(len(threshold)):
                recall = p_i_tp[j] / i_total
                prec = p_i_tp[j] / (p_i_tp[j] + p_i_fp[j]) if (
                    p_i_tp[j] + p_i_fp[j]) > 0 else 1
                f1 = evaluate.f1(recall, prec)
                cv_stats['piRecall{}'.format(j)].append(recall)
                cv_stats['piPrec{}'.format(j)].append(prec)
                cv_stats['piF1{}'.format(j)].append(f1)

    print('prec\trecall\tF1')
    print('{:.2%}\t{:.2%}\t{:.2%}'.format(
        np.mean(cv_stats['Prec']),
        np.mean(cv_stats['Recall']),
        np.mean(cv_stats['F1']),
    ))
    print('Fold Prec', cv_stats['Prec'])
    print('Fold Recall', cv_stats['Recall'])
    print('Fold F1', cv_stats['F1'])
    print('Instance')
    print('prec\trecall\tF1\tAccuracy')
    print('{:.2%}\t{:.2%}\t{:.2%}\t{:.2%}'.format(
        np.mean(cv_stats['iPrec']), np.mean(cv_stats['iRecall']),
        np.mean(cv_stats['iF1']), np.mean(cv_stats['Accuracy'])))
    print('pprec\tprecall\tpF1\tthreshold')
    for j, th in enumerate(threshold):
        print('{:.2%}\t{:.2%}\t{:.2%}\t{}'.format(
            np.mean(cv_stats['piPrec{}'.format(j)]),
            np.mean(cv_stats['piRecall{}'.format(j)]),
            np.mean(cv_stats['piF1{}'.format(j)]), th))
    print('Fold Prec', cv_stats['iPrec'])
    print('Fold Recall', cv_stats['iRecall'])
    print('Fold F1', cv_stats['iF1'])
    print('Fold Accuracy', cv_stats['Accuracy'])
    print('Totally {} arguments for all'.format(sum(cv_stats['Total'])))
    print('Totally {} instances for all'.format(sum(cv_stats['iTotal'])))
    print('Totally {} arguments predicted for all'.format(
        sum(cv_stats['Propose'])))
    print('Totally {} instances predicted for all'.format(
        sum(cv_stats['iPropose'])))
    for j, th in enumerate(threshold):
        print('{} with threshold = {}'.format(j, th))
        print('Fold pPrec', cv_stats['piPrec{}'.format(j)])
        print('Fold pRecall', cv_stats['piRecall{}'.format(j)])
        print('Fold pF1', cv_stats['piF1{}'.format(j)])
        print('Totally {} partial instances predicted for all'.format(
            sum(cv_stats['piPropose{}'.format(j)])))
    print('log stats:')
    # evaluate.print_counts(log_stats)
    if rstats:
        evaluate.print_counts(r_stats)
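
# The partial-match ("pi*") statistics in test above count a prediction as correct
# when the mean compute_overlap score over its arguments reaches the threshold
# (0.7 by default). A small usage example of compute_overlap with made-up offsets
# (the numbers are illustrative only): a predicted span (0, 10) against a true
# span (2, 12) overlaps by 8 characters, so precision = recall = 0.8 and the
# returned F1 is 0.8, which counts as a partial match at threshold 0.7 but not
# at threshold 0.9.
score = compute_overlap((0, 10), (2, 12))  # expected to be 0.8
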