示例#1
0
def normalize(model, page_tuples, page_labels, page_num_labels):
    """Rescale each label's learner coefficients by a per-label threshold.

    Every page is scored with ``classifier_tuples``.  For each label the
    positive scores are split into those from pages that carry the label
    ("right") and those that do not ("wrong").  A threshold is then chosen
    midway between two adjacent sorted scores so that the number of
    false negatives (right scores below the cut) plus false positives
    (wrong scores at or above the cut) is minimal; on ties the lowest cut
    wins.  ``c1[l]`` and ``c0[l]`` of the first ``len(page_tuples[0])``
    learners are divided by that threshold.  Labels with at most one
    positive score get both coefficients set to 0.0 and an error line is
    printed.

    Args:
        model: dict with ``'learners'`` (indexable; each learner has
            ``'c1'`` and ``'c0'`` indexable by label) and ``'labels'``
            (indexable by label, used only in the error message).
        page_tuples: passed through to ``classifier_tuples``;
            ``len(page_tuples[0])`` is the number of learners rescaled.
        page_labels: ``page_labels[p][l] == 1`` marks label ``l`` as
            present on page ``p``.
        page_num_labels: unused; kept for interface compatibility.

    Returns:
        The same ``model`` object, modified in place.
    """
    from bisect import bisect_left

    num_pages = len(page_labels)
    num_labels = len(page_labels[0])
    num_tuples = len(page_tuples[0])
    # NOTE(review): the result was never used; the call is kept only in
    # case get_page_count has side effects -- confirm and drop.
    get_page_count(page_labels)

    # Positive scores per label, split by whether the page has the label.
    score_right = [[] for _ in range(num_labels)]
    score_wrong = [[] for _ in range(num_labels)]
    for p in range(num_pages):
        scores = classifier_tuples(model, page_tuples, p, TVDBG_P, TVDBG_L)
        for l in range(num_labels):
            if scores[l] > 0:
                if page_labels[p][l] == 1:
                    score_right[l].append(scores[l])
                else:
                    score_wrong[l].append(scores[l])

    for l in range(num_labels):
        # Sort once per label so the counts below are binary searches.
        right = sorted(score_right[l])
        wrong = sorted(score_wrong[l])
        both = sorted(right + wrong)
        if len(both) <= 1:
            print('ERROR: set factor to 0 since only {} positive scores for label {} = {}'.format(
                len(both), l, model['labels'][l]))
            factor = 0.0
        else:
            # Start from infinity so the first candidate is always taken:
            # factor can never be left unbound or carried over from the
            # previous label.
            min_errs = float('inf')
            factor = 0.0
            for i in range(1, len(both)):
                cut = both[i]
                # false negatives: right scores strictly below the cut
                false_neg = bisect_left(right, cut)
                # false positives: wrong scores at or above the cut
                false_pos = len(wrong) - bisect_left(wrong, cut)
                if false_neg + false_pos < min_errs:
                    min_errs = false_neg + false_pos
                    factor = (both[i - 1] + cut) / 2.0
        for t in range(num_tuples):
            learner = model['learners'][t]
            if factor == 0.0:
                learner['c1'][l] = 0.0
                learner['c0'][l] = 0.0
            else:
                learner['c1'][l] = learner['c1'][l] / factor
                learner['c0'][l] = learner['c0'][l] / factor

    # NOTE(review): this pass rescoring every page discards its results
    # (it fed a removed debug print).  Kept because classifier_tuples is
    # called with the debug selectors and may emit output -- confirm and
    # delete.
    for p in range(num_pages):
        classifier_tuples(model, page_tuples, p, TVDBG_P, TVDBG_L)
    return model
示例#2
0
def normalize(model, page_tuples, page_labels, page_num_labels):
    """Rescale each label's learner coefficients by a per-label threshold.

    Every page is scored with ``classifier_tuples``.  For each label the
    positive scores are split into those from pages that carry the label
    ("right") and those that do not ("wrong").  A threshold is then chosen
    midway between two adjacent sorted scores so that the number of
    false negatives (right scores below the cut) plus false positives
    (wrong scores at or above the cut) is minimal; on ties the lowest cut
    wins.  ``c1[l]`` and ``c0[l]`` of the first ``len(page_tuples[0])``
    learners are divided by that threshold.  Labels with at most one
    positive score get both coefficients set to 0.0 and an error line is
    printed.

    Args:
        model: dict with ``'learners'`` (indexable; each learner has
            ``'c1'`` and ``'c0'`` indexable by label) and ``'labels'``
            (indexable by label, used only in the error message).
        page_tuples: passed through to ``classifier_tuples``;
            ``len(page_tuples[0])`` is the number of learners rescaled.
        page_labels: ``page_labels[p][l] == 1`` marks label ``l`` as
            present on page ``p``.
        page_num_labels: unused; kept for interface compatibility.

    Returns:
        The same ``model`` object, modified in place.
    """
    from bisect import bisect_left

    num_pages = len(page_labels)
    num_labels = len(page_labels[0])
    num_tuples = len(page_tuples[0])
    # NOTE(review): the result was never used; the call is kept only in
    # case get_page_count has side effects -- confirm and drop.
    get_page_count(page_labels)

    # Positive scores per label, split by whether the page has the label.
    score_right = [[] for _ in range(num_labels)]
    score_wrong = [[] for _ in range(num_labels)]
    for p in range(num_pages):
        scores = classifier_tuples(model, page_tuples, p, TVDBG_P, TVDBG_L)
        for l in range(num_labels):
            if scores[l] > 0:
                if page_labels[p][l] == 1:
                    score_right[l].append(scores[l])
                else:
                    score_wrong[l].append(scores[l])

    for l in range(num_labels):
        # Sort once per label so the counts below are binary searches.
        right = sorted(score_right[l])
        wrong = sorted(score_wrong[l])
        both = sorted(right + wrong)
        if len(both) <= 1:
            print('ERROR: set factor to 0 since only {} positive scores for label {} = {}'.format(len(both), l, model['labels'][l]))
            factor = 0.0
        else:
            # Start from infinity so the first candidate is always taken:
            # factor can never be left unbound or carried over from the
            # previous label.
            min_errs = float('inf')
            factor = 0.0
            for i in range(1, len(both)):
                cut = both[i]
                # false negatives: right scores strictly below the cut
                false_neg = bisect_left(right, cut)
                # false positives: wrong scores at or above the cut
                false_pos = len(wrong) - bisect_left(wrong, cut)
                if false_neg + false_pos < min_errs:
                    min_errs = false_neg + false_pos
                    factor = (both[i - 1] + cut) / 2.0
        for t in range(num_tuples):
            learner = model['learners'][t]
            if factor == 0.0:
                learner['c1'][l] = 0.0
                learner['c0'][l] = 0.0
            else:
                learner['c1'][l] = learner['c1'][l] / factor
                learner['c0'][l] = learner['c0'][l] / factor

    # NOTE(review): this pass rescoring every page discards its results
    # (it fed a removed debug print).  Kept because classifier_tuples is
    # called with the debug selectors and may emit output -- confirm and
    # delete.
    for p in range(num_pages):
        classifier_tuples(model, page_tuples, p, TVDBG_P, TVDBG_L)
    return model
示例#3
0
def score_errors(model, page_tuples, page_labels, page_num_labels, cutoff,
                 tvdbg_p, tvdbg_l):
    """Return per-label weighted confusion fractions over all pages.

    Each page is scored with ``classifier_tuples``.  A label with a
    positive score counts as predicted; a predicted label contributes
    ``score / max(page scores)`` to TP or FP, while a non-predicted label
    contributes 1.0 to FN or TN.  After all pages are processed the four
    totals of each label are divided by their sum so they add up to 1.

    Args:
        model, page_tuples: passed through to ``classifier_tuples``.
        page_labels: ``page_labels[p][l]`` is 1 when page ``p`` carries
            label ``l`` and 0 when it does not.
        page_num_labels: ``page_num_labels[p]`` is how many top-scoring
            labels to keep for page ``p`` when ``cutoff`` is truthy.
        cutoff: when truthy, every score below the page's
            ``page_num_labels[p]``-th highest score is replaced by -1
            before counting.
        tvdbg_p, tvdbg_l: debug selectors forwarded to
            ``classifier_tuples``.

    Returns:
        Tuple ``(TP, TN, FP, FN)`` of lists indexed by label.  A label
        that received no contribution at all keeps 0.0 in all four lists.
    """
    num_pages = len(page_labels)
    num_labels = len(page_labels[0])
    # weighted counts: TP = true positive, etc.
    TP = [0.0] * num_labels
    FP = [0.0] * num_labels
    FN = [0.0] * num_labels
    TN = [0.0] * num_labels
    for p in range(num_pages):
        model_score = classifier_tuples(model, page_tuples, p, tvdbg_p,
                                        tvdbg_l)
        if cutoff:
            # Use a separate name for the per-page threshold: overwriting
            # the `cutoff` flag would switch the cutoff off for all later
            # pages as soon as one page's threshold happened to be 0.
            ranked = sorted(model_score, key=float, reverse=True)
            threshold = ranked[page_num_labels[p] - 1]
            for l, value in enumerate(model_score):
                if value < threshold:
                    model_score[l] = -1
        # Highest score on the page; float() avoids integer truncation in
        # the weights below when scores are ints under Python 2.
        weight_max = float(max(model_score))
        for l in range(num_labels):
            score = model_score[l]
            label = page_labels[p][l]
            # score > 0 implies weight_max >= score > 0, so the divisions
            # below cannot hit zero.
            if label == 1:
                if score > 0:
                    # TP = true positive
                    TP[l] += score / weight_max
                else:
                    # FN = false negative
                    FN[l] += 1.0
            elif label == 0:
                if score > 0:
                    # FP = false positive
                    FP[l] += score / weight_max
                else:
                    # TN = true negative
                    TN[l] += 1.0
    # once all the pages are processed,
    # normalize TP, TN, FP, FN to 1
    for l in range(num_labels):
        den = TP[l] + TN[l] + FP[l] + FN[l]
        if den:
            TP[l] = TP[l] / den
            TN[l] = TN[l] / den
            FP[l] = FP[l] / den
            FN[l] = FN[l] / den
    return TP, TN, FP, FN
示例#4
0
def score_errors(model, page_tuples, page_labels, page_num_labels, cutoff, tvdbg_p, tvdbg_l):
    """Return per-label weighted confusion fractions over all pages.

    Each page is scored with ``classifier_tuples``.  A label with a
    positive score counts as predicted; a predicted label contributes
    ``score / max(page scores)`` to TP or FP, while a non-predicted label
    contributes 1.0 to FN or TN.  After all pages are processed the four
    totals of each label are divided by their sum so they add up to 1.

    Args:
        model, page_tuples: passed through to ``classifier_tuples``.
        page_labels: ``page_labels[p][l]`` is 1 when page ``p`` carries
            label ``l`` and 0 when it does not.
        page_num_labels: ``page_num_labels[p]`` is how many top-scoring
            labels to keep for page ``p`` when ``cutoff`` is truthy.
        cutoff: when truthy, every score below the page's
            ``page_num_labels[p]``-th highest score is replaced by -1
            before counting.
        tvdbg_p, tvdbg_l: debug selectors forwarded to
            ``classifier_tuples``.

    Returns:
        Tuple ``(TP, TN, FP, FN)`` of lists indexed by label.  A label
        that received no contribution at all keeps 0.0 in all four lists.
    """
    num_pages = len(page_labels)
    num_labels = len(page_labels[0])
    # weighted counts: TP = true positive, etc.
    TP = [0.0] * num_labels
    FP = [0.0] * num_labels
    FN = [0.0] * num_labels
    TN = [0.0] * num_labels
    for p in range(num_pages):
        model_score = classifier_tuples(model, page_tuples, p, tvdbg_p, tvdbg_l)
        if cutoff:
            # Use a separate name for the per-page threshold: overwriting
            # the `cutoff` flag would switch the cutoff off for all later
            # pages as soon as one page's threshold happened to be 0.
            ranked = sorted(model_score, key=float, reverse=True)
            threshold = ranked[page_num_labels[p] - 1]
            for l, value in enumerate(model_score):
                if value < threshold:
                    model_score[l] = -1
        # Highest score on the page; float() avoids integer truncation in
        # the weights below when scores are ints under Python 2.
        weight_max = float(max(model_score))
        for l in range(num_labels):
            score = model_score[l]
            label = page_labels[p][l]
            # score > 0 implies weight_max >= score > 0, so the divisions
            # below cannot hit zero.
            if label == 1:
                if score > 0:
                    # TP = true positive
                    TP[l] += score / weight_max
                else:
                    # FN = false negative
                    FN[l] += 1.0
            elif label == 0:
                if score > 0:
                    # FP = false positive
                    FP[l] += score / weight_max
                else:
                    # TN = true negative
                    TN[l] += 1.0
    # once all the pages are processed,
    # normalize TP, TN, FP, FN to 1
    for l in range(num_labels):
        den = TP[l] + TN[l] + FP[l] + FN[l]
        if den:
            TP[l] = TP[l] / den
            TN[l] = TN[l] / den
            FP[l] = FP[l] / den
            FN[l] = FN[l] / den
    return TP, TN, FP, FN