def normalize(model, page_tuples, page_labels, page_num_labels):
    """Rescale every label's learner coefficients by a score threshold.

    For each label, the positive classifier scores over all pages are
    collected and sorted.  Each sorted score (from the second one on) is
    tried as a decision level; the level with the fewest errors
    (false negatives + false positives) wins, and ``factor`` is the
    midpoint between that score and the one just below it.  The ``c1`` and
    ``c0`` entries of every learner are then divided by ``factor`` for that
    label.  Labels with at most one positive score get their coefficients
    set to 0.0.

    NOTE(review): a second definition of ``normalize`` follows later in
    this file and shadows this one at import time.

    Args:
        model: dict with at least ``'learners'`` (a sequence of dicts that
            hold per-label ``'c1'`` and ``'c0'`` sequences) and ``'labels'``.
        page_tuples: per-page tuple data passed through to
            ``classifier_tuples``; ``len(page_tuples[0])`` is taken as the
            number of learners.
        page_labels: per-page sequences in which ``1`` marks a label as
            present on the page.
        page_num_labels: unused; kept for interface compatibility.

    Returns:
        The same ``model`` object, modified in place.
    """
    num_pages = len(page_labels)
    num_labels = len(page_labels[0])
    num_tuples = len(page_tuples[0])

    # NOTE(review): the result is not used here; the call is kept in case
    # get_page_count has side effects -- confirm and remove if it is pure.
    get_page_count(page_labels)

    # Positive scores per label: all of them, and split by whether the page
    # actually carries the label.
    score_right = [[] for _ in range(num_labels)]
    score_wrong = [[] for _ in range(num_labels)]
    score_both = [[] for _ in range(num_labels)]
    for p in range(num_pages):
        scores = classifier_tuples(model, page_tuples, p, TVDBG_P, TVDBG_L)
        for l in range(num_labels):
            score = scores[l]
            if score > 0:
                score_both[l].append(score)
                if page_labels[p][l] == 1:
                    score_right[l].append(score)
                else:
                    score_wrong[l].append(score)

    for l in range(num_labels):
        # Reset per label so a value from the previous label can never leak
        # into this one.
        factor = 0.0
        levels = sorted(score_both[l])  # sorted once, not once per candidate

        if len(levels) <= 1:
            print('ERROR: set factor to 0 since only {} positive scores for label {} = {}'.format(
                len(levels), l, model['labels'][l]))
        else:
            # FN + FP can never exceed num_pages, so num_pages + 1 is a
            # strict upper bound and the first candidate is always accepted.
            min_errs = num_pages + 1
            for i in range(1, len(levels)):
                level = levels[i]
                # False negative at this level: labelled page scoring below it.
                false_neg = sum(1 for s in score_right[l] if s < level)
                # False positive at this level: unlabelled page scoring at or
                # above it.
                false_pos = sum(1 for s in score_wrong[l] if s >= level)
                errs = false_neg + false_pos
                if errs < min_errs:
                    min_errs = errs
                    factor = (levels[i - 1] + level) / 2.0

        for t in range(num_tuples):
            learner = model['learners'][t]
            if factor == 0.0:
                learner['c1'][l] = 0.0
                learner['c0'][l] = 0.0
            else:
                learner['c1'][l] = learner['c1'][l] / factor
                learner['c0'][l] = learner['c0'][l] / factor

    # NOTE(review): the scores are discarded; this pass is kept because
    # classifier_tuples receives the TVDBG_* selectors and may emit debug
    # output for the rescaled model -- confirm and remove if not needed.
    for p in range(num_pages):
        classifier_tuples(model, page_tuples, p, TVDBG_P, TVDBG_L)

    return model
def normalize(model, page_tuples, page_labels, page_num_labels):
    """Rescale every label's learner coefficients by a score threshold.

    For each label, the positive classifier scores over all pages are
    collected and sorted.  Each sorted score (from the second one on) is
    tried as a decision level; the level with the fewest errors
    (false negatives + false positives) wins, and ``factor`` is the
    midpoint between that score and the one just below it.  The ``c1`` and
    ``c0`` entries of every learner are then divided by ``factor`` for that
    label.  Labels with at most one positive score get their coefficients
    set to 0.0.

    NOTE(review): an earlier definition of ``normalize`` exists in this
    file; this later one is the definition that is in effect after import.

    Args:
        model: dict with at least ``'learners'`` (a sequence of dicts that
            hold per-label ``'c1'`` and ``'c0'`` sequences) and ``'labels'``.
        page_tuples: per-page tuple data passed through to
            ``classifier_tuples``; ``len(page_tuples[0])`` is taken as the
            number of learners.
        page_labels: per-page sequences in which ``1`` marks a label as
            present on the page.
        page_num_labels: unused; kept for interface compatibility.

    Returns:
        The same ``model`` object, modified in place.
    """
    num_pages = len(page_labels)
    num_labels = len(page_labels[0])
    num_tuples = len(page_tuples[0])

    # NOTE(review): the result is not used here; the call is kept in case
    # get_page_count has side effects -- confirm and remove if it is pure.
    get_page_count(page_labels)

    # Positive scores per label: all of them, and split by whether the page
    # actually carries the label.
    score_right = [[] for _ in range(num_labels)]
    score_wrong = [[] for _ in range(num_labels)]
    score_both = [[] for _ in range(num_labels)]
    for p in range(num_pages):
        scores = classifier_tuples(model, page_tuples, p, TVDBG_P, TVDBG_L)
        for l in range(num_labels):
            score = scores[l]
            if score > 0:
                score_both[l].append(score)
                if page_labels[p][l] == 1:
                    score_right[l].append(score)
                else:
                    score_wrong[l].append(score)

    for l in range(num_labels):
        # Reset per label so a value from the previous label can never leak
        # into this one.
        factor = 0.0
        levels = sorted(score_both[l])  # sorted once, not once per candidate

        if len(levels) <= 1:
            print('ERROR: set factor to 0 since only {} positive scores for label {} = {}'.format(
                len(levels), l, model['labels'][l]))
        else:
            # FN + FP can never exceed num_pages, so num_pages + 1 is a
            # strict upper bound and the first candidate is always accepted.
            min_errs = num_pages + 1
            for i in range(1, len(levels)):
                level = levels[i]
                # False negative at this level: labelled page scoring below it.
                false_neg = sum(1 for s in score_right[l] if s < level)
                # False positive at this level: unlabelled page scoring at or
                # above it.
                false_pos = sum(1 for s in score_wrong[l] if s >= level)
                errs = false_neg + false_pos
                if errs < min_errs:
                    min_errs = errs
                    factor = (levels[i - 1] + level) / 2.0

        for t in range(num_tuples):
            learner = model['learners'][t]
            if factor == 0.0:
                learner['c1'][l] = 0.0
                learner['c0'][l] = 0.0
            else:
                learner['c1'][l] = learner['c1'][l] / factor
                learner['c0'][l] = learner['c0'][l] / factor

    # NOTE(review): the scores are discarded; this pass is kept because
    # classifier_tuples receives the TVDBG_* selectors and may emit debug
    # output for the rescaled model -- confirm and remove if not needed.
    for p in range(num_pages):
        classifier_tuples(model, page_tuples, p, TVDBG_P, TVDBG_L)

    return model
def score_errors(model, page_tuples, page_labels, page_num_labels, cutoff, tvdbg_p, tvdbg_l):
    """Return per-label weighted TP, TN, FP and FN fractions over all pages.

    Each page is scored with ``classifier_tuples``.  A positive score on a
    page counts towards TP (label is 1) or FP (label is 0), weighted by
    ``score / max(page scores)``.  A non-positive score counts 1.0 towards
    FN (label is 1) or TN (label is 0).  Finally the four totals of each
    label are divided by their sum.

    Args:
        model: classifier model, passed through to ``classifier_tuples``.
        page_tuples: per-page tuple data, passed through to
            ``classifier_tuples``.
        page_labels: per-page sequences of 0/1 label indicators.
        page_num_labels: per-page number of labels to keep when ``cutoff``
            is truthy.
        cutoff: when truthy, on every page all scores below the
            ``page_num_labels[p]``-th highest score are replaced by -1
            before counting.
        tvdbg_p: debug page selector, passed through to ``classifier_tuples``.
        tvdbg_l: debug label selector, passed through to ``classifier_tuples``.

    Returns:
        Tuple ``(TP, TN, FP, FN)`` of lists with one float per label.
    """
    num_pages = len(page_labels)
    num_labels = len(page_labels[0])

    # Weighted counts per label.
    TP = [0.0] * num_labels
    FP = [0.0] * num_labels
    FN = [0.0] * num_labels
    TN = [0.0] * num_labels

    for p in range(num_pages):
        model_score = classifier_tuples(model, page_tuples, p, tvdbg_p, tvdbg_l)

        if cutoff:
            ranked = sorted(model_score, key=float, reverse=True)
            # Keep the per-page threshold in its own variable: storing it in
            # `cutoff` would turn the flag off for all following pages as
            # soon as one page's threshold happened to be 0.
            # NOTE(review): for page_num_labels[p] == 0 the index is -1, i.e.
            # the lowest score, so nothing is suppressed -- confirm intended.
            threshold = ranked[page_num_labels[p] - 1]
            for l in range(len(model_score)):
                if model_score[l] < threshold:
                    model_score[l] = -1

        # Highest score on the page; positive scores are weighted relative
        # to it.
        weight_max = max(model_score)

        for l in range(num_labels):
            score = model_score[l]
            label = page_labels[p][l]
            if score > 0:
                # score > 0 implies weight_max > 0.  float() avoids integer
                # truncation under Python 2 when scores are ints.
                weight = float(score) / float(weight_max)
                if label == 1:
                    TP[l] += weight   # true positive
                elif label == 0:
                    FP[l] += weight   # false positive
            elif score <= 0:
                if label == 1:
                    FN[l] += 1.0      # false negative
                elif label == 0:
                    TN[l] += 1.0      # true negative

    # Once all pages are processed, normalize TP, TN, FP, FN to sum to 1.
    for l in range(num_labels):
        den = TP[l] + TN[l] + FP[l] + FN[l]
        if den == 0:
            # Nothing was counted for this label (e.g. indicators other
            # than 0/1); leave the zeros instead of dividing by zero.
            continue
        TP[l] = TP[l] / den
        TN[l] = TN[l] / den
        FP[l] = FP[l] / den
        FN[l] = FN[l] / den

    return TP, TN, FP, FN