def blur_no_infogain(blue_c_imb, data_dir, task, pos_class):
    # assumed b_imb == 0.5
    # NOTE(review): a byte-identical redefinition of this function appears
    # later in the file and shadows this copy at import time -- one of the
    # two should be deleted.
    # Computes (num_pos, num_neg) sample counts for the Red data set so that
    # its class imbalance -- defined as neg / (pos + neg) -- equals
    # blue_c_imb.  d_red maps task -> positives and 'Default' -> negatives
    # (as used below); the Blue directory is derived by string replacement.
    blue_dir = data_dir.replace('Red', 'Blue')
    d_red = setup.get_label_dict_knowing(data_dir, task, pos_class)
    # d_blue is only referenced by the commented-out alternative formulas.
    d_blue = setup.get_label_dict_knowing(blue_dir, task, pos_class)
    red_c_imb = float(len(
        d_red['Default'])) / (len(d_red[task]) + len(d_red['Default']))
    blue_c_imb = float(blue_c_imb)
    if red_c_imb >= blue_c_imb:
        # can't add all negatives: keep every positive, downsample negatives
        # using neg/pos = imb / (1 - imb).
        num_pos = len(d_red[task])
        num_neg = num_pos * (blue_c_imb / float(1 - blue_c_imb))
        # num_neg = num_pos * (len(d_blue['Default'])/len(d_blue[task]))
    else:
        # can't add all positives: keep every negative, downsample positives
        # using pos/neg = (1 - imb) / imb.
        k = ((1 - blue_c_imb) / float(blue_c_imb))
        print k
        for key in d_red.keys():
            try:
                print len(d_red[key]), key
            except:
                # NOTE(review): bare except silently skips un-sized values;
                # the effective copy of this function should narrow it.
                pass
        num_neg = len(d_red['Default'])
        num_pos = num_neg * k
        # num_pos = num_neg * (len(d_blue[task])/len(d_blue['Default']))
    return int(num_pos), int(num_neg)
def blur_no_infogain(blue_c_imb, data_dir, task, pos_class): # assumed b_imb == 0.5 blue_dir = data_dir.replace('Red','Blue') d_red = setup.get_label_dict_knowing(data_dir, task, pos_class) d_blue = setup.get_label_dict_knowing(blue_dir, task, pos_class) red_c_imb=float(len(d_red['Default']))/(len(d_red[task])+len(d_red['Default'])) blue_c_imb = float(blue_c_imb) if red_c_imb >= blue_c_imb: # can't add all negatives num_pos = len(d_red[task]) num_neg = num_pos * (blue_c_imb/float(1-blue_c_imb)) # num_neg = num_pos * (len(d_blue['Default'])/len(d_blue[task])) else: # can't add all positives k = ((1-blue_c_imb)/float(blue_c_imb)) print k for key in d_red.keys(): try: print len(d_red[key]), key except: pass num_neg = len(d_red['Default']) num_pos = num_neg * k # num_pos = num_neg * (len(d_blue[task])/len(d_blue['Default'])) return int(num_pos), int(num_neg)
def what_redbox_numbers(c_imb, b_imb, data_dir, task, pos_class, b_pos, b_neg):
    # NOTE(review): a byte-identical redefinition of this function appears
    # later in the file and shadows this copy at import time.
    # big prob: after redbox sampling, imbalance has changed.
    # so actually, redbox sampling and undersampling both need to be
    # determined before either takes place.
    # other prob: given b_imb, compute num_neg num_pos
    # maybe easier is given info gain, compute num_neg num_pos
    #
    # b_pos and b_neg are unused in the body as written.
    d = setup.get_label_dict_knowing(data_dir, task, pos_class)
    # Imbalance here is neg / (pos + neg); d maps task -> positives and
    # 'Default' -> negatives (as used below).
    red_c_imb = float(len(d['Default'])) / (len(d[task]) + len(d['Default']))
    if red_c_imb >= c_imb:
        r_pos = len(d[task])
        # NOTE(review): the trailing (r_pos / (r_pos)) factor is always 1
        # (and raises ZeroDivisionError when there are no positives).
        r_neg = r_pos * (b_imb / (1 - b_imb)) * (r_pos / (r_pos))
        return r_pos, r_neg
    # if red_c_imb lower, would have to
    elif red_c_imb <= c_imb:
        # NOTE(review): this elif is equivalent to a plain else, since the
        # first branch already handles equality.
        print "class imbalance going to decrease! :D"
        return len(d[task]), len(d['Default']) * (c_imb / red_c_imb)
def what_redbox_numbers(c_imb, b_imb, data_dir, task, pos_class, b_pos, b_neg): # big prob: after redbox sampling, imbalance has changed. # so actually, redbox sampling and undersampling both need to be # determined before either takes place. # other prob: given b_imb, compute num_neg num_pos # maybe easier is given info gain, compute num_neg num_pos d = setup.get_label_dict_knowing(data_dir, task, pos_class) red_c_imb=float(len(d['Default']))/(len(d[task])+len(d['Default'])) if red_c_imb >= c_imb: r_pos = len(d[task]) r_neg = r_pos * (b_imb/(1-b_imb)) * (r_pos/(r_pos)) return r_pos, r_neg # if red_c_imb lower, would have to elif red_c_imb <= c_imb: print "class imbalance going to decrease! :D" return len(d[task]), len(d['Default'])*(c_imb/red_c_imb)
def same_amount_as_bluebox(data_dir, task, pos_class):
    # Use equally many positives and negatives: the count of positives in
    # the labelled data is returned for both classes.
    # ASSUMING MODEL LEARNS P(label|data) !
    label_dict = setup.get_label_dict_knowing(data_dir, task, pos_class)
    n_pos = len(label_dict[task])
    return n_pos, n_pos