def Supervised(filename, old_files=None, stop='', stopat=1, error='none',
               interval=100000, starting=1, seed=0, step=10,
               learner='svm_linear', boost=None):
    """Simulate a review driven by a supervised learner (or a boosted vote).

    Args:
        filename: Target dataset name, passed to ``MAR.create``.
        old_files: Previously reviewed datasets, passed to ``MAR.create``.
            ``None`` (the default) is treated as an empty list.
        stop: ``'est'`` turns on ``read.enable_est``; any other value turns
            it off.
        stopat: Coerced to ``float``. Not consulted by the loop in this
            variant; kept for signature compatibility.
        error: Unused in this variant; kept for signature compatibility.
        interval: Stored on ``read.interval``.
        starting: Unused in this variant; kept for signature compatibility.
        seed: Seeds ``np.random`` and is stored on ``read.seed``.
        step: Batch size; stored on ``read.step`` and used to slice each
            query result.
        learner: Learner name forwarded to ``read.train_supervised``.
        boost: When truthy, ``util.vote`` is run with this classifier name
            and the function returns immediately.

    Returns:
        The ``MAR`` instance once every document is coded, or ``None`` when
        ``boost`` is truthy.
    """
    # A fresh list per call; a literal ``[]`` default would be shared
    # between calls.
    old_files = [] if old_files is None else old_files

    print("FILENAME: ", filename, "OLDFILES: ", len(old_files))
    stopat = float(stopat)
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename, old_files)
    read.step = step
    read.interval = interval
    read.seed = seed

    if boost:
        util.vote(read, clf_name=boost, seed=seed, all=False,
                  temp=str(seed) + filename)
        return

    # Result unused here; call retained in case get_allpos() has side
    # effects -- TODO confirm against MAR.
    read.get_allpos()

    read.enable_est = (stop == 'est')

    if boost is None:
        read.train_supervised(learner, seed)

    # Return value is not needed; presumably this refreshes read.est_num
    # before it is recorded below -- TODO confirm against MAR.get_numbers.
    read.get_numbers()

    # NOTE(review): `boost` is always falsy past the early return above, so
    # the query_boost() branches below never execute as written.
    if boost:
        read.query_boost()
    else:
        read.query_supervised()
    read.record['est'][0] = read.est_num

    while True:
        pos, neg, total = read.get_numbers()
        if pos + neg >= total:
            # Every document has been coded.
            break
        if boost:
            ids = read.query_boost()[:read.step]
        else:
            ids = read.query_supervised()[:read.step]
        read.code_batch(ids)
    return read
def TEST_AL(filename, old_files=None, stop='est', stopat=1, error='none',
            interval=100000, starting=1, seed=0, step=10):
    """Simulate an active-learning review with random warm-up sampling.

    Args:
        filename: Target dataset name, passed to ``MAR.create``.
        old_files: Previously reviewed datasets, passed to ``MAR.create``.
            ``None`` (the default) is treated as an empty list.
        stop: ``'est'`` turns on ``read.enable_est``; any other value turns
            it off.
        stopat: Target recall fraction (coerced to ``float``); scales both
            the positive-count target and the estimate in the stop rule.
        error: Error mode forwarded to ``read.code_error``.
        interval: Stored on ``read.interval``.
        starting: Number of positives required before training begins;
            until then documents come from ``read.random()``.
        seed: Seeds ``np.random``.
        step: Stored on ``read.step``.

    Returns:
        The ``MAR`` instance after the simulation stops.
    """
    # A fresh list per call; a literal ``[]`` default would be shared
    # between calls.
    old_files = [] if old_files is None else old_files

    stopat = float(stopat)
    thres = 0
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename, old_files)
    read.step = step
    read.interval = interval

    target = int(read.get_allpos() * stopat)
    read.enable_est = (stop == 'est')

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # The estimate may be unavailable or non-numeric; report counts
            # only. KeyboardInterrupt/SystemExit are no longer swallowed.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            # Every document has been coded.
            break

        if pos < starting or pos + neg < thres:
            # Warm-up: not enough positives to train on yet.
            for doc_id in read.random():
                read.code_error(doc_id, error=error)
            continue

        # Only the third value returned by train() is used as the next batch.
        _, _, candidates, _ = read.train(weighting=True, pne=True)
        if pos >= target and read.est_num * stopat <= pos:
            break
        for doc_id in candidates:
            read.code_error(doc_id, error=error)
    return read
def Supervised(filename, old_files=None, stop='est', stopat=1, error='none',
               interval=100000, starting=1, seed=0, step=10):
    """Simulate a review ordered by a supervised model, with estimate stop.

    Args:
        filename: Target dataset name, passed to ``MAR.create``.
        old_files: Previously reviewed datasets, passed to ``MAR.create``.
            ``None`` (the default) is treated as an empty list.
        stop: ``'est'`` turns on ``read.enable_est``; any other value turns
            it off.
        stopat: Target recall fraction (coerced to ``float``); scales both
            the positive-count target and the estimate in the stop rule.
        error: Error mode forwarded to ``read.code_error``.
        interval: Stored on ``read.interval``.
        starting: Unused in this variant; kept for signature compatibility.
        seed: Seeds ``np.random``.
        step: Batch size; stored on ``read.step`` and used to slice each
            query result.

    Returns:
        The ``MAR`` instance after the simulation stops.
    """
    # A fresh list per call; a literal ``[]`` default would be shared
    # between calls.
    old_files = [] if old_files is None else old_files

    stopat = float(stopat)
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename, old_files)
    read.step = step
    read.interval = interval

    target = int(read.get_allpos() * stopat)
    read.enable_est = (stop == 'est')

    read.train_supervised()
    # Return values are not needed here; presumably these calls populate
    # read.est_num before it is recorded -- TODO confirm against MAR.
    read.get_numbers()
    read.query_supervised()
    read.record['est'][0] = read.est_num

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # The estimate may be unavailable or non-numeric; report counts
            # only. KeyboardInterrupt/SystemExit are no longer swallowed.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            # Every document has been coded.
            break
        if pos >= target and read.est_num * stopat <= pos:
            break

        for doc_id in read.query_supervised()[:read.step]:
            read.code_error(doc_id, error=error)
    return read
def Boosting(filename, old_files=None, stop='', stopat=1, error='none',
             interval=100000, starting=1, seed=0, step=10):
    """Simulate a review ordered by ``read.query_boost`` after ``util.vote``.

    Args:
        filename: Target dataset name, passed to ``MAR.create``.
        old_files: Previously reviewed datasets, passed to ``MAR.create``.
            ``None`` (the default) is treated as an empty list.
        stop: ``'est'`` turns on ``read.enable_est`` (and with it the
            estimate-based stop rule); any other value turns it off.
        stopat: Target recall fraction (coerced to ``float``); scales the
            estimate in the stop rule.
        error: Error mode forwarded to ``read.code_error``.
        interval: Stored on ``read.interval``.
        starting: Unused in this variant; kept for signature compatibility.
        seed: Seeds ``np.random``.
        step: Batch size; stored on ``read.step`` and used to slice each
            query result.

    Returns:
        The ``MAR`` instance after the simulation stops.
    """
    # A fresh list per call; a literal ``[]`` default would be shared
    # between calls.
    old_files = [] if old_files is None else old_files

    print("FILENAME: ", filename, "OLDFILES: ", len(old_files))
    stopat = float(stopat)
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename, old_files)
    read.step = step
    read.interval = interval
    util.vote(read)

    # Result unused here; call retained in case get_allpos() has side
    # effects -- TODO confirm against MAR.
    read.get_allpos()

    read.enable_est = (stop == 'est')

    # Return values are not needed here; presumably these calls populate
    # read.est_num before it is recorded -- TODO confirm against MAR.
    read.get_numbers()
    read.query_boost()
    read.record['est'][0] = read.est_num

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # The estimate may be unavailable or non-numeric; report counts
            # only. KeyboardInterrupt/SystemExit are no longer swallowed.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            # Every document has been coded.
            break
        if read.enable_est and read.est_num * stopat <= pos:
            break

        for doc_id in read.query_boost()[:read.step]:
            read.code_error(doc_id, error=error)
    return read
def test_semi(filename="Hall.csv"):
    """Seed a review with labelled samples, train once, then open pdb.

    Codes ``num`` randomly chosen "yes" documents and ``num * 10`` randomly
    chosen "no" documents through ``read.code_error``, enables estimation,
    trains, and stops in the debugger so the results can be inspected.

    Args:
        filename: Dataset name passed to ``MAR.create``.
    """
    num = 20
    read = MAR()
    read = read.create(filename)

    labels = np.array(read.body['label'])
    poses = np.where(labels == "yes")[0]
    negs = np.where(labels == "no")[0]

    # Positives are drawn before negatives so the sequence of random draws
    # stays the same.
    pos_sel = np.random.choice(poses, num, replace=False)
    neg_sel = np.random.choice(negs, num * 10, replace=False)

    for selection in (pos_sel, neg_sel):
        for doc_id in selection:
            read.code_error(doc_id)

    read.enable_est = True
    read.get_numbers()
    a, b, c, d = read.train()
    set_trace()
def active_learning(filename, query='', stop='true', stopat=1.00,
                    error='none', interval=100000, seed=0):
    """Simulate an active-learning review seeded by a BM25 keyword query.

    Args:
        filename: Dataset name passed to ``MAR.create``.
        query: Underscore-separated search terms handed to ``read.BM25``.
        stop: Stopping rule. ``'est'`` stops once ``stopat * read.est_num``
            is no more than the positives found; ``'soft'`` stops after 5
            consecutive rounds without a new positive (once at least 10 are
            found); ``'knee'`` stops when ``read.knee()`` is truthy (once at
            least 10 are found); any other value stops when the positives
            found reach ``int(read.get_allpos() * stopat)``.
        stopat: Target recall fraction (coerced to ``float``).
        error: Error mode forwarded to ``read.code_error``.
        interval: Accepted but not applied to ``read`` in this variant.
        seed: Seeds ``np.random``.

    Returns:
        The ``MAR`` instance after the simulation stops.
    """
    stopat = float(stopat)
    thres = 0
    starting = 1
    counter = 0
    pos_last = 0
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename)
    read.BM25(query.strip().split('_'))

    target = int(read.get_allpos() * stopat)
    read.enable_est = (stop == 'est')

    def _recode_around_knee():
        """Re-submit "no" codes from before the knee and "yes" codes after."""
        codes = np.array(read.body['code'])
        coded = np.where(codes != "undetermined")[0]
        # Order coded documents by their recorded time. The column is wrapped
        # in np.array so the index-array lookup also works on a plain list.
        seq = coded[np.argsort(np.array(read.body['time'])[coded])]
        cut = read.kneepoint * read.step
        part1 = set(seq[:cut]) & set(np.where(codes == "no")[0])
        part2 = set(seq[cut:]) & set(np.where(codes == "yes")[0])
        for doc_id in part1 | part2:
            read.code_error(doc_id, error=error)

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # The estimate may be unavailable or non-numeric; report counts
            # only. KeyboardInterrupt/SystemExit are no longer swallowed.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            # Every document has been coded.
            if stop == 'knee' and error == 'random':
                _recode_around_knee()
            break

        if pos < starting or pos + neg < thres:
            # No positive found yet: take candidates from the BM25 ranking.
            for doc_id in read.BM25_get():
                read.code_error(doc_id, error=error)
            continue

        # Of the four values train() returns, the first feeds the early
        # rounds and the third the later ones (see the end of the loop body).
        a, b, c, d = read.train(weighting=True, pne=True)

        if stop == 'est':
            if stopat * read.est_num <= pos:
                break
        elif stop == 'soft':
            if pos >= 10 and pos_last == pos:
                counter = counter + 1
            else:
                counter = 0
            pos_last = pos
            if counter >= 5:
                break
        elif stop == 'knee':
            if pos >= 10 and read.knee():
                if error == 'random':
                    _recode_around_knee()
                break
        elif pos >= target:
            break

        # Presumably `a` is the uncertainty-sampled batch and `c` the
        # certainty-sampled one -- confirm against MAR.train.
        batch = a if pos < 10 else c
        for doc_id in batch:
            read.code_error(doc_id, error=error)
    return read
def active_learning(filename, query='', stop='true', stopat=0.95,
                    error='none', interval=100000, seed=0):
    """Simulate an active-learning review seeded by a BM25 keyword query.

    Args:
        filename: Dataset name passed to ``MAR.create``.
        query: Underscore-separated search terms handed to ``read.BM25``.
        stop: Stopping rule. ``'est'`` stops once ``stopat * read.est_num``
            is no more than the positives found; ``'soft'`` stops after 5
            consecutive rounds without a new positive (once at least 10 are
            found); ``'knee'`` stops when ``read.knee()`` is truthy (once at
            least 10 are found); any other value stops when the positives
            found reach ``int(read.get_allpos() * stopat)``.
        stopat: Target recall fraction (coerced to ``float``).
        error: Error mode forwarded to ``read.code_error``.
        interval: Stored on ``read.interval``.
        seed: Seeds ``np.random``.

    Returns:
        The ``MAR`` instance after the simulation stops.
    """
    stopat = float(stopat)
    thres = 0
    starting = 1
    counter = 0
    pos_last = 0
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename)
    read.interval = interval
    read.BM25(query.strip().split('_'))

    target = int(read.get_allpos() * stopat)
    print("number of target, true/close here:", target)
    read.enable_est = (stop == 'est')

    def _recode_around_knee():
        """Re-submit "no" codes from before the knee and "yes" codes after."""
        codes = np.array(read.body['code'])
        coded = np.where(codes != "undetermined")[0]
        # Order coded documents by their recorded time. The column is wrapped
        # in np.array so the index-array lookup also works on a plain list.
        seq = coded[np.argsort(np.array(read.body['time'])[coded])]
        cut = read.kneepoint * read.step
        part1 = set(seq[:cut]) & set(np.where(codes == "no")[0])
        part2 = set(seq[cut:]) & set(np.where(codes == "yes")[0])
        for doc_id in part1 | part2:
            read.code_error(doc_id, error=error)

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # The estimate may be unavailable or non-numeric; report counts
            # only. KeyboardInterrupt/SystemExit are no longer swallowed.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            # Every document has been coded.
            if stop == 'knee' and error == 'random':
                _recode_around_knee()
            break

        if pos < starting or pos + neg < thres:
            # No positive found yet: take candidates from the BM25 ranking.
            for doc_id in read.BM25_get():
                read.code_error(doc_id, error=error)
            continue

        # Of the four values train() returns, the first feeds the early
        # rounds and the third the later ones (see the end of the loop body).
        a, b, c, d = read.train(weighting=True, pne=True)

        if stop == 'est':
            if stopat * read.est_num <= pos:
                break
        elif stop == 'soft':
            if pos >= 10 and pos_last == pos:
                counter = counter + 1
            else:
                counter = 0
            pos_last = pos
            if counter >= 5:
                break
        elif stop == 'knee':
            if pos >= 10 and read.knee():
                if error == 'random':
                    _recode_around_knee()
                break
        elif pos >= target:
            break

        # Presumably `a` is the uncertainty-sampled batch and `c` the
        # certainty-sampled one -- confirm against MAR.train.
        batch = a if pos < 10 else c
        for doc_id in batch:
            read.code_error(doc_id, error=error)
    return read
from __future__ import print_function, division import sys, os root = os.getcwd().split("src")[0] + "src/src/util" sys.path.append(root) from mar import MAR from pdb import set_trace if __name__ == "__main__": data_path = "Hall.csv" target_recall = 0.95 thres = 10 query = "defect prediction" read = MAR() read = read.create(data_path) read.enable_est = True if query: read.BM25(query.split()) while True: pos, neg, total = read.get_numbers() try: print("%d, %d, %d" % (pos, pos + neg, read.est_num)) except: print("%d, %d" % (pos, pos + neg)) if pos + neg >= total: break if pos < 1: if query: ids, scores = read.BM25_get()