def svm_learner(budget): accuracy = [] data = csv_reader('resources/pool.csv') testset = csv_reader('resources/testSet.csv') true_labels = oracle.read_mat() used = {} # do nothing about model until reasonable training subset achieved [row, col] = data.shape preds = np.zeros(row) selected = [] labels = [] query = 0 # query each point until get one with label 1 while 1: r = compound.next_compound(data) r_str = np.array_str(np.char.mod('%d', r)) if r_str[1: (len(r_str) - 1)] not in used: r_label = oracle.oracle2(r, data) query += 1 used[r_str[1: (len(r_str) - 1)]] = r_label selected.append(r.tolist()) labels.append(r_label) accuracy.append(error.generalization_error(preds, true_labels)) if np.sum(labels) == 1 and len(labels) > 1: accuracy.pop() break x = np.array(selected) y = np.array(labels) clf = SVC(kernel='linear') clf.fit(x, y) preds = clf.predict(data) accuracy.append(error.generalization_error(preds, true_labels)) num = 2543 - len(used) i = 0 while i < num and query < budget: r = compound.next_compound(data) r_str = np.array_str(np.char.mod('%d', r)) if r_str[1: (len(r_str) - 1)] not in used: i += 1 distance = clf.decision_function(r) if np.abs(distance[0]) <= 0.78: x = np.vstack([x, r]) r_label = oracle.oracle2(r, data) y = np.hstack([y.tolist(), r_label]) query += 1 clf.fit(x, y) preds = clf.predict(testset) accuracy.append(error.test_error(preds, true_labels)) plt.plot(accuracy) plt.show() print f1_score(preds, true_labels[0:250]) return
def svm_learner_all(): data = pool_reader() true_labels = oracle.read_mat() clf = SVC(kernel='linear') X = np.array(data) y = np.array(true_labels) clf.fit(X, y) preds = clf.predict(data) accuracy = (err.generalization_error(preds, true_labels)) print accuracy print f1_score(preds, true_labels) return accuracy
def svm_learner(option): accuracy = [] data = pool_reader() [row, col] = data.shape true_labels = oracle.read_mat() # do nothing about model until reasonable training subset achieved active_count = 0 preds = np.zeros(row) used = set() selected = [] labels = [] while 1: r = random.randint(0, row-1) if r not in used: used.add(r) selected.append(data[r].tolist()) labels.append(true_labels[r]) used.add(r) accuracy.append(err.generalization_error(preds, true_labels)) if np.sum(labels) == 1 and len(labels) > 1: accuracy.pop() break X = np.array(selected) y = np.array(labels) clf = SVC(kernel='linear') clf.fit(X, y) preds = clf.predict(data) accuracy.append(err.generalization_error(preds, true_labels)) for x in xrange(256-len(used)): if option == 'rand': # random selection strategy while 1: cur = random.randint(0, row-1) if cur not in used: break else: # farthest or say most different to previous 1 active selection strategy active = np.where(y == 0)[0].tolist() # farthest to all used cur = get_next(data, active, used) print 'oracle', true_labels[cur] used.add(cur) X = np.vstack([X, data[cur]]) y = np.hstack([y.tolist(),[true_labels[cur]]]) clf.fit(X, y) preds = clf.predict(data) accuracy.append(err.generalization_error(preds, true_labels)) print f1_score(preds, true_labels) return accuracy
def svm_margin_learner(): accuracy = [] data = pool_reader() [row, col] = data.shape true_labels = oracle.read_mat() # do nothing about model until reasonable training subset achieved preds = np.zeros(row) used = set() selected = [] labels = [] while 1: r = random.randint(0, row-1) if r not in used: used.add(r) selected.append(data[r].tolist()) labels.append(true_labels[r]) accuracy.append(err.generalization_error(preds, true_labels)) if np.sum(labels) == 1 and len(labels) > 1: accuracy.pop() break X = np.array(selected) y = np.array(labels) clf = SVC(kernel='linear') clf.fit(X, y) preds = clf.predict(data) accuracy.append(err.generalization_error(preds, true_labels)) for x in xrange(256-len(used)): # nearest to decision boundary distance = clf.decision_function(data) rank = np.argsort(np.abs(distance)) for i in xrange(len(rank)): if rank[i] not in used: cur = rank[i] break print 'oracle', true_labels[cur] used.add(cur) X = np.vstack([X, data[cur]]) y = np.hstack([y.tolist(),[true_labels[cur]]]) clf.fit(X, y) preds = clf.predict(data) accuracy.append(err.generalization_error(preds, true_labels)) print f1_score(preds, true_labels) return accuracy
def rfc_learner(option): accuracy = [] data = pool_reader() true_labels = oracle.read_mat() [row_size, col_size] = data.shape points = np.empty([0, col_size]) labels = [] used = set() flag = True predictions = np.zeros(row_size) for i in xrange(0, 256): if option == 'select': if flag: pick = random.sample(range(row_size), 1)[0] else: # pick = get_next(data, points, used) clf = RandomForestClassifier(n_estimators=10, criterion='entropy') clf.fit(points, np.array(labels)) prob = clf.predict_proba(data) weight = np.abs(prob[:, 0] - 0.5) rank = np.argsort(weight) for x in xrange(len(rank)): if rank[x] not in used: pick = rank[x] break else: while 1: pick = random.sample(range(row_size), 1)[0] if pick not in used: break used.add(pick) points = np.vstack([points, data[pick]]) if oracle.oracle1(true_labels, pick) == 1: flag = False labels.append(oracle.oracle1(true_labels, pick)) clf = RandomForestClassifier(n_estimators=10, criterion='entropy') clf.fit(points, np.array(labels)) predictions = clf.predict(data) cur_acc = err.generalization_error(predictions, true_labels) accuracy.append(cur_acc) plt.plot(accuracy) plt.show() print "f1 ", f1_score(predictions, true_labels) return accuracy
def stream_learner(method, option, budget): features = csv_reader('resources/pool.csv') [row, col] = features.shape testset = csv_reader('resources/testSet.csv') true_labels = oracle.read_mat() if method == "rf": clf = RandomForestClassifier(n_estimators=10, criterion='entropy') if method == "lr": clf = LogisticRegression(penalty='l2') accuracy = [] points = [] labels = [] used = {} flag = True query_count = 0 i = 0 pred = np.zeros(250) if option == "select": # active learner while i < 2543 and query_count < budget: if flag: # call next compound until get one point with label 1 cur_point = compound.next_compound(features) cur_str = np.array_str(np.char.mod('%d', cur_point)) if cur_str[1: (len(cur_str) - 1)] not in used: i += 1 points.append(cur_point) cur_label = oracle.oracle2(cur_point, features) labels.append(cur_label) used[cur_str[1: (len(cur_str) - 1)]] = cur_label query_count += 1 if cur_label == 1: flag = False else: clf.fit(np.asarray(points), np.array(labels)) cur_point = compound.next_compound(features) cur_str = np.array_str(np.char.mod('%d', cur_point)) if cur_str[1: (len(cur_str) - 1)] not in used: # decide if ask oracle for help i += 1 prob = clf.predict_proba(cur_point) if 0.1 <= prob[0][0] <= 0.9: points.append(cur_point) cur_label = oracle.oracle2(cur_point, features) labels.append(cur_label) query_count += 1 used[cur_str[1: (len(cur_str) - 1)]] = cur_label clf.fit(np.asarray(points), np.array(labels)) pred = clf.predict(testset) cur_acc = error.test_error(pred, true_labels) print cur_acc, " ", query_count, " ", cur_label, " ", prob[0][0], " ", prob[0][1] accuracy.append(cur_acc) else: # random learner while i < budget: cur_point = compound.next_compound(features) cur_str = np.array_str(np.char.mod('%d', cur_point)) if cur_str[1: (len(cur_str) - 1)] not in used: points.append(cur_point) cur_label = oracle.oracle2(cur_point, features) if cur_label == 1: flag = False labels.append(cur_label) used[cur_str[1: (len(cur_str) - 1)]] = cur_label query_count += 1 i += 1 if not flag: clf.fit(np.asarray(points), np.array(labels)) pred = clf.predict(testset) cur_acc = error.test_error(pred, true_labels) print cur_acc, " ", query_count, " ", cur_label accuracy.append(cur_acc) plt.plot(accuracy) plt.show() print "f1", f1_score(pred, true_labels[0:250]) return
def lrc_learner(option): accuracy = [] data = pool_reader() true_labels = oracle.read_mat() [row_size, col_size] = data.shape predictions = np.zeros(row_size) points = np.empty([0, col_size]) labels = [] used = set() flag = True for i in xrange(0, 256): if option == "select": pick = -1 if flag: while 1: pick = random.sample(range(row_size), 1)[0] if pick not in used: used.add(pick) points = np.vstack([points, data[pick]]) label = oracle.oracle1(true_labels, pick) labels.append(label) if label == 1: flag = False break else: clf = LogisticRegression() clf.fit(points, np.array(labels)) prob = clf.predict_proba(data) weight = np.abs(prob[:, 0] - 0.5) rank = np.argsort(weight) for x in xrange(len(rank)): if rank[x] not in used: pick = rank[x] break used.add(pick) points = np.vstack([points, data[pick]]) label = oracle.oracle1(true_labels, pick) labels.append(label) clf.fit(points, np.array(labels)) predictions = clf.predict(data) cur_acc = err.generalization_error(predictions, true_labels) accuracy.append(cur_acc) else: while 1: pick = random.sample(range(row_size), 1)[0] if pick not in used: break used.add(pick) points = np.vstack([points, data[pick]]) label = oracle.oracle1(true_labels, pick) labels.append(label) if label == 1: flag = False if not flag: clf = LogisticRegression() clf.fit(points, np.array(labels)) predictions = clf.predict(data) cur_acc = err.generalization_error(predictions, true_labels) accuracy.append(cur_acc) plt.plot(accuracy) plt.show() print "f1 ", f1_score(predictions, true_labels) return accuracy