def create_dataset_splits(data, labels, test_ratio=0.15, initial_label_rate=0.1,
                          dataset_path='dataset', index_path='dataset_al'):
    """Split the data once with alipy and pickle both the arrays and the indexes.

    Args:
        data: Object exposing ``.values`` (the feature matrix is read from it).
        labels: Array-like of targets; converted with ``np.asarray``.
        test_ratio: Fraction passed to ``ToolBox.split_AL`` as ``test_ratio``.
        initial_label_rate: Fraction passed to ``ToolBox.split_AL`` as
            ``initial_label_rate`` (the original comment said 0.01, but the
            code has always used 0.1, which remains the default).
        dataset_path: File receiving ``(X_train, X_test, y_train, y_test)``.
        index_path: File receiving
            ``(train_idx, test_idx, labeled_idx, unlabeled_idx)``.

    Returns:
        None. The results are written to ``dataset_path`` and ``index_path``.
    """
    # Define X and y target
    X = data.values
    y = np.asarray(labels)

    # Create the alipy tool box and perform the split
    toolbox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')
    toolbox.split_AL(test_ratio=test_ratio,
                     initial_label_rate=initial_label_rate)
    train_idx, test_idx, labeled_idx, unlabeled_idx = toolbox.get_split(0)

    X_train = X[train_idx]
    X_test = X[test_idx]
    # Targets are flattened to 1-D before saving
    y_train = np.asarray(y[train_idx]).reshape(-1)
    y_test = np.asarray(y[test_idx]).reshape(-1)

    # Save dataset splits
    with open(dataset_path, 'wb') as f:
        pickle.dump((X_train, X_test, y_train, y_test), f)

    # Save dataset split indexes for active learning
    with open(index_path, 'wb') as f:
        pickle.dump((train_idx, test_idx, labeled_idx, unlabeled_idx), f)
def __init__(self, dataset, labels, testset, testlab, model, phase, path,
             stopping, measure='nearest_neighbor', distance='linear'):
    """Store the experiment inputs and prepare the alipy tooling.

    Args:
        dataset: Training features handed to ``ToolBox`` as ``X``.
        labels: Training targets; ``max(labels)`` is stored as ``classes``.
        testset: Held-out features.
        testlab: Held-out targets.
        model: Model object stored on the instance.
        phase: Stored as ``self.phase``.
        path: Sub-directory name; ``'./<path>'`` is the ToolBox saving path.
        stopping: Pair ``(criterion_name, value)`` for the stopping criterion.
        measure: ``'residue'`` selects ``QueryInstanceResidueRegressor``;
            any other value is forwarded to ``QueryInstanceDistribution``.
        distance: Forwarded to the residue strategy only.
    """
    self.dataset = dataset
    self.labels = labels
    self.testset = testset
    self.testlab = testlab
    self.model = model
    self.phase = phase
    self.classes = int(max(labels))
    # ToolBox receives an all-zero integer vector in place of the real labels.
    # The builtin ``int`` replaces ``np.int``, which NumPy 1.24 removed.
    self.alibox = ToolBox(X=dataset,
                          y=np.zeros(len(labels), dtype=int),
                          query_type='AllLabels',
                          saving_path='./%s' % path)
    self.alibox.split_AL(test_ratio=0,
                         initial_label_rate=0.05,
                         split_count=1)
    self.stopping_criterion = self.alibox.get_stopping_criterion(
        stopping[0], value=stopping[1])
    self.measure = measure
    if measure == 'residue':
        self.query_strategy = QueryInstanceResidueRegressor(
            X=self.dataset, y=self.labels, distance=distance)
    else:
        self.query_strategy = QueryInstanceDistribution(measure=measure)
    self.random = QueryRandom()
    self.unc_result = []
    self.title = ''
    self.loss = []
    self.path = path
    # Six slots holding a rolling history of predictions (oldest = one)
    self.one = self.two = self.three = self.four = self.five = self.six = None
    # Per-evaluation metric histories
    self.max, self.mae, self.mse, self.evs, self.r2 = [], [], [], [], []
    self.sample = []
def __init__(self, dataset, labels, testset, testlab, model, phase, path,
             stopping):
    """Keep the experiment inputs and build the alipy helpers.

    ``stopping`` is a pair: its first item names the stopping criterion and
    its second item is the value passed to it. ``path`` is used for the
    ToolBox saving path (``'./<path>'``) and stored on the instance.
    """
    # Inputs
    self.dataset, self.labels = dataset, labels
    self.testset, self.testlab = testset, testlab
    self.model, self.phase = model, phase
    self.classes = int(max(labels))

    # alipy tool box with a single split and no alipy-side test partition
    toolbox = ToolBox(X=dataset, y=labels, query_type='AllLabels',
                      saving_path='./%s' % path)
    toolbox.split_AL(test_ratio=0, initial_label_rate=0.05, split_count=1)
    self.alibox = toolbox

    criterion_name, criterion_value = stopping[0], stopping[1]
    self.stopping_criterion = toolbox.get_stopping_criterion(
        criterion_name, value=criterion_value)

    # Query strategies: least-confident uncertainty plus a random baseline
    self.query_strategy = QueryInstanceUncertainty(
        X=dataset, y=labels, measure='least_confident')
    self.random = QueryRandom()

    self.unc_result = []
    self.title = ''
    # One independent history list per tracked quantity
    for history in ('acc', 'gmeans', 'recall', 'precision', 'specificity',
                    'auc', 'f1', 'pos', 'neg', 'ratio', 'loss', 'mcc'):
        setattr(self, history, [])
    self.path = path
def AC_(self, X, y):
    """Fit alipy's default model on (X, y) and return exponentiated weights.

    Returns:
        list of ``math.exp(w[i])`` for each leading-axis entry of the
        model's ``class_weight`` attribute.
    """
    toolbox = ToolBox(X=X, y=y, query_type='AllLabels')
    toolbox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10)
    classifier = toolbox.get_default_model()

    classifier.fit(X, y)
    classifier.predict(X)  # prediction result is not used

    # Turn the coefficient matrix into trust values and return them.
    # NOTE(review): this reads ``class_weight``; confirm that the default
    # model actually exposes an array there (``.shape`` is required).
    weights = classifier.class_weight
    # math.exp(x) returns e raised to the power x
    return [math.exp(weights[row]) for row in range(weights.shape[0])]
return_indicator='dense', return_distributions=False, random_state=None) y[y == 0] = -1 # the cost of each class cost = [1, 3, 3, 7, 10] # if node_i is the parent of node_j , then label_tree(i,j)=1 else 0 label_tree = np.zeros((5, 5), dtype=np.int) label_tree[0, 1] = 1 label_tree[0, 2] = 1 label_tree[1, 3] = 1 label_tree[2, 4] = 1 alibox = ToolBox(X=X, y=y, query_type='PartLabels') # Split data alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10) # baseclassifier model use RFC model = RandomForestClassifier() # The budget of query budget = 40 # The cost budget is 500 stopping_criterion = alibox.get_stopping_criterion('cost_limit', 500) performance_result = [] halc_result = []
# model = RandomForestClassifier() # model = SVC(gamma='auto') for testdataset in testdatasetnames: print('***********currently dataset is : ', testdataset) lcdata_uncertainty_select_list = [] lcdata_random_select_list = [] # active learning dt = DataSet(testdataset, dataset_path) X = dt.X y = dt.y.ravel() y = np.asarray(y, dtype=int) train_indexs, test_indexs, label_indexs, unlabel_indexs = split_load('./experiment_result/combination_classify/australian_lrmetadata_0.005/australian/') alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path=savefloder_path + testdataset +'/', train_idx=train_indexs, test_idx=test_indexs, label_idx=label_indexs, unlabel_idx=unlabel_indexs) # Split data # alibox.split_AL(test_ratio=test_ratio, initial_label_rate=initial_label_ratio, split_count=splitcount) # alibox. # The cost budget is 50 times querying stopping_criterion = alibox.get_stopping_criterion('num_of_queries', query_num) # experiment # meta_regressor = joblib.load('meta_lr.joblib') # meta_regressor = sgdr # meta_result = []
n_features=10, n_informative=5, n_redundant=2, n_repeated=0, n_classes=2,
n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0,
hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)

alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

# Split data: 30% test, 10% of the rest initially labeled, 10 splits
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10)

# Use the default Logistic Regression classifier
model = alibox.get_default_model()

# Stop after 50 queries
stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 50)


def main_loop(alibox, strategy, round):
    # Get the data split of one fold experiment
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
# Print the list all = np.asarray(X, dtype=float) X = all[0:1020] df = pd.read_csv("sentences_data.csv")[0:1020] NANindex = df['Oracle label'].index[df['Oracle label'].apply(np.isnan)] X = np.delete(X, NANindex, 0) df = df.dropna(subset=['Oracle label']) print(df) print(len(X)) y = df[['Oracle label']].to_numpy().ravel().astype(int) print(y) print("siz", y.size) # y = np.asarray([1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]) alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.') # Split data alibox.split_AL(test_ratio=0.2, initial_label_rate=0.1, split_count=3) # Use the default Logistic Regression classifier # model = sklearn.svm.SVC(kernel='sigmoid', probability=True) # model=MLPClassifier(hidden_layer_sizes=(80,80),activation='logistic',solver='adam',max_iter=3000, alpha=0.01) model = alibox.get_default_model() # model = RandomForestClassifier() # The cost budget is 50 times querying stopping_criterion = alibox.get_stopping_criterion('num_of_queries', '100') def main_loop(alibox, strategy, round):
class TorchRegressionFold:
    """One-fold active/passive learning experiment for a torch regressor.

    The instance keeps metric histories (``mse``, ``mae``, ``max``, ``evs``,
    ``r2``, ``sample``, ``loss``) that ``train`` fills and dumps with joblib
    under ``./<path>/<fold>/``.
    """

    def __init__(self, dataset, labels, testset, testlab, model, phase, path,
                 stopping, measure='nearest_neighbor', distance='linear'):
        """Store the experiment inputs and prepare the alipy tooling.

        Args:
            dataset: Training features handed to ``ToolBox`` as ``X``.
            labels: Training targets.
            testset: Held-out features used for evaluation.
            testlab: Held-out targets used for evaluation.
            model: Network handed to ``NN.NeuralNetworkRegressor``.
            phase: ``'active'`` or ``'passive'``; picks the selection logic.
            path: Output sub-directory; also the ToolBox saving path.
            stopping: Pair ``(criterion_name, value)``.
            measure: ``'residue'`` selects ``QueryInstanceResidueRegressor``;
                any other value is forwarded to ``QueryInstanceDistribution``.
            distance: Forwarded to the residue strategy only.
        """
        self.dataset = dataset
        self.labels = labels
        self.testset = testset
        self.testlab = testlab
        self.model = model
        self.phase = phase
        self.classes = int(max(labels))
        # ToolBox receives an all-zero integer vector instead of the targets.
        # The builtin ``int`` replaces ``np.int``, removed in NumPy 1.24.
        self.alibox = ToolBox(X=dataset,
                              y=np.zeros(len(labels), dtype=int),
                              query_type='AllLabels',
                              saving_path='./%s' % path)
        self.alibox.split_AL(test_ratio=0,
                             initial_label_rate=0.05,
                             split_count=1)
        self.stopping_criterion = self.alibox.get_stopping_criterion(
            stopping[0], value=stopping[1])
        self.measure = measure
        if measure == 'residue':
            self.query_strategy = QueryInstanceResidueRegressor(
                X=self.dataset, y=self.labels, distance=distance)
        else:
            self.query_strategy = QueryInstanceDistribution(measure=measure)
        self.random = QueryRandom()
        self.unc_result = []
        self.title = ''
        self.loss = []
        self.path = path
        # Rolling history of up to six prediction vectors (oldest = one)
        self.one = self.two = self.three = self.four = self.five = self.six = None
        self.max, self.mae, self.mse, self.evs, self.r2 = [], [], [], [], []
        self.sample = []

    def _record_metrics(self, pred, n_labeled):
        """Append test-set regression metrics for ``pred``; return the MSE."""
        from sklearn.metrics import (explained_variance_score, max_error,
                                     mean_absolute_error, mean_squared_error,
                                     r2_score)
        test_mse = mean_squared_error(self.testlab, pred)
        self.mse.append(test_mse)
        self.mae.append(mean_absolute_error(self.testlab, pred))
        self.max.append(max_error(self.testlab, pred))
        self.evs.append(explained_variance_score(self.testlab, pred))
        self.r2.append(r2_score(self.testlab, pred))
        self.sample.append(n_labeled)
        return test_mse

    def train(self):
        """Run the query loop for the single fold and persist the results.

        Each iteration rescales the optimizer learning rate, selects a batch
        (by query strategy or at random, depending on ``phase`` and on how
        much of the pool is labeled), retrains and re-evaluates on the test
        set. Metric histories are dumped with joblib after the loop.
        """
        for fold in range(1):
            try:
                os.mkdir('%s/%d' % (self.path, fold))
            except FileExistsError:
                pass

            # get data split of one fold
            train_idx, test_idx, label_ind, unlab_ind = self.alibox.get_split(
                fold)
            # get intermediate results saver for one fold experiment
            saver = self.alibox.get_stateio(fold)

            # set initial performance point
            net = NN.NeuralNetworkRegressor(model=self.model, batch_size=1,
                                            device_ids=[0], epochs=50)
            net.lr_fc = 0.01
            net.initiate(self.dataset[label_ind.index],
                         self.labels[label_ind.index])
            net.predict(self.testset)
            pred = net.preds

            # evaluation
            n_total = len(label_ind) + len(unlab_ind)
            lab_init = len(label_ind)
            saver.set_initial_point(
                self._record_metrics(pred, len(label_ind.index)))

            iteration = 0
            while not self.stopping_criterion.is_stop():
                # select subsets of Uind samples according to query strategy
                iteration += 1
                # Learning rate shrinks as the labeled share of the pool grows
                lr_fc = net.lr_fc * (1 - len(label_ind.index) /
                                     (n_total * 1.001))
                for p in net.optimizer.param_groups:
                    p['lr'] = lr_fc
                print('learning rate is',
                      net.optimizer.state_dict()['param_groups'][0]['lr'])

                if self.phase == 'active':
                    if self.measure != 'residue':
                        net.predict(self.dataset[unlab_ind.index])
                    else:
                        # ``.index`` is used here as everywhere else in the
                        # class (the original indexed with the collection).
                        net.predict(self.dataset[label_ind.index])
                    pred = net.preds

                    if self.measure == 'distance':
                        if iteration == 1:
                            self._update_previous_prediction(pred)
                        else:
                            # select_ind / unlab_ind_save come from the
                            # previous iteration
                            self._update_previous_prediction(
                                pred, select_ind, unlab_ind_save)
                        previous = self._get_previous_prediction()
                    else:
                        previous = None

                    if len(label_ind) < n_total * 0.6:
                        if iteration % 10:
                            select_ind = self.query_strategy.select_by_prediction(
                                unlabel_index=unlab_ind,
                                predict=pred,
                                labels=self.labels[label_ind.index],
                                batch_size=int(lab_init * 1),
                                X_lab=self.dataset[label_ind.index],
                                X_unlab=self.dataset[unlab_ind.index],
                                previous=previous)
                        else:
                            # every 10th iteration falls back to random
                            select_ind = self.random.select(
                                label_ind, unlab_ind,
                                batch_size=int(lab_init * 1))
                    else:
                        select_ind = self.query_strategy.select_by_prediction(
                            unlabel_index=unlab_ind,
                            predict=pred,
                            labels=self.labels[label_ind.index],
                            batch_size=int(len(label_ind) * 0.3),
                            X_lab=self.dataset[label_ind.index],
                            X_unlab=self.dataset[unlab_ind.index],
                            previous=previous)
                elif self.phase == 'passive':
                    if len(label_ind) < n_total * 0.6:
                        select_ind = self.random.select(
                            label_ind, unlab_ind,
                            batch_size=int(lab_init * 1))
                    else:
                        select_ind = self.random.select(
                            label_ind, unlab_ind,
                            batch_size=int(len(label_ind) * 0.3))

                # update the datasets and previous prediction
                unlab_ind_save = unlab_ind.index
                label_ind.update(select_ind)
                unlab_ind.difference_update(select_ind)

                # update model and calc performance according to the
                # updated model
                loss = net.train(self.dataset[label_ind.index],
                                 self.labels[label_ind.index])
                net.predict(self.testset)
                pred = net.preds

                # evaluation
                test_mse = self._record_metrics(pred, len(label_ind.index))
                self.loss.append(loss)

                # save the results
                st = self.alibox.State(select_ind, test_mse)
                saver.add_state(st)
                saver.save()
                self.stopping_criterion.update_information(saver)
                # NOTE(review): source indentation was lost; the per-iteration
                # checkpoint is assumed to belong inside the query loop.
                torch.save(self.model, './%s/%d/model%d' %
                           (self.path, fold, iteration))

            self.stopping_criterion.reset()
            self.unc_result.append(copy.deepcopy(saver))
            joblib.dump(self.mse, './%s/%d/mse' % (self.path, fold))
            joblib.dump(self.mae, './%s/%d/mae' % (self.path, fold))
            joblib.dump(self.max, './%s/%d/max' % (self.path, fold))
            joblib.dump(self.evs, './%s/%d/evs' % (self.path, fold))
            joblib.dump(self.r2, './%s/%d/r2' % (self.path, fold))
            joblib.dump(self.sample, './%s/%d/sample' % (self.path, fold))
            joblib.dump(self.loss, './%s/%d/loss' % (self.path, fold))
            joblib.dump(self.testlab, './%s/%d/testlab' % (self.path, fold))
            joblib.dump(pred, './%s/%d/pred' % (self.path, fold))

        self.analyser = self.alibox.get_experiment_analyser(
            x_axis='num_of_queries')
        self.analyser.add_method(
            method_name='QueryInstanceDistribution-distance',
            method_results=self.unc_result)
        print(self.analyser)

    def _update_previous_prediction(self, new, selected=None, unlab=None):
        """Shift the prediction history by one slot and store ``new`` in six.

        When a history exists, the entries at the positions of ``selected``
        within ``unlab`` are deleted from each stored vector while shifting.
        """
        if self.six is not None:
            del_ind = [unlab.index(i) for i in selected]
            slots = ('one', 'two', 'three', 'four', 'five', 'six')
            # Oldest first, so each slot reads its newer neighbour before
            # that neighbour is overwritten.
            for older, newer in zip(slots, slots[1:]):
                newer_pred = getattr(self, newer)
                if newer_pred is not None:
                    setattr(self, older, np.delete(newer_pred, del_ind))
        self.six = new

    def _get_previous_prediction(self):
        """Return the stored predictions stacked row-wise, oldest first.

        Falls through (returns ``None``) while fewer than four predictions
        are stored, i.e. while ``self.three`` is still ``None``.
        """
        if self.one is not None:
            return np.vstack((self.one, self.two, self.three, self.four,
                              self.five, self.six))
        elif self.two is not None:
            return np.vstack(
                (self.two, self.three, self.four, self.five, self.six))
        elif self.three is not None:
            return np.vstack((self.three, self.four, self.five, self.six))
import copy
from sklearn.datasets import load_iris
from alipy import ToolBox

X, y = load_iris(return_X_y=True)
alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

# Split data: 30% test, 10% of the rest initially labeled, 10 splits
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10)

# Use the default Logistic Regression classifier
model = alibox.get_default_model()

# Stop after 50 queries
stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 50)

# Use pre-defined strategy
QBCStrategy = alibox.get_query_strategy(strategy_name='QueryInstanceQBC')
QBC_result = []

# One pass per split created above
for round in range(10):
    # Get the data split of one fold experiment
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)

    while not stopping_criterion.is_stop():
        # Select a subset of Uind according to the query strategy
        # Passing model=None to use the default model for evaluating the committees' disagreement
        select_ind = QBCStrategy.select(label_ind, unlab_ind,
# gbr_r2 = r2_score(testmetadata[:, 396], gbr_pred) # print('In the ' + testdataset + 'GradientBoostingRegressor r2_score is : ', gbr_r2) # if gbr_performance is None: # gbr_performance = np.array([testdataset, gbr_mse, gbr_mae, gbr_r2]) # else: # gbr_performance = np.vstack((gbr_performance, [testdataset, gbr_mse, gbr_mae, gbr_r2])) # joblib.dump(gbr, testdataset + "meta_gbr.joblib") # active learning dt = DataSet(testdataset, dataset_path) X = dt.X y = dt.y.ravel() y = np.asarray(y, dtype=int) alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='./experiment_result/' + testdataset + '/') # Split data alibox.split_AL(test_ratio=0.3, initial_label_rate=0.05, split_count=5) # Use the default Logistic Regression classifier model = alibox.get_default_model() # The cost budget is 50 times querying stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 30) # experiment # meta_regressor = joblib.load('meta_lr.joblib') # meta_regressor = sgdr # meta_result = []
n_features=20, n_informative=2, n_redundant=2, n_repeated=0, n_classes=2,
n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0,
hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)

alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

# Split data: 30% test, 10% of the rest initially labeled, 10 splits
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10)

# Use the default Logistic Regression classifier
model = alibox.get_default_model()

# Stop after 50 queries
stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 50)

QBC_result = []


def main_loop(alibox, strategy, round):
    # Get the data split of one fold experiment
from alipy.experiment import StoppingCriteria
from alipy import ToolBox
import numpy as np

# Random toy data: 30 instances, 5 features, binary labels
X = np.random.rand(30, 5)
y = np.random.randint(2, size=30)
alibox = ToolBox(X=X, y=y)

# ---------------Initialize----------------
stopping_criterion = StoppingCriteria(stopping_criteria='num_of_queries', value=50)
# or init by toolbox
stopping_criterion = alibox.get_stopping_criterion(
    stopping_criteria='num_of_queries', value=50)

# ---------------Usage----------------
while not stopping_criterion.is_stop():
    #... Query some examples and update the StateIO object
    # Use the StateIO object to update stopping_criterion object
    # NOTE(review): a StateIO is requested on every pass and no split_AL
    # call is visible above - confirm this example runs as written.
    saver = alibox.get_stateio(round=0)
    stopping_criterion.update_information(saver)
# The condition is met and break the loop.
# Reset the object for another fold.
stopping_criterion.reset()
class TorchFold:
    """One-fold active/passive learning experiment for a binary torch classifier.

    The instance keeps one history list per tracked quantity (``acc``,
    ``gmeans``, ``recall``, ``precision``, ``specificity``, ``auc``, ``f1``,
    ``pos``, ``neg``, ``ratio``, ``loss``, ``mcc``). ``train`` fills them and
    dumps them (except ``loss``) with joblib under ``./<path>/<fold>/``.
    """

    def __init__(self, dataset, labels, testset, testlab, model, phase, path,
                 stopping):
        """Store the experiment inputs and prepare the alipy tooling.

        Args:
            dataset: Training features handed to ``ToolBox`` as ``X``.
            labels: Training labels handed to ``ToolBox`` as ``y``.
            testset: Held-out features used for evaluation.
            testlab: Held-out labels used for evaluation.
            model: Network handed to ``NN.NeuralNetwork``.
            phase: ``'active'`` or ``'passive'``; picks the selection logic.
            path: Output sub-directory; also the ToolBox saving path.
            stopping: Pair ``(criterion_name, value)``.
        """
        self.dataset = dataset
        self.labels = labels
        self.testset = testset
        self.testlab = testlab
        self.model = model
        self.phase = phase
        self.classes = int(max(labels))
        self.alibox = ToolBox(X=dataset, y=labels, query_type='AllLabels',
                              saving_path='./%s' % path)
        self.alibox.split_AL(test_ratio=0,
                             initial_label_rate=0.05,
                             split_count=1)
        self.stopping_criterion = self.alibox.get_stopping_criterion(
            stopping[0], value=stopping[1])
        self.query_strategy = QueryInstanceUncertainty(
            X=dataset, y=labels, measure='least_confident')
        # self.query_strategy = QueryInstanceQBC(disagreement='KL_divergence')
        self.random = QueryRandom()
        self.unc_result = []
        self.title = ''
        self.acc = []
        self.gmeans = []
        self.recall = []
        self.precision = []
        self.specificity = []
        self.auc = []
        self.f1 = []
        self.pos = []
        self.neg = []
        self.ratio = []
        self.loss = []
        self.mcc = []
        self.path = path

    def _record_metrics(self, pred, label_ind):
        """Append test-set metrics and labeled-pool class counts.

        Returns:
            ``(gmeans, conf_mat)`` for the given test predictions.
        """
        conf_mat = confusion_matrix(y_true=self.testlab, y_pred=pred)
        tn, tp = conf_mat[0, 0], conf_mat[1, 1]
        fp, fn = conf_mat[0, 1], conf_mat[1, 0]
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        specificity = tn / (tn + fp)
        gmeans = sqrt(recall * specificity)
        f1 = metrics.f1_score(y_true=self.testlab, y_pred=pred)
        auc = metrics.roc_auc_score(y_true=self.testlab, y_score=pred)
        accuracy = self.alibox.calc_performance_metric(
            y_true=self.testlab,
            y_pred=pred.reshape(list(self.testlab.shape)),
            performance_metric='accuracy_score')
        self.auc.append(auc)
        self.acc.append(accuracy)
        self.f1.append(f1)
        self.gmeans.append(gmeans)
        self.recall.append(recall)
        self.precision.append(precision)
        self.specificity.append(specificity)

        # Class balance of the currently labeled pool.
        # NOTE(review): raises ZeroDivisionError when the pool holds no
        # label-1 sample; unchanged from the original behaviour.
        lab = list(self.labels[label_ind.index])
        self.pos.append(lab.count(1))
        self.neg.append(lab.count(0))
        self.ratio.append(lab.count(0) / lab.count(1))

        mcc = ((tn * tp) - (fn * fp)) / sqrt(
            (tn + fp) * (tn + fn) * (tp + fp) * (tp + fn))
        self.mcc.append(mcc)
        return gmeans, conf_mat

    def train(self):
        """Run the query loop for the single fold and persist the results.

        Each iteration selects a batch (least-confident strategy or random,
        depending on ``phase``, the iteration number and the labeled share
        of the pool), retrains the network and re-evaluates on the test set.
        """
        for fold in range(1):
            try:
                os.mkdir('%s/%d' % (self.path, fold))
            except FileExistsError:
                pass

            # get data split of one fold
            train_idx, test_idx, label_ind, unlab_ind = self.alibox.get_split(
                fold)
            # get intermediate results saver for one fold experiment
            saver = self.alibox.get_stateio(fold)

            # set initial performance point
            net = NN.NeuralNetwork(model=self.model, num_classes=2,
                                   batch_size=500, device_ids=[0], epochs=50)
            net.lr_fc = 0.0001
            net.initiate(self.dataset[label_ind.index],
                         self.labels[label_ind.index])
            net.predict(self.testset)
            pred = net.preds

            gmeans, conf_mat = self._record_metrics(pred, label_ind)
            n_total = len(label_ind) + len(unlab_ind)
            lab_init = len(label_ind)
            saver.set_initial_point(gmeans)

            iteration = 0
            while not self.stopping_criterion.is_stop():
                # select subsets of Uind samples according to query strategy
                iteration += 1
                if self.phase == 'active':
                    net.predict(self.dataset[unlab_ind.index])
                    prob_pred = net.probablistic_matrix()
                    if len(label_ind) < n_total * 0.3:
                        if iteration % 10:
                            select_ind = self.query_strategy.select_by_prediction_mat(
                                unlabel_index=unlab_ind,
                                predict=prob_pred,
                                batch_size=int(lab_init * 0.4))
                        else:
                            # every 10th iteration falls back to random
                            select_ind = self.random.select(
                                label_ind, unlab_ind,
                                batch_size=int(lab_init * 0.4))
                    else:
                        select_ind = self.query_strategy.select_by_prediction_mat(
                            unlabel_index=unlab_ind,
                            predict=prob_pred,
                            batch_size=int(len(label_ind) * 0.4))
                elif self.phase == 'passive':
                    if len(label_ind) < n_total * 0.3:
                        select_ind = self.random.select(
                            label_ind, unlab_ind,
                            batch_size=int(lab_init * 0.4))
                    else:
                        select_ind = self.random.select(
                            label_ind, unlab_ind,
                            batch_size=int(len(label_ind) * 0.4))

                label_ind.update(select_ind)
                unlab_ind.difference_update(select_ind)

                # update model and calc performance according to the
                # updated model
                loss = net.train(self.dataset[label_ind.index],
                                 self.labels[label_ind.index])
                net.predict(self.testset)
                pred = net.preds

                gmeans, conf_mat = self._record_metrics(pred, label_ind)
                self.loss.append(loss)

                # save the results
                st = self.alibox.State(select_ind, gmeans)
                saver.add_state(st)
                saver.save()
                self.stopping_criterion.update_information(saver)

                # NOTE(review): source indentation was lost; the progress
                # prints and the checkpoint are assumed to run per iteration.
                print('\n class \n0 and 1\n', self.neg[-1], self.pos[-1])
                print('\n', conf_mat)
                torch.save(self.model, './%s/%d/model%d' %
                           (self.path, fold, iteration))

            self.stopping_criterion.reset()
            self.unc_result.append(copy.deepcopy(saver))
            joblib.dump(self.auc, './%s/%d/auc' % (self.path, fold))
            joblib.dump(self.acc, './%s/%d/acc' % (self.path, fold))
            joblib.dump(self.f1, './%s/%d/f1' % (self.path, fold))
            joblib.dump(self.gmeans, './%s/%d/gmeans' % (self.path, fold))
            joblib.dump(self.recall, './%s/%d/recall' % (self.path, fold))
            joblib.dump(self.precision,
                        './%s/%d/precision' % (self.path, fold))
            joblib.dump(self.specificity,
                        './%s/%d/specificity' % (self.path, fold))
            joblib.dump(self.pos, './%s/%d/pos' % (self.path, fold))
            joblib.dump(self.neg, './%s/%d/neg' % (self.path, fold))
            joblib.dump(self.ratio, './%s/%d/ratio' % (self.path, fold))
            joblib.dump(self.mcc, './%s/%d/mcc' % (self.path, fold))

        self.analyser = self.alibox.get_experiment_analyser(
            x_axis='num_of_queries')
        self.analyser.add_method(method_name='QueryInstanceUncertaity-lc',
                                 method_results=self.unc_result)
        print(self.analyser)
import copy
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import OneHotEncoder
from alipy.query_strategy.query_type import QueryTypeAURO
from alipy.query_strategy.multi_label import LabelRankingModel
from alipy.index.multi_label_tools import get_Xy_in_multilabel
from alipy import ToolBox

X, y = load_iris(return_X_y=True)
# One-hot encode the class labels, then turn the 0 entries into -1
mlb = OneHotEncoder()
mult_y = mlb.fit_transform(y.reshape((-1, 1)))
mult_y = np.asarray(mult_y.todense())
mult_y[mult_y == 0] = -1

alibox = ToolBox(X=X, y=mult_y, query_type='PartLabels')
alibox.split_AL(test_ratio=0.2, initial_label_rate=0.05, all_class=False)

# query type strategy
AURO_results = []

for round in range(5):
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)
    # Working copy of the label matrix for this round
    query_y = mult_y.copy()
    AURO_strategy = QueryTypeAURO(X=X, y=mult_y)
    # base model
    model = LabelRankingModel()
def create_and_implement_strategy(strategy_name, data, labels, queries):
    """Run one active-learning experiment with the named alipy strategy.

    The global split is read from the pickle file ``dataset_al``. alipy's
    default model is refit after every single-instance query and its
    test-set accuracy is stored in a ``StateIO`` saver.

    Args:
        strategy_name: Name passed to ``ToolBox.get_query_strategy``.
        data: Object exposing ``.values`` (the feature matrix).
        labels: Array-like of targets; converted with ``np.asarray``.
        queries: Value of the ``'num_of_queries'`` stopping criterion.

    Returns:
        list holding a deep copy of the saver with all recorded states.
    """
    # Keep only the values of data and labels (the global split is index based)
    X = data.values
    y = np.asarray(labels)
    toolbox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

    # Default model supplied by alipy
    model = toolbox.get_default_model()
    query_strategy = toolbox.get_query_strategy(strategy_name=strategy_name)
    examples = []

    # Stop after `queries` queries
    stopping_criterion = toolbox.get_stopping_criterion('num_of_queries',
                                                        queries)

    # Get the indexes of the global split.
    # NOTE: pickle.load must only ever be pointed at a trusted local file.
    with open("dataset_al", "rb") as f:
        train_idx, test_idx, labeled_idx, unlabeled_idx = pickle.load(f)

    saver = StateIO(round=0, train_idx=train_idx, test_idx=test_idx,
                    init_L=labeled_idx, init_U=unlabeled_idx, saving_path='.')

    def _fit_and_score():
        """Refit on the labeled pool and return accuracy on the test split."""
        model.fit(X=X[labeled_idx.index, :], y=y[labeled_idx.index])
        y_pred = model.predict(X[test_idx, :])
        return toolbox.calc_performance_metric(
            y_true=y[test_idx], y_pred=y_pred,
            performance_metric='accuracy_score')

    # Starting point: model trained on the initially labeled examples
    saver.set_initial_point(_fit_and_score())

    while not stopping_criterion.is_stop():
        # Select one example of the unlabeled pool and move it to the
        # labeled pool
        example = query_strategy.select(labeled_idx, unlabeled_idx,
                                        model=model, batch_size=1)
        labeled_idx.update(example)
        unlabeled_idx.difference_update(example)

        # Retrain with the added example and record the new accuracy
        accuracy = _fit_and_score()
        state = toolbox.State(select_index=example, performance=accuracy)
        saver.add_state(state)
        saver.save()

        # Update progress for stopping criterion
        stopping_criterion.update_information(saver)

    stopping_criterion.reset()
    examples.append(copy.deepcopy(saver))

    # The original built unused X_train / y_train arrays here with
    # ``y[labeled_idx, :]``, which raises for 1-D labels. That dead code is
    # removed; to export the selected data use ``X[labeled_idx.index]`` and
    # ``y[labeled_idx.index]``.
    return examples
initial_label_ratio = 0.005
savefloder_path = './experiment_result/classical_active_learning/'
model = LogisticRegression(solver='lbfgs')

for testdataset in testdatasetnames:
    print('***********currently dataset is : ', testdataset)
    # prepare dataset
    dt = DataSet(testdataset, dataset_path)
    X = dt.X
    y = dt.y.ravel()
    y = np.asarray(y, dtype=int)
    # Results for each dataset go to their own sub-folder
    alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path=savefloder_path + testdataset + '/')

    # Split data
    alibox.split_AL(test_ratio=test_ratio, initial_label_rate=initial_label_ratio, split_count=splitcount)

    # Stop after `query_num` queries
    stopping_criterion = alibox.get_stopping_criterion('num_of_queries', query_num)

    # NOTE(review): source indentation was lost; main_loop is assumed to be
    # defined inside the per-dataset loop - confirm.
    def main_loop(alibox, strategy, round):
        # Get the data split of one fold experiment
        train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
        # Get intermediate results saver for one fold experiment
        saver = alibox.get_stateio(round)
from sklearn.datasets import load_iris
from alipy import ToolBox

X, y = load_iris(return_X_y=True)
alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

# get tools
# split_AL is called with its defaults and returns the four index groups
tr, te, lab, unlab = alibox.split_AL()
tr0, te0, lab0, unlab0 = alibox.get_split(round=0)
oracle = alibox.get_clean_oracle()
saver = alibox.get_stateio(round=0)
repo = alibox.get_repository(round=0)
rand_strategy = alibox.get_query_strategy(strategy_name="QueryInstanceRandom")
perf = alibox.calc_performance_metric(y_true=[1], y_pred=[1], performance_metric='accuracy_score')
model = alibox.get_default_model()
sc = alibox.get_stopping_criterion(stopping_criteria='num_of_queries', value=50)
analyser = alibox.get_experiment_analyser(x_axis='num_of_queries')
acethread = alibox.get_ace_threading()

# data struct defined in alipy
ind = alibox.IndexCollection([1, 2, 3])
m_ind = alibox.MultiLabelIndexCollection([(1, 0), (2, )])
st = alibox.State(select_index=[1], performance=perf)

# io
alibox.save()
# al_settings.pkl is the default name. To use another name, please pass a specific file name
# to 'saving_path' parameter when initializing the ToolBox object. (e.g., saving_path='./my_file.pkl')
alibox = ToolBox.load(path='./al_settings.pkl')
test_data = test_data.drop(["FaultCause"], axis=1) all_data = pd.concat([train_data, test_data], axis=0) all_label = pd.concat([train_label, test_label], axis=0) all_data = all_data.values all_label = all_label.values all_resampled_data, all_resampled_label = SMOTE().fit_resample( all_data, all_label) all_data = all_resampled_data all_label = all_resampled_label for index in range(0, len(all_label)): all_label[index] = all_label[index] - 1 alibox = ToolBox(X=all_data, y=all_label, query_type='AllLabels', saving_path='.') alibox.split_AL(test_ratio=0.7, initial_label_rate=0.001, split_count=1) model = alibox.get_default_model() # model = AdaBoostClassifier(n_estimators=10) # model = XGBClassifier(objective="reg:logistic") # model = LogisticRegression() # rft = SVC(kernel='linear') # knn = KNeighborsClassifier(n_neighbors=7) stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 500) def main_loop(alibox, strategy, round): # Get the data split of one fold experiment
from alipy import ToolBox
from alipy.query_strategy.multi_label import *

X, y = load_iris(return_X_y=True)
# One-hot encode the class labels into a dense label matrix
mlb = OneHotEncoder()
mult_y = mlb.fit_transform(y.reshape((-1, 1)))
mult_y = np.asarray(mult_y.todense())

# Or generate a dataset with any sizes
# X, mult_y = make_multilabel_classification(n_samples=5000, n_features=20, n_classes=5, length=5)

# Since we are using the label ranking model, the label 0 means unknown. we need to
# set the 0 entries to -1 which means irrelevant.
mult_y[mult_y == 0] = -1

alibox = ToolBox(X=X, y=mult_y, query_type='PartLabels')
alibox.split_AL(test_ratio=0.2, initial_label_rate=0.05, all_class=False)


def main_loop(alibox, round, strategy):
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)
    # base model
    model = LabelRankingModel()

    # A simple stopping criterion to specify the query budget:
    # loop while len(label_ind) is at most 120.
    while len(label_ind) <= 120:
        # query and update
        select_labs = strategy.select(label_ind, unlab_ind)
        # use cost to record the amount of queried instance-label pairs
df = pd.read_excel('Juliet_Test_Suite/combined_data_table.xlsx') df = encode_and_bind(df, 'Clang Rule') df = encode_and_bind(df, 'CodeSonar Rule') df = encode_and_bind(df, 'Severity') df = encode_and_bind(df, 'CWE') df.dropna(subset=['True Positive'], inplace=True) df = df.reindex() X = df.drop('True Positive', axis=1) y = df.loc[:, 'True Positive'] #change these parameters to alter experiment init_labels = 0.005 #initial label rate trn_tst_split = 0.2 # train test split portion stop = 300 #number of queries to execute alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.') alibox.split_AL(test_ratio=0.2, initial_label_rate=0.005, split_count=3) # model=LogisticRegression(penalty='l1',solver='liblinear') model = RandomForestClassifier(n_estimators=100) stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 300) uncertainStrategy = alibox.get_query_strategy( strategy_name='QueryInstanceUncertainty') unc_result = [] train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(0) saver = alibox.get_stateio(0) # print(y.loc[label_ind.index]) model.fit(X=X.values[label_ind.index, :], y=y.values[label_ind.index]) while not stopping_criterion.is_stop(): select_ind = uncertainStrategy.select(label_ind, unlab_ind, model=model,