def create_dataset_splits(data, labels, test_ratio=0.15, initial_label_rate=0.1):
    """Split a dataset for active learning and persist the split to disk.

    Two pickle files are written to the current working directory:

    * ``dataset``    -- ``(X_train, X_test, y_train, y_test)``
    * ``dataset_al`` -- ``(train_idx, test_idx, labeled_idx, unlabeled_idx)``

    Args:
        data: Object exposing the feature matrix through ``.values``.
        labels: Labels; converted with ``np.asarray``.
        test_ratio: Value forwarded to ``ToolBox.split_AL(test_ratio=...)``.
        initial_label_rate: Value forwarded to
            ``ToolBox.split_AL(initial_label_rate=...)``.
    """
    # Define X and the y target.
    X = data.values
    y = np.asarray(labels)

    # The toolbox performs the train/test and labeled/unlabeled split.
    toolbox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')
    toolbox.split_AL(test_ratio=test_ratio, initial_label_rate=initial_label_rate)
    train_idx, test_idx, labeled_idx, unlabeled_idx = toolbox.get_split(0)

    X_train, X_test = X[train_idx], X[test_idx]
    # Flatten the targets so they are always 1-D.
    y_train = y[train_idx].reshape(-1)
    y_test = y[test_idx].reshape(-1)

    # Save the dataset splits.
    with open('dataset', 'wb') as f:
        pickle.dump((X_train, X_test, y_train, y_test), f)

    # Save the split indexes for the active-learning experiments.
    with open('dataset_al', 'wb') as f:
        pickle.dump((train_idx, test_idx, labeled_idx, unlabeled_idx), f)
def __init__(self, dataset, labels, testset, testlab, model, phase, path,
             stopping, measure='nearest_neighbor', distance='linear'):
    """Prepare an active-learning run with a residue or distribution query strategy.

    Args:
        dataset: Feature matrix handed to the ALiPy ``ToolBox``.
        labels: Targets for ``dataset``; also used to derive ``self.classes``.
        testset: Stored on the instance as ``self.testset``.
        testlab: Stored on the instance as ``self.testlab``.
        model: Stored on the instance as ``self.model``.
        phase: Stored on the instance as ``self.phase``.
        path: Sub-directory (relative to ``.``) used as the toolbox saving path.
        stopping: Indexable pair; ``stopping[0]`` is the criterion name and
            ``stopping[1]`` its value, both forwarded to
            ``ToolBox.get_stopping_criterion``.
        measure: ``'residue'`` selects ``QueryInstanceResidueRegressor``;
            any other value is passed to ``QueryInstanceDistribution``.
        distance: Forwarded to ``QueryInstanceResidueRegressor`` only.
    """
    self.dataset = dataset
    self.labels = labels
    self.testset = testset
    self.testlab = testlab
    self.model = model
    self.phase = phase
    self.classes = int(max(labels))

    # The toolbox receives an all-zero integer label vector instead of the
    # real labels. ``np.int`` was removed in NumPy 1.24, so the builtin
    # ``int`` is used as the dtype.
    self.alibox = ToolBox(X=dataset,
                          y=np.zeros(len(labels), dtype=int),
                          query_type='AllLabels',
                          saving_path='./%s' % path)
    self.alibox.split_AL(test_ratio=0, initial_label_rate=0.05, split_count=1)
    self.stopping_criterion = self.alibox.get_stopping_criterion(
        stopping[0], value=stopping[1])

    self.measure = measure
    if measure == 'residue':
        self.query_strategy = QueryInstanceResidueRegressor(
            X=self.dataset, y=self.labels, distance=distance)
    else:
        self.query_strategy = QueryInstanceDistribution(measure=measure)
    self.random = QueryRandom()

    # Result containers filled in by other methods.
    self.unc_result = []
    self.title = ''
    self.loss = []
    self.path = path
    self.one = self.two = self.three = self.four = self.five = self.six = None
    self.max, self.mae, self.mse, self.evs, self.r2 = [], [], [], [], []
    self.sample = []
def __init__(self, dataset, labels, testset, testlab, model, phase, path, stopping):
    """Prepare an active-learning run using least-confident uncertainty sampling.

    Args:
        dataset: Feature matrix handed to the ALiPy ``ToolBox``.
        labels: Labels for ``dataset``; also used to derive ``self.classes``.
        testset: Stored on the instance as ``self.testset``.
        testlab: Stored on the instance as ``self.testlab``.
        model: Stored on the instance as ``self.model``.
        phase: Stored on the instance as ``self.phase``.
        path: Sub-directory (relative to ``.``) used as the toolbox saving path.
        stopping: Indexable pair; ``stopping[0]`` is the criterion name and
            ``stopping[1]`` its value.
    """
    # Plain inputs.
    self.dataset = dataset
    self.labels = labels
    self.testset = testset
    self.testlab = testlab
    self.model = model
    self.phase = phase
    self.classes = int(max(labels))

    # ALiPy toolbox with a single split and no held-out test part.
    toolbox = ToolBox(
        X=dataset,
        y=labels,
        query_type='AllLabels',
        saving_path='./%s' % path,
    )
    self.alibox = toolbox
    toolbox.split_AL(test_ratio=0, initial_label_rate=0.05, split_count=1)
    self.stopping_criterion = toolbox.get_stopping_criterion(
        stopping[0], value=stopping[1])

    # Query strategies: uncertainty sampling plus a random strategy.
    self.query_strategy = QueryInstanceUncertainty(
        X=dataset, y=labels, measure='least_confident')
    self.random = QueryRandom()

    # Result containers; every metric gets its own fresh list.
    self.unc_result = []
    self.title = ''
    for metric_name in ('acc', 'gmeans', 'recall', 'precision', 'specificity',
                        'auc', 'f1', 'pos', 'neg', 'ratio', 'loss', 'mcc'):
        setattr(self, metric_name, [])
    self.path = path
def AC_(self, X, y):
    """Fit the toolbox's default model on ``(X, y)`` and return trust values.

    Each trust value is ``math.exp`` of one entry of ``model.class_weight``.

    Args:
        X: Feature matrix.
        y: Labels for ``X``.

    Returns:
        list: One ``math.exp(w[i])`` value per index along the first axis of
        ``model.class_weight``.
    """
    toolbox = ToolBox(X=X, y=y, query_type='AllLabels')
    toolbox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10)
    model = toolbox.get_default_model()

    # The model is trained on the full data set, not on the split above.
    model.fit(X, y)
    pred = model.predict(X)  # computed but not used below

    # Turn the weight entries into trust values.
    # NOTE(review): ``class_weight`` is read as if it were an array with a
    # ``.shape``; confirm this is the intended attribute of the default model
    # (fitted coefficients may have been meant).
    w = model.class_weight
    dim = w.shape[0]
    return [math.exp(w[i]) for i in range(dim)]
from alipy import ToolBox from alipy.query_strategy.multi_label import * X, y = load_iris(return_X_y=True) mlb = OneHotEncoder() mult_y = mlb.fit_transform(y.reshape((-1, 1))) mult_y = np.asarray(mult_y.todense()) # Or generate a dataset with any sizes # X, mult_y = make_multilabel_classification(n_samples=5000, n_features=20, n_classes=5, length=5) # Since we are using the label ranking model, the label 0 means unknown. we need to # set the 0 entries to -1 which means irrelevant. mult_y[mult_y == 0] = -1 alibox = ToolBox(X=X, y=mult_y, query_type='PartLabels') alibox.split_AL(test_ratio=0.2, initial_label_rate=0.05, all_class=False) def main_loop(alibox, round, strategy): train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round) # Get intermediate results saver for one fold experiment saver = alibox.get_stateio(round) # base model model = LabelRankingModel() # A simple stopping criterion to specify the query budget. while len(label_ind) <= 120: # query and update select_labs = strategy.select(label_ind, unlab_ind) # use cost to record the amount of queried instance-label pairs
import copy from sklearn.datasets import load_iris from alipy import ToolBox X, y = load_iris(return_X_y=True) alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.') # Split data alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10) # Use the default Logistic Regression classifier model = alibox.get_default_model() # The cost budget is 50 times querying stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 50) # Use pre-defined strategy QBCStrategy = alibox.get_query_strategy(strategy_name='QueryInstanceQBC') QBC_result = [] for round in range(10): # Get the data split of one fold experiment train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round) # Get intermediate results saver for one fold experiment saver = alibox.get_stateio(round) while not stopping_criterion.is_stop(): # Select a subset of Uind according to the query strategy # Passing model=None to use the default model for evaluating the committees' disagreement select_ind = QBCStrategy.select(label_ind, unlab_ind,
def create_and_implement_strategy(strategy_name, data, labels, queries):
    """Run one active-learning experiment with the named ALiPy query strategy.

    The split indexes are read from the ``dataset_al`` pickle in the current
    working directory. The default toolbox model is trained on the initially
    labeled examples, then one example is queried per iteration until
    ``queries`` queries have been made. Test accuracy is recorded in a
    ``StateIO`` saver after every step.

    Args:
        strategy_name: Name passed to ``ToolBox.get_query_strategy``.
        data: Object exposing the feature matrix through ``.values``.
        labels: Labels; converted with ``np.asarray``.
        queries: Query budget for the ``'num_of_queries'`` stopping criterion.

    Returns:
        list: A single-element list holding a deep copy of the final saver.
    """
    # Keep only the raw values; the global split is applied through indexes.
    X = data.values
    y = np.asarray(labels)

    toolbox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')
    model = toolbox.get_default_model()
    query_strategy = toolbox.get_query_strategy(strategy_name=strategy_name)
    # Stop once `queries` queries have been issued.
    stopping_criterion = toolbox.get_stopping_criterion('num_of_queries', queries)

    # NOTE: pickle must only be used on trusted files; `dataset_al` is
    # expected to be a locally generated split file.
    with open("dataset_al", "rb") as f:
        train_idx, test_idx, labeled_idx, unlabeled_idx = pickle.load(f)

    saver = StateIO(round=0, train_idx=train_idx, test_idx=test_idx,
                    init_L=labeled_idx, init_U=unlabeled_idx, saving_path='.')

    def fit_and_score():
        """Refit on the current labeled set and return test accuracy."""
        model.fit(X=X[labeled_idx.index, :], y=y[labeled_idx.index])
        y_pred = model.predict(X[test_idx, :])
        return toolbox.calc_performance_metric(
            y_true=y[test_idx], y_pred=y_pred,
            performance_metric='accuracy_score')

    # Baseline: performance with only the initially labeled examples.
    saver.set_initial_point(fit_and_score())

    while not stopping_criterion.is_stop():
        # Pick one example from the unlabeled pool and move it to the labeled set.
        example = query_strategy.select(labeled_idx, unlabeled_idx,
                                        model=model, batch_size=1)
        labeled_idx.update(example)
        unlabeled_idx.difference_update(example)

        # Retrain with the added example and record the result.
        accuracy = fit_and_score()
        state = toolbox.State(select_index=example, performance=accuracy)
        saver.add_state(state)
        saver.save()

        # Update progress for the stopping criterion.
        stopping_criterion.update_information(saver)
    stopping_criterion.reset()

    examples = [copy.deepcopy(saver)]

    # To export the final actively-selected training set, uncomment the lines
    # below. They index through `labeled_idx.index`, like the training code
    # above, and flatten the target so 1-D and column-shaped labels both work.
    # X_train = X[labeled_idx.index, :]
    # y_train = np.asarray(y[labeled_idx.index]).reshape(-1)
    # with open('qbc_dataset', 'wb') as f:
    #     pickle.dump((X_train, y_train), f)
    return examples
datasetnames = np.load('datasetname.npy') # datasetname = 'echocardiogram' # datasetname = 'australian' # datasetname = 'blood' # datasetname = 'texture' datasetnames = ['tic-tac-toe'] for datasetname in datasetnames: dt = DataSet(datasetname, dataset_path) X = dt.X y = dt.y.ravel() y = np.asarray(y, dtype=int) alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='./experiment_result/') # Split data alibox.split_AL(test_ratio=0.3, initial_label_rate=0.05, split_count=5) # Use the default Logistic Regression classifier model = alibox.get_default_model() # The cost budget is 50 times querying stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 30) # experiment meta_regressor = joblib.load('meta_lr.joblib') # meta_query = QueryMetaData(X, y, meta_regressor) meta_result = []
# model = RandomForestClassifier() # model = SVC(gamma='auto') for testdataset in testdatasetnames: print('***********currently dataset is : ', testdataset) lcdata_uncertainty_select_list = [] lcdata_random_select_list = [] # active learning dt = DataSet(testdataset, dataset_path) X = dt.X y = dt.y.ravel() y = np.asarray(y, dtype=int) train_indexs, test_indexs, label_indexs, unlabel_indexs = split_load('./experiment_result/combination_classify/australian_lrmetadata_0.005/australian/') alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path=savefloder_path + testdataset +'/', train_idx=train_indexs, test_idx=test_indexs, label_idx=label_indexs, unlabel_idx=unlabel_indexs) # Split data # alibox.split_AL(test_ratio=test_ratio, initial_label_rate=initial_label_ratio, split_count=splitcount) # alibox. # The cost budget is 50 times querying stopping_criterion = alibox.get_stopping_criterion('num_of_queries', query_num) # experiment # meta_regressor = joblib.load('meta_lr.joblib') # meta_regressor = sgdr # meta_result = []
# rfc_meta = joblib.load('./newmetadata/rfc_p_classify_australian.joblib') # lr_meta = joblib.load('./newmetadata/lr_p_classify_australian.joblib') # Use the default Logistic Regression classifier model = LogisticRegression(solver='lbfgs') # model = RandomForestClassifier() # model = SVC(gamma='auto') for testdataset in testdatasetnames: print('***********currently dataset is : ', testdataset) # prepare dataset dt = DataSet(testdataset, dataset_path) X = dt.X y = dt.y.ravel() y = np.asarray(y, dtype=int) alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path=savefloder_path + testdataset +'/') # Split data alibox.split_AL(test_ratio=test_ratio, initial_label_rate=initial_label_ratio, split_count=splitcount) # The cost budget is 50 times querying stopping_criterion = alibox.get_stopping_criterion('num_of_queries', query_num) # generate the first five rounds data(label_index unlabel_index model_output) label_index_round = [] unlabel_index_round = [] model_output_round = [] for round in range(splitcount): label_inds_5 = [] unlabel_inds_5 = [] model_output_5 = []
test_data = test_data.drop(["FaultCause"], axis=1) all_data = pd.concat([train_data, test_data], axis=0) all_label = pd.concat([train_label, test_label], axis=0) all_data = all_data.values all_label = all_label.values all_resampled_data, all_resampled_label = SMOTE().fit_resample( all_data, all_label) all_data = all_resampled_data all_label = all_resampled_label for index in range(0, len(all_label)): all_label[index] = all_label[index] - 1 alibox = ToolBox(X=all_data, y=all_label, query_type='AllLabels', saving_path='.') alibox.split_AL(test_ratio=0.7, initial_label_rate=0.001, split_count=1) model = alibox.get_default_model() # model = AdaBoostClassifier(n_estimators=10) # model = XGBClassifier(objective="reg:logistic") # model = LogisticRegression() # rft = SVC(kernel='linear') # knn = KNeighborsClassifier(n_neighbors=7) stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 500) def main_loop(alibox, strategy, round): # Get the data split of one fold experiment
from alipy.experiment import StoppingCriteria
from alipy import ToolBox
import numpy as np

# Query budget shared by both construction styles shown below.
NUM_QUERIES = 50

# Small random binary-classification problem used only for the demonstration.
X = np.random.rand(30, 5)
y = np.random.randint(2, size=30)
alibox = ToolBox(X=X, y=y)

# --------------- Initialize ----------------
# Option 1: build the criterion object directly.
stopping_criterion = StoppingCriteria(
    stopping_criteria='num_of_queries', value=NUM_QUERIES)
# Option 2: let the toolbox build it (this replaces the object from option 1).
stopping_criterion = alibox.get_stopping_criterion(
    stopping_criteria='num_of_queries', value=NUM_QUERIES)

# --------------- Usage ----------------
while not stopping_criterion.is_stop():
    # ... query some examples and update the StateIO object here ...
    # NOTE(review): this loop is schematic. No queries are recorded and a
    # saver is requested on every pass; a real experiment presumably creates
    # the saver once and adds states to it -- confirm against the ALiPy docs.
    saver = alibox.get_stateio(round=0)
    # Feed the StateIO object to the criterion so it can track progress.
    stopping_criterion.update_information(saver)

# The condition is met and the loop has ended.
# Reset the object before running another fold.
stopping_criterion.reset()
# gbr_performance = np.vstack((gbr_performance, [testdataset, gbr_mse, gbr_mae, gbr_r2]))
# joblib.dump(gbr, testdataset + "meta_gbr.joblib")

# GaussianProcessRegressor
# kernel = DotProduct() + WhiteKernel()
# gpr = GaussianProcessRegressor(kernel=kernel, random_state=0)
# gpr.fit(metadata[:, 0:396], metadata[:, 396])

# Active learning: load the data set; labels become a flat integer array.
dt = DataSet(testdataset, dataset_path)
X = dt.X
y = np.asarray(dt.y.ravel(), dtype=int)

# Results are stored in a per-dataset directory.
alibox = ToolBox(
    X=X,
    y=y,
    query_type='AllLabels',
    saving_path='./newexperiment_result/' + testdataset + '/',
)

# Split data: 30% test, 2% initially labeled, 5 splits.
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.02, split_count=5)

# Logistic Regression classifier with the lbfgs solver.
model = LogisticRegression(solver='lbfgs')
# model = SVC(gamma='auto')

# The query budget is 30 queries.
stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 30)

# experiment
# meta_regressor = joblib.load('meta_lr.joblib')
# meta_regressor = sgdr
# # # LinearRegression
# print('train rfc')
# rfc = LogisticRegression()
# rfc.fit(metadata[:, 0:396], metadata[:, 396])
# print('done')

# active learning
dt = DataSet(testdataset, dataset_path)
X = dt.X
y = dt.y.ravel()
y = np.asarray(y, dtype=int)

# NOTE(review): unresolved merge-conflict markers (HEAD vs. df23abc) were
# committed here and made the file unparseable. The HEAD side is kept active;
# the incoming side is preserved below as a comment -- confirm which
# experiment configuration is intended and delete the other.
alibox = ToolBox(X=X, y=y, query_type='AllLabels',
                 saving_path='./preprocessing_classify_experiment_result-0.03/' + testdataset + '/')

# Split data
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.03, split_count=5)

# Incoming (df23abc) variant:
# alibox = ToolBox(X=X, y=y, query_type='AllLabels',
#                  saving_path='./n_labelleds_ethn_classify_exp/' + testdataset + '/')
# alibox.split_AL(test_ratio=0.3, initial_label_rate=0.005, split_count=10)

# Use the default Logistic Regression classifier