def __init__(self, X, y, cb_classifier, label_inds_5=None, unlabel_inds_5=None, modelOutput_5=None):
    """Store the training data and (optionally) a pre-computed 5-fold split.

    Parameters
    ----------
    X, y : array-like
        Feature matrix and labels handed straight through to `DataSet`.
    cb_classifier : object
        Classifier kept on the instance; not used here.
    label_inds_5, unlabel_inds_5, modelOutput_5 : optional
        Pre-computed labeled/unlabeled index sets and model outputs.
        They are only adopted when ALL THREE are supplied; otherwise the
        instance falls back to empty lists and `self.flag` stays False.
    """
    self.X = X
    self.y = y
    self.cb_classifier = cb_classifier

    # Build an auxiliary DataSet purely to derive pairwise distances and
    # cluster centers for X.  NOTE(review): 'distacne' is a typo, but it is
    # a public attribute name — kept for compatibility with external readers.
    dt = DataSet('au', X=X, y=y)
    self.distacne = dt.get_distance()
    _, self.cluster_center_index = dt.get_cluster_center()

    # Adopt the caller-supplied split only when it is complete.
    supplied = (
        label_inds_5 is not None
        and unlabel_inds_5 is not None
        and modelOutput_5 is not None
    )
    self.flag = supplied
    if supplied:
        self.label_inds_5 = label_inds_5
        self.unlabel_inds_5 = unlabel_inds_5
        self.modelOutput_5 = modelOutput_5
    else:
        self.label_inds_5 = []
        self.unlabel_inds_5 = []
        self.modelOutput_5 = []
# modelnames = ['KNN', 'LR', 'RFC', 'RFR', 'DTC', 'DTR', 'SVM', 'GBC', 'ABC', 'ABR'] modelnames = ['LR'] # in the same dataset and the same ratio of initial_label_rate,the number of split. split_count = [30, 50, 70, 90] # The number of unlabel data to select to generate the meta data. num_xjselect = 30 diff_five_round = 20 n_labelleds = np.arange(2, 100, 2) # first choose a dataset for datasetname in datasetnames: dataset = DataSet(datasetname, dataset_path) X = dataset.X y = dataset.y distacne = dataset.get_distance() _, cluster_center_index = dataset.get_cluster_center() print( datasetname + ' DataSet currently being processed********************************************' ) # run multiple split on the same dataset # every time change the value of initial_label_rate for split_c in split_count: for n_labelled in n_labelleds: metadata = None # trains, tests, label_inds, unlabel_inds = dataset.split_data_by_nlabelled(n_labelled, test_ratio=0.6, split_count=split_count, saving_path='./n_labelled_split_info') trains, tests, label_inds, unlabel_inds = dataset.split_data_by_nlabelled_fulldataset(
# 'wdbc', 'clean1', 'ethn', , 'blood', 'breast-cancer-wisc' datasetnames = ['australian'] # Different types of models, each type has many models with different parameters # modelnames = ['KNN', 'LR', 'RFC', 'RFR', 'DTC', 'DTR', 'SVM', 'GBC', 'ABC', 'ABR'] modelnames = ['LR'] # in the same dataset and the same ratio of initial_label_rate,the number of split. split_count = 30 # The number of unlabel data to select to generate the meta data. num_xjselect = 30 n_labelleds = np.arange(2, 100, 2) # first choose a dataset for datasetname in datasetnames: dataset = DataSet(datasetname, dataset_path) X = dataset.X y = dataset.y distacne = dataset.get_distance() _, cluster_center_index = dataset.get_cluster_center() print( datasetname + ' DataSet currently being processed********************************************' ) metadata = None # run multiple split on the same dataset # every time change the value of initial_label_rate if datasetname in [ 'echocardiogram', 'heart', 'heart-hungarian', 'heart-statlog', 'house', 'house-votes', 'spect', 'statlog-heart', 'vertebral-column-2clases'
# NOTE(review): the next line is the tail of a multi-line import whose
# opening `from ... import (` lies above this chunk and is not visible.
QueryInstanceQUIRE, QueryRandom, QueryInstanceUncertainty, QureyExpectedErrorReduction, QueryInstanceLAL
from meta_data import DataSet
from QueryMetaData import QueryMetaData

# Location of the datasets and the list of dataset names to process.
dataset_path = './newdata/'
datasetnames = np.load('datasetname.npy')
# datasetname = 'echocardiogram'
# datasetname = 'australian'
# datasetname = 'blood'
# datasetname = 'texture'
# Override: run on a single dataset for this experiment.
datasetnames = ['tic-tac-toe']

for datasetname in datasetnames:
    dt = DataSet(datasetname, dataset_path)
    X = dt.X
    # Flatten labels and coerce to int for the classifier.
    y = dt.y.ravel()
    y = np.asarray(y, dtype=int)

    alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='./experiment_result/')
    # Split data
    alibox.split_AL(test_ratio=0.3, initial_label_rate=0.05, split_count=5)
    # Use the default Logistic Regression classifier
    model = alibox.get_default_model()
rfr_meta = joblib.load('./newmetadata/rfr_p_regression_australian.joblib') # rfc_meta = joblib.load('./newmetadata/rfc_p_classify_australian.joblib') # lr_meta = joblib.load('./newmetadata/lr_p_classify_australian.joblib') # Use the default Logistic Regression classifier model = LogisticRegression(solver='lbfgs') # model = RandomForestClassifier() # model = SVC(gamma='auto') for testdataset in testdatasetnames: print('***********currently dataset is : ', testdataset) lcdata_uncertainty_select_list = [] lcdata_random_select_list = [] # active learning dt = DataSet(testdataset, dataset_path) X = dt.X y = dt.y.ravel() y = np.asarray(y, dtype=int) train_indexs, test_indexs, label_indexs, unlabel_indexs = split_load('./experiment_result/combination_classify/australian_lrmetadata_0.005/australian/') alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path=savefloder_path + testdataset +'/', train_idx=train_indexs, test_idx=test_indexs, label_idx=label_indexs, unlabel_idx=unlabel_indexs) # Split data # alibox.split_AL(test_ratio=test_ratio, initial_label_rate=initial_label_ratio, split_count=splitcount) # alibox. # The cost budget is 50 times querying stopping_criterion = alibox.get_stopping_criterion('num_of_queries', query_num)
# datasetnames = ['echocardiogram', 'heart', 'heart-hungarian', 'heart-statlog', 'house', # 'house-votes', 'spect', 'statlog-heart', 'vertebral-column-2clases'] datasetnames = ['australian'] # Different types of models, each type has many models with different parameters # modelnames = ['KNN', 'LR', 'RFC', 'RFR', 'DTC', 'DTR', 'SVM', 'GBC', 'ABC', 'ABR'] modelnames = ['LR'] # in the same dataset and the same ratio of initial_label_rate,the number of split. split_count = 10 # The number of unlabel data to select to generate the meta data. num_xjselect = 20 # first choose a dataset for datasetname in datasetnames: dataset = DataSet(datasetname, dataset_path) X = dataset.X y = dataset.y distacne = dataset.get_distance() _, cluster_center_index = dataset.get_cluster_center() print( datasetname + ' DataSet currently being processed********************************************' ) metadata = None # run multiple split on the same dataset # every time change the value of initial_label_rate if datasetname in [ 'echocardiogram', 'heart', 'heart-hungarian', 'heart-statlog', 'house', 'house-votes', 'spect', 'statlog-heart', 'vertebral-column-2clases'