# Example 1
# 0
    def __init__(self,
                 X,
                 y,
                 cb_classifier,
                 label_inds_5=None,
                 unlabel_inds_5=None,
                 modelOutput_5=None):
        """Store the raw data and classifier, precompute distance/cluster
        information, and optionally accept a warm-start split.

        The warm-start state (``label_inds_5``, ``unlabel_inds_5``,
        ``modelOutput_5``) is only used when all three are provided;
        otherwise empty lists are stored and ``self.flag`` stays False.
        """
        self.X = X
        self.y = y
        self.cb_classifier = cb_classifier

        # Build a DataSet view of the same data ('au' is the name tag passed
        # to DataSet) to precompute pairwise distances and cluster centres.
        dt = DataSet('au', X=X, y=y)
        # NOTE(review): 'distacne' is a pre-existing misspelling kept as-is
        # because external callers read this attribute by that name.
        self.distacne = dt.get_distance()
        _, self.cluster_center_index = dt.get_cluster_center()

        # Accept the warm-start state only when every piece is present.
        warm_start = (label_inds_5, unlabel_inds_5, modelOutput_5)
        self.flag = all(part is not None for part in warm_start)
        if self.flag:
            self.label_inds_5 = label_inds_5
            self.unlabel_inds_5 = unlabel_inds_5
            self.modelOutput_5 = modelOutput_5
        else:
            self.label_inds_5 = []
            self.unlabel_inds_5 = []
            self.modelOutput_5 = []
    # Candidate model families for generating meta-data; only logistic
    # regression ('LR') is enabled in this run.
    # modelnames = ['KNN', 'LR', 'RFC', 'RFR', 'DTC', 'DTR', 'SVM', 'GBC', 'ABC', 'ABR']
    modelnames = ['LR']

    # Number of independent splits to run on the same dataset at the same
    # initial_label_rate (the loop below iterates over these values).
    split_count = [30, 50, 70, 90]
    # The number of unlabelled instances to select when generating the meta data.
    num_xjselect = 30

    # presumably the number of repeated rounds per configuration — TODO confirm
    diff_five_round = 20

    # Labelled-pool sizes to sweep: 2, 4, ..., 98.
    n_labelleds = np.arange(2, 100, 2)

    # first choose a dataset
    for datasetname in datasetnames:

        dataset = DataSet(datasetname, dataset_path)
        X = dataset.X
        y = dataset.y
        distacne = dataset.get_distance()
        _, cluster_center_index = dataset.get_cluster_center()
        print(
            datasetname +
            ' DataSet currently being processed********************************************'
        )
        # run multiple split on the same dataset
        # every time change the value of initial_label_rate
        for split_c in split_count:
            for n_labelled in n_labelleds:
                metadata = None
                # trains, tests, label_inds, unlabel_inds = dataset.split_data_by_nlabelled(n_labelled, test_ratio=0.6, split_count=split_count, saving_path='./n_labelled_split_info')
                trains, tests, label_inds, unlabel_inds = dataset.split_data_by_nlabelled_fulldataset(
# Example 3
# 0
    # Other candidate datasets: 'wdbc', 'clean1', 'ethn', 'blood', 'breast-cancer-wisc'
    datasetnames = ['australian']
    # Different types of models; each type has many models with different parameters.
    # Only logistic regression ('LR') is enabled in this run.
    # modelnames = ['KNN', 'LR', 'RFC', 'RFR', 'DTC', 'DTR', 'SVM', 'GBC', 'ABC', 'ABR']
    modelnames = ['LR']

    # Number of splits to run on the same dataset at the same initial_label_rate.
    split_count = 30
    # The number of unlabelled instances to select when generating the meta data.
    num_xjselect = 30

    # Labelled-pool sizes to sweep: 2, 4, ..., 98.
    n_labelleds = np.arange(2, 100, 2)

    # first choose a dataset
    for datasetname in datasetnames:
        dataset = DataSet(datasetname, dataset_path)
        X = dataset.X
        y = dataset.y
        distacne = dataset.get_distance()
        _, cluster_center_index = dataset.get_cluster_center()
        print(
            datasetname +
            ' DataSet currently being processed********************************************'
        )
        metadata = None
        # run multiple split on the same dataset
        # every time change the value of initial_label_rate
        if datasetname in [
                'echocardiogram', 'heart', 'heart-hungarian', 'heart-statlog',
                'house', 'house-votes', 'spect', 'statlog-heart',
                'vertebral-column-2clases'
# Example 4
# 0
    QueryInstanceQUIRE, QueryRandom, QueryInstanceUncertainty, QureyExpectedErrorReduction, QueryInstanceLAL

from meta_data import DataSet
from QueryMetaData import QueryMetaData

# Root directory containing the experiment datasets.
dataset_path = './newdata/'
# Full dataset-name list saved on disk.
datasetnames = np.load('datasetname.npy')
# datasetname = 'echocardiogram'
# datasetname = 'australian'
# datasetname = 'blood'
# datasetname = 'texture'
# NOTE(review): the list loaded above is immediately overridden — only
# 'tic-tac-toe' is actually processed in this run.
datasetnames = ['tic-tac-toe']

for datasetname in datasetnames:

    dt = DataSet(datasetname, dataset_path)
    X = dt.X
    y = dt.y.ravel()
    y = np.asarray(y, dtype=int)

    alibox = ToolBox(X=X,
                     y=y,
                     query_type='AllLabels',
                     saving_path='./experiment_result/')

    # Split data
    alibox.split_AL(test_ratio=0.3, initial_label_rate=0.05, split_count=5)

    # Use the default Logistic Regression classifier
    model = alibox.get_default_model()
# Example 5
# 0
# Load a pre-trained meta model for the 'australian' dataset (random-forest
# regressor over p-features); the classifier alternatives below are disabled.
rfr_meta = joblib.load('./newmetadata/rfr_p_regression_australian.joblib')
# rfc_meta = joblib.load('./newmetadata/rfc_p_classify_australian.joblib')
# lr_meta = joblib.load('./newmetadata/lr_p_classify_australian.joblib')
# Use the default Logistic Regression classifier as the base learner.
model = LogisticRegression(solver='lbfgs')
# model = RandomForestClassifier()
# model = SVC(gamma='auto')

for testdataset in testdatasetnames:
    print('***********currently dataset is : ', testdataset)

    lcdata_uncertainty_select_list = []
    lcdata_random_select_list = []

    # active learning 
    dt = DataSet(testdataset, dataset_path)
    X = dt.X
    y = dt.y.ravel()
    y = np.asarray(y, dtype=int)
    train_indexs, test_indexs, label_indexs, unlabel_indexs = split_load('./experiment_result/combination_classify/australian_lrmetadata_0.005/australian/')
    alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path=savefloder_path + testdataset +'/', train_idx=train_indexs, test_idx=test_indexs, label_idx=label_indexs, unlabel_idx=unlabel_indexs)

    # Split data
    # alibox.split_AL(test_ratio=test_ratio, initial_label_rate=initial_label_ratio, split_count=splitcount)
    # alibox.
    


    # The cost budget is 50 times querying
    stopping_criterion = alibox.get_stopping_criterion('num_of_queries', query_num)
    # datasetnames = ['echocardiogram', 'heart', 'heart-hungarian', 'heart-statlog', 'house',
    #                     'house-votes', 'spect', 'statlog-heart', 'vertebral-column-2clases']
    datasetnames = ['australian']

    # Different types of models, each type has many models with different parameters
    # modelnames = ['KNN', 'LR', 'RFC', 'RFR', 'DTC', 'DTR', 'SVM', 'GBC', 'ABC', 'ABR']
    modelnames = ['LR']

    # in the same dataset and the same ratio of initial_label_rate,the number of split.
    split_count = 10
    # The number of unlabel data to select to generate the meta data.
    num_xjselect = 20

    # first choose a dataset
    for datasetname in datasetnames:
        dataset = DataSet(datasetname, dataset_path)
        X = dataset.X
        y = dataset.y
        distacne = dataset.get_distance()
        _, cluster_center_index = dataset.get_cluster_center()
        print(
            datasetname +
            ' DataSet currently being processed********************************************'
        )
        metadata = None
        # run multiple split on the same dataset
        # every time change the value of initial_label_rate
        if datasetname in [
                'echocardiogram', 'heart', 'heart-hungarian', 'heart-statlog',
                'house', 'house-votes', 'spect', 'statlog-heart',
                'vertebral-column-2clases'