示例#1
0
def create_dataset_splits(data, labels):
    """Split the data once with alipy and pickle the resulting arrays and indexes.

    The split is produced by ``ToolBox.split_AL`` with ``test_ratio=0.15`` and
    ``initial_label_rate=0.1``; fold 0 of that split is used.

    Two files are written to the current working directory:

    * ``dataset``    -- ``(X_train, X_test, y_train, y_test)``; both target
      arrays are flattened to 1-D.
    * ``dataset_al`` -- ``(train_idx, test_idx, labeled_idx, unlabeled_idx)``
      exactly as returned by ``ToolBox.get_split(0)``.

    Args:
        data: object exposing the feature matrix through ``.values``.
        labels: targets; converted with ``np.asarray``.

    Returns:
        None. The function is used for its side effect of writing the files.
    """
    features = data.values
    targets = np.asarray(labels)

    # Let the alipy toolbox draw the train/test and labeled/unlabeled split.
    toolbox = ToolBox(X=features, y=targets, query_type='AllLabels', saving_path='.')
    toolbox.split_AL(test_ratio=0.15, initial_label_rate=0.1)
    train_idx, test_idx, labeled_idx, unlabeled_idx = toolbox.get_split(0)

    # Materialise the train/test arrays; targets are flattened to shape (n,).
    X_train, X_test = features[train_idx], features[test_idx]
    y_train = np.array(targets[train_idx]).reshape(-1)
    y_test = np.array(targets[test_idx]).reshape(-1)

    # Persist the array split.
    with open('dataset', 'wb') as f:
        pickle.dump((X_train, X_test, y_train, y_test), f)

    # Persist the index split so active-learning runs can share it.
    with open('dataset_al', 'wb') as f:
        pickle.dump((train_idx, test_idx, labeled_idx, unlabeled_idx), f)
示例#2
0
 def __init__(self,
              dataset,
              labels,
              testset,
              testlab,
              model,
              phase,
              path,
              stopping,
              measure='nearest_neighbor',
              distance='linear'):
     """Set up the alipy toolbox, the query strategies and the result buffers.

     Args:
         dataset: feature matrix handed to ``ToolBox`` as ``X``.
         labels: targets for ``dataset``. Used for ``int(max(labels))``,
             ``len(labels)`` and, for the ``'residue'`` measure, passed to
             ``QueryInstanceResidueRegressor``.
         testset: stored unchanged on ``self.testset``.
         testlab: stored unchanged on ``self.testlab``.
         model: stored unchanged on ``self.model``.
         phase: stored unchanged on ``self.phase``.
         path: appended to ``'./'`` to form the toolbox saving path; also
             stored on ``self.path``.
         stopping: indexable pair; ``stopping[0]`` is the stopping-criterion
             name and ``stopping[1]`` its value.
         measure: ``'residue'`` selects ``QueryInstanceResidueRegressor``;
             any other value is forwarded to ``QueryInstanceDistribution``.
         distance: forwarded to ``QueryInstanceResidueRegressor`` only.
     """
     self.dataset = dataset
     self.labels = labels
     self.testset = testset
     self.testlab = testlab
     self.model = model
     self.phase = phase
     self.classes = int(max(labels))
     # The toolbox is given all-zero placeholder labels of the same length as
     # ``labels`` (presumably because the real targets are continuous --
     # TODO confirm). ``np.int`` was removed in NumPy 1.24, so the builtin
     # ``int`` is used; it is the type ``np.int`` aliased.
     self.alibox = ToolBox(X=dataset,
                           y=np.zeros(len(labels), dtype=int),
                           query_type='AllLabels',
                           saving_path='./%s' % path)
     self.alibox.split_AL(test_ratio=0,
                          initial_label_rate=0.05,
                          split_count=1)
     self.stopping_criterion = self.alibox.get_stopping_criterion(
         stopping[0], value=stopping[1])
     self.measure = measure
     if measure == 'residue':
         self.query_strategy = QueryInstanceResidueRegressor(
             X=self.dataset, y=self.labels, distance=distance)
     else:
         self.query_strategy = QueryInstanceDistribution(measure=measure)
     self.random = QueryRandom()
     # Result buffers; they start empty here.
     self.unc_result = []
     self.title = ''
     self.loss = []
     self.path = path
     self.one = self.two = self.three = self.four = self.five = self.six = None
     self.max, self.mae, self.mse, self.evs, self.r2 = [], [], [], [], []
     self.sample = []
示例#3
0
 def __init__(self, dataset, labels, testset, testlab, model, phase, path,
              stopping):
     """Prepare the toolbox, the query strategies and empty metric buffers.

     Args:
         dataset: feature matrix; passed to ``ToolBox`` and to the
             uncertainty query strategy as ``X``.
         labels: targets; passed as ``y`` and used for ``int(max(labels))``.
         testset: stored unchanged on ``self.testset``.
         testlab: stored unchanged on ``self.testlab``.
         model: stored unchanged on ``self.model``.
         phase: stored unchanged on ``self.phase``.
         path: appended to ``'./'`` to form the toolbox saving path; also
             stored on ``self.path``.
         stopping: indexable pair; ``stopping[0]`` is the stopping-criterion
             name and ``stopping[1]`` its value.
     """
     # Constructor arguments kept on the instance as given.
     self.dataset, self.labels = dataset, labels
     self.testset, self.testlab = testset, testlab
     self.model, self.phase = model, phase
     self.path = path
     self.classes = int(max(labels))

     # One split, no held-out test part, 5% of the pool initially labelled.
     self.alibox = ToolBox(
         X=dataset, y=labels, query_type='AllLabels', saving_path='./%s' % path)
     self.alibox.split_AL(test_ratio=0, initial_label_rate=0.05, split_count=1)

     criterion_name, criterion_value = stopping[0], stopping[1]
     self.stopping_criterion = self.alibox.get_stopping_criterion(
         criterion_name, value=criterion_value)

     # Least-confident uncertainty sampling, plus a random strategy.
     # (Alternative left by the author: QueryInstanceQBC(disagreement='KL_divergence').)
     self.query_strategy = QueryInstanceUncertainty(
         X=dataset, y=labels, measure='least_confident')
     self.random = QueryRandom()

     # Result buffers; every list starts empty.
     self.unc_result = []
     self.title = ''
     self.acc, self.gmeans, self.recall = [], [], []
     self.precision, self.specificity, self.auc = [], [], []
     self.f1, self.pos, self.neg = [], [], []
     self.ratio, self.loss, self.mcc = [], [], []
示例#4
0
    def AC_(self, X, y):
        """Fit alipy's default model on ``X``/``y`` and return trust values.

        Each trust value is ``math.exp`` of one entry of
        ``model.class_weight``, in index order.

        Args:
            X: feature matrix passed to ``ToolBox`` and ``model.fit``.
            y: targets passed to ``ToolBox`` and ``model.fit``.

        Returns:
            list of floats, one per entry along the first axis of
            ``model.class_weight``.
        """
        alibox = ToolBox(X=X, y=y, query_type='AllLabels')
        alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10)

        model = alibox.get_default_model()
        model.fit(X, y)
        # The predictions are computed but not used afterwards.
        pred = model.predict(X)

        # Turn the weight entries into trust values via exp().
        # NOTE(review): on scikit-learn estimators ``class_weight`` is a
        # constructor parameter, not a fitted array -- confirm that the
        # default model exposes an array with ``.shape`` under this name.
        weights = model.class_weight
        return [math.exp(weights[pos]) for pos in range(weights.shape[0])]
示例#5
0
from alipy import ToolBox
from alipy.query_strategy.multi_label import *

# Load iris and turn its class vector into a dense one-hot label matrix:
# the labels are reshaped to a single column, one-hot encoded, and the sparse
# result is converted to a regular ndarray.
X, y = load_iris(return_X_y=True)
mlb = OneHotEncoder()
mult_y = mlb.fit_transform(y.reshape((-1, 1)))
mult_y = np.asarray(mult_y.todense())

# Alternatively, generate a multi-label dataset of any size:
# X, mult_y = make_multilabel_classification(n_samples=5000, n_features=20, n_classes=5, length=5)

# For the label ranking model a label value of 0 means "unknown", so every
# 0 entry is replaced with -1, which means "irrelevant".
mult_y[mult_y == 0] = -1

# Part-label querying; 20% of the data is held out for testing and 5% of the
# rest is initially labelled.
alibox = ToolBox(X=X, y=mult_y, query_type='PartLabels')
alibox.split_AL(test_ratio=0.2, initial_label_rate=0.05, all_class=False)


def main_loop(alibox, round, strategy):
    """Run the query loop of one fold with the given strategy.

    Args:
        alibox: toolbox providing ``get_split`` and ``get_stateio``.
        round: fold number passed to ``get_split`` and ``get_stateio``.
        strategy: object whose ``select(label_ind, unlab_ind)`` picks the
            next instance-label pairs.

    NOTE(review): the body shown here stops right after the ``select`` call;
    nothing updates ``label_ind``, so the ``while`` condition cannot change.
    The remainder of the loop appears to be missing from this excerpt.
    """
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)
    # Base model
    model = LabelRankingModel()

    # A simple stopping criterion to specify the query budget: keep querying
    # while len(label_ind) is at most 120.
    while len(label_ind) <= 120:
        # Query and update
        select_labs = strategy.select(label_ind, unlab_ind)
        # Use cost to record the amount of queried instance-label pairs
import copy
from sklearn.datasets import load_iris
from alipy import ToolBox

# Load iris and wrap it in a toolbox that queries all labels of an instance;
# results are saved to the current directory.
X, y = load_iris(return_X_y=True)
alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

# Split data: 10 folds, 30% held out for testing, 10% initially labelled
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10)

# Use the default Logistic Regression classifier
model = alibox.get_default_model()

# The cost budget is 50 queries
stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 50)

# Use the pre-defined query-by-committee strategy; QBC_result starts empty
QBCStrategy = alibox.get_query_strategy(strategy_name='QueryInstanceQBC')
QBC_result = []

for round in range(10):
    # Get the data split of one fold experiment
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)

    while not stopping_criterion.is_stop():
        # Select a subset of Uind according to the query strategy
        # Passing model=None to use the default model for evaluating the committees' disagreement
        select_ind = QBCStrategy.select(label_ind,
                                        unlab_ind,
示例#7
0
def create_and_implement_strategy(strategy_name, data, labels, queries):
    """Run one active-learning experiment with the named alipy query strategy.

    The global split stored in the ``dataset_al`` pickle is loaded, alipy's
    default model is trained on the initially labelled examples, and then one
    example per iteration is queried, added to the labelled set and the model
    refitted, until the ``num_of_queries`` stopping criterion reports stop.
    Test accuracy is recorded in a ``StateIO`` saver after every query.

    Args:
        strategy_name: name passed to ``ToolBox.get_query_strategy``.
        data: object exposing the feature matrix through ``.values``.
        labels: targets; converted with ``np.asarray``.
        queries: value of the ``num_of_queries`` stopping criterion.

    Returns:
        A one-element list holding a deep copy of the ``StateIO`` saver.
    """
    # Only the raw values are kept; the split comes from the pickled indexes.
    X = data.values
    y = np.asarray(labels)
    toolbox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

    model = toolbox.get_default_model()
    uncertainty_strategy = toolbox.get_query_strategy(strategy_name=strategy_name)

    # The query budget is whatever the caller passes as ``queries``.
    stopping_criterion = toolbox.get_stopping_criterion('num_of_queries', queries)

    # Load the indexes of the global split.
    # NOTE(review): unpickling runs code embedded in the file; only load a
    # ``dataset_al`` that this project produced itself.
    with open("dataset_al", "rb") as f:
        train_idx, test_idx, labeled_idx, unlabeled_idx = pickle.load(f)

    # Saver that accumulates the per-query results.
    saver = StateIO(round=0, train_idx=train_idx,
                    test_idx=test_idx, init_L=labeled_idx,
                    init_U=unlabeled_idx, saving_path='.')

    def _fit_and_score():
        # Refit on the currently labelled examples and return test accuracy.
        model.fit(X=X[labeled_idx.index, :], y=y[labeled_idx.index])
        y_pred = model.predict(X[test_idx, :])
        return toolbox.calc_performance_metric(
            y_true=y[test_idx], y_pred=y_pred,
            performance_metric='accuracy_score')

    # Performance with only the initially labelled examples.
    saver.set_initial_point(_fit_and_score())

    while not stopping_criterion.is_stop():
        # Pick one example from the unlabelled pool and move it to the
        # labelled set.
        example = uncertainty_strategy.select(labeled_idx, unlabeled_idx, model=model, batch_size=1)
        labeled_idx.update(example)
        unlabeled_idx.difference_update(example)

        accuracy = _fit_and_score()

        # Record this query and persist the saver.
        state = toolbox.State(select_index=example, performance=accuracy)
        saver.add_state(state)
        saver.save()

        # Update progress for the stopping criterion.
        stopping_criterion.update_information(saver)

    stopping_criterion.reset()
    examples = [copy.deepcopy(saver)]

    # To export the final labelled set, index with ``labeled_idx.index`` (as
    # the training code above does) and flatten the targets:
    #     X_train = X[labeled_idx.index, :]
    #     y_train = np.asarray(y[labeled_idx.index]).reshape(-1)
    #     with open('qbc_dataset', 'wb') as f:
    #         pickle.dump((X_train, y_train), f)

    return examples
示例#8
0
# NOTE(review): the array loaded here is replaced by the hard-coded list a few
# lines below, so only 'tic-tac-toe' is processed; the load still requires
# 'datasetname.npy' to exist.
datasetnames = np.load('datasetname.npy')
# datasetname = 'echocardiogram'
# datasetname = 'australian'
# datasetname = 'blood'
# datasetname = 'texture'
datasetnames = ['tic-tac-toe']

for datasetname in datasetnames:

    # Load the dataset and flatten the labels to a 1-D integer array
    dt = DataSet(datasetname, dataset_path)
    X = dt.X
    y = dt.y.ravel()
    y = np.asarray(y, dtype=int)

    alibox = ToolBox(X=X,
                     y=y,
                     query_type='AllLabels',
                     saving_path='./experiment_result/')

    # Split data: 5 folds, 30% held out for testing, 5% initially labelled
    alibox.split_AL(test_ratio=0.3, initial_label_rate=0.05, split_count=5)

    # Use the default Logistic Regression classifier
    model = alibox.get_default_model()

    # The cost budget is 30 queries
    stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 30)

    # Experiment: load the pre-trained meta regressor from disk
    meta_regressor = joblib.load('meta_lr.joblib')
    # meta_query = QueryMetaData(X, y, meta_regressor)
    meta_result = []
示例#9
0
# model = RandomForestClassifier()
# model = SVC(gamma='auto')

for testdataset in testdatasetnames:
    print('***********currently dataset is : ', testdataset)

    lcdata_uncertainty_select_list = []
    lcdata_random_select_list = []

    # Active learning: load the dataset and flatten the labels to a 1-D
    # integer array
    dt = DataSet(testdataset, dataset_path)
    X = dt.X
    y = dt.y.ravel()
    y = np.asarray(y, dtype=int)
    # Reuse a previously saved split instead of drawing a new one.
    # NOTE(review): the split path is hard-coded to the 'australian' run and
    # does not depend on ``testdataset`` -- confirm this is intended.
    train_indexs, test_indexs, label_indexs, unlabel_indexs = split_load('./experiment_result/combination_classify/australian_lrmetadata_0.005/australian/')
    alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path=savefloder_path + testdataset +'/', train_idx=train_indexs, test_idx=test_indexs, label_idx=label_indexs, unlabel_idx=unlabel_indexs)

    # Drawing a fresh split is disabled because the split is loaded above:
    # alibox.split_AL(test_ratio=test_ratio, initial_label_rate=initial_label_ratio, split_count=splitcount)



    # The cost budget is ``query_num`` queries
    stopping_criterion = alibox.get_stopping_criterion('num_of_queries', query_num)

    # experiment
    # meta_regressor = joblib.load('meta_lr.joblib')
    # meta_regressor = sgdr
    # meta_result = []
    
示例#10
0
# rfc_meta = joblib.load('./newmetadata/rfc_p_classify_australian.joblib')
# lr_meta = joblib.load('./newmetadata/lr_p_classify_australian.joblib')
# Use a Logistic Regression classifier with the lbfgs solver
model = LogisticRegression(solver='lbfgs')
# model = RandomForestClassifier()
# model = SVC(gamma='auto')

for testdataset in testdatasetnames:
    print('***********currently dataset is : ', testdataset)
    # Prepare dataset: labels are flattened to a 1-D integer array
    dt = DataSet(testdataset, dataset_path)
    X = dt.X
    y = dt.y.ravel()
    y = np.asarray(y, dtype=int)

    alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path=savefloder_path + testdataset +'/')
    # Split data using the module-level ratio and fold-count settings
    alibox.split_AL(test_ratio=test_ratio, initial_label_rate=initial_label_ratio, split_count=splitcount)

    # The cost budget is ``query_num`` queries
    stopping_criterion = alibox.get_stopping_criterion('num_of_queries', query_num)

    # Generate the first five rounds data (label_index, unlabel_index, model_output)
    label_index_round = []
    unlabel_index_round = []
    model_output_round = []

    # One pass per fold of the split; the per-fold buffers start empty
    for round in range(splitcount):
        label_inds_5 = []
        unlabel_inds_5 = []
        model_output_5 = []
示例#11
0
# Drop the "FaultCause" column from the test frame, then merge the train and
# test parts into one pool of features and one pool of labels.
test_data = test_data.drop(["FaultCause"], axis=1)
all_data = pd.concat([train_data, test_data], axis=0)
all_label = pd.concat([train_label, test_label], axis=0)
all_data = all_data.values
all_label = all_label.values

# Oversample with SMOTE and continue with the resampled arrays.
all_resampled_data, all_resampled_label = SMOTE().fit_resample(
    all_data, all_label)

all_data = all_resampled_data
all_label = all_resampled_label

# Shift every label down by one (presumably to make them zero-based --
# TODO confirm). Done in place, so ``all_resampled_label`` sees the same
# change, exactly as with the previous element-wise loop.
all_label -= 1

# A single split: 70% held out for testing, 0.1% initially labelled.
alibox = ToolBox(X=all_data,
                 y=all_label,
                 query_type='AllLabels',
                 saving_path='.')
alibox.split_AL(test_ratio=0.7, initial_label_rate=0.001, split_count=1)
model = alibox.get_default_model()
# Alternative models tried by the author:
# model = AdaBoostClassifier(n_estimators=10)
# model = XGBClassifier(objective="reg:logistic")
# model = LogisticRegression()

# rft = SVC(kernel='linear')
# knn = KNeighborsClassifier(n_neighbors=7)

# The cost budget is 500 queries.
stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 500)


def main_loop(alibox, strategy, round):
    # Get the data split of one fold experiment
示例#12
0
from alipy.experiment import StoppingCriteria
from alipy import ToolBox
import numpy as np

# Toy data: 30 random samples with 5 features and random binary labels
X = np.random.rand(30, 5)
y = np.random.randint(2, size=30)
alibox = ToolBox(X=X, y=y)

# ---------------Initialize----------------
# Construct the criterion directly ...
stopping_criterion = StoppingCriteria(stopping_criteria='num_of_queries',
                                      value=50)
# ... or obtain it from the toolbox (this assignment replaces the one above)
stopping_criterion = alibox.get_stopping_criterion(
    stopping_criteria='num_of_queries', value=50)

# ---------------Usage----------------
while not stopping_criterion.is_stop():
    # ... Query some examples and update the StateIO object here (the query
    # step itself is left out of this example).
    # Use the StateIO object to update the stopping_criterion object
    saver = alibox.get_stateio(round=0)
    stopping_criterion.update_information(saver)
# The condition is met and the loop ends.
# Reset the object for another fold.
stopping_criterion.reset()
示例#13
0
    #     gbr_performance = np.vstack((gbr_performance, [testdataset, gbr_mse, gbr_mae, gbr_r2]))
    # joblib.dump(gbr, testdataset + "meta_gbr.joblib")

    # GaussianProcessRegressor
    # kernel = DotProduct() + WhiteKernel()
    # gpr = GaussianProcessRegressor(kernel=kernel, random_state=0)
    # gpr.fit(metadata[:, 0:396], metadata[:, 396])

    # Active learning: load the dataset and flatten the labels to a 1-D
    # integer array
    dt = DataSet(testdataset, dataset_path)
    X = dt.X
    y = dt.y.ravel()
    y = np.asarray(y, dtype=int)

    # Results are saved in a per-dataset folder
    alibox = ToolBox(X=X,
                     y=y,
                     query_type='AllLabels',
                     saving_path='./newexperiment_result/' + testdataset + '/')

    # Split data: 5 folds, 30% held out for testing, 2% initially labelled
    alibox.split_AL(test_ratio=0.3, initial_label_rate=0.02, split_count=5)

    # Use a Logistic Regression classifier with the lbfgs solver
    model = LogisticRegression(solver='lbfgs')
    # model = SVC(gamma='auto')

    # The cost budget is 30 queries
    stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 30)

    # experiment
    # meta_regressor = joblib.load('meta_lr.joblib')
    # meta_regressor = sgdr
示例#14
0
    # # # LinearRegression
    # print('train rfc')
    # rfc = LogisticRegression()
    # rfc.fit(metadata[:, 0:396], metadata[:, 396])
    # print('done')

    # Active learning: load the dataset and flatten the labels to a 1-D
    # integer array
    dt = DataSet(testdataset, dataset_path)
    X = dt.X
    y = dt.y.ravel()
    y = np.asarray(y, dtype=int)

    # NOTE(review): this span contained unresolved, nested git merge-conflict
    # markers, which made the file unparseable. The HEAD side is kept. The
    # incoming side (df23abc) used saving_path
    # './n_labelleds_ethn_classify_exp/', initial_label_rate=0.005 and
    # split_count=10 -- confirm which configuration is intended.
    alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='./preprocessing_classify_experiment_result-0.03/' + testdataset + '/')

    # Split data: 5 folds, 30% held out for testing, 3% initially labelled
    alibox.split_AL(test_ratio=0.3, initial_label_rate=0.03, split_count=5)

    # Use the default Logistic Regression classifier