Example #1
def CLAMI_data(data, target, positiveLabel, percentileCutoff, suppress=0, experimental=0, stats={"tp": 0, "p": 0},
               label="Label"):
    '''
    CLAMI - Clustering, Labeling, Metric/Features Selection,
            Instance selection, and Supervised Learning

    Returns
    -------
    trainingInstancesByCLAMI : DataFrame
        CLA-labeled training instances restricted to the selected metrics,
        with assumption-violating instances removed.
    newTestInstances : DataFrame
        Test instances restricted to the same selected metrics.
    '''
    treatment = Treatment(data, target)
    treatment.preprocess()
    data = treatment.full_train
    testdata = treatment.full_test
    cutoffsForHigherValuesOfAttribute = getHigherValueCutoffs(data, percentileCutoff, label)
    print("get cutoffs")
    data = getInstancesByCLA(data, percentileCutoff, positiveLabel)
    print("get CLA instances")

    metricIdxWithTheSameViolationScores = getMetricIndicesWithTheViolationScores(data,
                                                                                 cutoffsForHigherValuesOfAttribute,
                                                                                 positiveLabel, label=label)
    print("get Features and the violation scores")
    keys = list(metricIdxWithTheSameViolationScores.keys())
    # start with the features that have the lowest violation scores
    keys.sort()
    for k in keys:
        selectedMetricIndices = metricIdxWithTheSameViolationScores[k]
        print(selectedMetricIndices)
        # pick those features for both train and test sets
        trainingInstancesByCLAMI = getInstancesByRemovingSpecificAttributes(data,
                                                                            selectedMetricIndices, True, label=label)
        newTestInstances = getInstancesByRemovingSpecificAttributes(testdata,
                                                                    selectedMetricIndices, True, label=label)
        # recompute the cutoffs on the metric-reduced training set
        cutoffsForHigherValuesOfAttribute = getHigherValueCutoffs(trainingInstancesByCLAMI,
                                                                  percentileCutoff, label)
        # get instances that violate the CLA assumption in the train set
        instIndicesNeedToRemove = getSelectedInstances(trainingInstancesByCLAMI,
                                                       cutoffsForHigherValuesOfAttribute,
                                                       positiveLabel)
        # remove the violating instances
        trainingInstancesByCLAMI = getInstancesByRemovingSpecificInstances(trainingInstancesByCLAMI,
                                                                           instIndicesNeedToRemove, False)

        # make sure both classes are present in the training set
        zero_count = trainingInstancesByCLAMI[trainingInstancesByCLAMI[label] == 0].shape[0]
        one_count = trainingInstancesByCLAMI[trainingInstancesByCLAMI[label] == 1].shape[0]
        if zero_count > 0 and one_count > 0:
            break

    return trainingInstancesByCLAMI, newTestInstances
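A hypothetical invocation is sketched below. The project name and the exact shape of data are assumptions: both depend on the Treatment class, which these examples use but do not define.

# Hypothetical usage sketch; "some_project" and the layout of data
# must match whatever Treatment expects (not shown in these examples)
train_df, test_df = CLAMI_data(data, target="some_project",
                               positiveLabel=1, percentileCutoff=90)
print(train_df.shape, test_df.shape)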
Example #2
def CLA(data, target, positiveLabel, percentileCutoff, suppress=0, experimental=0, both=False):
    # CLA: pseudo-label the test set by clustering on K, the number of
    # metrics whose values exceed the percentile cutoff
    treatment = Treatment(data, target)
    treatment.preprocess()
    testdata = treatment.full_test
    data = getInstancesByCLA(testdata, percentileCutoff, positiveLabel)
    # map the 0/1 ground truth and CLA decisions to the "yes"/"no" labels
    # that Treatment.eval() expects; K serves as the ranking score
    treatment.y_label = ["yes" if y == 1 else "no" for y in data["Label"]]
    treatment.decisions = ["yes" if y == 1 else "no" for y in data["CLA"]]
    treatment.probs = data["K"]
    return treatment.eval()
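None of the examples define getInstancesByCLA, but its output is always read the same way: the original Label column plus a K column (used as a ranking score) and a CLA column (the pseudo-label). A minimal sketch consistent with that usage is below; the percentile-based per-metric cutoff and the median split on K are assumptions, not the codebase's actual implementation.

import numpy as np

def getInstancesByCLA_sketch(df, percentileCutoff, positiveLabel):
    # assumed layout: every column except "Label" is a numeric metric;
    # positiveLabel is accepted for signature parity (examples pass 1 or None)
    metrics = [c for c in df.columns if c != "Label"]
    out = df.copy()
    # K = number of metrics whose value exceeds that metric's
    # percentileCutoff-th percentile
    cutoffs = {m: np.percentile(df[m], percentileCutoff) for m in metrics}
    out["K"] = sum((df[m] > cutoffs[m]).astype(int) for m in metrics)
    # instances in the upper half of the K range get the positive label
    out["CLA"] = (out["K"] > out["K"].median()).astype(int)
    return out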
Example #3
def CLA_SL(data, target, model="RF", est=False, T_rec=0.90, inc=False, seed=0, both=False, stats={"tp": 0, "p": 0}):
    # map model names to learner classes
    models = {"RF": RF, "SVM": SVM, "LR": LR, "NB": NB, "DT": DT, "TM": TM}

    treatment = Treatment(data, target)
    treatment.preprocess()
    traindata = treatment.full_train
    # pseudo-label the training data with CLA (percentile cutoff hard-coded to 90)
    full_data = getInstancesByCLA(traindata, 90, None)

    clf = models[model](data, target)
    print(target, model)
    clf.preprocess()
    # train the supervised learner on the CLA pseudo-labels
    clf.x_label = ["yes" if x == 1 else "no" for x in full_data['CLA']]
    clf.train()
    clf.stats = stats
    results = clf.eval()
    return results
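A hypothetical call; the learner classes behind the model names (RF, SVM, LR, NB, DT, TM) are defined elsewhere in this codebase:

# Hypothetical usage: fit the "RF" learner on CLA pseudo-labels
results = CLA_SL(data, target="some_project", model="RF")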
Example #4
def ECLA(data, target, model="RF", est=False, T_rec=0.90, inc=False, seed=0, both=False):
    # Easy + CLA: first strip the "easy" instances that Jitterbug's
    # patterns catch, then run CLA on the remaining (hard) data
    jitterbug = Jitterbug(data, target)
    jitterbug.find_patterns()
    jitterbug.easy_code()
    jitterbug.test_patterns()
    rest_data = jitterbug.rest
    treatment = Treatment(rest_data, target)
    treatment.preprocess()

    test_data = treatment.full_test
    if both:
        # score train and test together, but evaluate on the test rows only
        test_data = pd.concat([test_data, treatment.full_train],
                              ignore_index=True)
    final_data = getInstancesByCLA(test_data, 90, None)
    # the test rows come first in the concatenation, so slicing by the
    # test-set size recovers exactly those rows
    final_data = final_data[:treatment.full_test.shape[0]]
    treatment.y_label = ["yes" if y == 1 else "no" for y in final_data["Label"]]
    treatment.decisions = ["yes" if y == 1 else "no" for y in final_data["CLA"]]
    treatment.probs = final_data["K"]
    treatment.stats = jitterbug.easy.stats_test

    return treatment, rest_data
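The both branch relies on row order: pd.concat([test, train], ignore_index=True) places the test rows first and renumbers from 0, so final_data[:n_test] recovers exactly the test rows after CLA has scored the union. A standalone illustration:

import pandas as pd

test = pd.DataFrame({"m": [1, 2]})
train = pd.DataFrame({"m": [3, 4, 5]})
union = pd.concat([test, train], ignore_index=True)
# the first len(test) rows of the union are the test rows, in order
assert union[:len(test)].equals(test)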
Example #5
def CLA(data,
        positiveLabel,
        percentileCutoff,
        suppress=0,
        experimental=0,
        both=False):
    try:
        # a dummy Treatment suffices here; fall back to the real data if
        # the constructor rejects an empty dataset
        treatment = Treatment({}, "")
    except Exception:
        treatment = Treatment(data, "")
    final_data = getInstancesByCLA(data, percentileCutoff, positiveLabel)
    treatment.y_label = [
        "yes" if y == 1 else "no" for y in final_data["Label"]
    ]
    treatment.decisions = [
        "yes" if y == 1 else "no" for y in final_data["CLA"]
    ]
    summary = collections.Counter(treatment.decisions)
    print(summary, summary["yes"] / (summary["yes"] + summary["no"]))
    treatment.probs = final_data["K"]
    results = treatment.eval()
    # "read": the fraction of instances CLA flags as positive
    results["read"] = summary["yes"] / (summary["yes"] + summary["no"])
    return results
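The "read" entry is simply the fraction of instances that CLA flags as positive, i.e. the share of the data a human would be asked to inspect. For example:

import collections

decisions = ["yes", "no", "no", "yes", "no"]
summary = collections.Counter(decisions)
read = summary["yes"] / (summary["yes"] + summary["no"])  # 2 / 5 = 0.4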
Example #6
def tune_CLAMI(data,
               target,
               positiveLabel,
               percentileCutoff,
               suppress=0,
               experimental=0,
               metric="APFD"):
    treatment = Treatment(data, target)
    treatment.preprocess()
    data = treatment.full_train
    # hold out 25% of the training data as a tuning set
    sss = StratifiedShuffleSplit(n_splits=1, test_size=.25, random_state=47)
    testdata = treatment.full_test
    X, y = data[data.columns[:-1]], data[data.columns[-1]]
    for train_index, tune_index in sss.split(X, y):
        train_df = data.iloc[train_index]
        tune_df = data.iloc[tune_index]
        train_df.reset_index(drop=True, inplace=True)
        tune_df.reset_index(drop=True, inplace=True)
        cutoffsForHigherValuesOfAttribute = getHigherValueCutoffs(
            train_df, percentileCutoff, "Label")
        print("get cutoffs")
        train_df = getInstancesByCLA(train_df, percentileCutoff, positiveLabel)
        print("get CLA instances")

        metricIdxWithTheSameViolationScores = getMetricIndicesWithTheViolationScores(
            train_df, cutoffsForHigherValuesOfAttribute, positiveLabel)
        keys = list(metricIdxWithTheSameViolationScores.keys())
        # seed the search with a random sample of configurations
        # (INIT_POOL_SIZE and BUDGET are module-level constants)
        evaluated_configs = random.sample(keys, INIT_POOL_SIZE * 2)
        evaluated_configs = [
            metricIdxWithTheSameViolationScores[k] for k in evaluated_configs
        ]

        tmp_scores = []
        tmp_configs = []
        # score each seed configuration on the tuning set
        for selectedMetricIndices in evaluated_configs:
            selectedMetricIndices, res = MI(train_df, tune_df,
                                            selectedMetricIndices,
                                            percentileCutoff, positiveLabel,
                                            target)
            if isinstance(res, dict):
                tmp_configs.append(
                    transform_metric_indices(data.shape[1],
                                             selectedMetricIndices))
                tmp_scores.append(res)

        # start from the best-scoring seed configuration
        ids = np.argsort([x[metric] for x in tmp_scores])[::-1][:1]
        best_res = tmp_scores[ids[0]]
        best_config = np.where(tmp_configs[ids[0]] == 1)[0]

        # evaluation budget and early-stopping lives
        this_budget = BUDGET
        n_evals = 0
        lives = 5
        print("Initial Population: %s" % len(tmp_scores))
        # candidate configurations encoded as binary metric-membership vectors
        searchspace = [
            transform_metric_indices(data.shape[1],
                                     metricIdxWithTheSameViolationScores[k])
            for k in keys
        ]
        while this_budget > 0:
            # fit a CART surrogate over the configurations evaluated so far
            cart_model = DecisionTreeRegressor()
            cart_model.fit(tmp_configs, [x[metric] for x in tmp_scores])

            # let the acquisition function pick the next candidate
            cart_models = [cart_model]
            next_config_id = acquisition_fn(searchspace, cart_models)
            next_config = metricIdxWithTheSameViolationScores[keys.pop(
                next_config_id)]
            searchspace.pop(next_config_id)
            next_config, next_res = MI(train_df, tune_df, next_config,
                                       percentileCutoff, positiveLabel, target)
            if not isinstance(next_res, dict):
                continue

            next_config_normal = transform_metric_indices(
                data.shape[1], next_config)
            tmp_scores.append(next_res)
            tmp_configs.append(next_config_normal)
            # reset lives on a meaningful change in the score; otherwise
            # burn one (early stopping when all lives are spent)
            if abs(next_res[metric] - best_res[metric]) >= 0.03:
                lives = 5
            else:
                lives -= 1

            if isBetter(next_res, best_res, metric):
                best_config = next_config
                best_res = next_res

            if lives == 0:
                print("***" * 5)
                print("EARLY STOPPING!")
                print("***" * 5)
                break

            this_budget -= 1
            n_evals += 1
    # evaluate the best configuration found on the held-out test set
    _, res = MI(train_df, testdata, best_config, percentileCutoff,
                positiveLabel, target)
    return res
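INIT_POOL_SIZE and BUDGET are module-level constants that must be set before calling tune_CLAMI; a hypothetical call could look like:

# Hypothetical usage: tune the CLAMI metric subset for APFD
res = tune_CLAMI(data, target="some_project",
                 positiveLabel=1, percentileCutoff=90, metric="APFD")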