def CLAMI_data(data, target, positiveLabel, percentileCutoff, suppress=0,
               experimental=0, stats={"tp": 0, "p": 0}, label="Label"):
    '''
    CLAMI - Clustering, Labeling, Metric/Features Selection,
    Instance selection, and Supervised Learning.

    Returns
    -------
    trainingInstancesByCLAMI : training instances after CLAMI feature and
        instance selection.
    newTestInstances : test instances restricted to the selected features.
    '''
    treatment = Treatment(data, target)
    treatment.preprocess()
    data = treatment.full_train
    testdata = treatment.full_test
    cutoffsForHigherValuesOfAttribute = getHigherValueCutoffs(
        data, percentileCutoff, "Label")
    print("get cutoffs")
    data = getInstancesByCLA(data, percentileCutoff, positiveLabel)
    print("get CLA instances")
    metricIdxWithTheSameViolationScores = getMetricIndicesWithTheViolationScores(
        data, cutoffsForHigherValuesOfAttribute, positiveLabel, label=label)
    print("get Features and the violation scores")
    keys = list(metricIdxWithTheSameViolationScores.keys())
    # start with the features that have the lowest violation scores
    keys.sort()
    for i in range(len(keys)):
        k = keys[i]
        selectedMetricIndices = metricIdxWithTheSameViolationScores[k]
        # while len(selectedMetricIndices) < 3:
        #     index = i + 1
        #     selectedMetricIndices += metricIdxWithTheSameViolationScores[keys[index]]
        print(selectedMetricIndices)
        # keep those features in both the train and test sets
        trainingInstancesByCLAMI = getInstancesByRemovingSpecificAttributes(
            data, selectedMetricIndices, True, label=label)
        newTestInstances = getInstancesByRemovingSpecificAttributes(
            testdata, selectedMetricIndices, True, label="Label")
        # recompute the cutoffs on the reduced training set
        cutoffsForHigherValuesOfAttribute = getHigherValueCutoffs(
            trainingInstancesByCLAMI, percentileCutoff, "Label")
        # find instances that violate the assumption in the training set
        instIndicesNeedToRemove = getSelectedInstances(
            trainingInstancesByCLAMI, cutoffsForHigherValuesOfAttribute,
            positiveLabel)
        # remove the violating instances
        trainingInstancesByCLAMI = getInstancesByRemovingSpecificInstances(
            trainingInstancesByCLAMI, instIndicesNeedToRemove, False)
        # make sure both classes are present in the training set
        zero_count = trainingInstancesByCLAMI[
            trainingInstancesByCLAMI["Label"] == 0].shape[0]
        one_count = trainingInstancesByCLAMI[
            trainingInstancesByCLAMI["Label"] == 1].shape[0]
        if zero_count > 0 and one_count > 0:
            break
    return trainingInstancesByCLAMI, newTestInstances
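
# Illustrative sketch (not part of the original pipeline): feeding the CLAMI
# train/test frames to a scikit-learn learner. It assumes CLAMI_data returns
# pandas DataFrames whose feature columns are the selected metrics plus a
# "Label" column; the learner choice and its parameters here are hypothetical.
def demo_clami_with_sklearn(data, target, positiveLabel=1, percentileCutoff=50):
    from sklearn.ensemble import RandomForestClassifier
    train, test = CLAMI_data(data, target, positiveLabel, percentileCutoff)
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    clf.fit(train.drop(columns=["Label"]), train["Label"])
    # predictions for the CLAMI-filtered test set
    return clf.predict(test.drop(columns=["Label"]))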
def CLA(data, target, positiveLabel, percentileCutoff, suppress=0,
        experimental=0, both=False):
    treatment = Treatment(data, target)
    treatment.preprocess()
    testdata = treatment.full_test
    # apply CLA (cluster + label) to the test set only
    data = getInstancesByCLA(testdata, percentileCutoff, positiveLabel)
    treatment.y_label = ["yes" if y == 1 else "no" for y in data["Label"]]
    treatment.decisions = ["yes" if y == 1 else "no" for y in data["CLA"]]
    treatment.probs = data["K"]
    return treatment.eval()
def CLA_SL(data, target, model="RF", est=False, T_rec=0.90, inc=False, seed=0,
           both=False, stats={"tp": 0, "p": 0}):
    # map learner names to learner classes
    learners = {"RF": RF, "SVM": SVM, "LR": LR, "NB": NB, "DT": DT, "TM": TM}
    treatment = Treatment(data, target)
    treatment.preprocess()
    traindata = treatment.full_train
    # pseudo-label the training data with CLA using a 90th-percentile cutoff
    full_data = getInstancesByCLA(traindata, 90, None)
    clf = learners[model](data, target)
    print(target, model)
    clf.preprocess()
    # replace the true training labels with the CLA pseudo-labels
    clf.x_label = ["yes" if x == 1 else "no" for x in full_data['CLA']]
    clf.train()
    clf.stats = stats
    results = clf.eval()
    return results
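
# Illustrative sketch: running the CLA-pseudo-labelled supervised baselines for
# each learner key registered in CLA_SL. `data` and `target` are assumed to
# follow the same Treatment conventions used throughout this module.
def demo_cla_sl_all_models(data, target):
    results = {}
    for model in ("RF", "SVM", "LR", "NB", "DT"):
        # each call preprocesses, pseudo-labels with CLA, trains, and evaluates
        results[model] = CLA_SL(data, target, model=model)
    return results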
def ECLA(data, target, model="RF", est=False, T_rec=0.90, inc=False, seed=0,
         both=False):
    # run Jitterbug's pattern-based "easy" step first
    jitterbug = Jitterbug(data, target)
    jitterbug.find_patterns()
    jitterbug.easy_code()
    jitterbug.test_patterns()
    rest_data = jitterbug.rest
    treatment = Treatment(rest_data, target)
    treatment.preprocess()
    test_data = treatment.full_test
    if both:
        test_data = [test_data, treatment.full_train]
        test_data = pd.concat(test_data, ignore_index=True)
    # apply CLA to the data left over after the easy step
    final_data = getInstancesByCLA(test_data, 90, None)
    final_data = final_data[:treatment.full_test.shape[0]]
    treatment.y_label = ["yes" if y == 1 else "no" for y in final_data["Label"]]
    treatment.decisions = ["yes" if y == 1 else "no" for y in final_data["CLA"]]
    treatment.probs = final_data["K"]
    treatment.stats = jitterbug.easy.stats_test
    return treatment, rest_data
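
# Illustrative sketch: running ECLA and evaluating the CLA decisions recorded
# on the returned Treatment object. Assumes Treatment.eval() behaves as it does
# elsewhere in this module.
def demo_ecla(data, target):
    treatment, rest_data = ECLA(data, target)
    results = treatment.eval()
    return results, rest_data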
def CLA(data, positiveLabel, percentileCutoff, suppress=0, experimental=0,
        both=False):
    # variant of CLA that operates directly on an already-prepared DataFrame
    # (no Treatment preprocessing of raw project data)
    try:
        treatment = Treatment({}, "")
    except:
        treatment = Treatment(data, "")
    final_data = getInstancesByCLA(data, percentileCutoff, positiveLabel)
    treatment.y_label = ["yes" if y == 1 else "no" for y in final_data["Label"]]
    treatment.decisions = ["yes" if y == 1 else "no" for y in final_data["CLA"]]
    summary = collections.Counter(treatment.decisions)
    print(summary, summary["yes"] / (summary["yes"] + summary["no"]))
    treatment.probs = final_data["K"]
    results = treatment.eval()
    # proportion of instances that CLA labels as "yes"
    results["read"] = summary["yes"] / (summary["yes"] + summary["no"])
    return results
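
# Illustrative sketch: calling the DataFrame-based CLA variant directly. The
# toy frame below is hypothetical; a real call would pass a preprocessed frame
# with numeric metric columns plus a "Label" column, which is what
# getInstancesByCLA is assumed to expect.
def demo_cla_on_frame():
    import pandas as pd
    toy = pd.DataFrame({
        "metric_a": [1.0, 5.0, 2.0, 7.0, 3.0, 6.0],
        "metric_b": [0.3, 0.9, 0.1, 0.8, 0.2, 0.7],
        "Label":    [0, 1, 0, 1, 0, 1],
    })
    return CLA(toy, positiveLabel=1, percentileCutoff=50)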
def tune_CLAMI(data, target, positiveLabel, percentileCutoff, suppress=0,
               experimental=0, metric="APFD"):
    treatment = Treatment(data, target)
    treatment.preprocess()
    data = treatment.full_train
    sss = StratifiedShuffleSplit(n_splits=1, test_size=.25, random_state=47)
    testdata = treatment.full_test
    X, y = data[data.columns[:-1]], data[data.columns[-1]]
    for train_index, tune_index in sss.split(X, y):
        train_df = data.iloc[train_index]
        tune_df = data.iloc[tune_index]
        train_df.reset_index(drop=True, inplace=True)
        tune_df.reset_index(drop=True, inplace=True)

        cutoffsForHigherValuesOfAttribute = getHigherValueCutoffs(
            train_df, percentileCutoff, "Label")
        print("get cutoffs")
        train_df = getInstancesByCLA(train_df, percentileCutoff, positiveLabel)
        print("get CLA instances")
        metricIdxWithTheSameViolationScores = getMetricIndicesWithTheViolationScores(
            train_df, cutoffsForHigherValuesOfAttribute, positiveLabel)
        keys = list(metricIdxWithTheSameViolationScores.keys())
        # keys.sort()

        # evaluate an initial random sample of feature subsets on the tuning split
        evaluated_configs = random.sample(keys, INIT_POOL_SIZE * 2)
        evaluated_configs = [metricIdxWithTheSameViolationScores[k]
                             for k in evaluated_configs]
        tmp_scores = []
        tmp_configs = []
        for selectedMetricIndices in evaluated_configs:
            selectedMetricIndices, res = MI(train_df, tune_df,
                                            selectedMetricIndices,
                                            percentileCutoff, positiveLabel,
                                            target)
            if isinstance(res, dict):
                tmp_configs.append(transform_metric_indices(
                    data.shape[1], selectedMetricIndices))
                tmp_scores.append(res)

        # best configuration found so far
        ids = np.argsort([x[metric] for x in tmp_scores])[::-1][:1]
        best_res = tmp_scores[ids[0]]
        best_config = np.where(tmp_configs[ids[0]] == 1)[0]

        # evaluation budget and early-stopping lives
        this_budget = BUDGET
        evals = 0
        lives = 5
        print("Initial Population: %s" % len(tmp_scores))
        searchspace = [
            transform_metric_indices(data.shape[1],
                                     metricIdxWithTheSameViolationScores[k])
            for k in keys
        ]
        while this_budget > 0:
            # fit a surrogate model on the configurations evaluated so far
            cart_model = DecisionTreeRegressor()
            cart_model.fit(tmp_configs, [x[metric] for x in tmp_scores])
            cart_models = [cart_model]

            # pick the next configuration suggested by the acquisition function
            next_config_id = acquisition_fn(searchspace, cart_models)
            next_config = metricIdxWithTheSameViolationScores[
                keys.pop(next_config_id)]
            searchspace.pop(next_config_id)
            next_config, next_res = MI(train_df, tune_df, next_config,
                                       percentileCutoff, positiveLabel, target)
            if not isinstance(next_res, dict):
                continue
            next_config_normal = transform_metric_indices(data.shape[1],
                                                          next_config)
            tmp_scores.append(next_res)
            tmp_configs.append(next_config_normal)
            try:
                # reset the lives counter when the score moves by at least
                # 0.03; otherwise lose a life
                if abs(next_res[metric] - best_res[metric]) >= 0.03:
                    lives = 5
                else:
                    lives -= 1
                if isBetter(next_res, best_res, metric):
                    best_config = next_config
                    best_res = next_res
                if lives == 0:
                    print("***" * 5)
                    print("EARLY STOPPING!")
                    print("***" * 5)
                    break
                this_budget -= 1
                evals += 1
            except:
                pdb.set_trace()

        # evaluate the best configuration found on the held-out test set
        _, res = MI(train_df, testdata, best_config, percentileCutoff,
                    positiveLabel, target)
        return res
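
# Illustrative sketch: tuning the CLAMI feature subset for one target project
# and reading the optimisation metric from the returned result dict. The metric
# name "APFD" comes from tune_CLAMI's default above; the percentile cutoff used
# here is hypothetical.
def demo_tune_clami(data, target):
    res = tune_CLAMI(data, target, positiveLabel=1, percentileCutoff=50,
                     metric="APFD")
    if isinstance(res, dict):
        print("tuned APFD:", res.get("APFD"))
    return res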