def random_miss_prop(train_x, label, miss_column, algorithm, file_name):
    """
    Assume every column of train_x can be missing.
    :param train_x: training data matrix.
    :param label: ground-truth anomaly labels.
    :param miss_column: columns eligible for missing-value injection.
    :param algorithm: name of the anomaly-detection algorithm.
    :param file_name: dataset file name (unused by this variant).
    :return: DataFrame with one row of AUC results per missingness proportion.
    """
    miss_prop = np.arange(0.0, 0.45, 0.05)
    result = pd.DataFrame()
    ad_detector = ADDetector(alg_type=algorithm)
    mvi_object = MissingValueInjector()
    num_missing = 0
    ad_detector.train(train_x, ensemble_size=1)
    for alpha in miss_prop:
        test_x = train_x.copy()
        if alpha > 0:
            num_missing, _ = mvi_object.inject_missing_in_random_cell(
                test_x, alpha)
            # Impute the injected cells and score the detector.
            mt_impute = metric(
                label,
                ad_detector.score(
                    mvi_object.impute_value(test_x.copy(), method="MICE"),
                    check_miss=False))
            ms_score = ad_detector.score(
                mvi_object.impute_value(test_x.copy(), method="SimpleFill"),
                check_miss=False)
            mt_raw = metric(label, ms_score)
        else:
            ms_score = ad_detector.score(test_x, check_miss=False)
            mt_raw = metric(label, ms_score)
            mt_impute = mt_raw  # Nothing is missing, so imputation is a no-op.
        # Reduced scoring on the data with the missing cells left in place.
        reduced_score = ad_detector.score(test_x, True)
        mt_reduced = metric(label, reduced_score)
        result = result.append(
            pd.Series([alpha] + [num_missing] + [mt_raw[0]] +
                      [mt_reduced[0]] + [mt_impute[0]] + [algorithm]),
            ignore_index=True)
    result.rename(columns={
        0: "miss_prop",
        1: "num_max_miss_features",
        2: "auc",
        3: "auc_reduced",
        4: "auc_impute",
        5: "algorithm"
    },
                  inplace=True)
    return result
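# Minimal usage sketch for random_miss_prop. Hedged: the CSV path, the
# "label" column name, and the "IFOR" algorithm name are illustrative
# assumptions, not fixed anywhere in this module.
def demo_random_miss_prop():
    data = pd.read_csv("data/example.csv")       # hypothetical dataset
    label = data.pop("label").values             # hypothetical label column
    train_x = data.values
    miss_column = list(range(train_x.shape[1]))  # every column may go missing
    res = random_miss_prop(train_x, label, miss_column,
                           algorithm="IFOR",     # assumed algorithm name
                           file_name="data/example.csv")
    res.to_csv("random_miss_prop_results.csv", index=False)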
def miss_proportions_exp(train_x, label, miss_column, algorithm):
    ensemble_size = 2  # range(1, ensemble_size) runs sizes 1..ensemble_size-1.
    result = pd.DataFrame()
    miss_prop = np.arange(0.05, 0.45, 0.05)
    fraction_missing_features = int(np.ceil(len(miss_column) * 0.8))
    ad_detector = ADDetector(alg_type=algorithm)
    mvi_object = MissingValueInjector()
    for en_size in range(1, ensemble_size):
        ad_detector.train(train_x, ensemble_size=en_size)
        for alpha in miss_prop:
            for num_missing in range(0, fraction_missing_features):
                test_x = train_x.copy()
                ms_score = ad_detector.score(test_x, False)
                mt_raw = metric(label, ms_score)
                if num_missing > 0:
                    mvi_object.inject_missing_value(test_x, num_missing,
                                                    alpha, miss_column)
                    # Impute the injected cells and score the detector.
                    mt_impute = metric(
                        label,
                        ad_detector.score(mvi_object.impute_value(test_x),
                                          False))
                else:
                    # Imputation is not applicable when nothing is missing;
                    # record a placeholder AUC of 0.
                    mt_impute = [0, 0]
                mt_reduced = metric(label,
                                    ad_detector.score(test_x.copy(), True))
                # Note: the first column ("anom_prop") holds the missingness
                # proportion alpha.
                result = result.append(
                    pd.Series([alpha] +
                              [num_missing / float(len(miss_column))] +
                              [mt_raw[0]] + [mt_reduced[0]] +
                              [mt_impute[0]] + [en_size] + [algorithm]),
                    ignore_index=True)
    result.rename(columns={
        0: "anom_prop",
        1: "num_max_miss_features",
        2: "auc",
        3: "auc_reduced",
        4: "auc_impute",
        5: "ensemble_size",
        6: "algorithm"
    },
                  inplace=True)
    return result
def test_cell_injector():
    w = np.random.randn(10, 5)
    test = w.copy()
    ad_in = MissingValueInjector()
    ad_in.inject_missing_in_random_cell(w, 0.3)  # mutates w in place
    ff = pft.IsolationForest(ntree=10)
    ff.train(test)
    print ff.score(test)
    print ff.average_depth()[0:3]
    print ff.score(w)
    print ff.average_depth()[0:3]
    print ff.score(w, cmv=True)
    print ff.average_depth()[0:3]
    print pft.__file__
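# Convenience entry point so the smoke test above can be run directly
# (a sketch; the module may already define its own entry point elsewhere).
if __name__ == "__main__":
    test_cell_injector()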
def algo_miss_featuresX(train_x, label, miss_column, algorithm, file_name,
                        label_field=0):
    """
    Run the benchmark grid locally with a multiprocess pool. Useful when
    the job can only run on a single node.
    :param train_x: training data matrix.
    :param label: ground-truth anomaly labels.
    :param miss_column: columns eligible for missing-value injection.
    :param algorithm: name of the anomaly-detection algorithm.
    :param file_name: dataset file name, passed to the detector at training.
    :param label_field: index of the label column.
    :return: DataFrame of AUC results per (miss_prop, num_miss) setting.
    """
    # Exposed as module globals so the benchmarks workers can reuse the
    # trained detector and injector without re-pickling them per task.
    global ad_detector, mvi_object
    miss_prop = np.arange(0, 1.1, 0.1)
    d = len(miss_column)
    fraction_missing_features = int(np.ceil(d * 0.8))
    ad_detector = ADDetector(alg_type=algorithm, label=label_field)
    mvi_object = MissingValueInjector()
    # Train the forest once, then fan the (alpha, num_miss) grid out
    # across all cores.
    ad_detector.train(train_x, ensemble_size=1, file_name=file_name)
    num_cores = multiprocessing.cpu_count()
    result = Parallel(n_jobs=num_cores)(
        delayed(benchmarks, check_pickle=False)(train_x, label, miss_column,
                                                algorithm, alpha, num_miss)
        for alpha in miss_prop
        for num_miss in range(1, fraction_missing_features))
    result = pd.DataFrame(result)
    result.rename(columns={
        0: "miss_prop",
        1: "miss_features_prop",
        2: "auc_mean_impute",
        3: "auc_reduced",
        4: "auc_MICE_impute",
        5: "ensemble_size",
        6: "algorithm"
    },
                  inplace=True)
    return result
def algo_miss_features(train_x, label, miss_column, algorithm, file_name,
                       label_field=0):
    # Train the forest.
    ensemble_size = 2  # range(1, ensemble_size) runs sizes 1..ensemble_size-1.
    result = pd.DataFrame()
    miss_prop = np.arange(0, 1.1, 0.1)
    d = len(miss_column)
    fraction_missing_features = int(np.ceil(d * 0.8))
    ad_detector = ADDetector(alg_type=algorithm, label=label_field)
    mvi_object = MissingValueInjector()
    for en_size in range(1, ensemble_size):
        ad_detector.train(train_x, ensemble_size=en_size, file_name=file_name)
        for alpha in miss_prop:
            for num_missing in range(1, fraction_missing_features):
                test_x = train_x.copy()
                mvi_object.inject_missing_value(test_x, num_missing, alpha,
                                                miss_column)
                if algorithm.upper() != 'BIFOR':
                    # Score with the missing values imputed.
                    if num_missing * alpha > 0:
                        ms_score = ad_detector.score(
                            mvi_object.impute_value(test_x.copy(),
                                                    "SimpleFill"), False)
                        mt = metric(label, ms_score)
                        mt_impute = metric(
                            label,
                            ad_detector.score(
                                mvi_object.impute_value(test_x.copy(),
                                                        method="MICE"),
                                False))
                    else:
                        ms_score = ad_detector.score(test_x, False)
                        mt = metric(label, ms_score)
                        mt_impute = mt
                else:
                    # To reduce computation, skip imputation for BIFOR and
                    # record placeholder AUCs; only the reduced approach
                    # below is evaluated for it.
                    mt = mt_impute = [0.0, 0.0]
                # Bagging (reduced) approach.
                mt_reduced = metric(label, ad_detector.score(test_x, True))
                result = result.append(
                    pd.Series([alpha] +
                              [num_missing / float(len(miss_column))] +
                              [mt[0]] + [mt_reduced[0]] + [mt_impute[0]] +
                              [en_size] + [algorithm]),
                    ignore_index=True)
    result.rename(columns={
        0: "miss_prop",
        1: "miss_features_prop",
        2: "auc_mean_impute",
        3: "auc_reduced",
        4: "auc_MICE_impute",
        5: "ensemble_size",
        6: "algorithm"
    },
                  inplace=True)
    return result
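# Hedged helper sketch: collapse the per-run frame produced by
# algo_miss_features into mean AUC per missingness proportion. Assumes the
# AUC columns follow the rename above and can be cast to float.
def summarize_auc(result):
    auc_cols = ["auc_mean_impute", "auc_reduced", "auc_MICE_impute"]
    result[auc_cols] = result[auc_cols].astype(float)
    return result.groupby("miss_prop")[auc_cols].mean()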
def single_benchmark(train_x, label, miss_column, file_name, label_field,
                     algorithm_list=ALGORITHMS, task_id=1):
    d = len(miss_column)  # number of features eligible to go missing
    frac_missing_prop, frac_features = algo_parameters(task_id)
    num_missing_att = int(np.ceil(d * frac_features))
    test_x = train_x.copy()
    miss_index = []
    scores_result = []
    algorithms = {}
    mvi_object = MissingValueInjector()
    x_impute = lambda method: mvi_object.impute_value(test_x, method=method)
    # Train one detector per requested algorithm; skip any that fail.
    for algo in algorithm_list:
        try:
            algorithms[algo] = ADDetector(alg_type=algo, label=label_field)
            algorithms[algo].train(train_x, ensemble_size=1,
                                   file_name=file_name)
        except Exception as e:
            print "Error from {0:s}".format(algo), e.message
            continue

    def score_algo(test_x, method, score_bool=False):
        scores = []
        for algo in algorithms:
            try:
                # BIFOR is evaluated with the reduced approach only, so
                # skip the imputation-based runs for it.
                if (method in ["mean", "MICE"]) and algo == "BIFOR":
                    continue
                local_score = algorithms[algo].score(test_x, score_bool)
                logging.debug("score {0:d} - {1:s} - {2:s}-{3:s}".format(
                    len(local_score), file_name, str(frac_features),
                    str(frac_missing_prop)))
                auc_score = metric(label, local_score)[0]
                scores.append([
                    frac_missing_prop, frac_features, auc_score, algo,
                    method,
                    os.path.basename(file_name)
                ])
            except Exception as e:
                print "Error from {0:s}".format(algo), e.message
                continue
        return scores

    if num_missing_att * frac_missing_prop > 0:
        miss_index = mvi_object.inject_missing_value(
            data=test_x,
            num_missing_attribute=num_missing_att,
            alpha=frac_missing_prop,
            miss_att_list=miss_column)
        x_na_mean = x_impute("SimpleFill")
        scores_result += score_algo(x_na_mean, method="mean",
                                    score_bool=False)
        # Restore the injected NaNs before the next imputation pass.
        replace_with_nan(miss_index, test_x)
        x_na_mice = x_impute("MICE")
        scores_result += score_algo(x_na_mice, method="MICE",
                                    score_bool=False)
    else:
        scores_result += score_algo(test_x, method="NoImpute")
    # Restore NaNs (a no-op when nothing was injected) and run the
    # reduced-scoring pass.
    replace_with_nan(miss_index, test_x)
    scores_result += score_algo(test_x, method="reduced", score_bool=True)
    return scores_result
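# Hedged driver sketch for single_benchmark: runs one (missingness
# proportion, feature fraction) cell selected by task_id and flattens the
# row lists into a DataFrame. The column names here mirror the fields
# appended in score_algo and are an assumption, not a fixed schema.
def run_single_benchmark(train_x, label, miss_column, file_name,
                         label_field=0, task_id=1):
    rows = single_benchmark(train_x, label, miss_column, file_name,
                            label_field, task_id=task_id)
    return pd.DataFrame(rows,
                        columns=["miss_prop", "frac_features", "auc",
                                 "algorithm", "method", "dataset"])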