def __init__(self, train, test, id_column, y_column_name):
    self.train = train
    self.test = test
    self.y_column_name = y_column_name
    self.id_column = id_column
    self.data = pd.concat([train, test], ignore_index=True)
    self.feature_engineering = FeatureEngineering(train, test, id_column, y_column_name)
    self.feature_selection = FeatureSelection(train, test, id_column, y_column_name)
def get(argsfselection):
    args_list = argsfselection.split(",")
    if len(args_list) not in (3, 4):
        return None

    # 4 arguments: selector name, selector argument, csv file, json file
    # 3 arguments: selector name, csv file, json file (no selector argument)
    if len(args_list) == 4:
        feasel_name, feasel_arg, filename, jfilename = args_list
    else:
        feasel_name, filename, jfilename = args_list
        feasel_arg = ""

    json_path = currentDirPath + "/jsonfiles/" + jfilename
    file_path = currentDirPath + "/csvfiles/" + filename

    data = pd.read_csv(file_path)
    data_afs = FeatureSelection.selection(data, feasel_name, feasel_arg)
    data_afs.reset_index(inplace=True, drop=True)
    data_afs.to_csv(file_path, index=False)

    data_t10 = data_afs.head(10)
    data_noff = len(data_afs.columns)

    with open(json_path) as jcon:
        jdata = json.load(jcon)
    jdata.update({'FeatureSelection': feasel_name})
    with open(json_path, 'w') as j_con:
        json.dump(jdata, j_con)

    return {
        'predata': data_t10.to_json(orient='table'),
        'filename': filename,
        'numofcol': data_noff,
        'jfilename': jfilename
    }
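# Hedged usage sketch (not from the source): how get() might be invoked once the
# csvfiles/ and jsonfiles/ directories hold the referenced files. The selector name
# "variance", its argument "0.1", and both file names are illustrative placeholders.
example = get("variance,0.1,uploaded_data.csv,uploaded_data.json")
print(example['numofcol'], example['jfilename'])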
def __main__():
    matrix = [[0.7, 0.9, 0.4, 0.6, 1],
              [0.6, 0.9, 0.3, 0.7, 1],
              [0.6, 0.8, 0.3, 0.5, 1],
              [0.3, 0.5, 0.7, 0.2, 1],
              [0.3, 0.4, 0.8, 0.3, 1],
              [0.4, 0.5, 0.6, 0.3, 1],
              [0.9, 0.4, 0.5, 0.9, 0],
              [0.8, 0.5, 0.4, 0.8, 0],
              [0.2, 0.6, 0.7, 1.0, 0],
              [0.1, 0.7, 0.8, 0.8, 0]]
    instance_selection_obj = InstanceSelection(matrix)
    instance_selection_obj.apply()
    feature_selection_obj = FeatureSelection(
        instance_selection_obj.representative_instances_list[0])
    feature_selection_obj.apply(instance_selection_obj)
    print(instance_selection_obj.representative_instances_list)
    print(feature_selection_obj.rep_feature_set)
class PreprocessedData:
    """
    Combines the feature engineering and feature selection classes into a single
    preprocessing step. Takes the cleaned data and the y_column_name (ratings).
    """

    def __init__(self, data, y_column_name):
        self.data = data
        self.y_column_name = y_column_name
        self.feature_engineering = FeatureEngineering(data, y_column_name)
        self.feature_selection = FeatureSelection(data, y_column_name)

    def preprocess_my_data(self, num_of_features_to_select):
        """
        Preprocesses the cleaned data and performs feature selection to keep the
        n best features.

        :param num_of_features_to_select: number of best features to keep
        :return: fully preprocessed data with the n selected features
        """
        self.data = self.feature_engineering.input_rare_categorical()
        self.data = self.feature_engineering.encode_categorical_features()
        self.data = self.feature_engineering.scale_features()
        self.data = self.feature_selection.perform_feature_selection(
            num_of_features_to_select)
        return self.data
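# Hedged usage sketch (illustrative, not from the source): drives PreprocessedData end
# to end. The input file name and the feature count are placeholder assumptions; the
# "ratings" target column comes from the class docstring above.
cleaned_df = pd.read_csv("cleaned_reviews.csv")          # assumed pre-cleaned input
preprocessed = PreprocessedData(cleaned_df, y_column_name="ratings")
model_ready = preprocessed.preprocess_my_data(num_of_features_to_select=10)
print(model_ready.shape)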
def main():
    LOCAL_LOCATION_X = "../Data/fpkm_normalized.csv"
    LOCAL_LOCATION_Y = "../Data/disease.csv"

    print("Loading Data...")
    features = pd.read_csv(LOCAL_LOCATION_X, header=None)
    labels = pd.read_csv(LOCAL_LOCATION_Y, header=None)
    print("Data Loaded!")

    feature_selection = FeatureSelection('mRMR', 5, features, labels)
    feature_indices = feature_selection.get_best_features()
    # feature_indices = [4929, 5345, 16381, 13656]
    print("Features have been selected!")

    selected_features = features[feature_indices]
    labels = pd.DataFrame(modify_output(labels))
    labels = pd.DataFrame(keras.utils.to_categorical(labels, num_classes=34))
    run(0, selected_features, labels, False)
def setUp(self):
    self.X = np.array([[i, i + 1, i + 2, i + 3, i + 4] for i in range(0, 100, 10)])
    self.y = np.array([n * 10 + 1 for n in range(10)])
    self.X_sub = np.array([[31, 32, 33, 34, 35], [41, 42, 43, 44, 45]])
    self.featureSelection = FeatureSelection(lower_is_better=True,
                                             method=None,
                                             X=self.X,
                                             y=self.y,
                                             X_sub=self.X_sub,
                                             clf=RandomForestRegressor(n_estimators=2),
                                             score_func=ProblemType.logloss,
                                             problem_type='classification',
                                             col_names=['A', 'B', 'C', 'D', 'E'])
class TestFeatureSelection(TestCase):

    def setUp(self):
        self.X = np.array([[i, i + 1, i + 2, i + 3, i + 4] for i in range(0, 100, 10)])
        self.y = np.array([n * 10 + 1 for n in range(10)])
        self.X_sub = np.array([[31, 32, 33, 34, 35], [41, 42, 43, 44, 45]])
        self.featureSelection = FeatureSelection(lower_is_better=True,
                                                 method=None,
                                                 X=self.X,
                                                 y=self.y,
                                                 X_sub=self.X_sub,
                                                 clf=RandomForestRegressor(n_estimators=2),
                                                 score_func=ProblemType.logloss,
                                                 problem_type='classification',
                                                 col_names=['A', 'B', 'C', 'D', 'E'])

    def test_allSelection(self):
        X, X_sub = self.featureSelection.allSelection()
        self.assertEqual(X.shape[1], 5, 'number of columns of X is not 5!')
        self.assertEqual(X_sub.shape[1], 5, 'number of columns of X_sub is not 5!')

    def test_forwardsSelection(self):
        X, X_sub = self.featureSelection.forwardsSelection()
        self.assertTrue(X is not None)
        self.assertTrue(X_sub is not None)

    def test_backwardsSelection(self):
        # The original called forwardsSelection() here, which looks like a copy-paste slip;
        # the test name implies backwardsSelection().
        X, X_sub = self.featureSelection.backwardsSelection()
        self.assertTrue(X is not None)
        self.assertTrue(X_sub is not None)

    def test_featureImportancesSelection(self):
        X, X_sub = self.featureSelection.featureImportancesSelection(total_importance=0.95)
        self.assertTrue(X is not None)
        self.assertTrue(X_sub is not None)

    def test_randomSubsetSelection(self):
        X, X_sub = self.featureSelection.randomSubsetSelection(percent=0.4)
        self.assertEqual(X.shape[1], 2, 'number of columns of X is not 2!')
        self.assertEqual(X_sub.shape[1], 2, 'number of columns of X_sub is not 2!')

    def test_featureExtractionFromActualDataset(self):
        dataLoader = TrainTestDataLoader('../data/rossmann/train_100.csv',
                                         '../data/rossmann/test_100.csv',
                                         train_labels_column='Sales',
                                         test_ids_column='Id')
        dataLoader.cleanData(max_onehot_limit=200)
        X, X_sub, y = dataLoader.getTrainTestData()
        featureSelection = FeatureSelection(lower_is_better=True,
                                            method='all',
                                            X=X,
                                            y=y,
                                            clf=LogisticRegressionCV(),
                                            problem_type='classification')
class ProcessedData:

    def __init__(self, train, test, id_column, y_column_name):
        self.train = train
        self.test = test
        self.y_column_name = y_column_name
        self.id_column = id_column
        self.data = pd.concat([self.train, self.test], ignore_index=True)
        self.feature_engineering = FeatureEngineering(self.train, self.test,
                                                      self.id_column, self.y_column_name)
        self.feature_selection = FeatureSelection(self.train, self.test,
                                                  self.id_column, self.y_column_name)

    def preprocess_my_data(self, num_of_features_to_select):
        self.data = self.feature_engineering.scale_features()
        self.data = self.feature_selection.perform_feature_selection(
            num_of_features_to_select)
        return self.data
class PreprocessedData:

    def __init__(self, train, test, id_column, y_column_name):
        self.train = train
        self.test = test
        self.y_column_name = y_column_name
        self.id_column = id_column
        self.data = pd.concat([train, test], ignore_index=True)
        self.feature_engineering = FeatureEngineering(train, test, id_column, y_column_name)
        self.feature_selection = FeatureSelection(train, test, id_column, y_column_name)

    def preprocess_my_data(self, num_of_features_to_select):
        self.data = self.feature_engineering.fill_na_categorical()
        self.data = self.feature_engineering.fill_na_numerical()
        self.data = self.feature_engineering.input_rare_categorical()
        self.data = self.feature_engineering.label_encoder()
        self.data = self.feature_engineering.get_scale_features()
        self.data = self.feature_selection.perform_feature_selection(
            num_of_features_to_select)
        return self.data
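# Hedged usage sketch (illustrative, not from the source): the train/test variant above
# concatenates both frames before engineering and selection. The file names, the id and
# target column names, and the feature count are placeholder assumptions.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
prep = PreprocessedData(train_df, test_df, id_column="Id", y_column_name="target")
combined = prep.preprocess_my_data(num_of_features_to_select=20)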
def __init__(self, data, y_column_name):
    self.data = data
    self.y_column_name = y_column_name
    self.feature_engineering = FeatureEngineering(data, y_column_name)
    self.feature_selection = FeatureSelection(data, y_column_name)
def __init__(
    self,
    static_imputation_model_list,
    temporal_imputation_model_list,
    static_feature_selection_model_list,
    temporal_feature_selection_model_list,
    prediction_model_list,
    dataset_training,
    dataset_testing,
    task,
    metric_name,
    metric_parameters,
):
    self.dataset_testing = dataset_testing
    self.dataset_training = dataset_training
    self.static_imputation_model_list = static_imputation_model_list
    self.temporal_imputation_model_list = temporal_imputation_model_list
    self.static_feature_selection_model_list = static_feature_selection_model_list
    self.temporal_feature_selection_model_list = temporal_feature_selection_model_list
    self.prediction_model_list = prediction_model_list

    # imputation
    static_imputation_list = [
        imputation.Imputation(imputation_model_name=x, data_type="static")
        for x in static_imputation_model_list
    ]
    temporal_imputation_list = [
        imputation.Imputation(imputation_model_name=x, data_type="temporal")
        for x in temporal_imputation_model_list
    ]

    # feature selection
    static_feature_selection_list = []
    for x in static_feature_selection_model_list:
        # Select relevant features
        static_feature_selection = FeatureSelection(
            feature_selection_model_name=x[0],
            feature_type="static",
            feature_number=x[1],
            task=task,
            metric_name=metric_name,
            metric_parameters=metric_parameters,
        )
        static_feature_selection_list.append(static_feature_selection)

    temporal_feature_selection_list = []
    for x in temporal_feature_selection_model_list:
        # Select relevant features
        temporal_feature_selection = FeatureSelection(
            feature_selection_model_name=x[0],
            feature_type="temporal",
            feature_number=x[1],
            task=task,
            metric_name=metric_name,
            metric_parameters=metric_parameters,
        )
        temporal_feature_selection_list.append(temporal_feature_selection)

    # prediction
    pred_class_list = []
    # Set predictive model
    model_name_list = prediction_model_list
    for model_name in model_name_list:
        # Set model parameters
        model_parameters = {
            "h_dim": 100,
            "n_layer": 2,
            "n_head": 2,
            "batch_size": 128,
            "epoch": 2,
            "model_type": model_name,
            "learning_rate": 0.001,
            "static_mode": "Concatenate",
            "time_mode": "Concatenate",
            "verbose": False,
        }
        # Train the predictive model
        pred_class = prediction(model_name, model_parameters, task)
        pred_class_list.append(pred_class)

    self.pred_class_list = pred_class_list
    self.temporal_feature_selection_list = temporal_feature_selection_list
    self.static_feature_selection_list = static_feature_selection_list
    self.temporal_imputation_list = temporal_imputation_list
    self.static_imputation_list = static_imputation_list

    self.domain = [
        {"name": "static_imputation", "type": "discrete",
         "domain": list(range(len(static_imputation_list)))},
        {"name": "temporal_imputation", "type": "discrete",
         "domain": list(range(len(temporal_imputation_list)))},
        {"name": "static_feature_selection", "type": "discrete",
         "domain": list(range(len(static_feature_selection_list)))},
        {"name": "temporal_feature_selection", "type": "discrete",
         "domain": list(range(len(temporal_feature_selection_list)))},
        {"name": "pred_class", "type": "discrete",
         "domain": list(range(len(pred_class_list)))},
    ]
    self.myBopt = BayesianOptimization(f=self.f, domain=self.domain)
from feature_selection import FeatureSelection
from config import Configure
import pandas as pd

print('\n Applying the feature selection algorithm')

settings = Configure()
settings.set_fs_params()
settings.set_pre_processing_params()
pp_params = settings.pre_processing_params
fs_params = settings.feature_selection_params

df1 = pd.read_csv(settings.pf1_folder)
df2 = pd.read_csv(settings.pf2_folder)
df3 = pd.read_csv(settings.pf3_folder)
mkt = pd.read_csv(settings.mkt_folder)

fs = FeatureSelection(mkt, df1, df2, df3, pp_params, fs_params)
values, features = fs.feature_selection_algorithm(m='RFECV')
columns = features.values[features.values != 'Unnamed: 0']
mkt = mkt[columns]
MAX_ITERATIONS = int(sys.argv[1])

for size in range(MAX_ITERATIONS - 1, MAX_ITERATIONS):
    np.random.shuffle(data)
    test_data = data[:size]

    # Representative instance selection
    start_time1 = time.time()
    InstanceSelector = InstanceSelection(test_data)
    InstanceSelector.apply()
    end_time1 = time.time()
    algo1_time = end_time1 - start_time1

    # Feature selection
    start_time2 = time.time()
    feature_selection_obj = FeatureSelection(InstanceSelector.representative_instances_list[0])
    feature_selection_obj.apply(InstanceSelector)
    end_time2 = time.time()
    algo2_time = end_time2 - start_time2

    # print("Algo 1 time : ", algo1_time)
    # print("Algo 2 time : ", algo2_time)
    representative_instances = InstanceSelector.representative_instances_list
    # print("Instance set : ", representative_instances)
    feature_set = list(feature_selection_obj.rep_feature_set)
    # print("Feature set : ", feature_set)
    # time_taken[size] = end_time - start_time
    # print(len(InstanceSelector.representative_instances_list[0]))

    print("{:27s} |{:8s}|{:8s}|{:8s}|{:8s}".format("Model", "Accuracy", "Precision", "Recall", "F1-score"))
    print("-" * 65)
def main(args):
    '''Main function for active sensing.

    Args:
        - data loading parameters:
            - data_names: mimic, ward, cf
        - preprocess parameters:
            - normalization: minmax, standard, None
            - one_hot_encoding: input features that need to be one-hot encoded
            - problem: 'one-shot' or 'online'
                - 'one-shot': one-time prediction at the end of the time-series
                - 'online': prediction at every time stamp of the time-series
            - max_seq_len: maximum sequence length after padding
            - label_name: the column name for the label(s)
            - treatment: the column name for treatments
        - imputation parameters:
            - static_imputation_model: mean, median, mice, missforest, knn, gain
            - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain
        - feature selection parameters:
            - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None
            - feature_number: number of features to select
        - active_sensing_model_parameters:
            - active_sensing_model_name: asac, deepsensing
            - model_name: rnn, lstm, gru
            - model_parameters: network parameters such as number of layers
                - h_dim: hidden dimensions
                - n_layer: layer number
                - n_head: head number (only for transformer model)
                - batch_size: number of samples in mini-batch
                - epochs: number of epochs
                - learning_rate: learning rate
            - static_mode: how to utilize static features (concatenate or None)
            - time_mode: how to utilize time information (concatenate or None)
            - task: classification or regression
    '''
    #%% Step 0: Set basic parameters
    metric_parameters = {
        'problem': args.problem,
        'label_name': [args.label_name]
    }

    #%% Step 1: Upload Dataset
    # File names
    data_directory = '../datasets/data/' + args.data_name + '/' + args.data_name + '_'

    data_loader_training = CSVLoader(
        static_file=data_directory + 'static_train_data.csv.gz',
        temporal_file=data_directory + 'temporal_train_data_eav.csv.gz')
    data_loader_testing = CSVLoader(
        static_file=data_directory + 'static_test_data.csv.gz',
        temporal_file=data_directory + 'temporal_test_data_eav.csv.gz')

    dataset_training = data_loader_training.load()
    dataset_testing = data_loader_testing.load()
    print('Finish data loading.')

    #%% Step 2: Preprocess Dataset
    # (0) filter out negative values (automatically)
    negative_filter = FilterNegative()
    # (1) one-hot encode categorical features
    onehot_encoder = OneHotEncoder(
        one_hot_encoding_features=[args.one_hot_encoding])
    # (2) Normalize features: 3 options (minmax, standard, none)
    normalizer = Normalizer(args.normalization)

    filter_pipeline = PipelineComposer(negative_filter, onehot_encoder, normalizer)

    dataset_training = filter_pipeline.fit_transform(dataset_training)
    dataset_testing = filter_pipeline.transform(dataset_testing)
    print('Finish preprocessing.')

    #%% Step 3: Define Problem
    problem_maker = ProblemMaker(problem=args.problem,
                                 label=[args.label_name],
                                 max_seq_len=args.max_seq_len,
                                 treatment=args.treatment)

    dataset_training = problem_maker.fit_transform(dataset_training)
    dataset_testing = problem_maker.fit_transform(dataset_testing)
    print('Finish defining problem.')

    #%% Step 4: Impute Dataset
    static_imputation = Imputation(
        imputation_model_name=args.static_imputation_model, data_type='static')
    temporal_imputation = Imputation(
        imputation_model_name=args.temporal_imputation_model, data_type='temporal')

    imputation_pipeline = PipelineComposer(static_imputation, temporal_imputation)

    dataset_training = imputation_pipeline.fit_transform(dataset_training)
    dataset_testing = imputation_pipeline.transform(dataset_testing)
    print('Finish imputation.')

    #%% Step 5: Feature selection (4 options)
    static_feature_selection = FeatureSelection(
        feature_selection_model_name=args.static_feature_selection_model,
        feature_type='static',
        feature_number=args.static_feature_selection_number,
        task=args.task,
        metric_name=args.metric_name,
        metric_parameters=metric_parameters)
    temporal_feature_selection = FeatureSelection(
        feature_selection_model_name=args.temporal_feature_selection_model,
        feature_type='temporal',
        feature_number=args.temporal_feature_selection_number,
        task=args.task,
        metric_name=args.metric_name,
        metric_parameters=metric_parameters)

    feature_selection_pipeline = PipelineComposer(static_feature_selection,
                                                  temporal_feature_selection)

    dataset_training = feature_selection_pipeline.fit_transform(dataset_training)
    dataset_testing = feature_selection_pipeline.transform(dataset_testing)
    print('Finish feature selection.')

    #%% Step 6: Fit and Predict (6 options)
    # Set predictor model parameters
    model_parameters = {
        'h_dim': args.h_dim,
        'n_layer': args.n_layer,
        'batch_size': args.batch_size,
        'epoch': args.epochs,
        'model_type': args.model_name,
        'learning_rate': args.learning_rate,
        'static_mode': args.static_mode,
        'time_mode': args.time_mode,
        'verbose': True
    }

    # Set the validation data for best model saving
    dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.0)

    active_sensing_class = active_sensing(args.active_sensing_model_name,
                                          model_parameters, args.task)
    active_sensing_class.fit(dataset_training)
    test_s_hat = active_sensing_class.predict(dataset_testing)
    print('Finish original predictor model training and testing.')

    #%% Step 7: Visualize Results
    idx = np.random.permutation(len(test_s_hat))[:2]

    # Visualize the output
    print('Future Measurements Recommendation')
    print_interpretation(test_s_hat[idx], dataset_testing.feature_name,
                         metric_parameters, model_parameters)

    return
from feature_selection import FeatureSelection
import pandas as pd
from config.config_writer import ConfigWriter
from utils import run_utils
from config import config_reader

df = pd.read_csv('data_test/iris.csv')
fs = FeatureSelection(df)
labels = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
directory = 'data_test/best_exporter_test'
config_test_file = 'data_test/iris_config_test.ini'
checkpoints = {'1532434574': {'accuracy': 1.0, 'loss': 0.153, 'step': '42'}}
dict_types = {
    'sepal_length': 'numerical',
    'sepal_width': 'numerical',
    'petal_length': 'numerical',
    'petal_width': 'numerical',
    'class': 'categorical'
}
sfeatures = {
    'sepal_length': '5.8',
    'sepal_width': '3.0',
    'petal_length': '4.35',
    'petal_width': '1.3'
}
categories = [
    'numerical', 'numerical', 'numerical', 'numerical', 'categorical'
]


def test_get_html_types():
def fs(df):
    return FeatureSelection(df)
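# Hedged usage sketch (illustrative): a minimal pytest test consuming the fs helper
# above, assuming it is registered as a fixture (e.g. decorated with @pytest.fixture)
# and that a companion df fixture supplies the DataFrame.
def test_fs_wraps_dataframe(fs):
    assert isinstance(fs, FeatureSelection)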
def main(args): """Main function for AutoML in time-series predictions. Args: - data loading parameters: - data_names: mimic, ward, cf - preprocess parameters: - normalization: minmax, standard, None - one_hot_encoding: input features that need to be one-hot encoded - problem: 'one-shot' or 'online' - 'one-shot': one time prediction at the end of the time-series - 'online': preditcion at every time stamps of the time-series - max_seq_len: maximum sequence length after padding - label_name: the column name for the label(s) - treatment: the column name for treatments - imputation parameters: - static_imputation_model: mean, median, mice, missforest, knn, gain - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain - feature selection parameters: - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None - feature_number: selected featuer number - predictor_parameters: - epochs: number of epochs - bo_itr: bayesian optimization iterations - static_mode: how to utilize static features (concatenate or None) - time_mode: how to utilize time information (concatenate or None) - task: classification or regression - metric_name: auc, apr, mae, mse """ #%% Step 0: Set basic parameters metric_sets = [args.metric_name] metric_parameters = { "problem": args.problem, "label_name": [args.label_name] } #%% Step 1: Upload Dataset # File names data_directory = "../datasets/data/" + args.data_name + "/" + args.data_name + "_" data_loader_training = CSVLoader( static_file=data_directory + "static_train_data.csv.gz", temporal_file=data_directory + "temporal_train_data_eav.csv.gz", ) data_loader_testing = CSVLoader( static_file=data_directory + "static_test_data.csv.gz", temporal_file=data_directory + "temporal_test_data_eav.csv.gz", ) dataset_training = data_loader_training.load() dataset_testing = data_loader_testing.load() print("Finish data loading.") #%% Step 2: Preprocess Dataset # (0) filter out negative values (Automatically) negative_filter = FilterNegative() # (1) one-hot encode categorical features onehot_encoder = OneHotEncoder( one_hot_encoding_features=[args.one_hot_encoding]) # (2) Normalize features: 3 options (minmax, standard, none) normalizer = Normalizer(args.normalization) filter_pipeline = PipelineComposer(negative_filter, onehot_encoder, normalizer) dataset_training = filter_pipeline.fit_transform(dataset_training) dataset_testing = filter_pipeline.transform(dataset_testing) print("Finish preprocessing.") #%% Step 3: Define Problem problem_maker = ProblemMaker(problem=args.problem, label=[args.label_name], max_seq_len=args.max_seq_len, treatment=args.treatment) dataset_training = problem_maker.fit_transform(dataset_training) dataset_testing = problem_maker.fit_transform(dataset_testing) print("Finish defining problem.") #%% Step 4: Impute Dataset static_imputation = Imputation( imputation_model_name=args.static_imputation_model, data_type="static") temporal_imputation = Imputation( imputation_model_name=args.temporal_imputation_model, data_type="temporal") imputation_pipeline = PipelineComposer(static_imputation, temporal_imputation) dataset_training = imputation_pipeline.fit_transform(dataset_training) dataset_testing = imputation_pipeline.transform(dataset_testing) print("Finish imputation.") #%% Step 5: Feature selection (4 options) static_feature_selection = FeatureSelection( feature_selection_model_name=args.static_feature_selection_model, feature_type="static", 
feature_number=args.static_feature_selection_number, task=args.task, metric_name=args.metric_name, metric_parameters=metric_parameters, ) temporal_feature_selection = FeatureSelection( feature_selection_model_name=args.temporal_feature_selection_model, feature_type="temporal", feature_number=args.temporal_feature_selection_number, task=args.task, metric_name=args.metric_name, metric_parameters=metric_parameters, ) feature_selection_pipeline = PipelineComposer(static_feature_selection, temporal_feature_selection) dataset_training = feature_selection_pipeline.fit_transform( dataset_training) dataset_testing = feature_selection_pipeline.transform(dataset_testing) print("Finish feature selection.") #%% Step 6: Bayesian Optimization ## Model define # RNN model rnn_parameters = { "model_type": "lstm", "epoch": args.epochs, "static_mode": args.static_mode, "time_mode": args.time_mode, "verbose": False, } general_rnn = GeneralRNN(task=args.task) general_rnn.set_params(**rnn_parameters) # CNN model cnn_parameters = { "epoch": args.epochs, "static_mode": args.static_mode, "time_mode": args.time_mode, "verbose": False, } temp_cnn = TemporalCNN(task=args.task) temp_cnn.set_params(**cnn_parameters) # Transformer transformer = TransformerPredictor(task=args.task, epoch=args.epochs, static_mode=args.static_mode, time_mode=args.time_mode) # Attention model attn_parameters = { "model_type": "lstm", "epoch": args.epochs, "static_mode": args.static_mode, "time_mode": args.time_mode, "verbose": False, } attn = Attention(task=args.task) attn.set_params(**attn_parameters) # model_class_list = [general_rnn, attn, temp_cnn, transformer] model_class_list = [general_rnn, attn] # train_validate split dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.1) # Bayesian Optimization Start metric = BOMetric(metric="auc", fold=0, split="test") ens_model_list = [] # Run BO for each model class for m in model_class_list: BO_model = automl.model.AutoTS(dataset_training, m, metric, model_path="tmp/") models, bo_score = BO_model.training_loop(num_iter=args.bo_itr) auto_ens_model = AutoEnsemble(models, bo_score) ens_model_list.append(auto_ens_model) # Load all ensemble models for ens in ens_model_list: for m in ens.models: m.load_model(BO_model.model_path + "/" + m.model_id + ".h5") # Stacking algorithm stacking_ens_model = StackingEnsemble(ens_model_list) stacking_ens_model.fit(dataset_training, fold=0, train_split="val") # Prediction assert not dataset_testing.is_validation_defined test_y_hat = stacking_ens_model.predict(dataset_testing, test_split="test") test_y = dataset_testing.label print("Finish AutoML model training and testing.") #%% Step 7: Visualize Results idx = np.random.permutation(len(test_y_hat))[:2] # Evaluate predictor model result = Metrics(metric_sets, metric_parameters).evaluate(test_y, test_y_hat) print("Finish predictor model evaluation.") # Visualize the output # (1) Performance print("Overall performance") print_performance(result, metric_sets, metric_parameters) # (2) Predictions print("Each prediction") print_prediction(test_y_hat[idx], metric_parameters) return
def main(args):
    '''Main function for individual treatment effect estimation.

    Args:
        - data loading parameters:
            - data_names: mimic, ward, cf, mimic_antibiotics
        - preprocess parameters:
            - normalization: minmax, standard, None
            - one_hot_encoding: input features that need to be one-hot encoded
            - problem: 'online'
                - 'online': prediction at every time stamp of the time-series
            - max_seq_len: maximum sequence length after padding
            - label_name: the column name for the label(s)
            - treatment: the column name for treatments
        - imputation parameters:
            - static_imputation_model: mean, median, mice, missforest, knn, gain
            - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain
        - feature selection parameters:
            - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None
            - feature_number: number of features to select
        - treatment effects model parameters:
            - model_name: CRN, RMSN, GANITE

            Each model has different types of hyperparameters that need to be set.

            - Parameters needed for the Counterfactual Recurrent Network (CRN):
                - hyperparameters for encoder:
                    - rnn_hidden_units: hidden dimensions in the LSTM unit
                    - rnn_keep_prob: keep probability used for variational dropout in the LSTM unit
                    - br_size: size of the balancing representation
                    - fc_hidden_units: hidden dimensions of the fully connected layers used for the treatment classifier and predictor
                    - batch_size: number of samples in mini-batch
                    - num_epochs: number of epochs
                    - learning_rate: learning rate
                    - max_alpha: alpha controls the trade-off between building treatment-invariant representations
                      (domain discrimination) and being able to predict outcomes (outcome prediction); during training,
                      CRN uses an exponentially increasing schedule for alpha from 0 to max_alpha.
                - hyperparameters for decoder:
                    - the decoder requires the same hyperparameters as the encoder, with the exception of
                      rnn_hidden_units, which is set equal to the br_size of the encoder

            - Parameters for Recurrent Marginal Structural Networks (RMSN):
                - hyperparameters for encoder:
                    - dropout_rate: dropout probability used for variational dropout
                    - rnn_hidden_units: hidden dimensions in the LSTM unit
                    - batch_size: number of samples in mini-batch
                    - num_epochs: number of epochs
                    - learning_rate: learning rate
                    - max_norm: max gradient norm used for gradient clipping during training
                - hyperparameters for decoder:
                    - the decoder requires the same hyperparameters as the encoder
                - model_dir: directory where the model is saved
                - model_name: name of the saved model

            - Parameters for GANITE:
                - batch_size: number of samples in mini-batch
                - alpha: parameter trading off the discriminator loss and the supervised loss for the generator training
                - learning_rate: learning rate
                - hidden_units: hidden dimensions of the fully connected layers used in the networks
                - stack_dim: number of timesteps to stack

            All models have the following common parameters:
                - static_mode: how to utilize static features (concatenate or None)
                - time_mode: how to utilize time information (concatenate or None)
                - task: 'classification' or 'regression'

        - metric_name: auc, apr, mae, mse (used for factual prediction)
        - patient_id: patient for which counterfactual trajectories are computed
        - timestep: timestep in the patient trajectory for estimating counterfactuals
    '''
    # %% Step 0: Set basic parameters
    metric_sets = [args.metric_name]
    metric_parameters = {
        'problem': args.problem,
        'label_name': [args.label_name]
    }

    # %% Step 1: Upload Dataset
    # File names
    data_directory = '../datasets/data/' + args.data_name + '/' + args.data_name + '_'

    data_loader_training = CSVLoader(
        static_file=data_directory + 'static_train_data.csv.gz',
        temporal_file=data_directory + 'temporal_train_data_eav.csv.gz')
    data_loader_testing = CSVLoader(
        static_file=data_directory + 'static_test_data.csv.gz',
        temporal_file=data_directory + 'temporal_test_data_eav.csv.gz')

    dataset_training = data_loader_training.load()
    dataset_testing = data_loader_testing.load()
    print('Finish data loading.')

    # %% Step 2: Preprocess Dataset
    # (0) filter out negative values (automatically)
    negative_filter = FilterNegative()
    # (1) one-hot encode categorical features
    onehot_encoder = OneHotEncoder(
        one_hot_encoding_features=[args.one_hot_encoding])
    # (2) Normalize features: 3 options (minmax, standard, none)
    normalizer = Normalizer(args.normalization)

    filter_pipeline = PipelineComposer(negative_filter, onehot_encoder, normalizer)

    dataset_training = filter_pipeline.fit_transform(dataset_training)
    dataset_testing = filter_pipeline.transform(dataset_testing)
    print('Finish preprocessing.')

    # %% Step 3: Define Problem
    problem_maker = ProblemMaker(problem=args.problem,
                                 label=[args.label_name],
                                 max_seq_len=args.max_seq_len,
                                 treatment=[args.treatment])

    dataset_training = problem_maker.fit_transform(dataset_training)
    dataset_testing = problem_maker.fit_transform(dataset_testing)
    print('Finish defining problem.')

    # %% Step 4: Impute Dataset
    static_imputation = Imputation(
        imputation_model_name=args.static_imputation_model, data_type='static')
    temporal_imputation = Imputation(
        imputation_model_name=args.temporal_imputation_model, data_type='temporal')

    imputation_pipeline = PipelineComposer(static_imputation, temporal_imputation)

    dataset_training = imputation_pipeline.fit_transform(dataset_training)
    dataset_testing = imputation_pipeline.transform(dataset_testing)
    print('Finish imputation.')

    # %% Step 5: Feature selection (4 options)
    static_feature_selection = FeatureSelection(
        feature_selection_model_name=args.static_feature_selection_model,
        feature_type='static',
        feature_number=args.static_feature_selection_number,
        task=args.task,
        metric_name=args.metric_name,
        metric_parameters=metric_parameters)
    temporal_feature_selection = FeatureSelection(
        feature_selection_model_name=args.temporal_feature_selection_model,
        feature_type='temporal',
        feature_number=args.temporal_feature_selection_number,
        task=args.task,
        metric_name=args.metric_name,
        metric_parameters=metric_parameters)

    feature_selection_pipeline = PipelineComposer(static_feature_selection,
                                                  temporal_feature_selection)

    dataset_training = feature_selection_pipeline.fit_transform(dataset_training)
    dataset_testing = feature_selection_pipeline.transform(dataset_testing)
    print('Finish feature selection.')

    # %% Step 6: Fit treatment effects model (3 options)
    # Set the validation data for best model saving
    dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.0)

    # Set the treatment effects model
    model_name = args.model_name

    # Set treatment effects model parameters
    if model_name == 'CRN':
        model_parameters = {
            'encoder_rnn_hidden_units': args.crn_encoder_rnn_hidden_units,
            'encoder_br_size': args.crn_encoder_br_size,
            'encoder_fc_hidden_units': args.crn_encoder_fc_hidden_units,
            'encoder_learning_rate': args.crn_encoder_learning_rate,
            'encoder_batch_size': args.crn_encoder_batch_size,
            'encoder_keep_prob': args.crn_encoder_keep_prob,
            'encoder_num_epochs': args.crn_encoder_num_epochs,
            'encoder_max_alpha': args.crn_encoder_max_alpha,
            'decoder_br_size': args.crn_decoder_br_size,
            'decoder_fc_hidden_units': args.crn_decoder_fc_hidden_units,
            'decoder_learning_rate': args.crn_decoder_learning_rate,
            'decoder_batch_size': args.crn_decoder_batch_size,
            'decoder_keep_prob': args.crn_decoder_keep_prob,
            'decoder_num_epochs': args.crn_decoder_num_epochs,
            'decoder_max_alpha': args.crn_decoder_max_alpha,
            'projection_horizon': args.projection_horizon,
            'static_mode': args.static_mode,
            'time_mode': args.time_mode
        }
        treatment_model = treatment_effects_model(model_name, model_parameters,
                                                  task='classification')
        treatment_model.fit(dataset_training)

    elif model_name == 'RMSN':
        hyperparams_encoder_iptw = {
            'dropout_rate': args.rmsn_encoder_dropout_rate,
            'memory_multiplier': args.rmsn_encoder_memory_multiplier,
            'num_epochs': args.rmsn_encoder_num_epochs,
            'batch_size': args.rmsn_encoder_batch_size,
            'learning_rate': args.rmsn_encoder_learning_rate,
            'max_norm': args.rmsn_encoder_max_norm
        }
        hyperparams_decoder_iptw = {
            'dropout_rate': args.rmsn_decoder_dropout_rate,
            'memory_multiplier': args.rmsn_decoder_memory_multiplier,
            'num_epochs': args.rmsn_decoder_num_epochs,
            'batch_size': args.rmsn_decoder_batch_size,
            'learning_rate': args.rmsn_decoder_learning_rate,
            'max_norm': args.rmsn_decoder_max_norm
        }
        model_parameters = {
            'hyperparams_encoder_iptw': hyperparams_encoder_iptw,
            'hyperparams_decoder_iptw': hyperparams_decoder_iptw,
            'model_dir': args.rmsn_model_dir,
            'model_name': args.rmsn_model_name,
            'static_mode': args.static_mode,
            'time_mode': args.time_mode
        }
        treatment_model = treatment_effects_model(model_name, model_parameters,
                                                  task='classification')
        treatment_model.fit(dataset_training,
                            projection_horizon=args.projection_horizon)

    elif model_name == 'GANITE':
        hyperparams = {
            'batch_size': args.ganite_batch_size,
            'alpha': args.ganite_alpha,
            'hidden_dims': args.ganite_hidden_dims,
            'learning_rate': args.ganite_learning_rate
        }
        model_parameters = {
            'hyperparams': hyperparams,
            'stack_dim': args.ganite_stack_dim,
            'static_mode': args.static_mode,
            'time_mode': args.time_mode
        }
        treatment_model = treatment_effects_model(model_name, model_parameters,
                                                  task='classification')
        treatment_model.fit(dataset_training)

    test_y_hat = treatment_model.predict(dataset_testing)
    print('Finish treatment effects model training and testing.')

    # %% Step 9: Visualize Results
    # Evaluate predictor model
    result = Metrics(metric_sets, metric_parameters).evaluate(dataset_testing.label, test_y_hat)
    print('Finish predictor model evaluation.')

    # Visualize the output
    # (1) Performance on estimating factual outcomes
    print('Overall performance on estimating factual outcomes')
    print_performance(result, metric_sets, metric_parameters)

    # (2) Counterfactual trajectories
    print('Counterfactual trajectories')
    if model_name in ['CRN', 'RMSN']:
        # Predict and visualize counterfactuals for the sequence of treatments indicated by the user
        # through treatment_options. The length of each treatment sequence needs to be projection_horizon + 1.
        treatment_options = np.array([[[1], [1], [1], [1], [1], [0]],
                                      [[0], [0], [0], [0], [1], [1]]])
        history, counterfactual_traj = treatment_model.predict_counterfactual_trajectories(
            dataset=dataset_testing,
            patient_id=args.patient_id,
            timestep=args.timestep,
            treatment_options=treatment_options)

        print_counterfactual_predictions(
            patient_history=history,
            treatment_options=treatment_options,
            counterfactual_predictions=counterfactual_traj)

    return
def assign_category(self, df):
    fs = FeatureSelection(df)
    self.set('fs', fs)
    category_list, unique_values, default_list, frequent_values2frequency = fs.assign_category(
        self.get('config_file'), df)
    return category_list, unique_values, default_list, frequent_values2frequency
from utils import feature_util
import pytest
from feature_selection import FeatureSelection
import pandas as pd
from config.config_writer import ConfigWriter
from config import config_reader
from utils import preprocessing

file = 'data_test/iris.csv'
config_test_file = 'data_test/iris_config_test.ini'
df = pd.read_csv('data_test/iris.csv')
fs = FeatureSelection(df)
df_range = pd.read_csv('data_test/dataset.csv')
fs_range = FeatureSelection(df_range)

categories = ['numerical', 'numerical', 'numerical', 'numerical', 'categorical']
unique_values = [-1, -1, -1, -1, 3]
default_list = {'sepal_length': 5.8, 'sepal_width': 3.0, 'petal_length': 4.35,
                'petal_width': 1.3, 'class': 'Iris-setosa'}
frequent_values2frequency = {'sepal_length': (5.0, 10), 'sepal_width': (3.0, 26),
                             'petal_length': (1.5, 14), 'petal_width': (0.2, 28),
                             'class': ('Iris-setosa', 50)}
SAMPLE_DATA_SIZE = 5
data = preprocessing.insert_data(df, categories, unique_values, default_list,
                                 frequent_values2frequency, SAMPLE_DATA_SIZE)
data.Category = categories


def test_already_order_reorder_request():
    features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
    categories = ['numerical', 'numerical', 'numerical', 'numerical', 'categorical']
def main(args):
    '''Main function for AutoML in time-series predictions.

    Args:
        - data loading parameters:
            - data_names: mimic, ward, cf
        - preprocess parameters:
            - normalization: minmax, standard, None
            - one_hot_encoding: input features that need to be one-hot encoded
            - problem: 'one-shot' or 'online'
                - 'one-shot': one-time prediction at the end of the time-series
                - 'online': prediction at every time stamp of the time-series
            - max_seq_len: maximum sequence length after padding
            - label_name: the column name for the label(s)
            - treatment: the column name for treatments
        - imputation parameters:
            - static_imputation_model: mean, median, mice, missforest, knn, gain
            - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain
        - feature selection parameters:
            - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None
            - feature_number: number of features to select
        - predictor_parameters:
            - epochs: number of epochs
            - bo_itr: Bayesian optimization iterations
            - static_mode: how to utilize static features (concatenate or None)
            - time_mode: how to utilize time information (concatenate or None)
            - task: classification or regression
        - metric_name: auc, apr, mae, mse
    '''
    #%% Step 0: Set basic parameters
    metric_sets = [args.metric_name]
    metric_parameters = {
        'problem': args.problem,
        'label_name': [args.label_name]
    }

    #%% Step 1: Upload Dataset
    # File names
    data_directory = '../datasets/data/' + args.data_name + '/' + args.data_name + '_'

    data_loader_training = CSVLoader(
        static_file=data_directory + 'static_train_data.csv.gz',
        temporal_file=data_directory + 'temporal_train_data_eav.csv.gz')
    data_loader_testing = CSVLoader(
        static_file=data_directory + 'static_test_data.csv.gz',
        temporal_file=data_directory + 'temporal_test_data_eav.csv.gz')

    dataset_training = data_loader_training.load()
    dataset_testing = data_loader_testing.load()
    print('Finish data loading.')

    #%% Step 2: Preprocess Dataset
    # (0) filter out negative values (automatically)
    negative_filter = FilterNegative()
    # (1) one-hot encode categorical features
    onehot_encoder = OneHotEncoder(
        one_hot_encoding_features=[args.one_hot_encoding])
    # (2) Normalize features: 3 options (minmax, standard, none)
    normalizer = Normalizer(args.normalization)

    filter_pipeline = PipelineComposer(negative_filter, onehot_encoder, normalizer)

    dataset_training = filter_pipeline.fit_transform(dataset_training)
    dataset_testing = filter_pipeline.transform(dataset_testing)
    print('Finish preprocessing.')

    #%% Step 3: Define Problem
    problem_maker = ProblemMaker(problem=args.problem,
                                 label=[args.label_name],
                                 max_seq_len=args.max_seq_len,
                                 treatment=[args.treatment])

    dataset_training = problem_maker.fit_transform(dataset_training)
    dataset_testing = problem_maker.fit_transform(dataset_testing)
    print('Finish defining problem.')

    #%% Step 4: Impute Dataset
    static_imputation = Imputation(
        imputation_model_name=args.static_imputation_model, data_type='static')
    temporal_imputation = Imputation(
        imputation_model_name=args.temporal_imputation_model, data_type='temporal')

    imputation_pipeline = PipelineComposer(static_imputation, temporal_imputation)

    dataset_training = imputation_pipeline.fit_transform(dataset_training)
    dataset_testing = imputation_pipeline.transform(dataset_testing)
    print('Finish imputation.')

    #%% Step 5: Feature selection (4 options)
    static_feature_selection = FeatureSelection(
        feature_selection_model_name=args.static_feature_selection_model,
        feature_type='static',
        feature_number=args.static_feature_selection_number,
        task=args.task,
        metric_name=args.metric_name,
        metric_parameters=metric_parameters)
    temporal_feature_selection = FeatureSelection(
        feature_selection_model_name=args.temporal_feature_selection_model,
        feature_type='temporal',
        feature_number=args.temporal_feature_selection_number,
        task=args.task,
        metric_name=args.metric_name,
        metric_parameters=metric_parameters)

    feature_selection_pipeline = PipelineComposer(static_feature_selection,
                                                  temporal_feature_selection)

    dataset_training = feature_selection_pipeline.fit_transform(dataset_training)
    dataset_testing = feature_selection_pipeline.transform(dataset_testing)
    print('Finish feature selection.')

    #%% Step 6: Bayesian Optimization
    ## Model define
    model_parameters = {
        'projection_horizon': 5,
        'static_mode': 'concatenate',
        'time_mode': 'concatenate'
    }
    crn_model = CRN_Model(task=args.task)
    crn_model.set_params(**model_parameters)
    model_class = crn_model

    # train/validation split
    dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.2)

    # Bayesian optimization start
    metric = BOMetric(metric='auc', fold=0, split='test')

    # Run BO for the selected model class
    BO_model = AutoTS(dataset_training, model_class, metric)
    models, bo_score = BO_model.training_loop(num_iter=2)
    auto_ens_model = AutoEnsemble(models, bo_score)

    # Prediction
    assert not dataset_testing.is_validation_defined
    test_y_hat = auto_ens_model.predict(dataset_testing, test_split='test')
    test_y = dataset_testing.label
    print('Finish AutoML model training and testing.')

    #%% Step 7: Visualize Results
    idx = np.random.permutation(len(test_y_hat))[:2]

    # Evaluate predictor model
    result = Metrics(metric_sets, metric_parameters).evaluate(test_y, test_y_hat)
    print('Finish predictor model evaluation.')

    # Visualize the output
    # (1) Performance
    print('Overall performance')
    print_performance(result, metric_sets, metric_parameters)
    # (2) Predictions
    print('Each prediction')
    print_prediction(test_y_hat[idx], metric_parameters)

    return
def test_log(log_info):
    logging.info("testing foo")
    logAssert(log_info == 'foo', "foo is not foo")


t = ThreadHandler()
username = '******'
config_file = 'data_test/iris_config.ini'
port = '5000'
target = 'class'
file = 'data_test/iris.csv'
config_test_file = 'data_test/iris_config_test.ini'
df = pd.read_csv('data_test/iris.csv')
fs = FeatureSelection(df)
categories = [
    'numerical', 'numerical', 'numerical', 'numerical', 'categorical'
]
unique_values = [-1, -1, -1, -1, 3]
default_list = {
    'sepal_length': 5.8,
    'sepal_width': 3.0,
    'petal_length': 4.35,
    'petal_width': 1.3,
    'class': 'Iris-setosa'
}
frequent_values2frequency = {
    'sepal_length': (5.0, 10),
    'sepal_width': (3.0, 26),
    'petal_length': (1.5, 14),
    'petal_width': (0.2, 28),
    'class': ('Iris-setosa', 50)
}
def main(args): """Main function for time-series prediction. Args: - data loading parameters: - data_names: mimic, ward, cf - preprocess parameters: - normalization: minmax, standard, None - one_hot_encoding: input features that need to be one-hot encoded - problem: 'one-shot' or 'online' - 'one-shot': one time prediction at the end of the time-series - 'online': prediction at every time stamps of the time-series - max_seq_len: maximum sequence length after padding - label_name: the column name for the label(s) - treatment: the column name for treatments - imputation parameters: - static_imputation_model: mean, median, mice, missforest, knn, gain - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain - feature selection parameters: - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None - feature_number: selected feature number - predictor_parameters: - model_name: rnn, gru, lstm, attention, tcn, transformer - model_parameters: network parameters such as number of layers - h_dim: hidden dimensions - n_layer: layer number - n_head: head number (only for transformer model) - batch_size: number of samples in mini-batch - epochs: number of epochs - learning_rate: learning rate - static_mode: how to utilize static features (concatenate or None) - time_mode: how to utilize time information (concatenate or None) - task: classification or regression - uncertainty_model_name: uncertainty estimation model name (ensemble) - interpretation_model_name: interpretation model name (tinvase) - metric_name: auc, apr, mae, mse """ #%% Step 0: Set basic parameters metric_sets = [args.metric_name] metric_parameters = { "problem": args.problem, "label_name": [args.label_name] } #%% Step 1: Upload Dataset # File names data_directory = "../datasets/data/" + args.data_name + "/" + args.data_name + "_" data_loader_training = CSVLoader( static_file=data_directory + "static_train_data.csv.gz", temporal_file=data_directory + "temporal_train_data_eav.csv.gz", ) data_loader_testing = CSVLoader( static_file=data_directory + "static_test_data.csv.gz", temporal_file=data_directory + "temporal_test_data_eav.csv.gz", ) dataset_training = data_loader_training.load() dataset_testing = data_loader_testing.load() print("Finish data loading.") #%% Step 2: Preprocess Dataset # (0) filter out negative values (Automatically) negative_filter = FilterNegative() # (1) one-hot encode categorical features onehot_encoder = OneHotEncoder( one_hot_encoding_features=[args.one_hot_encoding]) # (2) Normalize features: 3 options (minmax, standard, none) normalizer = Normalizer(args.normalization) filter_pipeline = PipelineComposer(negative_filter, onehot_encoder, normalizer) dataset_training = filter_pipeline.fit_transform(dataset_training) dataset_testing = filter_pipeline.transform(dataset_testing) print("Finish preprocessing.") #%% Step 3: Define Problem problem_maker = ProblemMaker(problem=args.problem, label=[args.label_name], max_seq_len=args.max_seq_len, treatment=args.treatment) dataset_training = problem_maker.fit_transform(dataset_training) dataset_testing = problem_maker.fit_transform(dataset_testing) print("Finish defining problem.") #%% Step 4: Impute Dataset static_imputation = Imputation( imputation_model_name=args.static_imputation_model, data_type="static") temporal_imputation = Imputation( imputation_model_name=args.temporal_imputation_model, data_type="temporal") imputation_pipeline = PipelineComposer(static_imputation, temporal_imputation) 
dataset_training = imputation_pipeline.fit_transform(dataset_training) dataset_testing = imputation_pipeline.transform(dataset_testing) print("Finish imputation.") #%% Step 5: Feature selection (4 options) static_feature_selection = FeatureSelection( feature_selection_model_name=args.static_feature_selection_model, feature_type="static", feature_number=args.static_feature_selection_number, task=args.task, metric_name=args.metric_name, metric_parameters=metric_parameters, ) temporal_feature_selection = FeatureSelection( feature_selection_model_name=args.temporal_feature_selection_model, feature_type="temporal", feature_number=args.temporal_feature_selection_number, task=args.task, metric_name=args.metric_name, metric_parameters=metric_parameters, ) feature_selection_pipeline = PipelineComposer(static_feature_selection, temporal_feature_selection) dataset_training = feature_selection_pipeline.fit_transform( dataset_training) dataset_testing = feature_selection_pipeline.transform(dataset_testing) print("Finish feature selection.") #%% Step 6: Fit and Predict (6 options) # Set predictor model parameters model_parameters = { "h_dim": args.h_dim, "n_layer": args.n_layer, "n_head": args.n_head, "batch_size": args.batch_size, "epoch": args.epochs, "model_type": args.model_name, "learning_rate": args.learning_rate, "static_mode": args.static_mode, "time_mode": args.time_mode, "verbose": True, } # Set the validation data for best model saving dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.0) pred_class = prediction(args.model_name, model_parameters, args.task) pred_class.fit(dataset_training) test_y_hat = pred_class.predict(dataset_testing) print("Finish predictor model training and testing.") #%% Step 7: Estimate Uncertainty (1 option) uncertainty_model = uncertainty(args.uncertainty_model_name, model_parameters, pred_class, args.task) uncertainty_model.fit(dataset_training) test_ci_hat = uncertainty_model.predict(dataset_testing) print("Finish uncertainty estimation") #%% Step 8: Interpret Predictions (1 option) interpretor = interpretation(args.interpretation_model_name, model_parameters, pred_class, args.task) interpretor.fit(dataset_training) test_s_hat = interpretor.predict(dataset_testing) print("Finish model interpretation") #%% Step 9: Visualize Results idx = np.random.permutation(len(test_y_hat))[:2] # Evaluate predictor model result = Metrics(metric_sets, metric_parameters).evaluate(dataset_testing.label, test_y_hat) print("Finish predictor model evaluation.") # Visualize the output # (1) Performance print("Overall performance") print_performance(result, metric_sets, metric_parameters) # (2) Predictions print("Each prediction") print_prediction(test_y_hat[idx], metric_parameters) # (3) Uncertainty print("Uncertainty estimations") print_uncertainty(test_y_hat[idx], test_ci_hat[idx], metric_parameters) # (4) Model interpretation print("Model interpretation") print_interpretation(test_s_hat[idx], dataset_training.feature_name, metric_parameters, model_parameters) return
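# Hedged invocation sketch (illustrative, not from the source): builds an `args`
# namespace carrying the attributes main() reads above. Every value below is a
# placeholder assumption; the real script presumably defines its own argparse parser.
from argparse import Namespace

if __name__ == "__main__":
    args = Namespace(
        data_name="mimic", normalization="minmax", one_hot_encoding="admission_type",
        problem="one-shot", max_seq_len=24, label_name="death", treatment=None,
        static_imputation_model="median", temporal_imputation_model="median",
        static_feature_selection_model=None, static_feature_selection_number=10,
        temporal_feature_selection_model=None, temporal_feature_selection_number=10,
        model_name="gru", h_dim=100, n_layer=2, n_head=2, batch_size=128, epochs=10,
        learning_rate=0.001, static_mode="Concatenate", time_mode="Concatenate",
        task="classification", uncertainty_model_name="ensemble",
        interpretation_model_name="tinvase", metric_name="auc",
    )
    main(args)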