def model_predict_and_evaluate(self, dataset):
    """Train model with a subset of the features and evaluate the performance.

    Args:
      - dataset: dataset with subset of temporal features

    Returns:
      - performance: performance with subset of temporal features
    """
    # Build model
    pred_class = prediction(self.model_parameters['model_type'],
                            self.model_parameters, self.task)
    # Train the model
    pred_class.fit(dataset)
    # Test the model
    test_y_hat = pred_class.predict(dataset)
    # Extract the labels
    _, _, test_y, _, _ = dataset.get_fold(fold=0, split='test')
    # Evaluate the performance
    temp_performance = Metrics([self.metric_name],
                               self.metric_parameters).evaluate(test_y, test_y_hat)
    performance = np.mean(list(temp_performance.values())[0])

    return performance
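# The last two lines above collapse the metric dictionary returned by
# Metrics.evaluate into a single scalar. A minimal sketch of that reduction,
# assuming (as the snippet suggests) the dictionary maps each metric name to a
# list of per-fold or per-sample scores; the values here are hypothetical.
import numpy as np

temp_performance = {'auc': [0.81, 0.79, 0.84]}   # hypothetical evaluate() output
performance = np.mean(list(temp_performance.values())[0])
print(performance)  # -> 0.813...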
def expected_consistency_selection_ensemble(labels, class_num, target, mlset, nlset,
                                            cons_type='must',
                                            ensemble_methods=_default_ensemble_methods,
                                            ease_factor=1):
    selected_labels = _expected_consistency_selection(labels, mlset, nlset,
                                                      cons_type=cons_type,
                                                      ease_factor=ease_factor)
    retVals = []
    retVals.append(ease_factor)
    retVals.append(selected_labels.shape[0])
    print('[INFO] Selected Solutions:' + str(selected_labels.shape[0]))
    for method in ensemble_methods:
        ensemble_labels = _ensemble_method[method](selected_labels,
                                                   N_clusters_max=class_num)
        ensemble_nmi = Metrics.normalized_max_mutual_info_score(ensemble_labels, target)
        retVals.append(ensemble_nmi)
        print('[INFO] Ensemble Method:' + method)
        print('[INFO] Performance:' + str(ensemble_nmi))
    return retVals
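# A hedged usage sketch for the selection ensemble above. It assumes the
# repository layout seen elsewhere in these snippets (.res libraries loaded
# with np.loadtxt, constraints read with io_func.read_constraints, and the
# module-level _default_ensemble_methods); the file names are illustrative only.
import numpy as np
import utils.io_func as io_func

labels = np.loadtxt('Results/Iris_library_pure.res', delimiter=',').astype(int)
target = np.loadtxt('Results/Iris_target.txt', delimiter=',')          # hypothetical
mlset, nlset = io_func.read_constraints('Constraints/Iris_constraints.txt')

# Keep solutions whose must-link consistency reaches the per-k average,
# then run every default consensus function on the selected subset.
ret = expected_consistency_selection_ensemble(labels, class_num=3, target=target,
                                              mlset=mlset, nlset=nlset,
                                              cons_type='must', ease_factor=1)
print(ret)  # [ease_factor, #selected, NMI of method 1, NMI of method 2, ...]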
def comparison_ensemble_methods(dataset_name, library_name, eval_method=None):
    """
    get the performance of comparison methods (ensemble)

    :param dataset_name:
    :param library_name:
    :param eval_method:
    :return:
    """
    filename = _default_eval_path + dataset_name + '_ensemble_eval_' + \
        time.strftime('%Y-%m-%d_%H_%M_%S', time.localtime(time.time())) + '.csv'
    lib = np.loadtxt(_default_library_path + dataset_name + '/' + library_name + '.res',
                     delimiter=',')
    with open(filename, 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(library_name)
        data, targets = exd.dataset[dataset_name]['data']()
        k = exd.dataset[dataset_name]['k']
        eval_methods = _default_ensemble_eval_methods if eval_method is None else eval_method
        print '[Ensemble Comparison]: Dataset: ' + str(dataset_name)
        print '[Ensemble Comparison]: Library: ' + str(library_name)
        print '[Ensemble Comparison]: Comparison Methods: ' + str(eval_methods)
        print '[Ensemble Comparison]: Real k is ' + str(k)
        for method in eval_methods:
            ensemble_label = _ensemble_methods[method](lib, N_clusters_max=k)
            performance = metrics.normalized_max_mutual_info_score(targets, ensemble_label)
            writer.writerow([method, str(performance)])
    return
def plot_consistency(labels, pos, mlset, nlset, savepath, consistency_type='both'):
    """
    plot consistency distribution of given library

    Parameters
    ----------
    :param labels:
    :param pos:
    :param mlset:
    :param nlset:
    :param savepath:
    :param consistency_type:
    """
    texts = []
    colors = []
    plot_labels = [None] * (len(labels) - _ADDITIONAL_RANGE)
    markers = ['o'] * (len(labels) - _ADDITIONAL_RANGE)
    for label in labels[0:-_ADDITIONAL_RANGE]:
        cons = Metrics.consistency(label, mlset, nlset, cons_type=consistency_type)
        texts.append(cons)
    cNorm = colors2.Normalize(vmin=min(texts), vmax=max(texts))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=plt.get_cmap('CMRmap'))
    plot_labels.extend(_ADDITIONAL_NAMES)
    for text in texts:
        colors.append(scalarMap.to_rgba(text))
    texts = map(_round_digits, texts)
    texts.append('')
    texts.extend(_ADDITIONAL_NAMES[1:])
    colors.extend(_ADDITIONAL_COLORS)
    markers.extend(_ADDITIONAL_MARKERS)
    title = consistency_type + ' Consistency, ' + 'Max val = ' + \
        str(max(texts[0:-_ADDITIONAL_RANGE])) + ', Min val = ' + \
        str(min(texts[0:-_ADDITIONAL_RANGE]))
    _plot_generalized_scatter(pos, colors, texts, markers, plot_labels, savepath, title=title)
    return
def plot_nmi_max(labels, pos, savepath):
    """
    plot nmi_max distribution of given library

    Parameters
    ----------
    :param labels:
    :param pos:
    :param savepath:
    """
    texts = []
    colors = []
    plot_labels = [None] * (len(labels) - _ADDITIONAL_RANGE)
    markers = ['o'] * (len(labels) - _ADDITIONAL_RANGE)
    for label in labels[0:-1]:
        cons = Metrics.normalized_max_mutual_info_score(label, labels[-1])
        texts.append(cons)
    cNorm = colors2.Normalize(vmin=min(texts[0:-_ADDITIONAL_RANGE + 1]),
                              vmax=max(texts[0:-_ADDITIONAL_RANGE + 1]))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=plt.get_cmap('CMRmap'))
    plot_labels.extend(_ADDITIONAL_NAMES)
    for text in texts[0:-_ADDITIONAL_RANGE + 1]:
        colors.append(scalarMap.to_rgba(text))
    texts = map(_round_digits, texts)
    texts.extend(_ADDITIONAL_NAMES[-1:])
    colors.extend(_ADDITIONAL_COLORS)
    markers.extend(_ADDITIONAL_MARKERS)
    title = 'NMI distribution, ' + 'Max val = ' + str(max(texts[0:-_ADDITIONAL_RANGE])) + \
        ', Min val = ' + str(min(texts[0:-_ADDITIONAL_RANGE]))
    _plot_generalized_scatter(pos, colors, texts, markers, plot_labels, savepath, title=title)
    return
def evaluate_library(name, path, class_num, target,
                     evaluate_methods=_default_evaluate_methods):
    """
    do evaluation for a given library

    :param name: name of the library
    :param path: path where the library is
    :param class_num: #real_classes
    :param target: real class label
    :param evaluate_methods: consensus functions used for evaluation

    Return
    ------
    :return: score of all consensus functions in a list
    """
    labels = np.loadtxt(path + name, delimiter=',')
    if not name.endswith('_pure.res'):
        labels = labels[0:-5]
    scores = []
    for method in evaluate_methods:
        ensemble_label = _ensemble_method[method](labels, N_clusters_max=class_num)
        scores.append(Metrics.normalized_max_mutual_info_score(target, ensemble_label))
    return scores
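# A hedged usage sketch for evaluate_library; the library name, path, and
# target file are illustrative, and _default_evaluate_methods is assumed to be
# defined at module level as in the signature above.
import numpy as np

target = np.loadtxt('Results/Iris_target.txt', delimiter=',')  # hypothetical ground truth
scores = evaluate_library('Iris_library_pure.res', 'Results/', class_num=3, target=target)
print(scores)  # one NMI value per consensus function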
def do_propagation_ensemble(library_folder, library_name, class_num, target,
                            constraint_file, logger, alphas, have_zero=True,
                            ensemble_method=_default_ensemble_method):
    logger.debug('===========================================================================================')
    logger.debug('-----------------Propagation Ensemble for library:' + str(library_name) + '----------------')
    logger.debug('-----------------Have zero type = ' + str(have_zero) + '-----------------------------------')
    logger.debug('-----------------Constraint File name = ' + constraint_file + '----------------------------')
    labels = np.loadtxt(library_folder + library_name + '.res', delimiter=',')
    labels = labels.astype(int)
    ml, cl = io_func.read_constraints(constraint_file)

    # Build the co-association matrix from the hypergraph adjacency of the library.
    hyperedges = ce.build_hypergraph_adjacency(labels)
    hyperedges = hyperedges.transpose()
    coas_matrix = hyperedges.dot(hyperedges.transpose())
    coas_matrix = np.squeeze(np.asarray(coas_matrix.todense()))
    coas_matrix = coas_matrix.astype(np.float32)
    coas_matrix /= np.max(coas_matrix)
    print coas_matrix

    nmis = []
    for alpha in alphas:
        logger.debug('-------------------------->>>>>> PARAM START <<<<<<<---------------------------------')
        propagated_coas_matrix = propagation_on_coassociation_matrix(coas_matrix, ml, cl, alpha)
        cur_nmis = []
        for method in ensemble_method:
            ensemble_label = _ensemble_method[method](propagated_coas_matrix,
                                                      labels.shape[0], class_num)
            ensemble_nmi = Metrics.normalized_max_mutual_info_score(ensemble_label, target)
            logger.debug(method + ' alpha=' + str(alpha) + ', NMI=' + str(ensemble_nmi))
            cur_nmis.append(ensemble_nmi)
        nmis.append(cur_nmis)
        logger.debug('------------------------->>>>>> END OF THIS PARAM <<<<<<------------------------------')
    logger.debug('===========================================================================================')
    return nmis
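# The hypergraph step above is effectively a one-hot encoding of every cluster
# in every clustering; the co-association matrix then counts, for each pair of
# samples, how many clusterings placed them in the same cluster. A minimal,
# self-contained sketch of that computation in plain NumPy (no dependency on
# the ce module used above):
import numpy as np

def coassociation_matrix(labels):
    """labels: (n_clusterings, n_samples) integer label matrix."""
    n_clusterings, n_samples = labels.shape
    coas = np.zeros((n_samples, n_samples), dtype=np.float32)
    for row in labels:
        # One column ("hyperedge") per cluster in this clustering.
        onehot = (row[:, None] == np.unique(row)[None, :]).astype(np.float32)
        coas += onehot.dot(onehot.T)   # +1 where two samples share a cluster
    return coas / np.max(coas)         # normalized as in the function above

example = np.array([[0, 0, 1, 1],
                    [0, 1, 1, 1]])
print(coassociation_matrix(example))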
def evalparser(path='./examples', report=False):
    """ Test the parsing performance

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score
    """
    from os import listdir
    from os.path import join as joinpath
    # ----------------------------------------
    # Load the parsing model
    pm = ParsingModel()
    pm.loadmodel("parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span', 'nuclearity', 'relation'])
    # ----------------------------------------
    # Read all files from the given path
    doclist = [joinpath(path, fname) for fname in listdir(path) if fname.endswith('.edus')]
    for fedus in doclist:
        # ----------------------------------------
        # Parsing
        fpos = fedus + ".pos"
        d_pos = get_d_pos(fpos)
        fdep = fedus + ".dep"
        d_dep = get_d_dep(fdep)
        pred_rst = parse(pm, fedus=fedus, d_pos=d_pos, d_dep=d_dep)
        # Get brackets from parsing results
        pred_brackets = pred_rst.bracketing()
        fbrackets = fedus.replace('edus', 'brackets')
        writebrackets(fbrackets, pred_brackets)
        # ----------------------------------------
        # Evaluate with gold RST tree
        if report:
            fdis = fedus.replace('edus', 'dis')
            gold_rst = RSTTree(fname=fdis)
            gold_rst.build()
            gold_brackets = gold_rst.bracketing()
            met.eval(gold_rst, pred_rst)
    if report:
        met.report()
def evalparser(path='./examples', report=False, bcvocab=None, draw=True,
               withdp=False, fdpvocab=None, fprojmat=None):
    """ Test the parsing performance

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score
    """
    # ----------------------------------------
    # Load the parsing model
    print('Load parsing model ...')
    pm = ParsingModel(withdp=withdp, fdpvocab=fdpvocab, fprojmat=fprojmat)
    pm.loadmodel("model/parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span', 'nuclearity', 'relation'])
    # ----------------------------------------
    # Read all files from the given path
    existing_files = [".".join(fname.split(".")[:-1]) for fname in listdir(path)
                      if fname.endswith('.brackets')]
    all_files = [".".join(fname.split(".")[:-1]) for fname in listdir(path)
                 if fname.endswith('.merge')]
    todo_files = list(set(all_files) - set(existing_files))
    doclist = [joinpath(path, fname + '.merge') for fname in todo_files]
    print("TODO files len:")
    print(len(doclist))
    print(doclist[0])

    # Share the model and vocabulary with the worker processes via globals.
    global global_pm
    global_pm = pm
    global global_bv
    global_bv = bcvocab

    eval_parser_unit(doclist[0])
    cnt = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=cnt)
    pool.map(eval_parser_unit, doclist)
    pool.close()
    pool.join()
def plot_normalized_consistency(labels, mlset, nlset, savepath, additional_values):
    """
    plot correlations between must and cannot consistency of given library

    Parameters
    ----------
    :param labels:
    :param mlset:
    :param nlset:
    :param savepath:
    :param additional_values:
    """
    texts = additional_values
    colors = []
    plot_labels = [None] * (len(labels) - _ADDITIONAL_RANGE)
    markers = ['o'] * (len(labels) - _ADDITIONAL_RANGE)
    cNorm = colors2.Normalize(vmin=min(texts), vmax=max(texts))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=plt.get_cmap('CMRmap'))
    plot_labels.extend(_ADDITIONAL_NAMES)
    for text in texts:
        colors.append(scalarMap.to_rgba(text))
    title = 'Must-Cannot Correlation'
    must_consistencies = []
    cannot_consistencies = []
    for label in labels[0:-5]:
        must_cons = Metrics.consistency(label, mlset, nlset, cons_type='must')
        cannot_cons = Metrics.consistency(label, mlset, nlset, cons_type='cannot')
        must_consistencies.append(must_cons)
        cannot_consistencies.append(cannot_cons)
    scaler = preprocessing.MinMaxScaler()
    must_consistencies = scaler.fit_transform(np.array(must_consistencies).reshape(-1, 1))
    cannot_consistencies = scaler.fit_transform(np.array(cannot_consistencies).reshape(-1, 1))
    pos = np.hstack((np.array(must_consistencies), np.array(cannot_consistencies)))
    _plot_generalized_scatter(pos, colors, texts, markers, plot_labels, savepath,
                              title=title, xlabel='Must consistency',
                              ylabel='Cannot consistency', legend_need=False)
    return
def evalparser(path='./examples', report=False, bcvocab=None, draw=True,
               withdp=False, fdpvocab=None, fprojmat=None):
    """ Test the parsing performance

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score
    """
    # ----------------------------------------
    # Load the parsing model
    print 'Load parsing model ...'
    pm = ParsingModel(withdp=withdp, fdpvocab=fdpvocab, fprojmat=fprojmat)
    pm.loadmodel("model/parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span', 'nuclearity', 'relation'])
    # ----------------------------------------
    # Read all files from the given path
    doclist = [joinpath(path, fname) for fname in listdir(path) if fname.endswith('.merge')]
    for fmerge in doclist:
        # ----------------------------------------
        # Read *.merge file
        dr = DocReader()
        doc = dr.read(fmerge)
        # ----------------------------------------
        # Parsing
        pred_rst = pm.sr_parse(doc, bcvocab)
        if draw:
            strtree = pred_rst.parse()
            drawrst(strtree, fmerge.replace(".merge", ".ps"))
        # Get brackets from parsing results
        pred_brackets = pred_rst.bracketing()
        fbrackets = fmerge.replace('.merge', '.brackets')
        # Write brackets into file
        writebrackets(fbrackets, pred_brackets)
        # ----------------------------------------
        # Evaluate with gold RST tree
        if report:
            fdis = fmerge.replace('.merge', '.dis')
            gold_rst = RSTTree(fdis, fmerge)
            gold_rst.build()
            gold_brackets = gold_rst.bracketing()
            met.eval(gold_rst, pred_rst)
    if report:
        met.report()
def _expected_consistency_selection(labels, mlset, nlset, cons_type='', ease_factor=1):
    n_solutions = labels.shape[0]
    k_values = []
    cons = []
    final_idx = np.array([False] * n_solutions)
    for label in labels:
        cons.append(Metrics.consistency(label, mlset, nlset, cons_type=cons_type))
        k_values.append(len(np.unique(label)))
    cons = np.array(cons)
    k_values = np.array(k_values, dtype=int)
    possible_k = np.unique(k_values)
    # Keep every solution whose consistency reaches (ease_factor times) the
    # average consistency of the solutions with the same number of clusters k.
    for k in possible_k:
        mean_value = np.mean(cons[k_values == k])
        idx = np.logical_and(cons >= mean_value * ease_factor, k_values == k)
        final_idx = np.logical_or(final_idx, idx)
    return labels[final_idx]
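# A self-contained illustration of the per-k selection rule above, with
# synthetic consistency scores standing in for Metrics.consistency: a solution
# is kept when its consistency reaches ease_factor times the mean consistency
# of all solutions that use the same number of clusters k.
import numpy as np

cons = np.array([0.2, 0.6, 0.4, 0.9, 0.1])   # hypothetical consistencies
k_values = np.array([3, 3, 3, 5, 5])         # hypothetical cluster counts
ease_factor = 1

final_idx = np.zeros(cons.shape[0], dtype=bool)
for k in np.unique(k_values):
    mean_value = np.mean(cons[k_values == k])            # 0.4 for k=3, 0.5 for k=5
    final_idx |= (cons >= mean_value * ease_factor) & (k_values == k)

print(final_idx)  # [False  True  True  True False]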
def evalparser(path='./examples', report=False):
    """ Test the parsing performance

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score
    """
    from os import listdir
    from os.path import join as joinpath
    # ----------------------------------------
    # Load the parsing model
    pm = ParsingModel()
    pm.loadmodel("parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span', 'nuclearity', 'relation'])
    # ----------------------------------------
    # Read all files from the given path
    doclist = [joinpath(path, fname) for fname in listdir(path) if fname.endswith('.edus')]
    for fedus in doclist:
        # ----------------------------------------
        # Parsing
        pred_rst = parse(pm, fedus=fedus)
        # Write the predicted tree to file
        # print fedus
        with open("test.dis", "w") as fout:
            fout.write(str(pred_rst))
        # pred_brackets = pred_rst.bracketing()
        # fbrackets = fedus.replace('edus', 'brackets')
        # writebrackets(fbrackets, pred_brackets)
        # ----------------------------------------
        # Evaluate with gold RST tree
        if report:
            fdis = fedus.replace('edus', 'dis')
            gold_rst = RSTTree(fname=fdis)
            gold_rst.build()
            gold_brackets = gold_rst.bracketing()
            met.eval(gold_rst, pred_rst)
    if report:
        met.report()
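# A hedged usage note: for the evalparser variants above, the call itself is a
# one-liner. The directory is expected to hold the per-document inputs (*.edus
# or *.merge, plus gold *.dis trees when report=True); the path is illustrative.
evalparser(path='./examples', report=True)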
def plot_k_consistency_distribution(labels, mlset, nlset, savepath, pure=True, cons_type='must'):
    k_value = []
    if not pure:
        labels = labels[0:-5]
    for label in labels:
        cons = len(np.unique(label))
        k_value.append(cons)
    texts = [''] * len(labels)
    plot_labels = [None] * len(labels)
    markers = ['x'] * len(labels)
    colors = ['blue'] * len(labels)
    title = 'k-' + cons_type + ' consistency Correlation'
    consistencies = []
    for label in labels:
        cons = Metrics.consistency(label, mlset, nlset, cons_type=cons_type)
        consistencies.append(cons)
    pos = np.hstack((np.array(k_value).reshape(-1, 1),
                     np.array(consistencies).reshape(-1, 1)))
    print(pos.shape)
    _plot_generalized_scatter(pos, colors, texts, markers, plot_labels, savepath,
                              title=title, xlabel='k', ylabel='consistency',
                              legend_need=False)
    return
def main(args): '''Main function for AutoML in time-series predictions. Args: - data loading parameters: - data_names: mimic, ward, cf - preprocess parameters: - normalization: minmax, standard, None - one_hot_encoding: input features that need to be one-hot encoded - problem: 'one-shot' or 'online' - 'one-shot': one time prediction at the end of the time-series - 'online': preditcion at every time stamps of the time-series - max_seq_len: maximum sequence length after padding - label_name: the column name for the label(s) - treatment: the column name for treatments - imputation parameters: - static_imputation_model: mean, median, mice, missforest, knn, gain - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain - feature selection parameters: - feature_selection_model: greedy-addtion, greedy-deletion, recursive-addition, recursive-deletion, None - feature_number: selected featuer number - predictor_parameters: - epochs: number of epochs - bo_itr: bayesian optimization iterations - static_mode: how to utilize static features (concatenate or None) - time_mode: how to utilize time information (concatenate or None) - task: classification or regression - metric_name: auc, apr, mae, mse ''' #%% Step 0: Set basic parameters metric_sets = [args.metric_name] metric_parameters = { 'problem': args.problem, 'label_name': [args.label_name] } #%% Step 1: Upload Dataset # File names data_directory = '../datasets/data/' + args.data_name + '/' + args.data_name + '_' data_loader_training = CSVLoader( static_file=data_directory + 'static_train_data.csv.gz', temporal_file=data_directory + 'temporal_train_data_eav.csv.gz') data_loader_testing = CSVLoader( static_file=data_directory + 'static_test_data.csv.gz', temporal_file=data_directory + 'temporal_test_data_eav.csv.gz') dataset_training = data_loader_training.load() dataset_testing = data_loader_testing.load() print('Finish data loading.') #%% Step 2: Preprocess Dataset # (0) filter out negative values (Automatically) negative_filter = FilterNegative() # (1) one-hot encode categorical features onehot_encoder = OneHotEncoder( one_hot_encoding_features=[args.one_hot_encoding]) # (2) Normalize features: 3 options (minmax, standard, none) normalizer = Normalizer(args.normalization) filter_pipeline = PipelineComposer(negative_filter, onehot_encoder, normalizer) dataset_training = filter_pipeline.fit_transform(dataset_training) dataset_testing = filter_pipeline.transform(dataset_testing) print('Finish preprocessing.') #%% Step 3: Define Problem problem_maker = ProblemMaker(problem=args.problem, label=[args.label_name], max_seq_len=args.max_seq_len, treatment=[args.treatment]) dataset_training = problem_maker.fit_transform(dataset_training) dataset_testing = problem_maker.fit_transform(dataset_testing) print('Finish defining problem.') #%% Step 4: Impute Dataset static_imputation = Imputation( imputation_model_name=args.static_imputation_model, data_type='static') temporal_imputation = Imputation( imputation_model_name=args.temporal_imputation_model, data_type='temporal') imputation_pipeline = PipelineComposer(static_imputation, temporal_imputation) dataset_training = imputation_pipeline.fit_transform(dataset_training) dataset_testing = imputation_pipeline.transform(dataset_testing) print('Finish imputation.') #%% Step 5: Feature selection (4 options) static_feature_selection = \ FeatureSelection(feature_selection_model_name = args.static_feature_selection_model, feature_type = 'static', feature_number = 
args.static_feature_selection_number, task = args.task, metric_name = args.metric_name, metric_parameters = metric_parameters) temporal_feature_selection = \ FeatureSelection(feature_selection_model_name = args.temporal_feature_selection_model, feature_type = 'temporal', feature_number = args.temporal_feature_selection_number, task = args.task, metric_name = args.metric_name, metric_parameters = metric_parameters) feature_selection_pipeline = PipelineComposer(static_feature_selection, temporal_feature_selection) dataset_training = feature_selection_pipeline.fit_transform( dataset_training) dataset_testing = feature_selection_pipeline.transform(dataset_testing) print('Finish feature selection.') #%% Step 6: Bayesian Optimization ## Model define model_parameters = { 'projection_horizon': 5, 'static_mode': 'concatenate', 'time_mode': 'concatenate' } crn_model = CRN_Model(task=args.task) crn_model.set_params(**model_parameters) model_class = crn_model # train_validate split dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.2) # Bayesian Optimization Start metric = BOMetric(metric='auc', fold=0, split='test') # Run BO for selected model class BO_model = AutoTS(dataset_training, model_class, metric) models, bo_score = BO_model.training_loop(num_iter=2) auto_ens_model = AutoEnsemble(models, bo_score) # Prediction assert not dataset_testing.is_validation_defined test_y_hat = auto_ens_model.predict(dataset_testing, test_split='test') test_y = dataset_testing.label print('Finish AutoML model training and testing.') #%% Step 7: Visualize Results idx = np.random.permutation(len(test_y_hat))[:2] # Evaluate predictor model result = Metrics(metric_sets, metric_parameters).evaluate(test_y, test_y_hat) print('Finish predictor model evaluation.') # Visualize the output # (1) Performance print('Overall performance') print_performance(result, metric_sets, metric_parameters) # (2) Predictions print('Each prediction') print_prediction(test_y_hat[idx], metric_parameters) return
def main(args): '''Main function for individual treatment effect estimation. Args: - data loading parameters: - data_names: mimic, ward, cf, mimic_antibiotics - preprocess parameters: - normalization: minmax, standard, None - one_hot_encoding: input features that need to be one-hot encoded - problem: 'online' - 'online': preiction at every time stamps of the time-series - max_seq_len: maximum sequence length after padding - label_name: the column name for the label(s) - treatment: the column name for treatments - imputation parameters: - static_imputation_model: mean, median, mice, missforest, knn, gain - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain - feature selection parameters: - feature_selection_model: greedy-addtion, greedy-deletion, recursive-addition, recursive-deletion, None - feature_number: selected featuer number - treatment effects model parameters: - model_name: CRN, RMSN, GANITE Each model has different types of hyperparameters that need to be set. - Parameters needed for the Counterfactual Recurrent Network (CRN): - hyperparameters for encoder: - rnn_hidden_units: hidden dimensions in the LSTM unit - rnn_keep_prob: keep probability used for variational dropout in the LSTM unit - br_size: size of the balancing representation - fc_hidden_units: hidden dimensions of the fully connected layers used for treatment classifier and predictor - batch_size: number of samples in mini-batch - num_epochs: number of epochs - learning_rate: learning rate - max_alpha: alpha controls the trade-off between building tratment invariant representations (domain discrimination) and being able to predict outcomes (outcome prediction); during training, CRN uses an exponentially increasing schedule for alpha from 0 to max_alpha. - hyperparameters for decoder: - the decoder requires the same hyperparameters as the encoder with the exception of the rnn_hidden_units which is set to be equal to the br_size of the encoder - Parameters for Recurrent Marginal Structural Networks (RMSN): - hyperparameters for encoder: - dropout_rate: dropout probability used for variational - rnn_hidden_units: hidden dimensions in the LSTM unit - batch_size: number of samples in mini-batch - num_epochs: number of epochs - learning_rate: learning rate - max_norm: max gradient norm used for gradient clipping during training - hyperparameters for decoder: - the decoder requires the same hyperparameters as the encoder. 
- model_dir: directory where the model is saved - model_name: name of the saved model - Parameters for GANITE: - batch size: number of samples in mini-batch - alpha: parameter trading off between discriminator loss and supervised loss for the generator training - learning_rate: learning rate - hidden_units: hidden dimensions of the fully connected layers used in the networks - stack_dim: number of timesteps to stack All models have the following common parameters: - static_mode: how to utilize static features (concatenate or None) - time_mode: how to utilize time information (concatenate or None) - taks: 'classification' or 'regression' - metric_name: auc, apr, mae, mse (used for factual prediction) - patient id: patient for which counterfactual trajectories are computed - timestep: timestep in patient trajectory for estimating counterfactuals ''' # %% Step 0: Set basic parameters metric_sets = [args.metric_name] metric_parameters = { 'problem': args.problem, 'label_name': [args.label_name] } # %% Step 1: Upload Dataset # File names data_directory = '../datasets/data/' + args.data_name + '/' + args.data_name + '_' data_loader_training = CSVLoader( static_file=data_directory + 'static_train_data.csv.gz', temporal_file=data_directory + 'temporal_train_data_eav.csv.gz') data_loader_testing = CSVLoader( static_file=data_directory + 'static_test_data.csv.gz', temporal_file=data_directory + 'temporal_test_data_eav.csv.gz') dataset_training = data_loader_training.load() dataset_testing = data_loader_testing.load() print('Finish data loading.') # %% Step 2: Preprocess Dataset # (0) filter out negative values (Automatically) negative_filter = FilterNegative() # (1) one-hot encode categorical features onehot_encoder = OneHotEncoder( one_hot_encoding_features=[args.one_hot_encoding]) # (2) Normalize features: 3 options (minmax, standard, none) normalizer = Normalizer(args.normalization) filter_pipeline = PipelineComposer(negative_filter, onehot_encoder, normalizer) dataset_training = filter_pipeline.fit_transform(dataset_training) dataset_testing = filter_pipeline.transform(dataset_testing) print('Finish preprocessing.') # %% Step 3: Define Problem problem_maker = ProblemMaker(problem=args.problem, label=[args.label_name], max_seq_len=args.max_seq_len, treatment=[args.treatment]) dataset_training = problem_maker.fit_transform(dataset_training) dataset_testing = problem_maker.fit_transform(dataset_testing) print('Finish defining problem.') # %% Step 4: Impute Dataset static_imputation = Imputation( imputation_model_name=args.static_imputation_model, data_type='static') temporal_imputation = Imputation( imputation_model_name=args.temporal_imputation_model, data_type='temporal') imputation_pipeline = PipelineComposer(static_imputation, temporal_imputation) dataset_training = imputation_pipeline.fit_transform(dataset_training) dataset_testing = imputation_pipeline.transform(dataset_testing) print('Finish imputation.') # %% Step 5: Feature selection (4 options) static_feature_selection = \ FeatureSelection(feature_selection_model_name=args.static_feature_selection_model, feature_type='static', feature_number=args.static_feature_selection_number, task=args.task, metric_name=args.metric_name, metric_parameters=metric_parameters) temporal_feature_selection = \ FeatureSelection(feature_selection_model_name=args.temporal_feature_selection_model, feature_type='temporal', feature_number=args.temporal_feature_selection_number, task=args.task, metric_name=args.metric_name, metric_parameters=metric_parameters) 
feature_selection_pipeline = PipelineComposer(static_feature_selection, temporal_feature_selection) dataset_training = feature_selection_pipeline.fit_transform( dataset_training) dataset_testing = feature_selection_pipeline.transform(dataset_testing) print('Finish feature selection.') # %% Step 6: Fit treatment effects (3 options) # Set the validation data for best model saving dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.0) # Set the treatment effects model model_name = args.model_name # Set treatment effects model parameters if model_name == 'CRN': model_parameters = { 'encoder_rnn_hidden_units': args.crn_encoder_rnn_hidden_units, 'encoder_br_size': args.crn_encoder_br_size, 'encoder_fc_hidden_units': args.crn_encoder_fc_hidden_units, 'encoder_learning_rate': args.crn_encoder_learning_rate, 'encoder_batch_size': args.crn_encoder_batch_size, 'encoder_keep_prob': args.crn_encoder_keep_prob, 'encoder_num_epochs': args.crn_encoder_num_epochs, 'encoder_max_alpha': args.crn_encoder_max_alpha, 'decoder_br_size': args.crn_decoder_br_size, 'decoder_fc_hidden_units': args.crn_decoder_fc_hidden_units, 'decoder_learning_rate': args.crn_decoder_learning_rate, 'decoder_batch_size': args.crn_decoder_batch_size, 'decoder_keep_prob': args.crn_decoder_keep_prob, 'decoder_num_epochs': args.crn_decoder_num_epochs, 'decoder_max_alpha': args.crn_decoder_max_alpha, 'projection_horizon': args.projection_horizon, 'static_mode': args.static_mode, 'time_mode': args.time_mode } treatment_model = treatment_effects_model(model_name, model_parameters, task='classification') treatment_model.fit(dataset_training) elif model_name == 'RMSN': hyperparams_encoder_iptw = { 'dropout_rate': args.rmsn_encoder_dropout_rate, 'memory_multiplier': args.rmsn_encoder_memory_multiplier, 'num_epochs': args.rmsn_encoder_num_epochs, 'batch_size': args.rmsn_encoder_batch_size, 'learning_rate': args.rmsn_encoder_learning_rate, 'max_norm': args.rmsn_encoder_max_norm } hyperparams_decoder_iptw = { 'dropout_rate': args.rmsn_decoder_dropout_rate, 'memory_multiplier': args.rmsn_decoder_memory_multiplier, 'num_epochs': args.rmsn_decoder_num_epochs, 'batch_size': args.rmsn_decoder_batch_size, 'learning_rate': args.rmsn_decoder_learning_rate, 'max_norm': args.rmsn_decoder_max_norm } model_parameters = { 'hyperparams_encoder_iptw': hyperparams_encoder_iptw, 'hyperparams_decoder_iptw': hyperparams_decoder_iptw, 'model_dir': args.rmsn_model_dir, 'model_name': args.rmsn_model_name, 'static_mode': args.static_mode, 'time_mode': args.time_mode } treatment_model = treatment_effects_model(model_name, model_parameters, task='classification') treatment_model.fit(dataset_training, projection_horizon=args.projection_horizon) elif model_name == 'GANITE': hyperparams = { 'batch_size': args.ganite_batch_size, 'alpha': args.ganite_alpha, 'hidden_dims': args.ganite_hidden_dims, 'learning_rate': args.ganite_learning_rate } model_parameters = { 'hyperparams': hyperparams, 'stack_dim': args.ganite_stack_dim, 'static_mode': args.static_mode, 'time_mode': args.time_mode } treatment_model = treatment_effects_model(model_name, model_parameters, task='classification') treatment_model.fit(dataset_training) test_y_hat = treatment_model.predict(dataset_testing) print('Finish treatment effects model training and testing.') # %% Step 9: Visualize Results # Evaluate predictor model result = Metrics(metric_sets, metric_parameters).evaluate(dataset_testing.label, test_y_hat) print('Finish predictor model evaluation.') # Visualize the output # (1) Performance on 
estimating factual outcomes print('Overall performance on estimating factual outcomes') print_performance(result, metric_sets, metric_parameters) # (2) Counterfactual trajectories print('Counterfactual trajectories') if model_name in ['CRN', 'RMSN']: # Predict and visualize counterfactuals for the sequence of treatments indicated by the user # through the treatment_options. The lengths of each sequence of treatments needs to be projection_horizon + 1. treatment_options = np.array([[[1], [1], [1], [1], [1], [0]], [[0], [0], [0], [0], [1], [1]]]) history, counterfactual_traj = treatment_model.predict_counterfactual_trajectories( dataset=dataset_testing, patient_id=args.patient_id, timestep=args.timestep, treatment_options=treatment_options) print_counterfactual_predictions( patient_history=history, treatment_options=treatment_options, counterfactual_predictions=counterfactual_traj) return
def do_new_weighted_ensemble_for_library( library_folder, library_name, class_num, target, constraint_file, logger, gammas, internals=None, cons_type='both', ensemble_method=_default_ensemble_method, scale=False): """ :param library_folder: :param library_name: :param class_num: :param target: :param constraint_file: :param logger: :param alphas: :param cons_type: :param ensemble_method :return: """ logger.debug( '===========================================================================================' ) logger.debug('-----------------New ver Weighted Ensemble for library:' + str(library_name) + '---------------') logger.debug('-----------------Weight type = ' + cons_type + '-------------------------------------------') logger.debug('-----------------Scale type = ' + str(scale) + '-------------------------------------------') logger.debug('-----------------Constraint File name = ' + constraint_file + '----------------------------') labels = np.loadtxt(library_folder + library_name + '.res', delimiter=',') labels = labels.astype(int) # if the library is not pure, i.e, ensemble results and targets are also included. # then, last 5 rows should be removed (single kmeans, cspa, hgpa, mcla, real labels) if 'pure' not in library_name: labels = labels[0:-5] mlset, nlset = io_func.read_constraints(constraint_file) n_instances = labels.shape[1] if cons_type == 'both': n_constraints = len(mlset) + len(nlset) else: n_constraints = len(mlset) if internals is None: internals = _build_pesudo_internal(labels) # get cluster/clustering level weights # constraints in each cluster of all clusterings are also obtained to get g_gamma con_per_cluster = [] constraints_num = [] con_clustering = [] cluster_time_sum = 0.0 clustering_time_sum = 0.0 for label in labels: t1 = time.clock() weight, cluster_cons_num = Metrics.consistency_per_cluster_efficient( label, mlset, nlset, cons_type=cons_type) con_per_cluster.append(weight) constraints_num.append(cluster_cons_num) t2 = time.clock() cluster_time_sum += (t2 - t1) for label in labels: t1 = time.clock() con_clustering.append( Metrics.consistency(label, mlset, nlset, cons_type=cons_type)) t2 = time.clock() clustering_time_sum += (t2 - t1) print 'library size=' + str(labels.shape[0]) print 'cluster avg=' + str(cluster_time_sum / labels.shape[0]) print 'clustering avg=' + str(clustering_time_sum / labels.shape[0]) if scale: scaler = preprocessing.MinMaxScaler() con_clustering = scaler.fit_transform(np.array(con_clustering)) nmis = [] for gamma in gammas: logger.debug( '-------------------------->>>>>> PARAM START <<<<<<<---------------------------------' ) cur_g_gamma = get_g_gamma(constraints_num, labels, n_constraints, n_instances, gamma) cur_nmis = [] for method in ensemble_method: ensemble_labels = _ensemble_method[method]( labels, N_clusters_max=class_num, weighted=True, clustering_weights=con_clustering, cluster_level_weights=con_per_cluster, alpha=cur_g_gamma, new_formula=True, internal=internals) # ensemble_labels = _ensemble_method[method](labels, N_clusters_max=class_num, # weighted=True, clustering_weights=con_clustering, # cluster_level_weights=con_per_cluster, alpha=cur_g_gamma, # new_formula=True, internal=internals, ml=mlset, cl=nlset) ensemble_nmi = Metrics.normalized_max_mutual_info_score( ensemble_labels, target) logger.debug(method + ' gamma=' + str(gamma) + ', NMI=' + str(ensemble_nmi)) cur_nmis.append(ensemble_nmi) nmis.append(cur_nmis) logger.debug( '------------------------->>>>>> END OF THIS PARAM <<<<<<-------------------------------' ) 
logger.debug( '===========================================================================================' ) return nmis
def comparison_methods(dataset_name, constraints_files=None, additional_postfix='', eval_method=None): """ get the performance of comparison methods. Parameters ---------- :param dataset_name: :param constraints_files: :param additional_postfix: :param eval_method: """ filename = _default_eval_path + dataset_name + '_' + time.strftime( '%Y-%m-%d_%H_%M_%S', time.localtime(time.time())) + '.csv' with open(filename, 'wb') as f: writer = csv.writer(f) data, targets = exd.dataset[dataset_name]['data']() data = data.astype(np.double) k = exd.dataset[dataset_name]['k'] km = cluster.KMeans(n_clusters=k) km.fit(data) writer.writerow([ 'KMeans', str(metrics.normalized_max_mutual_info_score(targets, km.labels_)) ]) eval_methods = _default_eval_methods if eval_method is None else eval_method if constraints_files is None: filenames = _get_default_constraints_files( dataset_name, _default_constraints_postfix, additional_postfix) else: filenames = _get_default_constraints_files(dataset_name, constraints_files, additional_postfix) for filename in filenames: ml, cl = io_func.read_constraints(_default_constraints_folder + filename + '.txt') for method in eval_methods: if method == 'Cop_KMeans': result = _constrained_methods[method](data, k, ml, cl) writer.writerow([ filename + '_Cop_KMeans', str( metrics.normalized_max_mutual_info_score( targets, result)) ]) elif method == 'E2CP': e2cp = _constrained_methods[method](data=data, ml=ml, cl=cl, n_clusters=k) e2cp.fit_constrained() result = e2cp.labels writer.writerow([ filename + '_E2CP', str( metrics.normalized_max_mutual_info_score( targets, result)) ]) return
def main(args): '''Main function for time-series prediction. Args: - data loading parameters: - data_names: mimic, ward, cf - preprocess parameters: - normalization: minmax, standard, None - one_hot_encoding: input features that need to be one-hot encoded - problem: 'one-shot' or 'online' - 'one-shot': one time prediction at the end of the time-series - 'online': preditcion at every time stamps of the time-series - max_seq_len: maximum sequence length after padding - label_name: the column name for the label(s) - treatment: the column name for treatments - imputation parameters: - static_imputation_model: mean, median, mice, missforest, knn, gain - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain - feature selection parameters: - feature_selection_model: greedy-addtion, greedy-deletion, recursive-addition, recursive-deletion, None - feature_number: selected featuer number - predictor_parameters: - model_name: rnn, gru, lstm, attention, tcn, transformer - model_parameters: network parameters such as numer of layers - h_dim: hidden dimensions - n_layer: layer number - n_head: head number (only for transformer model) - batch_size: number of samples in mini-batch - epochs: number of epochs - learning_rate: learning rate - static_mode: how to utilize static features (concatenate or None) - time_mode: how to utilize time information (concatenate or None) - task: classification or regression - uncertainty_model_name: uncertainty estimation model name (ensemble) - interpretor_model_name: interpretation model name (tinvase) - metric_name: auc, apr, mae, mse ''' #%% Step 0: Set basic parameters metric_sets = [args.metric_name] metric_parameters = { 'problem': args.problem, 'label_name': [args.label_name] } #%% Step 1: Upload Dataset # File names data_directory = '../datasets/data/' + args.data_name + '/' + args.data_name + '_' data_loader_training = CSVLoader( static_file=data_directory + 'static_train_data.csv.gz', temporal_file=data_directory + 'temporal_train_data_eav.csv.gz') data_loader_testing = CSVLoader( static_file=data_directory + 'static_test_data.csv.gz', temporal_file=data_directory + 'temporal_test_data_eav.csv.gz') dataset_training = data_loader_training.load() dataset_testing = data_loader_testing.load() print('Finish data loading.') #%% Step 2: Preprocess Dataset # (0) filter out negative values (Automatically) negative_filter = FilterNegative() # (1) one-hot encode categorical features onehot_encoder = OneHotEncoder( one_hot_encoding_features=[args.one_hot_encoding]) # (2) Normalize features: 3 options (minmax, standard, none) normalizer = Normalizer(args.normalization) filter_pipeline = PipelineComposer(negative_filter, onehot_encoder, normalizer) dataset_training = filter_pipeline.fit_transform(dataset_training) dataset_testing = filter_pipeline.transform(dataset_testing) print('Finish preprocessing.') #%% Step 3: Define Problem problem_maker = ProblemMaker(problem=args.problem, label=[args.label_name], max_seq_len=args.max_seq_len, treatment=args.treatment) dataset_training = problem_maker.fit_transform(dataset_training) dataset_testing = problem_maker.fit_transform(dataset_testing) print('Finish defining problem.') #%% Step 4: Impute Dataset static_imputation = Imputation( imputation_model_name=args.static_imputation_model, data_type='static') temporal_imputation = Imputation( imputation_model_name=args.temporal_imputation_model, data_type='temporal') imputation_pipeline = PipelineComposer(static_imputation, temporal_imputation) 
dataset_training = imputation_pipeline.fit_transform(dataset_training) dataset_testing = imputation_pipeline.transform(dataset_testing) print('Finish imputation.') #%% Step 5: Feature selection (4 options) static_feature_selection = \ FeatureSelection(feature_selection_model_name = args.static_feature_selection_model, feature_type = 'static', feature_number = args.static_feature_selection_number, task = args.task, metric_name = args.metric_name, metric_parameters = metric_parameters) temporal_feature_selection = \ FeatureSelection(feature_selection_model_name = args.temporal_feature_selection_model, feature_type = 'temporal', feature_number = args.temporal_feature_selection_number, task = args.task, metric_name = args.metric_name, metric_parameters = metric_parameters) feature_selection_pipeline = PipelineComposer(static_feature_selection, temporal_feature_selection) dataset_training = feature_selection_pipeline.fit_transform( dataset_training) dataset_testing = feature_selection_pipeline.transform(dataset_testing) print('Finish feature selection.') #%% Step 6: Fit and Predict (6 options) # Set predictor model parameters model_parameters = { 'h_dim': args.h_dim, 'n_layer': args.n_layer, 'n_head': args.n_head, 'batch_size': args.batch_size, 'epoch': args.epochs, 'model_type': args.model_name, 'learning_rate': args.learning_rate, 'static_mode': args.static_mode, 'time_mode': args.time_mode, 'verbose': True } # Set the validation data for best model saving dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.0) pred_class = prediction(args.model_name, model_parameters, args.task) pred_class.fit(dataset_training) test_y_hat = pred_class.predict(dataset_testing) print('Finish predictor model training and testing.') #%% Step 7: Estimate Uncertainty (1 option) uncertainty_model = uncertainty(args.uncertainty_model_name, model_parameters, pred_class, args.task) uncertainty_model.fit(dataset_training) test_ci_hat = uncertainty_model.predict(dataset_testing) print('Finish uncertainty estimation') #%% Step 8: Interpret Predictions (1 option) interpretor = interpretation(args.interpretation_model_name, model_parameters, pred_class, args.task) interpretor.fit(dataset_training) test_s_hat = interpretor.predict(dataset_testing) print('Finish model interpretation') #%% Step 9: Visualize Results idx = np.random.permutation(len(test_y_hat))[:2] # Evaluate predictor model result = Metrics(metric_sets, metric_parameters).evaluate(dataset_testing.label, test_y_hat) print('Finish predictor model evaluation.') # Visualize the output # (1) Performance print('Overall performance') print_performance(result, metric_sets, metric_parameters) # (2) Predictions print('Each prediction') print_prediction(test_y_hat[idx], metric_parameters) # (3) Uncertainty print('Uncertainty estimations') print_uncertainty(test_y_hat[idx], test_ci_hat[idx], metric_parameters) # (4) Model interpretation print('Model interpretation') print_interpretation(test_s_hat[idx], dataset_training.feature_name, metric_parameters, model_parameters) return
def k_selection_ensemble(labels, k_threshold, logger, weighted=False, alpha=0,
                         mlset=None, nlset=None, ctype='both'):
    """
    do selection ensemble using k as criteria
    clusterings with k smaller than k_threshold will be removed

    :param labels:
    :param k_threshold:
    :param logger:
    :param weighted: weighted version or not
    :param alpha: balance factor that controls the importance of clustering/cluster
                  consistency in weights (weighted version only)
    :param mlset: must-link set (weighted version only)
    :param nlset: cannot-link set (weighted version only)
    :param ctype: type of consistency (weighted version only)
    :return:
    """
    k_value = []
    class_num = len(np.unique(labels[-1]))
    # select those clusterings whose k is no smaller than the threshold.
    for label in labels[0:-5]:
        k_value.append(len(np.unique(label)))
    k_value = np.array(k_value)
    idx = k_value.ravel() >= k_threshold
    selected_labels = labels[0:-5][idx]

    # weights
    con_per_cluster = []
    con_clustering = []
    if weighted:
        for label in selected_labels:
            con_per_cluster.append(
                Metrics.consistency_per_cluster(label, mlset, nlset, cons_type=ctype))
        for label in selected_labels:
            con_clustering.append(
                Metrics.consistency(label, mlset, nlset, cons_type=ctype))

    logger.debug('[K] Start consensus...shape=' + str(selected_labels.shape))
    logger.debug('[K] Average k is ' + str(np.mean(k_value[idx])))
    if weighted:
        logger.debug('[K] weighted consensus, alpha=' + str(alpha))
    label_CSPA = ce.cluster_ensembles_CSPAONLY(selected_labels, N_clusters_max=class_num,
                                               weighted=weighted,
                                               clustering_weights=con_clustering,
                                               cluster_level_weights=con_per_cluster,
                                               alpha=alpha)
    label_HGPA = ce.cluster_ensembles_HGPAONLY(selected_labels, N_clusters_max=class_num,
                                               weighted=weighted,
                                               clustering_weights=con_clustering,
                                               cluster_level_weights=con_per_cluster,
                                               alpha=alpha)
    label_MCLA = ce.cluster_ensembles_MCLAONLY(selected_labels, N_clusters_max=class_num,
                                               weighted=weighted,
                                               clustering_weights=con_clustering,
                                               cluster_level_weights=con_per_cluster,
                                               alpha=alpha)
    nmi_CSPA = Metrics.normalized_max_mutual_info_score(label_CSPA, labels[-1])
    nmi_HGPA = Metrics.normalized_max_mutual_info_score(label_HGPA, labels[-1])
    nmi_MCLA = Metrics.normalized_max_mutual_info_score(label_MCLA, labels[-1])
    logger.debug('CSPA performance:' + str(nmi_CSPA))
    logger.debug('HGPA performance:' + str(nmi_HGPA))
    logger.debug('MCLA performance:' + str(nmi_MCLA))
    logger.debug('--------------------------------------------')
    return
import constrained_methods.constrained_clustering as cc
import utils.load_dataset as ld
import utils.io_func as io
import time
import evaluation.Metrics as Metrics

data, target = ld.load_mnist_4000()
print data.shape
data = data.astype(float)
ml, cl = io.read_constraints('Constraints/MNIST4000_diff_n_1.txt')
t1 = time.clock()
e2cp = cc.E2CP(data=data, ml=ml, cl=cl, n_clusters=10)
t2 = time.clock()
e2cp.fit_constrained()
print e2cp.labels
print Metrics.normalized_max_mutual_info_score(target, e2cp.labels)
print t2 - t1
def consistency_selection_ensemble(labels, mlset, nlset, logger, must_threshold,
                                   cannot_threshold, normalized=True, weighted=False,
                                   weighted_type='both', alpha=1):
    """
    do selection ensemble using must/cannot consistency as criteria
    clusterings whose must/cannot consistency falls below the thresholds are removed

    :param labels:
    :param mlset:
    :param nlset:
    :param logger:
    :param must_threshold:
    :param cannot_threshold:
    :param normalized:
    :param weighted:
    :param weighted_type:
    :param alpha:
    :return:
    """
    class_num = len(np.unique(labels[-1]))
    must_consistencies = []
    cannot_consistencies = []
    clustering_weights = []
    cluster_level_weights = []
    k_value = []
    for label in labels[0:-5]:
        must_cons = Metrics.consistency(label, mlset, nlset, cons_type='must')
        cannot_cons = Metrics.consistency(label, mlset, nlset, cons_type='cannot')
        if weighted:
            clustering_weights.append(
                Metrics.consistency(label, mlset, nlset, cons_type=weighted_type))
            cluster_level_weights.append(
                Metrics.consistency_per_cluster(label, mlset, nlset, cons_type=weighted_type))
        must_consistencies.append(must_cons)
        cannot_consistencies.append(cannot_cons)
        k_value.append(len(np.unique(label)))
    if normalized:
        scaler = preprocessing.MinMaxScaler()
        must_consistencies = scaler.fit_transform(
            np.array(must_consistencies).reshape(-1, 1)).ravel()
        cannot_consistencies = scaler.fit_transform(
            np.array(cannot_consistencies).reshape(-1, 1)).ravel()
    idx = np.logical_and(must_consistencies >= must_threshold,
                         cannot_consistencies >= cannot_threshold)
    selected_labels = labels[0:-5][idx]
    k_value = np.array(k_value)[idx]
    logger.debug('[Consistency] Start consensus...shape=' + str(selected_labels.shape))
    if selected_labels.shape[0] == 0:
        logger.debug('[Consistency] No clusterings are selected. Out.')
        return
    logger.debug('[Consistency] Average k is ' + str(np.mean(k_value)))
    label_CSPA = ce.cluster_ensembles_CSPAONLY(selected_labels, N_clusters_max=class_num,
                                               weighted=weighted,
                                               clustering_weights=clustering_weights,
                                               cluster_level_weights=cluster_level_weights,
                                               alpha=alpha)
    label_HGPA = ce.cluster_ensembles_HGPAONLY(selected_labels, N_clusters_max=class_num,
                                               weighted=weighted,
                                               clustering_weights=clustering_weights,
                                               cluster_level_weights=cluster_level_weights,
                                               alpha=alpha)
    label_MCLA = ce.cluster_ensembles_MCLAONLY(selected_labels, N_clusters_max=class_num)
    nmi_CSPA = Metrics.normalized_max_mutual_info_score(label_CSPA, labels[-1])
    nmi_HGPA = Metrics.normalized_max_mutual_info_score(label_HGPA, labels[-1])
    nmi_MCLA = Metrics.normalized_max_mutual_info_score(label_MCLA, labels[-1])
    logger.debug('CSPA performance:' + str(nmi_CSPA))
    logger.debug('HGPA performance:' + str(nmi_HGPA))
    logger.debug('MCLA performance:' + str(nmi_MCLA))
    return
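# A hedged usage sketch, assuming the repository modules used throughout these
# snippets (numpy, utils.io_func) plus the standard logging module; the library
# and constraint paths are illustrative only.
import logging
import numpy as np
import utils.io_func as io_func

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('selection_ensemble')

labels = np.loadtxt('Results/Iris_library.res', delimiter=',').astype(int)
mlset, nlset = io_func.read_constraints('Constraints/Iris_constraints.txt')

# Keep only clusterings whose (min-max normalized) must- and cannot-link
# consistencies both reach 0.5, then run the CSPA/HGPA/MCLA consensus functions.
consistency_selection_ensemble(labels, mlset, nlset, logger,
                               must_threshold=0.5, cannot_threshold=0.5)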
def main(args): """Main function for AutoML in time-series predictions. Args: - data loading parameters: - data_names: mimic, ward, cf - preprocess parameters: - normalization: minmax, standard, None - one_hot_encoding: input features that need to be one-hot encoded - problem: 'one-shot' or 'online' - 'one-shot': one time prediction at the end of the time-series - 'online': preditcion at every time stamps of the time-series - max_seq_len: maximum sequence length after padding - label_name: the column name for the label(s) - treatment: the column name for treatments - imputation parameters: - static_imputation_model: mean, median, mice, missforest, knn, gain - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain - feature selection parameters: - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None - feature_number: selected featuer number - predictor_parameters: - epochs: number of epochs - bo_itr: bayesian optimization iterations - static_mode: how to utilize static features (concatenate or None) - time_mode: how to utilize time information (concatenate or None) - task: classification or regression - metric_name: auc, apr, mae, mse """ #%% Step 0: Set basic parameters metric_sets = [args.metric_name] metric_parameters = { "problem": args.problem, "label_name": [args.label_name] } #%% Step 1: Upload Dataset # File names data_directory = "../datasets/data/" + args.data_name + "/" + args.data_name + "_" data_loader_training = CSVLoader( static_file=data_directory + "static_train_data.csv.gz", temporal_file=data_directory + "temporal_train_data_eav.csv.gz", ) data_loader_testing = CSVLoader( static_file=data_directory + "static_test_data.csv.gz", temporal_file=data_directory + "temporal_test_data_eav.csv.gz", ) dataset_training = data_loader_training.load() dataset_testing = data_loader_testing.load() print("Finish data loading.") #%% Step 2: Preprocess Dataset # (0) filter out negative values (Automatically) negative_filter = FilterNegative() # (1) one-hot encode categorical features onehot_encoder = OneHotEncoder( one_hot_encoding_features=[args.one_hot_encoding]) # (2) Normalize features: 3 options (minmax, standard, none) normalizer = Normalizer(args.normalization) filter_pipeline = PipelineComposer(negative_filter, onehot_encoder, normalizer) dataset_training = filter_pipeline.fit_transform(dataset_training) dataset_testing = filter_pipeline.transform(dataset_testing) print("Finish preprocessing.") #%% Step 3: Define Problem problem_maker = ProblemMaker(problem=args.problem, label=[args.label_name], max_seq_len=args.max_seq_len, treatment=args.treatment) dataset_training = problem_maker.fit_transform(dataset_training) dataset_testing = problem_maker.fit_transform(dataset_testing) print("Finish defining problem.") #%% Step 4: Impute Dataset static_imputation = Imputation( imputation_model_name=args.static_imputation_model, data_type="static") temporal_imputation = Imputation( imputation_model_name=args.temporal_imputation_model, data_type="temporal") imputation_pipeline = PipelineComposer(static_imputation, temporal_imputation) dataset_training = imputation_pipeline.fit_transform(dataset_training) dataset_testing = imputation_pipeline.transform(dataset_testing) print("Finish imputation.") #%% Step 5: Feature selection (4 options) static_feature_selection = FeatureSelection( feature_selection_model_name=args.static_feature_selection_model, feature_type="static", 
feature_number=args.static_feature_selection_number, task=args.task, metric_name=args.metric_name, metric_parameters=metric_parameters, ) temporal_feature_selection = FeatureSelection( feature_selection_model_name=args.temporal_feature_selection_model, feature_type="temporal", feature_number=args.temporal_feature_selection_number, task=args.task, metric_name=args.metric_name, metric_parameters=metric_parameters, ) feature_selection_pipeline = PipelineComposer(static_feature_selection, temporal_feature_selection) dataset_training = feature_selection_pipeline.fit_transform( dataset_training) dataset_testing = feature_selection_pipeline.transform(dataset_testing) print("Finish feature selection.") #%% Step 6: Bayesian Optimization ## Model define # RNN model rnn_parameters = { "model_type": "lstm", "epoch": args.epochs, "static_mode": args.static_mode, "time_mode": args.time_mode, "verbose": False, } general_rnn = GeneralRNN(task=args.task) general_rnn.set_params(**rnn_parameters) # CNN model cnn_parameters = { "epoch": args.epochs, "static_mode": args.static_mode, "time_mode": args.time_mode, "verbose": False, } temp_cnn = TemporalCNN(task=args.task) temp_cnn.set_params(**cnn_parameters) # Transformer transformer = TransformerPredictor(task=args.task, epoch=args.epochs, static_mode=args.static_mode, time_mode=args.time_mode) # Attention model attn_parameters = { "model_type": "lstm", "epoch": args.epochs, "static_mode": args.static_mode, "time_mode": args.time_mode, "verbose": False, } attn = Attention(task=args.task) attn.set_params(**attn_parameters) # model_class_list = [general_rnn, attn, temp_cnn, transformer] model_class_list = [general_rnn, attn] # train_validate split dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.1) # Bayesian Optimization Start metric = BOMetric(metric="auc", fold=0, split="test") ens_model_list = [] # Run BO for each model class for m in model_class_list: BO_model = automl.model.AutoTS(dataset_training, m, metric, model_path="tmp/") models, bo_score = BO_model.training_loop(num_iter=args.bo_itr) auto_ens_model = AutoEnsemble(models, bo_score) ens_model_list.append(auto_ens_model) # Load all ensemble models for ens in ens_model_list: for m in ens.models: m.load_model(BO_model.model_path + "/" + m.model_id + ".h5") # Stacking algorithm stacking_ens_model = StackingEnsemble(ens_model_list) stacking_ens_model.fit(dataset_training, fold=0, train_split="val") # Prediction assert not dataset_testing.is_validation_defined test_y_hat = stacking_ens_model.predict(dataset_testing, test_split="test") test_y = dataset_testing.label print("Finish AutoML model training and testing.") #%% Step 7: Visualize Results idx = np.random.permutation(len(test_y_hat))[:2] # Evaluate predictor model result = Metrics(metric_sets, metric_parameters).evaluate(test_y, test_y_hat) print("Finish predictor model evaluation.") # Visualize the output # (1) Performance print("Overall performance") print_performance(result, metric_sets, metric_parameters) # (2) Predictions print("Each prediction") print_prediction(test_y_hat[idx], metric_parameters) return
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
st.subheader('Overall Prediction Performance')

if select_pred_task == 'Classification':
    metric_sets = ['auc', 'apr']
if select_pred_task == 'Regression':
    metric_sets = ['mse', 'mae']

metric_parameters = {
    'problem': problem_type,
    'label_name': label_name,
}

metrics = Metrics(metric_sets, metric_parameters)
result = metrics.evaluate(dataset_v_5.label, test_y_hat)

if problem_type == 'one-shot':
    text = print_performance(
        result,
        metric_sets,
        metric_parameters,
    )
    st.text(text)

if problem_type == 'online':
    figs = print_performance(
        result,
def do_7th_weighted_ensemble_for_library( library_folder, library_name, class_num, target, constraint_file, logger, alphas, internals, cons_type='both', ensemble_method=_default_ensemble_method, scale=False): """ :param library_folder: :param library_name: :param class_num: :param target: :param constraint_file: :param logger: :param alphas: :param cons_type: :param ensemble_method :return: """ logger.debug( '===========================================================================================' ) logger.debug('-----------------New Weighted Ensemble for library:' + str(library_name) + '-------------------') logger.debug('-----------------Weight type = ' + cons_type + '-------------------------------------------') logger.debug('-----------------Scale type = ' + str(scale) + '-------------------------------------------') logger.debug('-----------------Constraint File name = ' + constraint_file + '----------------------------') labels = np.loadtxt(library_folder + library_name + '.res', delimiter=',') labels = labels.astype(int) k_values = [] expected_cons = {} # if the library is not pure, i.e, ensemble results and targets are also included. # then, last 5 rows should be removed (single kmeans, cspa, hgpa, mcla, real labels) if 'pure' not in library_name: labels = labels[0:-5] mlset, nlset = io_func.read_constraints(constraint_file) # get cluster/clustering level weights con_per_cluster = [] con_clustering = [] for label in labels: con_per_cluster.append( Metrics.consistency_per_cluster(label, mlset, nlset, cons_type=cons_type)) for label in labels: con_clustering.append( Metrics.consistency(label, mlset, nlset, cons_type=cons_type)) k_values.append(len(np.unique(label))) k_values = np.array(k_values, dtype=int) possible_k = np.unique(k_values) cons = np.array(con_clustering) for k in possible_k: mean_value = np.mean(cons[k_values == k]) if mean_value == 0: mean_value = 1 expected_cons[k] = mean_value for i in range(0, labels.shape[0]): con_clustering[i] /= expected_cons[k_values[i]] con_clustering[i] *= internals[i] if scale: scaler = preprocessing.MinMaxScaler() con_clustering = scaler.fit_transform(np.array(con_clustering)) nmis = [] for alpha in alphas: logger.debug( '-------------------------->>>>>> PARAM START <<<<<<<---------------------------------' ) cur_nmis = [] for method in ensemble_method: ensemble_labels = _ensemble_method[method]( labels, N_clusters_max=class_num, weighted=True, clustering_weights=con_clustering, cluster_level_weights=con_per_cluster, alpha=alpha) ensemble_nmi = Metrics.normalized_max_mutual_info_score( ensemble_labels, target) logger.debug(method + ' alpha=' + str(alpha) + ', NMI=' + str(ensemble_nmi)) cur_nmis.append(ensemble_nmi) nmis.append(cur_nmis) logger.debug( '------------------------->>>>>> END OF THIS PARAM <<<<<<-------------------------------' ) logger.debug( '===========================================================================================' ) return nmis
# print '--------------------' # for doo in data_selected.as_matrix(): # print doo d, t = ed.dataset['waveform']['data']() # d, t = ed.dataset['Wap']['data'](sparse_type='csr') # d, t = ed.dataset['k1b']['data']() # d, t = ed.dataset['hitech']['data']() # d, t = ed.dataset['re0']['data']() print d.shape print np.unique(t) km = cluster.KMeans(n_clusters=3) t1 = time.clock() km.fit(d) t2 = time.clock() print metrics.normalized_max_mutual_info_score(t, km.labels_) # metrics print t2 - t1 # import member_generation.subspace as sub # subd = sub.feature_sampling(d, 2000) # print d.shape # print subd.shape # data_selected, data_unselected, \ # target_selected, target_unselected = train_test_split(d, t, # train_size=500, # random_state=154) # print data_selected # print data_unselected # print target_selected # print target_unselected # print d
def run(self): """ Runs the full pipeline as configured. :return: list of run parameters and evaluation metrics. """ # TODO try/catch to ensure proper shutdown even if error encountered params = self._get_params_for_run() result_rows = [] # Check for valid configuration if self._test_docs is None and self._k_folds == 0: self._logger.error("Explicit test set or number of cross-validation folds must be specified.") metrics = Metrics() result_row = {**params, **metrics.get_scores_as_dict()} result_rows.append(result_row) return result_rows # Continue while there are configured parameter settings to evaluate while params is not None: # Get collection of training and test sets for current run data_sets = self._get_training_and_test_sets() for set_index, (training_docs, test_docs) in enumerate(data_sets): # Retrieve an encoder module trained with the specified configuration self._encoder = self._get_encoder(params) set_index += 1 # Only used for user output, so start index at 1 num_sets = len(data_sets) if num_sets > 1: self._logger.info("Training and evaluating fold {} of {}.".format(set_index, num_sets)) start = time.time() self._train_and_evaluate(params, self._encoder, training_docs, test_docs) runtime = time.time() - start self._logger.info( "Trained and evaluated fold {} of sequence model in {} seconds.".format(set_index, runtime)) # Combine run parameters with evaluation results and store result_row = {**params, **self._evaluator.get_score_as_dict()} result_rows.append(result_row) # Check if model should be saved if self._is_model_saving_enabled(): operator = self._evaluator.get_operator() current_score = self._evaluator.get_score() best_model = self._get_best_model() if best_model is not None: (best_metric, _) = best_model if not operator(best_metric, current_score): self._set_best_model(current_score, (params, self._encoder, self._sequence_learner)) else: # New model is the best one if no previous existed self._set_best_model(current_score, (params, self._encoder, self._sequence_learner)) # Invoke optimizer callback to report on results of this run if self._optimizer is not None: self._optimizer.process_run_result(params=params, score=self._evaluator.get_score_as_dict(), encoder=self._encoder, sequence_learner=self._sequence_learner) # Check if there are additional runs to execute if self._optimizer is not None: params = self._optimizer.get_next_params() else: params = None # Store best model, if configured if self._is_model_saving_enabled(): path, name = self._get_model_save_path_and_name() try: self.save(path, name) except Exception: self._logger.error("Failed to save model, clearing Keras session and trying again.") self._sequence_learner.clear_session() self.save(path, name) # Clear Keras/Tensorflow models # TODO why a second time? if self._sequence_learner is not None: self._sequence_learner.clear_session() return pd.DataFrame(result_rows)
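# --- Illustration of the result-row bookkeeping in run() (not from the original code) ---
# Each run merges its parameter dict with its metric dict into one flat row, and
# pandas turns the accumulated rows into a table with one column per key.
# The parameter and metric names below are hypothetical, for illustration only.
import pandas as pd

params = {'learning_rate': 0.01, 'epochs': 10}
scores = {'accuracy': 0.91, 'f1': 0.88}
result_rows = [{**params, **scores}]
df = pd.DataFrame(result_rows)
print(df)  # one row with columns: learning_rate, epochs, accuracy, f1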
def test_evaluate_model(report1): metrics = Metrics(report1.get_gold(), report1.get_pred()) print(metrics.bleu_score()) print(metrics.ds_score())
import member_generation.library_generation as lg
import utils.io_func as io
import utils.load_dataset as ld
import utils.settings as settings
import ensemble.Cluster_Ensembles as ce
import evaluation.Metrics as metrics

# Load a dataset. Every dataset used so far is wrapped as a loader function in
# the utils.load_dataset module. Each loader returns two values: the feature
# matrix of shape (#Instances * #Features) and the target labels.
# Some loaders have built-in options such as 0-1 normalization; check them as needed.
name = 'Iris'
d, t = ld.load_iris()

# The functions that generate clustering libraries (ensemble members) are wrapped
# in member_generation.library_generation. The main entry point is generate_library,
# which supports random-subspace member generation as well as semi-supervised members
# (currently the E2CP and COP_KMEANS constrained clustering methods are available).
# It returns the name of the library; the library itself is stored under
# Results/[dataset name]/. See the function docstring for the parameters; ask if
# anything is unclear.
# P.S. With random-subspace generation, members are produced mainly by randomly
# sampling instances and/or features; the sampling functions are wrapped in
# member_generation.subspace and called by library_generation.
lib_name = lg.generate_library(d, t, name, 10, 3)

# Read the library back by name; the result is a (#members * #instances) matrix.
lib = io.read_matrix(settings.default_library_path + name + '/' + lib_name)

# Run the consensus step; it returns the ensemble cluster labels.
ensemble_result = ce.cluster_ensembles_CSPAONLY(lib, N_clusters_max=3)

# Print the result and its NMI against the real labels.
print ensemble_result
print metrics.normalized_max_mutual_info_score(t, ensemble_result)
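# The same library can also be fed to the other consensus functions in
# ensemble.Cluster_Ensembles (HGPA and MCLA); these lines are an illustrative
# follow-up to the walkthrough above, not part of the original script.
hgpa_result = ce.cluster_ensembles_HGPAONLY(lib, N_clusters_max=3)
mcla_result = ce.cluster_ensembles_MCLAONLY(lib, N_clusters_max=3)
print metrics.normalized_max_mutual_info_score(t, hgpa_result)
print metrics.normalized_max_mutual_info_score(t, mcla_result)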
def generate_library(data, target, dataset_name, n_members, class_num, n_cluster_lower_bound=0, n_cluster_upper_bound=0, feature_sampling=1.0, sample_sampling=0.7, feature_sampling_lower_bound=0.05, sample_sampling_lower_bound=0.1, f_stable_sample=True, s_stable_sample=True, constraints_file=None, sampling_method='FSRSNC', verbose=True, path=_default_result_path, metric='nid', manifold_type='MDS', subfolder=True, generate_only=True): """ generate a single library of ensemble member. Parameters ---------- :param data: dataset in a ndarray :param target: target in a ndarray or list :param dataset_name: name of dataset :param n_members: #clusters :param class_num: #real_class :param n_cluster_lower_bound: lower bound of k :param n_cluster_upper_bound: upper bound of k :param feature_sampling: fixed sampling rate of feature, or upper bound if not stable :param sample_sampling: fixed sampling rate of instances, or upper bound if not stable :param feature_sampling_lower_bound: lower bound of sampling rate of feature, only available if not stable :param sample_sampling_lower_bound: lower bound of sampling rate of instance, only available if not stable :param f_stable_sample: stable feature sampling or not :param s_stable_sample: stable instance sampling or not :param constraints_file: name of constraint file, only available when :param sampling_method: 'FSRSNC' and 'FSRSNN' supported :param verbose: print debug info. :param path: path to store the library :param metric: used for visualization only :param manifold_type: used for visualization only :param subfolder: save library in a separated sub-folder or not. Return ------ :return: name of the library generated (the library itself will be stored as a file) """ print('start generating library for dataset:' + dataset_name) # make sure that path to store the library existing if not os.path.isdir(path): os.mkdir(path) if subfolder: savepath = path + dataset_name + '/' if not os.path.isdir(savepath): os.mkdir(savepath) else: savepath = path # we set the range of cluster number to [k, 10k] if not defined if n_cluster_lower_bound == 0 or n_cluster_upper_bound == 0: n_cluster_lower_bound = class_num n_cluster_upper_bound = class_num * 10 # get sampling method, if not exist, it will raise a exception if sampling_method in _sampling_methods.keys(): is_constrained = False elif sampling_method in _constrained_methods.keys(): is_constrained = True else: raise ValueError('ensemble generation : Method should be set properly.') # read constraints file if existing if constraints_file is not None: mlset, nlset = io_func.read_constraints(constraints_file) else: if is_constrained: raise Exception('ensemble generation : Constrained Member must be with a constraints file.') constraints_file = '' mlset = [] nlset = [] # lower bound of sampling rate (use only if 'stable' set to be false) if feature_sampling_lower_bound > feature_sampling: feature_sampling_lower_bound = feature_sampling / 2 if sample_sampling_lower_bound > sample_sampling: sample_sampling_lower_bound = sample_sampling / 2 # there should be at least 2 clusters in the clustering if n_cluster_lower_bound < 2: n_cluster_lower_bound = 2 if n_cluster_upper_bound < n_cluster_lower_bound: n_cluster_upper_bound = n_cluster_lower_bound # path and filename to write the file filename = _get_file_name(dataset_name, n_cluster_lower_bound, n_cluster_upper_bound, feature_sampling, feature_sampling_lower_bound, sample_sampling, sample_sampling_lower_bound, n_members, f_stable_sample, s_stable_sample, sampling_method, 
is_constraint_method=is_constrained, constraint_file=constraints_file) # we won't generate the library with same sampling rate and size if existing if os.path.isfile(savepath + filename + '.res'): print ('[Library Generation] : library already exists.') return filename+'.res' elif os.path.isfile(savepath + filename + '_pure.res'): print ('[Library Generation] : corresponding pure library already exists.') return filename+'_pure.res' tag = True # matrix to store clustering results mat = np.empty(data.shape[0]) # generate ensemble members for i in range(0, n_members): # determine k randomly cluster_num = np.random.randint(n_cluster_lower_bound, n_cluster_upper_bound + 1) random_state = np.random.randint(0, _INT_MAX - 1) cur_feature_sampling = feature_sampling cur_sample_sampling = sample_sampling if not f_stable_sample: cur_feature_sampling = rand.uniform(feature_sampling_lower_bound, feature_sampling) if not s_stable_sample: cur_sample_sampling = rand.uniform(sample_sampling_lower_bound, sample_sampling) print('For this base clustering, cluster number is ' + str(cluster_num)) # generate ensemble member by given method if sampling_method == 'Cop_KMeans': result = _constrained_methods[sampling_method](data, cluster_num, mlset, nlset) elif sampling_method == 'E2CP': e2cp = _constrained_methods[sampling_method](data=data, ml=mlset, cl=nlset, n_clusters=cluster_num) e2cp.fit_constrained() result = e2cp.labels else: result = _sampling_methods[sampling_method](data, target, r_clusters=cluster_num, r_state=random_state, fsr=cur_feature_sampling, ssr=cur_sample_sampling) # print diversity diver = Metrics.normalized_max_mutual_info_score(result, target) if verbose: print ('Base clustering' + str(i) + ' nmi_max between real labels = ' + str(diver)) # stack the result into the matrix if tag: mat = np.array(result) mat = np.reshape(mat, (1, data.shape[0])) tag = False else: temp = np.array(result) temp = np.reshape(temp, (1, data.shape[0])) mat = np.vstack([mat, np.array(temp)]) # change element type to int for consensus mat = mat.astype(int) if generate_only or is_constrained: np.savetxt(savepath + filename + '_pure' + '.res', mat, fmt='%d', delimiter=',') return filename+'_pure.res' # single k-means model, for comparison clf = cluster.KMeans(n_clusters=class_num) clf.fit(data) kmlabels = clf.labels_ # do consensus labels_CSPA = ce.cluster_ensembles_CSPAONLY(mat, N_clusters_max=class_num) labels_HGPA = ce.cluster_ensembles_HGPAONLY(mat, N_clusters_max=class_num) labels_MCLA = ce.cluster_ensembles_MCLAONLY(mat, N_clusters_max=class_num) # put consensus results into the matrix mat = np.vstack([mat, np.reshape(kmlabels, (1, data.shape[0]))]) mat = np.vstack([mat, np.reshape(labels_CSPA, (1, data.shape[0]))]) mat = np.vstack([mat, np.reshape(labels_HGPA, (1, data.shape[0]))]) mat = np.vstack([mat, np.reshape(labels_MCLA, (1, data.shape[0]))]) # put real labels into the matrix temp = np.reshape(target, (1, data.shape[0])) mat = np.vstack([mat, np.array(temp)]) print ('Dataset ' + dataset_name + ', consensus finished, saving...') # write results to external file, use %d to keep integer part only np.savetxt(savepath + filename + '.res', mat, fmt='%d', delimiter=',') # print labels and diversities (between the real labels) nmi_CSPA = Metrics.normalized_max_mutual_info_score(labels_CSPA, target) nmi_HGPA = Metrics.normalized_max_mutual_info_score(labels_HGPA, target) nmi_MCLA = Metrics.normalized_max_mutual_info_score(labels_MCLA, target) print ('consensus NMI (CSPA) =' + str(nmi_CSPA)) print ('consensus NMI 
(HGPA) =' + str(nmi_HGPA)) print ('consensus NMI (MCLA) =' + str(nmi_MCLA)) kmnmi = Metrics.normalized_max_mutual_info_score(kmlabels, target) print ('single-model diversity (K-means) =' + str(kmnmi)) # save performances perf = np.array([nmi_CSPA, nmi_HGPA, nmi_MCLA, kmnmi]) np.savetxt(savepath + filename + '_performance.txt', perf, fmt='%.6f', delimiter=',') if metric == 'diversity': distance_matrix = Metrics.diversityMatrix(mat) np.savetxt(savepath + filename + '_diversity.txt', distance_matrix, delimiter=',') else: distance_matrix = Metrics.NIDMatrix(mat) np.savetxt(savepath + filename + '_nid.txt', distance_matrix, delimiter=',') if manifold_type == 'MDS': # transform distance matrix into 2-d or 3-d coordinates to visualize mds2d = manifold.MDS(n_components=2, max_iter=10000, eps=1e-12, dissimilarity='precomputed') mds3d = manifold.MDS(n_components=3, max_iter=10000, eps=1e-12, dissimilarity='precomputed') pos2d = mds2d.fit(distance_matrix).embedding_ pos3d = mds3d.fit(distance_matrix).embedding_ np.savetxt(savepath + filename + '_mds2d.txt', pos2d, fmt="%.6f", delimiter=',') np.savetxt(savepath + filename + '_mds3d.txt', pos3d, fmt="%.6f", delimiter=',') # draw odm, k distribution and nmi distribution cv.plot_ordered_distance_matrix(distance_matrix, savepath + filename + '_original_distance.png', savepath + filename + '_odm.png') cv.plot_k_distribution(mat, pos2d, savepath + filename+'_k_distribution.png') cv.plot_nmi_max(mat, pos2d, savepath + filename + '_nmimax_distribution.png') # consistencies are calculated while constraints file exists. if constraints_file != '': cv.plot_consistency(mat, pos2d, mlset, nlset, savepath + filename+'_consistency_both.png', consistency_type='both') cv.plot_consistency(mat, pos2d, mlset, nlset, savepath + filename+'_consistency_must.png', consistency_type='must') cv.plot_consistency(mat, pos2d, mlset, nlset, savepath + filename+'_consistency_cannot.png', consistency_type='cannot') cv.plt_consistency_corelation_with_k(mat, mlset, nlset, savepath + filename+'_normalized.png') return
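# --- Usage sketch for generate_library (illustrative, not part of the original module) ---
# A minimal call that builds a small random-subspace library. The sklearn iris
# loader is used purely as a stand-in dataset and the parameter values are
# placeholders; only generate_library and its keyword arguments come from the
# function defined above.
if __name__ == '__main__':
    from sklearn import datasets

    iris = datasets.load_iris()
    lib_file = generate_library(iris.data, iris.target, 'Iris_demo',
                                n_members=10, class_num=3,
                                feature_sampling=0.8, sample_sampling=0.7,
                                sampling_method='FSRSNC', generate_only=True)
    print('Generated library file: ' + lib_file)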