def main(args): """Main function of DVRL for data valuation experiment. Args: args: data_name, train_no, valid_no, normalization, network parameters, number of examples """ # Data loading and sample corruption data_name = args.data_name # The number of training and validation samples dict_no = dict() dict_no['train'] = args.train_no dict_no['valid'] = args.valid_no # Network parameters parameters = dict() parameters['hidden_dim'] = args.hidden_dim parameters['comb_dim'] = args.comb_dim parameters['iterations'] = args.iterations parameters['activation'] = tf.nn.relu parameters['inner_iterations'] = args.inner_iterations parameters['layer_number'] = args.layer_number parameters['learning_rate'] = args.learning_rate parameters['batch_size'] = args.batch_size parameters['batch_size_predictor'] = args.batch_size_predictor # The number of examples n_exp = args.n_exp # Checkpoint file name checkpoint_file_name = args.checkpoint_file_name # Data loading _ = data_loading.load_tabular_data(data_name, dict_no, 0.0) print('Finished data loading.') # Data preprocessing # Normalization methods: 'minmax' or 'standard' normalization = args.normalization # Extracts features and labels. Then, normalizes features x_train, y_train, x_valid, y_valid, x_test, y_test, col_names = \ data_loading.preprocess_data(normalization, 'train.csv', 'valid.csv', 'test.csv') print('Finished data preprocess.') # Run DVRL # Resets the graph tf.reset_default_graph() keras.backend.clear_session() # Here, we assume a classification problem and we assume a predictor model # in the form of a simple multi-layer perceptron. problem = 'classification' # Predictive model define pred_model = keras.models.Sequential() pred_model.add( keras.layers.Dense(parameters['hidden_dim'], activation='relu')) pred_model.add( keras.layers.Dense(parameters['hidden_dim'], activation='relu')) pred_model.add(keras.layers.Dense(2, activation='softmax')) pred_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) # Flags for using stochastic gradient descent / pre-trained model flags = {'sgd': True, 'pretrain': False} # Initializes DVRL dvrl_class = dvrl.Dvrl(x_train, y_train, x_valid, y_valid, problem, pred_model, parameters, checkpoint_file_name, flags) # Trains DVRL dvrl_class.train_dvrl('auc') print('Finished dvrl training.') # Outputs # Data valuation dve_out = dvrl_class.data_valuator(x_train, y_train) print('Finished data valuation.') # Evaluations # 1. Data valuation # Data valuation sorted_idx = np.argsort(-dve_out) sorted_x_train = x_train[sorted_idx] # Indices of top n high valued samples print('Indices of top ' + str(n_exp) + ' high valued samples: ' + str(sorted_idx[:n_exp])) print( pd.DataFrame(data=sorted_x_train[:n_exp, :], index=range(n_exp), columns=col_names).head()) # Indices of top n low valued samples print('Indices of top ' + str(n_exp) + ' low valued samples: ' + str(sorted_idx[-n_exp:])) print( pd.DataFrame(data=sorted_x_train[-n_exp:, :], index=range(n_exp), columns=col_names).head()) # 2. Performance after removing high/low values # Here, as the evaluation model, we use LightGBM. eval_model = lightgbm.LGBMClassifier() # Performance after removing high/low values _ = dvrl_metrics.remove_high_low(dve_out, eval_model, x_train, y_train, x_valid, y_valid, x_test, y_test, 'accuracy', plot=True) return
def main(args): """Main function of DVRL for corrupted sample discovery experiment. Args: args: data_name, train_no, valid_no, noise_rate, normalization, network parameters """ # Data loading and sample corruption data_name = args.data_name # The number of training and validation samples dict_no = dict() dict_no['train'] = args.train_no dict_no['valid'] = args.valid_no # Additional noise ratio noise_rate = args.noise_rate # Checkpoint file name checkpoint_file_name = args.checkpoint_file_name # Data loading and label corruption noise_idx = data_loading.load_tabular_data(data_name, dict_no, noise_rate) # noise_idx: ground truth noisy label indices print('Finished data loading.') # Data preprocessing # Normalization methods: 'minmax' or 'standard' normalization = args.normalization # Extracts features and labels. Then, normalizes features x_train, y_train, x_valid, y_valid, x_test, y_test, _ = \ data_loading.preprocess_data(normalization, 'train.csv', 'valid.csv', 'test.csv') print('Finished data preprocess.') # Run DVRL # Resets the graph tf.reset_default_graph() # Network parameters parameters = dict() parameters['hidden_dim'] = args.hidden_dim parameters['comb_dim'] = args.comb_dim parameters['activation'] = tf.nn.relu parameters['iterations'] = args.iterations parameters['layer_number'] = args.layer_number parameters['batch_size'] = args.batch_size parameters['learning_rate'] = args.learning_rate # In this example, we consider a classification problem and we use Logistic # Regression as the predictor model. problem = 'classification' pred_model = linear_model.LogisticRegression(solver='lbfgs') # Flags for using stochastic gradient descent / pre-trained model flags = {'sgd': False, 'pretrain': False} # Initalizes DVRL dvrl_class = dvrl.Dvrl(x_train, y_train, x_valid, y_valid, problem, pred_model, parameters, checkpoint_file_name, flags) # Trains DVRL dvrl_class.train_dvrl('auc') print('Finished dvrl training.') # Outputs # Data valuation dve_out = dvrl_class.data_valuator(x_train, y_train) print('Finished date valuation.') # Evaluations # Evaluation model eval_model = lightgbm.LGBMClassifier() # 1. Robust learning (DVRL-weighted learning) robust_perf = dvrl_metrics.learn_with_dvrl(dve_out, eval_model, x_train, y_train, x_valid, y_valid, x_test, y_test, 'accuracy') print('DVRL-weighted learning performance: ' + str(np.round(robust_perf, 4))) # 2. Performance after removing high/low values _ = dvrl_metrics.remove_high_low(dve_out, eval_model, x_train, y_train, x_valid, y_valid, x_test, y_test, 'accuracy', plot=True) # 3. Corrupted sample discovery # If noise_rate is positive value. if noise_rate > 0: # Evaluates corrupted_sample_discovery # and plot corrupted sample discovery results _ = dvrl_metrics.discover_corrupted_sample(dve_out, noise_idx, noise_rate, plot=True)