예제 #1
0
def main(args):
    """Run the DVRL data valuation experiment.

    Steps: load the tabular data with a noise rate of 0.0, preprocess it,
    train DVRL with a small Keras multi-layer perceptron as the predictor,
    print the highest / lowest valued training samples, and evaluate the
    performance after removing high / low valued samples with LightGBM.

    Args:
        args: object exposing the attributes read below: data_name, train_no,
            valid_no, normalization ('minmax' or 'standard'), hidden_dim,
            comb_dim, iterations, inner_iterations, layer_number,
            learning_rate, batch_size, batch_size_predictor, n_exp (number
            of examples to display) and checkpoint_file_name.

    Returns:
        None. Results are printed (and plotted by the metrics helper).
    """
    # Data loading and sample corruption
    data_name = args.data_name

    # The number of training and validation samples
    dict_no = {'train': args.train_no, 'valid': args.valid_no}

    # Network parameters handed to DVRL
    parameters = {
        'hidden_dim': args.hidden_dim,
        'comb_dim': args.comb_dim,
        'iterations': args.iterations,
        'activation': tf.nn.relu,
        'inner_iterations': args.inner_iterations,
        'layer_number': args.layer_number,
        'learning_rate': args.learning_rate,
        'batch_size': args.batch_size,
        'batch_size_predictor': args.batch_size_predictor,
    }

    # The number of examples to display
    n_exp = args.n_exp

    # Checkpoint file name
    checkpoint_file_name = args.checkpoint_file_name

    # Data loading. The third argument is the noise rate; 0.0 is passed here
    # and the returned value is not needed for this experiment.
    _ = data_loading.load_tabular_data(data_name, dict_no, 0.0)

    print('Finished data loading.')

    # Data preprocessing
    # Normalization methods: 'minmax' or 'standard'
    normalization = args.normalization

    # Extracts features and labels. Then, normalizes features.
    # NOTE(review): the csv files are presumably written by
    # load_tabular_data above -- confirm against data_loading.
    x_train, y_train, x_valid, y_valid, x_test, y_test, col_names = \
        data_loading.preprocess_data(normalization, 'train.csv',
                                     'valid.csv', 'test.csv')

    print('Finished data preprocess.')

    # Run DVRL
    # Resets the graph and the Keras session before building new models
    tf.reset_default_graph()
    keras.backend.clear_session()

    # Here, we assume a classification problem and we assume a predictor model
    # in the form of a simple multi-layer perceptron (two hidden ReLU layers
    # and a 2-unit softmax output).
    problem = 'classification'
    hidden_units = parameters['hidden_dim']
    pred_model = keras.models.Sequential()
    pred_model.add(keras.layers.Dense(hidden_units, activation='relu'))
    pred_model.add(keras.layers.Dense(hidden_units, activation='relu'))
    pred_model.add(keras.layers.Dense(2, activation='softmax'))
    pred_model.compile(optimizer='adam',
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])

    # Flags for using stochastic gradient descent / pre-trained model
    flags = {'sgd': True, 'pretrain': False}

    # Initializes DVRL
    dvrl_class = dvrl.Dvrl(x_train, y_train, x_valid, y_valid, problem,
                           pred_model, parameters, checkpoint_file_name, flags)

    # Trains DVRL ('auc' is forwarded to train_dvrl as its metric argument)
    dvrl_class.train_dvrl('auc')

    print('Finished dvrl training.')

    # Outputs
    # Data valuation of every training sample
    dve_out = dvrl_class.data_valuator(x_train, y_train)

    print('Finished data valuation.')

    # Evaluations
    # 1. Data valuation
    # Sort training samples from the highest to the lowest estimated value.
    sorted_idx = np.argsort(-dve_out)
    sorted_x_train = x_train[sorted_idx]

    # Clamp the number of displayed examples to what is available. Without
    # this, n_exp == 0 makes the `-n_exp:` slice return every row and
    # n_exp > n_train yields fewer rows than range(n_exp); both cases make
    # the DataFrame construction fail on an index/data length mismatch.
    n_train = len(sorted_idx)
    n_show = max(0, min(n_exp, n_train))
    low_start = n_train - n_show

    # Indices of top n high valued samples
    # (DataFrame.head() prints at most the first five rows.)
    print('Indices of top ' + str(n_show) + ' high valued samples: ' +
          str(sorted_idx[:n_show]))
    print(
        pd.DataFrame(data=sorted_x_train[:n_show, :],
                     index=range(n_show),
                     columns=col_names).head())

    # Indices of top n low valued samples
    print('Indices of top ' + str(n_show) + ' low valued samples: ' +
          str(sorted_idx[low_start:]))
    print(
        pd.DataFrame(data=sorted_x_train[low_start:, :],
                     index=range(n_show),
                     columns=col_names).head())

    # 2. Performance after removing high/low values
    # Here, as the evaluation model, we use LightGBM.
    eval_model = lightgbm.LGBMClassifier()

    # Performance after removing high/low values (result is only plotted)
    _ = dvrl_metrics.remove_high_low(dve_out,
                                     eval_model,
                                     x_train,
                                     y_train,
                                     x_valid,
                                     y_valid,
                                     x_test,
                                     y_test,
                                     'accuracy',
                                     plot=True)

    return
def main(args):
    """Run the DVRL corrupted sample discovery experiment.

    Steps: load the tabular data while corrupting labels at the requested
    noise rate, preprocess it, train DVRL with logistic regression as the
    predictor, then evaluate (1) DVRL-weighted learning, (2) performance
    after removing high / low valued samples and (3) corrupted sample
    discovery (only when the noise rate is positive).

    Args:
        args: object exposing the attributes read below: data_name, train_no,
            valid_no, noise_rate, normalization ('minmax' or 'standard'),
            hidden_dim, comb_dim, iterations, layer_number, batch_size,
            learning_rate and checkpoint_file_name.

    Returns:
        None. Results are printed (and plotted by the metrics helpers).
    """
    # Data loading and sample corruption
    data_name = args.data_name

    # The number of training and validation samples
    dict_no = {'train': args.train_no, 'valid': args.valid_no}

    # Additional noise ratio
    noise_rate = args.noise_rate

    # Checkpoint file name
    checkpoint_file_name = args.checkpoint_file_name

    # Data loading and label corruption
    # noise_idx: ground truth noisy label indices
    noise_idx = data_loading.load_tabular_data(data_name, dict_no, noise_rate)

    print('Finished data loading.')

    # Data preprocessing
    # Normalization methods: 'minmax' or 'standard'
    normalization = args.normalization

    # Extracts features and labels. Then, normalizes features.
    # The last returned element is not needed here and is discarded.
    x_train, y_train, x_valid, y_valid, x_test, y_test, _ = \
        data_loading.preprocess_data(normalization, 'train.csv',
                                     'valid.csv', 'test.csv')

    print('Finished data preprocess.')

    # Run DVRL
    # Resets the graph
    tf.reset_default_graph()

    # Network parameters handed to DVRL
    parameters = {
        'hidden_dim': args.hidden_dim,
        'comb_dim': args.comb_dim,
        'activation': tf.nn.relu,
        'iterations': args.iterations,
        'layer_number': args.layer_number,
        'batch_size': args.batch_size,
        'learning_rate': args.learning_rate,
    }

    # In this example, we consider a classification problem and we use Logistic
    # Regression as the predictor model.
    problem = 'classification'
    pred_model = linear_model.LogisticRegression(solver='lbfgs')

    # Flags for using stochastic gradient descent / pre-trained model
    flags = {'sgd': False, 'pretrain': False}

    # Initializes DVRL
    dvrl_class = dvrl.Dvrl(x_train, y_train, x_valid, y_valid, problem,
                           pred_model, parameters, checkpoint_file_name, flags)

    # Trains DVRL ('auc' is forwarded to train_dvrl as its metric argument)
    dvrl_class.train_dvrl('auc')

    print('Finished dvrl training.')

    # Outputs
    # Data valuation of every training sample
    dve_out = dvrl_class.data_valuator(x_train, y_train)

    # Message fixed from 'date' to 'data' (typo in the progress output).
    print('Finished data valuation.')

    # Evaluations
    # Evaluation model
    eval_model = lightgbm.LGBMClassifier()

    # 1. Robust learning (DVRL-weighted learning)
    robust_perf = dvrl_metrics.learn_with_dvrl(dve_out, eval_model, x_train,
                                               y_train, x_valid, y_valid,
                                               x_test, y_test, 'accuracy')

    print('DVRL-weighted learning performance: ' +
          str(np.round(robust_perf, 4)))

    # 2. Performance after removing high/low values (result is only plotted)
    _ = dvrl_metrics.remove_high_low(dve_out,
                                     eval_model,
                                     x_train,
                                     y_train,
                                     x_valid,
                                     y_valid,
                                     x_test,
                                     y_test,
                                     'accuracy',
                                     plot=True)

    # 3. Corrupted sample discovery
    # Only meaningful when labels were actually corrupted, i.e. when
    # noise_rate is a positive value.
    if noise_rate > 0:
        # Evaluates corrupted_sample_discovery
        # and plot corrupted sample discovery results
        _ = dvrl_metrics.discover_corrupted_sample(dve_out,
                                                   noise_idx,
                                                   noise_rate,
                                                   plot=True)