Python get_preprocessed示例，preprocess.get_preprocessed Python示例

示例#1

0

显示文件

def test_model(save_path, split, test_on_all_data):
    """Average saved LightGBM models on the test set and write a submission.

    One booster is loaded per fold of the selected split, plus one extra
    booster (index ``num_folds``) when ``test_on_all_data`` is truthy. The
    per-model test predictions are averaged with equal weight and stored in
    the ``time_to_failure`` column of the sample submission, which is then
    saved under a timestamped file name.

    Args:
        save_path: Prefix of the saved model files. Each model is read from
            ``'<save_path>-<split>-<fold description>.txt'``.
        split: Index into the result of ``preprocess.train_val_split``; it is
            also part of the model file name.
        test_on_all_data: When truthy, one additional model is included in
            the average (presumably the model fit on all data - confirm
            against ``get_fold_description``).
    """
    # Only the feature matrix is needed here; the other two outputs are
    # discarded.
    x_test, _, _ = preprocess.get_preprocessed(
        'test', remove_overlap_chunks=True, scale=NORMALIZE_FEATURES)
    folds_per_split = preprocess.train_val_split(remove_overlap_chunks=True)
    num_folds = len(folds_per_split[split])
    model_count = num_folds + int(test_on_all_data)

    # One column of predictions per model.
    per_model_preds = np.zeros((x_test.shape[0], model_count))
    for model_id in range(model_count):
        print("Making test predictions {} of {}".format(
            model_id + 1, model_count))
        booster_file = '{}-{}-{}.txt'.format(
            save_path, split, get_fold_description(model_id, num_folds))
        booster = lgb.Booster(model_file=booster_file)
        per_model_preds[:, model_id] = booster.predict(x_test)

    # Equal-weight ensemble over the model axis.
    averaged_preds = np.mean(per_model_preds, 1)

    # Fill in the sample submission and store it under a timestamped name.
    submission = pd.read_csv(
        '/home/tom/Kaggle/LANL/Data/sample_submission.csv')
    submission.time_to_failure = averaged_preds
    timestamp = datetime.datetime.now().strftime('%y-%m-%d-%H-%M')
    submission.to_csv(
        '/home/tom/Kaggle/LANL/Submissions/' + timestamp + '.csv',
        index=False)

示例#2

0

显示文件

文件： train_valid_test_lightgbm_sequential.py 项目： ttvand/Kaggle-LANL

def train_models(save_path,
                 hyperpars,
                 overwrite_train,
                 train_on_all_data,
                 remove_overlap_chunks,
                 train_all_previous,
                 target_quantile,
                 train_last_six_complete=False):
    """Fit one model per ordered fold, optionally plus one extra model.

    The training data and the ordered train/validation split are obtained
    from ``preprocess``; the actual fitting is delegated to ``fit_model``,
    which is called once for every model index.

    Args:
        save_path: Forwarded to ``fit_model``.
        hyperpars: Forwarded to ``fit_model``.
        overwrite_train: Forwarded to ``fit_model``.
        train_on_all_data: When truthy, one additional model index
            (``num_folds``) is passed to ``fit_model``.
        remove_overlap_chunks: Forwarded to both ``preprocess`` calls.
        train_all_previous: Forwarded to ``preprocess.train_val_split``.
        target_quantile: Forwarded to both ``preprocess`` calls and to
            ``fit_model``.
        train_last_six_complete: Forwarded to both ``preprocess`` calls.
    """
    # Options that must be identical for the data and for the split.
    shared_options = dict(
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        target_quantile=target_quantile,
        train_last_six_complete=train_last_six_complete)

    features, other_features, targets = preprocess.get_preprocessed(
        'train',
        remove_overlap_chunks,
        scale=NORMALIZE_FEATURES,
        **shared_options)

    train_val_split = preprocess.train_val_split(
        remove_overlap_chunks,
        ordered=True,
        num_folds=NUM_FOLDS,
        train_all_previous=train_all_previous,
        **shared_options)

    num_folds = len(train_val_split)
    model_count = num_folds + int(train_on_all_data)
    for model_id in range(model_count):
        print('\nProcessing fold {} of {}'.format(model_id + 1, model_count))
        fit_model(save_path, model_id, num_folds, train_val_split, hyperpars,
                  overwrite_train, features, other_features, targets,
                  target_quantile)

示例#3

0

显示文件

def validate_models(save_path, splits, remove_overlap_chunks):
    """Compute the out-of-fold (OOF) MAE of the saved models for each split.

    For every split, ``get_fold_mae`` is called once per fold and returns the
    fold MAE together with the number of OOF rows it was computed on. The
    split MAE is the mean of the fold MAEs weighted by those row counts.

    Args:
        save_path: Forwarded to ``get_fold_mae``.
        splits: Iterable of split identifiers; each one is forwarded to
            ``get_fold_mae``.
        remove_overlap_chunks: Forwarded to both ``preprocess`` calls.

    Returns:
        A list with one ``(split_maes, split)`` tuple per split, where
        ``split_maes`` is ``[split_mae, (fold_mae, oof_count), ...]``.
    """
    (features, _, targets) = preprocess.get_preprocessed(
        'train', remove_overlap_chunks, scale=NORMALIZE_FEATURES)

    train_val_split = preprocess.train_val_split(remove_overlap_chunks)
    # The number of folds is read from the first split.
    num_folds = len(train_val_split[0])
    valid_maes = []
    for split_id, split in enumerate(splits):
        print('Processing split {} of {}'.format(split_id + 1, len(splits)))
        fold_results = []
        for fold in range(num_folds):
            print('Processing fold {} of {}'.format(fold + 1, num_folds))
            fold_mae, oof_count = get_fold_mae(save_path, split, fold,
                                               num_folds, train_val_split,
                                               features, targets)
            fold_results.append((fold_mae, oof_count))

        # Weighted mean of the fold MAEs. The denominator is the sum of the
        # weights actually used (the OOF row counts) rather than the total
        # number of feature rows, so the result stays a true weighted mean
        # even when the validation folds do not cover every row exactly once.
        weighted_error = np.array(
            [fold_mae * oof_count for (fold_mae, oof_count) in fold_results
             ]).sum()
        total_oof_count = np.array(
            [oof_count for (_, oof_count) in fold_results]).sum()
        split_mae = weighted_error / total_oof_count

        print('Split OOF MAE: {}'.format(np.round(split_mae, 3)))
        valid_maes.append(([split_mae] + fold_results, split))
    return valid_maes

示例#4

0

显示文件

def test_model(save_path, split, model_on_all_data):
    """Average the saved models' test predictions and write a submission.

    The scaled test features are reshaped with ``utils.reshape_time_dim`` and
    passed to ``make_predictions`` once per model: one model per fold of the
    selected split, plus one extra model (index ``num_folds``) when
    ``model_on_all_data`` is truthy. The equal-weight average is stored in
    the ``time_to_failure`` column of the sample submission, which is saved
    under a timestamped file name.

    Args:
        save_path: Forwarded to ``make_predictions``.
        split: Index into the result of ``preprocess.train_val_split``; also
            forwarded to ``make_predictions``.
        model_on_all_data: When truthy, one additional model is included in
            the average.
    """
    # Only the feature matrix is needed; the other outputs are discarded.
    x_test, _, _ = preprocess.get_preprocessed(
        'test', remove_overlap_chunks=True, scale=True)
    num_test = x_test.shape[0]

    # The helper also expects targets and row ids; zeros and a plain row
    # range are supplied and the second return value is ignored.
    x_test_reshaped, _ = utils.reshape_time_dim(
        x_test, np.zeros_like(x_test), np.arange(num_test))

    folds_per_split = preprocess.train_val_split(remove_overlap_chunks=True)
    num_folds = len(folds_per_split[split])
    model_count = num_folds + int(model_on_all_data)

    # One column of predictions per model.
    per_model_preds = np.zeros((num_test, model_count))
    for model_id in range(model_count):
        print("Making test predictions {} of {}".format(
            model_id + 1, model_count))
        per_model_preds[:, model_id] = make_predictions(
            save_path, split, model_id, num_folds, x_test_reshaped)

    # Equal-weight ensemble over the model axis.
    averaged_preds = np.mean(per_model_preds, 1)

    # Fill in the sample submission and store it under a timestamped name.
    submission = pd.read_csv(
        '/home/tom/Kaggle/LANL/Data/sample_submission.csv')
    submission.time_to_failure = averaged_preds
    timestamp = datetime.datetime.now().strftime('%y-%m-%d-%H-%M')
    submission.to_csv(
        '/home/tom/Kaggle/LANL/Submissions/' + timestamp + '.csv',
        index=False)

示例#5

0

显示文件

文件： train_valid_test_cpc_main.py 项目： ttvand/Kaggle-LANL

def valid_order(model_path, split, data_folder):
  """Compute an order-score matrix for the validation files and save it.

  Validation rows are selected from ``TRAIN_DATA`` for eleven consecutive
  earthquake ids, starting at the earthquake of the first validation range of
  the first fold of ``split``. The saved model is then evaluated on batches
  produced by ``utils.generator_cpc_main_batch_test`` and the reduced
  predictions are stored in a ``(models, files, files)`` array that is written
  to ``data_folder + 'valid_order_probs.npy'``.

  NOTE(review): ``hyperpars``, ``TRAIN_DATA`` and ``ENCODER_PATH`` are neither
  parameters nor locals of this function; they are read from the enclosing
  (presumably module) scope.

  Args:
    model_path: Prefix of the saved model file
      ``'<model_path>-<split>-<fold description>.h5'``; also forwarded to
      ``make_predictions``.
    split: Index into the result of ``preprocess.train_val_split_gaps``; also
      part of the model file name.
    data_folder: Path prefix for the output file; it is concatenated directly
      with the file name, so it needs a trailing separator.
  """
  train_val_split = preprocess.train_val_split_gaps()
  num_folds = len(train_val_split[0])
  # Only a single model (fold index 0) is evaluated.
  num_train_models = 1
  
  # Set the validation data to the first eleven validation earthquake ids since
  # the first validation earthquake
  first_val_ranges = train_val_split[split][0][1] # First fold, validation
  # Element [1] of the preprocessed output is the table holding the
  # ``start_row`` and ``eq_id`` columns used below.
  other_train_features = preprocess.get_preprocessed(
      'train', remove_overlap_chunks=True)[1]
  # Earthquake id of the chunk whose start row equals the start of the first
  # validation range.
  first_eq_id = other_train_features.eq_id.values[np.where(
      other_train_features.start_row.values == first_val_ranges[0][0])[0][0]]
  eq_ids = TRAIN_DATA.notrain_eq_id.values
  # Rows whose earthquake id is in [first_eq_id, first_eq_id + 11).
  valid_rows = np.where(np.logical_and(eq_ids >= first_eq_id,
                                       eq_ids < (first_eq_id + 11)))[0]
  VALID_DATA = TRAIN_DATA.iloc[valid_rows]
  # Row count divided by (150000 / block_steps), truncated to an int.
  # Presumably each row covers ``block_steps`` steps, so this is the number of
  # complete 150000-step files - TODO confirm.
  num_valid_files = int(VALID_DATA.shape[0]/(150000/hyperpars['block_steps']))
  
  order_probs = np.zeros((num_train_models, num_valid_files, num_valid_files))
  
  for fold in range(num_train_models):
    print('\nProcessing fold {} of {}'.format(fold+1, num_train_models))
    # Presumably resets the Keras backend state before loading models.
    K.clear_session()
    encoder_model = load_model(ENCODER_PATH, custom_objects={
        'Attention': models.Attention})
    # Number of rows of ``order_probs`` filled per iteration.
    comp_rows_per_it = 4
    
    fold_description = get_fold_description(fold, num_folds)
    fold_model_path = '{}-{}-{}.h5'.format(model_path, split, fold_description)
    model = load_model(fold_model_path, custom_objects={
              'Attention': models.Attention,
              'GradientReversal': models.GradientReversal,
              'tf': tf,
              })
    
    # Files beyond the last full group of ``comp_rows_per_it`` keep their
    # zero rows in ``order_probs``.
    num_iterations = int(num_valid_files/comp_rows_per_it)
    for i in range(num_iterations):
      gc.collect()
      print('\nIteration {} of {}'.format(i+1, num_iterations))
      first_test_id = int(comp_rows_per_it*i)
      test_gen = utils.generator_cpc_main_batch_test(
          VALID_DATA, hyperpars, encoder_model, first_test_id=first_test_id)
    
      # Generate the test data by calling the generator *N* times
      # Evaluated as (num_valid_files / 4) * comp_rows_per_it, which equals
      # num_valid_files while comp_rows_per_it is 4.
      N = int(num_valid_files/4*comp_rows_per_it)
      test_data = list(itertools.islice(test_gen, N))
      # The split/fold arguments are placeholders (-1); the already loaded
      # model is passed explicitly.
      test_preds = make_predictions(model_path, split=-1, fold=-1,
                                    num_folds=-1, data=test_data, model=model)
      # Take output [3], average the first four entries of its last axis, then
      # average again over consecutive groups of four.
      order_preds = test_preds[3][:, :, :4].mean(-1).reshape(
          [test_preds[3].shape[0], -1, 4]).mean(-1)
      order_probs[fold, first_test_id:(first_test_id+comp_rows_per_it)] = (
          order_preds.reshape([comp_rows_per_it, -1]))
      
    # Written inside the fold loop, so the file is rewritten after every fold.
    save_path = data_folder + 'valid_order_probs.npy'
    np.save(save_path, order_probs) # np.load(save_path)

示例#6

0

显示文件

def validate_save_gap_preds(model_path, save_path, split, hyperpars):
    """Predict gaps on the first validation earthquakes and save them as CSV.

    The rows of ``GAP_DATA`` that belong to the first eleven earthquakes of
    the first validation fold of ``split`` are selected, the tail of each
    earthquake is dropped, and the remainder is cut into files of 150000
    steps. Predictions from ``make_predictions`` are aligned with
    ``utils.align_test_gap_preds`` and written to
    ``save_path + '_aligned_predictions_valid.csv'``.

    Args:
        model_path: Forwarded to ``make_predictions``.
        save_path: Prefix of the output CSV path.
        split: Index into the result of ``preprocess.train_val_split_gaps``;
            also forwarded to ``make_predictions``.
        hyperpars: Mapping that must contain ``'block_steps'``; also
            forwarded to the ``utils`` helpers.
    """
    eq_count = 11
    steps_per_file = 150000

    # Row range spanned by the first ``eq_count`` validation earthquakes.
    gap_split = preprocess.train_val_split_gaps()
    val_ranges = gap_split[split][0][1]  # first fold, validation part
    chunk_info = preprocess.get_preprocessed('train',
                                             remove_overlap_chunks=True)[1]
    chunk_start_rows = chunk_info.start_row.values
    chunk_eq_ids = chunk_info.eq_id.values

    first_val_row = val_ranges[0][0]
    first_chunk = np.where(chunk_start_rows == first_val_row)[0][0]
    first_eq = chunk_eq_ids[first_chunk]
    end_chunk = np.where(chunk_eq_ids == first_eq + eq_count)[0][0]
    keep_rows = np.arange(first_val_row, chunk_start_rows[end_chunk])

    # Drop the tail of every earthquake: the rows between one file length
    # past the start of the chunk two positions before the earthquake
    # boundary and the start of the next earthquake.
    eq_boundaries = np.where(np.diff(chunk_eq_ids) > 0)[0] + 1
    eq_boundaries = eq_boundaries[eq_boundaries > first_chunk][:eq_count]
    tail_pieces = [np.array([])]
    for k in range(eq_count):
        boundary = eq_boundaries[k]
        tail_begin = chunk_start_rows[boundary - 2] + steps_per_file
        tail_pieces.append(np.arange(tail_begin, chunk_start_rows[boundary]))
    tail_rows = np.concatenate(tail_pieces)
    keep_rows = np.setdiff1d(keep_rows, tail_rows, assume_unique=True)

    # Same logic as in test to generate the gap predicted probabilities:
    # truncate to a whole number of files and describe each file's range.
    gap_frame = GAP_DATA.iloc[keep_rows]
    num_valid_files = int(gap_frame.shape[0] / steps_per_file)
    gap_frame = gap_frame.iloc[np.arange(steps_per_file * num_valid_files)]
    file_ids = np.arange(num_valid_files)
    valid_ranges = (
        steps_per_file * file_ids,
        steps_per_file * (1 + file_ids) - (hyperpars['block_steps']),
    )
    batched_features, _, start_rows = utils.get_gap_prediction_features(
        gap_frame, valid_ranges, hyperpars, order_start_rows=True)
    file_names = ['valid_' + str(n) for n in range(1, num_valid_files + 1)]

    gap_preds = make_predictions(model_path,
                                 split,
                                 fold=0,
                                 num_folds=1,
                                 x_features=batched_features)
    aligned_preds = utils.align_test_gap_preds(
        gap_preds, steps_per_file, start_rows, hyperpars, file_names)
    aligned_preds.to_csv(save_path + '_aligned_predictions_valid' + '.csv',
                         index=False)

示例#7

0

显示文件

文件： train_valid_test_lightgbm_sequential.py 项目： ttvand/Kaggle-LANL

def test_model(save_path,
               test_on_all_folds,
               train_all_previous,
               target_quantile,
               median_test_cyle_length,
               seed_ext=None,
               train_last_six_complete=False,
               drop_first_test_fold=False):
    """Average saved LightGBM fold models on the test set; write a submission.

    Args:
        save_path: Prefix of the saved model files. Each model is read from
            ``'<save_path>-<fold description>.txt'``.
        test_on_all_folds: When truthy, the models of all folds are averaged;
            otherwise only the model of the last fold is used.
        train_all_previous: Forwarded to ``preprocess.train_val_split``.
        target_quantile: Forwarded to both ``preprocess`` calls. When truthy,
            the averaged prediction ``p`` is converted to
            ``median_test_cyle_length * (1 - p)``.
        median_test_cyle_length: Scale used for the conversion above.
        seed_ext: Optional string appended to the timestamp in the submission
            file name.
        train_last_six_complete: When truthy, a single fold is assumed
            regardless of the length of the train/validation split.
        drop_first_test_fold: When truthy, the predictions of the first
            selected fold are excluded from the average.

    Raises:
        ValueError: If no fold prediction would remain to be averaged, e.g.
            ``drop_first_test_fold`` combined with a single prediction fold.
            Without this check the mean over an empty axis is NaN and a
            NaN-only submission would be written silently.
    """
    (x_test, _, _) = preprocess.get_preprocessed(
        'test',
        remove_overlap_chunks=True,
        scale=NORMALIZE_FEATURES,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        target_quantile=target_quantile)
    train_val_split = preprocess.train_val_split(
        ordered=True,
        remove_overlap_chunks=True,
        num_folds=NUM_FOLDS,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        train_all_previous=train_all_previous,
        target_quantile=target_quantile)

    num_folds = 1 if train_last_six_complete else len(train_val_split)
    pred_folds = (list(range(num_folds))
                  if test_on_all_folds else [num_folds - 1])

    # Fail before any model is loaded when the average would be empty.
    first_kept = 1 if drop_first_test_fold else 0
    if len(pred_folds) <= first_kept:
        raise ValueError(
            'No fold predictions left to average: {} prediction fold(s) with '
            'drop_first_test_fold={}'.format(len(pred_folds),
                                             drop_first_test_fold))

    # One column of predictions per selected fold.
    model_preds = np.zeros((x_test.shape[0], len(pred_folds)))
    for (i, fold) in enumerate(pred_folds):
        print("Making test predictions {} of {}".format(
            i + 1, len(pred_folds)))
        fold_description = get_fold_description(fold, num_folds)
        model_path = '{}-{}.txt'.format(save_path, fold_description)
        model = lgb.Booster(model_file=model_path)
        model_preds[:, i] = model.predict(x_test)
    preds_test = np.mean(model_preds[:, first_kept:], 1)

    if target_quantile:
        preds_test = median_test_cyle_length * (1 - preds_test)

    # Write the output pandas data frame
    submission = pd.read_csv(DATA_FOLDER + 'sample_submission.csv')
    submission.time_to_failure = preds_test
    the_date = datetime.datetime.now().strftime('%y-%m-%d-%H-%M')
    the_date = the_date if seed_ext is None else the_date + seed_ext
    submission_path = '/home/tom/Kaggle/LANL/Submissions/' + the_date + '.csv'
    submission.to_csv(submission_path, index=False)

示例#8

0

显示文件

def train_models(save_path, splits, hyperpars, overwrite_train, early_stopping,
                 train_on_all_data, remove_overlap_chunks):
    """Fit one model per fold (optionally plus one extra) for every split.

    The training data and the train/validation split come from
    ``preprocess``; the actual fitting is delegated to ``fit_model``.

    Args:
        save_path: Forwarded to ``fit_model``.
        splits: Sequence of split identifiers to train on.
        hyperpars: Forwarded to ``fit_model``.
        overwrite_train: Forwarded to ``fit_model``.
        early_stopping: Forwarded to ``fit_model``.
        train_on_all_data: When truthy, one additional model index
            (``num_folds``) is passed to ``fit_model`` for every split.
        remove_overlap_chunks: Forwarded to both ``preprocess`` calls.
    """
    features, other_features, targets = preprocess.get_preprocessed(
        'train', remove_overlap_chunks, scale=NORMALIZE_FEATURES)

    train_val_split = preprocess.train_val_split(remove_overlap_chunks)
    # The number of folds is read from the first split.
    num_folds = len(train_val_split[0])
    model_count = num_folds + int(train_on_all_data)

    for position, split in enumerate(splits, start=1):
        print('Processing split {} of {}'.format(position, len(splits)))
        for model_id in range(model_count):
            print('\nProcessing fold {} of {}'.format(
                model_id + 1, model_count))
            fit_model(save_path, split, model_id, num_folds, train_val_split,
                      hyperpars, overwrite_train, features, other_features,
                      targets, early_stopping)

示例#9

0

显示文件

文件： train_valid_test_lightgbm_sequential.py 项目： ttvand/Kaggle-LANL

def validate_models(save_path,
                    remove_overlap_chunks,
                    train_all_previous,
                    target_quantile,
                    train_last_six_complete=False):
    """Compute the out-of-fold (OOF) MAE of the saved ordered-fold models.

    ``get_fold_mae`` is called once per fold and returns the fold MAE, a
    normalized fold MAE and the number of OOF rows. Both MAEs are averaged
    over the folds, weighted by the OOF row counts.

    NOTE(review): ``train_last_six_complete`` is accepted but not used
    anywhere in this function.

    Args:
        save_path: Forwarded to ``get_fold_mae``.
        remove_overlap_chunks: Forwarded to both ``preprocess`` calls.
        train_all_previous: Forwarded to ``preprocess.train_val_split``.
        target_quantile: Forwarded to both ``preprocess`` calls and to
            ``get_fold_mae``. When truthy, the targets are replaced by the
            ``target_original`` column of the second preprocessed output.
        train_last_six_complete: Unused (see note above).

    Returns:
        A tuple ``(maes, av_mae_norm)`` where ``maes`` is
        ``[overall_mae, (fold_mae, oof_count), ...]`` and ``av_mae_norm`` is
        the count-weighted mean of the normalized fold MAEs.
    """
    features, other_features, targets = preprocess.get_preprocessed(
        'train',
        remove_overlap_chunks,
        scale=NORMALIZE_FEATURES,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        target_quantile=target_quantile)

    train_val_split = preprocess.train_val_split(
        remove_overlap_chunks,
        ordered=True,
        num_folds=NUM_FOLDS,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        train_all_previous=train_all_previous,
        target_quantile=target_quantile)

    # With quantile targets the errors are evaluated on the original targets.
    if target_quantile:
        targets = other_features.target_original.values

    num_folds = len(train_val_split)
    fold_stats = []  # (fold_mae, fold_mae_norm, oof_count) per fold
    for fold in range(num_folds):
        print('\nProcessing fold {} of {}'.format(fold + 1, num_folds))
        fold_mae, fold_mae_norm, oof_count = get_fold_mae(
            save_path, fold, num_folds, train_val_split, features, targets,
            target_quantile)
        fold_stats.append((fold_mae, fold_mae_norm, oof_count))

    # Count-weighted means over the folds.
    total_count = sum(count for (_, _, count) in fold_stats)
    weighted_norm = np.array(
        [norm * count for (_, norm, count) in fold_stats]).sum()
    weighted_abs = np.array(
        [fold_mae * count for (fold_mae, _, count) in fold_stats]).sum()
    av_mae_norm = weighted_norm / total_count
    mae = weighted_abs / total_count

    maes = [mae] + [(fold_mae, count) for (fold_mae, _, count) in fold_stats]
    print('\nAverage OOF MAE: {}'.format(np.round(mae, 3)))
    print('Average OOF MAE normalized: {}'.format(np.round(av_mae_norm, 3)))
    return maes, av_mae_norm

示例#10

0

显示文件

def train_models(custom_model, save_path, splits, hyperpars, overwrite_train,
                 model_on_all_data, remove_overlap_chunks):
    """Fit one model per fold (optionally plus one extra) for every split.

    The scaled training data and the train/validation split come from
    ``preprocess``. Before each fit ``K.clear_session()`` is called and the
    fitting itself is delegated to ``fit_model``.

    Args:
        custom_model: Forwarded to ``fit_model``.
        save_path: Forwarded to ``fit_model``.
        splits: Sequence of split identifiers to train on.
        hyperpars: Forwarded to ``fit_model``.
        overwrite_train: Forwarded to ``fit_model``.
        model_on_all_data: When truthy, one additional model index
            (``num_folds``) is passed to ``fit_model`` for every split.
        remove_overlap_chunks: Forwarded to both ``preprocess`` calls.
    """
    features, other_features, targets = preprocess.get_preprocessed(
        'train', remove_overlap_chunks, scale=True)

    train_val_split = preprocess.train_val_split(remove_overlap_chunks)
    # The number of folds is read from the first split.
    num_folds = len(train_val_split[0])
    model_count = num_folds + int(model_on_all_data)

    for position, split in enumerate(splits, start=1):
        print('Processing split {} of {}'.format(position, len(splits)))
        for model_id in range(model_count):
            print('\nProcessing fold {} of {}'.format(
                model_id + 1, model_count))
            # Start every fit from a fresh backend session.
            K.clear_session()
            fit_model(custom_model, save_path, split, model_id, num_folds,
                      train_val_split, hyperpars, overwrite_train, features,
                      other_features, targets)

示例#11

0

显示文件

def valid_order(model_path, split, data_folder, hyperpars):
  """Compute an order-score matrix for the validation files and save it.

  The rows of ``TRAIN_AUGMENT`` that belong to the first eleven earthquakes of
  the first validation fold of ``split`` are selected, the tail of each
  earthquake is dropped, and the remainder is truncated to a whole number of
  150000-step files. A saved model is evaluated on batches produced by
  ``utils.generator_cpc_batch_test`` and the reduced predictions are stored in
  a ``(files, files)`` array written to
  ``data_folder + 'valid_order_probs_raw_signal.npy'``.

  NOTE(review): ``TRAIN_AUGMENT`` is neither a parameter nor a local of this
  function; it is read from the enclosing (presumably module) scope.

  Args:
    model_path: Prefix of the saved model file
      ``'<model_path>-<split>-<fold description>.h5'``; also forwarded to
      ``make_predictions``.
    split: Index into the result of ``preprocess.train_val_split_gaps``; also
      part of the model file name.
    data_folder: Path prefix for the output file; it is concatenated directly
      with the file name, so it needs a trailing separator.
    hyperpars: Forwarded to ``utils.generator_cpc_batch_test``.
  """
  # Determine the first eleven validation earthquake ids
  num_first_eqs = 11
  valid_file_steps = 150000
  # Number of rows of ``order_probs`` filled per iteration.
  comp_rows_per_it = 4
  train_val_split = preprocess.train_val_split_gaps()
  num_folds = len(train_val_split[0])
  
  
  first_val_ranges = train_val_split[split][0][1] # First fold, validation
  # Element [1] of the preprocessed output is the table holding the
  # ``start_row`` and ``eq_id`` columns used below.
  other_train_features = preprocess.get_preprocessed(
      'train', remove_overlap_chunks=True)[1]
  # Position (in the chunk table) of the chunk that starts the first
  # validation range, and the earthquake id of that chunk.
  first_eq_id_other_features = np.where(
      other_train_features.start_row.values == first_val_ranges[0][0])[0][0]
  first_eq_id = other_train_features.eq_id.values[first_eq_id_other_features]
  # Start row of the first chunk of the earthquake ``num_first_eqs`` ids later.
  first_row_next_eq = other_train_features.start_row.values[np.where(
      other_train_features.eq_id.values == first_eq_id+num_first_eqs)[0][0]]
  # Despite the name, these are row indices, not earthquake ids.
  first_valid_eq_ids = np.arange(first_val_ranges[0][0], first_row_next_eq)
  
  # Drop the last part of the valid_eq_ids that don't contain an entire chunk
  # Chunk positions at which the earthquake id increases.
  new_eq_ids = np.where(np.diff(other_train_features.eq_id.values) > 0)[0] + 1
  drop_eq_end_ids = new_eq_ids[new_eq_ids > first_eq_id_other_features][
      :num_first_eqs]
  drop_ids = np.array([])
  for i in range(num_first_eqs):
    # Rows from one file length past the start of the chunk two positions
    # before the boundary up to the start of the next earthquake.
    drop_ids_eq = np.arange(
        other_train_features.start_row.values[
            drop_eq_end_ids[i]-2]+valid_file_steps,
        other_train_features.start_row.values[drop_eq_end_ids[i]])
    drop_ids = np.append(drop_ids, drop_ids_eq)
    
  first_valid_eq_ids = np.setdiff1d(first_valid_eq_ids, drop_ids,
                                    assume_unique=True)
  
  # Same logic as in test to generate the gap predicted probabilities
  VALID_DATA = TRAIN_AUGMENT.iloc[first_valid_eq_ids]
#  VALID_DATA = VALID_DATA[:(150000*16)]
  # Keep only a whole number of ``valid_file_steps``-row files.
  num_valid_files = int(VALID_DATA.shape[0]/valid_file_steps)
  VALID_DATA = VALID_DATA.iloc[np.arange(valid_file_steps*num_valid_files)]
  
  # The fold index equals ``num_folds`` here - presumably the description of
  # the model trained on all data; confirm against get_fold_description.
  fold_description = get_fold_description(num_folds, num_folds)
  ENCODER_PATH = '{}-{}-{}.h5'.format(model_path, split, fold_description)
  encoder_model = load_model(ENCODER_PATH, custom_objects={
        'Attention': models.Attention,
        'GradientReversal': models.GradientReversal,
        'tf': tf,})
  # Files beyond the last full group of ``comp_rows_per_it`` keep their zero
  # rows in ``order_probs``.
  num_iterations = int(num_valid_files/comp_rows_per_it)
  order_probs = np.zeros((num_valid_files, num_valid_files))
  
  for i in range(num_iterations):
    gc.collect()
    print('\nIteration {} of {}'.format(i+1, num_iterations))
    first_test_id = int(comp_rows_per_it*i)
    test_gen = utils.generator_cpc_batch_test(
        VALID_DATA, hyperpars, encoder_model, first_test_id=first_test_id)
  
    # Generate the test data by calling the generator *N* times
    # Evaluated as (num_valid_files / 4) * comp_rows_per_it, which equals
    # num_valid_files while comp_rows_per_it is 4.
    N = int(num_valid_files/4*comp_rows_per_it)
    test_data = list(itertools.islice(test_gen, N))
    # The split/fold arguments are placeholders (-1); the loaded model is used
    # both by the generator above and for the predictions here.
    test_preds = make_predictions(model_path, split=-1, fold=-1, num_folds=-1,
                                  data=test_data, model=encoder_model)
    # Take output [3], average the first four entries of its last axis, then
    # average again over consecutive groups of four.
    order_preds = test_preds[3][:, :, :4].mean(-1).reshape(
        [test_preds[3].shape[0], -1, 4]).mean(-1)
    order_probs[first_test_id:(first_test_id+comp_rows_per_it)] = (
        order_preds.reshape([comp_rows_per_it, -1]))
    
  save_path = data_folder + 'valid_order_probs_raw_signal.npy'
  np.save(save_path, order_probs) # np.load(save_path)