def train_models(save_path,
                 hyperpars,
                 overwrite_train,
                 train_on_all_data,
                 remove_overlap_chunks,
                 train_all_previous,
                 target_quantile,
                 train_last_six_complete=False):
    (features, other_features, targets) = preprocess.get_preprocessed(
        'train',
        remove_overlap_chunks,
        scale=NORMALIZE_FEATURES,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        target_quantile=target_quantile,
        train_last_six_complete=train_last_six_complete)

    train_val_split = preprocess.train_val_split(
        remove_overlap_chunks,
        ordered=True,
        num_folds=NUM_FOLDS,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        train_all_previous=train_all_previous,
        target_quantile=target_quantile,
        train_last_six_complete=train_last_six_complete)
    num_folds = len(train_val_split)
    num_train_models = num_folds + int(train_on_all_data)
    for fold in range(num_train_models):
        print('\nProcessing fold {} of {}'.format(fold + 1, num_train_models))
        fit_model(save_path, fold, num_folds, train_val_split, hyperpars,
                  overwrite_train, features, other_features, targets,
                  target_quantile)
Example #2
def validate_models(save_path, splits, remove_overlap_chunks):
    (features, other_features,
     targets) = preprocess.get_preprocessed('train',
                                            remove_overlap_chunks,
                                            scale=NORMALIZE_FEATURES)

    train_val_split = preprocess.train_val_split(remove_overlap_chunks)
    num_folds = len(train_val_split[0])
    valid_maes = []
    for split_id, split in enumerate(splits):
        print('Processing split {} of {}'.format(split_id + 1, len(splits)))
        sum_split_maes = []
        split_maes = []
        for fold in range(num_folds):
            print('Processing fold {} of {}'.format(fold + 1, num_folds))
            fold_mae, oof_count = get_fold_mae(save_path, split, fold,
                                               num_folds, train_val_split,
                                               features, targets)
            sum_split_maes.append(fold_mae * oof_count)
            split_maes.append((fold_mae, oof_count))
        split_mae = np.array(sum_split_maes).sum() / features.shape[0]
        split_maes = [split_mae] + split_maes
        print('Split OOF MAE: {}'.format(np.round(split_mae, 3)))
        valid_maes.append((split_maes, splits[split_id]))
    return valid_maes
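
Several test_model variants below build model file names from
get_fold_description(fold, num_folds), which is not defined anywhere in these
examples. A hypothetical reconstruction, assuming the convention that the
optional model trained on all data is the one with fold == num_folds:

def get_fold_description(fold, num_folds):
    # Hypothetical helper (not from the original source): regular folds keep
    # their index; the extra all-data model gets a distinct label.
    return 'all' if fold == num_folds else str(fold)
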
Example #3
def test_model(save_path, split, test_on_all_data):
    (x_test, other_test,
     _) = preprocess.get_preprocessed('test',
                                      remove_overlap_chunks=True,
                                      scale=NORMALIZE_FEATURES)
    train_val_split = preprocess.train_val_split(remove_overlap_chunks=True)
    num_folds = len(train_val_split[split])
    num_test = x_test.shape[0]
    num_prediction_models = num_folds + int(test_on_all_data)
    model_preds = np.zeros((num_test, num_prediction_models))
    for fold in range(num_prediction_models):
        print("Making test predictions {} of {}".format(
            fold + 1, num_prediction_models))
        fold_description = get_fold_description(fold, num_folds)
        model_path = '{}-{}-{}.txt'.format(save_path, split, fold_description)
        model = lgb.Booster(model_file=model_path)
        model_preds[:, fold] = model.predict(x_test)

    # Write the output pandas data frame
    preds_test = np.mean(model_preds, 1)
    submission = pd.read_csv(
        '/home/tom/Kaggle/LANL/Data/sample_submission.csv')
    submission.time_to_failure = preds_test
    the_date = datetime.datetime.now().strftime('%y-%m-%d-%H-%M')
    submission_path = '/home/tom/Kaggle/LANL/Submissions/' + the_date + '.csv'
    submission.to_csv(submission_path, index=False)
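
test_model above reloads one LightGBM booster per fold from a text file and
averages the per-fold predictions. A minimal self-contained sketch of that
save/reload/ensemble round trip (the data and file names are illustrative):

import numpy as np
import lightgbm as lgb

X = np.random.rand(200, 5)
y = X.sum(axis=1)
fold_paths = []
for fold in range(2):
    booster = lgb.train({'objective': 'regression', 'verbose': -1},
                        lgb.Dataset(X, y), num_boost_round=20)
    path = 'example-model-{}.txt'.format(fold)
    booster.save_model(path)
    fold_paths.append(path)

# Column-stack the reloaded boosters' predictions and average, as above.
model_preds = np.stack(
    [lgb.Booster(model_file=p).predict(X) for p in fold_paths], axis=1)
preds_test = np.mean(model_preds, 1)
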
Example #4
def validate_models(save_path, hyperpars, remove_overlap_chunks,
                    train_all_previous, target_quantile):
  train_val_split = preprocess.train_val_split(
      remove_overlap_chunks, ordered=True, num_folds=NUM_FOLDS,
      remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
      train_all_previous=train_all_previous,
      target_quantile=target_quantile)
  
  num_folds = len(train_val_split)
  sum_maes = []
  maes = []
  fold_mae_norms = []
  total_count = 0
  for fold in range(num_folds):
    print('Processing fold {} of {}'.format(fold+1, num_folds))
    fold_mae, fold_mae_norm, oof_count = get_fold_mae(
        save_path, fold, num_folds, train_val_split, hyperpars,
        target_quantile, hyperpars['validation_valid_batch'])
    sum_maes.append(fold_mae*oof_count)
    maes.append((fold_mae, oof_count))
    fold_mae_norms.append(fold_mae_norm)
    total_count += oof_count
  av_mae_norm = np.array([
      n*c for n, (_, c) in zip(fold_mae_norms, maes)]).sum()/total_count
  mae = np.array(sum_maes).sum()/total_count
  maes = [mae] + maes
  print('\nAverage OOF MAE: {}'.format(np.round(mae, 3)))
  print('Average OOF MAE normalized: {}'.format(np.round(av_mae_norm, 3)))
  return maes, av_mae_norm
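
The validation functions weight each fold's MAE by its out-of-fold sample
count before averaging, so folds with more OOF rows contribute more. A tiny
numeric illustration of that aggregation (the fold values are made up):

import numpy as np

fold_maes = [(2.1, 400), (1.9, 380), (2.3, 420)]  # (fold MAE, OOF count)
total_count = sum(count for _, count in fold_maes)
overall_mae = np.sum([mae * count for mae, count in fold_maes]) / total_count
print(np.round(overall_mae, 3))
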
Example #5
def test_model(save_path, split, model_on_all_data):
    (x_test, other_test,
     _) = preprocess.get_preprocessed('test',
                                      remove_overlap_chunks=True,
                                      scale=True)
    (x_test_reshaped, _) = utils.reshape_time_dim(x_test,
                                                  np.zeros_like(x_test),
                                                  np.arange(x_test.shape[0]))
    train_val_split = preprocess.train_val_split(remove_overlap_chunks=True)
    num_folds = len(train_val_split[split])
    num_test = x_test.shape[0]
    num_prediction_models = num_folds + int(model_on_all_data)
    model_preds = np.zeros((num_test, num_prediction_models))
    for fold in range(num_prediction_models):
        print("Making test predictions {} of {}".format(
            fold + 1, num_prediction_models))
        model_preds[:, fold] = make_predictions(save_path, split, fold,
                                                num_folds, x_test_reshaped)

    # Write the output pandas data frame
    preds_test = np.mean(model_preds, 1)
    submission = pd.read_csv(
        '/home/tom/Kaggle/LANL/Data/sample_submission.csv')
    submission.time_to_failure = preds_test
    the_date = datetime.datetime.now().strftime('%y-%m-%d-%H-%M')
    submission_path = '/home/tom/Kaggle/LANL/Submissions/' + the_date + '.csv'
    submission.to_csv(submission_path, index=False)
Example #6
def test_model(save_path,
               test_on_all_folds,
               train_all_previous,
               target_quantile,
               median_test_cyle_length,
               seed_ext=None,
               train_last_six_complete=False,
               drop_first_test_fold=False):
    (x_test, other_test, _) = preprocess.get_preprocessed(
        'test',
        remove_overlap_chunks=True,
        scale=NORMALIZE_FEATURES,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        target_quantile=target_quantile)
    train_val_split = preprocess.train_val_split(
        ordered=True,
        remove_overlap_chunks=True,
        num_folds=NUM_FOLDS,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        train_all_previous=train_all_previous,
        target_quantile=target_quantile)

    num_folds = len(train_val_split)
    num_folds = 1 if train_last_six_complete else num_folds
    pred_folds = (list(range(num_folds)) if test_on_all_folds
                  else [num_folds - 1])
    model_preds = np.zeros((x_test.shape[0], len(pred_folds)))
    for (i, fold) in enumerate(pred_folds):
        print("Making test predictions {} of {}".format(
            i + 1, len(pred_folds)))
        fold_description = get_fold_description(fold, num_folds)
        model_path = '{}-{}.txt'.format(save_path, fold_description)
        model = lgb.Booster(model_file=model_path)
        model_preds[:, i] = model.predict(x_test)
    model_preds = model_preds[:, 1:] if drop_first_test_fold else model_preds
    preds_test = np.mean(model_preds, 1)

    if target_quantile:
        preds_test = median_test_cyle_length * (1 - preds_test)

    # Write the output pandas data frame
    submission = pd.read_csv(DATA_FOLDER + 'sample_submission.csv')
    submission.time_to_failure = preds_test
    the_date = datetime.datetime.now().strftime('%y-%m-%d-%H-%M')
    the_date = the_date if seed_ext is None else the_date + seed_ext
    submission_path = '/home/tom/Kaggle/LANL/Submissions/' + the_date + '.csv'
    submission.to_csv(submission_path, index=False)
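
When target_quantile is set, the model predicts how far into the earthquake
cycle a segment sits, and test_model maps that back to a time to failure with
median_test_cyle_length * (1 - preds). A small numeric illustration (the
median cycle length is a made-up value):

import numpy as np

median_test_cyle_length = 11.5  # illustrative value, in seconds
quantile_preds = np.array([0.1, 0.5, 0.9])  # fraction of the cycle elapsed
time_to_failure = median_test_cyle_length * (1 - quantile_preds)
# array([10.35, 5.75, 1.15]): early in the cycle means a long time to failure
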
Example #7
def train_models(custom_model, save_path, hyperpars, overwrite_train,
                 train_on_all_data, remove_overlap_chunks,
                 train_all_previous, skip_last_train_fold, target_quantile,
                 train_last_six_complete=False):
  train_val_split = preprocess.train_val_split(
      remove_overlap_chunks, ordered=True, num_folds=NUM_FOLDS,
      remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
      train_all_previous=train_all_previous,
      target_quantile=target_quantile)
  num_folds = len(train_val_split) if not train_last_six_complete else 1
  num_train_models = num_folds + int(train_on_all_data)
  for fold in range(num_train_models):
    print('\nProcessing fold {} of {}'.format(fold+1, num_train_models))
    K.clear_session()
    fit_model(custom_model, save_path, fold, num_folds, skip_last_train_fold,
              train_val_split, hyperpars, overwrite_train, target_quantile,
              train_last_six_complete)
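
train_models calls K.clear_session() before fitting each fold so state from
the previous fold's Keras model is released. A minimal sketch of the pattern
(the tiny model and its shapes are illustrative):

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

for fold in range(3):
    K.clear_session()  # drop the previous fold's graph before building anew
    model = Sequential([Dense(8, activation='relu', input_shape=(5,)),
                        Dense(1)])
    model.compile(optimizer='adam', loss='mae')
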
Example #8
def test_model(save_path, test_on_all_folds, train_all_previous,
               target_quantile, median_test_cyle_length, seed_ext=None,
               train_last_six_complete=False, drop_first_test_fold=False):
  train_val_split = preprocess.train_val_split(
      ordered=True, remove_overlap_chunks=True, num_folds=NUM_FOLDS,
      remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
      train_all_previous=train_all_previous,
      target_quantile=target_quantile)
  
  test_file_steps = int(150000/hyperpars['block_steps'])
  num_test_files = int(TEST_DATA.shape[0]/test_file_steps)
  test_ranges = (test_file_steps*np.arange(num_test_files),
                 test_file_steps*(1+np.arange(num_test_files)) -
                 hyperpars['chunk_blocks'])
  (x_test_batched, test_start_rows) = utils.get_rnn_prediction_features(
      TEST_DATA, test_ranges, hyperpars, order_start_rows=True)
  x_test_batched = np.repeat(x_test_batched, 20, 0)
  
  num_folds = len(train_val_split)
  num_folds = 1 if train_last_six_complete else num_folds
  pred_folds = (list(range(num_folds)) if test_on_all_folds
                else [num_folds-1])
  model_preds = np.zeros((num_test_files, len(pred_folds)))
  for (i, fold) in enumerate(pred_folds):
    print('Making test predictions {} of {}'.format(i+1,
          len(pred_folds)))
#    K.clear_session() # DO NOT UNCOMMENT
    fold_test_preds = make_predictions(save_path, hyperpars, fold, num_folds,
                                       x_test_batched)
    # Column i lines up with pred_folds[i] (fold ids need not start at 0).
    model_preds[:, i] = np.mean(
        fold_test_preds.reshape(num_test_files, -1), 1)
    
  model_preds = model_preds[:, 1:] if drop_first_test_fold else model_preds
  preds_test = np.median(model_preds, 1)
  
  if target_quantile:
    preds_test = median_test_cyle_length*(1-preds_test)
  
  # Write the output pandas data frame
  submission = pd.read_csv(data_folder + 'sample_submission.csv')
  submission.time_to_failure = preds_test
  the_date = datetime.datetime.now().strftime('%y-%m-%d-%H-%M')
  the_date = the_date if seed_ext is None else the_date + seed_ext
  submission_path = '/home/tom/Kaggle/LANL/Submissions/' + the_date + '.csv'
  submission.to_csv(submission_path, index=False)
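
This test_model repeats each test file's features 20 times
(np.repeat(x_test_batched, 20, 0)), predicts on every copy, and then averages
the 20 predictions per file. Because np.repeat keeps each file's copies
consecutive, reshaping to (num_test_files, -1) groups them correctly; a
self-contained illustration:

import numpy as np

num_test_files, repeats = 4, 20
flat_preds = np.random.rand(num_test_files * repeats)
per_file_preds = flat_preds.reshape(num_test_files, -1).mean(1)
assert per_file_preds.shape == (num_test_files,)
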
Example #9
def train_models(save_path, splits, hyperpars, overwrite_train, early_stopping,
                 train_on_all_data, remove_overlap_chunks):
    (features, other_features,
     targets) = preprocess.get_preprocessed('train',
                                            remove_overlap_chunks,
                                            scale=NORMALIZE_FEATURES)

    train_val_split = preprocess.train_val_split(remove_overlap_chunks)
    num_folds = len(train_val_split[0])
    num_train_models = num_folds + int(train_on_all_data)
    for split_id, split in enumerate(splits):
        print('Processing split {} of {}'.format(split_id + 1, len(splits)))
        for fold in range(num_train_models):
            print('\nProcessing fold {} of {}'.format(fold + 1,
                                                      num_train_models))
            fit_model(save_path, split, fold, num_folds, train_val_split,
                      hyperpars, overwrite_train, features, other_features,
                      targets, early_stopping)
Example #10
def validate_models(save_path,
                    remove_overlap_chunks,
                    train_all_previous,
                    target_quantile,
                    train_last_six_complete=False):
    (features, other_features, targets) = preprocess.get_preprocessed(
        'train',
        remove_overlap_chunks,
        scale=NORMALIZE_FEATURES,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        target_quantile=target_quantile)

    train_val_split = preprocess.train_val_split(
        remove_overlap_chunks,
        ordered=True,
        num_folds=NUM_FOLDS,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        train_all_previous=train_all_previous,
        target_quantile=target_quantile)

    if target_quantile:
        targets = other_features.target_original.values
    num_folds = len(train_val_split)
    sum_maes = []
    maes = []
    fold_mae_norms = []
    total_count = 0
    for fold in range(num_folds):
        print('\nProcessing fold {} of {}'.format(fold + 1, num_folds))
        fold_mae, fold_mae_norm, oof_count = get_fold_mae(
            save_path, fold, num_folds, train_val_split, features, targets,
            target_quantile)
        sum_maes.append(fold_mae * oof_count)
        maes.append((fold_mae, oof_count))
        fold_mae_norms.append(fold_mae_norm)
        total_count += oof_count
    av_mae_norm = np.array([
        n * c for n, (_, c) in zip(fold_mae_norms, maes)
    ]).sum() / total_count
    mae = np.array(sum_maes).sum() / total_count
    maes = [mae] + maes
    print('\nAverage OOF MAE: {}'.format(np.round(mae, 3)))
    print('Average OOF MAE normalized: {}'.format(np.round(av_mae_norm, 3)))
    return maes, av_mae_norm
Example #11
def train_models(custom_model, save_path, splits, hyperpars, overwrite_train,
                 model_on_all_data, remove_overlap_chunks):
    (features, other_features,
     targets) = preprocess.get_preprocessed('train',
                                            remove_overlap_chunks,
                                            scale=True)

    train_val_split = preprocess.train_val_split(remove_overlap_chunks)
    num_folds = len(train_val_split[0])
    num_train_models = num_folds + int(model_on_all_data)
    for split_id, split in enumerate(splits):
        print('Processing split {} of {}'.format(split_id + 1, len(splits)))
        for fold in range(num_train_models):
            print('\nProcessing fold {} of {}'.format(fold + 1,
                                                      num_train_models))
            K.clear_session()
            fit_model(custom_model, save_path, split, fold, num_folds,
                      train_val_split, hyperpars, overwrite_train, features,
                      other_features, targets)
Example #12
####################################################################################################
if is_train:
    for seed in SEED:
        # This uses the context manager to operate in the data directory
        with cd(Name + f'-{seed}'):
            pickle.dump(sym_params, open("sym_params.sav", "wb"))
            logfile = open('log.txt', 'w+')
            resultfile = open('result.txt', 'w+')

            if os.path.exists('test.sav'):
                logfile.write('Did not calculate symfunctions.\n')
            else:
                data_dict = snn2sav(db, Name, elements, params_set,
                                    element_energy=element_energy)
                train_dict = train_test_split(data_dict, 1 - test_percent,
                                              seed=seed)
                train_val_split(train_dict, 1 - val_percent, seed=seed)

            logfile.flush()

            train_dict = torch.load('final_train.sav')
            val_dict = torch.load('final_val.sav')
            test_dict = torch.load('test.sav')

            scaling = get_scaling(train_dict, fp_scale_method, e_scale_method)

            n_nodes = hp['n_nodes']
            activations = hp['activations']
            lr = hp['lr']
            model = MultiLayerNet(N_sym, n_nodes, activations, nelem,
                                  scaling=scaling)
            if opt_method == 'lbfgs':
Example #13
def main():
    idx = pd.IndexSlice
    date_col = 'start_date'

    target = pd.read_hdf(cfg.data_target_file)
    data = pd.read_hdf(cfg.data_cov_file)

    train_start_date = cfg.train_start_date
    end_date = cfg.end_date

    time_index = pd.date_range(train_start_date, end_date, freq='1D')

    existing_dates = [str(t[2]).split(" ")[0] for t in target.index]
    unique_dates = list(set(existing_dates))

    target = target.loc[idx[:, :, unique_dates], :]

    data = data.loc[idx[unique_dates], :]

    cv_path = cfg.rootpath_cv
    forecast_path = cfg.forecast_rootpath

    target_var = cfg.target_var

    val_years = cfg.val_years
    test_years = cfg.test_years

    val_train_range = cfg.val_train_range
    test_train_range = cfg.test_train_range

    past_years = cfg.past_kyears

    val_range = cfg.val_range
    val_freq = cfg.val_freq

    test_start_date = cfg.test_start_date
    test_time_index_all = pd.date_range(test_start_date, end_date, freq='7D')

    # Create the train-validation sets

    for year in val_years:

        for num_forecast in range(1, 2):

            preprocess.train_val_split(cv_path,
                                       data,
                                       target,
                                       target_var,
                                       year,
                                       num_forecast,
                                       train_range=val_train_range,
                                       past_years=past_years,
                                       test_range=val_range,
                                       test_freq=val_freq,
                                       n_jobs=20)

    # Create the train-test sets

    for year in test_years:

        for num_forecast in range(1, 2):

            preprocess.train_test_split(forecast_path,
                                        data,
                                        target,
                                        target_var,
                                        test_time_index_all,
                                        year,
                                        num_forecast,
                                        train_range=test_train_range,
                                        past_years=past_years,
                                        n_jobs=20)
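
main() filters the three-level MultiIndex target frame with pd.IndexSlice
before building the CV splits. A small self-contained illustration of that
selection pattern (the index labels and data are made up):

import pandas as pd

idx = pd.IndexSlice
index = pd.MultiIndex.from_product(
    [['lat1', 'lat2'], ['lon1', 'lon2'],
     pd.to_datetime(['2020-01-01', '2020-01-08'])])
frame = pd.DataFrame({'target': range(8)}, index=index)
# Keep all values of the first two levels and restrict the date level,
# mirroring target.loc[idx[:, :, unique_dates], :] in main().
subset = frame.loc[idx[:, :, [pd.Timestamp('2020-01-01')]], :]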