def predictTests_Bulk(model, inference_config, validpath, plot, cnt=None):
    print('BATCH SIZE', inference_config.IMAGES_PER_GPU)
    dataset_valid = ShapesDataset()
    dataset_valid.load_imgs(validpath)
    dataset_valid.prepare()

    new_test_ids = []
    rles = []
    tot = len(dataset_valid.image_ids)
    prog = tqdm(total=tot)
    # Assumes `tot` is a multiple of IMAGES_PER_GPU, since Mask R-CNN's
    # detect() expects a full batch.
    for i in range(0, tot, inference_config.IMAGES_PER_GPU):
        if cnt is not None and i > cnt:
            break

        # Assemble one GPU batch of resized images and their metadata.
        imgs = []
        img_metas = []
        file_ids = []
        for j in range(inference_config.IMAGES_PER_GPU):
            image_id = dataset_valid.image_ids[i + j]
            file_id = dataset_valid.image_info[image_id]['id']
            scaled_image, image_meta, _, _, _ = \
                modellib.load_image_gt(dataset_valid, inference_config,
                                       image_id, use_mini_mask=True,
                                       augment=False)
            file_ids.append(file_id)
            imgs.append(scaled_image)
            img_metas.append(image_meta)

        results = model.detect(imgs, verbose=0)
        # Use a distinct loop variable so the outer batch index `i`
        # is not shadowed.
        for r in range(len(results)):
            rle = convert_result(imgs[r], results[r], img_metas[r],
                                 dataset_valid, plot)
            rles.extend(rle)
            new_test_ids.extend([file_ids[r]] * len(rle))
        prog.update(inference_config.IMAGES_PER_GPU)

    sub = pd.DataFrame()
    sub['ImageId'] = new_test_ids
    sub['EncodedPixels'] = pd.Series(rles).apply(
        lambda x: ' '.join(str(y) for y in x))
    sub.to_csv('../result/{}_org.csv'.format(inference_config.NAME),
               index=False)
    kaggle_util.save_result(sub,
                            '../result/{}.csv'.format(inference_config.NAME),
                            competition='airbus-ship-detection',
                            send=True, index=False)
    return sub
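# Each entry of `rles` above is joined with spaces to form the
# 'EncodedPixels' column. A minimal sketch of the run-length encoding
# convert_result() is assumed to produce (assumption: column-major RLE as
# required by the Airbus ship detection competition; `rle_encode` is a
# hypothetical helper, not part of the original code):
import numpy as np

def rle_encode(mask):
    # Flatten column-major; pad with zeros so runs touching the image
    # border are still detected as transitions.
    pixels = np.concatenate([[0], mask.T.flatten(), [0]])
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1  # 1-based run starts
    runs[1::2] -= runs[::2]                            # ends -> lengths
    return runs  # flat [start, length, start, length, ...]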
def doodle_predict(model, model_path, x_test):
    # `test`, `id2cat` and `model_prefix` are module-level globals.
    model.load_weights(model_path)
    test_predictions = model.predict(x_test, batch_size=128, verbose=1)
    top3 = preds2catids(test_predictions)
    top3cats = top3.replace(id2cat)
    test['word'] = top3cats['a'] + ' ' + top3cats['b'] + ' ' + top3cats['c']
    submission = test[['key_id', 'word']]

    import kaggle_util
    kaggle_util.save_result(submission,
                            '../result/{}.csv'.format(model_prefix),
                            'quickdraw-doodle-recognition',
                            send=True, index=False)
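# A sketch of the top-3 helper used above (assumption: this mirrors the
# common QuickDraw kernel definition, which the columns 'a', 'b', 'c'
# accessed in doodle_predict strongly suggest): take the three
# highest-scoring class ids per row.
import numpy as np
import pandas as pd

def preds2catids(predictions):
    return pd.DataFrame(np.argsort(-predictions, axis=1)[:, :3],
                        columns=['a', 'b', 'c'])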
def save_ensemble(LABELS, prediction, prefix, mname, send):
    # Cache raw probabilities so models can be blended later.
    np.save('../result/ensembles/{}_{}.npy'.format(mname, prefix), prediction)

    # Make a submission file
    top_3 = np.array(LABELS)[np.argsort(-prediction, axis=1)[:, :3]]
    predicted_labels = [' '.join(list(x)) for x in top_3]
    test = pd.read_csv('../data/sample_submission.csv')
    test['label'] = predicted_labels
    filename = '../result/{}_{}.csv'.format(mname, prefix)
    kaggle_util.save_result(test[['fname', 'label']], filename,
                            'freesound-audio-tagging',
                            send=send, index=False)
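# Usage sketch for the cached .npy files written by save_ensemble above
# (assumption: the filename pattern matches, and `prefix` is a
# hypothetical example value): average the per-model probability
# matrices before producing a final top-3 submission.
import glob
import numpy as np

prefix = 'fold0'  # hypothetical prefix value
preds = [np.load(f)
         for f in glob.glob('../result/ensembles/*_{}.npy'.format(prefix))]
blended = np.mean(preds, axis=0)  # shape: (n_clips, n_labels)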
def predictTests(model, config, validpath, plot, flex='sub', send=False,
                 fold=-1, valid=-1):
    print('fold', fold, 'valid', valid)
    dataset_valid = ShapesDataset(fold, valid, -1)
    if fold >= 0 and valid == 1:
        dataset_valid.load_imgs(validpath)
    else:
        dataset_valid.load_test_imgs(validpath)
    dataset_valid.prepare()

    #file_id, rle = predictOne(model, dataset_valid, inference_config, 1)
    new_test_ids = []
    rles = []
    for i in tqdm(range(len(dataset_valid.image_ids))):
        file_id, rle = predictOne(model, dataset_valid, config, i, plot=plot)
        rles.extend(rle)
        new_test_ids.extend([file_id] * len(rle))
        #if i > 10:
        #    break

    from keras import backend as K
    K.clear_session()

    sub = pd.DataFrame()
    sub['ImageId'] = new_test_ids
    sub['EncodedPixels'] = pd.Series(rles).apply(
        lambda x: ' '.join(str(y) for y in x))
    sub.to_csv('../result/{}_{}.csv'.format(flex, config.NAME), index=False)
    kaggle_util.save_result(sub,
                            '../result/{}_{}.csv'.format(config.NAME, flex),
                            competition='airbus-ship-detection',
                            send=send, index=False)
    return sub
def main_crossvalid_xgboost(frm, to):
    import xgboost as xgb

    nfold = 5
    df, y, testing, ready_df, tfvocab, predictors, len_train, categorical, \
        tfvocab, testdex = get_crossvalid_data(frm, to)

    cat_features = []
    cols = list(df.columns)
    for col in categorical:
        cat_features.append(cols.index(col))

    #lgtest = xgb.DMatrix(testing.toarray())
    #del testing
    #gc.collect()

    # Legacy sklearn.cross_validation API: the fold object is iterable.
    skf = StratifiedKFold(y, n_folds=nfold)
    for i, (train_split, val_split) in enumerate(skf):
        #X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.05, random_state=5)
        print(train_split)
        X_train = hstack(
            [csr_matrix(df.iloc[train_split].values), ready_df[train_split]])
        X_valid = hstack(
            [csr_matrix(df.iloc[val_split].values), ready_df[val_split]])  # Sparse Matrix
        y_train = y[train_split]
        y_valid = y[val_split]

        #lgtrain = xgb.DMatrix(X_train.toarray(), label=y_train)
        #lgvalid = xgb.DMatrix(X_valid.toarray(), label=y_valid)
        #del X_train, X_valid, y_train
        #gc.collect()

        modelstart = time.time()
        bst = xgb.XGBRegressor(n_estimators=400,
                               booster='gbtree',
                               learning_rate=0.016,
                               gamma=0,
                               subsample=0.75,
                               colsample_bylevel=0.5,
                               max_depth=16,
                               nthread=6)
        bst.fit(X_train, y_train,
                eval_set=[(X_train, y_train), (X_valid, y_valid)],
                verbose=False,
                early_stopping_rounds=100)

        print("Model Evaluation Stage")
        ypre = bst.predict(X_valid)
        rmse = np.sqrt(metrics.mean_squared_error(y_valid, ypre))
        print('RMSE:', rmse)
        """
        f, ax = plt.subplots(figsize=[7, 10])
        xgb.plot_importance(bst, ax=ax, max_num_features=50)
        plt.title("XGBoost Feature Importance")
        plt.savefig('xgb_feature_import.png', bbox_inches='tight')
        """

        lgpred = bst.predict(testing)
        lgsub = pd.DataFrame(lgpred, columns=["deal_probability"], index=testdex)
        lgsub['deal_probability'].clip(0.0, 1.0, inplace=True)  # Between 0 and 1
        subfile = '../result/xgb_dense_feature_{}.csv'.format(i)
        if debug:
            subfile = '../result/xgb_dense_feature_debug{}.csv'.format(i)
        kaggle_util.save_result(lgsub, subfile,
                                competition='avito-demand-prediction',
                                send=False, index=True)

    result_list = []
    for i in range(nfold):
        subfile = '../result/xgb_dense_feature_{}.csv'.format(i)
        if debug:
            subfile = '../result/xgb_dense_feature_debug{}.csv'.format(i)
        result_list.append((subfile, 1 / nfold))
    kaggle_util.ensemble(result_list, not debug,
                         competition='avito-demand-prediction',
                         score_col='deal_probability', prefix='xgb_avg')
def main_crossvalid(frm, to):
    nfold = 5
    df, y, testing, ready_df, tfvocab, predictors, len_train, categorical, \
        tfvocab, testdex = get_crossvalid_data(frm, to)

    lgbm_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 270,  # 37,
        'feature_fraction': 0.4,
        'bagging_fraction': 0.65,
        'bagging_freq': 2,
        'learning_rate': 0.016,
        #'max_depth': 8,
        #'min_split_gain': 0.0222415,
        #'min_child_weight': 20,
        'nthread': 5,
        'verbose': 0,
        #'reg_alpha': 0.041545473,
        #'reg_lambda': 0.0735294,
        'drop_rate': 0.08
    }

    skf = StratifiedKFold(y, n_folds=nfold)
    for i, (train_split, val_split) in enumerate(skf):
        #X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.05, random_state=5)
        print(train_split)
        X_train = hstack(
            [csr_matrix(df.iloc[train_split].values), ready_df[train_split]])
        X_valid = hstack(
            [csr_matrix(df.iloc[val_split].values), ready_df[val_split]])  # Sparse Matrix
        y_train = y[train_split]
        y_valid = y[val_split]

        lgtrain = lgb.Dataset(X_train, y_train,
                              feature_name=tfvocab,
                              categorical_feature=categorical)
        lgvalid = lgb.Dataset(X_valid, y_valid,
                              feature_name=tfvocab,
                              categorical_feature=categorical)

        modelstart = time.time()
        lgb_clf = lgb.train(lgbm_params,
                            lgtrain,
                            num_boost_round=26000,
                            valid_sets=[lgtrain, lgvalid],
                            valid_names=['train', 'valid'],
                            early_stopping_rounds=100,
                            verbose_eval=100)

        print("Model Evaluation Stage")
        rmse = np.sqrt(
            metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid)))
        print('RMSE:', rmse)

        f, ax = plt.subplots(figsize=[7, 10])
        lgb.plot_importance(lgb_clf, max_num_features=100, ax=ax)
        plt.title("Light GBM Feature Importance")
        plt.savefig('feature_import.png', bbox_inches='tight')

        str_now = datetime.now().strftime("%m-%d-%H-%M")
        if not debug:
            lgb_clf.save_model('../model/model_{}.txt'.format(i),
                               lgb_clf.best_iteration)
        else:
            lgb_clf.save_model('../model/model_debug_{}.txt'.format(i),
                               lgb_clf.best_iteration)

        lgpred = lgb_clf.predict(testing, num_iteration=lgb_clf.best_iteration)
        lgsub = pd.DataFrame(lgpred, columns=["deal_probability"], index=testdex)
        lgsub['deal_probability'].clip(0.0, 1.0, inplace=True)  # Between 0 and 1
        subfile = '../result/dense_feature_{}.csv'.format(i)
        if debug:
            subfile = '../result/dense_feature_debug{}.csv'.format(i)
        kaggle_util.save_result(lgsub, subfile,
                                competition='avito-demand-prediction',
                                send=False, index=True)

    result_list = []
    for i in range(nfold):
        subfile = '../result/dense_feature_{}.csv'.format(i)
        if debug:
            subfile = '../result/dense_feature_debug{}.csv'.format(i)
        result_list.append((subfile, 1 / nfold))
    kaggle_util.ensemble(result_list, False,
                         competition='avito-demand-prediction',
                         score_col='deal_probability', prefix='lgb_avg')
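# Note: StratifiedKFold(y, n_folds=nfold) above is the legacy
# sklearn.cross_validation API. A sketch of an equivalent split with the
# modern sklearn.model_selection API (assumption: plain KFold is an
# acceptable substitute here, since deal_probability is continuous and
# per-value stratification is not meaningful):
import numpy as np
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
for i, (train_split, val_split) in enumerate(kf.split(np.arange(len(y)))):
    pass  # same fold-loop body as above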
def main(frm, to):
    testing = pd.read_csv('../input/test.csv',
                          skiprows=range(1, frm),
                          nrows=to - frm,
                          index_col="item_id",
                          parse_dates=["activation_date"])
    testdex = testing.index
    len_test = len(testing)

    tot_filename = '/media/extend/cache/total_{}_{}.csv'.format(frm, to)
    tot_yname = '/media/extend/cache/total_y_{}_{}.csv'.format(frm, to)
    if os.path.exists(tot_filename) and os.path.exists(tot_yname):
        print('load from cache')
        #df = pd.read_feather(tot_filename).set_index("item_id")
        #y = pd.read_feather(tot_yname).set_index("item_id").deal_probability.copy()
        df = pd.read_csv(tot_filename).set_index("item_id")
        y = pd.read_csv(tot_yname).set_index("item_id").deal_probability.copy()
        len_train = to - frm
    else:
        training = pd.read_csv('../input/train.csv',
                               skiprows=range(1, frm),
                               nrows=to - frm,
                               index_col="item_id",
                               parse_dates=["activation_date"])
        len_train = len(training)
        y = training.deal_probability.copy()
        training.drop("deal_probability", axis=1, inplace=True)
        #y.reset_index().to_feather(tot_yname)
        y.reset_index().to_csv(tot_yname)

        print('Train shape: {} Rows, {} Columns'.format(*training.shape))
        print('Test shape: {} Rows, {} Columns'.format(*testing.shape))
        df = pd.concat([training, testing], axis=0)
        del training, testing

    predictors = []
    y, df, ready_df, tfvocab, predictors, len_train, categorical = \
        preparTotalData(y, df, predictors, len_train, len_test,
                        frm, to, tot_filename)
    none_categorical = [x for x in df.columns if x not in categorical]
    df = df[predictors]
    print(df.info())

    print("Modeling Stage")
    X = hstack([csr_matrix(df[:len_train].values), ready_df[0:len_train]])  # Sparse Matrix
    testing = hstack([csr_matrix(df[len_train:].values), ready_df[len_train:]])
    tfvocab = df.columns.tolist() + tfvocab
    for shape in [X, testing]:
        print("{} Rows and {} Cols".format(*shape.shape))
    print("Feature Names Length: ", len(tfvocab))
    del df
    gc.collect()

    print("\nModeling Stage")
    # Training and Validation Set
    """
    Using Randomized train/valid split doesn't seem to generalize LB score,
    so I will try time cutoff
    """
    X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                          test_size=0.05,
                                                          random_state=5)
    """
    total_len = X.shape[0]
    train_len = int(total_len * 0.9)
    X = X.tocsr()
    X_train = X[:train_len]
    X_valid = X[train_len:]
    y_train = y[:train_len]
    y_valid = y[train_len:]
    """

    print("Light Gradient Boosting Regressor")
    lgbm_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        #'max_depth': 15,
        'num_leaves': 270,  # 37,
        'feature_fraction': 0.5,
        'bagging_fraction': 0.85,
        # 'bagging_freq': 5,
        'learning_rate': 0.018,
        'nthread': 6,
        'verbose': 0,
        #'device': 'gpu',
        #'gpu_platform_id': 0,
        #'gpu_device_id': 0
    }

    # LGBM Dataset Formatting
    lgtrain = lgb.Dataset(X_train, y_train,
                          feature_name=tfvocab,
                          categorical_feature=categorical)
    lgvalid = lgb.Dataset(X_valid, y_valid,
                          feature_name=tfvocab,
                          categorical_feature=categorical)

    # Go Go Go
    modelstart = time.time()
    lgb_clf = lgb.train(lgbm_params,
                        lgtrain,
                        num_boost_round=26000,
                        valid_sets=[lgtrain, lgvalid],
                        valid_names=['train', 'valid'],
                        early_stopping_rounds=100,
                        verbose_eval=100)

    # Feature Importance Plot
    #f, ax = plt.subplots(figsize=[7, 10])
    #lgb.plot_importance(lgb_clf, max_num_features=50, ax=ax)
    #plt.title("Light GBM Feature Importance")
    #plt.savefig('feature_import.png', bbox_inches='tight')

    print("Model Evaluation Stage")
    rmse = np.sqrt(
        metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid)))
    print('RMSE:', rmse)

    str_now = datetime.now().strftime("%m-%d-%H-%M")
    if not debug:
        lgb_clf.save_model('../model/model_{}.txt'.format(str_now),
                           lgb_clf.best_iteration)
    else:
        lgb_clf.save_model('../model/model_debug.txt', lgb_clf.best_iteration)
    #lgb_clf = lgb.Booster(model_file='../model/model_05-13-21-50.txt')

    lgpred = lgb_clf.predict(testing, num_iteration=lgb_clf.best_iteration)
    lgsub = pd.DataFrame(lgpred, columns=["deal_probability"], index=testdex)
    lgsub['deal_probability'].clip(0.0, 1.0, inplace=True)  # Between 0 and 1
    #lgsub.to_csv("lgsub.csv", index=True, header=True)
    if not debug:
        kaggle_util.save_result(
            lgsub, '../result/dense_feature_{}.csv'.format(str_now),
            competition='avito-demand-prediction',
            send=True, index=True)
    print("Model Runtime: %0.2f Minutes" % ((time.time() - modelstart) / 60))
    print("Notebook Runtime: %0.2f Minutes" % ((time.time() - notebookstart) / 60))
RMSE_idx = RMSE.index(min_value)
print(RMSE_idx)
pred_final2 = Kfold_preds_final[RMSE_idx]
print(pred_final2.shape)

#del Kfold_preds_final, train1
gc.collect()

test_cols = ['item_id']
test = pd.read_csv('../input/test.csv', skiprows=range(1, frm),
                   nrows=to - frm, usecols=test_cols)

# Using the average of the K-fold predictions (pred_final1 is computed
# upstream of this fragment).
submission1 = pd.DataFrame(columns=['item_id', 'deal_probability'])
submission1['item_id'] = test['item_id']
submission1['deal_probability'] = pred_final1

print("Check submission now!")
#submission1.to_csv("Avito_Shanth_RNN_AVERAGE.csv", index=False)
kaggle_util.save_result(submission1, '../result/rnn_avg.csv',
                        competition='avito-demand-prediction',
                        send=not debug, index=False)

# Using the K-fold predictions with the minimum validation RMSE
submission2 = pd.DataFrame(columns=['item_id', 'deal_probability'])
submission2['item_id'] = test['item_id']
submission2['deal_probability'] = pred_final2

print("Check submission now!")
#submission2.to_csv("Avito_Shanth_RNN_MIN.csv", index=False)
kaggle_util.save_result(submission2, '../result/rnn_min.csv',
                        competition='avito-demand-prediction',
                        send=False, index=False)
    return X_tra, X_val, y_tra, y_val, x_test


if __name__ == "__main__":
    X_tra, X_val, y_tra, y_val, x_test = loadData()
    model = getModel()

    batch_size = 3000
    epochs = 10

    # filepath = "../input/best-model/best.hdf5"
    filepath = "../model/weights_base.best.hdf5"
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='val_root_mean_squared_error',
                                 verbose=1, save_best_only=True, mode='min')
    early = EarlyStopping(monitor="val_root_mean_squared_error",
                          mode="min", patience=5)
    callbacks_list = [checkpoint, early]
    #model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs,
    #          validation_data=(X_val, y_val), callbacks=callbacks_list, verbose=1)

    # Loading model weights
    model.load_weights(filepath)
    print('Predicting....')
    y_pred = model.predict(x_test, batch_size=1024, verbose=1)

    sub = pd.read_csv('../input/sample_submission.csv')
    sub['deal_probability'] = y_pred
    sub['deal_probability'].clip(0.0, 1.0, inplace=True)
    sub.to_csv('gru_capsule_description.csv', index=False)
    kaggle_util.save_result(sub, '../result/capsule.csv',
                            competition='avito-demand-prediction',
                            send=True)
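# The callbacks above monitor 'val_root_mean_squared_error'. A minimal
# sketch of such a custom metric (assumption: this mirrors the project's
# own definition, registered in metrics= when compiling the model):
from keras import backend as K

def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))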
    lgsub = pd.DataFrame(y_pred, columns=["deal_probability"], index=testdex)
    lgsub['deal_probability'].clip(0.0, 1.0, inplace=True)  # Between 0 and 1

    del modelRNN
    gc.collect()

    print("Number of folds completed...." + str(k + 1))
    #print(Kfold_preds_final[k][0:10])
    K.clear_session()

    # Save this fold's test predictions with the zero-based fold index,
    # matching the filenames the ensemble loop below reads back.
    kaggle_util.save_result(lgsub, '../result/rnn_{}.csv'.format(k),
                            competition='avito-demand-prediction',
                            send=False, index=True)
    k += 1

print("All folds completed: " + str(k))
print("RNN FOLD MODEL Done")

result_list = []
for i in range(nfold):
    subfile = '../result/rnn_{}.csv'.format(i)
    result_list.append((subfile, 1 / nfold))
kaggle_util.ensemble(result_list, not debug,
                     competition='avito-demand-prediction',
                     score_col='deal_probability')
        # (continuation of the fold loop: debug branch of the model save)
            lgb_clf.save_model('../model/ridge_debug_{}.txt'.format(i),
                               lgb_clf.best_iteration)

        lgpred = lgb_clf.predict(testing, num_iteration=lgb_clf.best_iteration)
        lgsub = pd.DataFrame(lgpred, columns=["deal_probability"], index=testdex)
        lgsub['deal_probability'].clip(0.0, 1.0, inplace=True)  # Between 0 and 1
        subfile = '../result/ridge_{}.csv'.format(i)
        if debug:
            subfile = '../result/ridge_debug{}.csv'.format(i)
        kaggle_util.save_result(lgsub, subfile,
                                competition='avito-demand-prediction',
                                send=False, index=True)

    result_list = []
    for i in range(nfold):
        subfile = '../result/ridge_{}.csv'.format(i)
        if debug:
            subfile = '../result/ridge_debug{}.csv'.format(i)
        result_list.append((subfile, 1 / nfold))
    kaggle_util.ensemble(result_list, not debug,
                         competition='avito-demand-prediction',
                         score_col='deal_probability')
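# Sketch of what kaggle_util.ensemble is assumed to do (hypothetical
# reconstruction, not the original implementation): read each submission
# CSV, take the weighted average of score_col, and write the blend.
import pandas as pd

def ensemble_sketch(result_list, out_path, score_col='deal_probability'):
    blend = None
    for path, weight in result_list:
        part = pd.read_csv(path, index_col=0)[score_col] * weight
        blend = part if blend is None else blend + part
    blend.to_frame(score_col).to_csv(out_path, index=True)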
epochs = 200

# Upweight the rows with (near-)zero deal_probability so the model does
# not ignore them.
sample_weight = np.ones(y.shape)
sample_weight[y < 1e-7] = 1 + len(y[y < 1e-7]) / len(y)

history = model.fit(X_train, y,
                    sample_weight=sample_weight,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_split=0.05,
                    verbose=1,
                    callbacks=[check_point, early_stop])
model.load_weights(file_path)

pred = model.predict(X_test, batch_size=batch_size, verbose=1)
print('pred shape {}'.format(pred.shape))

sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')
print('sub shape {}'.format(sub.shape))
sub[target_col] = pred

scr = min(history.history['val_root_mean_squared_error'])
print('best val RMSE: {}'.format(scr))
str_now = datetime.now().strftime("%m-%d-%H-%M")
print('save to ../result/mixnn_{}.csv'.format(str_now))
kaggle_util.save_result(sub, '../result/mixnn_{}.csv'.format(str_now),
                        competition='avito-demand-prediction',
                        send=not debug, index=False)
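# kaggle_util.save_result is used throughout these scripts. A hedged
# sketch of its assumed behaviour (hypothetical reconstruction of a
# private helper): write the DataFrame to CSV and, when send=True,
# submit the file with the official Kaggle CLI.
import subprocess

def save_result_sketch(df, path, competition, send=False, index=False):
    df.to_csv(path, index=index)
    if send:
        subprocess.run(['kaggle', 'competitions', 'submit',
                        '-c', competition, '-f', path, '-m', 'auto submit'],
                       check=True)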