def run_ridge_on_cat(cat):
    """Fit the per-parent-category blend model for `cat` and cache its
    out-of-fold / test predictions under 'cat_ridges_blend_l3_<cat>'.

    Skips all work when the cache entry already exists. Always returns True
    so callers mapping over categories get a uniform result.
    """
    cache_key = 'cat_ridges_blend_l3_' + cat
    if is_in_cache(cache_key):
        # Already computed on a previous run -- nothing to do.
        print_step('Already have ' + cat + '...')
        return True

    print_step(cat + ' > Subsetting')
    # NOTE(review): the boolean masks come from the global `train`/`test`
    # frames but are applied to `train_`/`test_` -- assumes both pairs are
    # row-aligned; confirm against the caller.
    cat_train = train_[train['parent_category_name'] == cat].copy()
    cat_test = test_[test['parent_category_name'] == cat].copy()
    print(cat_train.shape)
    print(cat_test.shape)

    # Pull the label and ids aside before removing them from the features.
    y = cat_train['deal_probability'].values
    train_ids = cat_train['item_id']
    test_ids = cat_test['item_id']
    cat_train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
    cat_test.drop('item_id', axis=1, inplace=True)

    print_step(cat + ' > Modeling')
    # NOTE(review): model_fn is runLasso although every label says "ridge" --
    # verify this is intentional.
    results = run_cv_model(cat_train, cat_test, y, runLasso, params, rmse,
                           cat + '-ridge-blend')
    cat_train['cat_ridge'] = results['train']
    cat_test['cat_ridge'] = results['test']
    print_step(cat + ' > RMSE: ' + str(rmse(y, cat_train['cat_ridge'])))
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

    print_step(cat + ' > Saving in Cache')
    # Restore the ids so the cached frames can be merged back later.
    cat_train['item_id'] = train_ids
    cat_test['item_id'] = test_ids
    save_in_cache(cache_key,
                  cat_train[['item_id', 'cat_ridge']],
                  cat_test[['item_id', 'cat_ridge']])
    return True
# Drop the six toxic-comment target columns from the feature frame -- they
# must not leak into the cached model inputs.
train_fe.drop([
    'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
], axis=1, inplace=True)
print_step('Saving')
save_in_cache('fe_lgb_data', train_fe, test_fe)
# Free the large frames; run_cv_model reloads features via its data_key.
del train_fe
del test_fe
gc.collect()
print('~~~~~~~~~~~~')
print_step('Run LGB')
# Sparse LGB over the char-level TFIDF union; OOF train predictions and test
# predictions come back attached to `train`/`test`.
train, test = run_cv_model(label='sparse_fe_lgb',
                           data_key='tfidf_char_union',
                           model_fn=runSparseLGB,
                           train=train,
                           test=test,
                           kf=kf)
# Historical CV results for this configuration:
# toxic CV scores : [0.9826496062603199, 0.9830212932736853, 0.9815062563553301, 0.98022789149499, 0.981731541721145]
# toxic mean CV : 0.9818273178210941
# severe_toxic CV scores : [0.9907376375169112, 0.9888719942493184, 0.9903119467039991, 0.9922809301301098, 0.9887765464258907]
# severe_toxic mean CV : 0.9901958110052458
# obscene CV scores : [0.9933673973796135, 0.993919978856799, 0.9926754787135739, 0.9927263904855579, 0.9933408309332551]
# obscene mean CV : 0.9932060152737598
# threat CV scores : [0.9893472977361535, 0.9912972922362948, 0.9904282818441812, 0.99134220616599, 0.9881882482937496]
# threat mean CV : 0.9901206652552738
# insult CV scores : [0.9832124677272037, 0.9835326755212629, 0.9839356436291075, 0.986883748038697, 0.9858095196238779]
# insult mean CV : 0.9846748109080299
# identity_hate CV scores : [0.9843095304682539, 0.9885634545571751, 0.981675404744786, 0.9885268357417188, 0.988652610966542]
# identity_hate mean CV : 0.9863455672956952
# ('sparse_fe_lgb overall : ', 0.987728364593183)
del train_ del test_ del test_ohe del test_ohe2 del train_ohe del train_ohe2 gc.collect() print_step('Caching') save_in_cache('deep_text_feats2', train, test) else: train, test = load_cache('deep_text_feats2') print('~~~~~~~~~~~~') print_step('Run LGB') results = run_cv_model(train, test, target, runLGB, params, rmse, 'deep_lgb2') import pdb pdb.set_trace() print('~~~~~~~~~~') print_step('Cache') save_in_cache('deep_lgb2', pd.DataFrame({'deep_lgb2': results['train']}), pd.DataFrame({'deep_lgb2': results['test']})) print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') print_step('Prepping submission file') submission = pd.DataFrame() submission['item_id'] = test_id submission['deal_probability'] = results['test'].clip(0.0, 1.0) submission.to_csv('submit/submit_deep_lgb2.csv', index=False) print_step('Done!')
# Feature frames have been cached; release them before modeling.
del train_fe
del test_fe
gc.collect()
print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)
print('~~~~~~~~~~~~')
print_step('Run LGB')
# NOTE(review): this prints the columns of the *old* `train` immediately
# before it is overwritten by get_data() -- confirm which frame was meant.
print(train.columns.values)
train, test = get_data()
train, test = run_cv_model(label='fe_lgb',
                           data_key='lgb_fe_with_embeddings_and_svd',
                           model_fn=runLGB,
                           train=train,
                           test=test,
                           kf=kf)
# FIX(review): removed leftover `import pdb; pdb.set_trace()` debug
# breakpoint -- it halted the pipeline unconditionally after CV.
print('~~~~~~~~~~')
print_step('Cache')
save_in_cache('lvl1_fe_lgb', train, test)
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['id'] = test['id']
submission['toxic'] = test['fe_lgb_toxic']
# NOTE(review): chunk is truncated here; the remaining submission columns
# continue past this view.
submission['severe_toxic'] = test['fe_lgb_severe_toxic']
print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
# Capture the target and ids before the frames are swapped out below.
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)
print('~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 2/2')
# Replace the frames with the prebuilt FM feature matrices; `target` and the
# ids captured above are presumably row-aligned with them -- TODO confirm.
train, test = load_cache('complete_fm_data')
print('~~~~~~~~~~~~~~')
print_step('Run TFFM2')
results = run_cv_model(train, test, target, runTFFM, params2, rmse, 'tffm2')
print_step('Cache')
save_in_cache('tffm2',
              pd.DataFrame({'tffm2': results['train']}),
              pd.DataFrame({'tffm2': results['test']}))
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['item_id'] = test_id
# Clip deal probabilities into the valid [0, 1] range.
submission['deal_probability'] = results['test'].clip(0.0, 1.0)
submission.to_csv('submit/submit_tffm2.csv', index=False)
print_step('Done!')
# Historical CV results for this configuration:
# [2018-06-06 05:26:11.306486] tffm2 cv scores : [0.22541019023816908, 0.22463933727489538, 0.22452885067937228, 0.2245032642720666, 0.22523463698732962]
# [2018-06-06 05:26:11.306561] tffm2 mean cv score : 0.22486325589036663
# [2018-06-06 05:26:11.306664] tffm2 std cv score : 0.0003817385117403105
def run_ridge_on_regioncat(regioncat):
    """Fit per-(region x category) ridge models over several TFIDF text views
    and cache the resulting OOF/test ridge features.

    Builds three sparse views (title+category+params word n-grams,
    title+description word n-grams, description char n-grams), fits a ridge on
    each plus one on their horizontal union, keeps only the generated ridge
    columns and the item ids, then caches under 'regioncat_ridges_<regioncat>'.
    Skips all work if that cache entry already exists. Returns True.
    """
    if not is_in_cache('regioncat_ridges_' + regioncat):
        print_step(regioncat + ' > Subsetting')
        train_c = train[train['region_X_cat'] == regioncat].copy()
        test_c = test[test['region_X_cat'] == regioncat].copy()
        print(train_c.shape)
        print(test_c.shape)
        # Separate the label and ids before they are dropped from features.
        target = train_c['deal_probability'].values
        train_id = train_c['item_id']
        test_id = test_c['item_id']
        train_c.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
        test_c.drop(['item_id'], axis=1, inplace=True)
        print_step(regioncat + ' > Titlecat TFIDF 1/3')
        # Concatenate category + params + title into one text field.
        train_c['titlecat'] = train_c['category_name'].fillna(
            '') + ' ' + train_c['param_1'].fillna('') + ' ' + train_c[
            'param_2'].fillna('') + ' ' + train_c['param_3'].fillna(
            '') + ' ' + train_c['title'].fillna('')
        test_c['titlecat'] = test_c['category_name'].fillna('') + ' ' + test_c[
            'param_1'].fillna('') + ' ' + test_c['param_2'].fillna(
            '') + ' ' + test_c['param_3'].fillna(
            '') + ' ' + test_c['title'].fillna('')
        print_step(regioncat + ' > Titlecat TFIDF 2/3')
        # Russian text corpus, hence the KOI8-R encoding.
        tfidf = TfidfVectorizer(ngram_range=(1, 2),
                                max_features=100000,
                                min_df=2,
                                max_df=0.8,
                                binary=True,
                                encoding='KOI8-R')
        tfidf_train = tfidf.fit_transform(train_c['titlecat'])
        print(tfidf_train.shape)
        print_step(regioncat + ' > Titlecat TFIDF 3/3')
        tfidf_test = tfidf.transform(test_c['titlecat'])
        print(tfidf_test.shape)
        print_step(regioncat + ' > Titlecat TFIDF Ridge')
        results = run_cv_model(tfidf_train, tfidf_test, target, runRidge,
                               {'alpha': 5.0}, rmse,
                               regioncat + '-titlecat-ridge')
        train_c['regioncat_title_ridge'] = results['train']
        test_c['regioncat_title_ridge'] = results['test']
        print_step(regioncat + ' > Description TFIDF 1/3')
        # Second view: title + description text.
        train_c['desc'] = train_c['title'].fillna(
            '') + ' ' + train_c['description'].fillna('')
        test_c['desc'] = test_c['title'].fillna(
            '') + ' ' + test_c['description'].fillna('')
        print_step(regioncat + ' > Description TFIDF 2/3')
        tfidf = TfidfVectorizer(ngram_range=(1, 2),
                                max_features=100000,
                                min_df=2,
                                max_df=0.8,
                                binary=True,
                                encoding='KOI8-R')
        tfidf_train2 = tfidf.fit_transform(train_c['desc'].fillna(''))
        print(tfidf_train2.shape)
        print_step(regioncat + ' > Description TFIDF 3/3')
        tfidf_test2 = tfidf.transform(test_c['desc'].fillna(''))
        print(tfidf_test2.shape)
        results = run_cv_model(tfidf_train2, tfidf_test2, target, runRidge,
                               {'alpha': 5.0}, rmse, regioncat + '-desc-ridge')
        train_c['regioncat_desc_ridge'] = results['train']
        test_c['regioncat_desc_ridge'] = results['test']
        print_step(regioncat + ' > Text Char TFIDF 1/2')
        # Using char n-grams ends up being surprisingly good, HT https://www.kaggle.com/c/avito-demand-prediction/discussion/56061#325063
        tfidf = TfidfVectorizer(ngram_range=(2, 5),
                                max_features=100000,
                                min_df=2,
                                max_df=0.8,
                                binary=True,
                                analyzer='char',
                                encoding='KOI8-R')
        tfidf_train3 = tfidf.fit_transform(train_c['desc'])
        print(tfidf_train3.shape)
        print_step(regioncat + ' > Text Char TFIDF 2/2')
        tfidf_test3 = tfidf.transform(test_c['desc'])
        print(tfidf_test3.shape)
        results = run_cv_model(tfidf_train3, tfidf_test3, target, runRidge,
                               {'alpha': 5.0}, rmse,
                               regioncat + '-desc-char-ridge')
        train_c['regioncat_desc_char_ridge'] = results['train']
        test_c['regioncat_desc_char_ridge'] = results['test']
        print_step('Merging 1/2')
        # Stack all three sparse views for one combined "all text" ridge.
        train_c2 = hstack((tfidf_train, tfidf_train2, tfidf_train3)).tocsr()
        print_step('Merging 2/2')
        test_c2 = hstack((tfidf_test, tfidf_test2, tfidf_test3)).tocsr()
        print(train_c2.shape)
        print(test_c2.shape)
        print('~~~~~~~~~~~~~~~~~~~~~~~~')
        print_step('Run Full Text Ridge')
        results = run_cv_model(train_c2, test_c2, target, runRidge,
                               {'alpha': 8.0}, rmse, regioncat + '-text-ridge')
        train_c['regioncat_all_text_ridge'] = results['train']
        test_c['regioncat_all_text_ridge'] = results['test']
        print('~~~~~~~~~~~~~~~~~~~~~~')
        print_step(regioncat + ' > Dropping')
        # Keep only the generated ridge columns (ids are restored just below).
        train_c.drop([c for c in train_c.columns if 'ridge' not in c],
                     axis=1, inplace=True)
        test_c.drop([c for c in test_c.columns if 'ridge' not in c],
                    axis=1, inplace=True)
        train_c['item_id'] = train_id
        test_c['item_id'] = test_id
        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
        print_step(regioncat + ' > Saving in Cache')
        save_in_cache('regioncat_ridges_' + regioncat, train_c, test_c)
    else:
        print(regioncat + ' already in cache! Skipping...')
    return True
else: train_, test_ = load_cache('convai_with_fe') print('~~~~~~~~~~~~~~~~~~~~~~~~') print_step('Making KFold for CV') kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017) print('~~~~~~~~~~~~~~~~~~~') print_step('Run ConvAI LGB') print(train_.columns.values) train, test = get_data() train_, test_ = run_cv_model(label='convai_lgb', data_key='convai_with_fe', model_fn=runLGB, train=train, test=test, kf=kf) import pdb pdb.set_trace() print('~~~~~~~~~~~~~~~~~~') print_step('Cache Level 2') save_in_cache('lvl2_convai_lgb', train, test) print_step('Done!') print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') print_step('Prepping submission file') submission = pd.DataFrame() submission['id'] = test['id'] submission['toxic'] = test_['convai_lgb_toxic']
test_features.shape) print_step('Saving') save_in_cache('fm_data', train_features, test_features) del train_features del test_features gc.collect() print('~~~~~~~~~~~~~~~~~~~~~~~~') print_step('Making KFold for CV') kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017) print('~~~~~~~~~~~') print_step('Run Ridge') train, test = run_cv_model(label='ridge', data_key='fm_data', model_fn=runRidge, train=train, test=test, kf=kf) import pdb pdb.set_trace() print('~~~~~~~~~~~~~~~~~~') print_step('Cache Level 1') save_in_cache('lvl1_ridge', train, test) # toxic CV scores : [0.9809843104555062, 0.9818160662139189, 0.9810818473334081, 0.9785535369240607, 0.9805031449391929] # toxic mean CV : 0.9805877811732173 # severe_toxic CV scores : [0.9910152906145414, 0.989781576288062, 0.9905538900693087, 0.9910741898469113, 0.9895167135389562] # severe_toxic mean CV : 0.990388332071556 # obscene CV scores : [0.9928806730585695, 0.99347239882342, 0.9933801187817354, 0.9926410905084246, 0.9931233899573142] # obscene mean CV : 0.9930995342258928 # threat CV scores : [0.9898491598311281, 0.9926748758603351, 0.9905821469352692, 0.9904258099519968, 0.9854784282977856] # threat mean CV : 0.9898020841753029
print_step('Saving') save_in_cache('lvl2_all', train_, test_) else: train_, test_ = load_cache('lvl2_all') print('~~~~~~~~~~~~~~~~~~~~~~~~') print_step('Making KFold for CV') kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017) print('~~~~~~~~~~~~~~~~~~~~') print_step('Run Level 2 LGB') print(train_.columns.values) train, test = get_data() train_, test_ = run_cv_model(label='lvl2_all_lgb', data_key='lvl2_all', model_fn=runLGB, train=train, test=test, kf=kf) import pdb pdb.set_trace() print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') print_step('Prepping submission file') submission = pd.DataFrame() submission['id'] = test_['id'] submission['toxic'] = test_['lvl2_all_lgb_toxic'] submission['severe_toxic'] = test_['lvl2_all_lgb_severe_toxic'] submission['obscene'] = test_['lvl2_all_lgb_obscene'] submission['threat'] = test_['lvl2_all_lgb_threat'] submission['insult'] = test_['lvl2_all_lgb_insult'] submission['identity_hate'] = test_['lvl2_all_lgb_identity_hate']
print_step('Titlecat SVD 1/4')
# Dense SVD features from the titlecat TFIDF matrix (NCOMP components).
svd = TruncatedSVD(n_components=NCOMP, algorithm='arpack')
svd.fit(tfidf_train)
print_step('Titlecat SVD 2/4')
train_svd = pd.DataFrame(svd.transform(tfidf_train))
print_step('Titlecat SVD 3/4')
test_svd = pd.DataFrame(svd.transform(tfidf_test))
print_step('Titlecat SVD 4/4')
train_svd.columns = ['svd_titlecat_' + str(i + 1) for i in range(NCOMP)]
test_svd.columns = ['svd_titlecat_' + str(i + 1) for i in range(NCOMP)]
train = pd.concat([train, train_svd], axis=1)
test = pd.concat([test, test_svd], axis=1)
print_step('Titlecat TFIDF Ridge')
# Ridge OOF/test predictions on the raw sparse TFIDF become a feature column.
results = run_cv_model(tfidf_train, tfidf_test, target, runRidge,
                       {'alpha': 5.0}, rmse, 'titlecat-ridge')
train['title_ridge'] = results['train']
test['title_ridge'] = results['test']
print('~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Description TFIDF 1/3')
# Second text view: title + description.
train['desc'] = train['title'].fillna(
    '') + ' ' + train['description'].fillna('')
test['desc'] = test['title'].fillna('') + ' ' + test['description'].fillna(
    '')
print_step('Description TFIDF 2/3')
# NOTE(review): this TfidfVectorizer(...) call is truncated at the chunk
# boundary; it continues in the next (unseen) part of the file.
tfidf = TfidfVectorizer(ngram_range=(1, 2),
                        max_features=300000,
                        min_df=2,
                        max_df=0.8,
                        binary=True,
test_features.shape) print_step('Saving') save_in_cache('fm_data', train_features, test_features) del train_features del test_features gc.collect() print('~~~~~~~~~~~~~~~~~~~~~~~~') print_step('Making KFold for CV') kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017) print('~~~~~~~~~~~') print_step('Run FM') train, test = run_cv_model(label='fm', data_key='fm_data', model_fn=runFM, train=train, test=test, kf=kf) import pdb pdb.set_trace() print('~~~~~~~~~~~~~~~~~~') print_step('Cache Level 1') save_in_cache('lvl1_fm', train, test) # toxic CV scores : [0.9809843104555062, 0.9818160662139189, 0.9810818473334081, 0.9785535369240607, 0.9805031449391929] # toxic mean CV : 0.9805877811732173 # severe_toxic CV scores : [0.9910152906145414, 0.989781576288062, 0.9905538900693087, 0.9910741898469113, 0.9895167135389562] # severe_toxic mean CV : 0.990388332071556 # obscene CV scores : [0.9928806730585695, 0.99347239882342, 0.9933801187817354, 0.9926410905084246, 0.9931233899573142] # obscene mean CV : 0.9930995342258928 # threat CV scores : [0.9898491598311281, 0.9926748758603351, 0.9905821469352692, 0.9904258099519968, 0.9854784282977856] # threat mean CV : 0.9898020841753029
# Assemble the sparse word+char union for the test half and cache both sides.
del post_trainc
gc.collect()
post_test = csr_matrix(hstack([post_testw, post_testc]))
del post_testw
del post_testc
gc.collect()
save_in_cache('tfidf_char_union_extra_data_toxic', post_train, post_test)
print('~~~~~~~~~~~~~~~~~~')
print_step('Run Attack LR')
# SAG logistic regression over the attack-annotated extra data; five binary
# attack-type targets.
train, test = run_cv_model(label='extra_data_attack_lr',
                           data_key='tfidf_char_union_extra_data_attack',
                           train_key='extra_data_attack',
                           model_fn=runSagLR,
                           train=attack,
                           test=test,
                           targets=[
                               'attack', 'quoting_attack', 'recipient_attack',
                               'third_party_attack', 'other_attack'
                           ],
                           kf=kf)
print('~~~~~~~~~~~~~~~~~~~~~')
print_step('Run Attack Ridge')
# NOTE(review): this second run reuses label='extra_data_attack_lr' even
# though it fits a Ridge -- confirm the label (its outputs may collide with
# the LR run's).  The call is truncated at the chunk boundary and continues
# in the next (unseen) part of the file.
train, test = run_cv_model(label='extra_data_attack_lr',
                           data_key='tfidf_char_union_extra_data_attack',
                           train_key='extra_data_attack',
                           model_fn=runRidge,
                           train=attack,
                           test=test,
                           targets=['attack_score'],
numeric_cols=train_.columns.values.tolist(), dummy_cols=[]) print_step('Importing Data 10/10 3/4') train = hstack((train, train_)).tocsr() print(train.shape) print_step('Importing Data 10/10 4/4') test = hstack((test, test_)).tocsr() print(test.shape) print_step('Caching') save_in_cache('complete_fm_data', train, test) else: train, test = load_cache('complete_fm_data') print_step('Run Complete FM') results = run_cv_model(train, test, target, runFM, params, rmse, 'complete-fm') import pdb pdb.set_trace() print('~~~~~~~~~~') print_step('Cache') save_in_cache('complete_fm', pd.DataFrame({'complete_fm': results['train']}), pd.DataFrame({'complete_fm': results['test']})) print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') print_step('Prepping submission file') submission = pd.DataFrame() submission['item_id'] = test_id submission['deal_probability'] = results['test'].clip(0.0, 1.0) submission.to_csv('submit/submit_fm.csv', index=False) print_step('Done!')
# Reduce the dense embedding features to NCOMP SVD components and append
# them to the blend-level feature frames.
svd.fit(train_embeddings_df)
print_step('Embedding SVD 2/4')
train_svd = pd.DataFrame(svd.transform(train_embeddings_df))
print_step('Embedding SVD 3/4')
test_svd = pd.DataFrame(svd.transform(test_embeddings_df))
print_step('Embedding SVD 4/4')
train_svd.columns = ['svd_embed_' + str(i + 1) for i in range(NCOMP)]
test_svd.columns = ['svd_embed_' + str(i + 1) for i in range(NCOMP)]
train_fe = pd.concat([train_fe, train_svd], axis=1)
test_fe = pd.concat([test_fe, test_svd], axis=1)
print('~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Run Flat Blender LGB')
print(train_fe.shape)
print(test_fe.shape)
results = run_cv_model(train_fe, test_fe, target, runLGB, params, rmse,
                       'flat_blender_lgb')
# FIX(review): removed leftover `import pdb; pdb.set_trace()` debug
# breakpoint -- it halted the pipeline unconditionally after CV.
print('~~~~~~~~~~')
print_step('Cache')
save_in_cache('flat_blender_lgb',
              pd.DataFrame({'flat_blender_lgb': results['train']}),
              pd.DataFrame({'flat_blender_lgb': results['test']}))
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['item_id'] = test_id
submission['deal_probability'] = results['test'].clip(0.0, 1.0)
submission.to_csv('submit/submit_flat_blender_lgb.csv', index=False)
# Build (or reuse) the word-level TFIDF over the cleaned text, then free the
# cleaned frames either way.
if not is_in_cache('tfidf_word_cleaned'):
    TFIDF_PARAMS_WORD.update({'train': train_cleaned, 'test': test_cleaned})
    post_train_cleaned, post_test_cleaned = run_tfidf(**TFIDF_PARAMS_WORD)
    save_in_cache('tfidf_word_cleaned', post_train_cleaned, post_test_cleaned)
    del post_train_cleaned
    del post_test_cleaned
    gc.collect()
del train_cleaned
del test_cleaned
gc.collect()
print('~~~~~~~~~~~~')
print_step('Run LGB')
# NOTE(review): the model reads data_key='tfidf_word', not the freshly built
# 'tfidf_word_cleaned' -- confirm which matrix was intended.
train, test = run_cv_model(label='tfidf_word_sparse_lgb',
                           data_key='tfidf_word',
                           model_fn=runSparseLGB,
                           train=train,
                           test=test,
                           kf=kf)
# Historical CV results for this configuration:
# toxic CV scores : [0.9702901662371838, 0.9696223100754018, 0.9678153536674818, 0.9676149003746513, 0.9711870679257228]
# toxic mean CV : 0.9693059596560882
# severe_toxic CV scores : [0.9801895978261603, 0.9726377779905455, 0.982170654159893, 0.9874727212204224, 0.9781001815195353]
# severe_toxic mean CV : 0.9801141865433113
# obscene CV scores : [0.9830528922626651, 0.9837422487164804, 0.9814396979867874, 0.9815393581964723, 0.9841164068501664]
# obscene mean CV : 0.9827781208025144
# threat CV scores : [0.9818704364268729, 0.9649259614276585, 0.9764352339273181, 0.9867757740570546, 0.9802860678000866]
# threat mean CV : 0.9780586947277982
# insult CV scores : [0.9750866289607637, 0.9725628946207349, 0.9733409509578796, 0.9770282683977928, 0.9761217897403539]
# insult mean CV : 0.974828106535505
# identity_hate CV scores : [0.9657562463199535, 0.9649726686386453, 0.9607666236203398, 0.968004717808433, 0.9670525723564731]
# identity_hate mean CV : 0.965310565748769
# ('tfidf_word_sparse_lgb overall : ', 0.9750659390023312)
print_step('Pre-flight checks')
# Dump every feature column's train/test values for manual inspection
# before the run.
for col in train_fe.columns:
    print('##')
    print(col)
    print('-')
    print(train_fe[col].values)
    print('-')
    print(test_fe[col].values)
    print('-')
    print('-')
print('~~~~~~~~~~~~')
print_step('Run LGB')
print(train_fe.shape)
print(test_fe.shape)
results = run_cv_model(train_fe, test_fe, target, runLGB, params, rmse,
                       'base_lgb')
# FIX(review): removed leftover `import pdb; pdb.set_trace()` debug
# breakpoint -- it halted the pipeline unconditionally after CV.
print('~~~~~~~~~~')
print_step('Cache')
save_in_cache('base_lgb',
              pd.DataFrame({'base_lgb': results['train']}),
              pd.DataFrame({'base_lgb': results['test']}))
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['item_id'] = test_id
submission['deal_probability'] = results['test'].clip(0.0, 1.0)
submission.to_csv('submit/submit_base_lgb.csv', index=False)
# Assemble the sparse word+char union for the test half and cache both sides.
del post_trainc
gc.collect()
post_test = csr_matrix(hstack([post_testw, post_testc]))
del post_testw
del post_testc
gc.collect()
save_in_cache('tfidf_char_union', post_train, post_test)
print('~~~~~~~~~~~~~~~~~~~~')
print_step('Run Word LR Sag')
# Logistic regression (sag solver) on word TFIDF; trains on the extra-label
# set, which adds a 'non_toxic' target to the six standard labels.
train, test = run_cv_model(label='tfidf_word_lr_sag',
                           data_key='tfidf_word',
                           model_fn=runSagLR,
                           train=train,
                           test=test,
                           train_key='extra_label',
                           targets=[
                               'toxic', 'severe_toxic', 'obscene', 'insult',
                               'threat', 'identity_hate', 'non_toxic'
                           ],
                           kf=kf)
# Historical CV results for this configuration:
# toxic CV scores : [0.9757770727127603, 0.9754469511129109, 0.9748022104865504, 0.9727014869411932, 0.9753668774625703]
# toxic mean CV : 0.9748189197431969
# severe_toxic CV scores : [0.9822782217978469, 0.9809759688772627, 0.982837995178992, 0.9888689680969123, 0.9832500976058173]
# severe_toxic mean CV : 0.9836422503113663
# obscene CV scores : [0.9863031895889313, 0.9859709183099142, 0.986069037576627, 0.984788656923766, 0.9868893669717265]
# obscene mean CV : 0.986004233874193
# threat CV scores : [0.9892926265229371, 0.9904405583142148, 0.986893640592099, 0.9938866116828939, 0.982301808641914]
# threat mean CV : 0.9885630491508117
# insult CV scores : [0.9778670021983397, 0.9786535248142688, 0.9773924913032992, 0.9796894980895773, 0.9802901280493739]
# insult mean CV : 0.9787785288909717
# Columns LGB should treat as categoricals; everything else is cast to float.
cat_cols = ['region', 'city', 'parent_category_name', 'category_name',
            'cat_bin', 'param_1', 'param_2', 'param_3', 'user_type',
            'image_top_1', 'day_of_week']
for col in train_.columns:
    print(col)
    if col in cat_cols:
        train_[col] = train_[col].astype('category')
        test_[col] = test_[col].astype('category')
    else:
        train_[col] = train_[col].astype(np.float64)
        test_[col] = test_[col].astype(np.float64)
print('~~~~~~~~~~~~')
print_step('Run LGB')
print(train_.shape)
print(test_.shape)
results = run_cv_model(train_, test_, target, runLGB, params, rmse,
                       'lgb_blender')
# FIX(review): removed leftover `import pdb; pdb.set_trace()` debug
# breakpoint -- it halted the pipeline unconditionally after CV.
print('~~~~~~~~~~')
print_step('Cache')
save_in_cache('lgb_blender',
              pd.DataFrame({'lgb_blender': results['train']}),
              pd.DataFrame({'lgb_blender': results['test']}))
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['item_id'] = test_id
submission['deal_probability'] = results['test'].clip(0.0, 1.0)
submission.to_csv('submit/submit_lgb_blender.csv', index=False)
print_step('Done!')