    pprint(list(zip(classes, cv_score)))
    cv_scores.append(cv_score)
    i += 1

print_step('All folds done!')
print('CV scores')
pprint(list(zip(classes, np.mean(cv_scores, axis=0))))
mean_cv_score = np.mean(np.mean(cv_scores, axis=0))
print('mean cv score : ' + str(mean_cv_score))
# Test predictions were summed across the 5 folds; average them.
pred_full_test = pred_full_test / 5.
for k, classx in enumerate(classes):
    train_df['gru128_' + classx] = pred_train[:, k]
    test_df['gru128_' + classx] = pred_full_test[:, k]

print('~~~~~~~~~~~~~~~~~~')
print_step('Cache Level 1')
save_in_cache('lvl1_gru128', train_df, test_df)
print_step('Done!')

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['id'] = test_df['id']
submission['toxic'] = test_df['gru128_toxic']
submission['severe_toxic'] = test_df['gru128_severe_toxic']
submission['obscene'] = test_df['gru128_obscene']
submission['threat'] = test_df['gru128_threat']
submission['insult'] = test_df['gru128_insult']
submission['identity_hate'] = test_df['gru128_identity_hate']
submission.to_csv('submit/submit_lvl1_gru128.csv', index=False)
print_step('Done')
# ('toxic', 0.9833006646207911),
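# The fold loop that fills `pred_train`, `pred_full_test`, `cv_scores`, and
# `i` sits above this excerpt. A minimal sketch of the 5-fold out-of-fold
# pattern these script tails assume -- `build_model`, `x_train`, `x_test`,
# and `y_train` are illustrative assumptions, not the original pipeline:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

kf = KFold(n_splits=5, shuffle=True, random_state=2017)
pred_train = np.zeros((x_train.shape[0], len(classes)))
pred_full_test = np.zeros((x_test.shape[0], len(classes)))
cv_scores = []
i = 1
for dev_index, val_index in kf.split(x_train):
    dev_X, val_X = x_train[dev_index], x_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]
    model = build_model()  # assumed Keras-style model factory
    model.fit(dev_X, dev_y, validation_data=(val_X, val_y))
    # Out-of-fold predictions land in the validation rows; test predictions
    # are summed here and divided by 5 in the tail above.
    pred_train[val_index, :] = model.predict(val_X)
    pred_full_test += model.predict(x_test)
    cv_score = [roc_auc_score(val_y[:, k], pred_train[val_index, k])
                for k in range(len(classes))]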
    pprint(list(zip(classes, cv_score)))
    cv_scores.append(cv_score)
    i += 1

print_step('All folds done!')
print('CV scores')
pprint(list(zip(classes, np.mean(cv_scores, axis=0))))
mean_cv_score = np.mean(np.mean(cv_scores, axis=0))
print('mean cv score : ' + str(mean_cv_score))
pred_full_test = pred_full_test / 5.
for k, classx in enumerate(classes):
    train_df['2dconv_' + classx] = pred_train[:, k]
    test_df['2dconv_' + classx] = pred_full_test[:, k]

print('~~~~~~~~~~~~~~~~~~')
print_step('Cache Level 1')
save_in_cache('lvl1_2dconv', train_df, test_df)
print_step('Done!')

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['id'] = test_df['id']
submission['toxic'] = test_df['2dconv_toxic']
submission['severe_toxic'] = test_df['2dconv_severe_toxic']
submission['obscene'] = test_df['2dconv_obscene']
submission['threat'] = test_df['2dconv_threat']
submission['insult'] = test_df['2dconv_insult']
submission['identity_hate'] = test_df['2dconv_identity_hate']
submission.to_csv('submit/submit_lvl1_2dconv.csv', index=False)
print_step('Done')
    del train_fe
    del test_fe
    del train_img
    del test_img
    del train_active
    del test_active
    del train_
    del test_
    del test_ohe
    del test_ohe2
    del train_ohe
    del train_ohe2
    gc.collect()
    print_step('Caching')
    save_in_cache('deep_text_feats4', train, test)
else:
    train, test = load_cache('deep_text_feats4')

print('~~~~~~~~~~~~')
print_step('Run LGB')
results = run_cv_model(train, test, target, runLGB, params, rmse, 'deep_lgb4')
import pdb
pdb.set_trace()

print('~~~~~~~~~~')
print_step('Cache')
save_in_cache('deep_lgb4',
              pd.DataFrame({'deep_lgb4': results['train']}),
              pd.DataFrame({'deep_lgb4': results['test']}))
    '') + ' ' + test['title'].fillna('')

if not is_in_cache('titlecat_tfidf'):
    print_step('Titlecat TFIDF 2/3')
    tfidf = TfidfVectorizer(ngram_range=(1, 2),
                            max_features=300000,
                            min_df=2,
                            max_df=0.8,
                            binary=True,
                            encoding='KOI8-R')
    tfidf_train = tfidf.fit_transform(train['titlecat'])
    print(tfidf_train.shape)
    print_step('Titlecat TFIDF 3/3')
    tfidf_test = tfidf.transform(test['titlecat'])
    print(tfidf_test.shape)
    print_step('Saving to cache...')
    save_in_cache('titlecat_tfidf', tfidf_train, tfidf_test)
else:
    print_step('Loading from cache...')
    tfidf_train, tfidf_test = load_cache('titlecat_tfidf')

print_step('Titlecat Stats 1/6')
train['titlecat_tfidf_sum'] = tfidf_train.sum(axis=1)
print_step('Titlecat Stats 2/6')
train['titlecat_tfidf_mean'] = tfidf_train.mean(axis=1)
print_step('Titlecat Stats 3/6')
train['titlecat_tfidf_nnz'] = tfidf_train.getnnz(axis=1)
print_step('Titlecat Stats 4/6')
test['titlecat_tfidf_sum'] = tfidf_test.sum(axis=1)
print_step('Titlecat Stats 5/6')
test['titlecat_tfidf_mean'] = tfidf_test.mean(axis=1)
print_step('Titlecat Stats 6/6')
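# A toy illustration (not part of the pipeline) of the sparse row statistics
# used above -- per-document sum, mean, and nonzero count on a CSR matrix:
import numpy as np
from scipy.sparse import csr_matrix

m = csr_matrix(np.array([[0.0, 0.5, 0.5],
                         [1.0, 0.0, 0.0]]))
row_sum = np.asarray(m.sum(axis=1)).ravel()    # array([1., 1.])
row_mean = np.asarray(m.mean(axis=1)).ravel()  # array([0.333..., 0.333...])
row_nnz = m.getnnz(axis=1)                     # array([2, 1])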
    pprint(list(zip(classes, cv_score)))
    cv_scores.append(cv_score)
    i += 1

print_step('All folds done!')
print('CV scores')
pprint(list(zip(classes, np.mean(cv_scores, axis=0))))
mean_cv_score = np.mean(np.mean(cv_scores, axis=0))
print('mean cv score : ' + str(mean_cv_score))
pred_full_test = pred_full_test / 5.
for k, classx in enumerate(classes):
    train_df['cudnngru_' + classx] = pred_train[:, k]
    test_df['cudnngru_' + classx] = pred_full_test[:, k]

print('~~~~~~~~~~~~~~~~~~')
print_step('Cache Level 1')
save_in_cache('lvl1_cudnngru', train_df, test_df)
print_step('Done!')

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['id'] = test_df['id']
submission['toxic'] = test_df['cudnngru_toxic']
submission['severe_toxic'] = test_df['cudnngru_severe_toxic']
submission['obscene'] = test_df['cudnngru_obscene']
submission['threat'] = test_df['cudnngru_threat']
submission['insult'] = test_df['cudnngru_insult']
submission['identity_hate'] = test_df['cudnngru_identity_hate']
submission.to_csv('submit/submit_lvl1_cudnngru.csv', index=False)
print_step('Done')
    lambda x: x.replace('test_jpg/', ''))
print_step('Merging 6/9')
merge['img_path'] = merge['img_path'].apply(
    lambda x: x.replace('train_jpg/', ''))
print_step('Merging 7/9')
merge['img_path'] = merge['img_path'].apply(lambda x: x.replace('.jpg', ''))
print_step('Merging 8/9')
train2 = train.merge(merge, left_on='image', right_on='img_path', how='left')
print_step('Merging 9/9')
test2 = test.merge(merge, left_on='image', right_on='img_path', how='left')

print_step('Dropping 1/2')
# Drop every column that did not come from the image-feature merge (keeping
# the target and item_id), plus the raw image moments.
drops = list(
    set(train2.columns.values) - set(merge.columns.values) -
    {'deal_probability', 'item_id'})
drops += [
    'img_aspect_ratio', 'img_moment_m11', 'img_moment_mu21',
    'img_moment_mu30', 'img_moment_nu02', 'img_moment_nu12',
    'img_moment_m00', 'img_moment_m10', 'img_moment_mu03',
    'img_moment_mu12', 'img_moment_m03', 'img_moment_m12',
    'img_moment_mu11', 'img_moment_nu11', 'img_moment_mu20',
    'img_moment_m20', 'img_moment_nu30', 'img_moment_m01',
    'img_moment_m30', 'img_moment_nu03', 'img_moment_mu02',
    'img_moment_m21', 'img_moment_nu21', 'img_moment_m02',
    'img_moment_nu20'
]
train2.drop(drops, axis=1, inplace=True)
print_step('Dropping 2/2')
test2.drop(drops, axis=1, inplace=True)
print_step('Saving...')
save_in_cache('img_data2', train2, test2)
        test_[col] = test_[col].astype('category')
    else:
        train_[col] = train_[col].astype(np.float64)
        test_[col] = test_[col].astype(np.float64)

print('~~~~~~~~~~~~')
print_step('Run LGB')
print(train_.shape)
print(test_.shape)
results = run_cv_model(train_, test_, target, runLGB, params, rmse,
                       'lgb_blender')
import pdb
pdb.set_trace()

print('~~~~~~~~~~')
print_step('Cache')
save_in_cache('lgb_blender',
              pd.DataFrame({'lgb_blender': results['train']}),
              pd.DataFrame({'lgb_blender': results['test']}))

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['item_id'] = test_id
submission['deal_probability'] = results['test'].clip(0.0, 1.0)
submission.to_csv('submit/submit_lgb_blender.csv', index=False)
print_step('Done!')

print('~~~~~~~~~~~~~~~~~~~~')
print_step('Run Poisson LGB')
print(train_.shape)
print(test_.shape)
poisson_results = run_cv_model(train_, test_, target, runLGB, poisson_params,
                               rmse, 'possion_lgb_blender')
import pdb
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if not is_in_cache('title_countvec'):
    print('~~~~~~~~~~~~~~~~~~~~')
    print_step('Title CountVec 1/2')
    cv = CountVectorizer(stop_words=stopwords.words('russian'),
                         lowercase=True,
                         min_df=2)
    tfidf_train = cv.fit_transform(train['title'])
    print(tfidf_train.shape)
    print_step('Title CountVec 2/2')
    tfidf_test = cv.transform(test['title'])
    print(tfidf_test.shape)
    print_step('Saving to cache...')
    save_in_cache('title_countvec', tfidf_train, tfidf_test)

if not is_in_cache('deep_text_feats3'):
    print('~~~~~~~~~~~~~~~~~~~~~~~')
    print_step('Importing Data 2/13')
    tfidf_train, tfidf_test = load_cache('title_countvec')
    print_step('Importing Data 3/13')
    tfidf_train2, tfidf_test2 = load_cache('text_tfidf')
    print_step('Importing Data 4/13')
    tfidf_train3, tfidf_test3 = load_cache('text_char_tfidf')
    print_step('Importing Data 5/13')
    train = hstack((tfidf_train2, tfidf_train3)).tocsr()
    print_step('Importing Data 6/13')
    test_embeddings = (test['title'].str.cat(
        [test['description']], sep=' ',
        na_rep='').astype(str).fillna('missing').apply(clean_text).apply(
            text_to_embedding))
    print_step('Embedding 4/5')
    train_embeddings_df = pd.DataFrame(
        train_embeddings.values.tolist(),
        columns=['embed' + str(i) for i in range(EMBED_SIZE)])
    print_step('Embedding 5/5')
    test_embeddings_df = pd.DataFrame(
        test_embeddings.values.tolist(),
        columns=['embed' + str(i) for i in range(EMBED_SIZE)])
    print_step('Caching...')
    save_in_cache('avito_fasttext_300d', train_embeddings_df,
                  test_embeddings_df)
else:
    train_embeddings_df, test_embeddings_df = load_cache('avito_fasttext_300d')

# Row-wise distribution statistics over the 300-d embeddings as dense features.
train_fe['embedding_mean'] = train_embeddings_df.mean(axis=1)
train_fe['embedding_std'] = train_embeddings_df.std(axis=1)
train_fe['embedding_skew'] = skew(train_embeddings_df, axis=1)
train_fe['embedding_kurtosis'] = kurtosis(train_embeddings_df, axis=1)
test_fe['embedding_mean'] = test_embeddings_df.mean(axis=1)
test_fe['embedding_std'] = test_embeddings_df.std(axis=1)
test_fe['embedding_skew'] = skew(test_embeddings_df, axis=1)
test_fe['embedding_kurtosis'] = kurtosis(test_embeddings_df, axis=1)

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 12/19 1/7')
cat_cols = [
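# `text_to_embedding` is defined outside this excerpt. A hedged sketch of the
# mean-pooled fastText lookup it presumably performs; `embeddings_index`
# (a token -> 300-d vector mapping) is an assumption, not the original code:
import numpy as np

EMBED_SIZE = 300

def text_to_embedding(text):
    # Average the vectors of known tokens; zero vector if none are known.
    vectors = [embeddings_index[w] for w in text.split()
               if w in embeddings_index]
    if not vectors:
        return np.zeros(EMBED_SIZE)
    return np.mean(vectors, axis=0)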
pool.join()
pool.terminate()
pool.restart()

print_step('Merging 2/5')
train_dfs = map(lambda x: x[0], dfs)
test_dfs = map(lambda x: x[1], dfs)
print_step('Merging 3/5')
train_df = pd.concat(train_dfs)
test_df = pd.concat(test_dfs)
print_step('Merging 4/5')
train_ridge = train.merge(train_df, on='item_id')
print_step('Merging 5/5')
test_ridge = test.merge(test_df, on='item_id')

print_step('RMSEs')
print(rmse(train_ridge['deal_probability'],
           train_ridge['cat_bin_title_ridge']))
print(rmse(train_ridge['deal_probability'],
           train_ridge['cat_bin_desc_ridge']))
print(rmse(train_ridge['deal_probability'],
           train_ridge['cat_bin_desc_char_ridge']))
print(rmse(train_ridge['deal_probability'],
           train_ridge['cat_bin_all_text_ridge']))
import pdb
pdb.set_trace()

print('~~~~~~~~~~~~~~~')
print_step('Caching...')
save_in_cache('cat_bin_ridges', train_ridge, test_ridge)
def run_ridge_on_cat_bin(cat_bin):
    if not is_in_cache('cat_bin_ridges_' + cat_bin):
        print_step(cat_bin + ' > Subsetting')
        train_c = train[train['cat_bin'] == cat_bin].copy()
        test_c = test[test['cat_bin'] == cat_bin].copy()
        print(train_c.shape)
        print(test_c.shape)
        target = train_c['deal_probability'].values
        train_id = train_c['item_id']
        test_id = test_c['item_id']
        train_c.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
        test_c.drop(['item_id'], axis=1, inplace=True)

        print_step(cat_bin + ' > Titlecat TFIDF 1/3')
        train_c['titlecat'] = (
            train_c['category_name'].fillna('') + ' ' +
            train_c['param_1'].fillna('') + ' ' +
            train_c['param_2'].fillna('') + ' ' +
            train_c['param_3'].fillna('') + ' ' +
            train_c['title'].fillna(''))
        test_c['titlecat'] = (
            test_c['category_name'].fillna('') + ' ' +
            test_c['param_1'].fillna('') + ' ' +
            test_c['param_2'].fillna('') + ' ' +
            test_c['param_3'].fillna('') + ' ' +
            test_c['title'].fillna(''))
        print_step(cat_bin + ' > Titlecat TFIDF 2/3')
        tfidf = TfidfVectorizer(ngram_range=(1, 2),
                                max_features=50000,
                                binary=True,
                                encoding='KOI8-R')
        tfidf_train = tfidf.fit_transform(train_c['titlecat'])
        print(tfidf_train.shape)
        print_step(cat_bin + ' > Titlecat TFIDF 3/3')
        tfidf_test = tfidf.transform(test_c['titlecat'])
        print(tfidf_test.shape)
        print_step(cat_bin + ' > Titlecat TFIDF Ridge')
        results = run_cv_model(tfidf_train, tfidf_test, target, runRidge,
                               {'alpha': 5.0}, rmse,
                               cat_bin + '-titlecat-ridge')
        train_c['cat_bin_title_ridge'] = results['train']
        test_c['cat_bin_title_ridge'] = results['test']

        print_step(cat_bin + ' > Description TFIDF 1/3')
        train_c['desc'] = (train_c['title'].fillna('') + ' ' +
                           train_c['description'].fillna(''))
        test_c['desc'] = (test_c['title'].fillna('') + ' ' +
                          test_c['description'].fillna(''))
        print_step(cat_bin + ' > Description TFIDF 2/3')
        tfidf = TfidfVectorizer(ngram_range=(1, 2),
                                max_features=50000,
                                binary=True,
                                encoding='KOI8-R')
        tfidf_train2 = tfidf.fit_transform(train_c['desc'].fillna(''))
        print(tfidf_train2.shape)
        print_step(cat_bin + ' > Description TFIDF 3/3')
        tfidf_test2 = tfidf.transform(test_c['desc'].fillna(''))
        print(tfidf_test2.shape)
        results = run_cv_model(tfidf_train2, tfidf_test2, target, runRidge,
                               {'alpha': 5.0}, rmse, cat_bin + '-desc-ridge')
        train_c['cat_bin_desc_ridge'] = results['train']
        test_c['cat_bin_desc_ridge'] = results['test']

        print_step(cat_bin + ' > Text Char TFIDF 1/2')
        # Using char n-grams ends up being surprisingly good, HT
        # https://www.kaggle.com/c/avito-demand-prediction/discussion/56061#325063
        tfidf = TfidfVectorizer(ngram_range=(2, 5),
                                max_features=50000,
                                binary=True,
                                analyzer='char',
                                encoding='KOI8-R')
        tfidf_train3 = tfidf.fit_transform(train_c['desc'])
        print(tfidf_train3.shape)
        print_step(cat_bin + ' > Text Char TFIDF 2/2')
        tfidf_test3 = tfidf.transform(test_c['desc'])
        print(tfidf_test3.shape)
        results = run_cv_model(tfidf_train3, tfidf_test3, target, runRidge,
                               {'alpha': 5.0}, rmse,
                               cat_bin + '-desc-char-ridge')
        train_c['cat_bin_desc_char_ridge'] = results['train']
        test_c['cat_bin_desc_char_ridge'] = results['test']

        print_step('Merging 1/2')
        train_c2 = hstack((tfidf_train, tfidf_train2, tfidf_train3)).tocsr()
        print_step('Merging 2/2')
        test_c2 = hstack((tfidf_test, tfidf_test2, tfidf_test3)).tocsr()
        print(train_c2.shape)
        print(test_c2.shape)

        print('~~~~~~~~~~~~~~~~~~~~~~~~')
        print_step('Run Full Text Ridge')
        results = run_cv_model(train_c2, test_c2, target, runRidge,
                               {'alpha': 8.0}, rmse, cat_bin + '-text-ridge')
        train_c['cat_bin_all_text_ridge'] = results['train']
        test_c['cat_bin_all_text_ridge'] = results['test']

        print('~~~~~~~~~~~~~~~~~~~~~~')
        print_step(cat_bin + ' > Dropping')
        train_c.drop([c for c in train_c.columns if 'ridge' not in c],
                     axis=1, inplace=True)
        test_c.drop([c for c in test_c.columns if 'ridge' not in c],
                    axis=1, inplace=True)
        train_c['item_id'] = train_id
        test_c['item_id'] = test_id

        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
        print_step(cat_bin + ' > Saving in Cache')
        save_in_cache('cat_bin_ridges_' + cat_bin, train_c, test_c)
    else:
        print(cat_bin + ' already in cache! Skipping...')
    return True
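# How `run_ridge_on_cat_bin` is dispatched is outside this excerpt. A hedged
# sketch consistent with the pathos pool tail and the per-bin cache reads
# shown elsewhere in this section -- `cat_bins` and the worker count are
# assumptions, not the original driver code:
from pathos.multiprocessing import ProcessingPool as Pool

cat_bins = list(train['cat_bin'].unique())
pool = Pool(4)
pool.map(run_ridge_on_cat_bin, cat_bins)  # populates each per-bin cache
pool.close()
# Each cached entry is a (train_c, test_c) pair keyed by 'cat_bin_ridges_<bin>'.
dfs = [load_cache('cat_bin_ridges_' + cat_bin) for cat_bin in cat_bins]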
]
all_models = [
    'count_logreg', 'bad_word_logreg', 'tfidf_logreg', 'char_vdcnn',
    'glove_gru', 'glove_lstm', 'glove_scnn', 'fasttext_dpcnn', 'fasttext_gru',
    'fasttext_scnn', 'glove_dpcnn', 'word2vec_scnn', 'fasttext_lstm',
    'word2vec_gru', 'word2vec_lstm', 'word2vec_dpcnn'
]

train, test = get_data()
train.drop(['comment_text'], axis=1, inplace=True)
test.drop(['comment_text'], axis=1, inplace=True)

for model in all_models:
    train_ = pd.read_csv(base + model + train_tail).drop(['fold_id'], axis=1)
    # Test predictions come one row per fold; average them by id.
    test_ = (pd.read_csv(base + model + test_tail).groupby('id').mean().drop(
        ['fold_id'], axis=1).reset_index())
    for label in labels:
        train_['neptune_' + model + '_' + label] = train_[label]
        train_.drop(label, axis=1, inplace=True)
        test_['neptune_' + model + '_' + label] = test_[label]
        test_.drop(label, axis=1, inplace=True)
    train = pd.merge(train, train_, on='id')
    test = pd.merge(test, test_, on='id')
    for label in labels:
        print(model + ' ' + label + ' AUC: ' + str(
            roc_auc_score(train[label],
                          train['neptune_' + model + '_' + label])))

print('Saving...')
save_in_cache('neptune_models', train, test)
print('Done')
]).tocsr()
del train_word_features, train_num_features, train_char_features, \
    train_subword_features
gc.collect()
print_step('Merging 2/2')
test_features = hstack([
    test_char_features, test_word_features, test_num_features,
    test_subword_features
]).tocsr()
del test_word_features, test_num_features, test_char_features, \
    test_subword_features
gc.collect()
print('Shapes just to be sure: ', train_features.shape, test_features.shape)
print_step('Saving')
save_in_cache('fm_data', train_features, test_features)
del train_features
del test_features
gc.collect()

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)

print('~~~~~~~~~~~')
print_step('Run Ridge')
train, test = run_cv_model(label='ridge',
                           data_key='fm_data',
                           model_fn=runRidge,
                           train=train,
                           test=test,
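# `runRidge` is defined outside this excerpt. A minimal sketch of the
# model_fn contract that `run_cv_model` appears to expect -- fit on the dev
# fold, predict the validation fold and the test set; the exact signature is
# an assumption, not the original helper:
from sklearn.linear_model import Ridge

def runRidge(train_X, train_y, val_X, val_y, test_X, params):
    model = Ridge(**params)
    model.fit(train_X, train_y)
    pred_val_y = model.predict(val_X)
    pred_test_y = model.predict(test_X)
    return pred_val_y, pred_test_y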
    pprint(list(zip(classes, cv_score)))
    cv_scores.append(cv_score)
    i += 1

print_step('All folds done!')
print('CV scores')
pprint(list(zip(classes, np.mean(cv_scores, axis=0))))
mean_cv_score = np.mean(np.mean(cv_scores, axis=0))
print('mean cv score : ' + str(mean_cv_score))
pred_full_test = pred_full_test / 5.
for k, classx in enumerate(classes):
    train_df['rnncnn_' + classx] = pred_train[:, k]
    test_df['rnncnn_' + classx] = pred_full_test[:, k]

print('~~~~~~~~~~~~~~~~~~')
print_step('Cache Level 1')
save_in_cache('lvl1_rnncnn', train_df, test_df)
print_step('Done!')

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['id'] = test_df['id']
submission['toxic'] = test_df['rnncnn_toxic']
submission['severe_toxic'] = test_df['rnncnn_severe_toxic']
submission['obscene'] = test_df['rnncnn_obscene']
submission['threat'] = test_df['rnncnn_threat']
submission['insult'] = test_df['rnncnn_insult']
submission['identity_hate'] = test_df['rnncnn_identity_hate']
submission.to_csv('submit/submit_lvl1_rnncnn.csv', index=False)
print_step('Done')
# [('toxic', 0.9817250985709102),
]).tocsr()
del train_word_features, train_num_features, train_char_features, \
    train_subword_features
gc.collect()
print_step('Merging 2/2')
test_features = hstack([
    test_char_features, test_word_features, test_num_features,
    test_subword_features
]).tocsr()
del test_word_features, test_num_features, test_char_features, \
    test_subword_features
gc.collect()
print('Shapes just to be sure: ', train_features.shape, test_features.shape)
print_step('Saving')
save_in_cache('fm_data', train_features, test_features)
del train_features
del test_features
gc.collect()

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)

print('~~~~~~~~~~~')
print_step('Run FM')
train, test = run_cv_model(label='fm',
                           data_key='fm_data',
                           model_fn=runFM,
                           train=train,
                           test=test,
    wordbatch_train = wb.fit_transform(train['titlecat'])
    print(wordbatch_train.shape)
    print_step('Titlecat Wordbatch 3/5')
    wordbatch_test = wb.transform(test['titlecat'])
    print(wordbatch_test.shape)
    del wb
    gc.collect()
    print_step('Titlecat Wordbatch 4/5')
    # Keep only hashed features that appear in more than 3 training rows.
    mask = np.where(wordbatch_train.getnnz(axis=0) > 3)[0]
    wordbatch_train = wordbatch_train[:, mask]
    print(wordbatch_train.shape)
    print_step('Titlecat Wordbatch 5/5')
    wordbatch_test = wordbatch_test[:, mask]
    print(wordbatch_test.shape)
    print_step('Saving to cache...')
    save_in_cache('titlecat_wordbatch', wordbatch_train, wordbatch_test)
else:
    print_step('Loading from cache...')
    wordbatch_train, wordbatch_test = load_cache('titlecat_wordbatch')

print('~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Text Wordbatch 1/5')
train['desc'] = train['title'].fillna('') + ' ' + train['description'].fillna('')
test['desc'] = test['title'].fillna('') + ' ' + test['description'].fillna('')
if not is_in_cache('text_wordbatch'):
    print_step('Text Wordbatch 2/5')
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2 ** 28,
                                 "norm": "l2",
                                 "tf": 1.0,
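# `normalize_text` is defined outside this excerpt. A hedged sketch of the
# typical wordbatch preprocessor it likely resembles (lowercase, strip
# non-alphanumerics while keeping Cyrillic, drop single-character tokens) --
# an assumption, not the original function:
import re

non_alphanums = re.compile(u'[^A-Za-zа-яА-Я0-9]+')

def normalize_text(text):
    tokens = non_alphanums.sub(' ', text).lower().strip().split()
    return ' '.join(t for t in tokens if len(t) > 1)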
    del dpcnn_test
    del rnncnn_train
    del rnncnn_test
    del rnncnn2_train
    del rnncnn2_test
    del capsule_net_train
    del capsule_net_test
    del attention_train
    del attention_test
    del neptune_train
    del neptune_test
    gc.collect()
    print('Train shape: {}'.format(train_.shape))
    print('Test shape: {}'.format(test_.shape))
    print_step('Saving')
    save_in_cache('lvl2_all', train_, test_)
else:
    train_, test_ = load_cache('lvl2_all')

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)

print('~~~~~~~~~~~~~~~~~~~~')
print_step('Run Level 2 LGB')
print(train_.columns.values)
train, test = get_data()
train_, test_ = run_cv_model(label='lvl2_all_lgb',
                             data_key='lvl2_all',
                             model_fn=runLGB,
                             train=train,
print_step('Processing 4/9')
# Binarize the crowd-sourced scores at a 0.1 threshold, keeping the raw
# attack score in its own column first.
attack['attack_score'] = attack['attack']
attack['quoting_attack'] = attack['quoting_attack'].apply(
    lambda x: 1 if x > 0.1 else 0)
attack['recipient_attack'] = attack['recipient_attack'].apply(
    lambda x: 1 if x > 0.1 else 0)
attack['third_party_attack'] = attack['third_party_attack'].apply(
    lambda x: 1 if x > 0.1 else 0)
attack['other_attack'] = attack['other_attack'].apply(
    lambda x: 1 if x > 0.1 else 0)
attack['attack'] = attack['attack_score'].apply(lambda x: 1 if x > 0.1 else 0)
attack['comment_text'] = attack['comment']
attack.drop('comment', axis=1, inplace=True)
print_step('Processing 5/9')
save_in_cache('extra_data_attack', attack, test)
print_step('Processing 6/9')
# Average the per-worker toxicity annotations per revision.
toxic = toxic.drop('worker_id', axis=1).groupby('rev_id').mean().reset_index()
print_step('Processing 7/9')
toxic = toxic_comments[['rev_id', 'comment']].merge(
    toxic, on='rev_id').drop('rev_id', axis=1)
print_step('Processing 8/9')
toxic['toxicity_label'] = toxic['toxicity'].apply(lambda x: 1 if x > 0.1 else 0)
toxic['comment_text'] = toxic['comment']
toxic.drop('comment', axis=1, inplace=True)
print_step('Processing 9/9')
pool.restart()

print_step('Merging 2/5')
train_dfs = map(lambda x: x[0], dfs)
test_dfs = map(lambda x: x[1], dfs)
print_step('Merging 3/5')
train_df = pd.concat(train_dfs)
test_df = pd.concat(test_dfs)
print_step('Merging 4/5')
train_lasso = train.merge(train_df, on='item_id')
print_step('Merging 5/5')
test_lasso = test.merge(test_df, on='item_id')
print_step('RMSE: ' + str(
    rmse(train_lasso['deal_probability'], train_lasso['cat_ridge'])))
import pdb
pdb.set_trace()

print('~~~~~~~~~~')
print_step('Cache')
save_in_cache('pc_lasso_l3',
              pd.DataFrame({'pc_lasso_l3': train_lasso['cat_ridge']}),
              pd.DataFrame({'pc_lasso_l3': test_lasso['cat_ridge']}))

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.DataFrame()
submission['item_id'] = test_id
submission['deal_probability'] = test_lasso['cat_ridge'].clip(0.0, 1.0)
submission.to_csv('submit/submit_pc_lasso_l3_blender.csv', index=False)
print_step('Done!')
gp = all_periods.groupby(['user_id'])[['days_up_sum', 'times_put_up']].agg(
    ['min', 'max', 'mean'])
# Flatten the MultiIndex columns produced by the multi-stat agg.
gp = pd.DataFrame(gp.to_records())
gp.columns = [
    'user_id', 'days_up_sum_min', 'days_up_sum_max', 'days_up_sum_mean',
    'times_put_up_min', 'times_put_up_max', 'times_put_up_mean'
]
print_step('Grouping 4/4 1/3')
n_user_items = all_samples.groupby(['user_id'])[['item_id']] \
    .count().reset_index() \
    .rename(index=str, columns={'item_id': 'n_user_items'})
print_step('Grouping 4/4 2/3')
gp = gp.merge(n_user_items, on='user_id', how='outer')
print_step('Grouping 4/4 3/3')
gp.fillna(0, inplace=True)

print('~~~~~~~~~~~~~~~~~~~~')
print_step('Merging 2/2 1/4')
train = train.merge(gp, on='user_id', how='left')
print_step('Merging 2/2 2/4')
test = test.merge(gp, on='user_id', how='left')
print_step('Merging 2/2 3/4')
# Keep only the aggregate columns for the cache.
train = train[gp.columns.values]
print_step('Merging 2/2 4/4')
test = test[gp.columns.values]

print('~~~~~~~~~~~~')
print_step('Caching')
save_in_cache('active_feats', train, test)
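# A toy illustration (not from the pipeline) of why the columns are renamed
# by hand above: a multi-stat agg yields MultiIndex columns, and `to_records`
# flattens them into awkward stringified tuples.
import pandas as pd

df = pd.DataFrame({'user_id': [1, 1, 2], 'days_up_sum': [3, 5, 7]})
g = df.groupby('user_id')[['days_up_sum']].agg(['min', 'max', 'mean'])
# g.columns: MultiIndex([('days_up_sum', 'min'), ('days_up_sum', 'max'), ...])
flat = pd.DataFrame(g.to_records())
# flat.columns: ['user_id', "('days_up_sum', 'min')", ...] -- hence the manual rename.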