def train_card_merchant_embeddings(df, path, iter=1):
    """Train a card/merchant embedding regressor on `df` and return the learner.

    Resumes from a saved 'embedding_model' checkpoint when one exists, runs
    `iter` additional single-epoch passes, and re-saves after each pass.
    """
    df['merchant_id'] = df['merchant_id'].astype('category').cat.as_ordered()
    val_idx = get_validation_index(df, frac=0.2, random=True)
    x, y, nas = proc_df(df, 'percent', do_scale=False)
    cat_vars = ['card_id', 'merchant_id']
    model_data = ColumnarModelData.from_data_frame(
        path, val_idx, x, y.astype(np.float32), cat_flds=cat_vars,
        is_reg=True, is_multi=False, bs=128, test_df=None)
    emb_szs = get_embedding_sizes(cat_vars, df)
    # All columns are categorical here: zero continuous inputs.
    learner = model_data.get_learner(
        emb_szs, 0, 0.04, 1, [1000, 500], [0.001, 0.01], y_range=(0.0, 1.0))
    # lr_find(learner)
    # Load model, train, save
    try:
        learner.load('embedding_model')
    except FileNotFoundError:
        pass
    for i in range(iter):
        print(f'training iter {i}')
        learner.fit(1e-4, 1)
        learner.save('embedding_model')
    return learner
def train_card_embeddings(df, iter=1):
    """Fit the card-level embedding model on the engineered feature frame.

    Uses the module-level `cat_vars`/`cont_vars` column lists and `PATH`/
    `MODEL` constants; checkpoints to `MODEL` after every training pass.
    """
    # We may use a smaller set of data to get a sense of the performance of
    # the model, comment out before final training
    # df = df.sample(frac=0.1)
    df = df[cat_vars + cont_vars + ['purchase_amount']]
    # Trim extreme rows before scaling the regression target.
    keep = ((df.purchase_amount < 5)
            & (df.avg_purchases_lag12 < 5)
            & (df.avg_sales_lag12 < 5))
    df = df[keep]
    df.reset_index(inplace=True, drop=True)
    val_idx = get_validation_index(df, frac=0.25, random=False)
    # make cat_vars, but card_id needs special treatment
    cats_without_card = cat_vars.copy()
    cats_without_card.remove('card_id')
    transform_columns(df, cats_without_card, cont_vars)
    x, y, nas, mapper = proc_df(df, 'purchase_amount', do_scale=True)
    model_data = ColumnarModelData.from_data_frame(
        PATH, val_idx, x, y.astype(np.float32), cat_flds=cat_vars,
        is_reg=True, is_multi=False, bs=128, test_df=None)
    emb_szs = get_embedding_sizes(cat_vars, df)
    learner = model_data.get_learner(
        emb_szs, len(x.columns) - len(cat_vars), 0.5, 1,
        [50, 7], [0.5, 0.5], y_range=(-1.0, 0.0))
    # Load model, train, save
    try:
        learner.load(MODEL)
    except FileNotFoundError:
        pass
    for i in range(iter):
        print(f'training iter {i}')
        learner.fit(1e-2, 1)
        learner.save(MODEL)
    return learner
def train_with_card_embedding(debug):
    """Train the target regressor, seeding the card embedding layer with
    weights from a pre-trained card/merchant model and freezing them.
    """
    c_m_df, train, test = load_all_category(PATH, debug)
    # 0 extra iterations: just build/load the pre-trained embedding learner.
    card_learner = train_card_merchant_embeddings(c_m_df, PATH, 0)
    train_cat_flds = ['card_id', 'first_active_month']
    set_common_categorical([train, test], 'first_active_month')
    test['target'] = 0
    for frame in (train, test):
        frame.replace([np.inf, -np.inf], np.nan, inplace=True)
    train.reset_index(inplace=True, drop=True)
    train_x, train_y, nas, mapper = proc_df(train, 'target', do_scale=True)
    # Reuse the train-set scaler/NA mapping so test is transformed identically.
    test_x, _, nas, mapper = proc_df(
        test, 'target', do_scale=True, mapper=mapper, na_dict=nas)
    train_val_idx = get_validation_index(train, frac=0.25)
    model_data = ColumnarModelData.from_data_frame(
        PATH, train_val_idx, train_x, train_y.astype(np.float32),
        cat_flds=train_cat_flds, is_reg=True, bs=128, test_df=test_x)
    emb_szs = get_embedding_sizes(train_cat_flds, train)
    learner = model_data.get_learner(
        emb_szs, len(train_x.columns) - len(train_cat_flds), 0.5, 1,
        [20, 5], [0.5, 0.5], y_range=(-35.0, 20.0))
    # learner.lr_find()
    # learner.sched.plot(100)
    # Transplant the pre-trained card embedding and freeze it for training.
    learner.model.embs[0].weight = Parameter(
        card_learner.model.embs[0].weight.data.clone())
    learner.model.embs[0].weight.requires_grad = False
    learner.fit(1e-3, 10)
    predict_and_save(learner, test, 'base')
    print('done')
    return
def mutate(self, info, users, books, ratings):
    """Retrain the embedding-dot recommender from parallel rating lists.

    `users`, `books` and `ratings` must all have equal length and at least
    10 entries; otherwise the mutation is rejected with Retrain(ok=False).
    On success, saves the trained weights and the user/book counts to disk
    and returns Retrain(ok=True).
    """
    # Reject ragged input (previously checked via a throwaway set of lengths).
    if not (len(users) == len(books) == len(ratings)):
        return Retrain(ok=False)
    if len(users) < 10:
        return Retrain(ok=False)
    data = pd.DataFrame.from_dict(
        {'userID': users, 'bookID': books, 'rating': ratings})
    # Re-index raw IDs to contiguous 0..n-1 integers for the embedding tables.
    # Series.map(dict) replaces apply(lambda): same result, vectorized lookup.
    user2idx = {o: i for i, o in enumerate(data.userID.unique())}
    data.userID = data.userID.map(user2idx)
    book2idx = {o: i for i, o in enumerate(data.bookID.unique())}
    data.bookID = data.bookID.map(book2idx)
    n_users = int(data.userID.nunique())
    n_books = int(data.bookID.nunique())
    X = data.drop(['rating'], axis=1)
    y = data['rating'].astype(np.float32)
    val_idxs = get_cv_idxs(len(data))
    model_data = ColumnarModelData.from_data_frame(
        path, val_idxs, X, y, ['userID', 'bookID'], 64)
    N_FACTORS = 50
    WD = 1e-5
    model = EmbeddingDot(n_users, n_books, N_FACTORS)
    opt = optim.SGD(
        model.parameters(), 1e-1, weight_decay=WD, momentum=0.9)
    fit(model, model_data, 20, opt, F.mse_loss)
    # Second pass at a lower learning rate for fine tuning.
    set_lrs(opt, 0.01)
    fit(model, model_data, 20, opt, F.mse_loss)
    torch.save(model.state_dict(), 'bookweb-embed-dot.pth')
    # Persist the entity counts needed to rebuild the model at serving time.
    with open('model-params.conf', 'w') as conf_file:
        conf_file.write(f'{n_users}\n{n_books}\n')
    return Retrain(ok=True)
def calculateTorchEmbeddingMatrix(emb_size, embedding_names, df, batch_size):
    """Train a small net with a learned weekday embedding and return the
    embedding matrix as a DataFrame (one row per weekday, one `weekday`
    index column, embedding dims named by `embedding_names`).

    Fix: `batch_size` was accepted but silently ignored (a hard-coded bs
    of 2 was used); it is now passed through to the model data.
    """
    n_days = 7
    val_idx = get_cv_idxs(len(df))
    data = ColumnarModelData.from_data_frame(
        '', val_idx, df[['weekday']], df['scaled_users'], ['weekday'],
        batch_size)

    def get_emb(num_cat, num_emb):
        # Embedding initialised with small uniform weights.
        e = nn.Embedding(num_cat, num_emb)
        e.weight.data.uniform_(-0.01, 0.01)
        return e

    class weekdayEmbedding(nn.Module):
        def __init__(self, n_days):
            super().__init__()
            self.weekdays = get_emb(n_days, emb_size)
            self.lin1 = nn.Linear(emb_size, 40)
            self.lin2 = nn.Linear(40, 10)
            self.lin3 = nn.Linear(10, 1)
            # self.drop1 = nn.Dropout(0.5)

        def forward(self, cats, conts):
            weekdays = cats[:, 0]
            x = self.weekdays(weekdays)
            x = F.relu(self.lin1(x))
            x = F.relu(self.lin2(x))
            return self.lin3(x)

    model = weekdayEmbedding(n_days).cuda()
    opt = optim.Adam(model.parameters(), 1e-3)
    fit(model, data, 30, opt, F.mse_loss)
    fit(model, data, 30, opt, F.mse_loss)
    # Extract the learned embedding table for downstream use.
    emb_matrix = model.weekdays.weight.data.cpu().numpy()
    emb_df = pd.DataFrame(emb_matrix, columns=embedding_names)
    emb_df['weekday'] = np.arange(n_days)
    # list(model.parameters())
    return emb_df
def calculateTorchManualEmbeddingMatrix(emb_size, embedding_names, df, batch_size):
    """Learn a weekday embedding via a Linear layer over one-hot inputs and
    return the (transposed) weight matrix as a DataFrame, one row per weekday.

    Fixes: `batch_size` was accepted but silently ignored (a hard-coded bs
    of 2 was used), and an unused `cols` local was computed.
    """
    n_days = 7
    val_idx = get_cv_idxs(len(df))
    dummy_X = pd.get_dummies(df['weekday_name'])
    # One-hot columns are fed as continuous features; no categorical fields.
    data = ColumnarModelData.from_data_frame(
        '', val_idx, dummy_X, df['scaled_users'], [], batch_size)

    class weekdayEmbeddingManual(nn.Module):
        def __init__(self, n_days):
            super().__init__()
            # Linear over a one-hot vector acts as a (biased) embedding lookup.
            self.emb = nn.Linear(n_days, emb_size)
            self.lin1 = nn.Linear(emb_size, 40)
            self.lin2 = nn.Linear(40, 10)
            self.lin3 = nn.Linear(10, 1)
            # self.drop1 = nn.Dropout(0.5)

        def forward(self, cats, conts):
            x = self.emb(conts)
            x = F.relu(self.lin1(x))
            x = F.relu(self.lin2(x))
            return self.lin3(x)

    model = weekdayEmbeddingManual(n_days).cuda()
    opt = optim.Adam(model.parameters(), 1e-3)
    fit(model, data, 30, opt, F.mse_loss)
    fit(model, data, 30, opt, F.mse_loss)
    # Transpose so rows correspond to weekdays, columns to embedding dims.
    emb_matrix = np.transpose(model.emb.weight.data.cpu().numpy())
    emb_df = pd.DataFrame(emb_matrix, columns=embedding_names)
    emb_df['weekday'] = np.arange(n_days)
    # list(model.parameters())
    return emb_df
def kfold_fc(train_df, test_df, num_folds, params, path, label_col, target_col,
             feats_excluded=None, out_cols=None, stratified=False,
             cat_cols=None, name=None):
    """K-fold training of a fully connected entity-embedding regressor.

    Trains one model per fold, averages the per-fold test predictions into
    `test_df[target_col]`, and writes the submission to `{path}/fc_pred.csv`.
    Hyperparameters come from `params` (lr, epochs, layers, emb_drop, out_sz,
    layers_drop, metrics, binary, lr_find, early_stopping).

    Fix: `cat_cols` previously used a mutable default argument (`[]`), which
    is shared across calls; it now defaults to None.
    """
    if cat_cols is None:
        cat_cols = []
    print("Starting FC. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    train_df[target_col] = train_df[target_col].astype(float)
    # Cross validation model
    if stratified:
        kf = StratifiedKFold(n_splits=num_folds, shuffle=True,
                             random_state=326)
    else:
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=326)
    # Create arrays and dataframes to store results
    test_df[target_col] = 0
    if feats_excluded is None:
        feats_excluded = [label_col, target_col]
    feat_cols = [f for f in train_df.columns if f not in feats_excluded]
    if out_cols is None:
        out_cols = [label_col, target_col]
    print(f'features {feat_cols}')
    train_x, train_y, nas, mapper = proc_df(
        train_df, target_col, do_scale=True, skip_flds=[label_col])
    # Reuse the train scaler/NA mapping so test is transformed identically.
    test_x, _, nas, mapper = proc_df(
        test_df, target_col, do_scale=True, mapper=mapper, na_dict=nas,
        skip_flds=[label_col])
    embedding_sizes = get_embedding_sizes(cat_cols, train_df)
    # Total width of the concatenated embedding outputs.
    embedding_inputs = sum(embs[1] for embs in embedding_sizes)
    # Default hidden size: cube root of the total input width, floor of 2.
    default_layer_size = max(
        2,
        int((embedding_inputs + len(train_x.columns) - len(cat_cols))**(1 / 3)))
    y_range = [train_df[target_col].min(), train_df[target_col].max()]
    lr = params.get('lr', 1e-3)
    train_metrics = None
    param_metrics = params.get('metrics')
    if param_metrics is not None:
        train_metrics = [metrics_map[metric] for metric in param_metrics]
    for fold, (train_idx, valid_idx) in enumerate(
            kf.split(train_df[feat_cols], train_df[target_col])):
        print("Fold {}".format(fold))
        model_name = f'{name}-{fold}'
        md = ColumnarModelData.from_data_frame(
            path, valid_idx, train_x, train_y.astype(np.float32),
            cat_flds=cat_cols, is_reg=True, bs=128, test_df=test_x)
        learner = md.get_learner(
            embedding_sizes, len(train_x.columns) - len(cat_cols),
            params.get('emb_drop', 0.1), params.get('out_sz', 1),
            params.get('layers', [default_layer_size**2, default_layer_size]),
            params.get('layers_drop'), metrics=train_metrics, y_range=y_range)
        if fold == 0 and params.get('lr_find'):
            plt.figure(figsize=(8, 10))
            learner.lr_find()
            learner.sched.plot(100)
            plt.savefig('fc_lr_find.png')
        callback = SaveBestModel(learner, lr, model_name,
                                 params.get('early_stopping', 0))
        if params.get('binary', False):
            learner.crit = F.binary_cross_entropy
        if name is not None:
            # Resume from an earlier run's checkpoint when present.
            try:
                learner.load(model_name)
            except FileNotFoundError:
                pass
        learner.fit(lr, params.get('epochs', 20), callbacks=[callback])
        # load the best model
        print(
            f'Best epoch is {callback.best_epoch} loss {callback.best_loss} metric {callback.best_metric}'
        )
        learner.load(model_name)
        # Average each fold's test prediction into the final estimate.
        test_df.loc[:, target_col] += (
            learner.predict(is_test=True).reshape(len(test_df)) / kf.n_splits)
    # save submission file
    test_df.reset_index(inplace=True)
    test_df[out_cols].to_csv(f'{path}/fc_pred.csv', index=False)
def main():
    """Train a Titanic survival regressor with entity embeddings.

    Builds a leakage-safe FamilySurvived feature, encodes categoricals,
    scales continuous features, and fits a small embedding net for 200
    epochs. Idiom fix: the single-member-family exclusion set was built as
    `set([...])` over a throwaway list; now a set comprehension.
    """
    tables = get_tables(PATH, ['train', 'test'])
    train, test = tables
    val_idx = train.sample(frac=0.5).index
    family_survived = train[['LastName', 'Survived']].groupby('LastName').sum()
    add_family_survived(family_survived, test)
    # We can't train using the same family survived info in the training set
    # because we would be cheating by training on the result itself.
    family_count = train[['LastName', 'Survived']].groupby('LastName').count()
    remove_names = list(family_count[family_count['Survived'] == 1].index)
    remove_names_tuple = {(x, ) for x in remove_names}
    train_index = train[~train[['LastName']].apply(tuple, 1).
                        isin(remove_names_tuple)].index
    add_family_survived_self(train, train_index, val_idx)
    # Not using last name or cabin directly right now - cardinality is too high
    cat_vars = ['Pclass', 'Sex', 'Embarked', 'Title', 'FamilySurvived']
    cont_vars = ['Age', 'SibSp', 'Parch', 'Fare']
    for v in cat_vars:
        train[v] = train[v].astype('category').cat.as_ordered()
    apply_cats(test, train)
    test['Survived'] = 0
    train = train[cat_vars + cont_vars + ['Survived']]
    test = test[cat_vars + cont_vars + ['Survived', 'PassengerId']]
    df, y, nas, mapper = proc_df(train, 'Survived', do_scale=True)
    # Reuse the train scaler/NA mapping for the test transform.
    df_test, _, nas, mapper = proc_df(
        test, 'Survived', do_scale=True, skip_flds=['PassengerId'],
        mapper=mapper, na_dict=nas)
    md = ColumnarModelData.from_data_frame(
        PATH, val_idx, df, y.astype(np.float32), cat_flds=cat_vars,
        is_reg=True, is_multi=False, bs=128, test_df=df_test)
    embedding_sizes = get_embedding_sizes(cat_vars, train)
    model = md.get_learner(embedding_sizes, len(df.columns) - len(cat_vars),
                           0.5, 1, [10, 5], [0.5, 0.5], y_range=(0, 1))
    model.summary()
    lr = 1e-3
    model.fit(lr, 200)
    # model.load('m-1')
    # model.fit(lr, 10)
    # predict_and_save(model, test, PATH, 'base')
    print('done')
def train_with_card_embedding_inline():
    """Train the regressor on features merged with pre-trained word2vec card
    embeddings, checkpointing and predicting every 20 epochs."""
    df = get_train_test_with_features()
    card_emb_df = load_word2vec_merchant_embeddings()
    df = df.merge(card_emb_df, on=['card_id'], how='left')
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    with timer("split train & test"):
        # Rows with a target belong to train; the rest form the test set.
        train = df[df['target'].notnull()]
        test = df[df['target'].isnull()]
        del df
        gc.collect()
    set_to_float32(train)
    set_to_float32(test)
    FEATS_EXCLUDED = [
        'first_active_month', 'card_id', 'outliers', 'hist_purchase_date_max',
        'hist_purchase_date_min', 'hist_card_id_size', 'new_purchase_date_max',
        'new_purchase_date_min', 'new_card_id_size', 'OOF_PRED', 'month_0'
    ]
    feats = [f for f in train.columns if f not in FEATS_EXCLUDED]
    train = train[feats].reset_index()
    test = test[feats].reset_index()
    # training and testing with the real train/test set
    train_cat_flds = []
    train_x, train_y, nas, mapper = proc_df(
        train, 'target', do_scale=True, skip_flds=['card_id'])
    # Apply the train-set scaler/NA mapping to the test frame.
    test_x, _, nas, mapper = proc_df(
        test, 'target', do_scale=True, mapper=mapper, na_dict=nas,
        skip_flds=['card_id'])
    train_val_idx = get_validation_index(train, frac=0.25)
    model_data = ColumnarModelData.from_data_frame(
        PATH, train_val_idx, train_x, train_y.astype(np.float32),
        cat_flds=train_cat_flds, is_reg=True, bs=128, test_df=test_x)
    emb_szs = get_embedding_sizes(train_cat_flds, train)
    learner = model_data.get_learner(
        emb_szs, len(train_x.columns) - len(train_cat_flds), 0.1, 1,
        [64, 8], [0.5, 0.5], y_range=(-35.0, 20.0))
    # learner.lr_find()
    # learner.sched.plot(100)
    try:
        learner.load('w2v_card_embedding')
    except FileNotFoundError:
        pass
    for i in range(10):
        learner.fit(1e-3, 20)
        learner.save(f'w2v_card_embedding_{i}')
        predict_and_save(learner, test, f'base_{i}')
    print('done')