def fit_and_predict(self, X_train, X_test, y_train):
    """Train a LightGBM multiclass model with CV and predict top-2 labels.

    Parameters
    ----------
    X_train, X_test : pd.DataFrame
        Feature matrices for training and inference.
    y_train : pd.Series or pd.DataFrame
        Training targets (class labels).

    Returns
    -------
    (np.ndarray, float)
        ``pred_labels`` – for each test row the indices of the two classes
        with the highest averaged predicted probability (ascending order,
        best class last) – and the mean per-fold top-2 accuracy.

    Raises
    ------
    ValueError
        If ``self.cv`` names an unsupported CV strategy.
    """
    if self.cv == "mcs":
        folds = MCSKFold(n_splits=5, shuffle_mc=True, max_iter=100)
    else:
        # Fail fast: previously an unknown strategy left `folds` unassigned
        # and the loop below crashed with an unrelated NameError.
        raise ValueError(f"unsupported cv strategy: {self.cv!r}")

    oof = np.zeros((len(X_train), NUM_CLASS))
    predictions = np.zeros((len(X_test), NUM_CLASS))
    feature_importance_df = pd.DataFrame()
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(
            folds.split(df=y_train, target_cols=["target"])):
        self.logger.debug("-" * 100)
        self.logger.debug(f"Fold {fold+1}")
        train_data = lgb.Dataset(X_train.iloc[train_idx],
                                 label=y_train.iloc[train_idx])
        val_data = lgb.Dataset(X_train.iloc[val_idx],
                               label=y_train.iloc[val_idx])
        callbacks = [log_evaluation(self.logger, period=100)]
        clf = lgb.train(self.params,
                        train_data,
                        valid_sets=[train_data, val_data],
                        verbose_eval=100,
                        early_stopping_rounds=100,
                        callbacks=callbacks,
                        feval=eval_func)

        # Out-of-fold class probabilities at the best iteration.
        oof[val_idx, :] = clf.predict(X_train.iloc[val_idx].values,
                                      num_iteration=clf.best_iteration)
        fold_score = top2accuracy(oof[val_idx, :],
                                  y_train.iloc[val_idx].values)
        fold_scores.append(fold_score)

        # Collect per-fold gain importances for later aggregation.
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = X_train.columns.values
        fold_importance_df["importance"] = clf.feature_importance(
            importance_type="gain")
        fold_importance_df["fold"] = fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)

        # Average the test predictions over folds.
        predictions += clf.predict(
            X_test, num_iteration=clf.best_iteration) / folds.n_splits

    # Two highest-probability classes per test row (ascending, best last).
    pred_labels = np.argsort(predictions, axis=1)[:, -2:]

    feature_importance_df = feature_importance_df[[
        "feature", "importance"
    ]].groupby("feature").mean().sort_values(by="importance",
                                             ascending=False).head(50)
    self.logger.debug("##### feature importance #####")
    self.logger.debug(feature_importance_df)

    cv_score_fold_mean = sum(fold_scores) / len(fold_scores)
    self.logger.debug(f"cv_score_fold_mean: {cv_score_fold_mean}")
    return pred_labels, cv_score_fold_mean
####################
## Train model
####################
# 5-fold CV stratified on "district"; OOF predictions on the log target,
# test predictions averaged over folds.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(train_use))
predictions = np.zeros(len(test_use))
feature_importance_df = pd.DataFrame()
for fold, (train_idx, val_idx) in enumerate(
        folds.split(train_use, train_use["district"])):
    print(f"Fold {fold+1}")
    train_data = lgb.Dataset(train_use.iloc[train_idx],
                             label=target_log[train_idx],
                             categorical_feature=categorical_cols)
    val_data = lgb.Dataset(train_use.iloc[val_idx],
                           label=target_log[val_idx],
                           categorical_feature=categorical_cols)
    num_round = N_ROUNDS
    callbacks = [log_evaluation(logger, period=100)]
    clf = lgb.train(params,
                    train_data,
                    num_round,
                    valid_sets=[train_data, val_data],
                    verbose_eval=False,
                    early_stopping_rounds=100,
                    callbacks=callbacks)
    oof[val_idx] = clf.predict(train_use.values[val_idx],
                               num_iteration=clf.best_iteration)

    # Accumulate per-fold gain importances; aggregated AFTER the loop.
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = train_use.columns.values
    fold_importance_df["importance"] = clf.feature_importance(
        importance_type="gain")
    fold_importance_df["fold"] = fold + 1
    feature_importance_df = pd.concat(
        [feature_importance_df, fold_importance_df], axis=0)

    predictions += clf.predict(
        test_use, num_iteration=clf.best_iteration) / folds.n_splits

# Aggregate importances once, after all folds. Doing the groupby/head(50)
# reassignment inside the loop clobbered the accumulator (the "feature"
# column became the index), corrupting the next fold's concat.
feature_importance_df = feature_importance_df[[
    "feature", "importance"
]].groupby("feature").mean().sort_values(by="importance",
                                         ascending=False).head(50)
logger.debug("##### feature importance #####")
logger.debug(feature_importance_df)
# inverse log transformation
def train(args, train_loader, train_val_loader, val_loader, test_loader):
    """Train the attention network with checkpoint/resume support.

    Runs the full training loop: builds the model, optimizer and LR
    schedule, resumes from a checkpoint in ``args.logdir`` if one exists,
    trains for ``args.num_epochs * args.shrinkage`` epochs with optional
    gradient accumulation (``args.delayed_step``), evaluates on the
    validation set every epoch (and on train/test when a new best is hit,
    at the final epoch, or when ``args.test_all``), and logs everything to
    TensorBoard plus a stats DataFrame stored in the checkpoint.
    """
    seed(args.seed)
    job_id = os.environ.get('SLURM_JOB_ID', 'local')
    print('Starting run {} with:\n{}'.format(job_id, args))
    writer = SummaryWriter(args.logdir)
    # Column layout for the per-epoch stats table saved in the checkpoint.
    columns = ['epoch', 'eval_loss', 'eval_acc', 'eval_prec', 'eval_recall',
               'train_loss', 'train_acc', 'train_prec', 'train_recall',
               'test_loss', 'test_acc', 'test_prec', 'test_recall']
    stats_csv = pd.DataFrame(columns=columns)
    model = Network(
        k=args.network_k,
        att_type=args.network_att_type,
        kernel3=args.kernel3,
        width=args.network_width,
        dropout=args.network_dropout,
        compensate=True,
        norm=args.norm,
        inp_channels=args.input_channels)
    print(model)
    # Epoch count and LR milestones are both scaled by the shrinkage factor.
    epochs = args.num_epochs * args.shrinkage
    milestones = np.array([80, 120, 160])
    milestones *= args.shrinkage
    milestones = list(milestones)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Keep a handle to the unwrapped model: state_dict/optimizer use it so
    # checkpoints are not nested under DataParallel's "module." prefix.
    raw_model = model
    if torch.cuda.device_count() > 1:
        print('using multiple gpus')
        model = torch.nn.DataParallel(model)
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    print(criterion)
    # NOTE(review): clip_grad_value_ is called ONCE here, before any
    # backward pass, so no gradients exist yet — as written this does not
    # clip anything during training. Per-step clipping would have to be
    # called after each backward() and before optimizer.step(). Confirm
    # whether clipping was intended.
    nn.utils.clip_grad_value_(raw_model.parameters(), 5.)
    if args.opt == 'rmsprop':
        optimizer = torch.optim.RMSprop(raw_model.parameters(), lr=args.lr,
                                        eps=1e-5, weight_decay=args.l2)
    elif args.opt == 'momentum':
        optimizer = torch.optim.SGD(raw_model.parameters(), lr=args.lr,
                                    momentum=0.9, weight_decay=args.l2)
    elif args.opt == 'adam':
        optimizer = torch.optim.Adam(raw_model.parameters(), lr=args.lr,
                                     eps=1e-5, weight_decay=args.l2)
    lr_schedule = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                       milestones=milestones)
    # Checkpoint state: deep copies so later in-place updates to the live
    # model/optimizer don't mutate what has already been saved.
    state = {
        'epoch': 0,
        'step': 0,
        'state_dict': copy.deepcopy(raw_model.state_dict()),
        'optimizer': copy.deepcopy(optimizer.state_dict()),
        'lr_schedule': copy.deepcopy(lr_schedule.state_dict()),
        'best_acc': None,
        'best_epoch': 0,
        'is_best': False,
        'stats_csv': stats_csv,
        'config': vars(args)
    }
    # Resume if a checkpoint exists in the log directory.
    if load_checkpoint(args.logdir, state):
        raw_model.load_state_dict(state['state_dict'])
        optimizer.load_state_dict(state['optimizer'])
        lr_schedule.load_state_dict(state['lr_schedule'])
        stats_csv = state['stats_csv']
    save_checkpoint(args.logdir, state)
    writer.add_text('args/str', str(args), state['epoch'])
    writer.add_text('job_id/str', job_id, state['epoch'])
    writer.add_text('model/str', str(model), state['epoch'])
    # Train the model
    for epoch in range(state['epoch'], epochs):
        # NOTE(review): scheduler stepped at the START of the epoch, before
        # any optimizer.step(); modern PyTorch expects scheduler.step()
        # after optimizer.step() — verify the intended LR timing.
        lr_schedule.step()
        model.train()
        # Running stats for the current accumulation window.
        losses = []
        tps = []
        tns = []
        fps = []
        fns = []
        batch_labels = []
        delayed = 0
        writer.add_scalar('stats/lr', optimizer.param_groups[0]['lr'],
                          epoch + 1)
        with tqdm(train_loader,
                  desc="Epoch [{}/{}]".format(epoch+1, epochs)) as pbar:
            for images, labels in pbar:
                batch_labels += list(labels)
                if torch.cuda.is_available():
                    if torch.cuda.device_count() == 1:
                        images = images.cuda()
                        # NOTE(review): in the multi-GPU (DataParallel)
                        # branch neither tensor is moved here — presumably
                        # DataParallel scatters `images`, but `labels` stay
                        # on CPU for the loss; confirm this is intended.
                        labels = labels.cuda()
                # Forward pass
                outputs, att = model(images)
                loss = criterion(outputs, labels)
                predicted = torch.argmax(outputs.data, 1)
                TP, TN, FP, FN = pred_stats(predicted, labels)
                cpu_loss = loss.mean().cpu().item()
                losses += [cpu_loss]
                tps += [TP]
                tns += [TN]
                fps += [FP]
                fns += [FN]
                # Backward and optimize
                delayed += 1
                if args.delayed_step > 0:
                    # Scale the loss so accumulated gradients average
                    # rather than sum over the accumulation window.
                    (loss / args.delayed_step).backward()
                else:
                    loss.backward()
                # NOTE(review): `delayed` was already incremented above, so
                # `(delayed + 1) % args.delayed_step == 0` fires one batch
                # early on the first window (e.g. after 1 batch when
                # delayed_step == 2) — likely meant `delayed % ...`.
                if args.delayed_step == 0 or \
                        (delayed + 1) % args.delayed_step == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    precision, recall, accuracy = precision_recall_accuracy(
                        np.sum(tps), np.sum(tns), np.sum(fps), np.sum(fns))
                    writer.add_scalar('train/loss', np.mean(losses),
                                      state['step'])
                    writer.add_scalar('train/precision', precision,
                                      state['step'])
                    writer.add_scalar('train/recall', recall, state['step'])
                    writer.add_scalar('train/accuracy', accuracy,
                                      state['step'])
                    writer.add_scalar('train/labels', np.mean(batch_labels),
                                      state['step'])
                    state['step'] += 1
                    # Reset the accumulation window and its stats.
                    delayed = 0
                    losses = []
                    tps = []
                    tns = []
                    fps = []
                    fns = []
                    batch_labels = []
                pbar.set_postfix(loss=cpu_loss)
        # step last backward if the step isn't done yet because of an 'incomplete'
        # delayed / accumulated batch
        if delayed > 0:
            optimizer.step()
            optimizer.zero_grad()
            precision, recall, accuracy = precision_recall_accuracy(
                np.sum(tps), np.sum(tns), np.sum(fps), np.sum(fns))
            writer.add_scalar('train/loss', np.mean(losses), state['step'])
            writer.add_scalar('train/precision', precision, state['step'])
            writer.add_scalar('train/recall', recall, state['step'])
            writer.add_scalar('train/accuracy', accuracy, state['step'])
            writer.add_scalar('train/labels', np.mean(batch_labels),
                              state['step'])
            state['step'] += 1
        # Snapshot everything needed to resume from the NEXT epoch.
        state['epoch'] = epoch + 1
        state['state_dict'] = copy.deepcopy(raw_model.state_dict())
        state['optimizer'] = copy.deepcopy(optimizer.state_dict())
        state['lr_schedule'] = copy.deepcopy(lr_schedule.state_dict())
        if args.opt == 'rmsprop':
            # Diagnostic logging of RMSprop's second-moment estimates.
            rms_m2 = get_rmsprop_m2(model, optimizer)
            writer.add_scalar('train/rmsprop_m2_min', rms_m2.min(),
                              state['epoch'])
            writer.add_scalar('train/rmsprop_m2_mean', rms_m2.mean(),
                              state['epoch'])
            writer.add_scalar('train/rmsprop_m2_max', rms_m2.max(),
                              state['epoch'])
            writer.add_histogram('train/rmsprop_m2', rms_m2, state['epoch'])
        val_stats = evaluate(model, criterion, val_loader)
        log_evaluation(state['epoch'], val_stats, writer, 'eval')
        # Track the best validation accuracy seen so far.
        if state['best_acc'] is None or \
                state['best_acc'] < val_stats['accuracy']:
            state['is_best'] = True
            state['best_acc'] = val_stats['accuracy']
            state['best_epoch'] = state['epoch']
        else:
            state['is_best'] = False
        # Full train/test evaluation only on a new best, the final epoch,
        # or when explicitly requested — it is expensive.
        if (state['is_best'] or state['epoch'] >= epochs or args.test_all):
            train_stats = evaluate(model, criterion, train_val_loader)
            log_evaluation(state['epoch'], train_stats, writer, 'train_eval')
            test_stats = evaluate(model, criterion, test_loader)
            log_evaluation(state['epoch'], test_stats, writer, 'test')
            stats_csv.loc[len(stats_csv)] = [
                state['epoch'],
                val_stats['loss'], val_stats['accuracy'],
                val_stats['precision'], val_stats['recall'],
                train_stats['loss'], train_stats['accuracy'],
                train_stats['precision'], train_stats['recall'],
                test_stats['loss'], test_stats['accuracy'],
                test_stats['precision'], test_stats['recall']]
        else:
            # No train/test eval this epoch: pad those columns with NaN.
            stats_csv.loc[len(stats_csv)] = [
                state['epoch'],
                val_stats['loss'], val_stats['accuracy'],
                val_stats['precision'], val_stats['recall'],
                np.nan, np.nan, np.nan, np.nan,
                np.nan, np.nan, np.nan, np.nan]
        save_checkpoint(args.logdir, state)
    # Mark the run complete so a restart doesn't resume training.
    writer.add_text('done/str', 'true', state['epoch'])
    print('done - stopping now')
    writer.close()
def fit_and_predict(self, X_train, X_test, y_train, groups):
    """Train a LightGBM regressor with CV and predict on the test set.

    The target is assumed to be log1p-transformed
    (``y_train["Global_Sales_log1p"]``); fold scores are RMSLE on the
    expm1-inverted values and test predictions are inverse-transformed
    before fold-averaging.

    Parameters
    ----------
    X_train, X_test : pd.DataFrame
        Feature matrices.
    y_train : pd.DataFrame
        Log1p-transformed target.
    groups : array-like
        Group labels, used only when ``self.cv == "group"``.

    Returns
    -------
    (np.ndarray, float)
        Fold-averaged test predictions (original scale) and the mean
        per-fold RMSLE.

    Raises
    ------
    ValueError
        If ``self.cv`` names an unsupported CV strategy.
    """
    # Build both the splitter AND its split iterator per strategy: each
    # splitter takes different arguments. Previously the loop always
    # called folds.split(X_train, y_to_stratify), which raised a
    # NameError for cv in {"mcs", "group"} since y_to_stratify only
    # exists in the "stratified" branch.
    if self.cv == "mcs":
        folds = MCSKFold(n_splits=5, shuffle_mc=True, max_iter=100)
        split_iter = folds.split(df=y_train,
                                 target_cols=list(y_train.columns))
    elif self.cv == "group":
        folds = GroupKFold(n_splits=10)
        split_iter = folds.split(X_train, groups=groups)
    elif self.cv == "stratified":
        folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        # Bin the continuous target so stratification is possible.
        y_to_stratify = pd.cut(y_train["Global_Sales_log1p"],
                               bins=7, labels=False)
        split_iter = folds.split(X_train, y_to_stratify)
    else:
        raise ValueError(f"unsupported cv strategy: {self.cv!r}")

    oof = np.zeros(len(X_train))
    predictions = np.zeros(len(X_test))
    feature_importance_df = pd.DataFrame()
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(split_iter):
        self.logger.debug("-" * 100)
        self.logger.debug(f"Fold {fold+1}")
        train_data = lgb.Dataset(X_train.iloc[train_idx],
                                 label=y_train.iloc[train_idx])
        val_data = lgb.Dataset(X_train.iloc[val_idx],
                               label=y_train.iloc[val_idx])
        callbacks = [log_evaluation(self.logger, period=100)]
        clf = lgb.train(self.params,
                        train_data,
                        valid_sets=[train_data, val_data],
                        verbose_eval=100,
                        early_stopping_rounds=100,
                        callbacks=callbacks)

        # Out-of-fold predictions (log scale) at the best iteration.
        oof[val_idx] = clf.predict(X_train.iloc[val_idx].values,
                                   num_iteration=clf.best_iteration)
        # RMSLE on the original scale == RMSE on the log1p scale.
        fold_score = mean_squared_log_error(
            np.expm1(y_train.iloc[val_idx].values),
            np.expm1(oof[val_idx]))**.5
        fold_scores.append(fold_score)

        # Collect per-fold gain importances for later aggregation.
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = X_train.columns.values
        fold_importance_df["importance"] = clf.feature_importance(
            importance_type="gain")
        fold_importance_df["fold"] = fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)

        # Invert the log transform, then fold-average the test predictions.
        predictions += np.expm1(
            clf.predict(X_test, num_iteration=clf.best_iteration)
        ) / folds.n_splits

    _feature_importance_df = feature_importance_df[[
        "feature", "importance"
    ]].groupby("feature").mean().sort_values(by="importance",
                                             ascending=False)
    self.logger.debug("##### feature importance #####")
    self.logger.debug(_feature_importance_df.head(50))

    cv_score_fold_mean = sum(fold_scores) / len(fold_scores)
    self.logger.debug(f"cv_score_fold_mean: {cv_score_fold_mean}")
    return predictions, cv_score_fold_mean