def train(self, train_df, target_df):
    """Train one model per fold and collect out-of-fold predictions.

    For each column of ``self.fold_df``, rows where the column equals 0 form
    the training split and rows where it is greater than 0 form the
    validation split.  A fresh model, loss, optimizer and scheduler are built
    through ``factory`` for every fold.  The epoch with the highest
    validation score provides the fold's out-of-fold predictions and the
    weights written to ``../logs/{self.run_name}/weight_best_{fold}.pt``.

    Args:
        train_df: Frame that is boolean-masked with the ``self.fold_df``
            columns.  Its index labels are used directly as row positions
            into the out-of-fold array (assumes a 0..n-1 index -- TODO
            confirm against the caller).
        target_df: Targets masked the same way as ``train_df``; ``.values``
            of the validation part is passed to the metric.

    Returns:
        The sum over folds of ``best_val_score * self.fold_df[col].max()``.

    Side effects:
        Stores the out-of-fold predictions on ``self.oof``, saves one
        weight file per fold, calls ``self._save_loss_png`` per fold and
        prints / logs progress.
    """
    # Local import: only needed to snapshot the best weights.
    import copy

    oof = np.zeros((len(train_df), self.cfg.model.n_classes))
    cv = 0

    for fold_, col in enumerate(self.fold_df.columns):
        fold_banner = f'\n========================== FOLD {fold_} ... ==========================\n'
        print(fold_banner)
        logging.debug(fold_banner)

        is_valid = self.fold_df[col] > 0
        trn_x = train_df[self.fold_df[col] == 0]
        val_x = train_df[is_valid]
        val_y = target_df[is_valid].values

        train_loader = factory.get_dataloader(trn_x, self.cfg.data.train)
        valid_loader = factory.get_dataloader(val_x, self.cfg.data.valid)

        model = factory.get_nn_model(self.cfg).to(device)

        criterion = factory.get_loss(self.cfg)
        optimizer = factory.get_optim(self.cfg, model.parameters())
        scheduler = factory.get_scheduler(self.cfg, optimizer)

        best_epoch = -1
        best_val_score = -np.inf
        mb = master_bar(range(self.cfg.model.epochs))

        train_loss_list = []
        val_loss_list = []
        val_score_list = []

        for epoch in mb:
            start_time = time.time()

            model, avg_loss = self._train_epoch(model, train_loader, criterion, optimizer, mb)
            valid_preds, avg_val_loss = self._val_epoch(model, valid_loader, criterion)
            val_score = factory.get_metrics(self.cfg.common.metrics.name)(val_y, valid_preds)

            train_loss_list.append(avg_loss)
            val_loss_list.append(avg_val_loss)
            val_score_list.append(val_score)

            # ReduceLROnPlateau is stepped with the monitored value; every
            # other scheduler is stepped without arguments.
            if self.cfg.scheduler.name == 'ReduceLROnPlateau':
                scheduler.step(avg_val_loss)
            else:
                scheduler.step()

            elapsed = time.time() - start_time
            epoch_msg = f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} val_score: {val_score:.4f} time: {elapsed:.0f}s'
            mb.write(epoch_msg)
            logging.debug(epoch_msg)

            if val_score > best_val_score:
                best_epoch = epoch + 1
                best_val_score = val_score
                best_valid_preds = valid_preds
                # With multi_gpu the model is accessed through ``.module``
                # (presumably a DataParallel-style wrapper).
                if self.cfg.model.multi_gpu:
                    live_state = model.module.state_dict()
                else:
                    live_state = model.state_dict()
                # state_dict() hands back references to the live tensors, so
                # without a deep copy later epochs would silently overwrite
                # the "best" weights before they are saved.
                best_model = copy.deepcopy(live_state)

        # NOTE: if no epoch ever improves on -inf (zero epochs or NaN
        # scores) ``best_valid_preds`` / ``best_model`` are not set here.
        oof[val_x.index, :] = best_valid_preds
        cv += best_val_score * self.fold_df[col].max()

        torch.save(best_model, f'../logs/{self.run_name}/weight_best_{fold_}.pt')
        self._save_loss_png(train_loss_list, val_loss_list, val_score_list, fold_)

        best_msg = f'\nEpoch {best_epoch} - val_score: {best_val_score:.4f}'
        print(best_msg)
        logging.debug(best_msg)

    print('\n\n===================================\n')
    print(f'CV: {cv:.6f}')
    logging.debug(f'\n\nCV: {cv:.6f}')
    print('\n===================================\n\n')

    # NOTE(review): the column count 5 is hard-coded although the buffer was
    # allocated with cfg.model.n_classes columns -- confirm this is intended.
    self.oof = oof.reshape(-1, 5)

    return cv
def train_model(run_name, df, fold_df, cfg):
    """Run fold-wise training and save out-of-fold predictions.

    For each column of ``fold_df``, rows where the column equals 0 are the
    training split and rows where it is greater than 0 the validation split.
    The validation score is computed only on validation rows whose index
    label is at most 33126; predictions for all validation rows are still
    written to the out-of-fold array.

    Args:
        run_name: Name of the ``../logs/{run_name}/`` directory that
            receives the weight files and ``oof.npy``.
        df: Frame holding the inputs and the ``cfg.common.target`` column.
            Its index labels are used as positions into the out-of-fold
            array.
        fold_df: One column per fold; aligned with ``df``.
        cfg: Configuration object read for data, model, scheduler and
            metric settings.

    Returns:
        ``{'cv': cv}`` where ``cv`` is the sum over folds of
        ``best_val_score * fold_df[col].max()``.
    """
    # Local import: only needed to snapshot the best weights.
    import copy

    # NOTE(review): presumably the last index label of the original
    # (non-external) data -- confirm against the dataset.
    last_scored_label = 33126

    oof = np.zeros(len(df))
    cv = 0

    for fold_, col in enumerate(fold_df.columns):
        fold_banner = f'\n========================== FOLD {fold_} ... ==========================\n'
        print(fold_banner)
        logging.debug(fold_banner)

        trn_x = df[fold_df[col] == 0]
        val_x = df[fold_df[col] > 0]
        # Label-based slice (inclusive) for the targets, and the matching
        # positional indices used to pick the same rows from the predictions.
        val_y = val_x.loc[:last_scored_label][cfg.common.target]
        val_org_idx = np.where(val_x.index <= last_scored_label)[0]

        train_loader = factory.get_dataloader(trn_x, cfg.data.train)
        valid_loader = factory.get_dataloader(val_x, cfg.data.valid)

        model = factory.get_model(cfg).to(device)

        criterion = factory.get_loss(cfg)
        optimizer = factory.get_optim(cfg, model.parameters())
        scheduler = factory.get_scheduler(cfg, optimizer)

        best_epoch = -1
        best_val_score = -np.inf
        mb = master_bar(range(cfg.data.train.epochs))

        train_loss_list = []
        val_loss_list = []
        val_score_list = []

        for epoch in mb:
            start_time = time.time()

            model, avg_loss = train_epoch(model, train_loader, criterion, optimizer, mb, cfg)
            valid_preds, avg_val_loss = val_epoch(model, valid_loader, criterion, cfg)
            val_score = factory.get_metrics(cfg.common.metrics.name)(val_y, valid_preds[val_org_idx])

            train_loss_list.append(avg_loss)
            val_loss_list.append(avg_val_loss)
            val_score_list.append(val_score)

            # ReduceLROnPlateau is stepped with the monitored value; every
            # other scheduler is stepped without arguments.
            if cfg.scheduler.name == 'ReduceLROnPlateau':
                scheduler.step(avg_val_loss)
            else:
                scheduler.step()

            elapsed = time.time() - start_time
            epoch_msg = f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} val_score: {val_score:.4f} time: {elapsed:.0f}s'
            mb.write(epoch_msg)
            logging.debug(epoch_msg)

            if val_score > best_val_score:
                best_epoch = epoch + 1
                best_val_score = val_score
                best_valid_preds = valid_preds
                if cfg.model.multi_gpu:
                    live_state = model.module.state_dict()
                else:
                    live_state = model.state_dict()
                # state_dict() hands back references to the live tensors, so
                # without a deep copy later epochs would silently overwrite
                # the "best" weights before they are saved.
                best_model = copy.deepcopy(live_state)

        oof[val_x.index] = best_valid_preds.reshape(-1)
        cv += best_val_score * fold_df[col].max()

        torch.save(best_model, f'../logs/{run_name}/weight_best_{fold_}.pt')
        save_png(run_name, cfg, train_loss_list, val_loss_list, val_score_list, fold_)

        best_msg = f'\nEpoch {best_epoch} - val_score: {best_val_score:.4f}'
        print(best_msg)
        logging.debug(best_msg)

    print('\n\n===================================\n')
    print(f'CV: {cv:.6f}')
    logging.debug(f'\n\nCV: {cv:.6f}')
    print('\n===================================\n\n')

    result = {
        'cv': cv,
    }

    np.save(f'../logs/{run_name}/oof.npy', oof)

    return result
def train_cnn(run_name, trn_x, val_x, trn_y, val_y, cfg):
    """Train a single model on one train/validation split.

    After every epoch the validation predictions are scored with quadratic
    weighted kappa.  When ``cfg.model.n_classes > 1`` the class is the
    arg-max of the predictions; otherwise a ``QWKOptimizedRounder`` is
    fitted on the predictions to obtain thresholds.  The epoch with the
    highest score supplies everything that is saved.

    Args:
        run_name: Name of the ``../logs/{run_name}/`` output directory.
        trn_x, trn_y: Training inputs and targets handed to the data loader.
        val_x, val_y: Validation inputs and targets.
        cfg: Configuration object read for data, model and scheduler
            settings.

    Returns:
        ``{'cv': best_val_score}``.

    Side effects:
        Saves ``oof.npy`` (best-epoch validation predictions),
        ``best_coef.npy`` (empty list unless ``n_classes == 1``) and
        ``weight_best.pt`` under ``../logs/{run_name}/``, calls
        ``save_png`` and prints / logs progress.
    """
    # Local import: only needed to snapshot the best weights.
    import copy

    train_loader = factory.get_dataloader(trn_x, trn_y, cfg.data.train)
    valid_loader = factory.get_dataloader(val_x, val_y, cfg.data.valid)

    model = factory.get_model(cfg).to(device)

    criterion = factory.get_loss(cfg)
    optimizer = factory.get_optim(cfg, model.parameters())
    scheduler = factory.get_scheduler(cfg, optimizer)

    best_epoch = -1
    best_val_score = -np.inf
    best_coef = []
    mb = master_bar(range(cfg.data.train.epochs))

    train_loss_list = []
    val_loss_list = []
    val_score_list = []
    initial_coef = [0.5, 1.5, 2.5, 3.5, 4.5]

    for epoch in mb:
        start_time = time.time()

        model, avg_loss = train_epoch(model, train_loader, criterion, optimizer, mb, cfg)
        valid_preds, avg_val_loss = val_epoch(model, valid_loader, criterion, cfg)

        if cfg.model.n_classes > 1:
            pred_class = valid_preds.argmax(1)
            val_score = quadratic_weighted_kappa(val_y, pred_class)
            cm = confusion_matrix(val_y, pred_class)
        else:
            # Single-output model: fit rounding thresholds on this epoch's
            # predictions, starting from ``initial_coef``.
            optR = QWKOptimizedRounder()
            optR.fit(valid_preds.copy(), val_y, initial_coef)
            coef = optR.coefficients()
            valid_preds_class = optR.predict(valid_preds.copy(), coef)
            val_score = quadratic_weighted_kappa(val_y, valid_preds_class)
            cm = confusion_matrix(val_y, valid_preds_class)

        # cm = np.round(cm / np.sum(cm, axis=1, keepdims=True), 3)

        train_loss_list.append(avg_loss)
        val_loss_list.append(avg_val_loss)
        val_score_list.append(val_score)

        # ReduceLROnPlateau is stepped with the monitored value; every other
        # scheduler is stepped without arguments.
        if cfg.scheduler.name == 'ReduceLROnPlateau':
            scheduler.step(avg_val_loss)
        else:
            scheduler.step()

        elapsed = time.time() - start_time
        epoch_msg = f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} val_score: {val_score:.4f} time: {elapsed:.0f}s'
        mb.write(epoch_msg)
        logging.debug(epoch_msg)

        if val_score > best_val_score:
            best_epoch = epoch + 1
            best_val_score = val_score
            best_valid_preds = valid_preds
            if cfg.model.multi_gpu:
                live_state = model.module.state_dict()
            else:
                live_state = model.state_dict()
            # state_dict() hands back references to the live tensors, so
            # without a deep copy later epochs would silently overwrite the
            # "best" weights before they are saved.
            best_model = copy.deepcopy(live_state)
            if cfg.model.n_classes == 1:
                best_coef = coef
            best_cm = cm

    # Report the confusion matrix of the best epoch so it matches the score
    # printed below (the last epoch's matrix may belong to a worse epoch).
    print('\n\nCONFUSION MATRIX')
    logging.debug('\n\nCONFUSION MATRIX')
    print(best_cm)
    logging.debug(best_cm)

    print('\n\n===================================\n')
    print(f'CV: {best_val_score:.6f}')
    print(f'BEST EPOCH: {best_epoch}')
    logging.debug(f'\n\nCV: {best_val_score:.6f}')
    logging.debug(f'BEST EPOCH: {best_epoch}\n\n')
    print('\n===================================\n\n')

    result = {
        'cv': best_val_score,
    }

    np.save(f'../logs/{run_name}/oof.npy', best_valid_preds)
    np.save(f'../logs/{run_name}/best_coef.npy', best_coef)
    torch.save(best_model, f'../logs/{run_name}/weight_best.pt')
    save_png(run_name, cfg, train_loss_list, val_loss_list, val_score_list)

    return result
def train(self, train_df, target_df):
    """Train one model per fold and collect out-of-fold predictions.

    For each column of ``self.fold_df``, rows where the column equals 0 form
    the training split and rows where it is greater than 0 form the
    validation split.  When ``'transformer'`` occurs in
    ``self.cfg.model.backbone`` the loaders are built from per-user
    sequences grouped out of the training split; otherwise the plain data
    loader is used.  The epoch with the highest validation score provides
    the fold's out-of-fold predictions and the weights written to
    ``../logs/{self.run_name}/weight_best_{fold}.pt``.

    Args:
        train_df: Frame that is boolean-masked with the ``self.fold_df``
            columns.  Its index labels are used directly as row positions
            into the out-of-fold array (assumes a 0..n-1 index -- TODO
            confirm against the caller).
        target_df: Targets masked the same way as ``train_df``; ``.values``
            of the validation part is passed to the metric.

    Returns:
        The sum over folds of ``best_val_score * self.fold_df[col].max()``.

    Side effects:
        Stores the out-of-fold predictions on ``self.oof``, saves one
        weight file per fold and prints / logs progress.
    """
    # Local import: only needed to snapshot the best weights.
    import copy

    oof = np.zeros((len(train_df), self.cfg.model.n_classes))
    cv = 0

    for fold_, col in enumerate(self.fold_df.columns):
        fold_banner = f'\n========================== FOLD {fold_ + 1} / {self.n_splits} ... ==========================\n'
        print(fold_banner)
        logging.debug(fold_banner)

        is_valid = self.fold_df[col] > 0
        trn_x = train_df[self.fold_df[col] == 0]
        val_x = train_df[is_valid]
        val_y = target_df[is_valid].values

        if 'transformer' in self.cfg.model.backbone:
            usecols = ['user_id', 'content_id', 'task_container_id', 'timestamp', 'prior_question_elapsed_time',
                       'prior_question_had_explanation', 'part', 'answered_correctly',
                       'te_content_id_by_answered_correctly', 'answered_correctly_avg_u']
            # One tuple of value arrays per user, built from the training
            # split only; the same ``group`` is handed to both loaders.
            group = (trn_x[usecols]
                     .groupby('user_id')
                     .apply(lambda r: (r['content_id'].values,
                                       r['answered_correctly'].values,
                                       r['timestamp'].values,
                                       r['prior_question_elapsed_time'].values,
                                       r['part'].values,
                                       r['te_content_id_by_answered_correctly'].values,
                                       r['task_container_id'].values)))
            train_loader = factory.get_transformer_dataloader(samples=group, df=trn_x, cfg=self.cfg.data.train)
            valid_loader = factory.get_transformer_dataloader(samples=group, df=val_x, cfg=self.cfg.data.valid)
        else:
            train_loader = factory.get_dataloader(trn_x, self.cfg.data.train)
            valid_loader = factory.get_dataloader(val_x, self.cfg.data.valid)

        model = factory.get_nn_model(self.cfg).to(device)

        criterion = factory.get_loss(self.cfg)
        optimizer = factory.get_optim(self.cfg, model.parameters())
        scheduler = factory.get_scheduler(self.cfg, optimizer)

        best_epoch = -1
        best_val_score = -np.inf
        mb = master_bar(range(self.cfg.model.epochs))

        train_loss_list = []
        val_loss_list = []
        val_score_list = []

        for epoch in mb:
            start_time = time.time()

            with detect_anomaly():
                model, avg_loss = self._train_epoch(model, train_loader, criterion, optimizer, mb)

            valid_preds, avg_val_loss = self._val_epoch(model, valid_loader, criterion)
            val_score = factory.get_metrics(self.cfg.common.metrics.name)(val_y, valid_preds)

            train_loss_list.append(avg_loss)
            val_loss_list.append(avg_val_loss)
            val_score_list.append(val_score)

            # ReduceLROnPlateau is stepped with the monitored value; every
            # other scheduler is stepped without arguments.
            if self.cfg.scheduler.name == 'ReduceLROnPlateau':
                scheduler.step(avg_val_loss)
            else:
                scheduler.step()

            elapsed = time.time() - start_time
            epoch_msg = f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.6f} avg_val_loss: {avg_val_loss:.6f} val_score: {val_score:.6f} time: {elapsed:.0f}s'
            mb.write(epoch_msg)
            logging.debug(epoch_msg)

            if val_score > best_val_score:
                best_epoch = epoch + 1
                best_val_score = val_score
                best_valid_preds = valid_preds
                # With multi_gpu the model is accessed through ``.module``
                # (presumably a DataParallel-style wrapper).
                if self.cfg.model.multi_gpu:
                    live_state = model.module.state_dict()
                else:
                    live_state = model.state_dict()
                # state_dict() hands back references to the live tensors, so
                # without a deep copy later epochs would silently overwrite
                # the "best" weights before they are saved.
                best_model = copy.deepcopy(live_state)

        # NOTE: if no epoch ever improves on -inf (zero epochs or NaN
        # scores) ``best_valid_preds`` / ``best_model`` are not set here.
        oof[val_x.index, :] = best_valid_preds
        cv += best_val_score * self.fold_df[col].max()

        torch.save(best_model, f'../logs/{self.run_name}/weight_best_{fold_}.pt')
        # self._save_loss_png(train_loss_list, val_loss_list, val_score_list, fold_)

        best_msg = f'\nEpoch {best_epoch} - val_score: {best_val_score:.6f}'
        print(best_msg)
        logging.debug(best_msg)

    print('\n\n===================================\n')
    print(f'CV: {cv:.6f}')
    logging.debug(f'\n\nCV: {cv:.6f}')
    print('\n===================================\n\n')

    self.oof = oof

    return cv
def train_ordinal_reg(run_name, trn_x, val_x, trn_y, val_y, cfg):
    """Train one model per ordinal target column and score the combination.

    Every column of ``trn_y`` except the first gets its own model.  For each
    column the epoch with the lowest validation loss is kept; its raw
    validation outputs are saved and, after a sigmoid, stored as that
    column's probability.  The per-row sum of these probabilities is then
    turned into classes with a ``QWKOptimizedRounder`` and scored with
    quadratic weighted kappa against ``sum(val_y, axis=1) - 1``.

    Args:
        run_name: Name of the ``../logs/{run_name}/`` output directory.
        trn_x, val_x: Training / validation inputs handed to the data
            loader.
        trn_y, val_y: Target frames; indexed by column name, and
            ``val_y.values`` is summed row-wise for the final labels.
        cfg: Configuration object read for data, model and scheduler
            settings.

    Returns:
        ``{'cv': best_val_score}``.

    Side effects:
        Saves ``oof_{col}.npy`` and ``weight_best_{col}.pt`` per column and
        ``best_coef.npy`` under ``../logs/{run_name}/``; prints / logs
        progress.
    """
    # Local import: only needed to snapshot the best weights.
    import copy

    # Explicit float buffer: ``np.zeros_like(val_y)`` would inherit the
    # dtype of the targets, and with integer labels the sigmoid
    # probabilities written below would be truncated to 0.
    ordinal_val_preds = np.zeros(val_y.shape, dtype=float)

    # Starting thresholds for the rounder; defined up front because they are
    # needed after the column loop.
    initial_coef = [0.5, 1.5, 2.5, 3.5, 4.5]

    # The first target column is skipped, so slot ``i`` holds the
    # probability for column ``i + 1`` and the last slot stays zero (which
    # does not affect the row sum taken below).
    for i, col in enumerate(trn_y.columns[1:]):
        col_banner = f'\n\n==================== {col} ===================='
        print(col_banner)
        logging.debug(col_banner)

        train_loader = factory.get_dataloader(trn_x, trn_y[col], cfg.data.train)
        valid_loader = factory.get_dataloader(val_x, val_y[col], cfg.data.valid)

        model = factory.get_model(cfg).to(device)

        criterion = factory.get_loss(cfg)
        optimizer = factory.get_optim(cfg, model.parameters())
        scheduler = factory.get_scheduler(cfg, optimizer)

        best_epoch = -1
        best_val_loss = np.inf
        mb = master_bar(range(cfg.data.train.epochs))

        train_loss_list = []
        val_loss_list = []

        for epoch in mb:
            start_time = time.time()

            model, avg_loss = train_epoch(model, train_loader, criterion, optimizer, mb, cfg)
            valid_preds, avg_val_loss = val_epoch(model, valid_loader, criterion, cfg)

            train_loss_list.append(avg_loss)
            val_loss_list.append(avg_val_loss)

            # ReduceLROnPlateau is stepped with the monitored value; every
            # other scheduler is stepped without arguments.
            if cfg.scheduler.name == 'ReduceLROnPlateau':
                scheduler.step(avg_val_loss)
            else:
                scheduler.step()

            elapsed = time.time() - start_time
            epoch_msg = f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s'
            mb.write(epoch_msg)
            logging.debug(epoch_msg)

            if avg_val_loss < best_val_loss:
                best_epoch = epoch + 1
                best_val_loss = avg_val_loss
                best_valid_preds = valid_preds
                if cfg.model.multi_gpu:
                    live_state = model.module.state_dict()
                else:
                    live_state = model.state_dict()
                # state_dict() hands back references to the live tensors, so
                # without a deep copy later epochs would silently overwrite
                # the "best" weights before they are saved.
                best_model = copy.deepcopy(live_state)

        print(f'epoch: {best_epoch} loss: {best_val_loss}')

        # Sigmoid of the raw best-epoch outputs.
        ordinal_val_preds[:, i] = 1 / (1 + np.exp(-1 * best_valid_preds))

        np.save(f'../logs/{run_name}/oof_{col}.npy', best_valid_preds)
        torch.save(best_model, f'../logs/{run_name}/weight_best_{col}.pt')

    summed_preds = np.sum(ordinal_val_preds, axis=1)
    true_class = (np.sum(val_y.values, axis=1) - 1).astype(int)

    optR = QWKOptimizedRounder()
    optR.fit(summed_preds.copy(), true_class, initial_coef)
    best_coef = optR.coefficients()
    valid_preds_class = optR.predict(summed_preds.copy(), best_coef)
    best_val_score = quadratic_weighted_kappa(true_class, valid_preds_class)
    cm = confusion_matrix(true_class, valid_preds_class)

    print('\n\nCONFUSION MATRIX')
    logging.debug('\n\nCONFUSION MATRIX')
    print(cm)
    logging.debug(cm)

    print('\n\n===================================\n')
    print(f'CV: {best_val_score:.6f}')
    logging.debug(f'\n\nCV: {best_val_score:.6f}')
    print('\n===================================\n\n')

    result = {
        'cv': best_val_score,
    }

    np.save(f'../logs/{run_name}/best_coef.npy', best_coef)

    return result