def train(self, feature_names):
    self.feature_names = feature_names

    fold = 0
    valid = "valid{}".format(str(fold))
    trn_x = super().get_feature_df(self.feature_names, valid, "train")
    val_x = super().get_feature_df(self.feature_names, valid, "validate")
    trn_x.set_index("MachineIdentifier", inplace=True)
    val_x.set_index("MachineIdentifier", inplace=True)
    trn_y = trn_x["HasDetections"].astype(np.int8)
    val_y = val_x["HasDetections"].astype(np.int8)
    del trn_x["HasDetections"], val_x["HasDetections"]
    self.train_dataset = lgb.Dataset(trn_x, trn_y)
    self.valid_dataset = lgb.Dataset(val_x, val_y)

    self.optimized_count = 0
    study = optuna.create_study()
    study.optimize(self.objective, n_trials=self.params["n_trials"])

    best_trn_params = study.best_params
    getLogger(get_version()).info(
        "\t >> Best params: {}".format(best_trn_params))
    send_message("\t :youzyo: Best params: {}".format(best_trn_params))

    for key, val in self.params["trn_params"].items():
        if type(val) != list:
            best_trn_params[key] = val
    self.params["trn_params"] = best_trn_params

    del self.train_dataset, self.valid_dataset
    gc.collect()

    self.best_lgbm_classifier = LGBMClassifier(self.params, self.dataset_name)
    return self.best_lgbm_classifier.train(feature_names)
def output_cv(validity, stamp):
    validity = validity.reset_index()
    columns_order = ["MachineIdentifier", "HasDetections", "Predict"]
    validity = validity.sort_values("MachineIdentifier").reset_index(
        drop=True).loc[:, columns_order]
    cv_auc = fast_auc(validity["HasDetections"],
                      np.array(validity["Predict"]))
    getLogger(get_version()).info("\t >> CV Score (AUC):{}".format(cv_auc))
    send_message("\t {} CV Score (AUC):{}".format(stamp, cv_auc))
    return validity
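# `fast_auc` is imported from a shared utility module that is not shown here.
# A minimal sketch, assuming it is the usual rank-based ROC-AUC helper
# (the exact implementation in the repo may differ):
def fast_auc(y_true, y_prob):
    """Rank-based ROC-AUC over binary labels; pure-numpy sketch."""
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]  # order labels by ascending score
    nfalse = 0
    auc = 0.0
    for y_i in y_true:
        nfalse += (1 - y_i)        # negatives seen so far
        auc += y_i * nfalse        # each positive beats all earlier negatives
    return auc / (nfalse * (len(y_true) - nfalse))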
def get_feature_df(self, feature_names, valid_dir, part):
    """
    Ex)
        dataset_name  : min, ...
        feature_names : __preprocess() return
        valid_dir     : "valid0", "valid1", ...
        part          : "train", "validate", "test"
    """
    feature_df = None
    feature_set_path = Path(__file__).absolute(
    ).parents[2] / "data" / "features" / self.dataset_name / valid_dir

    if dask_mode():
        print("Using dask.dataframe.read_csv()")

    for group, feature_list in feature_names.items():
        getLogger(get_version()).info(
            "\t \t \t Reading {}_{}.csv...".format(part, group))
        send_message("\t \t \t Reading {}_{}.csv...".format(part, group))
        if dask_mode():
            df = dd.read_csv(feature_set_path / "{}_{}.csv".format(part, group),
                             usecols=["MachineIdentifier"] + feature_list)
            df = df.compute()
        else:
            df = pd.read_csv(feature_set_path / "{}_{}.csv".format(part, group),
                             usecols=["MachineIdentifier"] + feature_list)

        if feature_df is None:
            feature_df = df
        else:
            feature_df = feature_df.merge(right=df, how="inner",
                                          on="MachineIdentifier")

    if part in ["train", "validate"]:
        if dask_mode():
            HasDetections = dd.read_csv(
                Path(__file__).absolute().parents[2] / "input" / "train.csv",
                usecols=["MachineIdentifier", "HasDetections"])
            HasDetections = HasDetections.compute()
        else:
            HasDetections = pd.read_csv(
                Path(__file__).absolute().parents[2] / "input" / "train.csv",
                usecols=["MachineIdentifier", "HasDetections"])
        feature_df = feature_df.merge(right=HasDetections,
                                      on="MachineIdentifier", how="inner")

    return feature_df
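# A hypothetical call to get_feature_df(); the group names and column lists
# below are illustrative only and depend on the feature CSVs produced by the
# preprocessing step (data/features/<dataset_name>/<valid_dir>/):
#
#   feature_names = {
#       "os": ["OsBuild", "OsSuite"],                        # read from train_os.csv
#       "av": ["AVProductsInstalled", "AVProductsEnabled"],  # read from train_av.csv
#   }
#   trn_df = classifier.get_feature_df(feature_names, "valid0", "train")
#   # -> one DataFrame, inner-joined on MachineIdentifier,
#   #    with HasDetections attached because part == "train"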
def objective(self, trial):
    # Extract optuna attribs from the input json
    optuna_trn_params = {}
    for key, val in self.params["trn_params"].items():
        if type(val) != list:
            optuna_trn_params[key] = val
        else:
            if type(val[0]) == float:
                optuna_trn_params[key] = trial.suggest_uniform(
                    key, val[0], val[1])
            elif type(val[0]) == int:
                optuna_trn_params[key] = trial.suggest_int(
                    key, val[0], val[1])
            else:
                optuna_trn_params[key] = trial.suggest_categorical(key, val)

    start = time.time()
    getLogger(get_version()).info(
        "\t [OPTUNA] {}th optimization starts".format(self.optimized_count))
    send_message("\t [OPTUNA] :sushi: {}th optimization starts".format(
        self.optimized_count))

    # Classify
    mtd_params = self.params["mtd_params"]
    clf = lgb.train(
        optuna_trn_params,
        self.train_dataset,
        mtd_params["num_boost_round"],
        valid_sets=[self.train_dataset, self.valid_dataset],
        feval=eval_auc,
        verbose_eval=mtd_params["verbose_eval"],
        early_stopping_rounds=mtd_params["early_stopping_rounds"])

    getLogger(get_version()).info("\t {}".format(clf.params))
    send_message("\t {}".format(clf.params))
    for train_or_valid, metrics in clf.best_score.items():
        for metric, score in metrics.items():
            getLogger(get_version()).info("\t\t >> Best {} {}: {}".format(
                train_or_valid, metric, score))
            send_message("\t\t :star-struck: Best {} {}: {}".format(
                train_or_valid, metric, score))

    # Post-process this trial
    elapsed_time = int(time.time() - start)
    minutes, sec = divmod(elapsed_time, 60)
    hour, minutes = divmod(minutes, 60)
    getLogger(get_version()).info(
        "\t [OPTUNA] >> {}th optimization finishes: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
        .format(self.optimized_count, hour, minutes, sec))
    send_message(
        "\t [OPTUNA] :sushi: {}th optimization finishes: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
        .format(self.optimized_count, hour, minutes, sec))
    self.optimized_count += 1

    return clf.best_score["valid_1"]["binary_logloss"]
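# The objective above maps JSON config values onto an Optuna search space:
# scalars are passed through unchanged, two-element float/int lists become
# suggest_uniform / suggest_int ranges, and any other list becomes a
# suggest_categorical choice. A hypothetical ClassifierParams fragment
# illustrating that convention (keys and values are examples only):
#
#   {
#       "n_trials": 20,
#       "trn_params": {
#           "objective": "binary",           # fixed value (not a list)
#           "learning_rate": [0.01, 0.1],    # float range -> suggest_uniform
#           "num_leaves": [31, 255],         # int range   -> suggest_int
#           "boosting": ["gbdt", "goss"]     # categorical -> suggest_categorical
#       },
#       "mtd_params": {
#           "num_boost_round": 10000,
#           "verbose_eval": 100,
#           "early_stopping_rounds": 200
#       }
#   }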
def __save_outputs(cls, outputs, is_train):
    """
    is_train = True  -> Save ground truth and validity of train dataset
    is_train = False -> Save predictions of test dataset
    """
    if is_train:
        columns_order = ["MachineIdentifier", "HasDetections", "Predict"]
        save_path = cls.ROOT_PATH / "data" / "oof"
    else:
        columns_order = ["MachineIdentifier", "HasDetections"]
        save_path = cls.ROOT_PATH / "data" / "submit"
    Path.mkdir(save_path, exist_ok=True, parents=True)

    save_df = None
    for output in outputs:
        if save_df is None:
            save_df = output
        else:
            save_df = pd.concat([save_df, output])

    submission_df = pd.read_csv(cls.ROOT_PATH / "input" / "sample_submission.csv")
    del submission_df["HasDetections"]

    if is_train is False:
        predict_length = len(save_df)
        sub_length = len(submission_df)
        if predict_length != sub_length:
            getLogger(get_version()).info(
                "CAUTION: Length of predict_df ({}) is NOT equal to that of submission_df ({})"
                .format(predict_length, sub_length))
            send_message(
                ":ghost: CAUTION: Length of predict_df ({}) is NOT equal to that of submission_df ({})"
                .format(predict_length, sub_length))
        save_df = pd.merge(save_df, submission_df,
                           on="MachineIdentifier", how='right')
        save_df.fillna(0, inplace=True)

    save_df = save_df.sort_values("MachineIdentifier").reset_index(
        drop=True).loc[:, columns_order]
    filename = save_path / "{}.csv".format(get_version())
    save_df.to_csv(filename, index=False, float_format="%.6f")
    getLogger(get_version()).info("Output {}.".format(filename))
    send_message("Output {}".format(filename))
    return save_df
def train(self, feature_names):
    self.feature_names = feature_names
    self.optimized_count = 0

    study = optuna.create_study()
    study.optimize(self.objective, n_trials=self.params["n_trials"])

    best_trn_params = study.best_params
    getLogger(get_version()).info(
        "\t >> Best params: {}".format(best_trn_params))
    send_message("\t :youzyo: Best params: {}".format(best_trn_params))

    for key, val in self.params["trn_params"].items():
        if type(val) != list:
            best_trn_params[key] = val
    self.params["trn_params"] = best_trn_params

    self.best_lgbm_classifier = LGBMClassifier(self.params, self.dataset_name)
    return self.best_lgbm_classifier.train(feature_names)
def process(cls, args):
    """ main method """
    # Split the config file ({version}.json) into each variable
    config_file = cls.__load_config(args.version)
    with config_file.open() as f:
        params_dict = json.load(f)

    validities = []
    predicts = []
    trained_fully_before_processing = False
    for name, params in params_dict.items():
        feature_groups = params["Preprocess"]
        clf_type = params["Classifier"]
        clf_params = params["ClassifierParams"]
        dataset_path = cls.ROOT_PATH / params["DatasetPath"]
        ProcessorFactory.__print_log(name, clf_type, dataset_path)

        processor = Processor(feature_groups,
                              ClassifierType.parseType(clf_type),
                              clf_params, dataset_path)
        validity, predict = processor.process()
        if validity is None:
            trained_fully_before_processing = True
        else:
            validities.append(validity)
            predicts.append(predict)

    # Output total CV Score
    if trained_fully_before_processing is False and only_prediction() is False:
        valid_df = cls.__save_outputs(validities, is_train=True)
        total_cv = fast_auc(valid_df["HasDetections"],
                            np.array(valid_df["Predict"]))
        getLogger(get_version()).info(
            "\t >> Total CV Score (AUC): {}".format(total_cv))
        send_message(
            "\t :youzyo: Total CV Score (AUC): {}".format(total_cv))

    # Output submit file
    if need_prediction() or only_prediction():
        cls.__save_outputs(predicts, is_train=False)
def __print_log(cls, name, clf_type, dataset_path):
    version = get_version()
    text = "Classifier Type: {}".format(clf_type)
    getLogger(version).info(text)
    send_message(text)
    text = "Input Path: {}".format(dataset_path)
    getLogger(version).info(text)
    send_message(text)
    text = "--------------------------------"
    getLogger(version).info(text)
    send_message(text)
def train(self, feature_names):
    """
    Flow:
        1. Initialize parameters
        2. Process for each fold
            2.1 Generate dataset
            2.2 Classify
            2.3 Calculate feature importances
        3. Output CV Score and feature importances
        4. Predict training data (validate all data)

    Input:
        feature_names: dictionary of features' names
    Output:
        validity: DataFrame(["MachineIdentifier", "HasDetections", "Predict"])
    """
    # Initialize parameters
    mtd_params = self.params["mtd_params"]
    validity = None
    model_path = Path(__file__).absolute(
    ).parents[2] / "data" / "model" / str(get_version())
    Path.mkdir(model_path, exist_ok=True, parents=True)
    feature_importance = pd.DataFrame()

    START_FOLD = 0
    if get_back_training():
        START_FOLD = len(list(model_path.glob('**/*.model')))
    END_FOLD = 5
    if train_one_round():
        START_FOLD = 0
        END_FOLD = 1
    if START_FOLD == END_FOLD:
        return None

    # Process for each fold
    for fold in range(START_FOLD, END_FOLD):
        # Measure start time of the classification of this fold
        start = time.time()
        getLogger(get_version()).info("\t >> {} folds start".format(fold))
        send_message("\t :flashlight: {} folds start".format(fold))

        # Generate dataset
        getLogger(get_version()).info("\t \t Generating datasets...")
        send_message("\t \t Generating datasets...")
        valid = "valid{}".format(str(fold))
        trn_x = super().get_feature_df(feature_names, valid, "train")
        val_x = super().get_feature_df(feature_names, valid, "validate")
        trn_x.set_index("MachineIdentifier", inplace=True)
        val_x.set_index("MachineIdentifier", inplace=True)
        trn_y = trn_x["HasDetections"].astype(np.int8)
        val_y = val_x["HasDetections"].astype(np.int8)
        train_dataset = lgb.Dataset(trn_x, trn_y)
        valid_dataset = lgb.Dataset(val_x, val_y)
        getLogger(get_version()).info("\t \t Datasets were generated.")
        send_message("\t \t Datasets were generated.")

        # Initialize variables for scoring
        if validity is None:
            validity = pd.DataFrame()
            validity["HasDetections"] = pd.concat([trn_y, val_y])
            validity["Predict"] = 0

        # Delete needless features
        del trn_x["HasDetections"], val_x["HasDetections"]

        # Classify
        callbacks = [
            log_evaluation(get_training_logger(get_version()), fold)
        ]
        clf = lgb.train(
            self.params["trn_params"],
            train_dataset,
            mtd_params["num_boost_round"],
            valid_sets=[train_dataset, valid_dataset],
            feval=eval_auc,
            verbose_eval=mtd_params["verbose_eval"],
            early_stopping_rounds=mtd_params["early_stopping_rounds"],
            callbacks=callbacks)
        validity.loc[validity.index.isin(val_x.index),
                     "Predict"] = clf.predict(
                         val_x, num_iteration=clf.best_iteration)

        for train_or_valid, metrics in clf.best_score.items():
            for metric, score in metrics.items():
                getLogger(get_version()).info(
                    "\t\t >> Best {} {}: {}".format(
                        train_or_valid, metric, score))
                send_message("\t\t :star-struck: Best {} {}: {}".format(
                    train_or_valid, metric, score))

        # Calculate feature importance per fold
        if fold == 0:
            feature_importance["feature"] = trn_x.columns
        feature_importance["fold{}".format(fold)] = clf.feature_importance(
            importance_type="gain")

        # Measure finish time of the classification of this fold
        elapsed_time = int(time.time() - start)
        minutes, sec = divmod(elapsed_time, 60)
        hour, minutes = divmod(minutes, 60)
        getLogger(get_version()).info(
            "\t >> {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
            .format(fold, hour, minutes, sec))
        send_message(
            "\t :flashlight: {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
            .format(fold, hour, minutes, sec))

        # Post-process this fold
        del train_dataset, valid_dataset
        gc.collect()
        clf.save_model(str(model_path / "valid{}.model".format(fold)))

    # Output CV score
    validity = output_cv(validity, ":flashlight:")

    # Save importance
    directory_path = Path(__file__).absolute().parents[2] / "importance"
    save_feature_importance(feature_importance, directory_path)

    # Post-process the training
    del feature_importance
    gc.collect()
    return validity
def train_model(fold, data_loaders, dataset_sizes, model, criterion,
                optimizer, scheduler, num_epochs,
                early_stopping_rounds=10, verbose=-1):
    since = time.time()
    best_score = {"train BCELoss": 1.0, "train AUC": 0.5,
                  "valid BCELoss": 1.0, "valid AUC": 0.5}
    best_model_wts = copy.deepcopy(model.state_dict())
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    not_improve_round = 0

    for epoch in range(num_epochs):
        bce_dictionary = {"train": 0.0, "valid": 0.0}
        auc_dictionary = {"train": 0.0, "valid": 0.0}

        for phase in ["train", "valid"]:
            if phase == "train":
                scheduler.step()
                model.train()
            else:
                model.eval()

            running_bce = 0.0
            running_auc = 0.0

            # Iteration
            for inputs, labels in data_loaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                optimizer.zero_grad()

                # forward
                with torch.set_grad_enabled(phase == "train"):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                    # backward + optimize
                    if phase == "train":
                        loss.backward()
                        optimizer.step()

                # statistics
                running_bce += loss.item() * inputs.size(0)
                if torch.cuda.is_available():
                    labels = labels.cpu()
                    outputs = outputs.cpu()
                y_true = labels.numpy()
                y_pred = list(chain.from_iterable(outputs.detach().numpy()))
                running_auc += fast_auc(y_true, y_pred) * inputs.size(0)

            bce_dictionary[phase] = running_bce / dataset_sizes[phase]
            auc_dictionary[phase] = running_auc / dataset_sizes[phase]

        # Update model
        if best_score["train BCELoss"] > bce_dictionary["train"]:
            best_score["train BCELoss"] = bce_dictionary["train"]
            best_score["valid BCELoss"] = bce_dictionary["valid"]
            best_score["train AUC"] = auc_dictionary["train"]
            best_score["valid AUC"] = auc_dictionary["valid"]
            best_model_wts = copy.deepcopy(model.state_dict())
            not_improve_round = 0
        else:
            not_improve_round += 1
            if not_improve_round >= early_stopping_rounds:
                getLogger(get_version()).info(
                    "\t \t Epoch {}/{}: Early stopping".format(epoch, num_epochs))
                send_message(
                    "\t \t :upura: Epoch {}/{}: Early stopping".format(epoch, num_epochs))
                break

        if epoch % verbose == 0 and verbose != -1:
            getLogger(get_version()).info("{}\t{}\t{}\t{}\t{}\t{}".format(
                fold, epoch, bce_dictionary["train"], bce_dictionary["valid"],
                auc_dictionary["train"], auc_dictionary["valid"]))
            send_message("{}\t{}\t{}\t{}\t{}\t{}".format(
                fold, epoch, bce_dictionary["train"], bce_dictionary["valid"],
                auc_dictionary["train"], auc_dictionary["valid"]))
        get_training_logger(get_version()).debug("{}\t{}\t{}\t{}\t{}\t{}".format(
            fold, epoch, bce_dictionary["train"], bce_dictionary["valid"],
            auc_dictionary["train"], auc_dictionary["valid"]))

    time_elapsed = time.time() - since
    getLogger(get_version()).info(
        "\t \t Training complete in {:.0f}m {:.0f}s".format(
            time_elapsed // 60, time_elapsed % 60))
    send_message(
        "\t \t Training complete in {:.0f}m {:.0f}s".format(
            time_elapsed // 60, time_elapsed % 60))
    getLogger(get_version()).info(
        "\t \t [Best train scores] BCELoss: {:.4f}, AUC: {:.4f}".format(
            best_score["train BCELoss"], best_score["train AUC"]))
    getLogger(get_version()).info(
        "\t \t [Valid scores when train is the best] BCELoss: {:.4f}, AUC: {:.4f}".format(
            best_score["valid BCELoss"], best_score["valid AUC"]))
    send_message("\t \t :star-struck: Best train BCELoss: {:.4f}".format(
        best_score["train BCELoss"]))
    send_message("\t \t :star-struck: Best train AUC: {:.4f}".format(
        best_score["train AUC"]))
    send_message("\t \t :star-struck: Best valid BCELoss: {:.4f}".format(
        best_score["valid BCELoss"]))
    send_message("\t \t :star-struck: Best valid AUC: {:.4f}".format(
        best_score["valid AUC"]))

    model.load_state_dict(best_model_wts)
    return model
def train(self, feature_names):
    """
    Input:
        feature_names: dictionary of features' names
    Output:
        validity: DataFrame(["MachineIdentifier", "HasDetections", "Predict"])
    """
    # Initialize parameters
    validity = None
    model_path = Path(__file__).absolute(
    ).parents[2] / "data" / "model" / str(get_version())
    Path.mkdir(model_path, exist_ok=True, parents=True)
    feature_importance = pd.DataFrame()

    START_FOLD = 0
    if get_back_training():
        START_FOLD = len(list(model_path.glob('**/*.model')))
    END_FOLD = 5
    if train_one_round():
        START_FOLD = 0
        END_FOLD = 1
    if START_FOLD == END_FOLD:
        return None

    # Process for each fold
    for fold in range(START_FOLD, END_FOLD):
        log_path = Path(__file__).absolute().parents[2] / "log" / "train" / \
            str(get_version()) / "fold{}".format(fold)
        Path.mkdir(log_path, exist_ok=True, parents=True)

        # Measure start time of the classification of this fold
        start = time.time()
        getLogger(get_version()).info("\t >> {} folds start".format(fold))
        send_message("\t :cat: {} folds start".format(fold))

        # Generate dataset
        getLogger(get_version()).info("\t \t Generating datasets...")
        send_message("\t \t Generating datasets...")
        valid = "valid{}".format(str(fold))
        trn_x = super().get_feature_df(feature_names, valid, "train")
        val_x = super().get_feature_df(feature_names, valid, "validate")
        trn_x.set_index("MachineIdentifier", inplace=True)
        val_x.set_index("MachineIdentifier", inplace=True)
        trn_y = trn_x["HasDetections"].astype(np.int8)
        val_y = val_x["HasDetections"].astype(np.int8)
        getLogger(get_version()).info("\t \t Datasets were generated.")
        send_message("\t \t Datasets were generated.")

        # Initialize variables for scoring
        if validity is None:
            validity = pd.DataFrame()
            validity["HasDetections"] = pd.concat([trn_y, val_y])
            validity["Predict"] = 0

        # Delete needless features
        del trn_x["HasDetections"], val_x["HasDetections"]

        # Classify
        clf = CatBoostClassifier(
            iterations=self.params["iterations"],
            verbose=self.params["verbose"],
            early_stopping_rounds=self.params["early_stopping_rounds"],
            random_seed=self.params["random_seed"],
            max_depth=self.params["max_depth"],
            loss_function=self.params["loss_function"],
            custom_metric=self.params["custom_metric"],
            eval_metric=self.params["eval_metric"],
            rsm=self.params["rsm"],
            train_dir=str(log_path))
        clf.fit(trn_x.values, trn_y.values,
                eval_set=(val_x.values, val_y.values))

        for train_or_valid, metrics in clf.best_score_.items():
            for metric, score in metrics.items():
                getLogger(get_version()).info(
                    "\t\t >> Best {} {}: {}".format(
                        train_or_valid, metric, score))
                send_message("\t\t :star-struck: Best {} {}: {}".format(
                    train_or_valid, metric, score))

        validity.loc[validity.index.isin(val_x.index),
                     "Predict"] = clf.predict_proba(val_x.values)[:, 1]

        # Calculate feature importance per fold
        if fold == 0:
            feature_importance["feature"] = trn_x.columns
        feature_importance["fold{}".format(fold)] = clf.get_feature_importance()

        # Measure finish time of the classification of this fold
        elapsed_time = int(time.time() - start)
        minutes, sec = divmod(elapsed_time, 60)
        hour, minutes = divmod(minutes, 60)
        getLogger(get_version()).info(
            "\t >> {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
            .format(fold, hour, minutes, sec))
        send_message(
            "\t :cat: {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
            .format(fold, hour, minutes, sec))

        # Post-process this fold
        clf.save_model(str(model_path / "valid{}.model".format(fold)))

    # Output CV score
    validity = output_cv(validity, ":cat:")

    # Save importance
    directory_path = Path(__file__).absolute().parents[2] / "importance"
    save_feature_importance(feature_importance, directory_path)

    # Post-process the training
    del feature_importance
    gc.collect()
    return validity
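# `save_feature_importance` is shared by the LightGBM and CatBoost trainers
# but not shown here. A minimal sketch, assuming it averages the per-fold
# importance columns and writes one CSV per version (the real helper may
# also plot or log the result):
def save_feature_importance(feature_importance, directory_path):
    """Sketch: persist mean feature importance across folds as a CSV."""
    Path.mkdir(directory_path, exist_ok=True, parents=True)
    fold_columns = [c for c in feature_importance.columns if c.startswith("fold")]
    feature_importance["mean"] = feature_importance[fold_columns].mean(axis=1)
    feature_importance.sort_values("mean", ascending=False).to_csv(
        directory_path / "{}.csv".format(get_version()), index=False)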
warnings.filterwarnings('ignore')


def main(args):
    send_message(
        ":thinking_face: ============= {} ============= :thinking_face:".format(
            str(datetime.now())))
    ProcessorFactory.process(args)


if __name__ == "__main__":
    gc.enable()
    version = get_version()
    create_main_logger(version)
    create_train_logger(version)
    try:
        main(get_option())
    except DuplicateVersionException:
        send_message(":stop: Duplicate Version Exception Occurred.")
        getLogger(version).exception("Duplicate Version Exception Occurred.")
    except IrregularArgumentException:
        send_message(":stop: Irregular Argument for Feature Extraction.")
        getLogger(version).exception(
            "Irregular Argument for Feature Extraction.")
    except IrregularCalcBackException:
        send_message(":stop: Irregular Dataframe back.")
        getLogger(version).exception("Irregular Dataframe back.")
    except Exception:
        send_message(":stop: Unexpected Exception Occurred.")
        getLogger(version).exception("Unexpected Exception Occurred.")
def train(self, feature_names):
    # Initialize parameters
    validity = None
    model_path = Path(__file__).absolute(
    ).parents[2] / "data" / "model" / str(get_version())
    Path.mkdir(model_path, exist_ok=True, parents=True)

    START_FOLD = 0
    if get_back_training():
        START_FOLD = len(list(model_path.glob('**/*.model')))
    END_FOLD = 5
    if train_one_round():
        START_FOLD = 0
        END_FOLD = 1
    if START_FOLD == END_FOLD:
        return None

    get_training_logger(get_version()).debug(
        "fold \t iteration \t train BCELoss \t valid BCELoss \t train AUC \t valid AUC")

    # Process for each fold
    for fold in range(START_FOLD, END_FOLD):
        # Measure start time of the classification of this fold
        start = time.time()
        getLogger(get_version()).info("\t >> {} folds start".format(fold))
        send_message("\t :fire: {} folds start".format(fold))
        valid = "valid{}".format(str(fold))

        # Generate train data
        getLogger(get_version()).info("\t \t Generating datasets...")
        send_message("\t \t Generating datasets...")
        trn_x = super().get_feature_df(feature_names, valid, "train")
        trn_x.set_index("MachineIdentifier", inplace=True)
        trn_y = trn_x["HasDetections"].values.astype(np.float32)
        val_x = super().get_feature_df(feature_names, valid, "validate")
        val_x.set_index("MachineIdentifier", inplace=True)
        val_y = val_x["HasDetections"].values.astype(np.float32)

        # Initialize variables for scoring
        if validity is None:
            validity = pd.DataFrame()
            validity["HasDetections"] = pd.concat([
                trn_x["HasDetections"].astype(np.float32),
                val_x["HasDetections"].astype(np.float32)
            ])
            validity["Predict"] = 0

        del trn_x["HasDetections"], val_x["HasDetections"]
        trn_x.fillna(0, inplace=True)
        val_x.fillna(0, inplace=True)

        normalized_trn_x = (trn_x - trn_x.min()) / (trn_x.max() - trn_x.min())
        normalized_trn_x.fillna(normalized_trn_x.mean(), inplace=True)
        normalized_val_x = (val_x - val_x.min()) / (val_x.max() - val_x.min())
        normalized_val_x.fillna(normalized_val_x.mean(), inplace=True)

        train_loader, valid_loader, dataset_sizes = create_tensor_dataloader(
            self.params, normalized_trn_x, trn_y, normalized_val_x, val_y)
        data_loaders = {"train": train_loader, "valid": valid_loader}
        getLogger(get_version()).info("\t \t Datasets were generated.")
        send_message("\t \t Datasets were generated.")

        # Define the Network
        num_epochs = self.params["num_epochs"]
        network_class = getattr(
            import_module("classifier.pytorch_network." + self.params["network"]),
            self.params["network"])
        network = network_class(len(trn_x.columns),
                                self.params["network_params"])
        criterion = nn.BCELoss()  # Don't change the criterion function!
        optimizer_class = getattr(import_module('torch.optim'),
                                  self.params["optimizer"])
        optimizer = optimizer_class(network.parameters(),
                                    **self.params["optimizer_params"])
        scheduler_class = getattr(import_module('torch.optim.lr_scheduler'),
                                  self.params["scheduler"])
        scheduler = scheduler_class(optimizer,
                                    **self.params["scheduler_params"])

        model = train_model(
            fold, data_loaders, dataset_sizes, network, criterion,
            optimizer, scheduler, num_epochs=num_epochs,
            early_stopping_rounds=self.params["early_stopping_rounds"],
            verbose=self.params["verbose"])

        # Classify
        validity.loc[validity.index.isin(val_x.index),
                     "Predict"] = predict_with_model(
                         self.params, normalized_val_x, model)

        # Measure finish time of the classification of this fold
        elapsed_time = int(time.time() - start)
        minutes, sec = divmod(elapsed_time, 60)
        hour, minutes = divmod(minutes, 60)
        getLogger(get_version()).info(
            "\t >> {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
            .format(fold, hour, minutes, sec))
        send_message(
            "\t :fire: {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
            .format(fold, hour, minutes, sec))

        # Post-process this fold
        gc.collect()
        torch.save(model, str(model_path / "valid{}.model".format(fold)))

    # Output CV score
    validity = validity.reset_index()
    columns_order = ["MachineIdentifier", "HasDetections", "Predict"]
    validity = validity.sort_values("MachineIdentifier").reset_index(
        drop=True).loc[:, columns_order]
    cv_auc = fast_auc(validity["HasDetections"],
                      np.array(validity["Predict"]))
    getLogger(get_version()).info("\t >> CV Score (AUC):{}".format(cv_auc))
    send_message("\t :fire: CV Score (AUC):{}".format(cv_auc))

    # Post-process the training
    gc.collect()
    return validity
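# `create_tensor_dataloader` and `predict_with_model` are helpers from the
# repo's PyTorch utilities and are not shown here. A minimal sketch of the
# data-loader half, assuming self.params carries a "batch_size" key and that
# trn_y / val_y are float32 numpy arrays (as prepared above):
def create_tensor_dataloader(params, trn_x, trn_y, val_x, val_y):
    """Sketch: wrap normalized frames into TensorDatasets and DataLoaders."""
    from torch.utils.data import DataLoader, TensorDataset
    train_dataset = TensorDataset(
        torch.tensor(trn_x.values, dtype=torch.float32),
        torch.tensor(trn_y, dtype=torch.float32).unsqueeze(1))
    valid_dataset = TensorDataset(
        torch.tensor(val_x.values, dtype=torch.float32),
        torch.tensor(val_y, dtype=torch.float32).unsqueeze(1))
    train_loader = DataLoader(train_dataset,
                              batch_size=params["batch_size"], shuffle=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=params["batch_size"], shuffle=False)
    dataset_sizes = {"train": len(train_dataset), "valid": len(valid_dataset)}
    return train_loader, valid_loader, dataset_sizes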
def objective(self, trial):
    # Extract optuna attribs from the input json
    optuna_trn_params = {}
    for key, val in self.params["trn_params"].items():
        if type(val) != list:
            optuna_trn_params[key] = val
        else:
            if type(val[0]) == float:
                optuna_trn_params[key] = trial.suggest_uniform(key, val[0], val[1])
            elif type(val[0]) == int:
                optuna_trn_params[key] = trial.suggest_int(key, val[0], val[1])
            else:
                optuna_trn_params[key] = trial.suggest_categorical(key, val)

    # Initialize parameters
    mtd_params = self.params["mtd_params"]
    validity = None
    model_path = Path(__file__).absolute(
    ).parents[2] / "data" / "model" / str(get_version())
    Path.mkdir(model_path, exist_ok=True, parents=True)

    START_FOLD = 0
    if get_back_training():
        START_FOLD = len(list(model_path.glob('**/*.model')))
    END_FOLD = 5
    if train_one_round():
        START_FOLD = 0
        END_FOLD = 1
    if START_FOLD == END_FOLD:
        return None

    start2 = time.time()
    getLogger(get_version()).info(
        "\t [OPTUNA] {}th optimization starts".format(self.optimized_count))
    send_message("\t [OPTUNA] :sushi: {}th optimization starts".format(
        self.optimized_count))

    # Process for each fold
    for fold in range(START_FOLD, END_FOLD):
        start = time.time()
        getLogger(get_version()).info(
            "\t [OPTUNA] >> {} folds start".format(fold))
        send_message("\t [OPTUNA] :sushi: {} folds start".format(fold))

        # Generate dataset
        valid = "valid{}".format(str(fold))
        trn_x = super().get_feature_df(self.feature_names, valid, "train")
        val_x = super().get_feature_df(self.feature_names, valid, "validate")
        trn_x.set_index("MachineIdentifier", inplace=True)
        val_x.set_index("MachineIdentifier", inplace=True)
        trn_y = trn_x["HasDetections"].astype(np.int8)
        val_y = val_x["HasDetections"].astype(np.int8)
        train_dataset = lgb.Dataset(trn_x, trn_y)
        valid_dataset = lgb.Dataset(val_x, val_y)

        # Initialize variables for scoring
        if validity is None:
            validity = pd.DataFrame()
            validity["HasDetections"] = pd.concat([trn_y, val_y])
            validity["Predict"] = 0

        # Delete needless features
        del trn_x["HasDetections"], val_x["HasDetections"]

        # Classify
        clf = lgb.train(
            optuna_trn_params,
            train_dataset,
            mtd_params["num_boost_round"],
            valid_sets=[train_dataset, valid_dataset],
            feval=eval_auc,
            verbose_eval=mtd_params["verbose_eval"],
            early_stopping_rounds=mtd_params["early_stopping_rounds"])
        validity.loc[validity.index.isin(val_x.index),
                     "Predict"] = clf.predict(
                         val_x, num_iteration=clf.best_iteration)

        if fold == START_FOLD:
            getLogger(get_version()).info("\t {}".format(clf.params))
            send_message("\t {}".format(clf.params))
        for train_or_valid, metrics in clf.best_score.items():
            for metric, score in metrics.items():
                getLogger(get_version()).info("\t\t >> Best {} {}: {}".format(
                    train_or_valid, metric, score))
                send_message("\t\t :star-struck: Best {} {}: {}".format(
                    train_or_valid, metric, score))

        # Post-process this fold
        del train_dataset, valid_dataset
        gc.collect()
        elapsed_time = int(time.time() - start)
        minutes, sec = divmod(elapsed_time, 60)
        hour, minutes = divmod(minutes, 60)
        getLogger(get_version()).info(
            "\t [OPTUNA] >> {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
            .format(fold, hour, minutes, sec))
        send_message(
            "\t [OPTUNA] :sushi: {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
            .format(fold, hour, minutes, sec))

    elapsed_time = int(time.time() - start2)
    minutes, sec = divmod(elapsed_time, 60)
    hour, minutes = divmod(minutes, 60)
    getLogger(get_version()).info(
        "\t [OPTUNA] >> {}th optimization finishes: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
        .format(self.optimized_count, hour, minutes, sec))
    send_message(
        "\t [OPTUNA] :sushi: {}th optimization finishes: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
        .format(self.optimized_count, hour, minutes, sec))
    self.optimized_count += 1

    # Output CV score
    validity = validity.reset_index()
    columns_order = ["MachineIdentifier", "HasDetections", "Predict"]
    validity = validity.sort_values("MachineIdentifier").reset_index(
        drop=True).loc[:, columns_order]
    cv_auc = fast_auc(validity["HasDetections"],
                      np.array(validity["Predict"]))
    return 1 - cv_auc