def runExperiment(runname, mlepConfig, experiment_name, expstatuslog, earlystop):
    """Run a single MLEP streaming-classification experiment and log it to mlflow.

    :param runname: mlflow run name for this experiment.
    :param mlepConfig: dict-like MLEP configuration; its "config" sub-dict is
        logged as mlflow params (except "drift_metrics").
    :param experiment_name: free-form label logged as an mlflow param.
    :param expstatuslog: if truthy, redirect stdout to LOG_FILE and upload it
        as an mlflow artifact at the end; otherwise stdout is discarded.
    :param earlystop: sample count at which to stop early; falsy disables.

    NOTE(review): this function redirects sys.stdout for its whole duration
    and restores sys.__stdout__ at the end — nothing else should print
    concurrently. The mysql tracking URI credentials are masked with *****;
    presumably filled in elsewhere — confirm before running.
    """
    # set up mlflow access
    # mlflow.set_tracking_uri -- not needed, defaults to mlruns
    # mlflow.create_experiment -- need experiment name. Should I programmatically create one? or go by timestamp
    if expstatuslog:
        sys.stdout = open(LOG_FILE, "w")
    else:
        # dumbwrite presumably swallows writes (null sink) — defined elsewhere in this file.
        sys.stdout = dumbwrite()

    mlflow.set_tracking_uri("mysql://*****:*****@127.0.0.1:3306/mlflow_runs")
    mlflow.start_run(run_name=runname)

    # Log relevant details
    for _key in mlepConfig["config"]:
        # possible error: "drift_metrics" is skipped because its value is not
        # a plain scalar mlflow can store as a param.
        if _key != "drift_metrics":
            mlflow.log_param(_key, mlepConfig["config"][_key])
    mlflow.log_param("experiment_name", experiment_name)

    internalTimer = 0
    # Streaming evaluation data, plus two batched sets: class-based augmentation
    # and the initial training set. Paths are hard-coded relative to the CWD.
    streamData = StreamLocal.StreamLocal(
        data_source="data/2014_to_dec2018.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)

    augmentation = BatchedLocal.BatchedLocal(
        data_source='data/collectedIrrelevant.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    augmentation.load_by_class()

    trainingData = BatchedLocal.BatchedLocal(
        data_source='data/initialTrainingData.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    trainingData.load()

    # Now we have the data
    MLEPLearner = MLEPServer.MLEPLearningServer(config_dict=mlepConfig, safe_mode=False)

    # Perform initial traininig
    MLEPLearner.initialTrain(traindata=trainingData)
    io_utils.std_flush("Completed training at", time_utils.readable_time())
    MLEPLearner.addAugmentation(augmentation)
    io_utils.std_flush("Added augmentation at", time_utils.readable_time())

    totalCounter = 0.0
    mistakes = []          # 1.0 per misclassification, 0.0 per hit — tail window gives running error
    _earlystopcond = False

    while streamData.next() and not _earlystopcond:
        # Advance the learner's clock only when the stream timestamp moves forward.
        if internalTimer < streamData.getObject().getValue("timestamp"):
            internalTimer = streamData.getObject().getValue("timestamp")
            MLEPLearner.updateTime(internalTimer)
        classification = MLEPLearner.classify(streamData.getObject())
        totalCounter += 1.0
        if classification != streamData.getLabel():
            mistakes.append(1.0)
        else:
            mistakes.append(0.0)
        # Progress report every 1000 samples (running error over the last 100).
        if totalCounter % 1000 == 0 and totalCounter > 0.0:
            io_utils.std_flush("Completed", int(totalCounter), " samples, with running error (past 100) of", sum(mistakes[-100:]) / 100.0)
        if earlystop and totalCounter == earlystop:
            _earlystopcond = True
        # Log the running error every 100 samples under a per-window metric name.
        if totalCounter % 100 == 0 and totalCounter > 0.0:
            running_error = sum(mistakes[-100:]) / 100.0
            mlflow.log_metric("running_err" + str(int(totalCounter / 100)), running_error)

    MLEPLearner.shutdown()

    io_utils.std_flush(
        "\n-----------------------------\nCOMPLETED\n-----------------------------\n"
    )

    mlflow.log_param("total_samples", totalCounter)
    if expstatuslog:
        mlflow.log_artifact(LOG_FILE)
    mlflow.log_param("run_complete", True)
    mlflow.end_run()

    # Restore the real stdout (and close the log file if we opened one).
    if expstatuslog:
        sys.stdout.close()
        sys.stdout = sys.__stdout__
    else:
        sys.stdout = sys.__stdout__
def __exit__(self, exc_type, exc_value, traceback):
    """Context-manager exit hook: always terminate the active mlflow run.

    Renamed ``type``/``value`` parameters to ``exc_type``/``exc_value`` so
    they no longer shadow the builtins; the ``with`` statement calls
    ``__exit__`` positionally, so this is backward-compatible.

    Returns None (falsy), so any exception raised inside the ``with`` block
    is propagated to the caller after the run is ended.
    """
    mlflow.end_run()
def main(params: dict):
    """Train a SAKT transformer on one Riiid split, evaluate it, and ensemble
    its OOF predictions with a previously-trained LGBM model.

    :param params: hyper-parameters; reads "max_seq", "embed_dim", "lr".

    NOTE(review): relies on module-level names (is_debug, epochs, device,
    output_dir, is_make_feature_factory, pd, np, torch, nn, tqdm, DataLoader,
    SAKTDataset, SAKTModel, FeatureFactoryForTransformer, train_epoch,
    roc_auc_score, get_logger, os, json, pickle) — confirm against file top.
    """
    import mlflow
    logger = get_logger()
    print("start params={}".format(params))
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle").head(30_000_000)
    df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    df = df[["user_id", "content_id", "content_type_id", "part", "user_answer", "answered_correctly"]]

    # Per-user split: ~10% of users go entirely to validation; for the rest,
    # the last 10% of each user's (time-ordered) rows are held out.
    train_idx = []
    val_idx = []
    np.random.seed(0)
    for _, w_df in df[df["content_type_id"] == 0].groupby("user_id"):
        if np.random.random() < 0.1:
            # all val
            val_idx.extend(w_df.index.tolist())
        else:
            train_num = int(len(w_df) * 0.9)
            train_idx.extend(w_df[:train_num].index.tolist())
            val_idx.extend(w_df[train_num:].index.tolist())
    df["is_val"] = 0
    # NOTE(review): chained assignment (df["is_val"].loc[...]) — works here but
    # triggers SettingWithCopyWarning; df.loc[val_idx, "is_val"] = 1 is safer.
    df["is_val"].loc[val_idx] = 1

    # Split each training user's history into max_seq-sized chunks by giving
    # every chunk a synthetic "<user>_<group>" id.
    w_df = df[df["is_val"] == 0]
    w_df["group"] = (w_df.groupby("user_id")["user_id"].transform("count") - w_df.groupby("user_id").cumcount()) // params["max_seq"]
    w_df["user_id"] = w_df["user_id"].astype(str) + "_" + w_df["group"].astype(str)

    ff_for_transformer = FeatureFactoryForTransformer(column_config={("content_id", "content_type_id"): {"type": "category"},
                                                                    "user_answer": {"type": "category"},
                                                                    "part": {"type": "category"}},
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    group = ff_for_transformer.all_predict(w_df)
    # n_skill = vocabulary size of the (content_id, content_type_id) embedding.
    n_skill = len(ff_for_transformer.embbed_dict[("content_id", "content_type_id")])
    print(group)
    dataset_train = SAKTDataset(group, n_skill=n_skill, max_seq=params["max_seq"])

    # Fresh factory for validation: full (question-only) frame, not chunked.
    ff_for_transformer = FeatureFactoryForTransformer(column_config={("content_id", "content_type_id"): {"type": "category"},
                                                                    "user_answer": {"type": "category"},
                                                                    "part": {"type": "category"}},
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
    dataset_val = SAKTDataset(group, is_test=True, n_skill=n_skill, max_seq=params["max_seq"])

    dataloader_train = DataLoader(dataset_train, batch_size=64, shuffle=True, num_workers=1)
    dataloader_val = DataLoader(dataset_val, batch_size=64, shuffle=False, num_workers=1)

    model = SAKTModel(n_skill, embed_dim=params["embed_dim"], max_seq=params["max_seq"])
    optimizer = torch.optim.Adam(model.parameters(), lr=params["lr"])
    criterion = nn.BCEWithLogitsLoss()
    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train, dataloader_val, optimizer, criterion, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".format(epoch, loss, auc, auc_val))

    # Final validation pass: predict only the last position of each sequence.
    preds = []
    labels = []
    for d in tqdm(dataloader_val):
        x = d[0].to(device).long()
        target_id = d[1].to(device).long()
        part = d[2].to(device).long()
        label = d[3].to(device).long()
        output, atten_weight = model(x, target_id, part)
        preds.extend(torch.nn.Sigmoid()(output[:, -1]).view(-1).data.cpu().numpy().tolist())
        labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist())

    # Save OOF predictions and ensemble them with the LGBM OOF file.
    df_oof = pd.DataFrame()
    df_oof["row_id"] = df.loc[val_idx].index
    df_oof["predict"] = preds
    df_oof["target"] = df.loc[val_idx]["answered_correctly"].values
    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)

    df_oof2 = pd.read_csv("../output/ex_172/20201202080625/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_transformer = roc_auc_score(df_oof2["target"].values, df_oof2["predict"].values)
    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("single transformer: {:.4f}".format(auc_transformer))
    print("lgbm: {:.4f}".format(auc_lgbm))

    # Grid-search the blend weight r in [0, 1] (step 0.05) for the best AUC.
    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))
        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))

    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))
        mlflow.log_param("count_row", len(df))
        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.log_metric("auc_lgbm", auc_lgbm)
        mlflow.log_metric("auc_ensemble", max_auc)
        mlflow.log_metric("ensemble_nn_ratio", max_nn_ratio)
        mlflow.end_run()

    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)

    # Optionally re-fit the transformer feature factory on the FULL data for inference.
    if is_make_feature_factory:
        ff_for_transformer = FeatureFactoryForTransformer(column_config={("content_id", "content_type_id"): {"type": "category"},
                                                                        "part": {"type": "category"}},
                                                          dict_path="../feature_engineering/",
                                                          sequence_length=params["max_seq"],
                                                          logger=logger)
        df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        ff_for_transformer.fit(df)
        # Loggers are not picklable; drop before dumping.
        ff_for_transformer.logger = None
        with open(f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f:
            pickle.dump(ff_for_transformer, f)
def main(params: dict, output_dir: str):
    """Train an extended SAKT transformer (extra categorical/numeric features)
    on one Riiid split, evaluate, and persist model + feature factories.

    :param params: hyper-parameters; reads "max_seq", "batch_size",
        "embed_dim", "lr", "num_warmup_steps".
    :param output_dir: directory for model weights, OOF csv and pickles.

    NOTE(review): relies on module-level names (is_debug, load_pickle, epochs,
    dropout, device, is_make_feature_factory, the feature-factory classes,
    SAKTDataset/SAKTModel, AdamW, get_linear_schedule_with_warmup,
    train_epoch, gc, os, pickle, json) — confirm against file top.
    """
    import mlflow
    print("start params={}".format(params))
    logger = get_logger()
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
    df = pd.read_pickle(
        "../input/riiid-test-answer-prediction/split10/train_0.pickle"
    ).sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    # -1 acts as an explicit "unknown" category for the NaN explanations.
    df["prior_question_had_explanation"] = df[
        "prior_question_had_explanation"].fillna(-1)

    # Column -> embedding-type mapping consumed by FeatureFactoryForTransformer.
    column_config = {
        ("content_id", "content_type_id"): {
            "type": "category"
        },
        "user_answer": {
            "type": "category"
        },
        "part": {
            "type": "category"
        },
        "prior_question_elapsed_time_bin300": {
            "type": "category"
        },
        "duration_previous_content_bin300": {
            "type": "category"
        },
        "prior_question_had_explanation": {
            "type": "category"
        },
        "rating_diff_content_user_id": {
            "type": "numeric"
        }
    }

    # Recompute engineered features unless cached datasets will be loaded below.
    if not load_pickle or is_debug:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent()
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"][
            "UserContentRateEncoder"] = UserContentRateEncoder(
                rate_func="elo", column="user_id")
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="train_0",
            load_feature=not is_debug,
            save_feature=not is_debug)
        print("all_predict")
        df = feature_factory_manager.all_predict(df)
        df = df[[
            "user_id", "content_id", "content_type_id", "part", "user_answer",
            "answered_correctly", "prior_question_elapsed_time_bin300",
            "duration_previous_content_bin300",
            "prior_question_had_explanation", "rating_diff_content_user_id"
        ]]
        print(df.head(10))

    print("data preprocess")
    # Per-user split: ~1% of users entirely to validation; for the rest the
    # last 5% of each user's time-ordered rows are held out.
    train_idx = []
    val_idx = []
    np.random.seed(0)
    for _, w_df in df[df["content_type_id"] == 0].groupby("user_id"):
        if np.random.random() < 0.01:
            # all val
            val_idx.extend(w_df.index.tolist())
        else:
            train_num = int(len(w_df) * 0.95)
            train_idx.extend(w_df[:train_num].index.tolist())
            val_idx.extend(w_df[train_num:].index.tolist())

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    ff_for_transformer.make_dict(df=df)
    # n_skill = vocabulary size of the (content_id, content_type_id) embedding.
    n_skill = len(ff_for_transformer.embbed_dict[("content_id",
                                                  "content_type_id")])

    if not load_pickle or is_debug:
        df["is_val"] = 0
        # NOTE(review): chained assignment — SettingWithCopyWarning territory;
        # df.loc[val_idx, "is_val"] = 1 would be the safe spelling.
        df["is_val"].loc[val_idx] = 1
        # Chunk each training user's history into max_seq-sized groups via a
        # synthetic "<user>_<group>" id.
        w_df = df[df["is_val"] == 0]
        w_df["group"] = (
            w_df.groupby("user_id")["user_id"].transform("count") -
            w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(
            str) + "_" + w_df["group"].astype(str)

        group = ff_for_transformer.all_predict(w_df)
        dataset_train = SAKTDataset(group,
                                    n_skill=n_skill,
                                    max_seq=params["max_seq"])
        del w_df
        gc.collect()

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    if not load_pickle or is_debug:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group,
                                  is_test=True,
                                  n_skill=n_skill,
                                  max_seq=params["max_seq"])

    # Cache / restore the built datasets (skip cache entirely in debug mode).
    os.makedirs("../input/feature_engineering/model075", exist_ok=True)
    if not is_debug and not load_pickle:
        with open(f"../input/feature_engineering/model075/train.pickle", "wb") as f:
            pickle.dump(dataset_train, f)
        with open(f"../input/feature_engineering/model075/val.pickle", "wb") as f:
            pickle.dump(dataset_val, f)
    if not is_debug and load_pickle:
        with open(f"../input/feature_engineering/model075/train.pickle", "rb") as f:
            dataset_train = pickle.load(f)
        with open(f"../input/feature_engineering/model075/val.pickle", "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")

    dataloader_train = DataLoader(dataset_train,
                                  batch_size=params["batch_size"],
                                  shuffle=True,
                                  num_workers=1)
    dataloader_val = DataLoader(dataset_val,
                                batch_size=params["batch_size"],
                                shuffle=False,
                                num_workers=1)

    model = SAKTModel(n_skill,
                      embed_dim=params["embed_dim"],
                      max_seq=params["max_seq"],
                      dropout=dropout)

    # BERT-style weight-decay grouping: no decay on biases/LayerNorm weights.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=params["lr"],
        weight_decay=0.01,
    )
    num_train_optimization_steps = int(len(dataloader_train) * epochs)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=params["num_warmup_steps"],
        num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train,
                                              dataloader_val, optimizer,
                                              criterion, scheduler, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".
              format(epoch, loss, auc, auc_val))

    # Final validation pass: score only the last position of each sequence.
    preds = []
    labels = []
    for item in tqdm(dataloader_val):
        x = item["x"].to(device).long()
        target_id = item["target_id"].to(device).long()
        part = item["part"].to(device).long()
        label = item["label"].to(device).float()
        elapsed_time = item["elapsed_time"].to(device).long()
        duration_previous_content = item["duration_previous_content"].to(
            device).long()
        prior_question_had_explanation = item["prior_q"].to(device).long()
        user_answer = item["user_answer"].to(device).long()
        rate_diff = item["rate_diff"].to(device).float()

        output = model(x, target_id, part, elapsed_time,
                       duration_previous_content,
                       prior_question_had_explanation, user_answer, rate_diff)

        preds.extend(torch.nn.Sigmoid()(
            output[:, -1]).view(-1).data.cpu().numpy().tolist())
        labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist())

    auc_transformer = roc_auc_score(labels, preds)
    print("single transformer: {:.4f}".format(auc_transformer))

    df_oof = pd.DataFrame()
    # df_oof["row_id"] = df.loc[val_idx].index
    print(len(dataloader_val))
    print(len(preds))
    df_oof["predict"] = preds
    df_oof["target"] = labels
    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))

        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """
    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))
        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.end_run()

    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)

    # Optionally re-fit factories on the FULL data and pickle them for inference.
    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent(
                is_partial_fit=True)
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="all",
            load_feature=not is_debug,
            save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(
            column_config=column_config,
            dict_path="../feature_engineering/",
            sequence_length=params["max_seq"],
            logger=logger)
        df = pd.read_pickle(
            "../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        # Loggers are not picklable; strip them from every factory before dumping.
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)
        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(
                f"{output_dir}/feature_factory_manager_for_transformer.pickle",
                "wb") as f:
            pickle.dump(ff_for_transformer, f)
def start_run():
    """Generator-based context manager wrapping an mlflow run.

    Fix: the original called ``mlflow.end_run()`` only after a normal return
    from the ``with`` body, so an exception inside the block left the run
    open (and marked RUNNING). The ``try/finally`` guarantees the run is
    ended on both the success and the error path, then re-raises.
    """
    mlflow.start_run()
    try:
        yield
    finally:
        mlflow.end_run()
def after_pipeline_run(self) -> None:
    """Hook implementation to end the MLflow run after the Kedro pipeline finishes.

    Closes whatever run is currently active; a no-op if no run is active.
    """
    mlflow.end_run()
def on_train_end(self, args, state, control, **kwargs):
    """Trainer callback fired at the end of training.

    On the main (world-process-zero) process, once the callback has been
    initialized: optionally upload the output directory as mlflow artifacts,
    then close the active mlflow run. Other ranks do nothing.
    """
    # Only the initialized main process finalizes the run.
    if not (self._initialized and state.is_world_process_zero):
        return
    if self._log_artifacts:
        logger.info("Logging artifacts. This may take time.")
        mlflow.log_artifacts(args.output_dir)
    mlflow.end_run()
def train(self, config: ConfigurationNode = None):
    """
    Take a configuration node and train the model from it.

    Side effects: starts an mlflow run named after a timestamp, creates
    result/backup directories under ``config.OUTPUT_PATH``, dumps the merged
    config to YAML, and trains for ``TOTAL_EPOCHS`` epochs via
    ``self.train_epoch``, ending the mlflow run afterwards.

    :param config: configuration node; defaults to ``self.config``.
    :return: None
    """
    if config is None:
        config = self.config
    # Create writable timestamp for easier record keeping
    timestamp = datetime.now().isoformat(sep="T", timespec="auto")
    name_timestamp = timestamp.replace(":", "_")  # ':' is illegal in Windows paths

    # Start the mlflow run:
    mlflow.start_run(run_name=name_timestamp)

    # Check valid output path, set paths from the config respectively
    assert config.OUTPUT_PATH != ''
    path_output = config.OUTPUT_PATH  # output folder
    path_train = config.DATASET.TRAIN_DATA_PATH  # training data folder
    path_val = config.DATASET.VAL_DATA_PATH  # validation data folder

    # Make output dir and its parents if not exist.
    if not os.path.exists(path_output):
        os.makedirs(path_output)

    # Make result folders if they do not exist.
    self.results_dir = (Path(path_output) / name_timestamp)
    if not os.path.exists(self.results_dir):
        os.makedirs(self.results_dir)

    # Make backup folders if they do not exist.
    self.backup_dir = os.path.join(self.results_dir, 'model_backups')
    if not os.path.exists(self.backup_dir):
        os.makedirs(self.backup_dir)

    writer_tensorboard = SummaryWriter(log_dir=Path(self.results_dir / "logs_tensorflow"))

    # Now that CFG has been properly merged with new data along the way, time to dump a version of it into a string for trackability purposes.
    # NOTE(review): the stream handle opened here is never explicitly closed.
    config.dump(stream=open(
        os.path.join(self.results_dir, f'config{name_timestamp}.yaml'), 'w'))

    # file path to store the state of the model.
    state_fpath = os.path.join(self.results_dir, f'model{name_timestamp}.pt')

    # performance-trace path, accumulated by train_epoch — TODO confirm format
    perf_path = os.path.join(self.results_dir, f'trace{name_timestamp}.p')
    perf_trace = []

    # Load data, create the data loader objects from them.
    data_train = pickle.load(open(path_train, 'rb'))
    data_val = pickle.load(open(path_val, 'rb'))
    self.loader_train = build_data_loader(data_train, config.DATASET, True)
    self.loader_val = build_data_loader(data_val, config.DATASET, False)

    # Build the model using the config MODEL node
    self.model = build_model(config.MODEL)

    # Enable parallel multi GPU mode if the config specifies it.
    if config.MODEL.PARALLEL:
        print("Utilized parallel processing")
        self.model = torch.nn.DataParallel(self.model)

    current_epoch = 0
    # For resuming training (i.e. load checkpoint)
    if config.RESUME_PATH != "":
        checkpoint = torch.load(config.RESUME_PATH, map_location='cpu')
        current_epoch = checkpoint['epoch']
        self.model.load_state_dict(checkpoint["model_state"])
    _ = self.model.cuda()

    # SOLVER EVALUATOR
    cfg_solver = config.MODEL.SOLVER

    # Build optimizer (between train/validation, using the solver portion of the configuration.
    optimizer = build_optimizer(self.model, cfg_solver)

    # Build evaluator (between train/validation, using the solver portion of the configuration.
    evaluator = build_evaluator(cfg_solver)
    evaluator.float().cuda()

    total_epochs = cfg_solver.TOTAL_EPOCHS

    # Main training epoch loop starts here.
    for epoch in range(current_epoch, total_epochs):
        # Train a single epoch
        self.train_epoch(epoch, evaluator, optimizer, perf_path, perf_trace, state_fpath, writer_tensorboard)
    mlflow.end_run()
def run_train_cv(self) -> None:
    """Train and evaluate with cross-validation.

    Alongside training/evaluation, this also saves each fold's model and
    logs parameters/scores to mlflow (run ends at the bottom).
    """
    # mlflow
    mlflow.set_experiment(self.exp_name)
    mlflow.start_run(run_name=self.run_name)

    logger.info(f'{self.run_name} - start training cv')
    scores = []
    va_idxes = []
    preds = []

    # Adversarial validation: relabel train=0/test=1, concatenate, and let the
    # CV below try to tell them apart (distribution-shift check).
    if self.advanced and 'adversarial_validation' in self.advanced:
        X_train = self.X_train
        X_test = self.X_test
        X_train['target'] = 0
        X_test['target'] = 1
        X_train = pd.concat([X_train, X_test], sort=False).reset_index(drop=True)
        y_train = X_train['target']
        X_train.drop('target', axis=1, inplace=True)
        X_test.drop('target', axis=1, inplace=True)
        self.X_train = X_train
        self.y_train = y_train

    # Train on each fold
    for i_fold in range(self.cv.n_splits):
        # train this fold
        logger.info(f'{self.run_name} fold {i_fold} - start training')
        model, va_idx, va_pred, score = self.train_fold(i_fold)
        logger.info(
            f'{self.run_name} fold {i_fold} - end training - score {score}'
        )

        # save the fold model
        model.save_model()

        # keep the fold results
        va_idxes.append(va_idx)
        scores.append(score)
        preds.append(va_pred)

    # Aggregate per-fold results: restore original row order of OOF predictions.
    va_idxes = np.concatenate(va_idxes)
    order = np.argsort(va_idxes)
    preds = np.concatenate(preds, axis=0)
    preds = preds[order]

    # NOTE(review): an unrecognized evaluation_metric leaves cv_score unbound
    # and raises NameError at the logging call below.
    if self.evaluation_metric == 'log_loss':
        cv_score = log_loss(self.y_train, preds, eps=1e-15, normalize=True)
    elif self.evaluation_metric == 'mean_absolute_error':
        cv_score = mean_absolute_error(self.y_train, preds)
    elif self.evaluation_metric == 'rmse':
        cv_score = np.sqrt(mean_squared_error(self.y_train, preds))
    elif self.evaluation_metric == 'auc':
        cv_score = roc_auc_score(self.y_train, preds)
    elif self.evaluation_metric == 'prauc':
        cv_score = average_precision_score(self.y_train, preds)

    logger.info(f'{self.run_name} - end training cv - score {cv_score}')

    # save OOF predictions
    Data.dump(preds, f'../output/pred/{self.run_name}-train.pkl')

    # mlflow
    self.run_id = mlflow.active_run().info.run_id
    # str(cls) looks like "<class 'pkg.Model'>"; slice extracts the class name.
    log_param('model_name', str(self.model_cls).split('.')[-1][:-2])
    log_param('fe_name', self.fe_name)
    log_param('train_params', self.params)
    log_param('cv_strategy', str(self.cv))
    log_param('evaluation_metric', self.evaluation_metric)
    log_metric('cv_score', cv_score)
    log_param(
        'fold_scores',
        dict(
            zip([f'fold_{i}' for i in range(len(scores))],
                [round(s, 4) for s in scores])))
    log_param('cols_definition', self.cols_definition)
    log_param('description', self.description)
    mlflow.end_run()
def main(args: DictConfig):
    """Run a SAGE feature-importance experiment over a data-generating DAG.

    For every variable in the DAG, fits the configured predictive models on
    all remaining variables, computes the configured risks on a test split,
    then estimates per-feature SAGE values with shared random orderings.
    Everything (params, per-target metrics, csv/png artifacts) is tracked in
    mlflow; existing runs with the same config hash are skipped.

    :param args: hydra/OmegaConf config with ``data``, ``exp``, ``estimator``
        and ``predictors`` sections.
    """
    # Non-strict access to fields
    OmegaConf.set_struct(args, False)
    args.exp.pop('rfi')  # rfi settings are irrelevant for the sage experiment

    # Adding default estimator params (introspect __init__ defaults so they
    # are recorded in the tracked config).
    default_names, _, _, default_values, _, _, _ = \
        inspect.getfullargspec(instantiate(args.estimator, context_size=0).__class__.__init__)
    if default_values is not None:
        args.estimator['defaults'] = {
            n: str(v)
            for (n, v) in zip(
                default_names[len(default_names) - len(default_values):],
                default_values)
        }
    logger.info(OmegaConf.to_yaml(args, resolve=True))

    # Data-generating DAG
    data_path = hydra.utils.to_absolute_path(
        f'{ROOT_PATH}/{args.data.relative_path}')
    exp_name = args.data.relative_path.split('/')[-1]
    adjacency_matrix = np.load(
        f'{data_path}/DAG{args.data.sample_ind}.npy').astype(int)
    # sachs_2005 ships real variable names; otherwise use x0..x{n-1}.
    if exp_name == 'sachs_2005':
        var_names = np.load(f'{data_path}/sachs-header.npy')
    else:
        var_names = [f'x{i}' for i in range(len(adjacency_matrix))]
    dag = DirectedAcyclicGraph(adjacency_matrix, var_names)

    # Experiment tracking
    exp_name = f'sage/{exp_name}'
    mlflow.set_tracking_uri(args.exp.mlflow_uri)
    mlflow.set_experiment(exp_name)

    # Checking if run exist
    if check_existing_hash(args, exp_name):
        logger.info('Skipping existing run.')
        return
    else:
        logger.info('No runs found - perfoming one.')

    # Loading Train-test data
    data = np.load(f'{data_path}/data{args.data.sample_ind}.npy')
    if args.data.standard_normalize:
        if 'normalise_params' in args.data:
            standard_normalizer = StandardScaler(**args.data.normalise_params)
        else:
            standard_normalizer = StandardScaler()
        data = standard_normalizer.fit_transform(data)
    data_train, data_test = train_test_split(data,
                                             test_size=args.data.test_ratio,
                                             random_state=args.data.split_seed)
    train_df = pd.DataFrame(data_train, columns=dag.var_names)
    test_df = pd.DataFrame(data_test, columns=dag.var_names)

    mlflow.start_run()
    mlflow.log_params(flatten_dict(args))
    mlflow.log_param('data_generator/dag/n', len(var_names))
    mlflow.log_param('data_generator/dag/m', int(adjacency_matrix.sum()))
    mlflow.log_param('data/n_train', len(train_df))
    mlflow.log_param('data/n_test', len(test_df))

    # Saving artifacts
    # NOTE(review): writing directly into get_artifact_uri() assumes a local
    # file-based artifact store — confirm when pointing at a remote server.
    train_df.to_csv(
        hydra.utils.to_absolute_path(f'{mlflow.get_artifact_uri()}/train.csv'),
        index=False)
    test_df.to_csv(
        hydra.utils.to_absolute_path(f'{mlflow.get_artifact_uri()}/test.csv'),
        index=False)
    dag.plot_dag()
    plt.savefig(
        hydra.utils.to_absolute_path(f'{mlflow.get_artifact_uri()}/dag.png'))

    mlflow.log_param('features_sequence', str(list(dag.var_names)))

    # One pass per target variable; metrics logged with step=var_ind.
    for var_ind, target_var in enumerate(dag.var_names):
        var_results = {}

        # Considering all the variables for input
        input_vars = [var for var in dag.var_names if var != target_var]
        y_train, X_train = train_df.loc[:, target_var], train_df.loc[:, input_vars]
        y_test, X_test = test_df.loc[:, target_var], test_df.loc[:, input_vars]

        # Initialising risks (resolved by name from sklearn.metrics)
        risks = {}
        for risk in args.predictors.risks:
            risks[risk] = getattr(importlib.import_module('sklearn.metrics'), risk)

        # Fitting predictive model
        models = {}
        for pred_model in args.predictors.pred_models:
            logger.info(
                f'Fitting {pred_model._target_} for target = {target_var} and inputs {input_vars}'
            )
            model = instantiate(pred_model)
            model.fit(X_train.values, y_train.values)
            y_pred = model.predict(X_test.values)
            models[pred_model._target_] = model
            for risk, risk_func in risks.items():
                var_results[f'test_{risk}_{pred_model._target_}'] = risk_func(
                    y_test.values, y_pred)

        # =================== Global SAGE ===================
        logger.info(f'Analysing the importance of features: {input_vars}')
        sampler = instantiate(args.estimator.sampler,
                              X_train=X_train,
                              fit_method=args.estimator.fit_method,
                              fit_params=args.estimator.fit_params)
        log_lik = []
        # model/loss are set per-iteration below, hence None here.
        sage_explainer = explainer.Explainer(None,
                                             input_vars,
                                             X_train,
                                             sampler=sampler,
                                             loss=None)

        # Generating the same orderings across all the models and losses
        np.random.seed(args.exp.sage.orderings_seed)
        fixed_orderings = [
            np.random.permutation(input_vars)
            for _ in range(args.exp.sage.nr_orderings)
        ]

        for model_name, model in models.items():
            for risk, risk_func in risks.items():
                sage_explainer.model = model.predict
                explanation, test_log_lik = sage_explainer.sage(
                    X_test,
                    y_test,
                    loss=risk_func,
                    fixed_orderings=fixed_orderings,
                    nr_runs=args.exp.sage.nr_runs,
                    return_test_log_lik=True,
                    nr_resample_marginalize=args.exp.sage.
                    nr_resample_marginalize)
                log_lik.extend(test_log_lik)
                fi = explanation.fi_vals().mean()
                for fsoi, input_var in enumerate(input_vars):
                    var_results[
                        f'sage/mean_{risk}_{model_name}_{input_var}'] = fi[
                            input_var]

        var_results['sage/mean_log_lik'] = np.mean(log_lik)
        var_results['sage/num_fitted_estimators'] = len(log_lik)

        mlflow.log_metrics(var_results, step=var_ind)

    mlflow.end_run()
tensorboard=tensorboard, valid_graph_path=valid_graph_path, valid_html_auto_open=valid_html_auto_open, using_mlflow=using_mlflow, # valid dataset 그리기 decode_number=decode_number, multiperclass=multiperclass, nms_thresh=nms_thresh, nms_topk=nms_topk, iou_thresh=iou_thresh, except_class_thresh=except_class_thresh, plot_class_thresh=plot_class_thresh) if using_mlflow: ml.end_run() else: test.run(mean=image_mean, std=image_std, load_name=load_name, load_period=load_period, GPU_COUNT=GPU_COUNT, test_weight_path=test_weight_path, test_dataset_path=test_dataset_path, num_workers=num_workers, test_save_path=test_save_path, test_graph_path=test_graph_path, test_html_auto_open=test_html_auto_open, foreground_iou_thresh=foreground_iou_thresh, background_iou_thresh=background_iou_thresh, show_flag=show_flag, save_flag=save_flag, # test dataset 그리기 decode_number=decode_number,
def train_nn_cv(df: pd.DataFrame,
                model,
                params: dict,
                output_dir: str,
                model_id: int,
                exp_name: str,
                drop_user_id: bool,
                experiment_id: int = 0,
                is_debug: bool = False):
    """Train a Keras model on a per-user holdout split and write OOF predictions.

    :param df: input frame with an "answered_correctly" target and "user_id".
    :param model: compiled Keras model (fit/predict interface).
    :param params: hyper-parameters; logged to mlflow only.
    :param output_dir: where the best weights, feature list and OOF csv go.
    :param model_id: suffix used in output filenames.
    :param exp_name: mlflow run name.
    :param drop_user_id: if True, exclude "user_id" from the feature set.
    :param experiment_id: mlflow experiment id.
    :param is_debug: 3 epochs and no mlflow logging when True.
    """
    if not is_debug:
        mlflow.start_run(experiment_id=experiment_id, run_name=exp_name)
        mlflow.log_param("model_id", model_id)
        mlflow.log_param("count_row", len(df))
        mlflow.log_param("count_column", len(df.columns))
        for key, value in params.items():
            mlflow.log_param(key, value)

    if drop_user_id:
        features = [x for x in df.columns if x not in ["answered_correctly", "user_id"]]
    else:
        features = [x for x in df.columns if x not in ["answered_correctly"]]

    # NOTE(review): df_imp is built but never used afterwards — dead code?
    df_imp = pd.DataFrame()
    df_imp["feature"] = features

    # Per-user split: ~1% of users entirely to validation; for the rest the
    # last 5% of each user's rows are held out.
    train_idx = []
    val_idx = []
    np.random.seed(0)
    for _, w_df in df.groupby("user_id"):
        if np.random.random() < 0.01:
            # all val
            val_idx.extend(w_df.index.tolist())
        else:
            train_num = int(len(w_df) * 0.95)
            train_idx.extend(w_df[:train_num].index.tolist())
            val_idx.extend(w_df[train_num:].index.tolist())

    if is_debug:
        epochs = 3
    else:
        epochs = 1000  # effectively "until early stopping" (patience=5)

    model.fit(df[features].iloc[train_idx].values,
              df["answered_correctly"].iloc[train_idx].values.reshape(-1, 1),
              batch_size=2**17,
              epochs=epochs,
              verbose=True,
              validation_data=(df[features].iloc[val_idx].values,
                               df["answered_correctly"].iloc[val_idx].values.reshape(-1, 1)),
              callbacks=[EarlyStopping(monitor='val_loss',
                                       min_delta=0,
                                       patience=5,
                                       verbose=-1,
                                       mode='auto'),
                         ReduceLROnPlateau(monitor='val_loss',
                                           factor=0.25,
                                           patience=3,
                                           verbose=-1,
                                           mode='auto',
                                           epsilon=0.0001,
                                           cooldown=0,
                                           min_lr=0),
                         ModelCheckpoint(filepath=f"{output_dir}/best_nn_{model_id}.weight",
                                         monitor='val_loss',
                                         verbose=-1,
                                         save_best_only=True,
                                         mode='auto')])

    # Reload the best checkpoint rather than the last-epoch weights.
    model = load_model(f"{output_dir}/best_nn_{model_id}.weight")
    pd.DataFrame(features, columns=["feature"]).to_csv(f"{output_dir}/nn_use_feature.csv", index=False)

    y_train = model.predict(df.iloc[train_idx][features])
    y_oof = model.predict(df.iloc[val_idx][features])

    auc_train = roc_auc_score(df.iloc[train_idx]["answered_correctly"].values.flatten(),
                              y_train.flatten())
    auc_val = roc_auc_score(df.iloc[val_idx]["answered_correctly"].values.flatten(),
                            y_oof.flatten())
    print(f"auc_train: {auc_train}, auc_val: {auc_val}")

    if not is_debug:
        mlflow.log_metric("auc_train", auc_train)
        mlflow.log_metric("auc_val", auc_val)
        mlflow.end_run()

    # Persist OOF predictions for downstream ensembling.
    df_oof = pd.DataFrame()
    df_oof["row_id"] = df.iloc[val_idx].index
    df_oof["predict"] = y_oof
    df_oof["target"] = df.iloc[val_idx]["answered_correctly"].values
    df_oof.to_csv(f"{output_dir}/oof_{model_id}_nn.csv", index=False)
def train_model_mlflow(model: tf.keras.models.Model,
                       train_gen: DataGenerator,
                       validation_gen: DataGenerator,
                       epochs=10,
                       steps=4000,
                       mlflow_server='http://0.0.0.0:8643',
                       checkpoints_path="checkpoints/run3"):
    """Custom TF training loop for the emotion/gender/age model with MLflow tracking.

    Runs `epochs * steps` gradient steps with Adadelta, logging the three task
    losses per step and a validation loss every 500 steps, and saves a Keras
    checkpoint per epoch under "checkpoints/".

    Args:
        model: multi-head model returning (emotion, gender, age) predictions.
        train_gen / validation_gen: generators whose .get_data() yields
            (images, y_emotion, y_gender, y_age) batches.
        epochs: number of epochs.
        steps: gradient steps per epoch.
        mlflow_server: tracking server host/URL; falsy -> local "mlruns" dir.
        checkpoints_path: currently unused — checkpoints go to "checkpoints/"
            (kept for interface compatibility).
    """
    # Configure output_dir (scratch space; not currently written to).
    output_dir = tempfile.mkdtemp()
    if mlflow_server:
        # Build the tracking URI, defaulting to port 5000 for bare hostnames.
        if not mlflow_server.startswith("http"):
            mlflow_tracking_uri = 'http://' + mlflow_server + ':5000'
        else:
            mlflow_tracking_uri = mlflow_server
        mlflow.set_tracking_uri(mlflow_tracking_uri)
        print("MLflow Tracking URI: %s" % mlflow_tracking_uri)
    else:
        print("MLflow Tracking URI: %s" % "local directory 'mlruns'")

    mlflow.set_experiment("/face-age-emotion-gender-detector")
    with mlflow.start_run():
        model_dir = "models/" + str(mlflow.active_run().info.run_uuid)
        mlflow.log_param('Epochs', str(epochs))
        mlflow.log_param('Steps', str(steps))
        # BUGFIX: Model.summary() prints to stdout and returns None, so the
        # old `str(model.summary())` logged the literal string "None".
        # Capture the summary text through the print_fn hook instead.
        summary_lines = []
        model.summary(print_fn=summary_lines.append)
        mlflow.log_param('model', "\n".join(summary_lines))
        mlflow.keras.log_model(model, 'models')
        tf.saved_model.save(model, model_dir)
        mlflow.log_artifact('./' + model_dir + "/saved_model.pb")
        mlflow.log_artifacts('./' + model_dir, artifact_path='models')

        optimizer = tf.keras.optimizers.Adadelta()
        global_steps = 0
        validation_steps = 0
        for epoch in range(epochs):
            for step in range(steps):
                global_steps += 1
                image_data, target_y_e, target_y_g, target_y_a = next(
                    train_gen.get_data())
                with tf.GradientTape() as tape:
                    pred_y_e, pred_y_g, pred_y_a = model(image_data)
                    l_e, l_g, l_a = compute_loss(pred_y_e, pred_y_g, pred_y_a,
                                                 target_y_e, target_y_g, target_y_a)
                    total_loss = l_e + l_g + l_a
                gradients = tape.gradient(total_loss, model.trainable_variables)
                optimizer.apply_gradients(
                    zip(gradients, model.trainable_variables))
                print("=> epoch %d step %d train_loss: %.6f" %
                      (epoch + 1, step + 1, total_loss.numpy()))
                mlflow.log_metric("train/total_loss", total_loss.numpy(),
                                  step=global_steps)
                mlflow.log_metric("train/emotion_loss", l_e.numpy(),
                                  step=global_steps)
                mlflow.log_metric("train/gender_loss", l_g.numpy(),
                                  step=global_steps)
                # BUGFIX: log a Python scalar like the sibling metrics
                # (previously passed the raw tensor `l_a`).
                mlflow.log_metric("train/age_loss", l_a.numpy(),
                                  step=global_steps)
                # validation step
                if step % 500 == 0:
                    validation_steps += 1
                    image_data, target_y_e, target_y_g, target_y_a = next(
                        validation_gen.get_data())
                    pred_y_e, pred_y_g, pred_y_a = model(image_data)
                    l_e, l_g, l_a = compute_loss(pred_y_e, pred_y_g, pred_y_a,
                                                 target_y_e, target_y_g, target_y_a)
                    total_valid_loss = l_e + l_g + l_a
                    mlflow.log_metric("valid_loss", total_valid_loss.numpy(),
                                      step=validation_steps)
            # Per-epoch checkpoint, named by wall-clock time.
            mk_dir("checkpoints/")
            p_loss = int(round(total_loss.numpy(), 2) * 100)
            print(f"EGA_epoch_{epoch}_score_{p_loss}")
            # model.save(f"EGA_epoch_{epoch}_score_{p_loss}")
            # model.save_weights(f"EGA_epoch_{epoch}_score_{p_loss}.h5")
            mlflow.keras.save_model(model, "checkpoints/" + str(int(time.time())))
        mlflow.log_artifacts("checkpoints/")
        mlflow.end_run()
def main(runname, expstatslog, mlflowlog, earlystop):
    """Run a grid of explicit-drift-detection experiments, one per parameter set.

    Args:
        runname: label shared by every experiment in this execution.
        expstatslog: when True, append status lines to EXP_STATUS instead of stdout.
        mlflowlog: when False, replace the module-level `mlflow` with a no-op
            `dumbflow()` stand-in so all mlflow.* calls become harmless.
        earlystop: forwarded to runExperiment (sample-count cutoff).
    """
    if mlflowlog:
        pass
    else:
        # Disable MLflow globally for this module by swapping the name.
        global mlflow
        mlflow = dumbflow()
    if expstatslog:
        exp_status_write = open(EXP_STATUS, "a")
    else:
        exp_status_write = sys.stdout
    exp_status_write.write("\n\n\n\n")
    exp_status_write.write("--------------------------")
    exp_status_write.write(" BEGINNING NEW EXECUTION (" + runname + ") AT " +
                           str(time_utils.readable_time("%Y-%m-%d %H:%M:%S")))
    exp_status_write.write(" ------------------------" + "\n\n")
    # We are tracking drift adaptivity
    # namely labeled drift detection
    # Set up explicit drift detection params.
    # Each value is a (config_value, short_label) pair; the labels are
    # concatenated to form the experiment name.
    explicit_drift_param_grid = {
        "allow_explicit_drift": [(True, "ExpDr")],
        "explicit_drift_class": [("LabeledDriftDetector", "LDD")],
        "explicit_drift_mode": [("PageHinkley", "PageHinkley"), ("ADWIN", "ADWIN"),
                                ("EDDM", "EDDM"), ("DDM", "DDM")],
        "explicit_update_mode": [("all", "A"), ("errors", "E")],
        "allow_unlabeled_drift": [(False, "")],
        "allow_update_schedule": [(False, "")],
        "weight_method": [("unweighted", "U"), ("performance", "P")],
        "select_method": [("recent", "RR"), ("recent-new", "RN"),
                          ("recent-updates", "RU")],
        "filter_method": [("no-filter", "F"), ("top-k", "T"), ("nearest", "N")],
        "kval": [(5, "5"), (10, "10")]
    }
    explicit_drift_params = ParameterGrid(explicit_drift_param_grid)

    for param_set in explicit_drift_params:
        # This is an experiment
        # NOTE(review): "all" update mode is skipped entirely here, which makes
        # the ("all", "A") grid entry dead — presumably intentional pruning.
        if param_set["explicit_update_mode"][0] == "all":
            continue
        # Load up configuration file (fresh copy per experiment).
        mlepConfig = io_utils.load_json('./MLEPServer.json')

        # Update config file and generate an experiment name from the labels.
        experiment_name = ''
        for _param in param_set:
            if param_set[_param][1] != "":
                experiment_name += param_set[_param][1] + '-'
            mlepConfig["config"][_param] = param_set[_param][0]
        experiment_name = experiment_name[:-1]  # drop trailing '-'

        # Now we have the Experimental Config we can use for running an experiment
        exp_status_write.write("--STATUS-- " + experiment_name + " ")
        exp_status_write.flush()
        try:
            runExperiment(runname, mlepConfig, experiment_name, expstatslog,
                          earlystop)
            exp_status_write.write("SUCCESS\n")
        except Exception as e:
            # Best-effort: record the failure and keep going with the grid.
            exp_status_write.write("FAILED\n")
            exp_status_write.write(traceback.format_exc())
            exp_status_write.write(str(e))
            exp_status_write.write("\n")
            exp_status_write.flush()
            mlflow.end_run()
        exp_status_write.flush()
    exp_status_write.write("\n\n")
    exp_status_write.write("--------------------------")
    exp_status_write.write(" FINISHED EXECUTION OF (" + runname + ") AT " +
                           str(time_utils.readable_time("%Y-%m-%d %H:%M:%S")))
    exp_status_write.write(" ------------------------" + "\n\n")
    # NOTE(review): this closes sys.stdout when expstatslog is False — confirm
    # that is acceptable for callers.
    exp_status_write.close()
def train_model(args, base_line=True):
    ''' Train model function.

    Builds a single-Conv2D Keras classifier from directory image data, trains
    it, saves loss/accuracy plots, and logs params/metrics/artifacts/model to
    MLflow. Relies on module-level globals: image_dir, validation_split,
    TRAIN_DATA_DIR, TRAIN_IMAGE_SIZE, TRAIN_BATCH_SIZE, img_width, img_height,
    img_num_channels, output_dir.

    Args:
        args: namespace with filters, kernel_size, output, epochs, loss,
            optimizer (and experiment_name, currently unused).
        base_line: selects "Baseline" vs "Experimental" plot labels/filenames.
    '''
    graph_label_loss = 'Baseline Model: Training and Validation Loss'
    graph_label_acc = 'Baseline Model: Training and Validation Accuracy'
    graph_image_loss_png = os.path.join(image_dir, 'baseline_loss.png')
    graph_image_acc_png = os.path.join(image_dir, 'baseline_accuracy.png')
    if not base_line:
        graph_label_loss = 'Experimental: Training and Validation Loss'
        graph_label_acc = 'Experimental Model: Training and Validation Accuracy'
        graph_image_loss_png = os.path.join(image_dir, 'experimental_loss.png')
        graph_image_acc_png = os.path.join(image_dir, 'experimental_accuracy.png')

    # One generator, split into train/validation subsets.
    image_data_generator = tf.keras.preprocessing.image.ImageDataGenerator(
        validation_split=validation_split)
    train_generator = image_data_generator.flow_from_directory(
        TRAIN_DATA_DIR,
        target_size=(TRAIN_IMAGE_SIZE, TRAIN_IMAGE_SIZE),
        batch_size=TRAIN_BATCH_SIZE,
        class_mode='categorical',
        subset='training')
    validation_generator = image_data_generator.flow_from_directory(
        TRAIN_DATA_DIR,
        target_size=(TRAIN_IMAGE_SIZE, TRAIN_IMAGE_SIZE),
        batch_size=TRAIN_BATCH_SIZE,
        class_mode='categorical',
        subset='validation')

    # Create the model: Conv2D -> Flatten -> softmax head.
    model = Sequential()
    model.add(
        Conv2D(args.filters,
               kernel_size=args.kernel_size,
               activation='relu',
               padding='same',
               input_shape=(img_width, img_height, img_num_channels)))
    model.add(Flatten())
    model.add(Dense(args.output, activation='softmax'))

    # Compile the model
    model.compile(loss=args.loss,
                  optimizer=args.optimizer,
                  metrics=['accuracy'])
    # NOTE(review): fit_generator is deprecated in modern Keras in favor of
    # model.fit — kept as-is for compatibility with the pinned TF version.
    history = model.fit_generator(train_generator,
                                  epochs=args.epochs,
                                  validation_data=validation_generator)
    model.summary()
    print_metrics(history)

    figure_loss = plot_loss_graph(history, graph_label_loss)
    figure_loss.savefig(graph_image_loss_png)
    figure_acc = plot_accuracy_graph(history, graph_label_acc)
    figure_acc.savefig(graph_image_acc_png)

    # print('==================================================')
    # predictions = model.predict(TEST_DATA_DIR)
    # print(predictions)
    #
    print('==================================================')
    #mlflow.set_experiment(args.experiment_name)
    with mlflow.start_run():
        # print out current run_uuid
        run_uuid = mlflow.active_run().info.run_uuid
        print("MLflow Run ID: %s" % run_uuid)
        # mlflow.create_experiment("Training CNN Model", artifact_location=None)

        # log parameters
        mlflow.log_param("Filters", args.filters)
        mlflow.log_param("Kernel Size", args.kernel_size)
        mlflow.log_param("Output", args.output)
        mlflow.log_param("Epochs", args.epochs)
        mlflow.log_param("Loss", args.loss)
        mlflow.log_param("Optimize", args.optimizer)

        # calculate metrics
        binary_loss = get_binary_loss(history)
        binary_acc = get_binary_acc(history)
        validation_loss = get_validation_loss(history)
        validation_acc = get_validation_acc(history)

        # log metrics
        mlflow.log_metric("binary_loss", binary_loss)
        mlflow.log_metric("binary_acc", binary_acc)
        mlflow.log_metric("validation_loss", validation_loss)
        mlflow.log_metric("validation_acc", validation_acc)

        # log artifacts
        mlflow.log_artifacts(image_dir, "images")

        # log model
        mlflow.keras.log_model(model, "models")

        # save model locally (currently disabled; pathdir is unused)
        pathdir = "../data/out/keras_models/" + run_uuid
        # keras_save_model(model, pathdir)

        # Write out TensorFlow events as a run artifact
        print("Uploading TensorFlow events as a run artifact.")
        mlflow.log_artifacts(output_dir, artifact_path="events")
    mlflow.end_run()
def reset_mlflow(self):
    """Terminate whatever MLflow run is currently active (no-op if none)."""
    mlflow.end_run()
def close(self, mlflow=False):
    """Flush and close every registered writer.

    Args:
        mlflow: when True, additionally end the active MLflow run through
            the ``module_mlflow`` alias.
    """
    # Only the writer objects matter here, not their prefixes.
    for writer in self.writers.values():
        writer.flush()
        writer.close()
    if mlflow:
        module_mlflow.end_run()
def main():
    """Train a multiclass classifier (Bayesian-tuned LGBM, or a bagged
    HistGradientBoosting model when MODE == "INDIVIDUAL") and log everything
    to MLflow.

    Uses cached pickles (X_train.pkl etc.) when present, otherwise runs
    preprocess_data and persists the split. Relies on module-level globals:
    NAME, MODE, N_ITER, cv, f2_scorer, lgb_f1_score.
    """
    mlflow.start_run(run_name=NAME)
    if "X_train.pkl" not in os.listdir():
        # No cached split: preprocess from scratch and persist everything.
        print("procesando los datos")
        X, y, encoder = preprocess_data("TOTAL_TRAIN.csv", process_cat=False)
        print(X.shape)
        with open(f"label_encoder_{NAME}.pkl", "wb") as f:
            pickle.dump(encoder, f)
        print(
            f"##################### The shape of X is {X.shape} #######################"
        )
        y = y.astype("int")
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.15,
                                                            random_state=15,
                                                            stratify=y)
        with open("X_train.pkl", "wb") as f:
            pickle.dump(X_train, f)
        with open("X_test.pkl", "wb") as f:
            pickle.dump(X_test, f)
        with open("y_train.pkl", "wb") as f:
            pickle.dump(y_train, f)
        with open("y_test.pkl", "wb") as f:
            pickle.dump(y_test, f)
        print(X_train.shape)
    else:
        # Reuse the cached split from a previous run.
        with open("X_train.pkl", "rb") as f:
            X_train = pickle.load(f)
        with open("X_test.pkl", "rb") as f:
            X_test = pickle.load(f)
        with open("y_train.pkl", "rb") as f:
            y_train = pickle.load(f)
        with open("y_test.pkl", "rb") as f:
            y_test = pickle.load(f)
        # NOTE(review): hard-coded encoder filename here (XGB1704) instead of
        # f"label_encoder_{NAME}.pkl" — confirm this cross-run reuse is wanted.
        with open(f"label_encoder_XGB1704.pkl", "rb") as f:
            encoder = pickle.load(f)
    print("######### ajustando cat encoder ############")
    cols_cat = ["ruido", "CODIGO_POSTAL", "ZONA_METROPOLITANA", "CALIDAD_AIRE"]
    cols_float = [col for col in X_train.columns if col not in cols_cat]
    X_train[cols_float] = X_train[cols_float].astype("float")
    X_test[cols_float] = X_test[cols_float].astype("float")
    labs_names = [c for c in encoder.classes_]

    model = LGBMClassifier(
        class_weight="balanced",
        objective="multiclass:softmax",
        n_jobs=-1,
        random_state=100,
        silent=True,
    )
    if MODE != "INDIVIDUAL":
        # Bayesian hyperparameter search space (skopt format).
        params = {
            "reg_alpha": (1e-3, 5.0, "log-uniform"),
            "reg_lambda": (1e-2, 50.0, "log-uniform"),
            "n_estimators": (600, 4500),
            "learning_rate": (5e-3, 1.0, "log-uniform"),
            "num_leaves": (20, 80),
            "boosting_type": ["gbdt", "goss"],
            "colsample_bytree": (0.1, 1.0, "uniform"),
            "subsample": (0.1, 1.0, "uniform"),
            "min_child_samples": (1, 25),
            "min_child_weight": (1e-6, 0.1, "log-uniform"),
        }
        print(params)
        cb = CatBoostEncoder(cols=cols_cat)
        X_train = cb.fit_transform(X_train, y_train)
        X_test = cb.transform(X_test)
        fit_params = {
            ### fit params ###
            "eval_set": [(X_test, y_test)],
            "eval_metric": lgb_f1_score,
            "early_stopping_rounds": 300,
        }
        # NOTE(review): this pipeline is built but never used — the search
        # below wraps `model` directly on already-encoded data.
        pipeline = Pipeline(steps=[("clas_encoder", CatBoostEncoder(
            cols=cols_cat)), ("model", model)])
        best_model = BayesSearchCV(
            model,
            params,
            n_iter=N_ITER,
            n_points=1,
            cv=cv,
            scoring=f2_scorer,
            random_state=100,
            optimizer_kwargs={"n_initial_points": 10},
            fit_params=fit_params,
        )

    def on_step(optim_result):
        """BayesSearchCV callback: dump intermediate CV results and stop
        early once the best score reaches 0.98."""
        score = best_model.best_score_
        results = best_model.cv_results_
        try:
            results_df = pd.DataFrame(results)
            results_df.to_csv(f"results_{NAME}.csv", header=True, index=False)
            print(
                f"############ Llevamos {results_df.shape[0]} pruebas #################"
            )
            print(f"los resultados del cv de momento son {results_df}")
        except:
            print("Unable to convert cv results to pandas dataframe")
        mlflow.log_metric("best_score", score)
        with open(f"./best_{NAME}_params.pkl", "wb") as f:
            pickle.dump(best_model.best_params_, f)
        print("best score: %s" % score)
        if score >= 0.98:
            print("Interrupting!")
            return True  # returning True tells skopt to stop the search

    print("ajustando modelo")
    if MODE != "INDIVIDUAL":
        print(X_train.dtypes)
        best_model.fit(X_train, y_train, callback=[on_step])
        with open(f"./best_{NAME}_model.pkl", "wb") as f:
            pickle.dump(best_model, f)
        preds = best_model.predict(X_test)
    else:
        # Single hand-tuned model path: bagged HistGradientBoosting with
        # class-5 undersampling.
        if NAME not in os.listdir():
            os.mkdir(NAME)
        cat_encoder = CatBoostEncoder(cols=cols_cat)
        X_train = cat_encoder.fit_transform(X_train, y_train)
        X_test = cat_encoder.transform(X_test)
        best_model = BalancedBaggingClassifier(
            base_estimator=HistGradientBoostingClassifier(
                max_iter=3000,
                random_state=42,
                learning_rate=0.1,
                max_leaf_nodes=54,
                min_samples_leaf=2,
                scoring=f2_scorer,
                validation_fraction=0.1,
                n_iter_no_change=50,
            ),
            n_estimators=5,
            random_state=42,
            n_jobs=-1,
            max_features=0.7,
            sampling_strategy={5: int(dict(Counter(y_train))[5] * 0.11)},
        )
        best_model.fit(X_train, y_train)
        preds = best_model.predict(X_test)
    print(
        f'F1 SCORE IS {f1_score(y_test, preds, average="macro")}, precision is {precision_score(y_test, preds, average="macro")}, recall is {recall_score(y_test, preds, average="macro")}, accuracy is {accuracy_score(y_test, preds)}'
    )
    print(
        f"F2 SCORE IS {fbeta_score(y_test, preds, average='macro', beta=2)}"
    )
    # NOTE(review): this line prints beta=2 under the "F05" label — the logged
    # mlflow "f05" metric below uses the correct beta=0.5.
    print(
        f"F05 SCORE IS {fbeta_score(y_test, preds, average='macro', beta=2)}"
    )
    cm = confusion_matrix(y_test, preds)
    grafico_conf_matrix = print_confusion_matrix(cm, class_names=labs_names)
    grafico_conf_matrix.savefig(f"{NAME}/norm_NO_PIPELINE")
    with open(f"best_model_{NAME}.pkl", "wb") as f:
        pickle.dump(best_model, f)
    print("loggeando movidas")
    mlflow.log_metrics(
        metrics={
            "f1": f1_score(y_test, preds, average="macro"),
            "precision": precision_score(y_test, preds, average="macro"),
            "recall": recall_score(y_test, preds, average="macro"),
            "accuracy": accuracy_score(y_test, preds),
            "f05": fbeta_score(y_test, preds, beta=0.5, average="macro"),
            "f2": fbeta_score(y_test, preds, beta=2, average="macro"),
        })
    if MODE != "INDIVIDUAL":
        best_params = best_model.best_params_
        for param in best_params.keys():
            mlflow.log_param(param, best_params[param])
    cm = confusion_matrix(y_test, preds)
    grafico_conf_matrix = print_confusion_matrix(cm, class_names=labs_names)
    grafico_conf_matrix.savefig(NAME)
    grafico_norm = print_confusion_matrix(cm,
                                          class_names=labs_names,
                                          normalize=False)
    grafico_norm.savefig(f"{NAME}_no_norm")
    mlflow.end_run()
def main_worker(gpu, ngpus_per_node, config):
    """Per-process MoCo training worker (one process per GPU).

    Sets up the distributed process group, wraps the model in
    DistributedDataParallel, optionally resumes from a checkpoint, builds the
    augmentation pipeline and data loader, logs hyperparameters to MLflow from
    the rank-0 process, and runs the epoch loop with periodic checkpointing.

    Args:
        gpu: local GPU index assigned to this process (or None).
        ngpus_per_node: number of GPUs on this node (used to derive global rank
            and to split batch_size/workers across processes).
        config: mutable dict of run settings; 'gpu', 'rank', 'batch_size',
            'workers' and 'start_epoch' are updated in place.
    """
    config['gpu'] = gpu

    # suppress printing if not master process
    if config['multiprocessing_distributed'] and config['gpu'] != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass

    if config['gpu'] is not None:
        print("Use GPU: {} for training".format(config['gpu']))

    if config['distributed']:
        if config['dist_url'] == "env://" and config['rank'] == -1:
            config['rank'] = int(os.environ["RANK"])
        if config['multiprocessing_distributed']:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            config['rank'] = config['rank'] * ngpus_per_node + gpu
        dist.init_process_group(backend=config['dist_backend'],
                                init_method=config['dist_url'],
                                world_size=config['world_size'],
                                rank=config['rank'])

    print("=> creating model '{}'".format(config['arch']))
    #hardcoding the resnet50 for the time being
    model = builder.MoCo(resnet50, config['moco_dim'], config['moco_k'],
                         config['moco_m'], config['moco_t'], config['mlp'])

    if config['distributed']:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if config['gpu'] is not None:
            torch.cuda.set_device(config['gpu'])
            model.cuda(config['gpu'])
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            config['batch_size'] = int(config['batch_size'] / ngpus_per_node)
            config['workers'] = int(
                (config['workers'] + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[config['gpu']])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif config['gpu'] is not None:
        torch.cuda.set_device(config['gpu'])
        model = model.cuda(config['gpu'])
        # comment out the following line for debugging
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    criterion = nn.CrossEntropyLoss().cuda(config['gpu'])
    optimizer = torch.optim.SGD(model.parameters(),
                                config['lr'],
                                momentum=config['momentum'],
                                weight_decay=config['weight_decay'])

    #set the start_epoch, overwritten if resuming
    config['start_epoch'] = 0

    # optionally resume from a checkpoint
    if config['resume']:
        if os.path.isfile(config['resume']):
            print("=> loading checkpoint '{}'".format(config['resume']))
            if config['gpu'] is None:
                checkpoint = torch.load(config['resume'])
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(config['gpu'])
                checkpoint = torch.load(config['resume'], map_location=loc)
            config['start_epoch'] = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                config['resume'], checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(config['resume']))

    cudnn.benchmark = True

    #get the mean and standard deviation pixels from config
    #and wrap them in lists for tf.Normalize to work
    norms = config['norms']
    mean_pixel = norms['mean']
    std_pixel = norms['std']
    normalize = tf.Normalize(mean=[mean_pixel], std=[std_pixel])

    #for now, these augmentations are hardcoded. torchvision
    #isn't as easy to work with as albumentations
    augmentation = tf.Compose([
        tf.Grayscale(3),
        tf.RandomApply([tf.RandomRotation(180)], p=0.5),
        tf.RandomResizedCrop(224, scale=(0.2, 1.)),
        tf.ColorJitter(0.4, 0.4, 0.4, 0.1),
        tf.RandomApply([GaussianBlur([.1, 2.])], p=0.5),
        tf.Grayscale(1),
        tf.RandomHorizontalFlip(),
        tf.RandomVerticalFlip(),
        tf.ToTensor(),
        GaussNoise(p=0.5), normalize
    ])

    train_dataset = EMData(config['data_file'], augmentation)

    if config['distributed']:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=config['batch_size'],
                                               shuffle=(train_sampler is None),
                                               num_workers=config['workers'],
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True)

    #log parameters, if needed:
    if config['logging'] and (config['multiprocessing_distributed']
                              and config['rank'] % ngpus_per_node == 0):
        #end any old runs
        mlflow.end_run()
        mlflow.set_experiment(config['experiment_name'])
        mlflow.log_artifact(config['config_file'])

        #we don't want to add everything in the config
        #to mlflow parameters, we'll just add the most
        #likely to change parameters
        mlflow.log_param('data_file', config['data_file'])
        mlflow.log_param('architecture', config['arch'])
        mlflow.log_param('epochs', config['epochs'])
        mlflow.log_param('batch_size', config['batch_size'])
        mlflow.log_param('learning_rate', config['lr'])
        mlflow.log_param('moco_dim', config['moco_dim'])
        mlflow.log_param('moco_k', config['moco_k'])
        mlflow.log_param('moco_m', config['moco_m'])
        mlflow.log_param('moco_t', config['moco_t'])

    for epoch in range(config['start_epoch'], config['epochs']):
        if config['distributed']:
            # reshuffle the sampler deterministically per epoch
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, config)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, config)

        #only save checkpoints from the main process
        if not config['multiprocessing_distributed'] or (
                config['multiprocessing_distributed']
                and config['rank'] % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': config['arch'],
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'norms': [mean_pixel, std_pixel],
                },
                is_best=False,
                filename=os.path.join(config['model_dir'], 'current.pth.tar'))

            #save checkpoint every save_freq epochs
            if (epoch + 1) % config['save_freq'] == 0:
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': config['arch'],
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'norms': [mean_pixel, std_pixel],
                    },
                    is_best=False,
                    filename=os.path.join(
                        config['model_dir'] +
                        'checkpoint_{:04d}.pth.tar'.format(epoch + 1)))
def close(self):
    """End the active MLflow run, importing mlflow lazily so the module
    loads even when MLflow is not installed."""
    import mlflow as _mlflow
    _mlflow.end_run()
def __del__(self):
    # if the previous run is not terminated correctly, the fluent API will
    # not let you start a new run before the previous one is killed
    # BUGFIX: mlflow.active_run must be *called* — the bare attribute is a
    # function object and therefore always truthy, so the old check
    # `mlflow.active_run is not None` unconditionally ended a run.
    if mlflow.active_run() is not None:
        mlflow.end_run(status="KILLED")
def train(
        main_options: MainOptions,
        train_options: TrainOptions
) -> None:
    """Train the multi-agent MARL image classifier end-to-end.

    Builds the dataset/model/agents from the options, logs hyperparameters and
    artifacts to MLflow, runs the epoch loop (REINFORCE-style loss with
    epsilon decay), evaluates after each epoch, saves per-epoch model dumps,
    and finally visualizes agent steps on one test image.
    """
    assert train_options.dim == 2 or train_options.dim == 3, \
        "Only 2D is supported at the moment " \
        "for data loading and observation / transition. " \
        "See torchvision.datasets.ImageFolder"

    output_dir = train_options.output_dir
    model_dir = "models"
    if not exists(join(output_dir, model_dir)):
        mkdir(join(output_dir, model_dir))
    if exists(join(output_dir, model_dir)) \
            and not isdir(join(output_dir, model_dir)):
        raise Exception(f"\"{join(output_dir, model_dir)}\""
                        f"is not a directory.")

    exp_name = "MARLClassification"
    mlflow.set_experiment(exp_name)
    mlflow.start_run(run_name=f"train_{main_options.run_id}")
    mlflow.log_param("output_dir", output_dir)
    mlflow.log_param("model_dir", join(output_dir, model_dir))

    img_pipeline = tr.Compose([
        tr.ToTensor(),
        custom_tr.NormalNorm()
    ])

    # Pick the dataset class from the feature-extractor string prefix.
    if train_options.ft_extr_str.startswith("resisc"):
        dataset_constructor = RESISC45Dataset
    elif train_options.ft_extr_str.startswith("mnist"):
        dataset_constructor = MNISTDataset
    else:
        dataset_constructor = KneeMRIDataset

    nn_models = ModelsWrapper(
        train_options.ft_extr_str,
        train_options.window_size,
        train_options.hidden_size_belief,
        train_options.hidden_size_action,
        train_options.hidden_size_msg,
        train_options.hidden_size_state,
        train_options.dim,
        train_options.action,
        train_options.nb_class,
        train_options.hidden_size_linear_belief,
        train_options.hidden_size_linear_action
    )
    dataset = dataset_constructor(img_pipeline)

    marl_m = MultiAgent(
        main_options.nb_agent,
        nn_models,
        train_options.hidden_size_belief,
        train_options.hidden_size_action,
        train_options.window_size,
        train_options.hidden_size_msg,
        train_options.action,
        obs_generic,
        trans_generic
    )

    # Log the full hyperparameter set in one call.
    mlflow.log_params({
        "ft_extractor": train_options.ft_extr_str,
        "window_size": train_options.window_size,
        "hidden_size_belief": train_options.hidden_size_belief,
        "hidden_size_action": train_options.hidden_size_action,
        "hidden_size_msg": train_options.hidden_size_msg,
        "hidden_size_state": train_options.hidden_size_state,
        "dim": train_options.dim,
        "action": train_options.action,
        "nb_class": train_options.nb_class,
        "hidden_size_linear_belief":
            train_options.hidden_size_linear_belief,
        "hidden_size_linear_action":
            train_options.hidden_size_linear_action,
        "nb_agent": main_options.nb_agent,
        "frozen_modules": train_options.frozen_modules,
        "epsilon": train_options.epsilon,
        "epsilon_decay": train_options.epsilon_decay,
        "nb_epoch": train_options.nb_epoch,
        "learning_rate": train_options.learning_rate,
        "img_size": train_options.img_size,
        "retry_number": train_options.retry_number,
        "step": main_options.step,
        "batch_size": train_options.batch_size
    })

    # Persist the class mapping as an artifact.
    json_f = open(join(output_dir, "class_to_idx.json"), "w")
    json.dump(dataset.class_to_idx, json_f)
    json_f.close()
    mlflow.log_artifact(join(output_dir, "class_to_idx.json"))

    cuda = main_options.cuda
    device_str = "cpu"

    # Pass pytorch stuff to GPU
    # for agents hidden tensors (belief etc.)
    if cuda:
        nn_models.cuda()
        marl_m.cuda()
        device_str = "cuda"

    mlflow.log_param("device", device_str)

    # Only optimize the modules not listed as frozen.
    module_to_train = ModelsWrapper.module_list \
        .difference(train_options.frozen_modules)

    # for RL agent models parameters
    optim = th.optim.Adam(
        nn_models.get_params(list(module_to_train)),
        lr=train_options.learning_rate
    )

    # 85/15 random train/test split over dataset indices.
    idx = th.randperm(len(dataset))
    idx_train = idx[:int(0.85 * idx.size(0))]
    idx_test = idx[int(0.85 * idx.size(0)):]

    train_dataset = Subset(dataset, idx_train)
    test_dataset = Subset(dataset, idx_test)

    train_dataloader = DataLoader(
        train_dataset, batch_size=train_options.batch_size,
        shuffle=True, num_workers=3, drop_last=False
    )
    test_dataloader = DataLoader(
        test_dataset, batch_size=train_options.batch_size,
        shuffle=True, num_workers=3, drop_last=False
    )

    epsilon = train_options.epsilon
    curr_step = 0

    for e in range(train_options.nb_epoch):
        nn_models.train()

        sum_loss = 0.
        i = 0

        conf_meter = ConfusionMeter(train_options.nb_class)

        tqdm_bar = tqdm(train_dataloader)
        for x_train, y_train in tqdm_bar:
            x_train, y_train = x_train.to(th.device(device_str)), \
                               y_train.to(th.device(device_str))

            # pred = [Nr, Ns, Nb, Nc]
            # prob = [Nr, Ns, Nb]
            retry_pred, retry_prob = episode_retry(
                marl_m, x_train, epsilon,
                main_options.step,
                train_options.retry_number,
                train_options.nb_class, device_str
            )

            # Class one hot encoding
            y_eye = th.eye(
                train_options.nb_class,
                device=th.device(device_str)
            )[y_train.unsqueeze(0)].unsqueeze(1).repeat(
                1, main_options.step, 1, 1)

            # Update confusion meter
            # mean between trials
            conf_meter.add(
                retry_pred.detach()[:, -1, :, :].mean(dim=0),
                y_train
            )

            # L2 Loss - Classification error / reward
            # reward = -error(y_true, y_step_pred).mean(class_dim)
            r = -th.pow(y_eye - retry_pred, 2.).mean(dim=-1)

            # Compute loss
            losses = retry_prob * r.detach() + r

            # Losses mean on images batch and trials
            # maximize(E[reward]) -> minimize(-E[reward])
            loss = -losses.mean()

            # Reset gradient
            optim.zero_grad()

            # Backward on compute graph
            loss.backward()

            # Update weights
            optim.step()

            # Update epoch loss sum
            sum_loss += loss.item()

            # Compute global score
            precs, recs = prec_rec(conf_meter)

            if curr_step % 100 == 0:
                mlflow.log_metrics(
                    {"loss": loss.item(),
                     "train_prec": precs.mean().item(),
                     "train_rec": recs.mean().item(),
                     "epsilon": epsilon},
                    step=curr_step
                )

            tqdm_bar.set_description(
                f"Epoch {e} - Train, "
                f"loss = {sum_loss / (i + 1):.4f}, "
                f"eps = {epsilon:.4f}, "
                f"train_prec = {precs.mean():.3f}, "
                f"train_rec = {recs.mean():.3f}"
            )

            # Exploration rate decays once per batch.
            epsilon *= train_options.epsilon_decay
            epsilon = max(epsilon, 0.)

            i += 1
            curr_step += 1

        sum_loss /= len(train_dataloader)

        save_conf_matrix(conf_meter, e, output_dir, "train")
        mlflow.log_artifact(
            join(output_dir, f"confusion_matrix_epoch_{e}_train.png")
        )

        # Evaluation pass: greedy policy (epsilon = 0), no gradients.
        nn_models.eval()
        conf_meter.reset()

        with th.no_grad():
            tqdm_bar = tqdm(test_dataloader)
            for x_test, y_test in tqdm_bar:
                x_test, y_test = x_test.to(th.device(device_str)), \
                                 y_test.to(th.device(device_str))

                preds, _ = episode(marl_m, x_test, 0., main_options.step)

                conf_meter.add(preds.detach(), y_test)

                # Compute score
                precs, recs = prec_rec(conf_meter)

                tqdm_bar.set_description(
                    f"Epoch {e} - Eval, "
                    f"eval_prec = {precs.mean():.4f}, "
                    f"eval_rec = {recs.mean():.4f}"
                )

        # Compute score
        precs, recs = prec_rec(conf_meter)

        save_conf_matrix(conf_meter, e, output_dir, "eval")

        mlflow.log_metrics(
            {"eval_prec": precs.mean(),
             "eval_recs": recs.mean()},
            step=curr_step
        )

        # Dump model architecture + weights for this epoch and log them.
        nn_models.json_args(
            join(output_dir, model_dir, f"marl_epoch_{e}.json")
        )
        th.save(
            nn_models.state_dict(),
            join(output_dir, model_dir, f"nn_models_epoch_{e}.pt")
        )

        mlflow.log_artifact(
            join(output_dir, model_dir, f"marl_epoch_{e}.json")
        )
        mlflow.log_artifact(
            join(output_dir, model_dir, f"nn_models_epoch_{e}.pt")
        )
        mlflow.log_artifact(
            join(output_dir, f"confusion_matrix_epoch_{e}_eval.png")
        )

    # Visualize agent steps on one random test image, alongside the
    # un-normalized original.
    empty_pipe = tr.Compose([
        tr.ToTensor()
    ])
    dataset_tmp = dataset_constructor(empty_pipe)

    test_dataloader_ori = Subset(dataset_tmp, idx_test)
    test_dataloader = Subset(dataset, idx_test)

    # NOTE(review): whether the upper bound is inclusive depends on which
    # randint is imported (random.randint is inclusive, np.random.randint is
    # not) — confirm no off-by-one at len(test_dataloader_ori).
    test_idx = randint(0, len(test_dataloader_ori))

    visualize_steps(
        marl_m, test_dataloader[test_idx][0],
        test_dataloader_ori[test_idx][0],
        main_options.step, train_options.window_size,
        output_dir, train_options.nb_class,
        device_str, dataset.class_to_idx
    )

    mlflow.end_run()
def main(model_config_module):
    """Grid-search driver: train one model per parameter set and log each as a
    nested MLflow run under a single parent run.

    Args:
        model_config_module: dotted module path providing RUN_NAME, GRID and
            initialize_model; imported dynamically.
    """
    model_config = importlib.import_module(model_config_module)

    logger.info(f"Loading data from {RAW_DATA_IN_PATH}")
    raw_dataframe = get_data(RAW_DATA_IN_PATH)

    logger.info(f"Splitting into {config.TRAIN_TEST_SPLIT_RATIO} train and {1-config.TRAIN_TEST_SPLIT_RATIO} test")
    raw_train, raw_test = train_test_split(raw_dataframe, config.TEST_SPLIT_DAYS)

    logger.info(f"Loading metadata from {META_DATA_IN_PATH}")
    meta_dataframe = get_data(META_DATA_IN_PATH)

    logger.info(f"Processing train dataset")
    processed_train_dataset = preprocess_train_data(raw_train, meta_dataframe)

    initialize_model = model_config.initialize_model
    grid = model_config.GRID

    #set experiment name
    logger.info(f"Starting MLFlow runs in experiment {config.EXPERIMENT_NAME}")
    mlflow.set_experiment(config.EXPERIMENT_NAME)

    logger.info(f"Train model with grid length of {len(grid)}")
    # Parent run wraps one nested child run per grid entry.
    with mlflow.start_run(run_name=f"{model_config.RUN_NAME} grid search parent."):
        for params in grid:
            with mlflow.start_run(run_name=f'{model_config.RUN_NAME}: parameters: {params}', nested=True):
                logger.info(f"Train model with parameters: {params}.")
                mlflow.log_param("Parameters", params)

                init_model = initialize_model(params)
                model = train_model(init_model, processed_train_dataset)

                logger.info(f"Adding predictions")
                test_dataframe = add_prediction(test_dataset=raw_test,
                                                base_dataset=processed_train_dataset,
                                                meta_dataframe=meta_dataframe,
                                                model=model,
                                                predict_col_name=config.PREDICT)

                # Metrics
                logger.info(f"Logging metrics to MLFlow")
                metric = evaluate(test_dataframe)  #Remember to change name from metric to test metric

                # MlFlow logs
                mlflow.log_metric("Root mean squared error",
                                  metric['root_mean_squared_error'])
                mlflow.log_metric("Mean squared error",
                                  metric['mean_squared_error'])
                mlflow.log_metric("Mean absolute error",
                                  metric['mean_absolute_error'])
                mlflow.log_metric("Mean absolute percentage error",
                                  metric['mean_absolute_percentage_error'])
                mlflow.log_metric("Absolute biggest deviation",
                                  metric['absolute_biggest_deviation'])

                # Plot
                logger.info(f"Logging timeserie graph to MLFlow")
                timeserie_plot(test_dataframe, config.DATE_COLUMN,
                               PLOT_ACTUAL_VS_PREDICT_PLOT)

                # Log artifacts (output files)
                mlflow.log_artifact(str(PLOT_ACTUAL_VS_PREDICT_PLOT))

                # NOTE(review): these paths are constant, so each grid entry
                # overwrites the previous model/dataframe pickle — confirm
                # only the last result is meant to survive on disk.
                logger.info(f"Saving model to {MODEL_PATH}")
                save_as_pickle(model, MODEL_PATH)

                logger.info(f"Saving test_dataframe to {TEST_DATAFRAME_PATH}")
                save_as_pickle(test_dataframe, TEST_DATAFRAME_PATH)
    mlflow.end_run()
def end_run(cls):
    """Close the currently active MLflow run (harmless when none is active)."""
    mlflow.end_run()
def manual_run(request):
    """Generator fixture: optionally open an MLflow run before yielding, and
    always end the run on teardown."""
    start_requested = request.param
    if start_requested:
        mlflow.start_run()
    yield
    # Teardown: close whatever run is active (no-op if none was started).
    mlflow.end_run()
def main(params: dict, output_dir: str):
    """Train a SAKT-style transformer on the Riiid dataset and dump artifacts.

    params: hyperparameters — the code reads keys "max_seq", "batch_size",
        "embed_dim", "lr", "cont_emb", "num_warmup_steps".
    output_dir: directory receiving per-epoch weights, the final weights,
        OOF predictions (CSV), the params JSON and pickled feature factories.

    Relies on module-level state not visible here: is_debug, load_pickle,
    dropout, epochs, device, is_make_feature_factory — TODO confirm they are
    defined at module scope in this file.
    """
    import mlflow
    print("start params={}".format(params))
    model_id = "all"
    logger = get_logger()
    # Full merged training frame; head() keeps debug runs small.
    df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df["prior_question_had_explanation"] = df["prior_question_had_explanation"].fillna(-1)
    # -1 marks lecture rows; convert to NaN so they are excluded from the target.
    df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)
    # Column typing for FeatureFactoryForTransformer: "category" columns get
    # embedding dictionaries; "leakage_feature" columns are targets/answers
    # that must not leak into the inputs.
    column_config = {
        ("content_id", "content_type_id"): {"type": "category"},
        "user_answer": {"type": "leakage_feature"},
        "answered_correctly": {"type": "leakage_feature"},
        "part": {"type": "category"},
        "prior_question_elapsed_time_bin300": {"type": "category"},
        "duration_previous_content_bin300": {"type": "category"},
        "prior_question_had_explanation": {"type": "category"},
        "rating_diff_content_user_id": {"type": "numeric"},
        "task_container_id_bin300": {"type": "category"},
        "previous_answer_index_content_id": {"type": "category"},
        "previous_answer_content_id": {"type": "category"},
        "timediff-elapsedtime_bin500": {"type": "category"}
    }
    # --- Feature engineering (skipped when cached datasets are loaded below) ---
    if not load_pickle or is_debug:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = DurationPreviousContent(is_partial_fit=True)
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"]["UserContentRateEncoder"] = UserContentRateEncoder(rate_func="elo",
                                                                                           column="user_id")
        feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id",
                                                                             column="content_id",
                                                                             is_debug=is_debug,
                                                                             model_id=model_id,
                                                                             n=300)
        feature_factory_dict["user_id"]["StudyTermEncoder2"] = StudyTermEncoder2(is_partial_fit=True)
        feature_factory_dict["user_id"][f"MeanAggregatorStudyTimebyUserId"] = MeanAggregator(column="user_id",
                                                                                             agg_column="study_time",
                                                                                             remove_now=False)
        feature_factory_dict["user_id"]["ElapsedTimeMeanByContentIdEncoder"] = ElapsedTimeMeanByContentIdEncoder()
        feature_factory_dict["post"] = {
            "DurationFeaturePostProcess": DurationFeaturePostProcess()
        }
        feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                        logger=logger,
                                                        split_num=1,
                                                        model_id=model_id,
                                                        load_feature=not is_debug,
                                                        save_feature=not is_debug)
        print("all_predict")
        df = feature_factory_manager.all_predict(df)

        def f(x):
            # Bin timediff-elapsedtime to seconds, clipped to [-100, 400].
            x = x // 1000
            if x < -100:
                return -100
            if x > 400:
                return 400
            return x
        # Cap task_container_id at 300 to bound the embedding vocabulary.
        df["task_container_id_bin300"] = [x if x < 300 else 300 for x in df["task_container_id"]]
        df["timediff-elapsedtime_bin500"] = [f(x) for x in df["timediff-elapsedtime"].values]
        df = df[["user_id", "content_id", "content_type_id", "part", "user_answer", "answered_correctly",
                 "prior_question_elapsed_time_bin300", "duration_previous_content_bin300",
                 "prior_question_had_explanation", "rating_diff_content_user_id", "task_container_id_bin300",
                 "previous_answer_index_content_id", "previous_answer_content_id", "row_id",
                 "timediff-elapsedtime_bin500"]]
        print(df.head(10))

    print("data preprocess")
    ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    ff_for_transformer.make_dict(df=df)
    # Vocabulary size for the (content_id, content_type_id) embedding.
    n_skill = len(ff_for_transformer.embbed_dict[("content_id", "content_type_id")])
    # --- Train/validation split: rows listed in the feather file are validation ---
    if not load_pickle or is_debug:
        df_val_row = pd.read_feather("../input/riiid-test-answer-prediction/train_transformer_last2500k_only_row_id.feather")
        if is_debug:
            df_val_row = df_val_row.head(3000)
        df_val_row["is_val"] = 1
        df = pd.merge(df, df_val_row, how="left", on="row_id")
        df["is_val"] = df["is_val"].fillna(0)
        print(df["is_val"].value_counts())

        w_df = df[df["is_val"] == 0]
        # Chunk each user's history into consecutive groups of max_seq rows,
        # then make the user key unique per chunk so sequences don't exceed max_seq.
        w_df["group"] = (w_df.groupby("user_id")["user_id"].transform("count") - w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(str) + "_" + w_df["group"].astype(str)
        group = ff_for_transformer.all_predict(w_df)
        dataset_train = SAKTDataset(group,
                                    n_skill=n_skill,
                                    max_seq=params["max_seq"])
        del w_df
        gc.collect()
    # Fresh transformer factory for validation (questions only: content_type_id == 0).
    ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    if not load_pickle or is_debug:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group,
                                  is_test=True,
                                  n_skill=n_skill,
                                  max_seq=params["max_seq"])

    # Cache/restore the prepared datasets (non-debug runs only).
    os.makedirs("../input/feature_engineering/model232", exist_ok=True)
    if not is_debug and not load_pickle:
        with open(f"../input/feature_engineering/model232/train.pickle", "wb") as f:
            pickle.dump(dataset_train, f)
        with open(f"../input/feature_engineering/model232/val.pickle", "wb") as f:
            pickle.dump(dataset_val, f)
    if not is_debug and load_pickle:
        with open(f"../input/feature_engineering/model232/train.pickle", "rb") as f:
            dataset_train = pickle.load(f)
        with open(f"../input/feature_engineering/model232/val.pickle", "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")

    dataloader_train = DataLoader(dataset_train, batch_size=params["batch_size"], shuffle=True, num_workers=1)
    dataloader_val = DataLoader(dataset_val, batch_size=params["batch_size"], shuffle=False, num_workers=1)

    model = SAKTModel(n_skill, embed_dim=params["embed_dim"], max_seq=params["max_seq"],
                      dropout=dropout, cont_emb=params["cont_emb"])
    # Standard BERT-style weight-decay grouping: no decay on biases/LayerNorm.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=params["lr"],
                      weight_decay=0.01,
                      )
    # NOTE(review): schedule length assumes 20 epochs regardless of `epochs` — confirm.
    num_train_optimization_steps = int(len(dataloader_train) * 20)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=params["num_warmup_steps"],
                                                num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train, dataloader_val, optimizer, criterion,
                                              scheduler, epoch, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".format(epoch, loss, auc, auc_val))
        torch.save(model.state_dict(), f"{output_dir}/transformers_epoch{epoch}_auc{round(auc_val, 4)}.pth")

    # Final validation pass: score only the last position of each sequence.
    preds = []
    labels = []
    with torch.no_grad():
        for item in tqdm(dataloader_val):
            label = item["label"].to(device).float()
            output = model(item, device)
            preds.extend(torch.nn.Sigmoid()(output[:, -1]).view(-1).data.cpu().numpy().tolist())
            labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist())

    auc_transformer = roc_auc_score(labels, preds)
    print("single transformer: {:.4f}".format(auc_transformer))
    df_oof = pd.DataFrame()
    # df_oof["row_id"] = df.loc[val_idx].index
    print(len(dataloader_val))
    print(len(preds))
    df_oof["predict"] = preds
    df_oof["target"] = labels
    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    # Disabled LGBM-ensemble experiment kept for reference.
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")
    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))
    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))
        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """
    # Log the experiment to MLflow (skipped in debug mode).
    if not is_debug:
        mlflow.start_run(experiment_id=10,
                         run_name=os.path.basename(__file__))
        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.end_run()
    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    torch.cuda.empty_cache()
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)

    # Optionally rebuild and pickle the feature factories for inference-time use.
    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = DurationPreviousContent(is_partial_fit=True)
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                        logger=logger,
                                                        split_num=1,
                                                        model_id="all",
                                                        load_feature=not is_debug,
                                                        save_feature=not is_debug)
        ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                          dict_path="../feature_engineering/",
                                                          sequence_length=params["max_seq"],
                                                          logger=logger)
        df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        # Loggers are not picklable; strip them before dumping.
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)
        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f:
            pickle.dump(ff_for_transformer, f)
def after_fit(self): mlflow.end_run()
def prexisting_run_id(tracking_uri): mlflow.set_tracking_uri(tracking_uri) mlflow.start_run() yield mlflow.active_run().info.run_id mlflow.end_run()
# # Code snippet for https://mlflow.org/docs/latest/python_api/mlflow.html#end_run # import warnings import mlflow if __name__ == "__main__": warnings.filterwarnings("ignore") print(mlflow.__version__) # Start run and get status mlflow.start_run() run = mlflow.active_run() print("run_id: {}; status: {}".format(run.info.run_id, run.info.status)) # End the run and get status mlflow.end_run() run = mlflow.get_run(run.info.run_id) print("run_id: {}; status: {}".format(run.info.run_id, run.info.status)) print("--") # Check for any active runs print("Active runs: {}".format(mlflow.active_run()))
def main():
    """Extract topics from comment text, train a topic classifier, and register
    the fitted vectorizer and model in the MLflow model registry.

    Side effects: logs metrics and artifacts to the MLflow server at
    ``http://localhost:5000``, registers new versions of the vectorizer and
    classifier, and rewrites the config YAML at ``config_path`` with the
    latest registered version numbers.
    """
    # Fetch the top comments to build the corpus.
    comments = gc.get_all_comments(**config['comments'])
    comments_clean = pt.get_clean_text(comments,
                                       stopwords.words(config['stopwords']))
    tfidf = TfidfVectorizer(**config['tf_model']).fit(comments_clean)
    # Vectorized comment matrix.
    X_matrix = pt.vectorize_text(comments_clean, tfidf)
    # Split the corpus into topics via clustering, picking the best cluster count.
    cluster_labels = cl.get_clusters(X_matrix,
                                     random_state=SEED,
                                     **config['clustering'])
    # Train a linear model to recognize the discovered topics.
    X_train, X_test, y_train, y_test = train_test_split(X_matrix,
                                                        cluster_labels,
                                                        **config['cross_val'],
                                                        random_state=SEED)
    clf_lr = LogisticRegression(**config['model'])
    # MLflow tracking.
    mlflow.set_tracking_uri("http://localhost:5000")
    mlflow.set_experiment(config['name_experiment'])
    with mlflow.start_run():
        clf_lr.fit(X_train, y_train)
        # Predict once instead of re-running inference for every score.
        y_pred = clf_lr.predict(X_test)
        label_set = set(cluster_labels)
        # BUGFIX: these scores were logged with log_param; evaluation scores
        # are numeric metrics (plottable/comparable in the UI), not parameters.
        mlflow.log_metric('f1', cl.get_f1_score(y_test, y_pred, label_set))
        mlflow.log_metric('accuracy', accuracy_score(y_test, y_pred))
        mlflow.log_metric('precision',
                          cl.get_precision_score(y_test, y_pred, label_set))
        # Register both the vectorizer and the classifier.
        mlflow.sklearn.log_model(
            tfidf,
            artifact_path="vector",
            registered_model_name=f"{config['model_vec']}")
        mlflow.sklearn.log_model(clf_lr,
                                 artifact_path='model_lr',
                                 registered_model_name=f"{config['model_lr']}")
        mlflow.log_artifact(local_path='./train.py', artifact_path='code')
    # (The `with` block already ends the run; the old explicit end_run() was a no-op.)
    # Fetch the latest registered versions and persist them back to the config.
    client = MlflowClient()
    last_version_lr = get_version_model(config['model_lr'], client)
    last_version_vec = get_version_model(config['model_vec'], client)
    # BUGFIX: close the config file instead of leaking the handle from
    # yaml.safe_load(open(...)).
    with open(config_path) as cfg_fp:
        yaml_file = yaml.safe_load(cfg_fp)
    yaml_file['predict']["version_lr"] = int(last_version_lr)
    yaml_file['predict']["version_vec"] = int(last_version_vec)
    with open(config_path, 'w') as fp:
        yaml.dump(yaml_file, fp, encoding='UTF-8', allow_unicode=True)