def main():
    print('keras.__version__=' + str(keras.__version__))
    print('tf.__version__=' + str(tf.__version__))
    print('PIL.__version__=' + str(PIL.__version__))
    print('Using GPU ' + str(os.environ["CUDA_VISIBLE_DEVICES"]) + ' Good luck...')

    model_dir = 'F:/AMATEUR/models_mask_rcnn/'
    coco_model_path = 'C:/Users/cj3272/PycharmProjects/Mask_RCNN/mask_rcnn_coco.h5'
    if C.is_running_on_casir():
        model_dir = '/gpfs/home/cj3272/amateur/modeles/segmentation/MaskRCNN_win/logs/'
        coco_model_path = '/gpfs/home/cj3272/amateur/modeles/segmentation/MaskRCNN_win/mask_rcnn_coco.h5'

    config = AmateurTrain()
    assert config.BASE_DIR is None

    dataset_id = '30MAI2019'
    run_name = 'training'
    if C.is_running_on_casir():
        config.BASE_DIR = '/gpfs/groups/gc014a/AMATEUR/dataset/segmentation/' + dataset_id + '/GEN_segmentation/'
        config.IMAGES_PER_GPU = 4
        config.STEPS_PER_EPOCH = 1000
        config.VALIDATION_STEPS = 200
        n_train = None
        n_val = None
    else:
        config.BASE_DIR = 'F:/AMATEUR/segmentation/13JUIN2019/GEN_segmentation/'
        config.IMAGES_PER_GPU = 1
        config.IMAGE_MIN_DIM = 400
        config.IMAGE_MAX_DIM = 512
        n_train = 1
        n_val = 1

    # Datasets
    seed = 10
    val_size = 0.2
    image_ids = [f for f in listdir(config.BASE_DIR)
                 if isfile(join(config.BASE_DIR, f)) and f.endswith('gz')]
    train_list, val_list = train_test_split(image_ids, test_size=val_size, random_state=seed)
    print('train size=' + str(len(train_list)))
    print('val size=' + str(len(val_list)))

    # Training dataset
    dataset_train = dataset_util.AmateurDatasetOnDiskMRCNN()
    # if C.is_running_on_casir():
    #     dataset_train = dataset_util.AmateurDatasetMemoryMRCNN()
    dataset_train.load(config.BASE_DIR, train_list, n=n_train)
    dataset_train.prepare()

    # Validation dataset
    dataset_val = dataset_util.AmateurDatasetOnDiskMRCNN()
    dataset_val.load(config.BASE_DIR, val_list, n=n_val)
    dataset_val.prepare()

    print(str(len(dataset_train.image_ids)) + ' images for training.')
    print(str(len(dataset_val.image_ids)) + ' images for validating.')
    assert config.STEPS_PER_EPOCH * config.IMAGES_PER_GPU >= len(dataset_train.image_ids)
    assert config.VALIDATION_STEPS * config.IMAGES_PER_GPU >= len(dataset_val.image_ids)

    # Create model in training mode
    model = model_lib.MaskRCNN(mode="training", config=config, model_dir=model_dir)

    # Which weights to start with?
    init_with = "coco"  # imagenet, coco, or last
    if init_with == "imagenet":
        model.load_weights(model.get_imagenet_weights(), by_name=True)
    elif init_with == "coco":
        # Load weights trained on MS COCO, but skip layers that
        # are different due to the different number of classes.
        # See README for instructions to download the COCO weights.
        model.load_weights(coco_model_path, by_name=True,
                           exclude=["mrcnn_class_logits", "mrcnn_bbox_fc",
                                    "mrcnn_bbox", "mrcnn_mask"])
    elif init_with == "last":
        # Load the last model you trained and continue training
        model.load_weights(model.find_last(), by_name=True)

    mlflow.set_tracking_uri("http://10.7.248.206:3389")
    mlflow.set_experiment("AMATEUR_SEGMENTATION")
    mlflow.start_run(run_name=run_name)
    mlflow.log_params({
        'model_dir': str(model_dir),
        'n_train': str(n_train),
        'n_val': str(n_val),
        'init_with': init_with,
        'os': os.name,
        'dataset_id': dataset_id,
        'CUDA_VISIBLE_DEVICES': os.environ["CUDA_VISIBLE_DEVICES"]
    })
    mlflow.log_params(config.get_parameters())

    # Train the head branches.
    # Passing layers="heads" freezes all layers except the head layers.
    # You can also pass a regular expression to select which layers to
    # train by name pattern.
    # model.train(dataset_train, dataset_val, learning_rate=config.LEARNING_RATE, epochs=6, layers='heads')

    # Fine-tune all layers.
    # Passing layers="all" trains all layers. You can also pass a regular
    # expression to select which layers to train by name pattern.
    custom_callbacks = [
        keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=7,
                                          verbose=1, mode='auto', min_delta=0.01,
                                          cooldown=0, min_lr=10e-7),
        keras.callbacks.LambdaCallback(on_epoch_end=lambda epoch, logs: print(logs.keys())),
        keras.callbacks.LambdaCallback(on_epoch_end=lambda epoch, logs: mlflow.log_metrics(logs))
    ]
    model.train(dataset_train, dataset_val,
                learning_rate=config.LEARNING_RATE,
                epochs=150, layers="all",
                custom_callbacks=custom_callbacks)
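    # The run opened above with mlflow.start_run(run_name=run_name) is never
    # explicitly closed. A minimal sketch of one way to guarantee the run is
    # marked FINISHED even if model.train() raises (same mlflow module as above):
    #
    #     try:
    #         model.train(dataset_train, dataset_val, learning_rate=config.LEARNING_RATE,
    #                     epochs=150, layers="all", custom_callbacks=custom_callbacks)
    #     finally:
    #         mlflow.end_run()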
                        default=os.environ['SM_NUM_GPUS'])
    args = parser.parse_args()

    experiment_name = args.experiment_name
    mlflow.set_tracking_uri(args.mlflow_server)
    mlflow.set_experiment(experiment_name)
    experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

    tags = {"engineering": "ML Platform"}

    best_model, max_map = train(args)

    with mlflow.start_run(tags=tags) as run:
        params = {
            "batch-size": args.batch_size,
            "test-batch-size": args.test_batch_size,
            "epochs": args.epochs,
            "lr": args.lr,
            "momentum": args.momentum
        }
        mlflow.log_params(params)
        mlflow.log_metric(key='max_map', value=max_map)

        run_id = run.info.run_id
        metadata_dict = {'run_id': run_id}
        metadata_path = os.path.join(args.model_dir, 'metadata.json')
        with open(metadata_path, "w") as f:
def test_log_image_raises_exception_for_unsupported_image_object_type():
    with mlflow.start_run(), pytest.raises(TypeError, match="Unsupported image object type"):
        mlflow.log_image("not_image", "image.png")
mlflow.set_experiment(exp_name)

test_size = None
if args.test_size:
    test_size = args.test_size
LOGGER.info(f'Test size: {test_size}')
mlflow.log_param('Test size', test_size)
if DATA_AUGMENTATION:
    mlflow.log_param('AUGMENTATION_SIZE', AUGMENTATION_SIZE)
    LOGGER.info(f'Augmentation size: {AUGMENTATION_SIZE}')

from models.naive_bayes import NaiveBayesEmotionDetection

with mlflow.start_run() as run:
    naive_model = NaiveBayesEmotionDetection(test_size, DATA_AUGMENTATION, AUGMENTATION_SIZE)
    LOGGER.info(f'Experiment {exp_name} started.')
    naive_model.experiment()
    if MODEL_FILENAME:
        # Open in binary write mode: pickle.dump requires a binary file object
        # (the original opened the file in text read mode, which fails).
        with open(MODEL_FILENAME, 'wb') as fl:
            pickle.dump(naive_model.model, fl)
        LOGGER.info(f'Model {exp_name} saved to {MODEL_FILENAME} successfully!')
LOGGER.info(f'Experiment {exp_name} completed.')

if args.model == 'svm':
    LOGGER.info('Using SVM model')
def test_log_figure_raises_error_for_unsupported_figure_object_type():
    with mlflow.start_run(), pytest.raises(TypeError, match="Unsupported figure object type"):
        mlflow.log_figure("not_figure", "figure.png")
def test_log_image_numpy_raises_exception_for_invalid_array_shape():
    import numpy as np

    with mlflow.start_run(), pytest.raises(ValueError, match="`image` must be a 2D or 3D array"):
        mlflow.log_image(np.zeros((1,), dtype=np.uint8), "image.png")
targetcol = 'default'
y = mydf[targetcol]

experimento = mlflow.get_experiment_by_name('Risco de Credito')
if experimento is None:
    experimento = mlflow.create_experiment('Risco de Credito')
    experimento = mlflow.get_experiment_by_name('Risco de Credito')

# Create the classifier (model 1, with all columns and scikit-learn's default parameters)
independentcols = ['renda', 'idade', 'etnia', 'sexo', 'casapropria',
                   'outrasrendas', 'estadocivil', 'escolaridade']
x = mydf[independentcols]

run_name = get_a_funnyName()
with mlflow.start_run(experiment_id=experimento.experiment_id,
                      run_name=("MyNameIs.. " + run_name)):
    from sklearn.ensemble import RandomForestClassifier as rfc
    clf = rfc()
    clf.fit(X=x, y=y)
    clf.independentcols = independentcols
    clf_acuracia = clf.score(X=x, y=y)
    print("Model 01 (classifier) created with accuracy: [{0}]".format(clf_acuracia))
    mlflow.log_param("criterion", clf.criterion)
    mlflow.log_param("n_estimators", clf.n_estimators)
    mlflow.log_param("min_samples_leaf", clf.min_samples_leaf)
    mlflow.log_param("max_depth", clf.max_depth)
def main(params: dict, output_dir: str):
    import mlflow
    print("start params={}".format(params))
    model_id = "all"
    logger = get_logger()

    df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df["prior_question_had_explanation"] = df["prior_question_had_explanation"].fillna(-1)
    df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)

    column_config = {
        ("content_id", "content_type_id"): {"type": "category"},
        "user_answer": {"type": "leakage_feature"},
        "answered_correctly": {"type": "leakage_feature"},
        "part": {"type": "category"},
        "prior_question_elapsed_time_bin300": {"type": "category"},
        "duration_previous_content_bin300": {"type": "category"},
        "prior_question_had_explanation": {"type": "category"},
        "rating_diff_content_user_id": {"type": "numeric"},
        "task_container_id_bin300": {"type": "category"},
        "previous_answer_index_question_id": {"type": "category"},
        "previous_answer_question_id": {"type": "category"},
        "timediff-elapsedtime_bin500": {"type": "category"},
        "timedelta_log10": {"type": "category"}
    }

    if not load_pickle or is_debug:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = \
            DurationPreviousContent(is_partial_fit=True)
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"]["UserContentRateEncoder"] = \
            UserContentRateEncoder(rate_func="elo", column="user_id")
        feature_factory_dict["user_id"]["PreviousAnswer2"] = \
            PreviousAnswer2(groupby="user_id", column="question_id", is_debug=is_debug,
                            model_id=model_id, n=300)
        feature_factory_dict["user_id"]["StudyTermEncoder2"] = StudyTermEncoder2(is_partial_fit=True)
        feature_factory_dict["user_id"]["MeanAggregatorStudyTimebyUserId"] = \
            MeanAggregator(column="user_id", agg_column="study_time", remove_now=False)
        feature_factory_dict["user_id"]["ElapsedTimeMeanByContentIdEncoder"] = \
            ElapsedTimeMeanByContentIdEncoder()
        feature_factory_dict["post"] = {
            "DurationFeaturePostProcess": DurationFeaturePostProcess()
        }

        feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                        logger=logger,
                                                        split_num=1,
                                                        model_id=model_id,
                                                        load_feature=not is_debug,
                                                        save_feature=not is_debug)
        print("all_predict")
        df = feature_factory_manager.all_predict(df)

        def f(x):
            x = x // 1000
            if x < -100:
                return -100
            if x > 400:
                return 400
            return x

        df["task_container_id_bin300"] = [x if x < 300 else 300 for x in df["task_container_id"]]
        df["timediff-elapsedtime_bin500"] = [f(x) for x in df["timediff-elapsedtime"].values]
        df["timedelta_log10"] = np.log10(df["duration_previous_content"].values)
        df["timedelta_log10"] = df["timedelta_log10"].replace(-np.inf, -1).replace(np.inf, -1) \
            .fillna(-1).astype("int8")
        df = df[["user_id", "content_id", "content_type_id", "part", "user_answer",
                 "answered_correctly", "prior_question_elapsed_time_bin300",
                 "duration_previous_content_bin300", "prior_question_had_explanation",
                 "rating_diff_content_user_id", "task_container_id_bin300",
                 "previous_answer_index_question_id", "previous_answer_question_id",
                 "row_id", "timediff-elapsedtime_bin500", "timedelta_log10"]]
        print(df.head(10))

    print("data preprocess")

    ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    ff_for_transformer.make_dict(df=df)
    n_skill = len(ff_for_transformer.embbed_dict[("content_id", "content_type_id")])

    if not load_pickle or is_debug:
        df_val_row = pd.read_feather(
            "../input/riiid-test-answer-prediction/train_transformer_last2500k_only_row_id.feather")
        if is_debug:
            df_val_row = df_val_row.head(3000)
        df_val_row["is_val"] = 1

        df = pd.merge(df, df_val_row, how="left", on="row_id")
        df["is_val"] = df["is_val"].fillna(0)

        print(df["is_val"].value_counts())

        w_df = df[df["is_val"] == 0]
        w_df["group"] = (w_df.groupby("user_id")["user_id"].transform("count") -
                         w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(str) + "_" + w_df["group"].astype(str)

        group = ff_for_transformer.all_predict(w_df)

        dataset_train = SAKTDataset(group, n_skill=n_skill, max_seq=params["max_seq"])

        del w_df
        gc.collect()

    ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    if not load_pickle or is_debug:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group, is_test=True, n_skill=n_skill,
                                  max_seq=params["max_seq"])

    os.makedirs("../input/feature_engineering/model275_all", exist_ok=True)
    if not is_debug and not load_pickle:
        with open("../input/feature_engineering/model275_all/train.pickle", "wb") as f:
            pickle.dump(dataset_train, f)
        with open("../input/feature_engineering/model275_all/val.pickle", "wb") as f:
            pickle.dump(dataset_val, f)
    if not is_debug and load_pickle:
        with open("../input/feature_engineering/model275_all/train.pickle", "rb") as f:
            dataset_train = pickle.load(f)
        with open("../input/feature_engineering/model275_all/val.pickle", "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")

    dataloader_train = DataLoader(dataset_train, batch_size=params["batch_size"], shuffle=True)
    dataloader_val = DataLoader(dataset_val, batch_size=params["batch_size"], shuffle=False)

    model = SAKTModel(n_skill, embed_dim=params["embed_dim"], max_seq=params["max_seq"],
                      dropout=dropout, cont_emb=params["cont_emb"])

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.1},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=params["lr"], weight_decay=0.1)
    num_train_optimization_steps = int(len(dataloader_train) * 20)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=params["num_warmup_steps"],
                                                num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    auc_val = 0
    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train, dataloader_val,
                                              optimizer, criterion, scheduler, epoch,
                                              output_dir, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".format(
            epoch, loss, auc, auc_val))
        torch.save(model.state_dict(),
                   f"{output_dir}/transformers_epoch{epoch}_auc{round(auc_val, 4)}.pth")

    # df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values,
                            df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))

        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """
    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))
        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_val)
        mlflow.end_run()

    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")

    del model
    torch.cuda.empty_cache()
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)

    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = \
            DurationPreviousContent(is_partial_fit=True)
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                        logger=logger,
                                                        split_num=1,
                                                        model_id="all",
                                                        load_feature=not is_debug,
                                                        save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                          dict_path="../feature_engineering/",
                                                          sequence_length=params["max_seq"],
                                                          logger=logger)
        df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f:
            pickle.dump(ff_for_transformer, f)
return "".join( random.sample(string.ascii_letters, random.randint(1, max_len))) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument( "--large", help= "If true, will also generate larger datasets for testing UI performance.", action="store_true") args = parser.parse_args() client = MlflowClient() # Simple run for l1, alpha in itertools.product([0, 0.25, 0.5, 0.75, 1], [0, 0.5, 1]): with mlflow.start_run(run_name='ipython'): parameters = { 'l1': str(l1), 'alpha': str(alpha), } metrics = { 'MAE': [rand()], 'R2': [rand()], 'RMSE': [rand()], } log_params(parameters) log_metrics(metrics) # Big parameter values with mlflow.start_run(run_name='ipython'): parameters = {
    def log_torch_model(self, model, epoch):
        with mlflow.start_run(run_id=self.run_id):
            mlflow.pytorch.log_model(model, "model_%04d" % epoch)
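    # Passing run_id to start_run resumes that existing run instead of creating a
    # new one, so repeated calls of log_torch_model() append per-epoch model
    # artifacts to the same run. A minimal hedged sketch of the same pattern
    # (the run id value below is hypothetical):
    #
    #     with mlflow.start_run(run_id="0123456789abcdef"):
    #         mlflow.pytorch.log_model(model, "model_0001")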
if __name__ == '__main__':
    with open('test.file', 'wb') as f:
        test = range(10)
        joblib.dump(test, f)

    local_registry = "sqlite:///mlruns.db"
    print(f"Running local model registry={local_registry}")
    mlflow.set_tracking_uri(local_registry)
    clt = MlflowClient()

    exp_a_uri = "file:///tmp/exp_A"
    exp_a_id = get_exp_id("experiment_A", local_registry, local_registry, exp_a_uri)
    exp_b_uri = "file:///tmp/exp_B"
    exp_b_id = get_exp_id("experiment_B", local_registry, local_registry, exp_b_uri)

    for i in range(3):
        with mlflow.start_run(experiment_id=exp_a_id):
            mlflow.log_metric("MEAN SQUARE ERROR", 0.25 * random())
            mlflow.log_artifact('test.file')
            print(f"artifact_uri={mlflow.get_artifact_uri()}")
    print("-" * 75)
    for i in range(3):
        with mlflow.start_run(experiment_id=exp_b_id):
            mlflow.log_metric("MEAN SQUARE ERROR", 0.25 * random())
            mlflow.log_artifact('test.file')
            print(f"artifact_uri={mlflow.get_artifact_uri()}")
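    # A hedged follow-up sketch: the artifacts logged above can be read back with
    # the tracking client created earlier (`clt`); list_run_infos/list_artifacts
    # are MlflowClient calls, and this loop only prints what the runs just logged.
    for run_info in clt.list_run_infos(exp_a_id):
        print(run_info.run_id, [a.path for a in clt.list_artifacts(run_info.run_id)])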
def eval_Slim(params, cfg, train_mat, eval_mat, experiment):
    # This function is what Hyperopt is going to optimize (minimize the 'loss' value)
    print(experiment)
    with mlflow.start_run(experiment_id=experiment):
        # Log the config
        utils.config_helpers.log_config(dict(cfg.model))

        n_users, n_items = train_mat.shape
        np.random.seed(seed=cfg.model.seed)

        # Log relevant parameters for this run.
        mlflow.log_param("alpha", params['alpha'])
        mlflow.log_param("l1_ratio", params['l1_ratio'])
        mlflow.log_param("max_iter", params['max_iter'])
        mlflow.log_param("tol", params['tol'])

        # Log this run
        log.info(f"Testing alpha: {params['alpha']}, l1_ratio: {params['l1_ratio']}, "
                 f"max_iter: {params['max_iter']} and tol: {params['tol']}")

        start = time.time()

        # Create model
        slim = RecModel.Slim(num_items=n_items, num_users=n_users)

        # Train model
        slim.train(X=train_mat.copy(), alpha=params['alpha'], l1_ratio=params['l1_ratio'],
                   max_iter=params['max_iter'], tolerance=params['tol'], cores=1,
                   verbose=int(cfg.model.verbose))

        # Log run-time
        mlflow.log_metric("Runtime", int(round(time.time() - start, 0)))

        # Evaluate model
        perf_all = slim.eval_topn(eval_mat.copy(),
                                  rand_sampled=int(cfg.model.rand_sampled),
                                  topn=np.array(cfg.model.top_n_performances, dtype=np.int32),
                                  random_state=int(cfg.model.seed),
                                  cores=int(cfg.model.cores))

        # Log the performance of the model
        for pos in range(len(cfg.model.top_n_performances)):
            mlflow.log_metric(
                f"recallAT{cfg.model.top_n_performances[pos]}_of_{cfg.model.rand_sampled}",
                perf_all[f"Recall@{cfg.model.top_n_performances[pos]}"])

        mlflow.log_metric('MAE_train', slim.eval_prec(train_mat.copy()))
        mlflow.log_metric('MAE_eval', slim.eval_prec(eval_mat.copy()))

        # We always choose the first top-n performance; it is also the smallest
        # and the most relevant for us.
        rel_topn_perf = perf_all[f"Recall@{cfg.model.top_n_performances[0]}"]
        log.info(f"Current recallAT{cfg.model.top_n_performances[0]}_of_{cfg.model.rand_sampled} "
                 f"performance was {rel_topn_perf}")

        loss = -rel_topn_perf
        return {'loss': loss, 'status': hp.STATUS_OK, 'eval_time': time.time()}
def test_xgb_autolog_persists_manually_created_run(bst_params, dtrain):
    mlflow.xgboost.autolog()
    with mlflow.start_run() as run:
        xgb.train(bst_params, dtrain)
        assert mlflow.active_run()
        assert mlflow.active_run().info.run_id == run.info.run_id
def create_gen3_heatmap(json_file_path):
    """ Create elapsed time and speed heatmaps from a json file resulting from 'gen3_eval',
    which contains the times elapsed and speeds for a number of evaluation runs.

    Args:
        json_file_path: Path to the json file containing the run evaluation elapsed times and speeds
    """
    # start mlflow run
    with mlflow.start_run() as mlrun:
        # if the json file does not exist, raise an error
        if not os.path.exists(json_file_path) or not os.path.isfile(json_file_path):
            raise ValueError('{} does not exist'.format(json_file_path))

        # initialize chunk_sizes and chunks lists to contain all the powers of 2 between
        # 2^MIN_EXPONENT and 2^MAX_EXPONENT (included)
        chunk_sizes_iterator = [2**x for x in range(MIN_CHUNK_SIZE_EXPONENT,
                                                    MAX_CHUNK_SIZE_EXPONENT + 1)]
        chunks_iterator = [2**x for x in range(MIN_CHUNKS_EXPONENT, MAX_CHUNKS_EXPONENT + 1)]

        # open the json file and load its content into the data dict
        with open(json_file_path, 'r') as f:
            data = json.load(f)

        # instantiate empty numpy arrays to contain the average values and standard
        # deviations for both elapsed times and speeds
        shape = (len(chunk_sizes_iterator), len(chunks_iterator))
        elapsed_times_avg = np.empty(shape=shape, dtype=np.float32)
        speeds_avg = np.empty(shape=shape, dtype=np.float32)
        elapsed_times_std = np.empty(shape=shape, dtype=np.float32)
        speeds_std = np.empty(shape=shape, dtype=np.float32)

        # assign NaN (Not a Number) to all positions of the numpy arrays just defined
        # (NaN will be ignored when plotting the heatmap)
        elapsed_times_avg[:] = np.NaN
        speeds_avg[:] = np.NaN
        elapsed_times_std[:] = np.NaN
        speeds_std[:] = np.NaN

        # for each chunk size
        for i, cs in enumerate(chunk_sizes_iterator):
            # for each chunks number
            for j, c in enumerate(chunks_iterator):
                # if the value in the data dict corresponding to this combination of chunk
                # size and chunks number is None for speeds or times, continue to the next
                # combination
                if data['elapsed_times'][str(cs)][str(c)] is None \
                        or data['speeds'][str(cs)][str(c)] is None:
                    continue

                # compute average values and standard deviations for the elapsed times and speeds
                elapsed_times_avg[i][j] = np.average(data['elapsed_times'][str(cs)][str(c)])
                elapsed_times_std[i][j] = np.std(data['elapsed_times'][str(cs)][str(c)])
                speeds_avg[i][j] = np.average(data['speeds'][str(cs)][str(c)])
                speeds_std[i][j] = np.std(data['speeds'][str(cs)][str(c)])

        # create elapsed times and speeds figures
        time_fig, time_ax = plt.subplots(figsize=(10, 9))
        speed_fig, speed_ax = plt.subplots(figsize=(10, 9))

        # compute elapsed times heatmap
        time_im, time_cbar = heatmap(elapsed_times_avg,
                                     row_labels=chunk_sizes_iterator, row_title='Chunk sizes',
                                     col_labels=chunks_iterator, col_title='Chunks',
                                     ax=time_ax, cmap="BuGn", cbarlabel="elapsed time [s]")
        # compute speeds heatmap
        speed_im, speed_cbar = heatmap(speeds_avg,
                                       row_labels=chunk_sizes_iterator, row_title='Chunk sizes',
                                       col_labels=chunks_iterator, col_title='Chunks',
                                       ax=speed_ax, cmap="BuGn", cbarlabel="speed [it/s]")

        # compute elapsed times and speeds masks
        time_mask = [[np.isnan(c) for c in cs] for cs in elapsed_times_avg]
        speed_mask = [[np.isnan(c) for c in cs] for cs in speeds_avg]

        # annotate elapsed times and speeds heatmaps given the corresponding masks
        time_texts = annotate_heatmap(time_im, elapsed_times_std, time_mask,
                                      ann_format='time', fontsize='x-small')
        speed_texts = annotate_heatmap(speed_im, speeds_std, speed_mask,
                                       ann_format='speed', fontsize='x-small')

        # adjust subplot params so that the subplots fit into the figure area
        time_fig.tight_layout()
        speed_fig.tight_layout()

        # create a temporary directory
        with tempfile.TemporaryDirectory() as tmpdir:
            # save both elapsed times and speeds heatmaps to temporary files
            time_filename = os.path.join(tmpdir, 'times.png')
            speed_filename = os.path.join(tmpdir, 'speeds.png')
            time_fig.savefig(time_filename)
            speed_fig.savefig(speed_filename)

            # log temporary files as artifacts
            mlflow.log_artifact(time_filename)
            mlflow.log_artifact(speed_filename)
def benchmark_t(args):
    from plaidbench import cli
    results = cli.plaidbench(args)
    print('results.... ', type(results), ' ', results)


devices = ['metal_amd', 'opencl_amd', 'metal_uhd', 'opencl_uhd', 'cpu']
small_networks = 'mobilenet|nasnet_mobile|imdb_lstm'.split('|')

for device in devices:
    for network in small_networks:
        name = f'TRIAL_{device}_{network}'
        if path.exists(getcwd() + '/' + name):
            continue
        popen(f'cp ~/{device}.json ~/.plaidml')
        with mlflow.start_run(experiment_id=1, run_name=name[6:]):
            x = Thread(target=benchmark_t,
                       kwargs={'args': [f'--results=./{name}', 'keras', network]})
            start_time = process_time()
            x.start()
            x.join()
            mlflow.log_metric('ttl_exec_time', process_time() - start_time)
            mlflow.log_param('device', device)
            mlflow.log_param('network', network)
            sleep(2)
            mlflow.log_artifact(getcwd() + '/' + name + '/result.json')
            with open(name + '/result.json') as json_file:
                result = json_load(json_file)
                mlflow.log_param('examples', result['examples'])
def main(params: dict):
    import mlflow
    logger = get_logger()
    print("start params={}".format(params))

    if is_full_data:
        df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
    else:
        df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle") \
            .sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    df = df[["user_id", "content_id", "content_type_id", "part",
             "user_answer", "answered_correctly"]]

    train_idx = []
    val_idx = []
    np.random.seed(0)
    for _, w_df in df[df["content_type_id"] == 0].groupby("user_id"):
        if np.random.random() < 0.1:
            # all val
            val_idx.extend(w_df.index.tolist())
        else:
            train_num = int(len(w_df) * 0.9)
            train_idx.extend(w_df[:train_num].index.tolist())
            val_idx.extend(w_df[train_num:].index.tolist())

    df["is_val"] = 0
    df["is_val"].loc[val_idx] = 1

    w_df = df[df["is_val"] == 0]
    w_df["group"] = (w_df.groupby("user_id")["user_id"].transform("count") -
                     w_df.groupby("user_id").cumcount()) // params["max_seq"]
    w_df["user_id"] = w_df["user_id"].astype(str) + "_" + w_df["group"].astype(str)

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config={("content_id", "content_type_id"): {"type": "category"},
                       "user_answer": {"type": "category"},
                       "part": {"type": "category"}},
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    group = ff_for_transformer.all_predict(w_df)
    del w_df
    gc.collect()

    n_skill = len(ff_for_transformer.embbed_dict[("content_id", "content_type_id")])
    print(group)
    dataset_train = SAKTDataset(group, n_skill=n_skill, max_seq=params["max_seq"])

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config={("content_id", "content_type_id"): {"type": "category"},
                       "user_answer": {"type": "category"},
                       "part": {"type": "category"}},
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    df["group"] = (df.groupby("user_id")["user_id"].transform("count") -
                   df.groupby("user_id").cumcount()) // params["max_seq"]
    group = ff_for_transformer.all_predict(df)
    dataset_val = SAKTDataset(group, is_test=True, n_skill=n_skill, max_seq=params["max_seq"])

    dataloader_train = DataLoader(dataset_train, batch_size=64, shuffle=True, num_workers=1)
    dataloader_val = DataLoader(dataset_val, batch_size=64, shuffle=False, num_workers=1)

    model = SAKTModel(n_skill, embed_dim=params["embed_dim"], max_seq=params["max_seq"])
    optimizer = torch.optim.Adam(model.parameters(), lr=params["lr"])
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train, dataloader_val,
                                              optimizer, criterion, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".format(
            epoch, loss, auc, auc_val))

    preds = []
    labels = []
    for d in tqdm(dataloader_val):
        x = d[0].to(device).long()
        qa = d[1].to(device).long()
        target_id = d[2].to(device).long()
        part = d[3].to(device).long()
        label = d[4].to(device).long()

        output, atten_weight = model(x, qa, target_id, part)
        preds.extend(torch.nn.Sigmoid()(output[:, -1]).view(-1).data.cpu().numpy().tolist())
        labels.extend(label[:, -1].view(-1).data.cpu().numpy())

    df_oof = pd.DataFrame()
    df_oof["row_id"] = df.loc[val_idx].index
    df_oof["predict"] = preds
    df_oof["target"] = df.loc[val_idx]["answered_correctly"].values

    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)

    df_oof2 = pd.read_csv("../output/ex_172/20201202080625/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_transformer = roc_auc_score(df_oof2["target"].values, df_oof2["predict"].values)
    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("single transformer: {:.4f}".format(auc_transformer))
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values,
                            df_oof2["predict_lgbm"].values * (1 - r) +
                            df_oof2["predict"].values * r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))
        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))

    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))
        mlflow.log_param("count_row", len(df))
        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.log_metric("auc_lgbm", auc_lgbm)
        mlflow.log_metric("auc_ensemble", max_auc)
        mlflow.log_metric("ensemble_nn_ratio", max_nn_ratio)
        mlflow.end_run()

    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")

    del df, dataset_train, dataset_val, dataloader_train, dataloader_val
    gc.collect()
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)

    if is_make_feature_factory:
        ff_for_transformer = FeatureFactoryForTransformer(
            column_config={("content_id", "content_type_id"): {"type": "category"},
                           "user_answer": {"type": "category"},
                           "part": {"type": "category"}},
            dict_path="../feature_engineering/",
            sequence_length=params["max_seq"],
            logger=logger)
        df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f:
            pickle.dump(ff_for_transformer, f)
def main(params: dict, output_dir: str):
    import mlflow
    print("start params={}".format(params))
    model_id = "train_0"
    logger = get_logger()

    # df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
    df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle") \
        .sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df["prior_question_had_explanation"] = df["prior_question_had_explanation"].fillna(-1)

    column_config = {
        ("content_id", "content_type_id"): {"type": "category"},
        "user_answer": {"type": "leakage_feature"},
        "answered_correctly": {"type": "leakage_feature"},
        "part": {"type": "category"},
        "prior_question_elapsed_time_bin300": {"type": "category"},
        "duration_previous_content_bin300": {"type": "category"},
        "prior_question_had_explanation": {"type": "category"},
        "rating_diff_content_user_id": {"type": "numeric"},
        "task_container_id_bin300": {"type": "category"},
        "previous_answer_index_content_id": {"type": "category"},
        "previous_answer_content_id": {"type": "category"}
    }

    if not load_pickle or is_debug:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = DurationPreviousContent()
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"]["UserContentRateEncoder"] = \
            UserContentRateEncoder(rate_func="elo", column="user_id")
        feature_factory_dict["user_id"]["PreviousAnswer2"] = \
            PreviousAnswer2(groupby="user_id", column="content_id", is_debug=is_debug,
                            model_id=model_id, n=300)
        feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                        logger=logger,
                                                        split_num=1,
                                                        model_id="train_0",
                                                        load_feature=not is_debug,
                                                        save_feature=not is_debug)
        print("all_predict")
        df = feature_factory_manager.all_predict(df)
        df["task_container_id_bin300"] = [x if x < 300 else 300 for x in df["task_container_id"]]
        df = df[["user_id", "content_id", "content_type_id", "part", "user_answer",
                 "answered_correctly", "prior_question_elapsed_time_bin300",
                 "duration_previous_content_bin300", "prior_question_had_explanation",
                 "rating_diff_content_user_id", "task_container_id_bin300",
                 "previous_answer_index_content_id", "previous_answer_content_id"]]
        print(df.head(10))

    print("data preprocess")

    train_idx = []
    val_idx = []
    np.random.seed(0)
    for _, w_df in df[df["content_type_id"] == 0].groupby("user_id"):
        if np.random.random() < 0.01:
            # all val
            val_idx.extend(w_df.index.tolist())
        else:
            train_num = int(len(w_df) * 0.95)
            train_idx.extend(w_df[:train_num].index.tolist())
            val_idx.extend(w_df[train_num:].index.tolist())

    ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    ff_for_transformer.make_dict(df=df)
    n_skill = len(ff_for_transformer.embbed_dict[("content_id", "content_type_id")])

    if not load_pickle or is_debug:
        df["is_val"] = 0
        df["is_val"].loc[val_idx] = 1

        w_df = df[df["is_val"] == 0]
        w_df["group"] = (w_df.groupby("user_id")["user_id"].transform("count") -
                         w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(str) + "_" + w_df["group"].astype(str)

        group = ff_for_transformer.all_predict(w_df)
        dataset_train = SAKTDataset(group, n_skill=n_skill, max_seq=params["max_seq"])

        del w_df
        gc.collect()

    ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    if not load_pickle or is_debug:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group, is_test=True, n_skill=n_skill,
                                  max_seq=params["max_seq"])

    os.makedirs("../input/feature_engineering/model155", exist_ok=True)
    if not is_debug and not load_pickle:
        with open("../input/feature_engineering/model155/train.pickle", "wb") as f:
            pickle.dump(dataset_train, f)
        with open("../input/feature_engineering/model155/val.pickle", "wb") as f:
            pickle.dump(dataset_val, f)
    if not is_debug and load_pickle:
        with open("../input/feature_engineering/model155/train.pickle", "rb") as f:
            dataset_train = pickle.load(f)
        with open("../input/feature_engineering/model155/val.pickle", "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")

    dataloader_train = DataLoader(dataset_train, batch_size=params["batch_size"],
                                  shuffle=True, num_workers=1)
    dataloader_val = DataLoader(dataset_val, batch_size=params["batch_size"],
                                shuffle=False, num_workers=1)

    model = SAKTModel(n_skill, embed_dim=params["embed_dim"], max_seq=params["max_seq"],
                      dropout=dropout, cont_emb=params["cont_emb"])

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    optimizer = AdaBelief(optimizer_grouped_parameters, lr=params["lr"])
    num_train_optimization_steps = int(len(dataloader_train) * 20)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=params["num_warmup_steps"],
                                                num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train, dataloader_val,
                                              optimizer, criterion, scheduler, epoch, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".format(
            epoch, loss, auc, auc_val))

    preds = []
    labels = []
    with torch.no_grad():
        for item in tqdm(dataloader_val):
            x = item["x"].to(device).long()
            target_id = item["target_id"].to(device).long()
            part = item["part"].to(device).long()
            label = item["label"].to(device).float()
            elapsed_time = item["elapsed_time"].to(device).long()
            duration_previous_content = item["duration_previous_content"].to(device).long()
            prior_question_had_explanation = item["prior_q"].to(device).long()
            user_answer = item["user_answer"].to(device).long()
            rate_diff = item["rate_diff"].to(device).float()
            container_id = item["container_id"].to(device).long()
            prev_ans_idx = item["previous_answer_index_content_id"].to(device).long()
            prev_answer_content_id = item["previous_answer_content_id"].to(device).long()

            output = model(x, target_id, part, elapsed_time,
                           duration_previous_content, prior_question_had_explanation,
                           user_answer, rate_diff, container_id,
                           prev_ans_idx, prev_answer_content_id)

            preds.extend(torch.nn.Sigmoid()(output[:, -1]).view(-1).data.cpu().numpy().tolist())
            labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist())

    auc_transformer = roc_auc_score(labels, preds)
    print("single transformer: {:.4f}".format(auc_transformer))

    df_oof = pd.DataFrame()
    # df_oof["row_id"] = df.loc[val_idx].index
    print(len(dataloader_val))
    print(len(preds))
    df_oof["predict"] = preds
    df_oof["target"] = labels

    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values,
                            df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))

        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """
    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))
        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.end_run()

    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")

    del model
    torch.cuda.empty_cache()
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)

    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = \
            DurationPreviousContent(is_partial_fit=True)
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                        logger=logger,
                                                        split_num=1,
                                                        model_id="all",
                                                        load_feature=not is_debug,
                                                        save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                          dict_path="../feature_engineering/",
                                                          sequence_length=params["max_seq"],
                                                          logger=logger)
        df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f:
            pickle.dump(ff_for_transformer, f)
def test_search_runs():
    mlflow.set_experiment("exp-for-search")
    # Create a run and verify that the current active experiment is the one we just set
    logged_runs = {}
    with mlflow.start_run() as active_run:
        logged_runs["first"] = active_run.info.run_id
        mlflow.log_metric("m1", 0.001)
        mlflow.log_metric("m2", 0.002)
        mlflow.log_metric("m1", 0.002)
        mlflow.log_param("p1", "a")
        mlflow.set_tag("t1", "first-tag-val")
    with mlflow.start_run() as active_run:
        logged_runs["second"] = active_run.info.run_id
        mlflow.log_metric("m1", 0.008)
        mlflow.log_param("p2", "aa")
        mlflow.set_tag("t2", "second-tag-val")

    def verify_runs(runs, expected_set):
        assert set([r.info.run_id for r in runs]) == set([logged_runs[r] for r in expected_set])

    experiment_id = MlflowClient().get_experiment_by_name("exp-for-search").experiment_id

    # 2 runs in this experiment
    assert len(MlflowClient().list_run_infos(experiment_id, ViewType.ACTIVE_ONLY)) == 2

    # 2 runs that have metric "m1" > 0.001
    runs = MlflowClient().search_runs([experiment_id], "metrics.m1 > 0.0001")
    verify_runs(runs, ["first", "second"])

    # 1 run that has metric "m1" > 0.002
    runs = MlflowClient().search_runs([experiment_id], "metrics.m1 > 0.002")
    verify_runs(runs, ["second"])

    # no runs with metric "m1" > 0.1
    runs = MlflowClient().search_runs([experiment_id], "metrics.m1 > 0.1")
    verify_runs(runs, [])

    # 1 run with metric "m2" > 0
    runs = MlflowClient().search_runs([experiment_id], "metrics.m2 > 0")
    verify_runs(runs, ["first"])

    # 1 run each with param "p1" and "p2"
    runs = MlflowClient().search_runs([experiment_id], "params.p1 = 'a'", ViewType.ALL)
    verify_runs(runs, ["first"])
    runs = MlflowClient().search_runs([experiment_id], "params.p2 != 'a'", ViewType.ALL)
    verify_runs(runs, ["second"])
    runs = MlflowClient().search_runs([experiment_id], "params.p2 = 'aa'", ViewType.ALL)
    verify_runs(runs, ["second"])

    # 1 run each with tag "t1" and "t2"
    runs = MlflowClient().search_runs([experiment_id], "tags.t1 = 'first-tag-val'", ViewType.ALL)
    verify_runs(runs, ["first"])
    runs = MlflowClient().search_runs([experiment_id], "tags.t2 != 'qwerty'", ViewType.ALL)
    verify_runs(runs, ["second"])
    runs = MlflowClient().search_runs([experiment_id], "tags.t2 = 'second-tag-val'", ViewType.ALL)
    verify_runs(runs, ["second"])

    # delete the "first" run
    MlflowClient().delete_run(logged_runs["first"])
    runs = MlflowClient().search_runs([experiment_id], "params.p1 = 'a'", ViewType.ALL)
    verify_runs(runs, ["first"])
    runs = MlflowClient().search_runs([experiment_id], "params.p1 = 'a'", ViewType.DELETED_ONLY)
    verify_runs(runs, ["first"])
    runs = MlflowClient().search_runs([experiment_id], "params.p1 = 'a'", ViewType.ACTIVE_ONLY)
    verify_runs(runs, [])
    def test_requestor(self, request):
        response = mock.MagicMock
        response.status_code = 200
        response.text = '{}'
        request.return_value = response

        creds = MlflowHostCreds('https://hello')
        store = RestStore(lambda: creds)

        user_name = "mock user"
        source_name = "rest test"

        source_name_patch = mock.patch(
            "mlflow.tracking.context.default_context._get_source_name",
            return_value=source_name)
        source_type_patch = mock.patch(
            "mlflow.tracking.context.default_context._get_source_type",
            return_value=SourceType.LOCAL)
        with mock.patch('mlflow.utils.rest_utils.http_request') as mock_http, \
                mock.patch('mlflow.tracking._tracking_service.utils._get_store',
                           return_value=store), \
                mock.patch('mlflow.tracking.context.default_context._get_user',
                           return_value=user_name), \
                mock.patch('time.time', return_value=13579), \
                source_name_patch, source_type_patch:
            with mlflow.start_run(experiment_id="43"):
                cr_body = message_to_json(CreateRun(
                    experiment_id="43", user_id=user_name, start_time=13579000,
                    tags=[ProtoRunTag(key='mlflow.source.name', value=source_name),
                          ProtoRunTag(key='mlflow.source.type', value='LOCAL'),
                          ProtoRunTag(key='mlflow.user', value=user_name)]))
                expected_kwargs = self._args(creds, "runs/create", "POST", cr_body)

                assert mock_http.call_count == 1
                actual_kwargs = mock_http.call_args[1]

                # Test the passed tag values separately from the rest of the request.
                # Tag order is inconsistent on Python 2 and 3, but the order does not matter.
                expected_tags = expected_kwargs['json'].pop('tags')
                actual_tags = actual_kwargs['json'].pop('tags')
                assert (sorted(expected_tags, key=lambda t: t['key'])
                        == sorted(actual_tags, key=lambda t: t['key']))
                assert expected_kwargs == actual_kwargs

        with mock.patch('mlflow.utils.rest_utils.http_request') as mock_http:
            store.log_param("some_uuid", Param("k1", "v1"))
            body = message_to_json(LogParam(run_uuid="some_uuid", run_id="some_uuid",
                                            key="k1", value="v1"))
            self._verify_requests(mock_http, creds, "runs/log-parameter", "POST", body)

        with mock.patch('mlflow.utils.rest_utils.http_request') as mock_http:
            store.set_experiment_tag("some_id", ExperimentTag("t1", "abcd" * 1000))
            body = message_to_json(SetExperimentTag(experiment_id="some_id",
                                                    key="t1", value="abcd" * 1000))
            self._verify_requests(mock_http, creds, "experiments/set-experiment-tag",
                                  "POST", body)

        with mock.patch('mlflow.utils.rest_utils.http_request') as mock_http:
            store.set_tag("some_uuid", RunTag("t1", "abcd" * 1000))
            body = message_to_json(SetTag(run_uuid="some_uuid", run_id="some_uuid",
                                          key="t1", value="abcd" * 1000))
            self._verify_requests(mock_http, creds, "runs/set-tag", "POST", body)

        with mock.patch('mlflow.utils.rest_utils.http_request') as mock_http:
            store.delete_tag("some_uuid", "t1")
            body = message_to_json(DeleteTag(run_id="some_uuid", key="t1"))
            self._verify_requests(mock_http, creds, "runs/delete-tag", "POST", body)

        with mock.patch('mlflow.utils.rest_utils.http_request') as mock_http:
            store.log_metric("u2", Metric("m1", 0.87, 12345, 3))
            body = message_to_json(LogMetric(run_uuid="u2", run_id="u2", key="m1",
                                             value=0.87, timestamp=12345, step=3))
            self._verify_requests(mock_http, creds, "runs/log-metric", "POST", body)

        with mock.patch('mlflow.utils.rest_utils.http_request') as mock_http:
            metrics = [Metric("m1", 0.87, 12345, 0),
                       Metric("m2", 0.49, 12345, -1),
                       Metric("m3", 0.58, 12345, 2)]
            params = [Param("p1", "p1val"), Param("p2", "p2val")]
            tags = [RunTag("t1", "t1val"), RunTag("t2", "t2val")]
            store.log_batch(run_id="u2", metrics=metrics, params=params, tags=tags)
            metric_protos = [metric.to_proto() for metric in metrics]
            param_protos = [param.to_proto() for param in params]
            tag_protos = [tag.to_proto() for tag in tags]
            body = message_to_json(LogBatch(run_id="u2", metrics=metric_protos,
                                            params=param_protos, tags=tag_protos))
            self._verify_requests(mock_http, creds, "runs/log-batch", "POST", body)

        with mock.patch('mlflow.utils.rest_utils.http_request') as mock_http:
            store.delete_run("u25")
            self._verify_requests(mock_http, creds, "runs/delete", "POST",
                                  message_to_json(DeleteRun(run_id="u25")))

        with mock.patch('mlflow.utils.rest_utils.http_request') as mock_http:
            store.restore_run("u76")
            self._verify_requests(mock_http, creds, "runs/restore", "POST",
                                  message_to_json(RestoreRun(run_id="u76")))

        with mock.patch('mlflow.utils.rest_utils.http_request') as mock_http:
            store.delete_experiment("0")
            self._verify_requests(mock_http, creds, "experiments/delete", "POST",
                                  message_to_json(DeleteExperiment(experiment_id="0")))

        with mock.patch('mlflow.utils.rest_utils.http_request') as mock_http:
            store.restore_experiment("0")
            self._verify_requests(mock_http, creds, "experiments/restore", "POST",
                                  message_to_json(RestoreExperiment(experiment_id="0")))

        with mock.patch('mlflow.utils.rest_utils.http_request') as mock_http:
            response = mock.MagicMock
            response.text = '{"runs": ["1a", "2b", "3c"], "next_page_token": "67890fghij"}'
            mock_http.return_value = response
            result = store.search_runs(["0", "1"], "params.p1 = 'a'", ViewType.ACTIVE_ONLY,
                                       max_results=10, order_by=["a"], page_token="12345abcde")
            expected_message = SearchRuns(experiment_ids=["0", "1"],
                                          filter="params.p1 = 'a'",
                                          run_view_type=ViewType.to_proto(ViewType.ACTIVE_ONLY),
                                          max_results=10, order_by=["a"],
                                          page_token="12345abcde")
            self._verify_requests(mock_http, creds, "runs/search", "POST",
                                  message_to_json(expected_message))
            assert result.token == "67890fghij"
def test_log_image_numpy_raises_exception_for_invalid_array_data_type():
    import numpy as np

    with mlflow.start_run(), pytest.raises(TypeError, match="Invalid array data type"):
        mlflow.log_image(np.tile("a", (1, 1, 3)), "image.png")
def valid_rf(race_results_df_processed_valid, model_rf, parameters):
    # mlflow
    print('FILE_DIR: ' + FILE_DIR)
    mlflow.set_tracking_uri(FILE_DIR + '/../../../logs/mlruns/')
    mlflow.set_experiment('forecast_keiba_valid')
    run_info = mlflow.start_run()
    mlflow.set_tag('model', 'rf')  # this function validates model_rf (the original tag said 'lr')

    # Prepare the validation data
    # Explanatory variables
    X_valid = race_results_df_processed_valid.drop(['rank'], axis=1)
    # Target variable
    y_valid = race_results_df_processed_valid['rank']

    # Run inference
    y_valid_pred = model_rf.predict(X_valid)

    # Shape the results for aggregation
    valid_results_df = pd.DataFrame({'pred': y_valid_pred, 'actual': y_valid})
    race_id_list = list(set(list(valid_results_df.index)))
    valid_results_list = valid_results_df.reset_index().values.tolist()
    # Shuffle
    random.shuffle(valid_results_list)

    # Tally (exacta)
    correct_count = 0
    for race_id in race_id_list:
        pred_cnt_by_race = 0
        cnt_by_race = 0
        for rank in [1]:
            for i in range(len(valid_results_list)):
                # among rows for this race id, the horse predicted to finish in position {rank}
                if valid_results_list[i][0] == race_id and valid_results_list[i][1] == rank:
                    pred_cnt_by_race += 1
                    if pred_cnt_by_race <= 1 and (valid_results_list[i][2] == 1):
                        cnt_by_race += 1
        if cnt_by_race == 1:
            correct_count += 1
    acc_exacta_1 = correct_count / 100  # the validation set is assumed to contain 100 races
    print('acc_exacta_1: ' + str(acc_exacta_1))

    # Tally (quinella)
    correct_count = 0
    for race_id in race_id_list:
        pred_cnt_by_race = 0
        cnt_by_race = 0
        for rank in [1, 2]:
            for i in range(len(valid_results_list)):
                # among rows for this race id, the horse predicted to finish in position {rank}
                if valid_results_list[i][0] == race_id and valid_results_list[i][1] == rank:
                    pred_cnt_by_race += 1
                    if pred_cnt_by_race <= 2 and (valid_results_list[i][2] == 1
                                                  or valid_results_list[i][2] == 2):
                        cnt_by_race += 1
        if cnt_by_race == 2:
            correct_count += 1
    acc_quinella_2 = correct_count / 100
    print('acc_quinella_2: ' + str(acc_quinella_2))

    # Tally (trio)
    correct_count = 0
    for race_id in race_id_list:
        pred_cnt_by_race = 0
        cnt_by_race = 0
        for rank in [1, 2, 3]:
            for i in range(len(valid_results_list)):
                # among rows for this race id, the horse predicted to finish in position {rank}
                if valid_results_list[i][0] == race_id and valid_results_list[i][1] == rank:
                    pred_cnt_by_race += 1
                    if pred_cnt_by_race <= 3 and (valid_results_list[i][2] == 1
                                                  or valid_results_list[i][2] == 2
                                                  or valid_results_list[i][2] == 3):
                        cnt_by_race += 1
        if cnt_by_race == 3:
            correct_count += 1
    acc_trio_3 = correct_count / 100
    print('acc_trio_3: ' + str(acc_trio_3))

    mlflow.log_metric("acc_exacta_1", acc_exacta_1)
    mlflow.log_metric("acc_quinella_2", acc_quinella_2)
    mlflow.log_metric("acc_trio_3", acc_trio_3)

    # Notify
    if parameters['is_notify']:
        run_result_dict = mlflow.get_run(run_info.info.run_id).to_dictionary()
        run_result_str = json.dumps(run_result_dict, indent=4)
        conf_paths = [FILE_DIR + "/../../../conf/base",
                      FILE_DIR + "/../../../conf/local"]
        conf_loader = ConfigLoader(conf_paths)
        credentials = conf_loader.get("credentials*", "credentials*/**")
        token = credentials['dev_line']['access_token']
        url = "https://notify-api.line.me/api/notify"
        headers = {"Authorization": "Bearer " + token}
        payload = {"message": "model_rf" + run_result_str}
        requests.post(url, headers=headers, data=payload)

    mlflow.end_run()
def test_log_image_numpy_raises_exception_for_invalid_channel_length():
    import numpy as np

    with mlflow.start_run(), pytest.raises(ValueError, match="Invalid channel length"):
        mlflow.log_image(np.zeros((1, 1, 5), dtype=np.uint8), "image.png")
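# For contrast with the failure cases above, a minimal sketch of a call that
# mlflow.log_image accepts: a uint8 array of shape HxWx3 is a valid image.
# The test name and file name below are illustrative, not from the suite.
def test_log_image_numpy_accepts_valid_uint8_array():
    import numpy as np

    with mlflow.start_run():
        mlflow.log_image(np.zeros((1, 1, 3), dtype=np.uint8), "valid_image.png")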
from random import random, randint
from sklearn.ensemble import RandomForestRegressor

import mlflow
import mlflow.sklearn

with mlflow.start_run(run_name="YOUR_RUN_NAME") as run:
    params = {"n_estimators": 5, "random_state": 42}
    sk_learn_rfr = RandomForestRegressor(**params)

    # Log parameters and metrics using the MLflow APIs
    mlflow.log_params(params)
    mlflow.log_param("param_1", randint(0, 100))
    mlflow.log_metrics({"metric_1": random(), "metric_2": random() + 1})

    # Log the sklearn model and register as version 1
    mlflow.sklearn.log_model(
        sk_model=sk_learn_rfr,
        artifact_path="sklearn-model",
        registered_model_name="sk-learn-random-forest-reg-model"
    )
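# A short follow-up sketch: once registered, version 1 of the model can be loaded
# back through the registry's "models:/<name>/<version>" URI scheme (assuming the
# same tracking/registry store as the run above):
loaded_model = mlflow.sklearn.load_model("models:/sk-learn-random-forest-reg-model/1")
print(type(loaded_model))  # sklearn.ensemble.RandomForestRegressor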
def test_get_artifact_uri_with_artifact_path_unspecified_returns_artifact_root_dir():
    with mlflow.start_run() as active_run:
        assert mlflow.get_artifact_uri(artifact_path=None) == active_run.info.artifact_uri
def main():
    torch.manual_seed(42)
    random.seed(42)
    np.random.seed(42)
    torch.backends.cudnn.deterministic = True

    # lr = 0.01
    n_epochs = 40
    batch_size = 64
    # device = "cpu"
    device = "cuda:0"

    mlflow.start_run()
    mlflow.log_param("n_epochs", n_epochs)
    mlflow.log_param("batch_size", batch_size)
    mlflow.log_param("device", device)
    run_start_time = mlflow.active_run().info.start_time
    readable_start_time = time.strftime('%Y-%m-%d_%H:%M:%S',
                                        time.localtime(run_start_time / 1000))

    trans = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize((0.5,), (1.0,))])
    train_set = CIFAR10(root="data/", train=True, transform=trans, download=True)
    test_set = CIFAR10(root="data/", train=False, transform=trans, download=True)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True,
                              pin_memory=False, num_workers=4)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False,
                             pin_memory=False, num_workers=4)

    model = Net()
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())
    best_lr, summary = find_lr_supervised(model, criterion, optimizer, train_loader,
                                          1e-9, 1, device=device)
    # torch_fuze.utils.set_lr(optimizer, best_lr * 0.1)
    torch.manual_seed(42)
    random.seed(42)
    np.random.seed(42)

    # plt.plot(np.log10(summary.learning_rates), summary.losses)
    # plt.plot(np.log10(summary.learning_rates), summary.smoothed_losses)
    # plt.draw()
    # plt.pause(10)

    # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[5, 8], gamma=0.3)
    # scheduler = OneCycleLR(optimizer, best_lr * 0.01, best_lr * 0.8, 1e-6, n_total_epochs=n_epochs, cycle_fraction=0.8)
    # scheduler = OneCycleLR(optimizer, best_lr * 0.01, best_lr * 0.5, 1e-6, n_total_epochs=n_epochs, cycle_fraction=0.8)
    # scheduler = OneCycleLR(optimizer, best_lr * 0.01, best_lr * 0.1, 1e-6, n_total_epochs=n_epochs, cycle_fraction=0.8)
    scheduler = None

    metrics = OrderedDict([
        ("loss", criterion),
        ("acc", torch_fuze.metrics.Accuracy())
    ])
    callbacks = [
        torch_fuze.callbacks.ProgressCallback(),
        torch_fuze.callbacks.BestModelSaverCallback(model, "checkpoints/best.pt",
                                                    metric_name="acc", lower_is_better=False),
        torch_fuze.callbacks.TensorBoardXCallback(f"logs/{readable_start_time}/",
                                                  remove_old_logs=True),
        torch_fuze.callbacks.MLFlowCallback(
            metrics_to_track={"valid_loss", "valid_acc", "train_acc"},
            lowest_metrics_to_track={"valid_loss"},
            highest_metrics_to_track={"valid_acc"},
            files_to_save_at_every_batch={"checkpoints/best.pt"})
    ]
    trainer = torch_fuze.SupervisedTrainer(model, criterion, device)
    trainer.run(train_loader, test_loader, optimizer, scheduler=scheduler,
                n_epochs=n_epochs, callbacks=callbacks, metrics=metrics)
# COMMAND ----------

print(n_estimators)
print(max_depth)

rfHyperOpt = RandomForestClassifier(labelCol="label", featuresCol="features",
                                    maxDepth=max_depth, numTrees=n_estimators,
                                    featureSubsetStrategy="all", seed=42, maxBins=100)
rfHyperOptFitted = rfHyperOpt.fit(train_data)
loss = 1 - evaluator.evaluate(rfHyperOptFitted.transform(test_data))  # 1 - f-score

# COMMAND ----------

import mlflow
import mlflow.spark

with mlflow.start_run(run_id=run_id, experiment_id=experiment_id) as run:
    mlflow.spark.log_model(rfHyperOptFitted, "model")

# COMMAND ----------

dbutils.notebook.exit(str(loss))

# COMMAND ----------
run_name = parser["run_name"] if mx.context.num_gpus() > 0 and using_cuda: GPU_COUNT = mx.context.num_gpus() else: GPU_COUNT = 0 # window 운영체제에서 freeze support 안나오게 하려면, 아래와 같이 __name__ == "__main__" 에 해줘야함. if __name__ == "__main__": print("\n실행 경로 : " + __file__) if training: if using_mlflow: ml.set_tracking_uri("./mlruns") # mlruns가 기본 트래킹이다. ex_id = ml.set_experiment("CENTER_" + "RES" + str(base)) ml.start_run(run_name=run_name, experiment_id=ex_id) ml.log_param("height", input_size[0]) ml.log_param("width", input_size[1]) ml.log_param("pretrained_base", pretrained_base) ml.log_param("train dataset path", train_dataset_path) ml.log_param("valid dataset path", valid_dataset_path) ml.log_param("test dataset path", test_dataset_path) ml.log_param("epoch", epoch) ml.log_param("batch size", batch_size) ml.log_param("multiscale", multiscale) ml.log_param("data augmentation", data_augmentation) ml.log_param("optimizer", optimizer) ml.log_param("learning rate", learning_rate) ml.log_param("decay lr", decay_lr) train.run(mean=image_mean,
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# mlflow.set_tracking_uri("http://training.itu.dk:5000/")
# mlflow.set_experiment("Hermes Demo")
mlflow.sklearn.autolog()

# num_samples is assumed to be defined earlier in the original script
# (e.g. parsed from the command line).


def get_ys(xs):
    # cubic signal plus Gaussian noise
    signal = -0.1 * xs**3 + xs**2 - 5 * xs - 5
    noise = np.random.normal(0, 200, (len(xs), 1))
    return signal + noise


X = np.random.uniform(-20, 20, num_samples).reshape((num_samples, 1))
y = get_ys(X)

poly_params = {
    'Poly__degree': range(1, 8),
}

mlflow.end_run()
with mlflow.start_run():
    mlflow.log_param("num_samples", num_samples)
    model = Pipeline([
        ("Poly", PolynomialFeatures()),
        ("LinReg", LinearRegression())
    ])
    gridsearch = GridSearchCV(model, poly_params, scoring="r2")
    gridsearch.fit(X, y)
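# --- Hedged sketch (not from the original source): if autolog were unavailable
# or disabled, the grid-search outcome could be logged manually at the end of the
# `with mlflow.start_run():` block above. These mlflow calls exist in the public
# fluent API; the parameter/metric/artifact names are our own choices.
mlflow.log_param("best_degree", gridsearch.best_params_["Poly__degree"])
mlflow.log_metric("best_cv_r2", gridsearch.best_score_)
mlflow.sklearn.log_model(gridsearch.best_estimator_, "best_model")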
import mlflow
from click.testing import CliRunner

# list_run is the click command under test, imported from the project's CLI module.


def test_list_run():
    with mlflow.start_run(run_name='apple'):
        pass
    result = CliRunner().invoke(list_run, ["--experiment-id", "0"])
    assert 'apple' in result.output
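# --- Hedged sketch (not from the original source): tests like the one above are
# usually pointed at an isolated tracking store so runs from earlier tests don't
# leak into the assertion. A pytest tmp_path fixture backing a throwaway file
# store is one common pattern; the fixture name here is our own.
import pytest

@pytest.fixture(autouse=True)
def tmp_tracking_uri(tmp_path):
    # direct all runs in this test module to a temporary file store
    mlflow.set_tracking_uri(f"file://{tmp_path}/mlruns")
    yield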
import time

import mlflow
from mlflow import log_metric, log_param

if __name__ == "__main__":
    mlflow.set_experiment("First")
    with mlflow.start_run():
        # Log a parameter (key-value pair)
        log_param("param1", 5)
        # Log metrics; metrics can be updated throughout the run
        for i in range(200):
            time.sleep(0.1)
            log_metric("foo1", 1 * i)
            log_metric("foo2", 2 * i)
            # foo3 .. foo16 all log the same 3 * i series
            for k in range(3, 17):
                log_metric("foo{}".format(k), 3 * i)
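# --- Hedged alternative sketch (not from the original source): mlflow.log_metrics
# batches many metrics into a single call and accepts an explicit step, which cuts
# round-trips to the tracking server compared to one log_metric call per key.
import mlflow

with mlflow.start_run():
    for i in range(200):
        mlflow.log_metrics({f"foo{k}": 3 * i for k in range(3, 17)}, step=i)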
import json
import os
import sys
import time
from collections import defaultdict
from copy import deepcopy

import mlflow
import numpy as np
import psutil
import torch

# import_modules, get_generator, Dataset, device, logger and the MIN_/MAX_
# exponent constants are defined elsewhere in this module/project.


def gen3_eval(ds_path,  # path of the directory where to find the pre-processed dataset (containing .dat files)
              # path to a new or an existent (from a previous run) json file to be used to store run
              # evaluation elapsed times and speeds
              json_file_path,
              net_type='mtje',  # network to use
              batch_size=8192,  # how many samples per batch to load
              min_mul=1,  # minimum product between chunks and chunk_size to consider (in # of batches)
              max_mul=32,  # maximum product between chunks and chunk_size to consider (in # of batches)
              epochs=1,  # number of epochs to perform evaluation for
              training_n_samples=0,  # number of training samples to consider (used to access the right files)
              use_malicious_labels=1,  # whether or not (1/0) to use malware/benignware labels as a target
              use_count_labels=1,  # whether or not (1/0) to use the counts as an additional target
              use_tag_labels=1,  # whether or not (1/0) to use the tags as additional targets
              feature_dimension=2381,  # the input dimension of the model
              # if provided, seed random number generation with this value (default: None, no seeding)
              random_seed=None,
              # how many workers (threads) the dataloader should use (default: 0 -> use multiprocessing.cpu_count())
              workers=0):
    """ Evaluate generator alt3 speed while changing the values of the 'chunk_size' and 'chunks' variables.
    The evaluation is done for 'epochs' epochs for each combination of values. The resulting elapsed times
    and speeds are saved to a json file.

    Args:
        ds_path: Path of the directory where to find the pre-processed dataset (containing .dat files)
        json_file_path: Path to a new or an existent (from a previous run) json file to be used to store run
                        evaluation elapsed times and speeds
        net_type: Network to use between 'mtje', 'mtje_cosine', 'mtje_pairwise_distance' and 'aloha'
                  (default: 'mtje')
        batch_size: How many samples per batch to load (default: 8192)
        min_mul: Minimum product between chunks and chunk_size to consider (in # of batches) (default: 1)
        max_mul: Maximum product between chunks and chunk_size to consider (in # of batches) (default: 32)
        epochs: How many epochs to train for (default: 1)
        training_n_samples: Number of training samples to consider (used to access the right files)
                            (default: 0 -> all)
        use_malicious_labels: Whether or not (1/0) to use malware/benignware labels as a target (default: 1)
        use_count_labels: Whether or not (1/0) to use the counts as an additional target (default: 1)
        use_tag_labels: Whether or not (1/0) to use the tags as additional targets (default: 1)
        feature_dimension: The input dimension of the model (default: 2381 -> EMBER 2.0 feature size)
        random_seed: If provided, seed random number generation with this value (default: None -> no seeding)
        workers: How many workers (threads) the dataloader should use
                 (default: 0 -> use multiprocessing.cpu_count())
    """
    # dynamically import some classes, functions and variables from modules depending on the current net type
    Net, run_additional_params = import_modules(net_type=net_type)

    # start mlflow run
    with mlflow.start_run() as mlrun:
        if net_type.lower() != 'aloha':
            # joint embedding nets have use_tag_labels set to 1 by default
            use_tag_labels = 1

        # if workers has a value (it is not None) then convert it to int if it is > 0, otherwise set it to None
        workers = workers if workers is None else int(workers) if int(workers) > 0 else None

        if random_seed is not None:  # if a seed was provided
            logger.info(f"Setting random seed to {int(random_seed)}.")
            # set the seed for generating random numbers
            torch.manual_seed(int(random_seed))

        logger.info('Running generator alternative 3 cross evaluation..')

        # initialize chunk_sizes and chunks lists to contain all the powers of 2 between
        # 2^MIN_EXPONENT and 2^MAX_EXPONENT (included)
        chunk_sizes_iterator = [2**x for x in range(MIN_CHUNK_SIZE_EXPONENT, MAX_CHUNK_SIZE_EXPONENT + 1)]
        chunks_iterator = [2**x for x in range(MIN_CHUNKS_EXPONENT, MAX_CHUNKS_EXPONENT + 1)]

        # create json file parent directory if it did not already exist
        os.makedirs(os.path.dirname(json_file_path), exist_ok=True)

        # if the json file path provided points to an existing file, open it and load its content into the data dict
        if os.path.exists(json_file_path) and os.path.isfile(json_file_path):
            with open(json_file_path, 'r') as f:
                data = json.load(f)
        else:
            # otherwise initialize the data dict to contain two nested dicts ('elapsed_times' and 'speeds')
            # filled with None values
            data = {
                'elapsed_times': {str(cs): {str(c): None for c in chunks_iterator} for cs in chunk_sizes_iterator},
                'speeds': {str(cs): {str(c): None for c in chunks_iterator} for cs in chunk_sizes_iterator}
            }

        # for each chunk size
        for cs in chunk_sizes_iterator:
            # for each chunk number
            for c in chunks_iterator:
                # if the product between the current chunk size and number of chunks is outside the valid range,
                # skip evaluation
                if cs * c < batch_size * min_mul or cs * c > batch_size * max_mul:
                    continue

                # if the values in the data dict corresponding to the current combination of chunk size and
                # chunks number are None, initialize them to empty lists
                if data['elapsed_times'][str(cs)][str(c)] is None or data['speeds'][str(cs)][str(c)] is None:
                    data['elapsed_times'][str(cs)][str(c)] = []
                    data['speeds'][str(cs)][str(c)] = []

                # create Network model
                model = Net(use_malware=bool(use_malicious_labels),
                            use_counts=bool(use_count_labels),
                            use_tags=bool(use_tag_labels),
                            n_tags=len(Dataset.tags),
                            feature_dimension=feature_dimension,
                            layer_sizes=run_additional_params['layer_sizes'],
                            dropout_p=run_additional_params['dropout_p'],
                            activation_function=run_additional_params['activation_function'],
                            normalization_function=run_additional_params['normalization_function'])

                # select the optimizer given the additional run parameters read from the config file
                if run_additional_params['optimizer'].lower() == 'adam':
                    # use Adam optimizer on all the model parameters
                    opt = torch.optim.Adam(model.parameters(),
                                           lr=run_additional_params['lr'],
                                           weight_decay=run_additional_params['weight_decay'])
                elif run_additional_params['optimizer'].lower() == 'sgd':
                    # use stochastic gradient descent on all the model parameters
                    opt = torch.optim.SGD(model.parameters(),
                                          lr=run_additional_params['lr'],
                                          weight_decay=run_additional_params['weight_decay'],
                                          momentum=run_additional_params['momentum'])
                else:  # otherwise raise an error
                    raise ValueError('Unknown optimizer {}. Try "adam" or "sgd".'.format(
                        run_additional_params['optimizer']))

                # create train generator (a.k.a. Dataloader)
                generator = get_generator(ds_root=ds_path,
                                          batch_size=batch_size,
                                          chunk_size=cs,
                                          chunks=c,
                                          mode='train',
                                          num_workers=workers,
                                          n_samples=training_n_samples,
                                          use_malicious_labels=bool(use_malicious_labels),
                                          use_count_labels=bool(use_count_labels),
                                          use_tag_labels=bool(use_tag_labels))

                # get number of steps per epoch (# of total batches) from the generator
                steps_per_epoch = len(generator)

                # allocate model to the selected device
                model.to(device)

                # instantiate a new dictionary-like object called loss_histories
                loss_histories = defaultdict(list)
                # set the model mode to 'train'
                model.train()

                # initialize current elapsed times and speeds vectors with zeroes
                current_elapsed_times = [0.0 for _ in range(epochs)]
                current_speeds = [0.0 for _ in range(epochs)]

                # loop for the selected number of epochs
                for epoch in range(epochs):
                    # set current epoch start time
                    start_time = time.time()

                    # for all the training batches
                    for i, (features, labels) in enumerate(generator):
                        opt.zero_grad()  # clear old gradients from the last step

                        # copy current features and allocate them on the selected device (CPU or GPU)
                        features = deepcopy(features).to(device)

                        # perform a forward pass through the network
                        out = model(features)

                        # compute loss given the predicted output from the model
                        loss_dict = model.compute_loss(out, deepcopy(labels),
                                                       loss_wts=run_additional_params['loss_wts'])

                        # extract total loss
                        loss = loss_dict['total']

                        # compute gradients
                        loss.backward()

                        # update model parameters
                        opt.step()

                        # for all the calculated losses in loss_dict
                        for k in loss_dict.keys():
                            # if the loss is 'total' then append it to loss_histories['total'] after having
                            # detached it and moved it to the cpu
                            if k == 'total':
                                loss_histories[k].append(deepcopy(loss_dict[k].detach().cpu().item()))
                            # otherwise append the loss to loss_histories without having to detach it
                            else:
                                loss_histories[k].append(loss_dict[k])

                        # compute current epoch elapsed time (in seconds)
                        elapsed_time = time.time() - start_time

                        # create loss string with the current losses
                        loss_str = " ".join([f"{key} loss:{value:7.3f}" for key, value in loss_dict.items()])
                        loss_str += " | "
                        loss_str += " ".join(
                            [f"{key} mean:{np.mean(value):7.3f}" for key, value in loss_histories.items()])

                        # write on standard out the loss string + other information (elapsed time,
                        # predicted total epoch completion time, current mean speed and main memory usage)
                        sys.stdout.write(
                            '\r Epoch: {}/{} {}/{} '.format(epoch + 1, epochs, i + 1, steps_per_epoch)
                            + '[{}/{}, {:6.3f}it/s, RAM used: {:4.1f}%, chunk_size: {}, chunks: {}] '.format(
                                time.strftime("%H:%M:%S", time.gmtime(elapsed_time)),  # elapsed time
                                # predicted total epoch completion time
                                time.strftime("%H:%M:%S", time.gmtime(steps_per_epoch * elapsed_time / (i + 1))),
                                (i + 1) / elapsed_time,  # current mean speed (it/s)
                                psutil.virtual_memory().percent,  # percentage of main memory used
                                cs,  # chunk size
                                c)  # chunks number
                            + loss_str)  # append loss string

                        # flush standard output
                        sys.stdout.flush()

                        del features, labels  # to avoid weird references that lead to generator errors

                    print()

                    # save final elapsed time and speed for the current epoch
                    current_elapsed_times[epoch] = elapsed_time
                    current_speeds[epoch] = steps_per_epoch / elapsed_time

                # save the current chunk size - chunks combination elapsed times and speeds, extending the lists
                data['elapsed_times'][str(cs)][str(c)].extend(current_elapsed_times)
                data['speeds'][str(cs)][str(c)].extend(current_speeds)

                # save content of the data dict to the json file
                with open(json_file_path, 'w') as f:
                    json.dump(data, f)

        logger.info('...done')
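# --- Hedged usage sketch (not from the original source): reading the timing json
# produced by gen3_eval back in and picking the (chunk_size, chunks) combination
# with the highest mean speed. The file layout mirrors the data dict built above;
# the file path here is hypothetical.
import json

import numpy as np

with open('gen3_timings.json', 'r') as f:  # hypothetical path
    data = json.load(f)

# skip combinations that were never evaluated (None or empty lists)
best = max(((cs, c, float(np.mean(v)))
            for cs, row in data['speeds'].items()
            for c, v in row.items() if v),
           key=lambda t: t[2])
print('fastest combination: chunk_size={}, chunks={} ({:.2f} steps/s)'.format(*best))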