def check(
    train_model: tf.keras.models.Model,
    pred_model: tf.keras.models.Model,
    models_dir: tk.typing.PathLike,
    dataset: tk.data.Dataset = None,
    train_data_loader: tk.data.DataLoader = None,
    pred_data_loader: tk.data.DataLoader = None,
    save_mode: str = "hdf5",
):
    """Quick sanity check for a model.

    Args:
        train_model: Model used for training.
        pred_model: Model used for inference.
        models_dir: Directory where the outputs are saved.
        dataset: Data for the check (keep it small).
        train_data_loader: DataLoader for training.
        pred_data_loader: DataLoader for inference.
        save_mode: Save format (one of "hdf5", "saved_model", "onnx", "tflite").

    """
    models_dir = pathlib.Path(models_dir)

    # Print the summary
    tk.models.summary(train_model)

    # Plot the model graph
    tk.models.plot(train_model, models_dir / "model.png")

    # Check save/load (treated as OK as long as it does not crash)
    with tempfile.TemporaryDirectory() as tmpdir:
        save_path = pathlib.Path(tmpdir) / f"model.{save_mode}"
        tk.models.save(pred_model, save_path)
        pred_model = tk.models.load(save_path)

    # train_model.evaluate
    if dataset is not None and train_data_loader is not None:
        ds, steps = train_data_loader.get_ds(dataset, shuffle=True)
        logger.info(f"train_model.evaluate: {ds.element_spec} {steps=}")
        values = train_model.evaluate(ds, steps=steps, verbose=1)
        if len(train_model.metrics_names) == 1:
            evals = {train_model.metrics_names[0]: values}
        else:
            evals = dict(zip(train_model.metrics_names, values))
        logger.info(f"check.evaluate: {tk.evaluations.to_str(evals)}")

    # pred_model.predict
    if dataset is not None and pred_data_loader is not None:
        ds, steps = pred_data_loader.get_ds(dataset)
        logger.info(f"pred_model.evaluate: {ds.element_spec} {steps=}")
        pred = pred_model.predict(ds, steps=steps, verbose=1)
        if isinstance(pred, (list, tuple)):
            logger.info(f"check.predict: shape={[p.shape for p in pred]}")
        else:
            logger.info(f"check.predict: shape={pred.shape}")

    # train_model.fit
    if dataset is not None and train_data_loader is not None:
        ds, steps = train_data_loader.get_ds(dataset, shuffle=True)
        train_model.fit(ds, steps_per_epoch=steps, epochs=1, verbose=1)
def train_model(model: tf.keras.models.Model, epochs: int, directory: str, batch_size: int,
                model_id: int = 0, initial_epoch: int = 0):
    decay = CosineDecayRestarts(0.0001, 15)
    optimizer = optimizers.Adam(learning_rate=decay, amsgrad=True)

    # Pick the loss based on the output resolution of the model.
    resolution = model.output_shape[1:3]
    if min(resolution[0], resolution[1]) < 88:
        model.compile(optimizer=optimizer, loss=ms_ssim_low)
    else:
        model.compile(optimizer=optimizer, loss=ms_ssim)

    generator = DataGenerator(df_name='dataset.df', data_dir=directory, dim=resolution,
                              n_channels=3, batch_size=batch_size, shuffle=True)

    drawer = DrawCallback(model_id, generator.get_object_by_id(15009))
    check = ModelCheckpoint(filepath=f'style_nn_512x288_v4_{model_id}.h5')
    board = TensorBoard(log_dir="logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S"),
                        update_freq='epoch')

    model.fit(generator, callbacks=[drawer, check, board], workers=4,
              use_multiprocessing=False, epochs=epochs, initial_epoch=initial_epoch)
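# Hypothetical usage sketch for train_model above (not from the original source). It assumes
# the surrounding project already defines DataGenerator, DrawCallback, ms_ssim/ms_ssim_low and
# a model factory; build_style_model() is a made-up name for illustration only.
style_model = build_style_model()  # hypothetical factory returning a tf.keras.models.Model
train_model(style_model, epochs=100, directory='data/frames/', batch_size=8, model_id=0)

# To resume an interrupted run, pass the last finished epoch via initial_epoch:
train_model(style_model, epochs=150, directory='data/frames/', batch_size=8,
            model_id=0, initial_epoch=100)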
def train_nn(model: tensorflow.keras.models.Model, class_weights: np.ndarray,
             X_train, X_valid, y_train, y_valid):
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train,
              batch_size=64, epochs=100, verbose=2,
              validation_data=(X_valid, y_valid),
              class_weight=dict(enumerate(class_weights)))
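# A minimal sketch of how the class_weights argument for train_nn might be produced, using
# scikit-learn's compute_class_weight. This is an assumption: the original snippet does not
# show where the weights come from, and model/X_train/X_valid/y_train/y_valid are taken as
# already defined by the caller.
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(y_train),
                                     y=y_train)
train_nn(model, class_weights, X_train, X_valid, y_train, y_valid)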
def train(model: tf.keras.models.Model, data: Data, path: str, batch_size: int = 100,
          train_val_split: Optional[float] = 0.25, early_stopping: bool = True,
          patience: int = 10) -> dict:
    """
    Trains a model using a k-fold cross validation scheme.

    Returns all trained models and estimates of network accuracy.
    """
    returns = {"histories": [], "models": [], "metrics": []}
    for k, (train_val_set, test_set) in enumerate(data.folds):
        print("Fitting model to fold {}".format(k))
        fold_path = os.path.join(path, str(k))  # one checkpoint directory per fold
        # Split the fold's training portion into train/validation sets
        # (sklearn ordering: X_train, X_val, y_train, y_val).
        train_x, val_x, train_y, val_y = train_test_split(
            train_val_set.x, train_val_set.y, test_size=train_val_split)
        history = model.fit(train_x, train_y,
                            epochs=2000,
                            batch_size=batch_size,
                            validation_data=(val_x, val_y),
                            callbacks=_create_callbacks(
                                fold_path,
                                early_stopping=early_stopping,
                                patience=patience))
        model_best = tf.keras.models.load_model(fold_path)
        metrics = model_best.evaluate(x=test_set.x, y=test_set.y)
        returns["histories"].append(history)
        returns["models"].append(model_best)
        returns["metrics"].append(metrics)
    return returns
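# Hypothetical usage sketch for train above. The Data type is not shown in the snippet; here a
# SimpleNamespace stand-in provides the assumed interface: data.folds is an iterable of
# (train_val_set, test_set) pairs, each with .x and .y arrays. model and _create_callbacks are
# assumed to exist as in the function's own codebase.
import numpy as np
from types import SimpleNamespace

x = np.random.rand(120, 8).astype("float32")
y = np.random.rand(120, 1).astype("float32")
fold = (SimpleNamespace(x=x[:90], y=y[:90]), SimpleNamespace(x=x[90:], y=y[90:]))
data = SimpleNamespace(folds=[fold])

results = train(model, data, path="checkpoints", batch_size=16, patience=5)
print(results["metrics"])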
def model_compile_fit(
    hparams: Dict,
    model: tf.keras.models.Model,
    dataset: DatasetDF,
    epochs=999,
    output_shape: Union[None, int, Dict] = None,
    model_file: AnyStr = None,
    log_dir: AnyStr = None,
    best_only=True,
    verbose=settings['verbose']['fit'],
):
    timer_start = time.time()

    hparams = {**settings['hparam_defaults'], **hparams}
    model = model_compile(hparams, model, output_shape)
    callback = callbacks(hparams, dataset, model_file, log_dir, best_only, verbose)

    history = model.fit(
        dataset.X["train"], dataset.Y["train"],
        batch_size=hparams.get("batch_size", 128),
        epochs=epochs,
        verbose=verbose,
        validation_data=(dataset.X["valid"], dataset.Y["valid"]),
        callbacks=callback
    )
    timer_seconds = int(time.time() - timer_start)

    model_stats = model_stats_from_history(history, timer_seconds, best_only)
    return model_stats
def train_model(model: tf.keras.models.Model, data_train, batch_size, num_epochs):
    X_train, Y_train = data_train
    callbacks = [
        tf.keras.callbacks.ReduceLROnPlateau('loss', factor=0.95, patience=3,
                                             min_lr=1e-6, verbose=1),
    ]
    model.fit(
        X_train, Y_train,
        batch_size=batch_size,
        epochs=num_epochs,
        callbacks=callbacks,
        verbose=1,
    )
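# Minimal end-to-end sketch for the train_model variant above, using synthetic data; the
# two-layer regression model and its hyperparameters are assumptions for illustration only.
import numpy as np
import tensorflow as tf

X_train = np.random.rand(256, 20).astype("float32")
Y_train = np.random.rand(256, 1).astype("float32")

model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation="relu", input_shape=(20,)),
    tf.keras.layers.Dense(1),
])
model.compile(optimizer="adam", loss="mse")

train_model(model, (X_train, Y_train), batch_size=32, num_epochs=5)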
def model_compile_fit(
    hparams: Dict,
    model: tf.keras.models.Model,
    dataset: DataSet,
    log_dir: AnyStr = None,
    verbose=False,
):
    optimiser = getattr(tf.keras.optimizers, hparams['optimizer'])
    schedule = scheduler(hparams, dataset, verbose=verbose)
    callbacks = [
        EarlyStopping(monitor='val_loss', mode='min', verbose=1,
                      patience=hparams.get('patience', 10)),
        schedule,
    ]
    if log_dir:
        callbacks += [
            tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1),  # log metrics
            hp.KerasCallback(log_dir, hparams)                                  # log hparams
        ]

    timer_start = time.time()
    model.compile(
        loss=tf.keras.losses.categorical_crossentropy,
        optimizer=optimiser(learning_rate=hparams.get('learning_rate', 0.0001)),
        metrics=['accuracy']
    )
    history = model.fit(
        dataset.data['train_X'], dataset.data['train_Y'],
        batch_size=hparams.get("batch_size", 128),
        epochs=999,
        verbose=verbose,
        validation_data=(dataset.data["valid_X"], dataset.data["valid_Y"]),
        callbacks=callbacks
    )
    timer_seconds = int(time.time() - timer_start)

    model_stats = {key: value[-1] for key, value in history.history.items()}
    model_stats['time'] = timer_seconds
    model_stats['epochs'] = len(history.history['loss'])
    return model_stats
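# Example hparams dict for the model_compile_fit variant above. Only 'optimizer' is strictly
# required by the code shown; the other keys and their values are assumptions (whatever
# scheduler() expects is not visible in the snippet), and model/dataset are assumed to exist.
hparams = {
    'optimizer': 'Adam',   # attribute name looked up on tf.keras.optimizers
    'learning_rate': 0.001,
    'batch_size': 128,
    'patience': 10,
}
stats = model_compile_fit(hparams, model, dataset, log_dir='./logs/run1', verbose=1)
print(stats['val_accuracy'], stats['epochs'], stats['time'])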
def train_model(model: tf.keras.models.Model, train_dataset, valid_dataset):
    steps = 10 if DEVMODE else (train_dataset.n // train_dataset.batch_size)
    valid_steps = 5 if DEVMODE else (valid_dataset.n // valid_dataset.batch_size)

    early_stopping_cb = EarlyStopping(monitor="val_loss", patience=3,
                                      restore_best_weights=True, verbose=1)
    checkpoint_cb = ModelCheckpoint(os.path.join(WORK_DIR, "cassava_best.h5"),
                                    monitor="val_loss", save_best_only=True)
    reduce_lr_on_plateau_cb = ReduceLROnPlateau(monitor="val_loss", factor=0.2,
                                                patience=2, min_lr=1e-6, verbose=1)
    time_stopping_cb = TimeStopping(seconds=int(11.5 * 60 * 60), verbose=1)

    results = model.fit(
        train_dataset,
        epochs=EPOCHS,
        steps_per_epoch=steps,
        validation_data=valid_dataset,
        validation_steps=valid_steps,
        shuffle=False,  # Datasets are already shuffled
        callbacks=[
            early_stopping_cb, checkpoint_cb, reduce_lr_on_plateau_cb, time_stopping_cb
        ],
    )

    print(f"Train accuracy: {results.history['accuracy']}")
    print(f"Validation accuracy: {results.history['val_accuracy']}")

    model.save(os.path.join(WORK_DIR, "cassava.h5"))
    return results
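# Hypothetical sketch of how the datasets for the cassava train_model above might be built.
# The function only relies on .n and .batch_size, which Keras ImageDataGenerator iterators
# provide; the directory layout, image size, and the module-level constants (WORK_DIR, EPOCHS,
# DEVMODE) are assumptions, as is the compiled model.
import tensorflow as tf

datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1.0 / 255,
                                                          validation_split=0.2)
train_dataset = datagen.flow_from_directory("train_images/", target_size=(512, 512),
                                            batch_size=16, subset="training")
valid_dataset = datagen.flow_from_directory("train_images/", target_size=(512, 512),
                                            batch_size=16, subset="validation")

results = train_model(model, train_dataset, valid_dataset)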
def fit(
    model: tf.keras.models.Model,
    train_iterator: tk.data.Iterator,
    val_iterator: tk.data.Iterator = None,
    val_freq: int | typing.Sequence[int] | str | None = "auto",
    class_weight: dict[int, float] = None,
    epochs: int = 1800,
    callbacks: list[tf.keras.callbacks.Callback] = None,
    verbose: int = 1,
    initial_epoch: int = 0,
):
    """Training.

    Args:
        model: Model.
        train_iterator: Training data.
        val_iterator: Validation data. None to skip validation.
        val_freq: Interval (in epochs) between validations, or a list of epochs. 0 disables validation (custom behavior). "auto" picks a reasonable value (custom behavior).
        class_weight: Dict of per-class weights.
        epochs: Number of epochs.
        callbacks: Callbacks. EpochLogger, ErrorOnNaN and the horovod-related ones are added automatically.
        verbose: 1 shows a progress bar, 2 shows only per-epoch results.
        initial_epoch: Epoch to start training from, minus 1.

    """
    # With Horovod each worker runs its own validation, so the validation data must be shuffled.
    # When shuffling, a single pass does not cover the whole set, so run it 3 times over.
    # Mimics the horovod examples:
    # <https://github.com/horovod/horovod/blob/9bdd70d/examples/keras_mnist_advanced.py#L112,L115>
    use_horovod = tk.hvd.is_active()

    if val_freq == 0 or val_iterator is None:
        # val_freq == 0 disables validation (custom behavior)
        val_freq = None
        val_iterator = None
    elif val_freq == "auto":
        # "auto" picks a reasonable value (custom behavior)
        val_freq = make_val_freq(
            val_freq,
            epochs,
            len(train_iterator.dataset),
            len(val_iterator.dataset) * (3 if use_horovod else 1),
        )

    train_ds, train_steps = train_iterator.data_loader.get_ds(
        train_iterator.dataset, shuffle=True)
    val_ds, val_steps = (
        val_iterator.data_loader.get_ds(val_iterator.dataset, shuffle=use_horovod)
        if val_iterator is not None
        else (None, 0)
    )
    logger.info(f"fit(train): {train_ds.element_spec} {train_steps=}")
    if val_ds is not None:
        logger.info(f"fit(val): {val_ds.element_spec} {val_steps=}")

    callbacks = make_callbacks(callbacks, training=True)

    fit_kwargs = {}
    if val_freq is not None:
        fit_kwargs["validation_freq"] = val_freq

    with tk.log.trace("fit"):
        model.fit(
            train_ds,
            steps_per_epoch=train_steps // tk.hvd.size(),
            validation_data=val_ds,
            validation_steps=(val_steps * 3 // tk.hvd.size()
                              if use_horovod else val_steps)
            if val_iterator is not None
            else None,
            class_weight=class_weight,
            epochs=epochs,
            callbacks=callbacks,
            verbose=verbose if tk.hvd.is_master() else 0,
            initial_epoch=initial_epoch,
            **fit_kwargs,
        )
def model_compile_fit(
    hparams: Dict,
    model: tf.keras.models.Model,
    dataset: DatasetDF,
    model_file: AnyStr = None,
    log_dir: AnyStr = None,
    best_only=True,
    verbose=settings['verbose']['fit'],
):
    hparams = {**settings['hparam_defaults'], **hparams}

    optimiser = getattr(tf.keras.optimizers, hparams['optimizer'])
    schedule = scheduler(hparams, dataset, verbose=verbose)

    callbacks = [
        EarlyStopping(monitor='val_loss', mode='min', verbose=verbose,
                      patience=hparams['patience'],
                      restore_best_weights=best_only),
        schedule,
        KaggleTimeoutCallback(hparams["timeout"], verbose=False),
        # ProgbarLogger(count_mode='samples', stateful_metrics=None)
    ]
    if model_file:
        callbacks += [
            ModelCheckpoint(
                model_file,
                monitor='val_loss',
                verbose=False,
                save_best_only=best_only,
                save_weights_only=False,
                mode='auto',
            )
        ]
    if log_dir and settings['verbose']['tensorboard']:
        callbacks += [
            tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1),  # log metrics
            KerasCallback(log_dir, hparams)                                     # log train_hparams
        ]

    timer_start = time.time()
    model.compile(loss=tf.keras.losses.categorical_crossentropy,
                  optimizer=optimiser(learning_rate=hparams.get('learning_rate', 0.0001)),
                  metrics=['accuracy'])
    history = model.fit(dataset.X["train"], dataset.Y["train"],
                        batch_size=hparams.get("batch_size", 128),
                        epochs=999,
                        verbose=verbose,
                        validation_data=(dataset.X["valid"], dataset.Y["valid"]),
                        callbacks=callbacks)
    timer_seconds = int(time.time() - timer_start)

    if 'val_loss' in history.history:
        best_epoch = (history.history['val_loss'].index(min(history.history['val_loss']))
                      if best_only else -1)
        model_stats = {key: value[best_epoch] for key, value in history.history.items()}
        model_stats['time'] = timer_seconds
        model_stats['epochs'] = len(history.history['loss'])
    else:
        model_stats = None
    return model_stats