def create_model(params, data_idx=None):
    """Build the model and run one forward pass so its weights are created.

    Args:
        params: dict with at least "data_folder_path", "model_name" and
            "data_set_name" keys, plus whatever ``discrete_10000_ec_1``
            expects.
        data_idx: indices of the data samples to load for the warm-up
            forward pass. Defaults to ``[0]`` (first sample only).

    Returns:
        The constructed model, already called once on a sample input.
    """
    # Fix: the original used a mutable default argument (data_idx=[0]);
    # None-sentinel preserves the same default behavior safely.
    if data_idx is None:
        data_idx = [0]

    data_folder_path = params["data_folder_path"]
    # NOTE(review): model_name is read but never used below — kept for
    # parity with the original; confirm whether it can be dropped.
    model_name = params["model_name"]
    data_set_name = params["data_set_name"]

    # Make the project root importable so `iodata` resolves regardless of
    # the current working directory.
    import os
    import sys
    sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    import iodata

    p, E, _ = iodata.load_data_set(data_folder_path, data_set_name, data_idx)
    # Renamed from `input`, which shadowed the builtin.
    sample = (p, E)

    model = discrete_10000_ec_1(params)
    # Calling the model once builds its weights (Keras lazy-build pattern);
    # the output itself is discarded.
    _ = model(sample)
    return model
def train(model, params):
    """Resume (or start) training `model`, logging to an mlflow server.

    Bootstraps a checkpoint if none exists, restores the latest one
    otherwise, builds train/validation datasets (either fully in memory or
    streamed through NVEGenerator), compiles the model, and continues
    `model.fit` from the epoch encoded in the latest checkpoint filename.

    Args:
        model: a compiled-or-compilable tf.keras model.
        params: dict of configuration — server_uri, model_name,
            every_n_iter, model_folder_path, ckpt_filename, ds_folder,
            ds_name, full_load, train_idx/val_idx, batch sizes, shuffle,
            optimizer, loss, save_freq, save_best_only, epochs, verbose,
            and optionally lr_schedule.

    Returns:
        (model, history) — the trained model and the Keras History object.
    """
    # mlflow logging
    # mlflow server --backend-store-uri mlruns/ --default-artifact-root mlruns/ --host 0.0.0.0
    remote_server_uri = params["server_uri"]  # set to your server URI
    mlflow.set_tracking_uri(
        remote_server_uri)  # or set the MLFLOW_TRACKING_URI in the env
    exp_name = params["model_name"]
    mlflow.set_experiment(exp_name)
    # auto logging
    mlflow.tensorflow.autolog(every_n_iter=params['every_n_iter'])
    print("# mlflow initialized")

    # Load model weights.
    # NOTE(review): ckpt_filename is assumed to be a format string
    # containing "cp-{epoch:04d}" (the epoch parsing below relies on a
    # 4-digit epoch right after 'cp-') and a {val_loss} field — confirm.
    ckpt_path = os.path.join(params["model_folder_path"], params["model_name"],
                             params["ckpt_filename"])
    ckpt_dir = os.path.dirname(ckpt_path)
    latest = tf.train.latest_checkpoint(ckpt_dir)
    if not latest:
        # No checkpoint yet: save an epoch-0 checkpoint so `latest` is
        # guaranteed non-None for the epoch parsing below.
        model.save_weights(ckpt_path.format(epoch=0, val_loss=0))
        latest = tf.train.latest_checkpoint(ckpt_dir)
    else:
        model.load_weights(latest)
        print("# Model loaded from: {}".format(latest))

    # Initializing train and test data set generator
    train_ds_path = os.path.join(params['ds_folder'], params['ds_name'], "train")
    val_ds_path = os.path.join(params['ds_folder'], params['ds_name'], "validation")
    # deciding a full load or use NVEGenerator API to load
    # data on the fly in pieces (to avoid memory overflow)
    if params["full_load"]:
        train_ds_prefix = os.path.join(train_ds_path, params['ds_name'])
        train_ds = tf.data.Dataset.from_tensor_slices(
            load_data_set(train_ds_prefix, params['train_idx'], shuffle=False))
        train_ds = train_ds.batch(params['train_batch_size'])
        val_ds_prefix = os.path.join(val_ds_path, params['ds_name'])
        val_ds = tf.data.Dataset.from_tensor_slices(
            load_data_set(val_ds_prefix, params['val_idx'], shuffle=False))
        val_ds = val_ds.batch(params['val_batch_size'])
    else:
        train_ds = NVEGenerator(train_ds_path, params['ds_name'],
                                params['train_idx'],
                                meta_batch_size=params['train_batch_size'],
                                shuffle=params['shuffle'])
        val_ds = NVEGenerator(val_ds_path, params['ds_name'],
                              params['val_idx'],
                              meta_batch_size=params['val_batch_size'],
                              shuffle=params['shuffle'])
    print("# Training and validation generator initialized")

    # Compiling model
    model.compile(
        optimizer=params['optimizer'],
        loss=params['loss'],
        #metrics=params['metrics']
    )
    print("# Model compiled")

    # callbacks
    # check point
    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        ckpt_path,
        verbose=1,
        save_weights_only=True,
        mode='min',
        save_freq=params[
            'save_freq'],  # saves weights every save_freq times a ds generator is called
        monitor='val_loss',
        save_best_only=params['save_best_only'])
    # lr schedule
    callbacks_list = [cp_callback]
    if 'lr_schedule' in params:
        callbacks_list.append(
            tf.keras.callbacks.LearningRateScheduler(params['lr_schedule'],
                                                     verbose=0))
    print("# Checkpoint callback initialized")

    # latest epoch: parse the 4-digit epoch number that follows 'cp-' in
    # the latest checkpoint filename, and resume from the next epoch.
    latest_epoch = latest.find('cp-')
    latest_epoch = int(latest[latest_epoch + 3:latest_epoch + 7]) + 1
    print("# continue training from epoch", latest_epoch)

    # Training
    # Note:
    # do not specify batch_size if data is generator or dataset
    # shuffle is ignored if data set is a generator. shuffle by epoch end.
    history = model.fit(
        x=train_ds,
        epochs=params['epochs'],
        verbose=params["verbose"],
        callbacks=callbacks_list,
        validation_data=val_ds,
        shuffle=True,
        initial_epoch=latest_epoch,
    )
    return model, history
def __getitem__(self, idx):
    """Load and return the idx-th meta-batch.

    Slices `meta_batch_size` sample indices out of `self.data_idx` and
    delegates the actual loading to `load_data_set`.
    """
    start = idx * self.meta_batch_size
    stop = start + self.meta_batch_size
    batch_indices = self.data_idx[start:stop]
    return load_data_set(self.file_name_prefix, batch_indices,
                         shuffle=self.shuffle)