def on_epoch_begin(self, epoch, logs=None):
    """Evaluate the most recent checkpoint every ``eval_frequency`` epochs.

    Nothing happens on epoch 0 or on epochs that are not a multiple of
    ``self.eval_frequency``. Otherwise the last checkpoint file (in sorted
    filename order) is loaded, recompiled with ``self.learning_rate`` and
    evaluated on ``self.eval_files``; the metrics are printed. When
    ``self.job_dir`` is a ``gs://`` path the evaluated checkpoint is then
    copied there.

    Args:
        epoch: Index of the epoch that is about to start.
        logs: Unused; accepted for callback-signature compatibility.
    """
    if epoch <= 0 or epoch % self.eval_frequency != 0:
        return

    is_gcs = self.job_dir.startswith("gs://")

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over
    # to GCS. For gs:// jobs the pattern is therefore left relative.
    model_path_glob = 'checkpoint.*'
    if not is_gcs:
        model_path_glob = os.path.join(self.job_dir, model_path_glob)

    # NOTE(review): "latest" is decided by lexicographic filename order;
    # this assumes checkpoint names sort in epoch order -- confirm against
    # the checkpoint filename pattern.
    checkpoints = sorted(glob.glob(model_path_glob))
    if not checkpoints:
        print('\nEvaluation epoch[{}] (no checkpoints found)'.format(epoch))
        return

    latest_checkpoint = checkpoints[-1]
    census_model = load_model(latest_checkpoint)
    census_model = model.compile_model(census_model, self.learning_rate)
    loss, acc = census_model.evaluate_generator(
        model.generator_input(self.eval_files, chunk_size=CHUNK_SIZE),
        steps=self.steps)
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('\nEvaluation epoch[{}] metrics[{:.2f}, {:.2f}] {}'.format(
        epoch, loss, acc, census_model.metrics_names))

    if is_gcs:
        copy_file_to_gcs(self.job_dir, latest_checkpoint)
def dispatch(train_files,
             eval_files,
             job_dir,
             train_steps,
             eval_steps,
             train_batch_size,
             eval_batch_size,
             learning_rate,
             eval_frequency,
             first_layer_size,
             num_layers,
             scale_factor,
             eval_num_epochs,
             num_epochs,
             checkpoint_epochs):
    """Build, train, save and export the census model.

    Training runs with three callbacks: periodic model checkpoints,
    continuous evaluation, and TensorBoard logging. Afterwards the model is
    saved as ``CENSUS_MODEL`` and exported under ``<job_dir>/export``.

    Args:
        train_files: Training input passed to ``model.generator_input``.
        eval_files: Evaluation input handed to the ``ContinuousEval``
            callback.
        job_dir: Output location; a ``gs://`` prefix switches checkpoint and
            model saving to "write locally, then copy to GCS".
        train_steps: Steps per training epoch; also passed as ``steps`` to
            the evaluation callback.
        learning_rate: Passed to the evaluation callback.
        eval_frequency: Passed to the evaluation callback.
        num_epochs: Number of training epochs.
        checkpoint_epochs: ``period`` of the checkpoint callback.
        eval_steps, train_batch_size, eval_batch_size, first_layer_size,
        num_layers, scale_factor, eval_num_epochs: Accepted but not used by
            this function.
    """
    census_model = model.model_fn(INPUT_SIZE, CLASS_SIZE)

    # Best effort: the directory may already exist or may not be creatable
    # as a local path. Only OSError is ignored so that unrelated failures
    # (e.g. KeyboardInterrupt) are no longer silently swallowed.
    try:
        os.makedirs(job_dir)
    except OSError:
        pass

    is_gcs = job_dir.startswith("gs://")

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over
    # to GCS.
    checkpoint_path = FILE_PATH
    if not is_gcs:
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    # Model checkpoint callback. val_loss is a quantity to minimise, so the
    # matching mode is 'min' (it was 'max').
    checkpoint = keras.callbacks.ModelCheckpoint(
        checkpoint_path,
        monitor='val_loss',
        verbose=1,
        period=checkpoint_epochs,
        mode='min')

    # Continuous eval callback.
    # NOTE(review): evaluation runs for train_steps while eval_steps is
    # never used -- confirm whether eval_steps was intended here.
    evaluation = ContinuousEval(eval_frequency,
                                eval_files,
                                learning_rate,
                                job_dir,
                                steps=train_steps)

    # Tensorboard logs callback
    tblog = keras.callbacks.TensorBoard(
        log_dir=os.path.join(job_dir, 'logs'),
        histogram_freq=0,
        write_graph=True,
        embeddings_freq=0)

    callbacks = [checkpoint, evaluation, tblog]

    census_model.fit_generator(
        model.generator_input(train_files, chunk_size=CHUNK_SIZE),
        steps_per_epoch=train_steps,
        epochs=num_epochs,
        callbacks=callbacks)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over
    # to GCS.
    if is_gcs:
        census_model.save(CENSUS_MODEL)
        copy_file_to_gcs(job_dir, CENSUS_MODEL)
    else:
        census_model.save(os.path.join(job_dir, CENSUS_MODEL))

    # Convert the Keras model to TensorFlow SavedModel
    model.to_savedmodel(census_model, os.path.join(job_dir, 'export'))
def on_epoch_begin(self, epoch, logs=None):
    """Evaluate the most recent checkpoint every ``eval_frequency`` epochs.

    Nothing happens on epoch 0 or on epochs that are not a multiple of
    ``self.eval_frequency``. Otherwise the last ``checkpoint.*`` file under
    ``self.job_dir`` (in sorted filename order) is loaded, recompiled with
    ``self.learning_rate`` and evaluated on ``self.eval_files``; the metrics
    are printed. If no checkpoint file is found, a message is printed and
    evaluation is skipped instead of failing with ``IndexError``.

    Args:
        epoch: Index of the epoch that is about to start.
        logs: Unused; accepted for callback-signature compatibility.
    """
    if epoch <= 0 or epoch % self.eval_frequency != 0:
        return

    # NOTE(review): "latest" is decided by lexicographic filename order;
    # this assumes checkpoint names sort in epoch order -- confirm against
    # the checkpoint filename pattern.
    checkpoints = sorted(glob.glob(os.path.join(self.job_dir, 'checkpoint.*')))
    if not checkpoints:
        # Nothing has been written yet (or checkpointing is disabled for
        # this job_dir); skip rather than crash the training run.
        print('\nEvaluation epoch[{}] (no checkpoints found)'.format(epoch))
        return

    census_model = load_model(checkpoints[-1])
    census_model = model.compile_model(census_model, self.learning_rate)
    loss, acc = census_model.evaluate_generator(
        model.generator_input(self.eval_files, chunk_size=CHUNK_SIZE),
        steps=self.steps)
    print('\nEvaluation epoch[{}] metrics[{:.2f}, {:.2f}] {}'.format(
        epoch, loss, acc, census_model.metrics_names))
def dispatch(train_files,
             eval_files,
             job_dir,
             train_steps,
             eval_steps,
             train_batch_size,
             eval_batch_size,
             learning_rate,
             eval_frequency,
             first_layer_size,
             num_layers,
             scale_factor,
             eval_num_epochs,
             num_epochs,
             checkpoint_epochs):
    """Build and train the census model, then save it under ``job_dir``.

    Training runs with continuous-evaluation and TensorBoard callbacks; a
    model-checkpoint callback is added only when ``job_dir`` is not a
    ``gs://`` path. The elapsed training time is printed and the final model
    is saved as ``<job_dir>/CENSUS_MODEL``.

    Args:
        train_files: Training input passed to ``model.generator_input``.
        eval_files: Evaluation input handed to the ``ContinuousEval``
            callback.
        job_dir: Output location for checkpoints, logs and the final model.
        train_steps: Steps per training epoch.
        learning_rate: Passed to the evaluation callback.
        eval_frequency: Passed to the evaluation callback.
        num_epochs: Number of training epochs.
        checkpoint_epochs: ``period`` of the checkpoint callback.
        eval_steps, train_batch_size, eval_batch_size, first_layer_size,
        num_layers, scale_factor, eval_num_epochs: Accepted but not used by
            this function.
    """
    census_model = model.model_fn(INPUT_SIZE, CLASS_SIZE)

    # Best effort: the directory may already exist or may not be creatable
    # as a local path. Only OSError is ignored so that unrelated failures
    # (e.g. KeyboardInterrupt) are no longer silently swallowed.
    try:
        os.makedirs(job_dir)
    except OSError:
        pass

    # Model checkpoint callback. val_loss is a quantity to minimise, so the
    # matching mode is 'min' (it was 'max').
    checkpoint = keras.callbacks.ModelCheckpoint(
        os.path.join(job_dir, FILE_PATH),
        monitor='val_loss',
        verbose=1,
        period=checkpoint_epochs,
        mode='min')

    # Continuous eval callback
    evaluation = ContinuousEval(eval_frequency,
                                eval_files,
                                learning_rate,
                                job_dir)

    # Tensorboard logs callback
    tblog = keras.callbacks.TensorBoard(
        log_dir=os.path.join(job_dir, 'logs'),
        histogram_freq=0,
        write_graph=True,
        embeddings_freq=0)

    # TODO: This needs to be fixed in h5py so that writes to GCS are possible
    # Don't attempt to create checkpoints on Cloud ML Engine for now because
    # h5py doesn't come with native GCS write capability
    callbacks = [evaluation, tblog]
    if not job_dir.startswith('gs://'):
        callbacks.insert(0, checkpoint)

    start_time = time.time()
    census_model.fit_generator(
        model.generator_input(train_files, chunk_size=CHUNK_SIZE),
        steps_per_epoch=train_steps,
        epochs=num_epochs,
        callbacks=callbacks)
    # Single pre-formatted argument so the output is the same on Python 2
    # and Python 3.
    print("\nTime used. {}".format(time.time() - start_time))

    # NOTE(review): this save targets job_dir directly; for gs:// paths it
    # presumably hits the same h5py limitation described above -- confirm.
    census_model.save(os.path.join(job_dir, CENSUS_MODEL))