def execute(self):
    result_file = os.path.join(self.result_dir,
                               "train_result_{}.txt".format(self.task_index))
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(self.server.target, config=config) as sess:
        K.set_session(sess)
        if self.go_on:
            self.restore_model()
        tb_callback = TensorBoard(log_dir=self.log_dir,
                                  write_grads=True,
                                  write_images=True)
        ckpt_callback = ModelCheckpoint(self.checkpoint_path,
                                        monitor='loss',
                                        save_weights_only=True)
        reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.1,
                                      patience=3, verbose=1)
        early_stopping = EarlyStopping(monitor='loss', min_delta=0,
                                       patience=10, verbose=1)
        # save model checkpoints and TensorBoard events on worker:0 only
        callbacks = [tb_callback, ckpt_callback] if self.task_index == 0 else []
        callbacks += [reduce_lr, early_stopping]
        # train on batches streamed from the Spark RDD feed; workers=0 keeps
        # the generator on the main thread so it can talk to the data feed
        his = self.model.fit_generator(self.generate_rdd_data(),
                                       steps_per_epoch=self.steps_per_epoch,
                                       # validation_data=self.val_generate_data(val_data),
                                       # validation_steps=max(1, self.val_num // self.batch_size),
                                       epochs=self.epochs + self.initial_epoch,
                                       initial_epoch=self.initial_epoch,
                                       workers=0,
                                       callbacks=callbacks)
        logger.debug("{}-{}".format(self.task_index, his.history))
        ModelDir.write_result(result_file, self.get_results(his), self.go_on)
        self.save_model()
        self.tf_feed.terminate()
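# A minimal sketch of what generate_rdd_data() could look like, assuming
# self.tf_feed is a TensorFlowOnSpark TFNode.DataFeed (next_batch() and
# should_stop() are that class's methods) and that each RDD record is a
# (features, label) pair; the actual batch layout in this codebase may differ.
def generate_rdd_data_sketch(self):
    while not self.tf_feed.should_stop():
        # pull up to batch_size records from the Spark executors
        batch = self.tf_feed.next_batch(self.batch_size)
        if len(batch) == 0:
            return
        # split records into input and target arrays for Keras
        xs = np.array([item[0] for item in batch])
        ys = np.array([item[1] for item in batch])
        yield xs, ys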
def execute(self):
    result_file = os.path.join(self.result_dir,
                               "train_result_{}.txt".format(self.task_index))
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(self.server.target, config=config) as sess:
        K.set_session(sess)
        if self.go_on:
            self.restore_model()
        tb_callback = TensorBoard(log_dir=self.log_dir,
                                  write_grads=True,
                                  write_images=True)
        ckpt_callback = ModelCheckpoint(self.checkpoint_path,
                                        monitor='loss',
                                        save_weights_only=True)
        # save model checkpoints and TensorBoard events on worker:0 only
        callbacks = [tb_callback, ckpt_callback] if self.task_index == 0 else None
        # train on data read from a generator producing batches from a Spark RDD
        his = self.model.fit_generator(generator=self.generate_rdd_data(),
                                       steps_per_epoch=self.steps_per_epoch,
                                       epochs=self.epochs + self.initial_epoch,
                                       callbacks=callbacks,
                                       workers=0,
                                       initial_epoch=self.initial_epoch)
        self.save_model()
        ModelDir.write_result(result_file, self.get_results(his), self.go_on)
        self.tf_feed.terminate()
def execute(self):
    result_file = os.path.join(self.result_dir,
                               "train_result_{}.txt".format(self.task_index))
    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.summary.merge_all()
    # Start running operations on the Graph.
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=self.gpu_memory_fraction)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                          log_device_placement=False)) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        summary_writer = tf.summary.FileWriter(self.log_dir, sess.graph)
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(coord=coord, sess=sess)
        steps = 0
        # a plain tf.Session has no should_stop(); poll the Coordinator instead
        while not coord.should_stop() and not self.tf_feed.should_stop():
            if self.go_on:
                self.restore_model(sess)
            # Training and validation loop
            print('Running training')
            image_list, label_list = self.get_data()
            # Enqueue one epoch of image paths and labels
            image_paths_array = np.expand_dims(np.array(image_list), 1)
            labels_array = np.expand_dims(np.array(label_list), 1)
            # compose the per-image preprocessing bitmask from the enabled flags
            control_value = facenet.RANDOM_ROTATE * self.random_rotate + \
                facenet.RANDOM_CROP * self.random_crop + \
                facenet.RANDOM_FLIP * self.random_flip + \
                facenet.FIXED_STANDARDIZATION * self.use_fixed_image_standardization
            control_array = np.ones_like(labels_array) * control_value
            enqueue_op = tf.get_collection(OUTPUTS)[0]
            feed_dict = dict(zip(tf.get_collection(INPUTS),
                                 [image_paths_array, labels_array, control_array]))
            sess.run(enqueue_op, feed_dict)
            self.model.add_params(batch_size=self.batch_size,
                                  steps_per_epoch=self.steps_per_epoch,
                                  phase_train=True,
                                  n_classes=self.n_classes)
            keys = ["_task_index", "_epoch"]
            for epoch in range(1, self.epochs + 1):
                for _ in range(self.steps_per_epoch - 1):
                    sess.run(self.model.fetches, feed_dict=self.model.feed_dict)
                # run the last step of the epoch together with the summary op
                res = sess.run(self.model.fetches + [summary_op],
                               feed_dict=self.model.feed_dict)
                steps = sess.run(self.global_step)
                summary_writer.add_summary(res[-1], global_step=steps)
                results = [dict(zip(keys, res))]
                ModelDir.write_result(result_file, results, True)
            summary = tf.Summary()
            summary_writer.add_summary(summary, global_step=steps)
        self.tf_feed.terminate()
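# control_value above is a per-image preprocessing bitmask. A minimal
# illustration, assuming facenet's usual power-of-two flag values (verify
# against the facenet module actually imported here):
RANDOM_ROTATE = 1
RANDOM_CROP = 2
RANDOM_FLIP = 4
FIXED_STANDARDIZATION = 8

def control_bitmask(random_rotate, random_crop, random_flip, fixed_std):
    # each enabled option (0 or 1) sets one bit; the facenet input
    # pipeline decodes the bits to pick per-image augmentations
    return (RANDOM_ROTATE * random_rotate
            + RANDOM_CROP * random_crop
            + RANDOM_FLIP * random_flip
            + FIXED_STANDARDIZATION * fixed_std)

# e.g. flips plus fixed standardization: control_bitmask(0, 0, 1, 1) == 12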
def train(self, save_dir, result_dir, checkpoint_dir, log_dir):
    result_file = os.path.join(result_dir, "train_result.txt")
    train_set = self.train_set
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.compat.v1.Session(config=config) as sess:
        if self.go_on:
            self.restore_model(checkpoint_dir)
        tb_callback = TensorBoard(log_dir=log_dir, write_images=True)
        checkpoint_file = os.path.join(checkpoint_dir,
                                       self.name + '_checkpoint_{epoch}')
        ckpt_callback = ModelCheckpoint(checkpoint_file,
                                        save_weights_only=True)
        reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.1,
                                      patience=3, verbose=1)
        early_stopping = EarlyStopping(monitor='loss', min_delta=0,
                                       patience=10, verbose=1)
        # save model checkpoints and TensorBoard events
        callbacks = [tb_callback, ckpt_callback]
        # the YOLO loss is computed by a custom layer, so y_pred already is
        # the loss tensor and the Keras loss simply passes it through
        self.model.compile(optimizer=Adam(lr=1e-4),
                           loss={'yolo_loss': lambda y_true, y_pred: y_pred})
        callbacks.extend([reduce_lr, early_stopping])
        steps_per_epoch = len(train_set) // self.batch_size
        # note that more GPU memory is required after unfreezing the body
        # train a single additional epoch on top of initial_epoch
        his = self.model.fit_generator(
            self.train_generate_data(train_set),
            steps_per_epoch=steps_per_epoch,
            # validation_data=self.val_generate_data(val_data),
            # validation_steps=max(1, self.val_num // self.batch_size),
            epochs=self.initial_epoch + 1,
            initial_epoch=self.initial_epoch,
            workers=1,
            callbacks=callbacks)
        logger.debug(str(his.history))
        save_model_path = os.path.join(save_dir, 'model.h5')
        self.model.save(save_model_path)
        ModelDir.write_result(result_file, self.get_results(his))
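# get_results() is defined elsewhere in this codebase; a plausible sketch of
# what it might do, flattening a Keras History into one dict per epoch for
# ModelDir.write_result (the name and layout here are hypothetical):
def get_results_sketch(history):
    # history.history maps metric name -> list of per-epoch values
    n_epochs = len(next(iter(history.history.values()), []))
    return [
        {name: values[i] for name, values in history.history.items()}
        for i in range(n_epochs)
    ]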
def execute(self):
    result_file = os.path.join(self.result_dir,
                               "predict_result_{}.txt".format(self.task_index))
    with tf.Session(self.server.target) as sess:
        K.set_session(sess)
        self.load_model()
        # run prediction over batches streamed from the Spark RDD feed
        his = self.model.predict_generator(self.generate_rdd_data(),
                                           steps=self.steps_per_epoch)
        ModelDir.write_result(result_file, self.get_results(his))
        self.tf_feed.terminate()
        self.delete_tmp_dir()
def execute(self):
    result_file = os.path.join(self.result_dir,
                               "predict_result_{}.txt".format(self.task_index))
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(self.server.target, config=config) as sess:
        self.load_model(sess)
        for _ in range(self.steps_per_epoch):
            # fetch the next batch from the RDD feed; stop on an empty batch
            x, y = self.generate_rdd_data
            if len(x) == 0:
                break
            predictions = sess.run(self.model.outputs['y'],
                                   self.feed_dict(x=x))
            # decode one-hot outputs (and labels, when present) to class ids
            y_pred = np.argmax(predictions, 1)
            y_true = np.argmax(y, 1) if y is not None else None
            logger.debug(predictions)
            results = self.get_results(y_pred, y_true)
            ModelDir.write_result(result_file, results, True)
        self.tf_feed.terminate()
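# self.feed_dict() is defined elsewhere; a hypothetical sketch of how it might
# map batch arrays onto the model's placeholders, mirroring the
# model.outputs['y'] convention used above (names are assumptions, not the
# actual helper in this codebase):
def feed_dict_sketch(model, x=None, y=None):
    feed = {}
    if x is not None:
        feed[model.inputs['x']] = x   # input placeholder
    if y is not None:
        feed[model.inputs['y']] = y   # label placeholder, training only
    return feed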
def execute(self):
    result_file = os.path.join(self.result_dir,
                               "train_result_{}.txt".format(self.task_index))
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    summary_op = tf.summary.merge_all()
    with tf.Session(self.server.target, config=config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        self.summary_writer = tf.summary.FileWriter(self.log_dir, sess.graph)
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(coord=coord, sess=sess)
        if self.go_on:
            self.restore_checkpoint(sess)
        # split the fetches dict so results can be re-matched to their names
        names, values = zip(*self.model.fetches.items())
        names = list(names)
        values = list(values)
        res, summary_str = None, None
        for epoch in range(1, self.epochs + 1):
            for _ in range(self.steps_per_epoch):
                x, y = self.generate_rdd_data
                if len(x) == 0:
                    break
                if summary_op is not None:
                    *res, summary_str = sess.run(values + [summary_op],
                                                 self.feed_dict(x=x, y=y))
                else:
                    res = sess.run(values, self.feed_dict(x=x, y=y))
            result = dict((k, v) for k, v in zip(names, res) if v is not None)
            result.update(self.common_dict(epoch + self.initial_epoch))
            ModelDir.write_result(result_file, [result], True)
            self.save_checkpoint(sess, epoch + self.initial_epoch, summary_str)
        self.model.write_model(self.model_config_path, False)
        self.save_model(sess)
        self.tf_feed.terminate()
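# The fetches dict above is assumed to map metric names to graph ops, e.g.
# {'loss': loss_op, 'accuracy': acc_op, '_train': train_op}. Splitting it with
# zip(*items()) keeps names and ops index-aligned, so the sess.run results can
# be zipped back to their names. A self-contained illustration with plain
# values standing in for the ops and their run results:
fetches = {'loss': 0.42, 'accuracy': 0.91, '_train': None}
names, values = zip(*fetches.items())
# after sess.run(values), pair results with names and drop None entries
result = {k: v for k, v in zip(names, values) if v is not None}
assert result == {'loss': 0.42, 'accuracy': 0.91}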