def main(_argv):
    model = YoloV3(FLAGS.size, training=True)
    anchors = yolo_anchors
    anchor_masks = yolo_anchor_masks

    train_dataset = dataset.load_fake_dataset()
    if FLAGS.dataset:
        train_dataset = dataset.load_tfrecord_dataset(FLAGS.dataset, FLAGS.classes)
    train_dataset = train_dataset.shuffle(buffer_size=1024)  # TODO: not 1024
    train_dataset = train_dataset.batch(FLAGS.batch_size)
    train_dataset = train_dataset.map(
        lambda x, y: (dataset.transform_images(x, FLAGS.size),
                      dataset.transform_targets(y, anchors, anchor_masks, 80)))
    train_dataset = train_dataset.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)

    optimizer = tf.keras.optimizers.Adam(lr=FLAGS.learning_rate)
    loss = [YoloLoss(anchors[mask]) for mask in anchor_masks]

    if FLAGS.mode == 'eager_tf':
        # Eager mode is great for debugging
        # Non eager graph mode is recommended for real training
        avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)

        for epoch in range(1, FLAGS.epochs + 1):
            for batch, (images, labels) in enumerate(train_dataset):
                with tf.GradientTape() as tape:
                    outputs = model(images, training=True)
                    regularization_loss = tf.reduce_sum(model.losses)
                    pred_loss = []
                    for output, label, loss_fn in zip(outputs, labels, loss):
                        pred_loss.append(loss_fn(label, output))
                    total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                grads = tape.gradient(total_loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))

                logging.info("{}_train_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_loss.update_state(total_loss)

            logging.info("{}, train: {}".format(epoch, avg_loss.result().numpy()))
            avg_loss.reset_states()
            model.save_weights('checkpoints/yolov3_train_{}.tf'.format(epoch))
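
# NOTE: Each main(_argv) variant in this file assumes the same absl-style
# scaffolding (imports, FLAGS definitions, app.run entry point) that is not
# shown here. The sketch below is an assumption modeled on the yolov3-tf2
# package layout; module paths, flag names, and defaults are illustrative,
# not confirmed by this file.
from absl import app, flags, logging
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import (
    ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, TensorBoard)

from yolov3_tf2.models import (
    YoloV3, YoloV3Tiny, YoloLoss,
    yolo_anchors, yolo_anchor_masks,
    yolo_tiny_anchors, yolo_tiny_anchor_masks)
import yolov3_tf2.dataset as dataset

FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', '', 'path to the training tfrecord')
flags.DEFINE_string('classes', './data/coco.names', 'path to the classes file')
flags.DEFINE_integer('size', 416, 'image size')
flags.DEFINE_integer('batch_size', 8, 'batch size')
flags.DEFINE_integer('epochs', 2, 'number of epochs')
flags.DEFINE_float('learning_rate', 1e-3, 'learning rate')
flags.DEFINE_enum('mode', 'fit', ['fit', 'eager_fit', 'eager_tf'],
                  'fit: model.fit, eager_fit: model.fit(run_eagerly=True), '
                  'eager_tf: custom GradientTape loop')

# Entry point shared by all variants:
if __name__ == '__main__':
    app.run(main)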
def main(_argv):
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    if len(physical_devices) > 0:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)

    if FLAGS.tiny:
        model = YoloV3Tiny(FLAGS.size, training=True, classes=FLAGS.num_classes)
        anchors = yolo_tiny_anchors
        anchor_masks = yolo_tiny_anchor_masks
    else:
        model = YoloV3(FLAGS.size, training=True, classes=FLAGS.num_classes)
        anchors = yolo_anchors
        anchor_masks = yolo_anchor_masks

    train_dataset = dataset.load_fake_dataset()
    if FLAGS.dataset:
        train_dataset = dataset.load_tfrecord_dataset(
            FLAGS.dataset, FLAGS.classes, FLAGS.size)
    train_dataset = train_dataset.shuffle(buffer_size=512)
    train_dataset = train_dataset.batch(FLAGS.batch_size)
    train_dataset = train_dataset.map(lambda x, y: (
        dataset.transform_images(x, FLAGS.size),
        dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size)))
    train_dataset = train_dataset.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)

    val_dataset = dataset.load_fake_dataset()
    if FLAGS.val_dataset:
        val_dataset = dataset.load_tfrecord_dataset(
            FLAGS.val_dataset, FLAGS.classes, FLAGS.size)
    val_dataset = val_dataset.batch(FLAGS.batch_size)
    val_dataset = val_dataset.map(lambda x, y: (
        dataset.transform_images(x, FLAGS.size),
        dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size)))

    # Configure the model for transfer learning
    if FLAGS.transfer == 'none':
        pass  # Nothing to do
    elif FLAGS.transfer in ['darknet', 'no_output']:
        # Darknet transfer is a special case that works
        # with an incompatible number of classes.
        # Reset top layers.
        if FLAGS.tiny:
            model_pretrained = YoloV3Tiny(
                FLAGS.size, training=True,
                classes=FLAGS.weights_num_classes or FLAGS.num_classes)
        else:
            model_pretrained = YoloV3(
                FLAGS.size, training=True,
                classes=FLAGS.weights_num_classes or FLAGS.num_classes)
        model_pretrained.load_weights(FLAGS.weights)

        if FLAGS.transfer == 'darknet':
            model.get_layer('yolo_darknet').set_weights(
                model_pretrained.get_layer('yolo_darknet').get_weights())
            freeze_all(model.get_layer('yolo_darknet'))
        elif FLAGS.transfer == 'no_output':
            for l in model.layers:
                if not l.name.startswith('yolo_output'):
                    l.set_weights(
                        model_pretrained.get_layer(l.name).get_weights())
                    freeze_all(l)
    else:
        # All other transfer modes require matching classes
        model.load_weights(FLAGS.weights)
        if FLAGS.transfer == 'fine_tune':
            # freeze darknet and fine tune other layers
            darknet = model.get_layer('yolo_darknet')
            freeze_all(darknet)
        elif FLAGS.transfer == 'frozen':
            # freeze everything
            freeze_all(model)

    optimizer = tf.keras.optimizers.Adam(lr=FLAGS.learning_rate)
    loss = [
        YoloLoss(anchors[mask], classes=FLAGS.num_classes)
        for mask in anchor_masks
    ]

    if FLAGS.mode == 'eager_tf':
        # Eager mode is great for debugging
        # Non eager graph mode is recommended for real training
        avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
        avg_val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)

        for epoch in range(1, FLAGS.epochs + 1):
            for batch, (images, labels) in enumerate(train_dataset):
                with tf.GradientTape() as tape:
                    outputs = model(images, training=True)
                    regularization_loss = tf.reduce_sum(model.losses)
                    pred_loss = []
                    for output, label, loss_fn in zip(outputs, labels, loss):
                        pred_loss.append(loss_fn(label, output))
                    total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                grads = tape.gradient(total_loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))

                logging.info("{}_train_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_loss.update_state(total_loss)

            for batch, (images, labels) in enumerate(val_dataset):
                outputs = model(images)
                regularization_loss = tf.reduce_sum(model.losses)
                pred_loss = []
                for output, label, loss_fn in zip(outputs, labels, loss):
                    pred_loss.append(loss_fn(label, output))
                total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                logging.info("{}_val_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_val_loss.update_state(total_loss)

            logging.info("{}, train: {}, val: {}".format(
                epoch,
                avg_loss.result().numpy(),
                avg_val_loss.result().numpy()))
            avg_loss.reset_states()
            avg_val_loss.reset_states()
            model.save_weights('checkpoints/yolov3_train_{}.tf'.format(epoch))
    else:
        model.compile(optimizer=optimizer, loss=loss,
                      run_eagerly=(FLAGS.mode == 'eager_fit'))

        callbacks = [
            ReduceLROnPlateau(verbose=1),
            EarlyStopping(patience=3, verbose=1),
            ModelCheckpoint('checkpoints/yolov3_train_{epoch}.tf',
                            verbose=1, save_weights_only=True),
            TensorBoard(log_dir='logs')
        ]

        history = model.fit(train_dataset,
                            epochs=FLAGS.epochs,
                            callbacks=callbacks,
                            validation_data=val_dataset)
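
# NOTE: The transfer-learning branches above rely on a freeze_all helper that
# is not defined in this file. A minimal sketch matching the yolov3-tf2
# utility (recursive, so nested functional models and their BatchNorm layers
# are frozen too); the real implementation may differ:
def freeze_all(model, frozen=True):
    model.trainable = not frozen
    if isinstance(model, tf.keras.Model):
        for layer in model.layers:
            freeze_all(layer, frozen)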
def main(_argv):
    physical_devices = tf.config.experimental.list_physical_devices('GPU')

    # Setup
    if FLAGS.multi_gpu:
        for physical_device in physical_devices:
            tf.config.experimental.set_memory_growth(physical_device, True)

        strategy = tf.distribute.MirroredStrategy()
        print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
        BATCH_SIZE = FLAGS.batch_size * strategy.num_replicas_in_sync
        FLAGS.batch_size = BATCH_SIZE

        with strategy.scope():
            model, optimizer, loss, anchors, anchor_masks = setup_model()
    else:
        model, optimizer, loss, anchors, anchor_masks = setup_model()

    if FLAGS.dataset:
        train_dataset = dataset.load_tfrecord_dataset(
            FLAGS.dataset, FLAGS.classes, FLAGS.size)
    else:
        train_dataset = dataset.load_fake_dataset()
    train_dataset = train_dataset.shuffle(buffer_size=512)
    train_dataset = train_dataset.batch(FLAGS.batch_size)
    train_dataset = train_dataset.map(lambda x, y: (
        dataset.transform_images(x, FLAGS.size),
        dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size)))
    train_dataset = train_dataset.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)

    if FLAGS.val_dataset:
        val_dataset = dataset.load_tfrecord_dataset(
            FLAGS.val_dataset, FLAGS.classes, FLAGS.size)
    else:
        val_dataset = dataset.load_fake_dataset()
    val_dataset = val_dataset.batch(FLAGS.batch_size)
    val_dataset = val_dataset.map(lambda x, y: (
        dataset.transform_images(x, FLAGS.size),
        dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size)))

    if FLAGS.mode == 'eager_tf':
        # Eager mode is great for debugging
        # Non eager graph mode is recommended for real training
        avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
        avg_val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)

        for epoch in range(1, FLAGS.epochs + 1):
            for batch, (images, labels) in enumerate(train_dataset):
                with tf.GradientTape() as tape:
                    outputs = model(images, training=True)
                    regularization_loss = tf.reduce_sum(model.losses)
                    pred_loss = []
                    for output, label, loss_fn in zip(outputs, labels, loss):
                        pred_loss.append(loss_fn(label, output))
                    total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                grads = tape.gradient(total_loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))

                logging.info("{}_train_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_loss.update_state(total_loss)

            for batch, (images, labels) in enumerate(val_dataset):
                outputs = model(images)
                regularization_loss = tf.reduce_sum(model.losses)
                pred_loss = []
                for output, label, loss_fn in zip(outputs, labels, loss):
                    pred_loss.append(loss_fn(label, output))
                total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                logging.info("{}_val_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_val_loss.update_state(total_loss)

            logging.info("{}, train: {}, val: {}".format(
                epoch,
                avg_loss.result().numpy(),
                avg_val_loss.result().numpy()))
            avg_loss.reset_states()
            avg_val_loss.reset_states()
            model.save_weights('checkpoints/yolov3_train_{}.tf'.format(epoch))
    else:
        callbacks = [
            ReduceLROnPlateau(verbose=1),
            EarlyStopping(patience=3, verbose=1),
            ModelCheckpoint('checkpoints/yolov3_train_{epoch}.tf',
                            verbose=1, save_weights_only=True),
            TensorBoard(log_dir='logs')
        ]

        start_time = time.time()
        history = model.fit(train_dataset,
                            epochs=FLAGS.epochs,
                            callbacks=callbacks,
                            validation_data=val_dataset)
        end_time = time.time() - start_time
        print(f'Total Training Time: {end_time}')
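
# NOTE: Hypothetical sketch of the setup_model() helper called above (this
# variant also assumes `import time`). In the upstream yolov3-tf2 train.py the
# helper builds the model, applies the transfer-learning configuration, and
# compiles it, so the bare model.fit path works and everything created here
# lands inside strategy.scope() when multi-GPU is enabled. The transfer
# branches are elided; details are assumptions.
def setup_model():
    if FLAGS.tiny:
        model = YoloV3Tiny(FLAGS.size, training=True, classes=FLAGS.num_classes)
        anchors, anchor_masks = yolo_tiny_anchors, yolo_tiny_anchor_masks
    else:
        model = YoloV3(FLAGS.size, training=True, classes=FLAGS.num_classes)
        anchors, anchor_masks = yolo_anchors, yolo_anchor_masks

    # ... pretrained-weight loading and freezing elided ...

    optimizer = tf.keras.optimizers.Adam(lr=FLAGS.learning_rate)
    loss = [YoloLoss(anchors[mask], classes=FLAGS.num_classes)
            for mask in anchor_masks]
    model.compile(optimizer=optimizer, loss=loss,
                  run_eagerly=(FLAGS.mode == 'eager_fit'))
    return model, optimizer, loss, anchors, anchor_masks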
def main(_argv):
    if FLAGS.mode == "eager_tf":
        tf.compat.v1.enable_eager_execution()

    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    if len(physical_devices) > 0:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)

    if FLAGS.tiny:
        model = YoloV3Tiny(FLAGS.size, training=True, classes=FLAGS.num_classes)
        anchors = yolo_tiny_anchors
        anchor_masks = yolo_tiny_anchor_masks
    else:
        model = YoloV3(FLAGS.size, training=True, classes=FLAGS.num_classes)
        anchors = yolo_anchors
        anchor_masks = yolo_anchor_masks

    if FLAGS.trace:
        run_options = tf.compat.v1.RunOptions(
            output_partition_graphs=True,
            trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
        run_metadata = tf.compat.v1.RunMetadata()
        trace_dir = os.path.join("traces", "training")
        if not os.path.isdir(trace_dir):
            os.makedirs(trace_dir)
        graphs_dir = os.path.join("traces", "training", "graphs")
        if not os.path.isdir(graphs_dir):
            os.makedirs(graphs_dir)
    else:
        run_options = None
        run_metadata = None

    train_dataset = dataset.load_fake_dataset()
    if FLAGS.dataset:
        train_dataset = dataset.load_tfrecord_dataset(
            FLAGS.dataset, FLAGS.classes, FLAGS.size)
    train_dataset = train_dataset.shuffle(buffer_size=512)
    train_dataset = train_dataset.batch(FLAGS.batch_size)
    train_dataset = train_dataset.map(lambda x, y: (
        dataset.transform_images(x, FLAGS.size),
        dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size)))
    train_dataset = train_dataset.repeat()
    train_dataset = train_dataset.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)

    val_dataset = dataset.load_fake_dataset()
    if FLAGS.val_dataset:
        val_dataset = dataset.load_tfrecord_dataset(
            FLAGS.val_dataset, FLAGS.classes, FLAGS.size)
    val_dataset = val_dataset.batch(FLAGS.batch_size)
    val_dataset = val_dataset.map(lambda x, y: (
        dataset.transform_images(x, FLAGS.size),
        dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size)))
    val_dataset = val_dataset.repeat()  # TF2 doesn't need this, but we're using TF1.15.

    if FLAGS.mode == "fit":
        sess = tf.keras.backend.get_session()
        sess.run(tf.compat.v1.global_variables_initializer(),
                 options=run_options, run_metadata=run_metadata)
        if FLAGS.trace:
            fetched_timeline = timeline.Timeline(run_metadata.step_stats)
            chrome_trace = fetched_timeline.generate_chrome_trace_format()
            with open(os.path.join(trace_dir, "variables_init.json"), 'w') as f:
                f.write(chrome_trace)
            for i in range(len(run_metadata.partition_graphs)):
                with open(
                        os.path.join(graphs_dir,
                                     f"variables_init_partition_{i}.pbtxt"),
                        'w') as f:
                    f.write(str(run_metadata.partition_graphs[i]))

        sess.run(tf.compat.v1.tables_initializer(),
                 options=run_options, run_metadata=run_metadata)
        if FLAGS.trace:
            fetched_timeline = timeline.Timeline(run_metadata.step_stats)
            chrome_trace = fetched_timeline.generate_chrome_trace_format()
            with open(os.path.join(trace_dir, "table_init.json"), 'w') as f:
                f.write(chrome_trace)
            for i in range(len(run_metadata.partition_graphs)):
                with open(
                        os.path.join(graphs_dir,
                                     f"table_init_partition_{i}.pbtxt"),
                        'w') as f:
                    f.write(str(run_metadata.partition_graphs[i]))

    # Configure the model for transfer learning
    if FLAGS.transfer == 'none':
        pass  # Nothing to do
    elif FLAGS.transfer in ['darknet', 'no_output']:
        # Darknet transfer is a special case that works
        # with an incompatible number of classes.
        # Reset top layers.
        if FLAGS.tiny:
            model_pretrained = YoloV3Tiny(
                FLAGS.size, training=True,
                classes=FLAGS.weights_num_classes or FLAGS.num_classes)
        else:
            model_pretrained = YoloV3(
                FLAGS.size, training=True,
                classes=FLAGS.weights_num_classes or FLAGS.num_classes)
        model_pretrained.load_weights(FLAGS.weights)

        if FLAGS.transfer == 'darknet':
            model.get_layer('yolo_darknet').set_weights(
                model_pretrained.get_layer('yolo_darknet').get_weights())
            freeze_all(model.get_layer('yolo_darknet'))
        elif FLAGS.transfer == 'no_output':
            for l in model.layers:
                if not l.name.startswith('yolo_output'):
                    l.set_weights(
                        model_pretrained.get_layer(l.name).get_weights())
                    freeze_all(l)
    else:
        # All other transfer modes require matching classes
        model.load_weights(FLAGS.weights)
        if FLAGS.transfer == 'fine_tune':
            # freeze darknet and fine tune other layers
            darknet = model.get_layer('yolo_darknet')
            freeze_all(darknet)
        elif FLAGS.transfer == 'frozen':
            # freeze everything
            freeze_all(model)

    optimizer = tf.keras.optimizers.Adam(lr=FLAGS.learning_rate)
    loss = [
        YoloLoss(anchors[mask], classes=FLAGS.num_classes)
        for mask in anchor_masks
    ]

    if FLAGS.mode == 'eager_tf':
        # Eager mode is great for debugging
        # Non eager graph mode is recommended for real training
        avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
        avg_val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)

        for epoch in range(1, FLAGS.epochs + 1):
            for batch, (images, labels) in enumerate(train_dataset):
                with tf.GradientTape() as tape:
                    outputs = model(images, training=True)
                    regularization_loss = tf.reduce_sum(model.losses)
                    pred_loss = []
                    for output, label, loss_fn in zip(outputs, labels, loss):
                        pred_loss.append(loss_fn(label, output))
                    total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                grads = tape.gradient(total_loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))

                logging.info("{}_train_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_loss.update_state(total_loss)

            for batch, (images, labels) in enumerate(val_dataset):
                outputs = model(images)
                regularization_loss = tf.reduce_sum(model.losses)
                pred_loss = []
                for output, label, loss_fn in zip(outputs, labels, loss):
                    pred_loss.append(loss_fn(label, output))
                total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                logging.info("{}_val_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_val_loss.update_state(total_loss)

            logging.info("{}, train: {}, val: {}".format(
                epoch,
                avg_loss.result().numpy(),
                avg_val_loss.result().numpy()))
            avg_loss.reset_states()
            avg_val_loss.reset_states()
            model.save_weights('checkpoints/yolov3_train_{}.tf'.format(epoch))
    else:
        model.compile(optimizer=optimizer, loss=loss,
                      run_eagerly=(FLAGS.mode == 'eager_fit'),
                      options=run_options, run_metadata=run_metadata)

        callbacks = [
            ReduceLROnPlateau(verbose=1),
            EarlyStopping(patience=3, verbose=1),
            ModelCheckpoint('checkpoints/yolov3_train_{epoch}.tf',
                            verbose=1, save_weights_only=True),
        ]

        class TraceCallback(tf.keras.callbacks.Callback):
            def on_epoch_begin(self, epoch, logs=None):
                self.current_epoch = epoch

            def on_train_batch_end(self, batch, logs=None):
                fetched_timeline = timeline.Timeline(run_metadata.step_stats)
                chrome_trace = fetched_timeline.generate_chrome_trace_format()
                with open(
                        os.path.join(
                            trace_dir,
                            f"training_epoch_{self.current_epoch}_batch_{batch}.json"
                        ), 'w') as f:
                    f.write(chrome_trace)
                # No need to dump graph partitions for every batch;
                # they should be identical.
                if batch == 0:
                    for i in range(len(run_metadata.partition_graphs)):
                        with open(
                                os.path.join(graphs_dir,
                                             f"training_partition_{i}.pbtxt"),
                                'w') as f:
                            f.write(str(run_metadata.partition_graphs[i]))

        if FLAGS.trace:
            callbacks.append(TraceCallback())
        else:
            callbacks.append(TensorBoard(write_graph=False, log_dir="logs"))

        history = model.fit(
            train_dataset,
            epochs=FLAGS.epochs,
            callbacks=callbacks,
            validation_data=val_dataset,
            steps_per_epoch=FLAGS.num_samples // FLAGS.batch_size,
            validation_steps=FLAGS.num_val_samples // FLAGS.batch_size)
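
# NOTE: Imports assumed by the TF 1.15 tracing variant above. The timeline
# module is TF1's standard Chrome-trace writer; the generated .json files
# open in chrome://tracing.
import os

import tensorflow as tf
from tensorflow.python.client import timeline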
def main(_argv):
    # GPU setup
    physical_devices = tf.config.experimental.list_physical_devices('GPU')  # get all physical GPUs
    for physical_device in physical_devices:
        tf.config.experimental.set_memory_growth(physical_device, True)  # enable memory growth

    # Model initialization
    if FLAGS.tiny:
        model = YoloV3Tiny(FLAGS.size, training=True, classes=FLAGS.num_classes)
        anchors = yolo_tiny_anchors
        anchor_masks = yolo_tiny_anchor_masks
    else:
        model = YoloV3(FLAGS.size, training=True, classes=FLAGS.num_classes)
        anchors = yolo_anchors
        anchor_masks = yolo_anchor_masks

    # Load the datasets
    # Training set
    train_dataset = dataset.load_fake_dataset()
    if FLAGS.dataset:
        train_dataset = dataset.load_tfrecord_dataset(
            # A resize already happens here, via the same function as the later
            # resize call; the later call should be removed, and this one should
            # be changed to a resize that does not distort the image.
            FLAGS.dataset, FLAGS.classes, FLAGS.size)

    num_of_data = 0
    for _ in train_dataset:
        num_of_data += 1

    train_dataset = train_dataset.shuffle(buffer_size=512)
    train_dataset = train_dataset.batch(FLAGS.batch_size)
    train_dataset = train_dataset.map(lambda x, y: (
        # Resizes and rescales pixel values; the exact approach may need changing.
        dataset.transform_images(x, FLAGS.size),
        dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size)))
    train_dataset = train_dataset.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)

    # Validation set
    val_dataset = dataset.load_fake_dataset()
    if FLAGS.val_dataset:
        val_dataset = dataset.load_tfrecord_dataset(
            FLAGS.val_dataset, FLAGS.classes, FLAGS.size)
    val_dataset = val_dataset.batch(FLAGS.batch_size)
    val_dataset = val_dataset.map(lambda x, y: (
        dataset.transform_images(x, FLAGS.size),
        dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size)))

    # Transfer learning
    if FLAGS.transfer == 'none':
        # No transfer learning
        pass
    elif FLAGS.transfer in ['darknet', 'no_output', 'no_output_no_freeze']:
        # Transfer only certain layers' weights (and possibly freeze them)
        if FLAGS.tiny:
            model_pretrained = YoloV3Tiny(
                FLAGS.size, training=True,
                classes=FLAGS.weights_num_classes or FLAGS.num_classes)
        else:
            model_pretrained = YoloV3(
                FLAGS.size, training=True,
                classes=FLAGS.weights_num_classes or FLAGS.num_classes)
        model_pretrained.load_weights(FLAGS.weights)

        if FLAGS.transfer == 'darknet':
            # Load the darknet layer weights and freeze them
            model.get_layer('yolo_darknet').set_weights(
                model_pretrained.get_layer('yolo_darknet').get_weights())
            freeze_all(model.get_layer('yolo_darknet'))
        elif FLAGS.transfer == 'no_output':
            # Load everything except the output layers, and freeze
            for l in model.layers:
                if not l.name.startswith('yolo_output'):
                    l.set_weights(model_pretrained.get_layer(l.name).get_weights())
                    freeze_all(l)
        else:
            # Load everything except the output layers, without freezing
            for l in model.layers:
                if not l.name.startswith('yolo_output'):
                    l.set_weights(model_pretrained.get_layer(l.name).get_weights())
    else:
        # Transfer the whole network's weights and freeze some layers
        model.load_weights(FLAGS.weights)
        if FLAGS.transfer == 'fine_tune':
            # Whole network, with yolo_darknet frozen
            darknet = model.get_layer('yolo_darknet')
            freeze_all(darknet)
        elif FLAGS.transfer == 'frozen':
            # Whole network, with everything frozen
            freeze_all(model)
        elif FLAGS.transfer == 'continue':
            # Whole network, keep training as-is
            pass

    optimizer = tf.keras.optimizers.Adam(lr=FLAGS.learning_rate)
    # loss: [large-scale loss, medium-scale loss, small-scale loss]
    loss = [YoloLoss(anchors[mask], classes=FLAGS.num_classes)
            for mask in anchor_masks]

    if FLAGS.mode == 'eager_tf':
        avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)

        # For writing the log file
        num_of_batch = int(np.ceil(num_of_data / FLAGS.batch_size))
        logging.info("num of data: {}, batch size: {}, num of batch: {}".format(
            num_of_data, FLAGS.batch_size, num_of_batch))
        train_summary_writer = tf.summary.create_file_writer('logs/train')

        for epoch in range(FLAGS.epochs):
            for batch, (images, labels) in enumerate(train_dataset):
                with tf.GradientTape() as tape:
                    # Regularization loss
                    regularization_loss = tf.reduce_sum(model.losses)
                    # Prediction loss
                    # outputs: (large-scale, medium-scale, small-scale), shaped:
                    #   ((batch_size, h2, w2, num_anchors0, num_class+5),
                    #    (batch_size, h2*2, w2*2, num_anchors1, num_class+5),
                    #    (batch_size, h2*4, w2*4, num_anchors2, num_class+5))
                    # labels: targets built from the GT boxes, shaped:
                    #   (large-scale target, medium-scale target, small-scale target)
                    #   (batch_size, grid_size, grid_size, num_anchors,
                    #    [x1, y1, x2, y2, 1, class])
                    # loss: [large-scale loss, medium-scale loss, small-scale loss]
                    # loss = [YoloLoss(anchors[mask], classes=FLAGS.num_classes)
                    #         for mask in anchor_masks]
                    # pred_loss: [(batch,), (batch,), (batch,)]
                    pred_loss = []
                    outputs = model(images, training=True)
                    for output, label, loss_fn in zip(outputs, labels, loss):
                        pred_loss.append(loss_fn(label, output))
                    loss_without_reg = tf.reduce_sum(pred_loss)
                    # Loss including the regularization term
                    total_loss = loss_without_reg + regularization_loss

                grads = tape.gradient(total_loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))

                logging.info("epoch_{}_batch_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_loss.update_state(total_loss)

                # Record the training loss once per batch
                with train_summary_writer.as_default():
                    tf.summary.scalar('loss', loss_without_reg.numpy(),
                                      step=(epoch * num_of_batch + batch))

            # Save a checkpoint periodically
            model.save_weights('checkpoints/yolov3_{}_{}.tf'.format(epoch, batch))
            # Periodically compute mAP
            avg_loss.reset_states()
    else:
        model.compile(optimizer=optimizer, loss=loss,
                      run_eagerly=(FLAGS.mode == 'eager_fit'))

        callbacks = [
            ReduceLROnPlateau(verbose=1),
            EarlyStopping(patience=3, verbose=1),
            ModelCheckpoint('checkpoints/yolov3_train_{epoch}.tf',
                            verbose=1, save_weights_only=True, save_freq=500),
            TensorBoard(log_dir='logs', update_freq=10)
        ]

        # history.history is a dict holding the losses and other metrics
        # recorded during training
        history = model.fit(train_dataset,
                            epochs=FLAGS.epochs,
                            callbacks=callbacks,
                            validation_data=val_dataset)
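
# NOTE: Computing num_of_data above iterates the whole TFRecord in a Python
# loop. A sketch of an equivalent single-pass count inside the tf.data graph
# (cardinality is typically UNKNOWN for parsed TFRecord pipelines, so a pass
# over the data is still required; assumes the dataset is finite):
import tensorflow as tf

def count_records(ds):
    """Count dataset elements with one streaming pass (no Python-side loop)."""
    return int(ds.reduce(tf.constant(0, tf.int64), lambda n, _: n + 1).numpy())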
def main(_argv):
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

    if FLAGS.tiny:
        model = YoloV3Tiny(FLAGS.size, training=True)
        anchors = yolo_tiny_anchors
        anchor_masks = yolo_tiny_anchor_masks
    else:
        model = YoloV3(FLAGS.size, training=True)
        anchors = yolo_anchors
        anchor_masks = yolo_anchor_masks

    train_dataset = dataset.load_fake_dataset()
    if FLAGS.dataset:
        train_dataset = dataset.load_tfrecord_dataset(FLAGS.dataset, FLAGS.classes)
    train_dataset = train_dataset.shuffle(buffer_size=1024)  # TODO: not 1024
    train_dataset = train_dataset.batch(FLAGS.batch_size)
    train_dataset = train_dataset.map(
        lambda x, y: (dataset.transform_images(x, FLAGS.size),
                      dataset.transform_targets(y, anchors, anchor_masks, 80)))
    train_dataset = train_dataset.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)

    val_dataset = dataset.load_fake_dataset()
    if FLAGS.val_dataset:
        val_dataset = dataset.load_tfrecord_dataset(FLAGS.val_dataset, FLAGS.classes)
    val_dataset = val_dataset.batch(FLAGS.batch_size)
    val_dataset = val_dataset.map(
        lambda x, y: (dataset.transform_images(x, FLAGS.size),
                      dataset.transform_targets(y, anchors, anchor_masks, 80)))

    if FLAGS.transfer != 'none':
        model.load_weights(FLAGS.weights)
        if FLAGS.transfer == 'fine_tune':
            # freeze darknet
            darknet = model.get_layer('yolo_darknet')
            freeze_all(darknet)
        elif FLAGS.transfer == 'frozen':  # was FLAGS.mode == 'frozen', a bug
            # freeze everything
            freeze_all(model)
        else:
            # reset top layers
            if FLAGS.tiny:  # get initial weights
                init_model = YoloV3Tiny(FLAGS.size, training=True)
            else:
                init_model = YoloV3(FLAGS.size, training=True)

            if FLAGS.transfer == 'darknet':
                for l in model.layers:
                    if l.name != 'yolo_darknet' and l.name.startswith('yolo_'):
                        l.set_weights(init_model.get_layer(l.name).get_weights())
                    else:
                        freeze_all(l)
            elif FLAGS.transfer == 'no_output':
                for l in model.layers:
                    if l.name.startswith('yolo_output'):
                        l.set_weights(init_model.get_layer(l.name).get_weights())
                    else:
                        freeze_all(l)

    # Horovod: adjust learning rate based on number of GPUs.
    optimizer = tf.optimizers.Adam(FLAGS.learning_rate * hvd.size())
    # Horovod: add Horovod DistributedOptimizer.

    loss = [YoloLoss(anchors[mask]) for mask in anchor_masks]

    if FLAGS.mode == 'eager_tf':
        # Eager mode is great for debugging
        # Non eager graph mode is recommended for real training
        avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
        avg_val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)

        for epoch in range(1, FLAGS.epochs + 1):
            for batch, (images, labels) in enumerate(
                    train_dataset.take(5717 // hvd.size())):
                with tf.GradientTape() as tape:
                    outputs = model(images, training=True)
                    regularization_loss = tf.reduce_sum(model.losses)
                    pred_loss = []
                    for output, label, loss_fn in zip(outputs, labels, loss):
                        pred_loss.append(loss_fn(label, output))
                    total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                # Horovod: add Horovod Distributed GradientTape.
                tape = hvd.DistributedGradientTape(tape)

                grads = tape.gradient(total_loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))

                # Horovod: broadcast initial variable states from rank 0 to all
                # other processes. This is necessary to ensure consistent
                # initialization of all workers when training is started with
                # random weights or restored from a checkpoint.
                #
                # Note: broadcast should be done after the first gradient step
                # to ensure optimizer initialization.
                if batch == 0:
                    hvd.broadcast_variables(model.variables, root_rank=0)
                    hvd.broadcast_variables(optimizer.variables(), root_rank=0)

                if hvd.rank() == 0:
                    logging.info("{}_train_{}, {}, {}".format(
                        epoch, batch, total_loss.numpy(),
                        list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_loss.update_state(total_loss)

            for batch, (images, labels) in enumerate(val_dataset):
                outputs = model(images)
                regularization_loss = tf.reduce_sum(model.losses)
                pred_loss = []
                for output, label, loss_fn in zip(outputs, labels, loss):
                    pred_loss.append(loss_fn(label, output))
                total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                if hvd.rank() == 0:
                    logging.info("{}_val_{}, {}, {}".format(
                        epoch, batch, total_loss.numpy(),
                        list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_val_loss.update_state(total_loss)

            if hvd.rank() == 0:
                logging.info("{}, train: {}, val: {}".format(
                    epoch,
                    avg_loss.result().numpy(),
                    avg_val_loss.result().numpy()))

            avg_loss.reset_states()
            avg_val_loss.reset_states()
            if hvd.rank() == 0:
                model.save_weights(
                    'checkpoints/horovod_yolov3_train_{}.tf'.format(epoch))
    else:
        model.compile(optimizer=optimizer, loss=loss,
                      run_eagerly=(FLAGS.mode == 'eager_fit'))

        callbacks = [
            ReduceLROnPlateau(verbose=1),
            EarlyStopping(patience=3, verbose=1),
            ModelCheckpoint('checkpoints/yolov3_train_{epoch}.tf',
                            verbose=1, save_weights_only=True),
            TensorBoard(log_dir='logs')
        ]

        history = model.fit(train_dataset,
                            epochs=FLAGS.epochs,
                            callbacks=callbacks,
                            validation_data=val_dataset)
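
# NOTE: The "add Horovod DistributedOptimizer" comment above is never acted
# on, so only the eager path (via hvd.DistributedGradientTape) actually
# averages gradients across workers; the model.fit path trains each worker
# independently. A sketch of what the fit path would conventionally add,
# following Horovod's Keras examples (an assumption, not this script's
# confirmed behavior):
import horovod.tensorflow.keras as hvd_keras

# Wrap the optimizer so gradients are averaged across workers inside fit().
optimizer = hvd_keras.DistributedOptimizer(optimizer)

fit_callbacks = [
    # Broadcast initial variables from rank 0 so all workers start in sync.
    hvd_keras.callbacks.BroadcastGlobalVariablesCallback(0),
]
# Only rank 0 should write checkpoints and TensorBoard logs.
if hvd_keras.rank() == 0:
    fit_callbacks.append(ModelCheckpoint('checkpoints/yolov3_train_{epoch}.tf',
                                         verbose=1, save_weights_only=True))
# Typical launch: horovodrun -np 4 -H localhost:4 python train.py --dataset ...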
def main(_argv):
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    for physical_device in physical_devices:
        tf.config.experimental.set_memory_growth(physical_device, True)

    # The anchors are fixed: each of the three output scales uses a fixed set
    # of 3 anchor boxes.
    if FLAGS.tiny:
        model = YoloV3Tiny(FLAGS.size, training=True, classes=FLAGS.num_classes)
        anchors = yolo_tiny_anchors
        anchor_masks = yolo_tiny_anchor_masks
    else:
        model = YoloV3(FLAGS.size, training=True, classes=FLAGS.num_classes)
        anchors = yolo_anchors
        anchor_masks = yolo_anchor_masks

    # Fallback that provides a fake dataset when --dataset is not given
    train_dataset = dataset.load_fake_dataset()
    # Load the training data; preprocessing is done via dataset.map below
    if FLAGS.dataset:
        train_dataset = dataset.load_tfrecord_dataset(
            FLAGS.dataset, FLAGS.classes, FLAGS.size)
    # Shuffle the training data
    train_dataset = train_dataset.shuffle(buffer_size=512)
    # Set the batch size
    train_dataset = train_dataset.batch(FLAGS.batch_size)
    # Match targets to anchors and preprocess via map
    train_dataset = train_dataset.map(lambda x, y: (
        dataset.transform_images(x, FLAGS.size),
        dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size)))
    # Prefetch with AUTOTUNE so the input pipeline overlaps preprocessing
    # with training
    train_dataset = train_dataset.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)

    # Same as train_dataset
    val_dataset = dataset.load_fake_dataset()
    if FLAGS.val_dataset:
        val_dataset = dataset.load_tfrecord_dataset(
            FLAGS.val_dataset, FLAGS.classes, FLAGS.size)
    val_dataset = val_dataset.batch(FLAGS.batch_size)
    val_dataset = val_dataset.map(lambda x, y: (
        dataset.transform_images(x, FLAGS.size),
        dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size)))

    # Configure the model for transfer learning, i.e. reuse pretrained
    # YOLO weights
    if FLAGS.transfer == 'none':
        pass  # Nothing to do
    elif FLAGS.transfer in ['darknet', 'no_output']:
        # Darknet transfer is a special case that works
        # with an incompatible number of classes.
        # Reset top layers; build a pretrained model to copy weights from.
        if FLAGS.tiny:
            model_pretrained = YoloV3Tiny(
                FLAGS.size, training=True,
                classes=FLAGS.weights_num_classes or FLAGS.num_classes)
        else:
            model_pretrained = YoloV3(
                FLAGS.size, training=True,
                classes=FLAGS.weights_num_classes or FLAGS.num_classes)
        # Load the pretrained weights
        model_pretrained.load_weights(FLAGS.weights)

        # Copy the darknet backbone weights and freeze them; only the
        # non-backbone parameters are trained.
        if FLAGS.transfer == 'darknet':
            model.get_layer('yolo_darknet').set_weights(
                model_pretrained.get_layer('yolo_darknet').get_weights())
            freeze_all(model.get_layer('yolo_darknet'))
        # Copy everything except the yolo_output heads and freeze it; only
        # the output heads are trained.
        elif FLAGS.transfer == 'no_output':
            for l in model.layers:
                if not l.name.startswith('yolo_output'):
                    l.set_weights(model_pretrained.get_layer(l.name).get_weights())
                    freeze_all(l)
    else:
        # All other transfer modes require matching classes
        model.load_weights(FLAGS.weights)
        # Load the full model and freeze the darknet backbone; only the
        # non-backbone parameters are trained.
        if FLAGS.transfer == 'fine_tune':
            # freeze darknet and fine tune other layers
            darknet = model.get_layer('yolo_darknet')
            freeze_all(darknet)
        # Load the full model and freeze everything; nothing is updated
        # during training.
        elif FLAGS.transfer == 'frozen':
            # freeze everything
            freeze_all(model)

    # Optimizer
    optimizer = tf.keras.optimizers.Adam(lr=FLAGS.learning_rate)
    # Loss functions, one per output scale:
    # [large-scale loss, medium-scale loss, small-scale loss]
    loss = [YoloLoss(anchors[mask], classes=FLAGS.num_classes)
            for mask in anchor_masks]

    # Eager mode gives immediate feedback, handy for watching training behavior
    if FLAGS.mode == 'eager_tf':
        # Eager mode is great for debugging
        # Non eager graph mode is recommended for real training
        # Running means of the losses
        avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
        avg_val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)

        for epoch in range(1, FLAGS.epochs + 1):
            for batch, (images, labels) in enumerate(train_dataset):
                # Compute gradients from the training loss and update the
                # model parameters
                with tf.GradientTape() as tape:
                    outputs = model(images, training=True)
                    regularization_loss = tf.reduce_sum(model.losses)
                    pred_loss = []
                    for output, label, loss_fn in zip(outputs, labels, loss):
                        pred_loss.append(loss_fn(label, output))
                    total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                grads = tape.gradient(total_loss, model.trainable_variables)
                optimizer.apply_gradients(
                    zip(grads, model.trainable_variables))

                # Log the loss for this batch
                logging.info("{}_train_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_loss.update_state(total_loss)

            # Validation: same as the training part above
            for batch, (images, labels) in enumerate(val_dataset):
                outputs = model(images)
                regularization_loss = tf.reduce_sum(model.losses)
                pred_loss = []
                for output, label, loss_fn in zip(outputs, labels, loss):
                    pred_loss.append(loss_fn(label, output))
                total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                logging.info("{}_val_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_val_loss.update_state(total_loss)

            # Log the epoch-level train and val losses
            logging.info("{}, train: {}, val: {}".format(
                epoch,
                avg_loss.result().numpy(),
                avg_val_loss.result().numpy()))

            # Reset the metrics
            avg_loss.reset_states()
            avg_val_loss.reset_states()

            # Save the weights after this epoch
            model.save_weights(
                'checkpoints/yolov3_train_{}.tf'.format(epoch))
    # Otherwise train in graph mode; no per-step feedback appears in the log
    else:
        # Compile the model
        model.compile(optimizer=optimizer, loss=loss,
                      run_eagerly=(FLAGS.mode == 'eager_fit'),
                      metrics=['accuracy'])

        # Callbacks, including one that saves the model weights
        callbacks = [
            ReduceLROnPlateau(verbose=1),
            EarlyStopping(patience=3, verbose=1),
            ModelCheckpoint('checkpoints/yolov3_train_{epoch}.tf',
                            verbose=1, save_weights_only=True),
            TensorBoard(log_dir='logs')
        ]

        # # period=2 saves a checkpoint every 2 epochs:
        # callbacks = [
        #     ReduceLROnPlateau(verbose=1),
        #     EarlyStopping(patience=3, verbose=1),
        #     ModelCheckpoint('checkpoints/yolov3_train_{epoch}.tf',
        #                     verbose=1, save_weights_only=True, period=2),
        #     TensorBoard(log_dir='logs')
        # ]

        # Train with the callbacks above
        history = model.fit(train_dataset,
                            epochs=FLAGS.epochs,
                            callbacks=callbacks,
                            validation_data=val_dataset,
                            validation_freq=1)
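
# NOTE: The commented-out callback block above uses `period`, which Keras has
# deprecated in favor of `save_freq`; the two count differently (`period`
# counts epochs, an integer `save_freq` counts batches). A sketch of both
# spellings (values illustrative):
from tensorflow.keras.callbacks import ModelCheckpoint

# Deprecated spelling: save every 2 epochs.
ckpt_by_epoch = ModelCheckpoint('checkpoints/yolov3_train_{epoch}.tf',
                                save_weights_only=True, verbose=1, period=2)

# Current spelling: save every 500 *batches* (or pass save_freq='epoch').
ckpt_by_batch = ModelCheckpoint('checkpoints/yolov3_train_{epoch}.tf',
                                save_weights_only=True, verbose=1, save_freq=500)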
def main(_argv):
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    for physical_device in physical_devices:
        tf.config.experimental.set_memory_growth(physical_device, True)

    if FLAGS.tiny:
        model = YoloV3Tiny(FLAGS.size, training=True, classes=FLAGS.num_classes)
        anchors = yolo_tiny_anchors
        anchor_masks = yolo_tiny_anchor_masks
    else:
        model = YoloV3(FLAGS.size, training=True, classes=FLAGS.num_classes)
        anchors = yolo_anchors
        anchor_masks = yolo_anchor_masks

    train_dataset = dataset.load_fake_dataset()
    if FLAGS.dataset:
        train_dataset = dataset.load_tf_record(
            tfrecord=FLAGS.dataset,
            mode=tf.estimator.ModeKeys.TRAIN,
            class_file=FLAGS.classes,
            anchors=anchors,
            anchor_masks=anchor_masks,
            batch_size=FLAGS.batch_size,
            max_detections=FLAGS.yolo_max_boxes,
            size=FLAGS.size,
            augmentation=FLAGS.augmentation)

    val_dataset = dataset.load_fake_dataset()
    if FLAGS.val_dataset:
        val_dataset = dataset.load_tf_record(
            tfrecord=FLAGS.val_dataset,
            mode=tf.estimator.ModeKeys.EVAL,
            class_file=FLAGS.classes,
            anchors=anchors,
            anchor_masks=anchor_masks,
            batch_size=FLAGS.batch_size,
            max_detections=FLAGS.yolo_max_boxes,
            size=FLAGS.size,
            augmentation=False)

    # Configure the model for transfer learning
    if FLAGS.transfer == 'none':
        pass  # Nothing to do
    elif FLAGS.transfer in ['darknet', 'no_output']:
        # Darknet transfer is a special case that works
        # with an incompatible number of classes.
        # Reset top layers.
        if FLAGS.tiny:
            model_pretrained = YoloV3Tiny(
                FLAGS.size, training=True,
                classes=FLAGS.weights_num_classes or FLAGS.num_classes)
        else:
            model_pretrained = YoloV3(
                FLAGS.size, training=True,
                classes=FLAGS.weights_num_classes or FLAGS.num_classes)
        model_pretrained.load_weights(FLAGS.weights)

        if FLAGS.transfer == 'darknet':
            model.get_layer('yolo_darknet').set_weights(
                model_pretrained.get_layer('yolo_darknet').get_weights())
            freeze_all(model.get_layer('yolo_darknet'))
        elif FLAGS.transfer == 'no_output':
            for l in model.layers:
                if not l.name.startswith('yolo_output'):
                    l.set_weights(
                        model_pretrained.get_layer(l.name).get_weights())
                    freeze_all(l)
    else:
        # All other transfer modes require matching classes
        model.load_weights(FLAGS.weights)
        if FLAGS.transfer == 'fine_tune':
            # freeze darknet and fine tune other layers
            darknet = model.get_layer('yolo_darknet')
            freeze_all(darknet)
        elif FLAGS.transfer == 'frozen':
            # freeze everything
            freeze_all(model)

    optimizer = tf.keras.optimizers.Adam(lr=FLAGS.learning_rate)
    loss = [
        YoloLoss(anchors[mask], classes=FLAGS.num_classes)
        for mask in anchor_masks
    ]

    train_log_dir = './logs/train'
    val_log_dir = './logs/valid'
    clear_directory(train_log_dir, clear_subdirectories=True)
    clear_directory(val_log_dir, clear_subdirectories=True)
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)
    val_summary_writer = tf.summary.create_file_writer(val_log_dir)

    # Define checkpoint handler: track macro mAP
    ckpt_handler = BestEpochCheckpoint(model, './checkpoints/', 10,
                                       min_delta=0.005, mode='max')

    # Eager mode is great for debugging
    # Non eager graph mode is recommended for real training
    avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
    avg_val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)

    # Training
    for epoch in range(1, FLAGS.epochs + 1):
        for batch, (images, labels) in enumerate(train_dataset):
            images, filenames = images
            with tf.GradientTape() as tape:
                outputs = model(images, training=True)
                regularization_loss = tf.reduce_sum(model.losses)
                pred_loss = []
                for output, label, loss_fn in zip(outputs, labels, loss):
                    pred_loss.append(loss_fn(label, output))
                total_loss = tf.reduce_sum(pred_loss) + regularization_loss

            grads = tape.gradient(total_loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            logging.info("{}_train_{}, {}, {}".format(
                epoch, batch, total_loss.numpy(),
                list(map(lambda x: np.sum(x.numpy()), pred_loss))))
            avg_loss.update_state(total_loss)

        with train_summary_writer.as_default():
            tf.summary.scalar('Avg_loss', avg_loss.result(), step=epoch)

        all_annotations = []
        all_detections = []
        for batch, (images, labels, int_labels) in enumerate(val_dataset):
            images, filenames = images
            outputs = model(images)
            boxes, scores, classes, valid_detections = convert_yolo_output(
                outputs[0], outputs[1], outputs[2],
                anchors=anchors,
                anchor_masks=anchor_masks,
                num_classes=FLAGS.num_classes,
                max_boxes=FLAGS.yolo_max_boxes,
                iou_threshold=FLAGS.yolo_iou_threshold,
                score_threshold=FLAGS.yolo_score_threshold)
            all_annotations = create_annotations(all_annotations, int_labels,
                                                 FLAGS.num_classes)
            all_detections = create_detections(all_detections, boxes, scores,
                                               classes, valid_detections,
                                               FLAGS.num_classes)

            regularization_loss = tf.reduce_sum(model.losses)
            pred_loss = []
            for output, label, loss_fn in zip(outputs, labels, loss):
                pred_loss.append(loss_fn(label, output))
            total_loss = tf.reduce_sum(pred_loss) + regularization_loss

            logging.info("{}_val_{}, {}, {}".format(
                epoch, batch, total_loss.numpy(),
                list(map(lambda x: np.sum(x.numpy()), pred_loss))))
            avg_val_loss.update_state(total_loss)

        ap_val = average_precisions(all_detections, all_annotations,
                                    FLAGS.num_classes, FLAGS.yolo_iou_threshold)
        micro_map, macro_map = calculate_map(ap_val)

        logging.info(
            "{}, train: {}, val: {}, micro_map: {}, macro_map: {}".format(
                epoch,
                avg_loss.result().numpy(),
                avg_val_loss.result().numpy(),
                micro_map, macro_map))
        print('micro_map: ', micro_map)
        print('macro_map: ', macro_map)  # was micro_map, a copy-paste bug

        with val_summary_writer.as_default():
            tf.summary.scalar('Avg_loss', avg_val_loss.result(), step=epoch)
            tf.summary.scalar('Micro_mAP', micro_map, step=epoch)
            tf.summary.scalar('Macro_mAP', macro_map, step=epoch)

        ckpt_handler.on_epoch_end(epoch=epoch, current_monitor=macro_map)

        avg_loss.reset_states()
        avg_val_loss.reset_states()
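
# NOTE: Hypothetical sketch of the BestEpochCheckpoint handler used above:
# keep the weights whenever the monitored value (here macro mAP) improves by
# at least min_delta. The constructor signature is inferred from the call
# site (the third positional argument is read as a patience count); the real
# implementation may differ.
import os

class BestEpochCheckpoint:
    def __init__(self, model, ckpt_dir, patience, min_delta=0.0, mode='max'):
        self.model = model
        self.ckpt_dir = ckpt_dir
        self.patience = patience          # epochs to wait without improvement
        self.min_delta = min_delta
        self.sign = 1.0 if mode == 'max' else -1.0
        self.best = -float('inf')
        self.wait = 0

    def on_epoch_end(self, epoch, current_monitor):
        score = self.sign * float(current_monitor)
        if score > self.best + self.min_delta:
            self.best = score
            self.wait = 0
            self.model.save_weights(
                os.path.join(self.ckpt_dir, 'best_epoch_{}.tf'.format(epoch)))
        else:
            self.wait += 1  # caller may stop training once wait > patience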
def main(_argv):
    if FLAGS.tiny:
        model = YoloV3Tiny(FLAGS.size, training=True, classes=49)
        anchors = yolo_tiny_anchors
        anchor_masks = yolo_tiny_anchor_masks
    elif FLAGS.custom:
        model = Custom(FLAGS.size, training=True)
        anchors = custom_anchors
        anchor_masks = custom_anchor_masks
    else:
        model = YoloV3(FLAGS.size, training=True)
        anchors = yolo_anchors
        anchor_masks = yolo_anchor_masks

    train_dataset = dataset.load_fake_dataset()
    if FLAGS.dataset:
        train_dataset = dataset.load_tfrecord_dataset(FLAGS.dataset, FLAGS.classes)
    train_dataset = train_dataset.shuffle(buffer_size=1024)  # TODO: not 1024
    train_dataset = train_dataset.batch(FLAGS.batch_size)
    train_dataset = train_dataset.map(
        lambda x, y: (dataset.transform_images(x, FLAGS.size),
                      dataset.transform_targets(y, anchors, anchor_masks, 49)))
    train_dataset = train_dataset.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)

    val_dataset = dataset.load_fake_dataset()
    if FLAGS.val_dataset:
        val_dataset = dataset.load_tfrecord_dataset(FLAGS.val_dataset, FLAGS.classes)
    val_dataset = val_dataset.batch(FLAGS.batch_size)
    val_dataset = val_dataset.map(
        lambda x, y: (dataset.transform_images(x, FLAGS.size),
                      dataset.transform_targets(y, anchors, anchor_masks, 49)))

    if FLAGS.transfer != 'none':
        model.load_weights(FLAGS.weights)
        if FLAGS.transfer == 'fine_tune':
            # freeze darknet
            darknet = model.get_layer('yolo_darknet')
            freeze_all(darknet)
        elif FLAGS.transfer == 'frozen':  # was FLAGS.mode == 'frozen', a bug
            # freeze everything
            freeze_all(model)
        else:
            # reset top layers
            if FLAGS.tiny:  # get initial weights
                init_model = YoloV3Tiny(FLAGS.size, training=True)
            elif FLAGS.custom:
                init_model = Custom(FLAGS.size, training=True)
            else:
                init_model = YoloV3(FLAGS.size, training=True)

            if FLAGS.transfer == 'darknet':
                for l in model.layers:
                    if l.name != 'yolo_darknet' and l.name.startswith('yolo_'):
                        l.set_weights(init_model.get_layer(l.name).get_weights())
                    else:
                        freeze_all(l)
            elif FLAGS.transfer == 'no_output':
                for l in model.layers:
                    if l.name.startswith('yolo_output'):
                        l.set_weights(init_model.get_layer(l.name).get_weights())
                    else:
                        freeze_all(l)

    optimizer = tf.keras.optimizers.Adam(lr=FLAGS.learning_rate)
    loss = [YoloLoss(anchors[mask], classes=49) for mask in anchor_masks]

    if FLAGS.mode == 'eager_tf':
        # Eager mode is great for debugging
        # Non eager graph mode is recommended for real training
        avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
        avg_val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)

        for epoch in range(1, FLAGS.epochs + 1):
            for batch, (images, labels) in enumerate(train_dataset):
                with tf.GradientTape() as tape:
                    outputs = model(images, training=True)
                    regularization_loss = tf.reduce_sum(model.losses)
                    pred_loss = []
                    for output, label, loss_fn in zip(outputs, labels, loss):
                        pred_loss.append(loss_fn(label, output))
                    total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                grads = tape.gradient(total_loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))

                logging.info("{}_train_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_loss.update_state(total_loss)

            for batch, (images, labels) in enumerate(val_dataset):
                outputs = model(images)
                regularization_loss = tf.reduce_sum(model.losses)
                pred_loss = []
                for output, label, loss_fn in zip(outputs, labels, loss):
                    pred_loss.append(loss_fn(label, output))
                total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                logging.info("{}_val_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_val_loss.update_state(total_loss)

            logging.info("{}, train: {}, val: {}".format(
                epoch,
                avg_loss.result().numpy(),
                avg_val_loss.result().numpy()))
            avg_loss.reset_states()
            avg_val_loss.reset_states()
            model.save_weights('checkpoints/yolov3_train_{}.tf'.format(epoch))
    else:
        model.compile(optimizer=optimizer, loss=loss,
                      run_eagerly=(FLAGS.mode == 'eager_fit'))

        callbacks = [
            ReduceLROnPlateau(verbose=1),
            EarlyStopping(patience=30, verbose=1),
            ModelCheckpoint('checkpoints/yolov3_train_{epoch}.tf',
                            verbose=1, save_weights_only=True),
            TensorBoard(log_dir='logs')
        ]

        history = model.fit(train_dataset,
                            epochs=FLAGS.epochs,
                            callbacks=callbacks,
                            validation_data=val_dataset)
def main(_argv):
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    for physical_device in physical_devices:
        tf.config.experimental.set_memory_growth(physical_device, True)

    check_weighs_exist(tiny=FLAGS.tiny)

    if FLAGS.tiny:
        model = YoloV3Tiny(
            FLAGS.size,
            training=True,
            classes=FLAGS.num_classes
        )
        model.summary()
        plot_model(model, to_file='yoloV3Tiny-model-plot.png',
                   show_shapes=True, show_layer_names=True)
        anchors = yolo_tiny_anchors
        anchor_masks = yolo_tiny_anchor_masks
    else:
        model = YoloV3(
            FLAGS.size,
            training=True,
            classes=FLAGS.num_classes
        )
        model.summary()
        plot_model(model, to_file='yoloV3-model-plot.png',
                   show_shapes=True, show_layer_names=True)
        anchors = yolo_anchors
        anchor_masks = yolo_anchor_masks

    # Load the dataset
    train_dataset = dataset.load_fake_dataset()
    if FLAGS.dataset:
        train_dataset = dataset.load_tfrecord_dataset(
            file_pattern=FLAGS.dataset,
            class_file=FLAGS.classes,
            size=FLAGS.size
        )

    # Shuffle the dataset
    train_dataset = train_dataset.shuffle(buffer_size=FLAGS.buffer_size,
                                          reshuffle_each_iteration=True)
    train_dataset_length = [i for i, _ in enumerate(train_dataset)][-1] + 1
    print(f"Dataset for training consists of {train_dataset_length} images.")

    train_dataset = train_dataset.batch(FLAGS.batch_size)
    train_dataset = train_dataset.map(
        lambda x, y: (dataset.transform_images(x, FLAGS.size),
                      dataset.transform_targets(y, anchors, anchor_masks,
                                                FLAGS.size))).repeat()
    train_dataset = train_dataset.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)

    val_dataset = dataset.load_fake_dataset()
    if FLAGS.val_dataset:
        val_dataset = dataset.load_tfrecord_dataset(
            FLAGS.val_dataset, FLAGS.classes, FLAGS.size)
    val_dataset_length = [i for i, _ in enumerate(val_dataset)][-1] + 1
    print(f"Dataset for validation consists of {val_dataset_length} images.")
    val_dataset = val_dataset.batch(FLAGS.batch_size)
    val_dataset = val_dataset.map(
        lambda x, y: (dataset.transform_images(x, FLAGS.size),
                      dataset.transform_targets(y, anchors, anchor_masks,
                                                FLAGS.size))).repeat()

    # Configure the model for transfer learning
    if FLAGS.transfer == 'none':
        pass  # Nothing to do
    elif FLAGS.transfer in ['darknet', 'no_output']:
        # Darknet transfer is a special case that works
        # with an incompatible number of classes.
        # Reset top layers.
        if FLAGS.tiny:
            model_pretrained = YoloV3Tiny(
                size=FLAGS.size, training=True,
                classes=FLAGS.weights_num_classes or FLAGS.num_classes)
            model_pretrained.load_weights(FLAGS.weights_tf_format_tiny)
        else:
            model_pretrained = YoloV3(
                size=FLAGS.size, training=True,
                classes=FLAGS.weights_num_classes or FLAGS.num_classes)
            model_pretrained.load_weights(FLAGS.weights_tf_format)

        if FLAGS.transfer == 'darknet':
            # Set yolo darknet layer weights to the loaded pretrained model weights
            model.get_layer('yolo_darknet').set_weights(
                model_pretrained.get_layer('yolo_darknet').get_weights())
            # Freeze these layers
            freeze_all(model.get_layer('yolo_darknet'))
        elif FLAGS.transfer == 'no_output':
            for i in model.layers:
                if not i.name.startswith('yolo_output'):
                    i.set_weights(model_pretrained.get_layer(i.name).get_weights())
                    freeze_all(i)
    else:
        # All other transfer modes require matching classes
        if FLAGS.tiny:
            model.load_weights(FLAGS.weights_tf_format_tiny)
        else:
            model.load_weights(FLAGS.weights_tf_format)

        if FLAGS.transfer == 'fine_tune':
            # freeze darknet and fine tune other layers
            darknet = model.get_layer('yolo_darknet')
            freeze_all(darknet)
        elif FLAGS.transfer == 'frozen':
            # freeze everything
            freeze_all(model)

    # Use the Adam optimizer with the specified learning rate
    optimizer = tf.keras.optimizers.Adam(lr=FLAGS.learning_rate)
    # YoloLoss function
    loss = [YoloLoss(anchors[mask], classes=FLAGS.num_classes)
            for mask in anchor_masks]

    if FLAGS.mode == 'eager_tf':
        print(f"Mode is: {FLAGS.mode}")
        # Eager mode is great for debugging
        # Non eager graph mode is recommended for real training
        avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
        avg_val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)

        for epoch in range(1, FLAGS.epochs + 1):
            for batch, (images, labels) in enumerate(train_dataset):
                with tf.GradientTape() as tape:
                    outputs = model(images, training=True)
                    regularization_loss = tf.reduce_sum(model.losses)
                    pred_loss = []
                    for output, label, loss_fn in zip(outputs, labels, loss):
                        pred_loss.append(loss_fn(label, output))
                    total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                grads = tape.gradient(total_loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))

                logging.info(f"epoch_{epoch}_train_batch_{batch},"
                             f"{total_loss.numpy()},"
                             f"{list(map(lambda x: np.sum(x.numpy()), pred_loss))}")
                avg_loss.update_state(total_loss)

            for batch, (images, labels) in enumerate(val_dataset):
                outputs = model(images)
                regularization_loss = tf.reduce_sum(model.losses)
                pred_loss = []
                for output, label, loss_fn in zip(outputs, labels, loss):
                    pred_loss.append(loss_fn(label, output))
                total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                logging.info("{}_val_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_val_loss.update_state(total_loss)

            logging.info("{}, train: {}, val: {}".format(
                epoch,
                avg_loss.result().numpy(),
                avg_val_loss.result().numpy()))
            avg_loss.reset_states()
            avg_val_loss.reset_states()
            # data_set is defined at module level (not shown here)
            model.save_weights(
                f'checkpoints/{data_set}_tiny_{FLAGS.tiny}_im_size_{FLAGS.size}.tf')
    else:
        print("Compiling the model")
        model.compile(
            optimizer=optimizer,
            loss=loss,
            run_eagerly=(FLAGS.mode == 'eager_fit'),
            metrics=['accuracy'])

        callbacks = [
            EarlyStopping(monitor='val_loss', patience=125, verbose=1),
            ReduceLROnPlateau(monitor='val_loss', verbose=1, factor=0.90,
                              min_lr=0, patience=20, mode="auto"),
            ModelCheckpoint(
                f'checkpoints/{data_set}_tiny_{FLAGS.tiny}_im_size_{FLAGS.size}.tf',
                verbose=1,
                save_weights_only=True,
                save_best_only=True,
                mode="auto",
            ),
            TensorBoard(log_dir='logs'),
            CSVLogger(
                f'checkpoints/logs/{data_set}_tiny_{FLAGS.tiny}_im_size_{FLAGS.size}',
                separator=',')
        ]

        # steps_per_epoch / validation_steps must be ints; np.ceil returns a float
        history = model.fit(train_dataset,
                            epochs=FLAGS.epochs,
                            verbose=2,
                            callbacks=callbacks,
                            validation_data=val_dataset,
                            steps_per_epoch=int(
                                np.ceil(train_dataset_length / FLAGS.batch_size)),
                            validation_steps=int(
                                np.ceil(val_dataset_length / FLAGS.batch_size)))
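
# NOTE: The variant above calls plot_model and check_weighs_exist without
# showing their imports. plot_model is the standard Keras utility (it needs
# the pydot and graphviz packages installed to write the PNG files);
# check_weighs_exist is project-specific and not reproduced here.
from tensorflow.keras.utils import plot_model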
def main(_argv):
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    for physical_device in physical_devices:
        tf.config.experimental.set_memory_growth(physical_device, True)

    if FLAGS.tiny:
        model = YoloV3Tiny(FLAGS.size, training=True, classes=FLAGS.num_classes)
        anchors = yolo_tiny_anchors
        anchor_masks = yolo_tiny_anchor_masks
    else:
        model = YoloV3(FLAGS.size, training=True, classes=FLAGS.num_classes)
        anchors = yolo_anchors
        anchor_masks = yolo_anchor_masks

    post_process_outputs = post_process_block(model.outputs,
                                              classes=FLAGS.num_classes)
    post_process_model = Model(model.inputs, post_process_outputs)

    train_dataset = dataset.load_fake_dataset()
    if FLAGS.dataset:
        train_dataset = dataset.load_tfrecord_dataset(
            FLAGS.dataset, FLAGS.classes, FLAGS.size)
    train_dataset = train_dataset.shuffle(buffer_size=512)
    train_dataset = train_dataset.batch(FLAGS.batch_size)
    train_dataset = train_dataset.map(
        lambda x, y: (dataset.transform_images(x, FLAGS.size), y))
        # dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size)))
    train_dataset = train_dataset.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)

    val_dataset = dataset.load_fake_dataset()
    if FLAGS.val_dataset:
        val_dataset = dataset.load_tfrecord_dataset(
            FLAGS.val_dataset, FLAGS.classes, FLAGS.size)
    val_dataset = val_dataset.batch(FLAGS.batch_size)
    val_dataset = val_dataset.map(
        lambda x, y: (dataset.transform_images(x, FLAGS.size), y))
        # dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size)))

    # Configure the model for transfer learning
    if FLAGS.transfer == 'none':
        pass  # Nothing to do
    elif FLAGS.transfer in ['darknet', 'no_output']:
        # Darknet transfer is a special case that works
        # with an incompatible number of classes.
        # Reset top layers.
        if FLAGS.tiny:
            model_pretrained = YoloV3Tiny(
                FLAGS.size, training=True,
                classes=FLAGS.weights_num_classes or FLAGS.num_classes)
        else:
            model_pretrained = YoloV3(
                FLAGS.size, training=True,
                classes=FLAGS.weights_num_classes or FLAGS.num_classes)
        model_pretrained.load_weights(FLAGS.weights)

        if FLAGS.transfer == 'darknet':
            model.get_layer('yolo_darknet').set_weights(
                model_pretrained.get_layer('yolo_darknet').get_weights())
            freeze_all(model.get_layer('yolo_darknet'))
        elif FLAGS.transfer == 'no_output':
            for l in model.layers:
                if not l.name.startswith('yolo_output'):
                    l.set_weights(
                        model_pretrained.get_layer(l.name).get_weights())
                    freeze_all(l)
    else:
        # All other transfer modes require matching classes
        model.load_weights(FLAGS.weights)
        if FLAGS.transfer == 'fine_tune':
            # freeze darknet and fine tune other layers
            darknet = model.get_layer('yolo_darknet')
            freeze_all(darknet)
        elif FLAGS.transfer == 'frozen':
            # freeze everything
            freeze_all(model)

    optimizer = tf.keras.optimizers.Adam(lr=FLAGS.learning_rate)
    loss = [
        YoloLoss(anchors[mask], classes=FLAGS.num_classes)
        for mask in anchor_masks
    ]

    # (batch_size, grid, grid, anchors, (x, y, w, h, obj, ...cls))
    # model.outputs shape: [[N, 13, 13, 3, 85], [N, 26, 26, 3, 85], [N, 52, 52, 3, 85]]
    # labels shape: ([N, 13, 13, 3, 6], [N, 26, 26, 3, 6], [N, 52, 52, 3, 6])
    if FLAGS.mode == 'eager_tf':
        # Eager mode is great for debugging
        # Non eager graph mode is recommended for real training
        avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
        avg_val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)

        for epoch in range(1, FLAGS.epochs + 1):
            for batch, (images, labels) in enumerate(train_dataset):
                with tf.GradientTape() as tape:
                    outputs = model(images, training=True)
                    regularization_loss = tf.reduce_sum(model.losses)
                    pred_loss = []
                    transf_labels = dataset.transform_targets(
                        labels, anchors, anchor_masks, FLAGS.size)
                    for output, label, loss_fn in zip(outputs, transf_labels, loss):
                        pred_loss.append(loss_fn(label, output))
                    total_loss = tf.reduce_sum(pred_loss, axis=None) \
                        + regularization_loss

                grads = tape.gradient(total_loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))

                log_batch(logging, epoch, batch, total_loss, pred_loss)
                avg_loss.update_state(total_loss)
                if batch >= 100:
                    break

            true_pos_total = np.zeros(FLAGS.num_classes)
            false_pos_total = np.zeros(FLAGS.num_classes)
            n_pos_total = np.zeros(FLAGS.num_classes)
            for batch, (images, labels) in enumerate(val_dataset):
                # get losses
                outputs = model(images)
                regularization_loss = tf.reduce_sum(model.losses)
                pred_loss = []
                transf_labels = dataset.transform_targets(
                    labels, anchors, anchor_masks, FLAGS.size)
                for output, label, loss_fn in zip(outputs, transf_labels, loss):
                    pred_loss.append(loss_fn(label, output))
                total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                log_batch(logging, epoch, batch, total_loss, pred_loss)
                avg_val_loss.update_state(total_loss)

                # get true positives, false positives, and positive labels
                preds = post_process_model(images)
                true_pos, false_pos, n_pos = batch_true_false_positives(
                    preds.numpy(), labels.numpy(), FLAGS.num_classes)
                true_pos_total += true_pos
                false_pos_total += false_pos
                n_pos_total += n_pos
                if batch >= 20:
                    break

            # precision-recall by class
            precision, recall = batch_precision_recall(true_pos_total,
                                                       false_pos_total,
                                                       n_pos_total)
            for c in range(FLAGS.num_classes):
                print('Class {} - Prec: {}, Rec: {}'.format(
                    c, precision[c], recall[c]))
            # total precision-recall
            print('Total - Prec: {}, Rec: {}'.format(
                calc_precision(np.sum(true_pos_total), np.sum(false_pos_total)),
                calc_recall(np.sum(true_pos_total), np.sum(n_pos_total))))

            # Drop into the debugger after each epoch's metrics (debugging aid)
            import pdb
            pdb.set_trace()

            # log losses
            logging.info("{}, train: {}, val: {}".format(
                epoch,
                avg_loss.result().numpy(),
                avg_val_loss.result().numpy()))

            # reset loop and save weights
            avg_loss.reset_states()
            avg_val_loss.reset_states()
            model.save_weights(
                os.path.join(FLAGS.checkpoint_dir,
                             'yolov3_train_{}.tf'.format(epoch)))
    else:
        model.compile(optimizer=optimizer, loss=loss,
                      run_eagerly=(FLAGS.mode == 'eager_fit'))

        callbacks = [
            ReduceLROnPlateau(verbose=1),
            EarlyStopping(patience=3, verbose=1),
            ModelCheckpoint(os.path.join(FLAGS.checkpoint_dir,
                                         'yolov3_train_{epoch}.tf'),
                            verbose=1, save_weights_only=True),
            TensorBoard(log_dir=FLAGS.log_dir)
        ]

        history = model.fit(train_dataset,
                            epochs=FLAGS.epochs,
                            callbacks=callbacks,
                            validation_data=val_dataset)
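
# NOTE: Hypothetical sketch of the calc_precision / calc_recall helpers called
# above; the formulas are the standard ones, but the real implementations
# (e.g. zero-denominator handling) may differ.
def calc_precision(true_pos, false_pos):
    denom = true_pos + false_pos
    return true_pos / denom if denom > 0 else 0.0  # TP / (TP + FP)


def calc_recall(true_pos, n_pos):
    return true_pos / n_pos if n_pos > 0 else 0.0  # TP / number of GT positives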
def main(_argv): try: tpu = tf.distribute.cluster_resolver.TPUClusterResolver() print('Running on TPU ', tpu.cluster_spec().as_dict()['worker']) except: tpu = None if tpu: tf.config.experimental_connect_to_cluster(tpu) tf.tpu.experimental.initialize_tpu_system(tpu) strategy = tf.distribute.experimental.TPUStrategy(tpu) else: strategy = tf.distribute.get_strategy() #print("REPLICAS: ", strategy.num_replicas_in_sync) FLAGS.batch_size = FLAGS.batch_size * strategy.num_replicas_in_sync with strategy.scope(): if FLAGS.tiny: model = YoloV3Tiny(FLAGS.size, training=True, classes=FLAGS.num_classes) anchors = yolo_tiny_anchors anchor_masks = yolo_tiny_anchor_masks else: model = YoloV3(FLAGS.size, training=True, classes=FLAGS.num_classes) anchors = yolo_anchors anchor_masks = yolo_anchor_masks train_dataset = dataset.load_fake_dataset() if FLAGS.dataset: train_dataset = dataset.load_tfrecord_dataset(FLAGS.dataset, FLAGS.classes, FLAGS.size) train_dataset = train_dataset.shuffle(buffer_size=FLAGS.buffer_size) train_dataset = train_dataset.batch(FLAGS.batch_size, drop_remainder=True) train_dataset = train_dataset.map(lambda x, y: ( dataset.transform_images(x, FLAGS.size), dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size))) train_dataset = train_dataset.prefetch( buffer_size=tf.data.experimental.AUTOTUNE) val_dataset = dataset.load_fake_dataset() if FLAGS.val_dataset: val_dataset = dataset.load_tfrecord_dataset(FLAGS.val_dataset, FLAGS.classes, FLAGS.size) val_dataset = val_dataset.batch(FLAGS.batch_size, drop_remainder=True) val_dataset = val_dataset.map(lambda x, y: ( dataset.transform_images(x, FLAGS.size), dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size))) # Configure the model for transfer learning if FLAGS.transfer == 'none': pass # Nothing to do elif FLAGS.transfer in ['darknet', 'no_output']: # Darknet transfer is a special case that works # with incompatible number of classes # reset top layers if FLAGS.tiny: model_pretrained = YoloV3Tiny(FLAGS.size, training=True, classes=FLAGS.weights_num_classes or FLAGS.num_classes) else: model_pretrained = YoloV3(FLAGS.size, training=True, classes=FLAGS.weights_num_classes or FLAGS.num_classes) model_pretrained.load_weights(FLAGS.weights) if FLAGS.transfer == 'darknet': model.get_layer('yolo_darknet').set_weights( model_pretrained.get_layer('yolo_darknet').get_weights()) freeze_all(model.get_layer('yolo_darknet')) elif FLAGS.transfer == 'no_output': for l in model.layers: if not l.name.startswith('yolo_output'): l.set_weights( model_pretrained.get_layer(l.name).get_weights()) freeze_all(l) else: # All other transfer require matching classes model.load_weights(FLAGS.weights) if FLAGS.transfer == 'fine_tune': # freeze darknet and fine tune other layers darknet = model.get_layer('yolo_darknet') freeze_all(darknet) elif FLAGS.transfer == 'frozen': # freeze everything freeze_all(model) if FLAGS.optimizer == 'Adam': optimizer = tf.keras.optimizers.Adam(lr=FLAGS.learning_rate) elif FLAGS.optimizer == 'nAdam': optimizer = tf.keras.optimizers.Nadam(lr=FLAGS.learning_rate) elif FLAGS.optimizer == 'Adagrad': optimizer = tf.keras.optimizers.Adagrad(lr=FLAGS.learning_rate) elif FLAGS.optimizer == 'RMSprop': optimizer = tf.keras.optimizers.RMSprop(lr=FLAGS.learning_rate, rho=0.9) loss = [ YoloLoss(anchors[mask], classes=FLAGS.num_classes) for mask in anchor_masks ] if FLAGS.mode == 'eager_tf': # Eager mode is great for debugging # Non eager graph mode is recommended for real training avg_loss = tf.keras.metrics.Mean('loss', 
        avg_val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)

        for epoch in range(1, FLAGS.epochs + 1):
            for batch, (images, labels) in enumerate(train_dataset):
                with tf.GradientTape() as tape:
                    outputs = model(images, training=True)
                    regularization_loss = tf.reduce_sum(model.losses)
                    pred_loss = []
                    for output, label, loss_fn in zip(outputs, labels, loss):
                        pred_loss.append(loss_fn(label, output))
                    total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                grads = tape.gradient(total_loss, model.trainable_variables)
                optimizer.apply_gradients(
                    zip(grads, model.trainable_variables))

                logging.info("{}_train_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_loss.update_state(total_loss)

            for batch, (images, labels) in enumerate(val_dataset):
                outputs = model(images)
                regularization_loss = tf.reduce_sum(model.losses)
                pred_loss = []
                for output, label, loss_fn in zip(outputs, labels, loss):
                    pred_loss.append(loss_fn(label, output))
                total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                logging.info("{}_val_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_val_loss.update_state(total_loss)

            logging.info("{}, train: {}, val: {}".format(
                epoch,
                avg_loss.result().numpy(),
                avg_val_loss.result().numpy()))

            avg_loss.reset_states()
            avg_val_loss.reset_states()
            model.save_weights('checkpoints/yolov3_train_{}.tf'.format(epoch))
    else:
        model.compile(optimizer=optimizer, loss=loss,
                      run_eagerly=(FLAGS.mode == 'eager_fit'))

        if tpu:
            # on TPU, save HDF5 weights; the TF checkpoint format
            # would require a GCS path
            callbacks = [
                ReduceLROnPlateau(verbose=1),
                EarlyStopping(patience=3, verbose=1),
                ModelCheckpoint('yolov3_train_{epoch}.h5',
                                save_weights_only=True, verbose=1,
                                period=FLAGS.period)
            ]
        else:
            callbacks = [
                ReduceLROnPlateau(verbose=1),
                ModelCheckpoint('./checkpoints/yolov3_train_{epoch}.tf',
                                verbose=1, save_weights_only=True,
                                period=FLAGS.period),
                TensorBoard(log_dir='logs')
            ]

        history = model.fit(train_dataset,
                            epochs=FLAGS.epochs,
                            callbacks=callbacks,
                            validation_data=val_dataset)
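One caveat on the checkpoint callbacks above: the `period` argument to `ModelCheckpoint` was deprecated in later tf.keras releases in favor of `save_freq`, which counts batches rather than epochs. A rough equivalent, where `steps_per_epoch` is a hypothetical value you would derive from your dataset and batch size (and `FLAGS` is the script's flag object):

from tensorflow.keras.callbacks import ModelCheckpoint

steps_per_epoch = 1000  # hypothetical; use dataset size // FLAGS.batch_size
checkpoint_cb = ModelCheckpoint(
    'yolov3_train_{epoch}.h5',
    save_weights_only=True,
    verbose=1,
    # save_freq is measured in batches, so convert epochs to batches
    save_freq=FLAGS.period * steps_per_epoch)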
def main(_argv):
    # Change flag values
    if FLAGS.height is None:
        FLAGS.height = FLAGS.size
    if FLAGS.width is None:
        FLAGS.width = FLAGS.size
    size = (FLAGS.height, FLAGS.width)

    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    for physical_device in physical_devices:
        tf.config.experimental.set_memory_growth(physical_device, True)

    if FLAGS.tiny:
        model = YoloV3Tiny(size, training=True, classes=FLAGS.num_classes,
                           recurrent=FLAGS.recurrent)
        anchors = yolo_tiny_anchors
        anchor_masks = yolo_tiny_anchor_masks
    else:
        model = YoloV3(size, training=True, classes=FLAGS.num_classes,
                       recurrent=FLAGS.recurrent)
        anchors = yolo_anchors
        anchor_masks = yolo_anchor_masks

    if FLAGS.dataset:
        train_dataset = dataset.load_tfrecord_dataset(
            FLAGS.dataset, FLAGS.classes, size)
    else:
        train_dataset = dataset.load_fake_dataset()
    train_dataset = train_dataset.shuffle(buffer_size=8)
    train_dataset = train_dataset.batch(FLAGS.batch_size)
    train_dataset = train_dataset.map(lambda x, y: (
        dataset.transform_images(x, size),
        dataset.transform_targets(y, anchors, anchor_masks, size)))
    if FLAGS.recurrent:
        train_dataset = train_dataset.map(
            lambda x, y: (dataset.get_recurrect_inputs(
                x, y, anchors, anchor_masks, FLAGS.num_classes), y))
    train_dataset = train_dataset.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)

    if FLAGS.val_dataset:
        val_dataset = dataset.load_tfrecord_dataset(
            FLAGS.val_dataset, FLAGS.classes, size)
    else:
        val_dataset = dataset.load_fake_dataset()
    val_dataset = val_dataset.batch(FLAGS.batch_size)
    val_dataset = val_dataset.map(lambda x, y: (
        dataset.transform_images(x, size),
        dataset.transform_targets(y, anchors, anchor_masks, size)))
    if FLAGS.recurrent:
        val_dataset = val_dataset.map(
            lambda x, y: (dataset.get_recurrect_inputs(
                x, y, anchors, anchor_masks, FLAGS.num_classes), y))

    # Configure the model for transfer learning
    if FLAGS.transfer != 'none':
        # if we need all the weights, there is no need to create another model
        if FLAGS.transfer == 'all':
            model.load_weights(FLAGS.weights)
        # otherwise we need only some of the weights: create an appropriate
        # model_pretrained, load all its weights, and copy the ones we need
        else:
            if FLAGS.tiny:
                model_pretrained = YoloV3Tiny(
                    size, training=True,
                    classes=FLAGS.weights_num_classes or FLAGS.num_classes,
                    recurrent=FLAGS.recurrent)
            else:
                model_pretrained = YoloV3(
                    size, training=True,
                    classes=FLAGS.weights_num_classes or FLAGS.num_classes,
                    recurrent=FLAGS.recurrent)
            # load pretrained weights
            model_pretrained.load_weights(FLAGS.weights)

            # transfer darknet
            darknet = model.get_layer('yolo_darknet')
            darknet.set_weights(
                model_pretrained.get_layer('yolo_darknet').get_weights())

            # transfer 'yolo_conv_i' layer weights
            if FLAGS.transfer in ['yolo_conv', 'yolo_output_conv',
                                  'yolo_output']:
                for l in model.layers:
                    if l.name.startswith('yolo_conv'):
                        model.get_layer(l.name).set_weights(
                            model_pretrained.get_layer(l.name).get_weights())

            # transfer the first conv2d layer of each 'yolo_output_i'
            if FLAGS.transfer == 'yolo_output_conv':
                for l in model.layers:
                    if l.name.startswith('yolo_output'):
                        # get and set the weights of the appropriate sub-layer
                        model.get_layer(l.name).layers[1].set_weights(
                            model_pretrained.get_layer(
                                l.name).layers[1].get_weights())
                        # should I freeze batch_norm as well?
            # transfer 'yolo_output_i' layer weights
            if FLAGS.transfer == 'yolo_output':
                for l in model.layers:
                    if l.name.startswith('yolo_output'):
                        model.get_layer(l.name).set_weights(
                            model_pretrained.get_layer(l.name).get_weights())
    # no transfer learning
    else:
        pass

    # freeze layers, if requested
    if FLAGS.freeze != 'none':
        if FLAGS.freeze == 'all':
            freeze_all(model)
        if FLAGS.freeze in ['yolo_darknet', 'yolo_conv', 'yolo_output_conv',
                            'yolo_output']:
            darknet = model.get_layer('yolo_darknet')
            freeze_all(darknet)
        if FLAGS.freeze in ['yolo_conv', 'yolo_output_conv', 'yolo_output']:
            for l in model.layers:
                if l.name.startswith('yolo_conv'):
                    freeze_all(l)
        if FLAGS.freeze == 'yolo_output_conv':
            if FLAGS.tiny:
                # freeze the first conv2d of both tiny output blocks
                freeze_all(model.layers[4].layers[1])
                freeze_all(model.layers[5].layers[1])
            else:
                # freeze the first conv2d of all three output blocks
                freeze_all(model.layers[5].layers[1])
                freeze_all(model.layers[6].layers[1])
                freeze_all(model.layers[7].layers[1])
        if FLAGS.freeze == 'yolo_output':
            for l in model.layers:
                if l.name.startswith('yolo_output'):
                    freeze_all(l)
    # freeze nothing
    else:
        pass

    optimizer = tf.keras.optimizers.Adam(lr=FLAGS.learning_rate)
    loss = [
        YoloLoss(anchors[mask], classes=FLAGS.num_classes)
        for mask in anchor_masks
    ]

    if FLAGS.mode == 'eager_tf':
        # Eager mode is great for debugging
        # Non-eager graph mode is recommended for real training
        avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
        avg_val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)

        for epoch in range(1, FLAGS.epochs + 1):
            for batch, (images, labels) in enumerate(train_dataset):
                with tf.GradientTape() as tape:
                    outputs = model(images, training=True)
                    regularization_loss = tf.reduce_sum(model.losses)
                    pred_loss = []
                    for output, label, loss_fn in zip(outputs, labels, loss):
                        pred_loss.append(loss_fn(label, output))
                    total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                grads = tape.gradient(total_loss, model.trainable_variables)
                optimizer.apply_gradients(
                    zip(grads, model.trainable_variables))

                logging.info("{}_train_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_loss.update_state(total_loss)

            for batch, (images, labels) in enumerate(val_dataset):
                outputs = model(images)
                regularization_loss = tf.reduce_sum(model.losses)
                pred_loss = []
                for output, label, loss_fn in zip(outputs, labels, loss):
                    pred_loss.append(loss_fn(label, output))
                total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                logging.info("{}_val_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_val_loss.update_state(total_loss)

            logging.info("{}, train: {}, val: {}".format(
                epoch,
                avg_loss.result().numpy(),
                avg_val_loss.result().numpy()))

            avg_loss.reset_states()
            avg_val_loss.reset_states()
            model.save_weights('checkpoints/yolov3_train_{}.tf'.format(epoch))
    else:
        model.compile(optimizer=optimizer, loss=loss,
                      run_eagerly=(FLAGS.mode == 'eager_fit'))

        callbacks = [
            ReduceLROnPlateau(verbose=1),
            EarlyStopping(patience=3, verbose=1),
            ModelCheckpoint('checkpoints/yolov3_train_{epoch}.tf',
                            verbose=1, save_weights_only=True),
            TensorBoard(log_dir='logs')
        ]

        history = model.fit(train_dataset,
                            epochs=FLAGS.epochs,
                            callbacks=callbacks,
                            validation_data=val_dataset)
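The freeze logic above delegates to `freeze_all`, imported from the repository's utils module and not shown in this file. A plausible sketch, assuming it does nothing more than recursively mark a layer, or a nested sub-model, as non-trainable:

import tensorflow as tf

def freeze_all(model, frozen=True):
    # mark this layer/model untrainable, then recurse into nested sub-models
    model.trainable = not frozen
    if isinstance(model, tf.keras.Model):
        for layer in model.layers:
            freeze_all(layer, frozen)

If that assumption holds, a BatchNormalization layer frozen this way also runs in inference mode during training in TF 2.x, which is usually the desired behavior when fine-tuning on top of a frozen backbone. A quick sanity check after the freeze block is to compare `len(model.trainable_variables)` before and after calling `freeze_all`.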
def main(_argv):
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    if len(physical_devices) > 0:
        # only allocate GPU memory as it is actually needed
        tf.config.experimental.set_memory_growth(physical_devices[0], True)

    # decide whether to train the tiny version of YOLO or the full version
    if FLAGS.tiny:
        model = YoloV3Tiny(FLAGS.size, training=True,
                           classes=FLAGS.num_classes)
        anchors = yolo_tiny_anchors
        anchor_masks = yolo_tiny_anchor_masks
    else:
        model = YoloV3(FLAGS.size, training=True, classes=FLAGS.num_classes)
        anchors = yolo_anchors
        anchor_masks = yolo_anchor_masks

    # if no dataset is specified, load a single image as a fake_dataset
    train_dataset = dataset.load_fake_dataset()
    # check whether a dataset path was given
    if FLAGS.dataset:
        # load train_dataset: (x_train, y_train) from the TFRecord file
        train_dataset = dataset.load_tfrecord_dataset(
            FLAGS.dataset, FLAGS.classes, FLAGS.size)

    # build batched training data
    # shuffle the sample order
    train_dataset = train_dataset.shuffle(buffer_size=512)
    train_dataset = train_dataset.batch(FLAGS.batch_size)
    # y.shape: train_dataset.as_numpy_iterator().next()[1].shape
    # => (batch_size, yolo_max_boxes, 5), 5 => (xmin, ymin, xmax, ymax, class_label)
    train_dataset = train_dataset.map(lambda x, y: (
        # normalize image data to [0, 1]
        dataset.transform_images(x, FLAGS.size),
        # use the anchor priors to decide which feature map
        # (13*13, 26*26, 52*52) each bbox belongs to, and compute the
        # bbox center position on that feature map
        dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size)))
    # prefetch data to improve latency and throughput
    # tf.data.experimental.AUTOTUNE: dynamically set the number of
    # parallel calls based on the available CPU
    train_dataset = train_dataset.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)

    # load a fake validation set to avoid errors when no path is given
    val_dataset = dataset.load_fake_dataset()
    # load the validation set
    if FLAGS.val_dataset:
        val_dataset = dataset.load_tfrecord_dataset(
            FLAGS.val_dataset, FLAGS.classes, FLAGS.size)
    val_dataset = val_dataset.batch(FLAGS.batch_size)
    val_dataset = val_dataset.map(lambda x, y: (
        dataset.transform_images(x, FLAGS.size),
        dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size)))

    # Configure the model for transfer learning
    # training mode selection:
    # 'none' initializes the weights randomly and trains the whole
    # network from scratch
    if FLAGS.transfer == 'none':
        pass  # Nothing to do
    # the two transfer-training modes
    elif FLAGS.transfer in ['darknet', 'no_output']:
        # Darknet transfer is a special case that works with an
        # incompatible number of classes: reset the top layers
        if FLAGS.tiny:
            model_pretrained = YoloV3Tiny(
                FLAGS.size, training=True,
                classes=FLAGS.weights_num_classes or FLAGS.num_classes)
        else:
            # model network structure
            model_pretrained = YoloV3(
                FLAGS.size, training=True,
                classes=FLAGS.weights_num_classes or FLAGS.num_classes)
        # load the pretrained weights
        model_pretrained.load_weights(FLAGS.weights)

        # set and freeze the darknet weights: the backbone does not train,
        # and the remaining parameters are randomly initialized
        if FLAGS.transfer == 'darknet':
            model.get_layer('yolo_darknet').set_weights(
                model_pretrained.get_layer('yolo_darknet').get_weights())
            freeze_all(model.get_layer('yolo_darknet'))
        # set and freeze the weights of everything except the YOLO output
        # layers, i.e. train only the output layers, whose parameters are
        # randomly initialized
        elif FLAGS.transfer == 'no_output':
            for l in model.layers:
                if not l.name.startswith('yolo_output'):
                    l.set_weights(
                        model_pretrained.get_layer(l.name).get_weights())
                    freeze_all(l)
    # the fine_tune and frozen transfer modes require the number of training
    # classes to match the pretrained weights (80 classes)
    else:
        # All other transfer modes require matching classes
        # load all pretrained weight parameters
        model.load_weights(FLAGS.weights)
        # freeze the darknet (backbone) weights; the remaining parameters
        # continue training from the pretrained weights
        if FLAGS.transfer == 'fine_tune':
            # freeze darknet and fine-tune the other layers
            darknet = model.get_layer('yolo_darknet')
            freeze_all(darknet)
        # freeze all parameters; training then has no effect
        elif FLAGS.transfer == 'frozen':
            # freeze everything
            freeze_all(model)

    # define the optimizer: Adam
    optimizer = tf.keras.optimizers.Adam(lr=FLAGS.learning_rate)
    loss = [
        YoloLoss(anchors[mask], classes=FLAGS.num_classes)
        for mask in anchor_masks
    ]

    # debug mode (slow): in eager mode, ops run as soon as they are called
    if FLAGS.mode == 'eager_tf':
        # Eager mode is great for debugging
        # Non-eager graph mode is recommended for real training
        # average loss on the training set / validation set
        avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
        avg_val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)

        # iterate over the epochs
        for epoch in range(1, FLAGS.epochs + 1):
            for batch, (images, labels) in enumerate(train_dataset):
                # gradient tape: computes variable gradients automatically
                with tf.GradientTape() as tape:
                    # model(): preferred in eager mode, it runs directly
                    # without compiling and is fast; model.predict() compiles
                    # a graph on its first run
                    outputs = model(images, training=True)
                    # sum of the elements across all tensor dimensions
                    regularization_loss = tf.reduce_sum(model.losses)
                    pred_loss = []
                    for output, label, loss_fn in zip(outputs, labels, loss):
                        pred_loss.append(loss_fn(label, output))
                    total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                # gradients
                grads = tape.gradient(total_loss, model.trainable_variables)
                # apply the optimizer step
                optimizer.apply_gradients(
                    zip(grads, model.trainable_variables))

                # write to the log file
                logging.info("{}_train_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                # update the running mean loss
                avg_loss.update_state(total_loss)

            # evaluate on the validation set
            for batch, (images, labels) in enumerate(val_dataset):
                outputs = model(images)
                regularization_loss = tf.reduce_sum(model.losses)
                pred_loss = []
                for output, label, loss_fn in zip(outputs, labels, loss):
                    pred_loss.append(loss_fn(label, output))
                total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                logging.info("{}_val_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_val_loss.update_state(total_loss)

            # .result() returns the accumulated value
            logging.info("{}, train: {}, val: {}".format(
                epoch,
                avg_loss.result().numpy(),
                avg_val_loss.result().numpy()))

            # reset_states() clears the accumulated value
            avg_loss.reset_states()
            avg_val_loss.reset_states()
            # save the model weights once per epoch
            model.save_weights('checkpoints/yolov3_train_{}.tf'.format(epoch))
    # training mode
    else:
        # compile the model; note that 'accuracy' is not a meaningful
        # metric for raw YOLO grid outputs
        model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'],
                      run_eagerly=(FLAGS.mode == 'eager_fit'))

        # callbacks
        callbacks = [
            # learning-rate decay when the monitored loss plateaus
            ReduceLROnPlateau(verbose=1),
            # stop training when the monitored loss stops improving
            EarlyStopping(patience=3, verbose=1),
            # save the model weights
            ModelCheckpoint('checkpoints/yolov3_train_{epoch}.tf',
                            verbose=1, save_weights_only=True),
            # visualize training results
            TensorBoard(log_dir='logs', write_images=True,
                        update_freq='batch')
        ]

        # run the training loop
        history = model.fit(train_dataset,
                            epochs=FLAGS.epochs,
                            callbacks=callbacks,
                            validation_data=val_dataset)
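As the comments above note, `transform_targets` routes every ground-truth box to one of the three output scales. A quick, illustrative way to inspect the transformed label shapes produced by the pipeline above (the grid sizes shown assume `size=416` and a batch size of 8):

x, y = next(iter(train_dataset))
for scale, y_s in enumerate(y):
    print('scale', scale, 'target shape', y_s.shape)
# expected output for size=416, batch_size=8:
#   scale 0 target shape (8, 13, 13, 3, 6)   # 32x stride, large anchors
#   scale 1 target shape (8, 26, 26, 3, 6)   # 16x stride, medium anchors
#   scale 2 target shape (8, 52, 52, 3, 6)   # 8x stride, small anchors
# last dimension: (x1, y1, x2, y2, obj, class)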