def __init__(self, obj_threshold, nms_threshold, classes_file, anchors_file):
    """
    Introduction
    ------------
        Initialisation: store the thresholds and file paths, load the class
        names and anchors, assign one RGB colour per class and construct the
        yolo model object.
    Parameters
    ----------
        obj_threshold: threshold above which a detection counts as an object
        nms_threshold: NMS threshold
        classes_file: stored as self.classes_path and passed to yolo(...)
        anchors_file: stored as self.anchors_path and passed to yolo(...)
    """
    self.obj_threshold = obj_threshold
    self.nms_threshold = nms_threshold
    self.classes_path = classes_file
    self.anchors_path = anchors_file
    self.class_names = self._get_class()
    self.anchors = self._get_anchors()

    # Spread the hues evenly over the classes, then convert to 0-255 RGB.
    num_classes = len(self.class_names)
    hsv_tuples = [(x / num_classes, 1., 1.) for x in range(num_classes)]
    rgb_colors = [colorsys.hsv_to_rgb(*hsv) for hsv in hsv_tuples]
    self.colors = [(int(r * 255), int(g * 255), int(b * 255))
                   for r, g, b in rgb_colors]

    # A private generator seeded with 10101 yields the same fixed shuffle as
    # seeding the module-level RNG, without reseeding the process-wide
    # generator as a side effect of constructing this object.
    random.Random(10101).shuffle(self.colors)

    self.model = yolo(config.norm_epsilon,
                      config.norm_decay,
                      self.anchors_path,
                      self.classes_path,
                      pre_train=False)
def predict(self, inputs, image_shape):
    """
    Introduction
    ------------
        Build the prediction model
    Parameters
    ----------
        inputs: the preprocessed input image
        image_shape: original size of the image
    Returns
    -------
        boxes: coordinates of the object boxes
        scores: probability values of the objects
        classes: classes of the objects
    """
    network = yolo(
        config.norm_epsilon,
        config.norm_decay,
        self.anchors_path,
        self.classes_path,
        pre_train=False,
    )
    anchors_per_scale = config.num_anchors // 3
    raw_output = network.yolo_inference(
        inputs, anchors_per_scale, config.num_classes, training=False)
    # Unpack explicitly so a result that is not a 3-tuple fails here.
    boxes, scores, classes = self.eval(raw_output, image_shape, max_boxes=20)
    return boxes, scores, classes
def predict(self, inputs, image_shape):
    """
    Introduction
    ------------
        Detect the boxes and their classes
    Parameters
    ----------
        inputs: the image, passed to yolo_inference
        image_shape: forwarded unchanged to self.eval
    Returns
    -------
        boxes: all of the boxes
        scores: score for each box
        classes: class for each box
    """
    # The stray token that preceded this assignment made the function a
    # syntax error; it has been removed.
    model = yolo(config.norm_epsilon,
                 config.norm_decay,
                 self.anchors_path,
                 self.classes_path,
                 pre_train=False)
    output = model.yolo_inference(inputs,
                                  config.num_anchors // 3,
                                  config.num_classes,
                                  training=False)
    boxes, scores, classes = self.eval(output, image_shape, max_boxes=20)
    return boxes, scores, classes
def predict(self, inputs, image_shape):
    """
    Introduction
    ------------
        Construct a yolo model, run its inference on `inputs` with
        training=False and post-process the result with self.eval
        (max_boxes=20).
    Parameters
    ----------
        inputs: passed as the first argument of yolo_inference
        image_shape: forwarded unchanged to self.eval
    Returns
    -------
        The (boxes, scores, classes) triple produced by self.eval
    """
    yolo_net = yolo(config.norm_epsilon, config.norm_decay,
                    self.anchors_path, self.classes_path,
                    pre_train=False)
    per_scale_anchors = config.num_anchors // 3
    net_out = yolo_net.yolo_inference(inputs, per_scale_anchors,
                                      config.num_classes, training=False)
    detections = self.eval(net_out, image_shape, max_boxes=20)
    boxes, scores, classes = detections
    return boxes, scores, classes
def train():
    """
    Introduction
    ------------
        Train the yolo3 model (optionally starting from the darknet weights).

        Builds the dataset iterator, the network, the loss (plus the
        regularization loss) and an Adam optimizer with an exponentially
        decaying learning rate, then runs config.Epoch epochs. Summaries are
        written to config.log_dir and a checkpoint is saved to
        config.model_dir after every epoch. If config.model_dir already holds
        a checkpoint it is restored instead of running the initializer.
    """
    train_reader = Reader('train',
                          config.data_dir,
                          config.anchors_path,
                          config.num_classes,
                          input_shape=config.input_shape,
                          max_boxes=config.max_boxes)
    train_data = train_reader.build_dataset(config.train_batch_size)
    is_training = tf.placeholder(tf.bool, shape=[])
    iterator = train_data.make_one_shot_iterator()
    images, bbox, bbox_true_13, bbox_true_26, bbox_true_52 = iterator.get_next()

    # Pin the static shapes of the tensors coming out of the iterator.
    images.set_shape([None, config.input_shape, config.input_shape, 3])
    bbox.set_shape([None, config.max_boxes, 5])
    grid_shapes = [
        config.input_shape // 32,
        config.input_shape // 16,
        config.input_shape // 8,
    ]
    for target, grid in zip((bbox_true_13, bbox_true_26, bbox_true_52),
                            grid_shapes):
        target.set_shape([None, grid, grid, 3, 5 + config.num_classes])

    model = yolo(config.norm_epsilon, config.norm_decay, config.anchors_path,
                 config.classes_path, config.pre_train)
    # NOTE(review): only the first target tensor is handed to the loss in
    # this variant; confirm that this is intended.
    bbox_true = [bbox_true_13]
    # Floor division keeps the anchor count an int under Python 3.
    output = model.yolo_inference(images, config.num_anchors // 3,
                                  config.num_classes, is_training)
    loss, loss_xy, loss_wh, loss_conf, loss_class = model.yolo_loss(
        output, bbox_true, model.anchors, config.num_classes,
        config.ignore_thresh)
    l2_loss = tf.losses.get_regularization_loss()
    loss += l2_loss

    tf.summary.scalar('loss', loss)
    tf.summary.scalar('loss_xy', loss_xy)
    tf.summary.scalar('loss_wh', loss_wh)
    tf.summary.scalar('loss_conf', loss_conf)

    global_step = tf.Variable(0, trainable=False)
    lr = tf.train.exponential_decay(config.learning_rate,
                                    global_step,
                                    decay_steps=config.decay_step,
                                    decay_rate=0.96)
    tf.summary.scalar('learning rate', lr)
    merged_summary = tf.summary.merge_all()
    optimizer = tf.train.AdamOptimizer(learning_rate=lr)

    # When pre-trained weights are used, freeze the darknet53 variables by
    # optimizing only the variables under the 'yolo' scope.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        if config.pre_train:
            train_var = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          scope='yolo')
            train_op = optimizer.minimize(loss=loss,
                                          global_step=global_step,
                                          var_list=train_var)
        else:
            train_op = optimizer.minimize(loss=loss, global_step=global_step)

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    with tf.Session(config=tf.ConfigProto(log_device_placement=False)) as sess:
        ckpt = tf.train.get_checkpoint_state(config.model_dir)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('restore model', ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(init)
        if config.pre_train is True:
            load_ops = load_weights(tf.global_variables(scope='darknet53'),
                                    config.darknet53_weights_path)
            sess.run(load_ops)

        summary_writer = tf.summary.FileWriter(config.log_dir, sess.graph)
        steps_per_epoch = int(config.train_num / config.train_batch_size)
        format_str = (
            'Epoch {} step {}, avg los: {:.3f}, train loss = {:.3f}, gs: {}, xy:{:.3f}, wh:{:.3f}, conf:{:.3f}, class:{:.3f} ( {:.3f} examples/sec; {:.3f} '
            'sec/batch)')
        loss_value = 0
        # Steps completed in this process. The average is taken over these
        # rather than over the global step, which may be 0 on the first step
        # or already large after restoring a checkpoint.
        steps_run = 0
        for epoch in range(config.Epoch):
            for step in range(steps_per_epoch):
                # Best effort: a failing step is reported and skipped.
                try:
                    start_time = time.time()
                    (summary, train_loss, train_loss_xy, train_loss_wh,
                     train_loss_conf, train_loss_class, global_step_value,
                     _) = sess.run([
                         merged_summary, loss, loss_xy, loss_wh, loss_conf,
                         loss_class, global_step, train_op
                     ], feed_dict={is_training: True})
                    loss_value += train_loss
                    steps_run += 1
                    duration = time.time() - start_time
                    # Throughput is examples divided by elapsed time.
                    examples_per_sec = (config.train_batch_size /
                                        max(duration, 1e-9))
                    print(
                        format_str.format(epoch, step,
                                          loss_value / steps_run, train_loss,
                                          global_step_value, train_loss_xy,
                                          train_loss_wh, train_loss_conf,
                                          train_loss_class, examples_per_sec,
                                          duration))
                    # Index both summaries by the global step; the in-epoch
                    # step repeats every epoch.
                    summary_writer.add_summary(
                        summary=tf.Summary(value=[
                            tf.Summary.Value(tag="train loss",
                                             simple_value=train_loss)
                        ]),
                        global_step=global_step_value)
                    summary_writer.add_summary(summary, global_step_value)
                    # Flush once every 100 steps.
                    if step % 100 == 0:
                        summary_writer.flush()
                except Exception as ex:
                    print(ex)
            # Save the model after every epoch. The variable is passed (not
            # the last fetched value) so saving does not depend on a step
            # having succeeded.
            summary_writer.flush()
            checkpoint_path = os.path.join(config.model_dir, 'model.ckpt')
            saver.save(sess, checkpoint_path, global_step=global_step)
            print('saved')
def train():
    """
    Introduction
    ------------
        Train the model on several GPUs.

        The batch from the dataset iterator is split into gpu_num parts; one
        tower per GPU computes the loss and gradients on its part with shared
        variables, and the averaged gradients are applied by a single Adam
        optimizer. Summaries go to config.log_dir and a checkpoint is saved
        to config.model_dir every second epoch.
    """
    with tf.Graph().as_default(), tf.device("/cpu:0"):
        train_reader = Reader('train',
                              config.data_dir,
                              config.anchors_path,
                              config.num_classes,
                              input_shape=config.input_shape,
                              max_boxes=config.max_boxes,
                              shuffle_size=config.shuffle_size)
        train_data = train_reader.build_dataset(config.train_batch_size)
        is_training = tf.placeholder(tf.bool, shape=[])
        iterator = train_data.make_one_shot_iterator()
        images, bbox, bbox_true_13, bbox_true_26, bbox_true_52 = (
            iterator.get_next())

        # Pin the static shapes of the tensors coming out of the iterator.
        images.set_shape([None, config.input_shape, config.input_shape, 3])
        bbox.set_shape([None, config.max_boxes, 5])
        grid_shapes = [
            config.input_shape // 32,
            config.input_shape // 16,
            config.input_shape // 8,
        ]
        for target, grid in zip((bbox_true_13, bbox_true_26, bbox_true_52),
                                grid_shapes):
            target.set_shape([None, grid, grid, 3, 5 + config.num_classes])

        # Split the batch into one slice per GPU.
        images_list = tf.split(images, gpu_num)
        bbox_true_13_list = tf.split(bbox_true_13, gpu_num)
        bbox_true_26_list = tf.split(bbox_true_26, gpu_num)
        bbox_true_52_list = tf.split(bbox_true_52, gpu_num)

        global_step = tf.Variable(0, trainable=False)
        # decay_steps is roughly one epoch, i.e. total_samples / batch_size.
        lr = tf.train.exponential_decay(config.learning_rate,
                                        global_step,
                                        decay_steps=1000,
                                        decay_rate=0.8)
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)

        tower_grads = []
        tower_loss = []
        with tf.variable_scope(tf.get_variable_scope()):
            for gpu_id in range(gpu_num):
                with tf.device('/gpu:%d' % gpu_id):
                    with tf.name_scope('%s_%d' % ('tower', gpu_id)):
                        model = yolo(config.norm_epsilon, config.norm_decay,
                                     config.anchors_path, config.classes_path,
                                     config.pre_train)
                        bbox_true = [
                            bbox_true_13_list[gpu_id],
                            bbox_true_26_list[gpu_id],
                            bbox_true_52_list[gpu_id],
                        ]
                        # Floor division keeps the anchor count an int.
                        output = model.yolo_inference(images_list[gpu_id],
                                                      config.num_anchors // 3,
                                                      config.num_classes,
                                                      is_training)
                        loss = model.yolo_loss(output, bbox_true,
                                               model.anchors,
                                               config.num_classes,
                                               config.ignore_thresh)
                        l2_loss = tf.losses.get_regularization_loss()
                        loss += l2_loss
                        # Later towers reuse the variables of the first one.
                        tf.get_variable_scope().reuse_variables()
                        grads = optimizer.compute_gradients(loss)
                        tower_grads.append(grads)
                        tower_loss.append(loss)

        loss = average_loss(tower_loss)
        grads = average_gradients(tower_grads)
        tf.summary.scalar('loss', loss)
        merged_summary = tf.summary.merge_all()

        # NOTE(review): both branches of the former `if config.pre_train`
        # applied the same averaged gradients, so the darknet53 variables are
        # NOT frozen in this multi-GPU variant; confirm whether they should be.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.apply_gradients(grads,
                                                 global_step=global_step)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        session_config = tf.ConfigProto(log_device_placement=False,
                                        allow_soft_placement=True)
        with tf.Session(config=session_config) as sess:
            ckpt = tf.train.get_checkpoint_state(config.model_dir)
            if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
                print('restore model', ckpt.model_checkpoint_path)
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                sess.run(init)
            if config.pre_train is True:
                load_ops = load_weights(
                    tf.global_variables(scope='darknet53'),
                    config.darknet53_weights_path)
                sess.run(load_ops)
                print("pretrained the model")

            summary_writer = tf.summary.FileWriter(config.log_dir, sess.graph)
            steps_per_epoch = int(config.train_num / config.train_batch_size)
            format_str = (
                'Epoch {} step {} lr = {}, loss_step = {} train loss = {} ( {} examples/sec; {} '
                'sec/batch)')
            loss_value = 0
            # Steps completed in this process; the running average divides by
            # this rather than by the fetched global step, which may be 0 or
            # come from a restored checkpoint.
            steps_run = 0
            for epoch in range(config.Epoch):
                for step in range(steps_per_epoch):
                    start_time = time.time()
                    train_loss, summary, global_step_value, lr_step, _ = (
                        sess.run(
                            [loss, merged_summary, global_step, lr, train_op],
                            {is_training: True}))
                    loss_value += train_loss
                    steps_run += 1
                    duration = time.time() - start_time
                    # Throughput is examples divided by elapsed time.
                    examples_per_sec = (config.train_batch_size /
                                        max(duration, 1e-9))
                    print(
                        format_str.format(epoch, step, lr_step, train_loss,
                                          loss_value / steps_run,
                                          examples_per_sec, duration))
                    overall_step = step + epoch * steps_per_epoch
                    summary_writer.add_summary(
                        summary=tf.Summary(value=[
                            tf.Summary.Value(tag="train loss",
                                             simple_value=train_loss)
                        ]),
                        global_step=overall_step)
                    summary_writer.add_summary(summary,
                                               global_step=overall_step)
                    summary_writer.flush()
                # Save the model every 2 epochs.
                if epoch % 2 == 0:
                    checkpoint_path = os.path.join(config.model_dir,
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=global_step)
def dstributed_train(ps_hosts, worker_hosts, job_name, task_index):
    """
    Introduction
    ------------
        Distributed training
    Parameters
    ----------
        ps_hosts: comma-separated hosts of the parameter servers
        worker_hosts: comma-separated hosts of the workers
        job_name: decides whether this process acts as 'ps' or as a worker
        task_index: index of the task
    """
    ps_host_list = ps_hosts.split(',')
    worker_host_list = worker_hosts.split(',')
    cluster = tf.train.ClusterSpec({
        "ps": ps_host_list,
        "worker": worker_host_list
    })
    server = tf.train.Server(cluster, job_name=job_name, task_index=task_index)
    if job_name == 'ps':
        # A parameter server only serves requests.
        server.join()
        return

    with tf.device(
            tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % task_index,
                cluster=cluster)):
        train_data = Reader('train',
                            config.data_dir,
                            config.anchors_path,
                            config.num_classes,
                            input_shape=config.input_shape,
                            max_boxes=config.max_boxes,
                            jitter=config.jitter,
                            hue=config.hue,
                            sat=config.sat,
                            cont=config.cont,
                            bri=config.bri)
        val_data = Reader('val',
                          config.data_dir,
                          config.anchors_path,
                          config.num_classes,
                          input_shape=config.input_shape,
                          max_boxes=config.max_boxes)
        train_batch = list(train_data.provide(config.train_batch_size))
        val_batch = list(val_data.provide(config.val_batch_size))
        # The code below expects exactly four tensors from each reader.
        images_train, bbox_true_13_train, bbox_true_26_train, bbox_true_52_train = train_batch
        images_val, bbox_true_13_val, bbox_true_26_val, bbox_true_52_val = val_batch

        model = yolo(config.norm_epsilon, config.norm_decay,
                     config.anchors_path, config.classes_path,
                     config.pre_train)
        is_training = tf.placeholder(dtype=tf.bool, shape=[])
        images = tf.placeholder(shape=[None, 416, 416, 3], dtype=tf.float32)
        bbox_true_13 = tf.placeholder(shape=[None, 13, 13, 3, 85],
                                      dtype=tf.float32)
        bbox_true_26 = tf.placeholder(shape=[None, 26, 26, 3, 85],
                                      dtype=tf.float32)
        bbox_true_52 = tf.placeholder(shape=[None, 52, 52, 3, 85],
                                      dtype=tf.float32)
        bbox_true = [bbox_true_13, bbox_true_26, bbox_true_52]
        placeholders = [images, bbox_true_13, bbox_true_26, bbox_true_52]

        # Floor division keeps the anchor count an int under Python 3.
        output = model.yolo_inference(images, config.num_anchors // 3,
                                      config.num_classes, is_training)
        loss = model.yolo_loss(output, bbox_true, model.anchors,
                               config.num_classes, config.ignore_thresh)
        tf.summary.scalar('loss', loss)
        global_step = tf.Variable(0, trainable=False)
        learning_rate = tf.train.exponential_decay(config.learning_rate,
                                                   global_step,
                                                   20000,
                                                   0.1,
                                                   staircase=True)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        # When pre-trained weights are used, freeze the darknet53 variables
        # by optimizing only the variables under the 'yolo' scope.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            if config.pre_train:
                train_var = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope='yolo')
                train_op = optimizer.minimize(loss=loss,
                                              global_step=global_step,
                                              var_list=train_var)
            else:
                train_op = optimizer.minimize(loss=loss,
                                              global_step=global_step)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        # Connect to this task's server; a session without a target is purely
        # local and cannot reach the /job:ps devices chosen by the replica
        # device setter.
        with tf.Session(server.target,
                        config=tf.ConfigProto(
                            log_device_placement=False)) as sess:
            ckpt = tf.train.get_checkpoint_state(config.model_dir)
            if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
                print('restore model', ckpt.model_checkpoint_path)
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                sess.run(init)
            if config.pre_train is True:
                load_ops = model.load_weights(
                    tf.global_variables(scope='darknet53'),
                    config.darknet53_weights_path)
                sess.run(load_ops)
            summary_writer = tf.summary.FileWriter('./logs', sess.graph)
            tf.train.start_queue_runners(sess=sess)

            train_steps = int(config.train_num / config.train_batch_size)
            val_steps = int(config.val_num / config.val_batch_size)
            train_format = ('Epoch {} step {}, train loss = {} ( {} examples/sec; {} '
                            'sec/batch)')
            val_format = ('Epoch {} step {}, val loss = {} ({} examples/sec; {} '
                          'sec/batch)')
            for epoch in range(config.Epoch):
                for step in range(train_steps):
                    start_time = time.time()
                    feed = dict(zip(placeholders, sess.run(train_batch)))
                    feed[is_training] = True
                    train_loss, _ = sess.run([loss, train_op], feed)
                    duration = time.time() - start_time
                    # Throughput is examples divided by elapsed time.
                    examples_per_sec = (config.train_batch_size /
                                        max(duration, 1e-9))
                    print(
                        train_format.format(epoch, step, train_loss,
                                            examples_per_sec, duration))
                    # Use a step that keeps growing across epochs; the
                    # in-epoch step would overwrite earlier points.
                    summary_writer.add_summary(
                        summary=tf.Summary(value=[
                            tf.Summary.Value(tag="train loss",
                                             simple_value=train_loss)
                        ]),
                        global_step=epoch * train_steps + step)
                    summary_writer.flush()
                for step in range(val_steps):
                    start_time = time.time()
                    feed = dict(zip(placeholders, sess.run(val_batch)))
                    feed[is_training] = False
                    val_loss = sess.run(loss, feed)
                    duration = time.time() - start_time
                    examples_per_sec = (config.val_batch_size /
                                        max(duration, 1e-9))
                    print(
                        val_format.format(epoch, step, val_loss,
                                          examples_per_sec, duration))
                    summary_writer.add_summary(
                        summary=tf.Summary(value=[
                            tf.Summary.Value(tag="val loss",
                                             simple_value=val_loss)
                        ]),
                        global_step=epoch * val_steps + step)
                    summary_writer.flush()
                # Save the model every 3 epochs.
                if epoch % 3 == 0:
                    checkpoint_path = os.path.join(config.model_dir,
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=epoch)
def train():
    """
    Introduction
    ------------
        Train the model.

        Batches are pulled from the train/val readers through queue runners
        and fed into placeholders. Each epoch runs the training steps and
        then the validation steps; per-step losses are written to ./logs and
        a checkpoint is saved to config.model_dir every third epoch.
    """
    # Select the index of the GPU to use.
    os.environ["CUDA_VISIBLE_DEVICES"] = config.gpu_index
    train_data = Reader('train',
                        config.data_dir,
                        config.anchors_path,
                        config.num_classes,
                        input_shape=config.input_shape,
                        max_boxes=config.max_boxes,
                        jitter=config.jitter,
                        hue=config.hue,
                        sat=config.sat,
                        cont=config.cont,
                        bri=config.bri)
    val_data = Reader('val',
                      config.data_dir,
                      config.anchors_path,
                      config.num_classes,
                      input_shape=config.input_shape,
                      max_boxes=config.max_boxes)
    train_batch = list(train_data.provide(config.train_batch_size))
    val_batch = list(val_data.provide(config.val_batch_size))
    # The code below expects exactly four tensors from each reader.
    images_train, bbox_true_13_train, bbox_true_26_train, bbox_true_52_train = train_batch
    images_val, bbox_true_13_val, bbox_true_26_val, bbox_true_52_val = val_batch

    model = yolo(config.norm_epsilon, config.norm_decay, config.anchors_path,
                 config.classes_path, config.pre_train)
    is_training = tf.placeholder(dtype=tf.bool, shape=[])
    images = tf.placeholder(shape=[None, 416, 416, 3], dtype=tf.float32)
    bbox_true_13 = tf.placeholder(shape=[None, 13, 13, 3, 85],
                                  dtype=tf.float32)
    bbox_true_26 = tf.placeholder(shape=[None, 26, 26, 3, 85],
                                  dtype=tf.float32)
    bbox_true_52 = tf.placeholder(shape=[None, 52, 52, 3, 85],
                                  dtype=tf.float32)
    bbox_true = [bbox_true_13, bbox_true_26, bbox_true_52]
    placeholders = [images, bbox_true_13, bbox_true_26, bbox_true_52]

    # Floor division keeps the anchor count an int under Python 3.
    output = model.yolo_inference(images, config.num_anchors // 3,
                                  config.num_classes, is_training)
    loss = model.yolo_loss(output, bbox_true, model.anchors,
                           config.num_classes, config.ignore_thresh)
    tf.summary.scalar('loss', loss)
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(config.learning_rate,
                                               global_step, 3000, 0.1)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    # When pre-trained weights are used, freeze the darknet53 variables by
    # optimizing only the variables under the 'yolo' scope.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        if config.pre_train:
            train_var = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          scope='yolo')
            train_op = optimizer.minimize(loss=loss,
                                          global_step=global_step,
                                          var_list=train_var)
        else:
            train_op = optimizer.minimize(loss=loss, global_step=global_step)
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    with tf.Session(config=tf.ConfigProto(log_device_placement=False)) as sess:
        ckpt = tf.train.get_checkpoint_state(config.model_dir)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('restore model', ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(init)
        if config.pre_train is True:
            load_ops = model.load_weights(
                tf.global_variables(scope='darknet53'),
                config.darknet53_weights_path)
            sess.run(load_ops)
        summary_writer = tf.summary.FileWriter('./logs', sess.graph)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        train_steps = int(config.train_num / config.train_batch_size)
        val_steps = int(config.val_num / config.val_batch_size)
        train_format = ('Epoch {} step {}, train loss = {} ( {} examples/sec; {} '
                        'sec/batch)')
        val_format = ('Epoch {} step {}, val loss = {} ({} examples/sec; {} '
                      'sec/batch)')
        try:
            for epoch in range(config.Epoch):
                for step in range(train_steps):
                    start_time = time.time()
                    feed = dict(zip(placeholders, sess.run(train_batch)))
                    feed[is_training] = True
                    train_loss, _ = sess.run([loss, train_op], feed)
                    duration = time.time() - start_time
                    # Throughput is examples divided by elapsed time.
                    examples_per_sec = (config.train_batch_size /
                                        max(duration, 1e-9))
                    print(
                        train_format.format(epoch, step, train_loss,
                                            examples_per_sec, duration))
                    # Use a step that keeps growing across epochs; the
                    # in-epoch step would overwrite earlier points.
                    summary_writer.add_summary(
                        summary=tf.Summary(value=[
                            tf.Summary.Value(tag="train loss",
                                             simple_value=train_loss)
                        ]),
                        global_step=epoch * train_steps + step)
                    summary_writer.flush()
                for step in range(val_steps):
                    start_time = time.time()
                    feed = dict(zip(placeholders, sess.run(val_batch)))
                    feed[is_training] = False
                    val_loss = sess.run(loss, feed)
                    duration = time.time() - start_time
                    examples_per_sec = (config.val_batch_size /
                                        max(duration, 1e-9))
                    print(
                        val_format.format(epoch, step, val_loss,
                                          examples_per_sec, duration))
                    summary_writer.add_summary(
                        summary=tf.Summary(value=[
                            tf.Summary.Value(tag="val loss",
                                             simple_value=val_loss)
                        ]),
                        global_step=epoch * val_steps + step)
                    summary_writer.flush()
                # Save the model every 3 epochs.
                if epoch % 3 == 0:
                    checkpoint_path = os.path.join(config.model_dir,
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=global_step)
        finally:
            # Stop the queue-runner threads even when a step raises.
            coord.request_stop()
            coord.join(threads)
def train():
    """
    Introduction
    ------------
        Train the model from the record file given by config.record_file.

        Builds the dataset iterator, the network, the loss (plus the
        regularization loss) and an Adam optimizer with an exponentially
        decaying learning rate. The mean loss is printed every 20 steps, a
        checkpoint is saved to config.model_dir every 500 steps and once more
        after the last epoch, and summaries are written to config.log_dir.
    """
    train_reader = Reader('train',
                          config.record_file,
                          config.anchors_path,
                          config.num_classes,
                          input_shape=config.input_shape,
                          max_boxes=config.max_boxes)
    train_data = train_reader.build_dataset(config.train_batch_size)
    is_training = tf.placeholder(tf.bool, shape=[])
    iterator = train_data.make_one_shot_iterator()
    images, bbox, bbox_true_13, bbox_true_26, bbox_true_52 = iterator.get_next()

    # Pin the static shapes of the tensors coming out of the iterator.
    images.set_shape([None, config.input_shape, config.input_shape, 3])
    bbox.set_shape([None, config.max_boxes, 5])
    grid_shapes = [
        config.input_shape // 32,
        config.input_shape // 16,
        config.input_shape // 8,
    ]
    for target, grid in zip((bbox_true_13, bbox_true_26, bbox_true_52),
                            grid_shapes):
        target.set_shape([None, grid, grid, 3, 5 + config.num_classes])
    draw_box(images, bbox)

    model = yolo(config.norm_epsilon, config.norm_decay, config.anchors_path,
                 config.classes_path, config.pre_train)
    bbox_true = [bbox_true_13, bbox_true_26, bbox_true_52]
    # Floor division keeps the anchor count an int under Python 3.
    output = model.yolo_inference(images, config.num_anchors // 3,
                                  config.num_classes, is_training)
    loss = model.yolo_loss(output, bbox_true, model.anchors,
                           config.num_classes, config.ignore_thresh)
    l2_loss = tf.losses.get_regularization_loss()
    loss += l2_loss
    tf.summary.scalar('loss', loss)
    merged_summary = tf.summary.merge_all()

    global_step = tf.Variable(0, trainable=False)
    lr = tf.train.exponential_decay(config.learning_rate,
                                    global_step,
                                    decay_steps=2000,
                                    decay_rate=0.8)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    # When pre-trained weights are used, only the variables under the 'yolo'
    # scope are optimized, which leaves darknet53 frozen.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        if config.pre_train:
            train_var = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          scope='yolo')
            train_op = optimizer.minimize(loss=loss,
                                          global_step=global_step,
                                          var_list=train_var)
        else:
            train_op = optimizer.minimize(loss=loss, global_step=global_step)
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    with tf.Session(config=tf.ConfigProto(log_device_placement=False)) as sess:
        with tf.device("/gpu:0"):
            ckpt = tf.train.get_checkpoint_state(config.model_dir)
            if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
                print('restore model', ckpt.model_checkpoint_path)
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                sess.run(init)
            if config.pre_train is True:
                load_ops = load_weights(
                    tf.global_variables(scope='darknet53'),
                    config.darknet53_weights_path)
                sess.run(load_ops)
            summary_writer = tf.summary.FileWriter(config.log_dir, sess.graph)

            # Integer steps per epoch, so step_count stays an exact int and
            # the modulo tests below behave as intended.
            max_iter = int(config.train_num / config.train_batch_size)
            format_str = (
                'Epoch {} step {}, train loss = {} ( {} examples/sec; {} '
                'sec/batch)')
            loss_value = 0
            # Number of losses accumulated since the last report; the first
            # window holds one more step than the later ones.
            window_steps = 0
            for epoch in range(config.Epoch):
                for step in range(max_iter):
                    start_time = time.time()
                    train_loss, summary, global_step_value, _ = sess.run(
                        [loss, merged_summary, global_step, train_op],
                        {is_training: True})
                    loss_value += train_loss
                    window_steps += 1
                    step_count = epoch * max_iter + step
                    if step_count % 20 == 0 and step_count != 0:
                        duration = time.time() - start_time
                        # Throughput is examples divided by elapsed time.
                        examples_per_sec = (config.train_batch_size /
                                            max(duration, 1e-9))
                        print(
                            format_str.format(epoch, step,
                                              loss_value / window_steps,
                                              examples_per_sec, duration))
                        loss_value = 0
                        window_steps = 0
                    # Index summaries by the overall step; the in-epoch step
                    # repeats every epoch.
                    summary_writer.add_summary(
                        summary=tf.Summary(value=[
                            tf.Summary.Value(tag="train loss",
                                             simple_value=train_loss)
                        ]),
                        global_step=step_count)
                    summary_writer.add_summary(summary, step_count)
                    summary_writer.flush()
                    # Periodic checkpoint every 500 steps.
                    if step_count % 500 == 0:
                        checkpoint_path = os.path.join(config.model_dir,
                                                       'model.ckpt')
                        saver.save(sess, checkpoint_path,
                                   global_step=global_step)
            # Final checkpoint after the last epoch.
            checkpoint_path = os.path.join(config.model_dir, 'model.ckpt')
            saver.save(sess, checkpoint_path, global_step=global_step)
def train():
    """
    Introduction
    ------------
        Train the model.

        A generator (model.GAN_g) is fed a 4x down-scaled copy of the input
        images, and yolo_inference is run both on the original images and on
        the generator output. d_loss sums the yolo losses of both passes;
        g_loss sums the MSE against the original images, a sigmoid
        cross-entropy term on d_fake[3] and a yolo loss on the generated
        images. Each step runs the d_loss update and then the g_loss update.
        Summaries go to config.log_dir and a checkpoint is saved to
        config.model_dir every third epoch.
    """
    # ----------------------- train data -----------------------
    train_reader = Reader('train',
                          config.data_dir,
                          config.anchors_path2,
                          config.num_classes,
                          input_shape=config.input_shape,
                          max_boxes=config.max_boxes)
    train_data = train_reader.build_dataset(config.train_batch_size)
    is_training = tf.placeholder(tf.bool, shape=[])
    iterator = train_data.make_one_shot_iterator()
    images, bbox, bbox_true_13, bbox_true_26, bbox_true_52 = iterator.get_next()

    # ----------------------- shapes -----------------------
    images.set_shape([None, config.input_shape, config.input_shape, 3])
    bbox.set_shape([None, config.max_boxes, 5])
    grid_shapes = [
        config.input_shape // 32,
        config.input_shape // 16,
        config.input_shape // 8,
    ]
    low_res = config.input_shape // 4
    lr_images = tf.image.resize_images(images,
                                       size=[low_res, low_res],
                                       method=0,
                                       align_corners=False)
    lr_images.set_shape([None, low_res, low_res, 3])
    for target, grid in zip((bbox_true_13, bbox_true_26, bbox_true_52),
                            grid_shapes):
        target.set_shape([None, grid, grid, 3, 5 + config.num_classes])
    bbox_true = [bbox_true_13, bbox_true_26, bbox_true_52]

    # ----------------------- summary + draw -----------------------
    tf.summary.image('input1', images, max_outputs=3)
    draw_box(images, bbox)

    # ----------------------- model -----------------------
    model = yolo(config.norm_epsilon, config.norm_decay, config.anchors_path2,
                 config.classes_path, config.pre_train)
    # Floor division keeps the anchor count an int under Python 3.
    anchors_per_scale = config.num_anchors // 3
    with tf.variable_scope("model_gd"):
        net_g1 = model.GAN_g(lr_images, is_train=True, mask=False)
        net_g = model.GAN_g(lr_images, is_train=True, reuse=True, mask=True)
        d_real = model.yolo_inference(images,
                                      anchors_per_scale,
                                      config.num_classes,
                                      training=True)
        # The second inference pass shares the variables of the first one.
        tf.get_variable_scope().reuse_variables()
        d_fake = model.yolo_inference(net_g.outputs,
                                      anchors_per_scale,
                                      config.num_classes,
                                      training=True)

    # ----------------------- d_loss -----------------------
    d_loss1 = model.yolo_loss(d_real, bbox_true, model.anchors,
                              config.num_classes, 1, config.ignore_thresh)
    d_loss2 = model.yolo_loss(d_fake, bbox_true, model.anchors,
                              config.num_classes, 0, config.ignore_thresh)
    d_loss = d_loss1 + d_loss2
    l2_loss = tf.losses.get_regularization_loss()
    d_loss += l2_loss

    # ----------------------- g_loss -----------------------
    adv_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.ones_like(d_fake[3]), logits=d_fake[3])
    adv_loss = tf.reduce_sum(adv_loss) / tf.cast(
        tf.shape(d_fake[3])[0], tf.float32)
    mse_loss1 = tl.cost.mean_squared_error(net_g1.outputs,
                                           images,
                                           is_mean=True)
    mse_loss1 = tf.reduce_sum(mse_loss1) / tf.cast(
        tf.shape(net_g1.outputs)[0], tf.float32)
    mse_loss2 = tl.cost.mean_squared_error(net_g.outputs,
                                           images,
                                           is_mean=True)
    mse_loss2 = tf.reduce_sum(mse_loss2) / tf.cast(
        tf.shape(net_g.outputs)[0], tf.float32)
    mse_loss = mse_loss1 + mse_loss2
    clc_loss = model.yolo_loss(d_fake, bbox_true, model.anchors,
                               config.num_classes, 1, config.ignore_thresh)
    g_loss = mse_loss + adv_loss + clc_loss
    l2_loss = tf.losses.get_regularization_loss()
    g_loss += l2_loss

    # ----------------------- summary loss -----------------------
    tf.summary.scalar('d_loss', d_loss)
    tf.summary.scalar('g_loss', g_loss)
    merged_summary = tf.summary.merge_all()

    # ----------------------- optimizer -----------------------
    global_step = tf.Variable(0, trainable=False)
    lr = tf.train.exponential_decay(config.learning_rate,
                                    global_step,
                                    decay_steps=2000,
                                    decay_rate=0.8)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    # When pre-trained weights are used, restrict each update to its own
    # variable scopes so that the darknet53 variables stay frozen.
    # NOTE(review): both train ops increment global_step, so it advances
    # twice per training iteration; confirm this is intended for the decay.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        if config.pre_train:
            train_varg1 = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                scope='model_gd/generator/generator1')
            train_varg2 = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                scope='model_gd/generator/generator2')
            train_varg = train_varg1 + train_varg2
            train_vard = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                scope='model_gd/yolo_inference/discriminator')
            train_opg = optimizer.minimize(loss=g_loss,
                                           global_step=global_step,
                                           var_list=train_varg)
            train_opd = optimizer.minimize(loss=d_loss,
                                           global_step=global_step,
                                           var_list=train_vard)
        else:
            train_opd = optimizer.minimize(loss=d_loss,
                                           global_step=global_step)
            train_opg = optimizer.minimize(loss=g_loss,
                                           global_step=global_step)

    # ----------------------- session -----------------------
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    with tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                          allow_soft_placement=True)) as sess:
        ckpt = tf.train.get_checkpoint_state(config.model_dir)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('restore model', ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(init)
        if config.pre_train is True:
            load_ops = load_weights(tf.global_variables(scope='darknet53'),
                                    config.darknet53_weights_path)
            sess.run(load_ops)
        summary_writer = tf.summary.FileWriter(config.log_dir, sess.graph)

        steps_per_epoch = int(config.train_num / config.train_batch_size)
        format_str1 = (
            'Epoch {} step {}, train dloss = {} train gloss = {} ( {} examples/sec; {} '
            'sec/batch)')
        dloss_value = 0
        gloss_value = 0
        # Iterations completed in this process. The averages divide by this
        # rather than by the global step, which advances twice per iteration
        # and may come from a restored checkpoint.
        steps_run = 0
        for epoch in range(config.Epoch):
            for step in range(steps_per_epoch):
                start_time = time.time()
                train_dloss, summary, global_step_value, _ = sess.run(
                    [d_loss, merged_summary, global_step, train_opd],
                    {is_training: True})
                train_gloss, summary, global_step_value, _ = sess.run(
                    [g_loss, merged_summary, global_step, train_opg],
                    {is_training: True})
                dloss_value += train_dloss
                gloss_value += train_gloss
                steps_run += 1
                duration = time.time() - start_time
                # Throughput is examples divided by elapsed time.
                examples_per_sec = (config.train_batch_size /
                                    max(duration, 1e-9))
                print(global_step_value)
                # ----------------------- print -----------------------
                print(
                    format_str1.format(epoch, step, dloss_value / steps_run,
                                       gloss_value / steps_run,
                                       examples_per_sec, duration))
                # ----------------------- summary loss -----------------------
                # Index summaries by the overall step; the in-epoch step
                # repeats every epoch.
                overall_step = epoch * steps_per_epoch + step
                summary_writer.add_summary(
                    summary=tf.Summary(value=[
                        tf.Summary.Value(tag="train dloss",
                                         simple_value=train_dloss)
                    ]),
                    global_step=overall_step)
                summary_writer.add_summary(
                    summary=tf.Summary(value=[
                        tf.Summary.Value(tag="train gloss",
                                         simple_value=train_gloss)
                    ]),
                    global_step=overall_step)
                summary_writer.add_summary(summary, overall_step)
                summary_writer.flush()
            # ----------------------- save model -----------------------
            # Save the model every 3 epochs.
            if epoch % 3 == 0:
                checkpoint_path = os.path.join(config.model_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=global_step)