def train(is_ft=False):
    """Train the image-to-image model on the ``tfData/part1..9.tfrecords`` shards.

    Builds the input pipeline, the network, a mean-squared-error loss and an
    Adam training op in a fresh graph, then runs ``max_iters`` steps. The
    first prediction of the batch is printed every 10 steps, the mean loss
    every 100 steps, and a checkpoint is written to ``../model`` every 500
    steps and on the final step.

    Relies on the module-level names ``batch_size``, ``image_height``,
    ``image_width``, ``max_iters``, ``decode_from_tfrecords``, ``cv_resize``
    and ``inference``.

    Args:
        is_ft: When true, restore the latest checkpoint found in ``./model``
            before training (fine-tuning).

    Raises:
        ValueError: If ``is_ft`` is true and ``./model`` holds no checkpoint.
        AssertionError: If the loss becomes NaN.
    """
    with tf.Graph().as_default():
        with tf.variable_scope("model"):
            root_path = "tfData/part"
            train_queue = [root_path + str(part_index) + '.tfrecords'
                           for part_index in range(1, 10)]
            images, label = decode_from_tfrecords(
                train_queue, batch_size, image_height, image_width)
            images = tf.py_func(
                cv_resize, [images, image_height, image_width], tf.float32)
            # Give the py_func output an explicit static shape again.
            images = tf.reshape(
                images, [batch_size, image_height, image_width, 1])

            # The network output is added to its own input, then clipped to
            # the range [0, 255].
            logits = inference(images) + images
            logits = tf.clip_by_value(logits, 0, 255)
            loss = tf.losses.mean_squared_error(
                labels=label, predictions=logits)

            # Only the MSE term is optimised. The original built an unused
            # regularisation sum here; tf.add_n fails on an empty list, so
            # it was removed rather than left to crash for no benefit.
            opt = tf.train.AdamOptimizer(1e-4)
            global_step = tf.Variable(0, name='global_step', trainable=False)
            # create_train_op attaches the UPDATE_OPS collection itself when
            # no explicit update_ops are given, so no manual wrapper is
            # needed (the old wrapper's result was never fetched anyway).
            train_op = slim.learning.create_train_op(
                loss, opt, global_step=global_step)

            saver = tf.train.Saver(tf.global_variables())
            init = tf.global_variables_initializer()

            tf.logging.set_verbosity(tf.logging.INFO)
            session_config = tf.ConfigProto(log_device_placement=False)
            with tf.Session(config=session_config) as sess:
                sess.run(init)

                if is_ft:
                    # NOTE(review): weights are restored from './model' but
                    # checkpoints are saved to '../model' -- confirm that the
                    # two different directories are intended.
                    model_file = tf.train.latest_checkpoint('./model')
                    if model_file is None:
                        raise ValueError(
                            "No checkpoint found in './model' to fine-tune from")
                    saver.restore(sess, model_file)

                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(sess=sess, coord=coord)
                try:
                    loss_sum = 0.0
                    steps_since_log = 0
                    for step in range(max_iters):
                        _, loss_value, l = sess.run([train_op, loss, logits])
                        assert not np.isnan(
                            loss_value), 'Model diverged with loss = NaN'

                        if step % 10 == 0:
                            print(l[0])

                        loss_sum += loss_value
                        steps_since_log += 1
                        if step % 100 == 0:
                            # Average over the steps actually accumulated;
                            # the old fixed divisor of 10.0 did not match
                            # the 100-step logging interval.
                            format_str = ('%s: step %d, loss = %.2f')
                            print(format_str % (datetime.now(), step,
                                                loss_sum / steps_since_log))
                            loss_sum = 0.0
                            steps_since_log = 0

                        if step % 500 == 0 or (step + 1) == max_iters:
                            checkpoint_path = os.path.join(
                                '../model', 'model.ckpt')
                            saver.save(sess, checkpoint_path, global_step=step)
                finally:
                    # Always ask the input threads to stop, even on failure.
                    coord.request_stop()
                coord.join(threads)
def train(is_ft=False):
    """Train ``tiny_darknet`` as a classifier on ``train_data.tfrecords``.

    Builds the input pipeline, the network, a sparse softmax cross-entropy
    loss plus the collected regularisation losses, and a momentum-SGD
    training op in a fresh graph. It then runs ``max_iters`` steps, printing
    the mean loss every 10 steps and writing a checkpoint to
    ``/root/JZ_test/darknet0_model`` every 200 steps and on the final step.

    Relies on the module-level names ``max_iters``, ``decode_from_tfrecords``
    and ``tiny_darknet``.

    Args:
        is_ft: When true, restore the latest checkpoint from
            ``/root/JZ_test/darknet0_model`` before training (fine-tuning).

    Raises:
        ValueError: If ``is_ft`` is true and no checkpoint is found.
        AssertionError: If the loss becomes NaN.
    """
    model_dir = '/root/JZ_test/darknet0_model'

    with tf.Graph().as_default():
        with tf.variable_scope("model"):
            train_queue = ["train_data.tfrecords"]
            images, labels = decode_from_tfrecords(train_queue, 128)

            logits = tiny_darknet(images)
            # Average the network output over axes 1 and 2 (presumably the
            # spatial dimensions -- confirm against tiny_darknet).
            # The cross-entropy op below applies softmax internally, so it
            # must receive these raw scores: the original passed
            # tf.nn.softmax(...) output, which applied softmax twice.
            logits = tf.reduce_mean(logits, [1, 2])
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels, logits=logits)
            reg_loss = tf.add_n(tf.losses.get_regularization_losses())
            total_loss = tf.reduce_mean(loss) + reg_loss

            opt = tf.train.MomentumOptimizer(0.01, 0.9)
            global_step = tf.Variable(0, name='global_step', trainable=False)
            train_op = slim.learning.create_train_op(
                total_loss, opt, global_step=global_step)

            # Make fetching the loss also trigger any registered update ops.
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            if update_ops:
                updates = tf.group(*update_ops)
                total_loss = control_flow_ops.with_dependencies(
                    [updates], total_loss)

            saver = tf.train.Saver(tf.global_variables())
            init = tf.global_variables_initializer()

            tf.logging.set_verbosity(tf.logging.INFO)
            with tf.Session() as sess:
                sess.run(init)

                if is_ft:
                    model_file = tf.train.latest_checkpoint(model_dir)
                    if model_file is None:
                        raise ValueError(
                            'No checkpoint found in %r to fine-tune from'
                            % model_dir)
                    saver.restore(sess, model_file)

                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(sess=sess, coord=coord)
                try:
                    loss_cnt = 0.0
                    for step in range(max_iters):
                        _, loss_value = sess.run([train_op, total_loss])
                        assert not np.isnan(
                            loss_value), 'Model diverged with loss = NaN'
                        loss_cnt += loss_value

                        if step % 10 == 0:
                            format_str = ('%s: step %d, loss = %.2f')
                            # Step 0 has accumulated a single value only.
                            if step == 0:
                                avg_loss_cnt = loss_cnt
                            else:
                                avg_loss_cnt = loss_cnt / 10.0
                            print(format_str % (datetime.now(), step,
                                                avg_loss_cnt))
                            loss_cnt = 0.0

                        if step % 200 == 0 or (step + 1) == max_iters:
                            checkpoint_path = os.path.join(
                                model_dir, 'model.ckpt')
                            saver.save(sess, checkpoint_path, global_step=step)
                finally:
                    # Always ask the input threads to stop, even on failure.
                    coord.request_stop()
                coord.join(threads)
def train(is_ft=False):
    """Train ``tiny_darknet`` on ``train_lj.tfrecords`` with a weighted sigmoid loss.

    Builds the input pipeline, the network, a weighted sigmoid cross-entropy
    loss (``pos_weight=1.5``) plus the collected regularisation losses, and a
    momentum-SGD training op in a fresh graph. It then runs ``max_iters``
    steps. Every 10 steps the mean loss and a thresholded accuracy are
    printed and written as summaries; every 50 steps and on the final step a
    checkpoint is saved. Summaries and checkpoints share one output
    directory.

    Relies on the module-level names ``max_iters``, ``decode_from_tfrecords``
    and ``tiny_darknet``.

    Args:
        is_ft: When true, restore the most recent checkpoint listed in
            ``./models/try-linjian/JZ_data/0_lj-wd8e5-0.01`` before training.

    Raises:
        ValueError: If ``is_ft`` is true and no checkpoint state is found.
        AssertionError: If the loss becomes NaN.
    """
    output_dir = ('/root/linjian/darknet_0/models/try-linjian/lj_data/A-0/'
                  'loss_wd4e5_lj-0.5')
    finetune_dir = './models/try-linjian/JZ_data/0_lj-wd8e5-0.01'

    with tf.Graph().as_default():
        with tf.variable_scope("model"):
            train_queue = ["train_lj.tfrecords"]
            images, labels = decode_from_tfrecords(train_queue, 128)

            logits = tiny_darknet(images)
            # Average the network output over axes 1 and 2 (presumably the
            # spatial dimensions -- confirm against tiny_darknet).
            logits = tf.reduce_mean(logits, [1, 2])
            loss = tf.nn.weighted_cross_entropy_with_logits(
                targets=labels, logits=logits, pos_weight=1.5)
            reg_loss = tf.add_n(tf.losses.get_regularization_losses())
            total_loss = tf.reduce_mean(loss) + reg_loss

            # Accuracy is computed in Python and fed back through a
            # placeholder so it can be logged next to the loss.
            thre = 0.9
            accuracy_ = tf.placeholder(tf.float32)
            logist_acc = tf.nn.sigmoid(logits)
            tf.summary.scalar('total_loss', total_loss)
            tf.summary.scalar('accuracy', accuracy_)

            # A decayed learning rate was tried here earlier (it survived
            # only as commented-out code); a fixed rate is used now.
            opt = tf.train.MomentumOptimizer(0.5, 0.9)
            global_step = tf.Variable(0, name='global_step', trainable=False)
            train_op = slim.learning.create_train_op(
                total_loss, opt, global_step=global_step)

            # Make fetching the loss also trigger any registered update ops.
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            if update_ops:
                updates = tf.group(*update_ops)
                total_loss = control_flow_ops.with_dependencies(
                    [updates], total_loss)

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=400)
            init = tf.global_variables_initializer()
            merged = tf.summary.merge_all()

            tf.logging.set_verbosity(tf.logging.INFO)
            # The original created two sessions in a row and leaked the
            # first; only the default-config one was ever used.
            with tf.Session() as sess:
                train_writer = tf.summary.FileWriter(output_dir, sess.graph)
                coord = tf.train.Coordinator()
                threads = []
                try:
                    sess.run(init)

                    if is_ft:
                        ckpt_state = tf.train.get_checkpoint_state(
                            finetune_dir)
                        if (ckpt_state is None or
                                not ckpt_state.all_model_checkpoint_paths):
                            raise ValueError(
                                'No checkpoint found in %r to fine-tune from'
                                % finetune_dir)
                        saver.restore(
                            sess, ckpt_state.all_model_checkpoint_paths[-1])

                    threads = tf.train.start_queue_runners(
                        sess=sess, coord=coord)

                    loss_cnt = 0.0
                    correct_cnt = 0.0
                    sample_cnt = 0
                    for step in range(max_iters):
                        _, loss_value, acc, gt = sess.run(
                            [train_op, total_loss, logist_acc, labels])

                        # NOTE(review): a sigmoid score at or above the
                        # threshold is mapped to class 0, anything below to
                        # class 1 -- confirm this matches the label encoding.
                        for prob, truth in zip(acc, gt):
                            predict = 0 if prob[0] >= thre else 1
                            if predict == truth:
                                correct_cnt += 1
                        sample_cnt += len(acc)

                        assert not np.isnan(
                            loss_value), 'Model diverged with loss = NaN'
                        loss_cnt += loss_value

                        if step % 10 == 0:
                            format_str = (
                                '%s: step %d, loss = %.4f, acc = %.4f')
                            # Step 0 has accumulated a single value only.
                            if step == 0:
                                avg_loss_cnt = loss_cnt
                            else:
                                avg_loss_cnt = loss_cnt / 10.0
                            # Divide by the samples actually scored; the old
                            # fixed 1280 under-reported accuracy at step 0.
                            accuracy = correct_cnt / float(sample_cnt)
                            summary_str = sess.run(
                                merged, feed_dict={accuracy_: accuracy})
                            train_writer.add_summary(summary_str, step)
                            print(format_str % (datetime.now(), step,
                                                avg_loss_cnt, accuracy))
                            correct_cnt = 0.0
                            sample_cnt = 0
                            loss_cnt = 0.0

                        if step % 50 == 0 or (step + 1) == max_iters:
                            checkpoint_path = os.path.join(
                                output_dir, 'model.ckpt')
                            saver.save(sess, checkpoint_path, global_step=step)
                finally:
                    # Stop the input threads and flush summaries on failure too.
                    coord.request_stop()
                    train_writer.close()
                coord.join(threads)
def train(is_ft=True):
    """Train ``tiny_darknet`` as a classifier on ``train_quarter.tfrecords``.

    Builds the input pipeline, the network, a sparse softmax cross-entropy
    loss plus the collected regularisation losses, and a momentum-SGD
    training op in a fresh graph. It then runs ``max_iters`` steps. Every 10
    steps the mean loss is printed and the merged summaries (input images
    and total loss) are written; every 4000 steps and on the final step a
    checkpoint is saved. Summaries and checkpoints share one output
    directory.

    Relies on the module-level names ``max_iters``, ``decode_from_tfrecords``
    and ``tiny_darknet``.

    Args:
        is_ft: When true (the default), restore the latest checkpoint from
            ``./models/lr0.01_iter30w_qnew`` before training.

    Raises:
        ValueError: If ``is_ft`` is true and no checkpoint is found.
        AssertionError: If the loss becomes NaN.
    """
    output_dir = '/root/linjian/darknet_0/models/lr0.01_iter30w_qnew/lr0.01'
    # NOTE(review): fine-tuning reads from this relative directory, while
    # new checkpoints go to the absolute output_dir above -- confirm the two
    # locations are meant to differ.
    finetune_dir = './models/lr0.01_iter30w_qnew'

    with tf.Graph().as_default():
        with tf.variable_scope("model"):
            train_queue = ["train_quarter.tfrecords"]
            images, labels = decode_from_tfrecords(train_queue, 128)

            logits = tiny_darknet(images)
            tf.summary.image('iuput', images)
            # Average the network output over axes 1 and 2 (presumably the
            # spatial dimensions -- confirm against tiny_darknet). The raw
            # scores go straight into the cross-entropy op, which applies
            # softmax itself.
            logits = tf.reduce_mean(logits, [1, 2])
            print(logits.get_shape().as_list())
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels, logits=logits)
            reg_loss = tf.add_n(tf.losses.get_regularization_losses())
            total_loss = tf.reduce_mean(loss) + reg_loss
            tf.summary.scalar('total_loss', total_loss)

            # A decayed learning rate was tried here earlier (it survived
            # only as commented-out code); a fixed rate is used now.
            opt = tf.train.MomentumOptimizer(0.01, 0.9)
            global_step = tf.Variable(0, name='global_step', trainable=False)
            train_op = slim.learning.create_train_op(
                total_loss, opt, global_step=global_step)

            # Make fetching the loss also trigger any registered update ops.
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            if update_ops:
                updates = tf.group(*update_ops)
                total_loss = control_flow_ops.with_dependencies(
                    [updates], total_loss)

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=50)
            init = tf.global_variables_initializer()
            merged = tf.summary.merge_all()

            tf.logging.set_verbosity(tf.logging.INFO)
            # The original created two sessions in a row and leaked the
            # first; only the default-config one was ever used.
            with tf.Session() as sess:
                train_writer = tf.summary.FileWriter(output_dir, sess.graph)
                coord = tf.train.Coordinator()
                threads = []
                try:
                    sess.run(init)

                    if is_ft:
                        model_file = tf.train.latest_checkpoint(finetune_dir)
                        if model_file is None:
                            raise ValueError(
                                'No checkpoint found in %r to fine-tune from'
                                % finetune_dir)
                        saver.restore(sess, model_file)

                    threads = tf.train.start_queue_runners(
                        sess=sess, coord=coord)

                    loss_cnt = 0.0
                    for step in range(max_iters):
                        _, loss_value = sess.run([train_op, total_loss])
                        assert not np.isnan(
                            loss_value), 'Model diverged with loss = NaN'
                        loss_cnt += loss_value

                        if step % 10 == 0:
                            format_str = ('%s: step %d, loss = %.4f')
                            # Step 0 has accumulated a single value only.
                            if step == 0:
                                avg_loss_cnt = loss_cnt
                            else:
                                avg_loss_cnt = loss_cnt / 10.0
                            # This is a separate run of the graph from the
                            # training step above.
                            summary_str = sess.run(merged)
                            train_writer.add_summary(summary_str, step)
                            print(format_str % (datetime.now(), step,
                                                avg_loss_cnt))
                            loss_cnt = 0.0

                        if step % 4000 == 0 or (step + 1) == max_iters:
                            checkpoint_path = os.path.join(
                                output_dir, 'model.ckpt')
                            saver.save(sess, checkpoint_path, global_step=step)
                finally:
                    # Stop the input threads and flush summaries on failure too.
                    coord.request_stop()
                    train_writer.close()
                coord.join(threads)