def main(_):
    # TODO: split file support
    with tf.Graph().as_default():
        global save_model_dir
        # NOTE: the validation loader reads from the 'testing' split
        with KittiLoader(object_dir=os.path.join(dataset_dir, 'training'),
                         queue_size=50, require_shuffle=True, is_testset=False,
                         batch_size=args.single_batch_size * cfg.GPU_USE_COUNT,
                         use_multi_process_num=8, multi_gpu_sum=cfg.GPU_USE_COUNT,
                         aug=True) as train_loader, \
                KittiLoader(object_dir=os.path.join(dataset_dir, 'testing'),
                            queue_size=50, require_shuffle=True, is_testset=False,
                            batch_size=args.single_batch_size * cfg.GPU_USE_COUNT,
                            use_multi_process_num=8, multi_gpu_sum=cfg.GPU_USE_COUNT,
                            aug=False) as valid_loader:
            gpu_options = tf.GPUOptions(
                per_process_gpu_memory_fraction=cfg.GPU_MEMORY_FRACTION,
                visible_device_list=cfg.GPU_AVAILABLE,
                allow_growth=True)
            config = tf.ConfigProto(
                gpu_options=gpu_options,
                device_count={"GPU": cfg.GPU_USE_COUNT},
                allow_soft_placement=True,
            )
            with tf.Session(config=config) as sess:
                model = RPN3D(
                    cls=cfg.DETECT_OBJ,
                    single_batch_size=args.single_batch_size,
                    learning_rate=args.lr,
                    max_gradient_norm=5.0,
                    is_train=True,
                    alpha=1.5,
                    beta=1,
                    avail_gpus=cfg.GPU_AVAILABLE.split(','))

                # param init/restore
                if tf.train.get_checkpoint_state(save_model_dir):
                    print("Reading model parameters from %s" % save_model_dir)
                    model.saver.restore(
                        sess, tf.train.latest_checkpoint(save_model_dir))
                else:
                    print("Created model with fresh parameters.")
                    tf.global_variables_initializer().run()

                # train and validate
                iter_per_epoch = int(
                    len(train_loader) / (args.single_batch_size * cfg.GPU_USE_COUNT))
                is_summary, is_summary_image, is_validate = False, False, False
                summary_interval = 5
                summary_image_interval = 20
                save_model_interval = int(iter_per_epoch / 3)
                validate_interval = 60
                summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

                while model.epoch.eval() < args.max_epoch:
                    is_summary, is_summary_image, is_validate = False, False, False
                    iter = model.global_step.eval()
                    if not iter % summary_interval:
                        is_summary = True
                    if not iter % summary_image_interval:
                        is_summary_image = True
                    if not iter % save_model_interval:
                        model.saver.save(
                            sess, os.path.join(save_model_dir, 'checkpoint'),
                            global_step=model.global_step)
                    if not iter % validate_interval:
                        is_validate = True
                    if not iter % iter_per_epoch:
                        sess.run(model.epoch_add_op)
                        print('train {} epoch, total: {}'.format(
                            model.epoch.eval(), args.max_epoch))

                    ret = model.train_step(
                        sess, train_loader.load(), train=True, summary=is_summary)
                    print('train: {}/{} @ epoch:{}/{} loss: {} reg_loss: {} cls_loss: {} {}'.format(
                        iter, iter_per_epoch * args.max_epoch, model.epoch.eval(),
                        args.max_epoch, ret[0], ret[1], ret[2], args.tag))

                    if is_summary:
                        summary_writer.add_summary(ret[-1], iter)
                    if is_summary_image:
                        ret = model.predict_step(
                            sess, valid_loader.load(), summary=True)
                        summary_writer.add_summary(ret[-1], iter)
                    if is_validate:
                        ret = model.validate_step(
                            sess, valid_loader.load(), summary=True)
                        summary_writer.add_summary(ret[-1], iter)

                    if check_if_should_pause(args.tag):
                        model.saver.save(
                            sess, os.path.join(save_model_dir, 'checkpoint'),
                            global_step=model.global_step)
                        print('pause and save model @ {} steps:{}'.format(
                            save_model_dir, model.global_step.eval()))
                        sys.exit(0)

                print('train done. total epoch:{} iter:{}'.format(
                    model.epoch.eval(), model.global_step.eval()))

                # finally save model
                model.saver.save(
                    sess, os.path.join(save_model_dir, 'checkpoint'),
                    global_step=model.global_step)
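# check_if_should_pause(args.tag) is used throughout these variants to request
# a clean checkpoint-and-exit, but its definition is not shown anywhere. A
# minimal sketch, assuming the pause is signalled by touching a flag file named
# after the experiment tag (the 'pause' directory is hypothetical):
def check_if_should_pause(tag):
    flag_file = os.path.join('pause', tag)  # hypothetical signalling path
    if os.path.isfile(flag_file):
        os.remove(flag_file)  # consume the request so the next run starts cleanly
        return True
    return False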
def main(_):
    # TODO: split file support
    with tf.Graph().as_default():
        global save_model_dir
        start_epoch = 0
        global_counter = 0

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.GPU_MEMORY_FRACTION,
            visible_device_list=cfg.GPU_AVAILABLE,
            allow_growth=True)
        config = tf.ConfigProto(
            gpu_options=gpu_options,
            device_count={"GPU": cfg.GPU_USE_COUNT},
            allow_soft_placement=True,
        )

        with tf.Session(config=config) as sess:
            model = RPN3D(
                cls=cfg.DETECT_OBJ,
                single_batch_size=args.single_batch_size,
                learning_rate=args.lr,
                max_gradient_norm=5.0,
                alpha=args.alpha,
                beta=args.beta,
                avail_gpus=cfg.GPU_AVAILABLE  # .split(',')
            )

            # param init/restore
            if tf.train.get_checkpoint_state(save_model_dir):
                print("Reading model parameters from %s" % save_model_dir)
                model.saver.restore(sess, tf.train.latest_checkpoint(save_model_dir))
                start_epoch = model.epoch.eval() + 1
                global_counter = model.global_step.eval() + 1
            else:
                print("Created model with fresh parameters.")
                tf.global_variables_initializer().run()

            # train and validate
            is_summary, is_summary_image, is_validate = False, False, False
            summary_interval = 5
            summary_val_interval = 10
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

            # training
            for epoch in range(start_epoch, args.max_epoch):
                counter = 0
                batch_time = time.time()
                for batch in iterate_data(train_dir, shuffle=True, aug=True, is_testset=False,
                                          batch_size=args.single_batch_size * cfg.GPU_USE_COUNT,
                                          multi_gpu_sum=cfg.GPU_USE_COUNT):
                    counter += 1
                    global_counter += 1
                    is_summary = counter % summary_interval == 0

                    start_time = time.time()
                    ret = model.train_step(sess, batch, train=True, summary=is_summary)
                    forward_time = time.time() - start_time
                    batch_time = time.time() - batch_time

                    print('train: {} @ epoch:{}/{} loss: {:.4f} reg_loss: {:.4f} cls_loss: {:.4f} '
                          'cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f}'
                          .format(counter, epoch, args.max_epoch, ret[0], ret[1], ret[2],
                                  ret[3], ret[4], forward_time, batch_time))
                    with open('log/train.txt', 'a') as f:
                        f.write('train: {} @ epoch:{}/{} loss: {:.4f} reg_loss: {:.4f} cls_loss: {:.4f} '
                                'cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f}\n'
                                .format(counter, epoch, args.max_epoch, ret[0], ret[1], ret[2],
                                        ret[3], ret[4], forward_time, batch_time))

                    if counter % summary_interval == 0:
                        print("summary_interval now")
                        summary_writer.add_summary(ret[-1], global_counter)

                    if counter % summary_val_interval == 0:
                        print("summary_val_interval now")
                        batch = sample_test_data(
                            val_dir, args.single_batch_size * cfg.GPU_USE_COUNT,
                            multi_gpu_sum=cfg.GPU_USE_COUNT)
                        ret = model.validate_step(sess, batch, summary=True)
                        summary_writer.add_summary(ret[-1], global_counter)
                        try:
                            ret = model.predict_step(sess, batch, summary=True)
                            summary_writer.add_summary(ret[-1], global_counter)
                        except Exception:
                            print("prediction skipped due to error")

                    if check_if_should_pause(args.tag):
                        model.saver.save(sess, os.path.join(save_model_dir, 'checkpoint'),
                                         global_step=model.global_step)
                        print('pause and save model @ {} steps:{}'.format(
                            save_model_dir, model.global_step.eval()))
                        sys.exit(0)

                    batch_time = time.time()

                sess.run(model.epoch_add_op)
                model.saver.save(sess, os.path.join(save_model_dir, 'checkpoint'),
                                 global_step=model.global_step)

                # dump test data every 10 epochs
                if (epoch + 1) % 10 == 0:
                    # create output folders
                    os.makedirs(os.path.join(args.output_path, str(epoch)), exist_ok=True)
                    os.makedirs(os.path.join(args.output_path, str(epoch), 'data'), exist_ok=True)
                    if args.vis:
                        os.makedirs(os.path.join(args.output_path, str(epoch), 'vis'), exist_ok=True)

                    for batch in iterate_data(val_dir, shuffle=False, aug=False, is_testset=False,
                                              batch_size=args.single_batch_size * cfg.GPU_USE_COUNT,
                                              multi_gpu_sum=cfg.GPU_USE_COUNT):
                        if args.vis:
                            tags, results, front_images, bird_views, heatmaps = model.predict_step(
                                sess, batch, summary=False, vis=True)
                        else:
                            tags, results = model.predict_step(sess, batch, summary=False, vis=False)

                        for tag, result in zip(tags, results):
                            of_path = os.path.join(args.output_path, str(epoch), 'data', tag + '.txt')
                            with open(of_path, 'w+') as f:
                                labels = box3d_to_label([result[:, 1:8]], [result[:, 0]],
                                                        [result[:, -1]], coordinate='lidar')[0]
                                for line in labels:
                                    f.write(line)
                                print('write out {} objects to {}'.format(len(labels), tag))

                        # dump visualizations
                        if args.vis:
                            for tag, front_image, bird_view, heatmap in zip(
                                    tags, front_images, bird_views, heatmaps):
                                front_img_path = os.path.join(
                                    args.output_path, str(epoch), 'vis', tag + '_front.jpg')
                                bird_view_path = os.path.join(
                                    args.output_path, str(epoch), 'vis', tag + '_bv.jpg')
                                heatmap_path = os.path.join(
                                    args.output_path, str(epoch), 'vis', tag + '_heatmap.jpg')
                                cv2.imwrite(front_img_path, front_image)
                                cv2.imwrite(bird_view_path, bird_view)
                                cv2.imwrite(heatmap_path, heatmap)

                    # execute evaluation code
                    cmd_1 = "./kitti_eval/launch_test.sh"
                    cmd_2 = os.path.join(args.output_path, str(epoch))
                    cmd_3 = os.path.join(args.output_path, str(epoch), 'log')
                    os.system(" ".join([cmd_1, cmd_2, cmd_3]))

            print('train done. total epoch:{} iter:{}'.format(
                epoch, model.global_step.eval()))

            # finally save model
            model.saver.save(sess, os.path.join(save_model_dir, 'checkpoint'),
                             global_step=model.global_step)
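# The evaluation step above shells out via os.system, which silently ignores
# failures. A slightly safer sketch of the same call using subprocess (same
# script and arguments; whether launch_test.sh signals errors through its exit
# code is an assumption):
import subprocess

def run_kitti_eval(output_path, epoch):
    result_dir = os.path.join(output_path, str(epoch))
    eval_log_dir = os.path.join(output_path, str(epoch), 'log')
    # check=True raises CalledProcessError on a non-zero exit code
    subprocess.run(["./kitti_eval/launch_test.sh", result_dir, eval_log_dir],
                   check=True)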
def main(_):
    with tf.Graph().as_default():
        start_epoch = 0
        global_counter = 0

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.GPU_MEMORY_FRACTION,
            visible_device_list=cfg.GPU_AVAILABLE,
            allow_growth=True)
        config = tf.ConfigProto(
            gpu_options=gpu_options,
            device_count={"GPU": cfg.GPU_USE_COUNT},
            allow_soft_placement=True,
        )

        with tf.Session(config=config) as sess:
            model = RPN3D(cls=cfg.DETECT_OBJ,
                          decrease=args.decrease,
                          minimize=args.minimize,
                          single_batch_size=args.single_batch_size,
                          learning_rate=args.lr,
                          max_gradient_norm=5.0,
                          alpha=args.alpha,
                          beta=args.beta,
                          avail_gpus=cfg.GPU_AVAILABLE.split(','))

            # param init/restore
            if tf.train.get_checkpoint_state(save_model_dir):
                print("Reading model parameters from %s" % save_model_dir)
                model.saver.restore(sess, tf.train.latest_checkpoint(save_model_dir))
                start_epoch = model.epoch.eval() + 1
                global_counter = model.global_step.eval() + 1
            else:
                print("Created model with fresh parameters.")
                tf.global_variables_initializer().run()

            # train and validate
            is_summary, is_summary_image, is_validate = False, False, False
            summary_interval = 5
            summary_val_interval = 10
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

            # training
            for epoch in range(start_epoch, args.max_epoch):
                counter = 0
                batch_time = time.time()
                for batch in iterate_data(train_dir, shuffle=True, aug=True, is_testset=False,
                                          batch_size=args.single_batch_size * cfg.GPU_USE_COUNT,
                                          multi_gpu_sum=cfg.GPU_USE_COUNT):
                    counter += 1
                    global_counter += 1
                    is_summary = counter % summary_interval == 0

                    start_time = time.time()
                    ret = model.train_step(sess, batch, train=True, summary=is_summary)
                    forward_time = time.time() - start_time
                    batch_time = time.time() - batch_time

                    print('train: {} @ epoch:{}/{} loss: {:.4f} reg_loss: {:.4f} cls_loss: {:.4f} '
                          'cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f}'
                          .format(counter, epoch + 1, args.max_epoch, ret[0], ret[1], ret[2],
                                  ret[3], ret[4], forward_time, batch_time))
                    with open(os.path.join('log', 'train.txt'), 'a') as f:
                        f.write('train: {} @ epoch:{}/{} loss: {:.4f} reg_loss: {:.4f} cls_loss: {:.4f} '
                                'cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f}\n'
                                .format(counter, epoch + 1, args.max_epoch, ret[0], ret[1], ret[2],
                                        ret[3], ret[4], forward_time, batch_time))

                    if counter % summary_interval == 0:
                        print("summary_interval now")
                        summary_writer.add_summary(ret[-1], global_counter)

                    if counter % summary_val_interval == 0:
                        print("summary_val_interval now")
                        batch = sample_test_data(
                            val_dir, args.single_batch_size * cfg.GPU_USE_COUNT,
                            multi_gpu_sum=cfg.GPU_USE_COUNT)
                        ret = model.validate_step(sess, batch, summary=True)
                        summary_writer.add_summary(ret[-1], global_counter)

                    if check_if_should_pause(args.tag):
                        model.saver.save(sess, os.path.join(save_model_dir, 'checkpoint'),
                                         global_step=model.global_step)
                        print('pause and save model @ {} steps:{}'.format(
                            save_model_dir, model.global_step.eval()))
                        sys.exit(0)

                    batch_time = time.time()

                sess.run(model.epoch_add_op)
                model.saver.save(sess, os.path.join(save_model_dir, 'checkpoint'),
                                 global_step=model.global_step)

                # dump test data every 10 epochs
                if (epoch + 1) % 10 == 0:
                    os.makedirs(os.path.join(res_dir, str(epoch)), exist_ok=True)
                    os.makedirs(os.path.join(res_dir, str(epoch), 'data'), exist_ok=True)
                    for batch in iterate_data(val_dir, shuffle=False, aug=False, is_testset=False,
                                              batch_size=args.single_batch_size * cfg.GPU_USE_COUNT,
                                              multi_gpu_sum=cfg.GPU_USE_COUNT):
                        tags, results = model.predict_step(sess, batch, summary=False, vis=False)
                        for tag, result in zip(tags, results):
                            of_path = os.path.join(res_dir, str(epoch), 'data', tag + '.txt')
                            with open(of_path, 'w+') as f:
                                labels = box3d_to_label([result[:, 1:8]], [result[:, 0]],
                                                        [result[:, -1]], coordinate='lidar')[0]
                                for line in labels:
                                    f.write(line)
                                print('write out {} objects to {}'.format(len(labels), tag))

            # finally save model
            model.saver.save(sess, os.path.join(save_model_dir, 'checkpoint'),
                             global_step=model.global_step)
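# Reading the box3d_to_label calls above, each row of `result` appears to be
# laid out as [class, box3d(7), score]; the exact order of the seven box
# parameters (x, y, z, h, w, l, rz in lidar coordinates) is my inference from
# coordinate='lidar', not something stated in this file.
def split_result_row(result):
    cls_labels = result[:, 0]   # column 0: predicted class
    boxes3d = result[:, 1:8]    # columns 1-7: 3D box parameters
    scores = result[:, -1]      # last column: detection score
    return cls_labels, boxes3d, scores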
def main(_):
    global log_f
    timestr = time.strftime("%b-%d_%H-%M-%S", time.localtime())
    log_f = open('log/train_{}.txt'.format(timestr), 'w')
    log_print(str(cfg))

    # TODO: split file support
    with tf.Graph().as_default():
        global save_model_dir
        start_epoch = 0
        global_counter = 0

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.GPU_MEMORY_FRACTION,
            visible_device_list=cfg.GPU_AVAILABLE,
            allow_growth=True)
        config = tf.ConfigProto(
            gpu_options=gpu_options,
            device_count={"GPU": cfg.GPU_USE_COUNT},
            allow_soft_placement=True,
        )

        with tf.Session(config=config) as sess:
            model = RPN3D(cls=cfg.DETECT_OBJ,
                          single_batch_size=args.single_batch_size,
                          learning_rate=args.lr,
                          max_gradient_norm=5.0,
                          alpha=args.alpha,
                          beta=args.beta,
                          avail_gpus=cfg.GPU_AVAILABLE.split(','))

            # param init/restore
            if args.restore and tf.train.get_checkpoint_state(save_model_dir):
                log_print("Reading model parameters from %s" % save_model_dir)
                model.saver.restore(sess, tf.train.latest_checkpoint(save_model_dir))
                start_epoch = model.epoch.eval() + 1
                global_counter = model.global_step.eval() + 1
            else:
                log_print("Created model with fresh parameters.")
                tf.global_variables_initializer().run()

            # optionally load pretrained feature-net encoder weights
            if cfg.FEATURE_NET_TYPE == 'FeatureNet_AE' and cfg.FeatureNet_AE_WPATH:
                ae_checkpoint_file = tf.train.latest_checkpoint(cfg.FeatureNet_AE_WPATH)
                log_print("Load Pretrained FeatureNet_AE weights %s" % ae_checkpoint_file)
                ae_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope='ae_encoder')
                ae_saver = tf.train.Saver(var_list={v.op.name: v for v in ae_vars})
                ae_saver.restore(sess, ae_checkpoint_file)

            if cfg.FEATURE_NET_TYPE == 'FeatureNet_VAE' and cfg.FeatureNet_VAE_WPATH:
                vae_checkpoint_file = tf.train.latest_checkpoint(cfg.FeatureNet_VAE_WPATH)
                log_print("Load Pretrained FeatureNet_VAE weights %s" % vae_checkpoint_file)
                vae_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope='vae_encoder')
                vae_saver = tf.train.Saver(var_list={v.op.name: v for v in vae_vars})
                vae_saver.restore(sess, vae_checkpoint_file)

            # train and validate
            is_summary, is_summary_image, is_validate = False, False, False
            summary_interval = 5
            summary_val_interval = 20
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

            parameter_num = np.sum(
                [np.prod(v.shape.as_list()) for v in tf.trainable_variables()])
            log_print('Parameter number: {}'.format(parameter_num))

            # training
            for epoch in range(start_epoch, args.max_epoch):
                counter = 0
                batch_time = time.time()
                for batch in iterate_data(train_dir, db_sampler=sampler, shuffle=True,
                                          aug=AUG_DATA, is_testset=False,
                                          batch_size=args.single_batch_size * cfg.GPU_USE_COUNT,
                                          multi_gpu_sum=cfg.GPU_USE_COUNT):
                    counter += 1
                    global_counter += 1
                    is_summary = counter % summary_interval == 0

                    start_time = time.time()
                    ret = model.train_step(sess, batch, train=True, summary=is_summary)
                    forward_time = time.time() - start_time
                    batch_time = time.time() - batch_time

                    log_print(
                        'train: {} @ epoch:{}/{} loss: {:.4f} reg_loss: {:.4f} cls_loss: {:.4f} '
                        'cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f}'
                        .format(counter, epoch, args.max_epoch, ret[0], ret[1], ret[2],
                                ret[3], ret[4], forward_time, batch_time),
                        write=is_summary)

                    if counter % summary_interval == 0:
                        log_print("summary_interval now")
                        summary_writer.add_summary(ret[-1], global_counter)

                    if counter % summary_val_interval == 0:
                        log_print("summary_val_interval now")
                        # randomly sample a single batch of validation data
                        batch = sample_test_data(
                            val_dir, args.single_batch_size * cfg.GPU_USE_COUNT,
                            multi_gpu_sum=cfg.GPU_USE_COUNT)
                        ret = model.validate_step(sess, batch, summary=True)
                        summary_writer.add_summary(ret[-1], global_counter)
                        log_print('validation: loss: {:.4f} reg_loss: {:.4f} cls_loss: {:.4f}'
                                  .format(ret[0], ret[1], ret[2]))
                        # promote warnings to errors so a degenerate prediction
                        # is skipped instead of polluting the summaries
                        with warnings.catch_warnings():
                            warnings.filterwarnings('error')
                            try:
                                ret = model.predict_step(sess, batch, summary=True)
                                summary_writer.add_summary(ret[-1], global_counter)
                            except Exception:
                                log_print('prediction skipped due to error', 'red')

                    if check_if_should_pause(args.tag):
                        model.saver.save(sess, os.path.join(save_model_dir, timestr),
                                         global_step=model.global_step)
                        log_print('pause and save model @ {} steps:{}'.format(
                            save_model_dir, model.global_step.eval()))
                        sys.exit(0)

                    batch_time = time.time()

                sess.run(model.epoch_add_op)
                model.saver.save(sess, os.path.join(save_model_dir, timestr),
                                 global_step=model.global_step)

                # dump test data every 10 epochs
                if (epoch + 1) % 10 == 0:
                    # create output folders
                    os.makedirs(os.path.join(args.output_path, str(epoch)), exist_ok=True)
                    os.makedirs(os.path.join(args.output_path, str(epoch), 'data'), exist_ok=True)
                    if args.vis:
                        os.makedirs(os.path.join(args.output_path, str(epoch), 'vis'), exist_ok=True)

                    for batch in iterate_data(val_dir, shuffle=False, aug=False, is_testset=False,
                                              batch_size=args.single_batch_size * cfg.GPU_USE_COUNT,
                                              multi_gpu_sum=cfg.GPU_USE_COUNT):
                        if args.vis:
                            tags, results, front_images, bird_views, heatmaps = model.predict_step(
                                sess, batch, summary=False, vis=True)
                        else:
                            tags, results = model.predict_step(sess, batch, summary=False, vis=False)

                        for tag, result in zip(tags, results):
                            of_path = os.path.join(args.output_path, str(epoch), 'data', tag + '.txt')
                            with open(of_path, 'w+') as f:
                                P, Tr, R = load_calib(
                                    os.path.join(cfg.CALIB_DIR, tag + '.txt'))
                                labels = box3d_to_label([result[:, 1:8]], [result[:, 0]],
                                                        [result[:, -1]], coordinate='lidar',
                                                        P2=P, T_VELO_2_CAM=Tr, R_RECT_0=R)[0]
                                for line in labels:
                                    f.write(line)
                                log_print('write out {} objects to {}'.format(len(labels), tag))

                        # dump visualizations
                        if args.vis:
                            for tag, front_image, bird_view, heatmap in zip(
                                    tags, front_images, bird_views, heatmaps):
                                front_img_path = os.path.join(
                                    args.output_path, str(epoch), 'vis', tag + '_front.jpg')
                                bird_view_path = os.path.join(
                                    args.output_path, str(epoch), 'vis', tag + '_bv.jpg')
                                heatmap_path = os.path.join(
                                    args.output_path, str(epoch), 'vis', tag + '_heatmap.jpg')
                                cv2.imwrite(front_img_path, front_image)
                                cv2.imwrite(bird_view_path, bird_view)
                                cv2.imwrite(heatmap_path, heatmap)

                    # execute evaluation code
                    cmd_1 = "./kitti_eval/launch_test.sh"
                    cmd_2 = os.path.join(args.output_path, str(epoch))
                    cmd_3 = os.path.join(args.output_path, str(epoch), 'log')
                    os.system(" ".join([cmd_1, cmd_2, cmd_3]))

            log_print('train done. total epoch:{} iter:{}'.format(
                epoch, model.global_step.eval()))

            # finally save model
            model.saver.save(sess, os.path.join(save_model_dir, 'checkpoint'),
                             global_step=model.global_step)
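# None of the variants define log_print; from its call sites -- log_print(msg),
# log_print(msg, 'red'), log_print(msg, write=is_summary) -- a plausible sketch
# is a print wrapper that optionally colors the console line and appends to the
# global log file. The termcolor dependency and the exact signature are
# assumptions.
from termcolor import colored

def log_print(msg, color=None, write=True):
    print(colored(msg, color) if color else msg)
    if write and log_f is not None:  # assumes a module-level log_f opened in main()
        log_f.write(msg + '\n')
        log_f.flush()  # keep the on-disk log current even if training crashes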
def main(_):
    # TODO: split file support
    with tf.Graph().as_default():
        global save_model_dir
        start_epoch = 0
        global_counter = 0

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.GPU_MEMORY_FRACTION,
            visible_device_list=cfg.GPU_AVAILABLE,
            allow_growth=True)
        config = tf.ConfigProto(
            gpu_options=gpu_options,
            device_count={"GPU": cfg.GPU_USE_COUNT},
            allow_soft_placement=True,
        )

        with tf.Session(config=config) as sess:
            model = RPN3D(cls=cfg.DETECT_OBJ,
                          single_batch_size=args.single_batch_size,
                          learning_rate=args.lr,
                          max_gradient_norm=5.0,
                          is_train=True,
                          alpha=args.alpha,
                          beta=args.beta,
                          avail_gpus=cfg.GPU_AVAILABLE.split(','))

            # param init/restore
            if tf.train.get_checkpoint_state(save_model_dir):
                print("Reading model parameters from %s" % save_model_dir)
                model.saver.restore(sess, tf.train.latest_checkpoint(save_model_dir))
                start_epoch = model.epoch.eval() + 1
                global_counter = model.global_step.eval() + 1
            else:
                print("Created model with fresh parameters.")
                tf.global_variables_initializer().run()

            # train and validate
            is_summary, is_summary_image, is_validate = False, False, False
            summary_interval = 5
            summary_val_interval = 10
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

            # training
            for epoch in range(start_epoch, args.max_epoch):
                counter = 0
                for batch in iterate_data(train_dir, shuffle=True, aug=True, is_testset=False,
                                          batch_size=args.single_batch_size * cfg.GPU_USE_COUNT,
                                          multi_gpu_sum=cfg.GPU_USE_COUNT):
                    counter += 1
                    global_counter += 1
                    is_summary = counter % summary_interval == 0

                    start_time = time.time()
                    ret = model.train_step(sess, batch, train=True, summary=is_summary)
                    times = time.time() - start_time

                    print('train: {} @ epoch:{}/{} loss: {} reg_loss: {} cls_loss: {} time: {}'
                          .format(counter, epoch, args.max_epoch,
                                  ret[0], ret[1], ret[2], times))
                    with open('log/train.txt', 'a') as f:
                        f.write('train: {} @ epoch:{}/{} loss: {} reg_loss: {} cls_loss: {} time: {}\n'
                                .format(counter, epoch, args.max_epoch,
                                        ret[0], ret[1], ret[2], times))

                    if counter % summary_interval == 0:
                        print("summary_interval now")
                        summary_writer.add_summary(ret[-1], global_counter)

                    if counter % summary_val_interval == 0:
                        print("summary_val_interval now")
                        batch = sample_test_data(
                            val_dir, args.single_batch_size * cfg.GPU_USE_COUNT,
                            multi_gpu_sum=cfg.GPU_USE_COUNT)
                        ret = model.validate_step(sess, batch, summary=True)
                        summary_writer.add_summary(ret[-1], global_counter)
                        try:
                            ret = model.predict_step(sess, batch, summary=True)
                            summary_writer.add_summary(ret[-1], global_counter)
                        except Exception:
                            print("prediction skipped due to error")

                    if check_if_should_pause(args.tag):
                        model.saver.save(sess, os.path.join(save_model_dir, 'checkpoint'),
                                         global_step=model.global_step)
                        print('pause and save model @ {} steps:{}'.format(
                            save_model_dir, model.global_step.eval()))
                        sys.exit(0)

                sess.run(model.epoch_add_op)
                model.saver.save(sess, os.path.join(save_model_dir, 'checkpoint'),
                                 global_step=model.global_step)

            print('train done. total epoch:{} iter:{}'.format(
                epoch, model.global_step.eval()))

            # finally save model
            model.saver.save(sess, os.path.join(save_model_dir, 'checkpoint'),
                             global_step=model.global_step)
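# sample_test_data is called throughout these variants but never defined. Given
# that it must return one batch in the same format iterate_data yields, a
# minimal sketch simply takes the first batch of a shuffled pass; the project's
# real helper may sample differently.
def sample_test_data(data_dir, batch_size, multi_gpu_sum=1):
    return next(iterate_data(data_dir, shuffle=True, aug=False, is_testset=False,
                             batch_size=batch_size, multi_gpu_sum=multi_gpu_sum))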
def main(_):
    with tf.Graph().as_default():
        global save_model_dir
        with KittiLoader(object_dir=os.path.join(dataset_dir, 'object', 'training'),
                         require_shuffle=True,
                         split_file=os.path.join(cfg.ROOT_DIR, 'DataSplits', 'train.txt'),
                         is_testset=False, batch_size=args.single_batch_size,
                         aug=False, aug_num=0) as train_loader, \
                KittiLoader(object_dir=os.path.join(dataset_dir, 'object', 'training'),
                            require_shuffle=False,
                            split_file=os.path.join(cfg.ROOT_DIR, 'DataSplits', 'val.txt'),
                            is_testset=False, batch_size=args.single_batch_size,
                            aug=False, aug_num=0) as valid_loader:
            gpu_options = tf.GPUOptions(
                per_process_gpu_memory_fraction=cfg.GPU_MEMORY_FRACTION,
                visible_device_list=cfg.GPU_AVAILABLE,
                allow_growth=True)
            config = tf.ConfigProto(gpu_options=gpu_options,
                                    device_count={"GPU": cfg.GPU_USE_COUNT},
                                    allow_soft_placement=True)

            with tf.Session(config=config) as sess:
                premodelTime = time.time()
                model = VoxelNet(cls=args.cls,
                                 single_batch_size=args.single_batch_size,
                                 learning_rate=args.learning_rate,
                                 max_gradient_norm=5.0,
                                 is_train=True,
                                 alpha=1.5,
                                 beta=1,
                                 avail_gpus=cfg.GPU_AVAILABLE.split(','))
                postmodelTime = time.time()
                getTotalNumberOfParams(model)
                print("It took {} seconds to create the model".format(
                    postmodelTime - premodelTime))

                # Restore from checkpoint if it exists
                if tf.train.get_checkpoint_state(save_model_dir):
                    print("Reading model parameters from ", save_model_dir)
                    prereadTime = time.time()
                    model.saver.restore(sess, tf.train.latest_checkpoint(save_model_dir))
                    postreadTime = time.time()
                    print("It took {} seconds to read parameters from file".format(
                        postreadTime - prereadTime))
                else:
                    # No checkpoint exists
                    print("Initializing model parameters")
                    preInitTime = time.time()
                    tf.global_variables_initializer().run()
                    postInitTime = time.time()
                    print("It took {} seconds to freshly initialize model parameters".format(
                        postInitTime - preInitTime))

                # Train and validate
                iter_per_epoch = int(len(train_loader) /
                                     (args.single_batch_size * cfg.GPU_USE_COUNT))
                is_summary, is_summary_image, is_validate = False, False, False
                summary_interval = 5
                summary_image_interval = 20
                save_model_interval = int(iter_per_epoch / 3)
                validate_interval = 60
                bestValLoss = 100000

                print('\n--------------------------------------------------------------')
                print('Training parameters')
                print('batch size={} with {} augmented members added per batch'.format(
                    args.single_batch_size, args.num_aug_per_batch))
                print('\tnum members per pass {}'.format(
                    args.single_batch_size + args.num_aug_per_batch))
                print('max epoch={}'.format(args.max_epoch))
                print('iter_per_epoch={}'.format(iter_per_epoch))
                print('current epoch={}'.format(model.epoch.eval()))
                print('summary_interval={}'.format(summary_interval))
                print('summary_image_interval={}'.format(summary_image_interval))
                print('save_model_interval={}'.format(save_model_interval))
                print('validate_interval={}'.format(validate_interval))

                summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
                startTraining = time.time()
                # NOTE: the epoch bound below is commented out, so this loop runs
                # until the process is stopped externally; check_if_should_pause
                # only saves a checkpoint here, it does not exit.
                # while model.epoch.eval() < args.max_epoch:
                while True:
                    is_summary, is_summary_image, is_validate = False, False, False
                    iter = model.global_step.eval()
                    print('iteration = {}'.format(iter))
                    if not iter % summary_interval:
                        is_summary = True
                    if not iter % summary_image_interval:
                        is_summary_image = True
                    if not iter % save_model_interval:
                        model.saver.save(sess, os.path.join(save_model_dir, 'checkpoint'),
                                         global_step=model.global_step)
                    if not iter % validate_interval:
                        is_validate = True
                    if not iter % iter_per_epoch:
                        sess.run(model.epoch_add_op)
                        print('training epoch {} of {} total'.format(
                            model.epoch.eval(), args.max_epoch))

                    flag, data = train_loader.load(args.single_batch_size)
                    if flag:
                        train_loader.reset()
                    ret = model.train_step(sess, data, train=True, summary=is_summary)
                    print('train: {}/{} @ epoch:{}/{} loss: {} reg_loss: {} cls_loss: {} {}'.format(
                        iter, iter_per_epoch * args.max_epoch, model.epoch.eval(),
                        args.max_epoch, ret[0], ret[1], ret[2], args.tag))
                    print('Time since training started {} secs'.format(
                        time.time() - startTraining))

                    if is_summary:
                        print('\twriting summary')
                        summary_writer.add_summary(ret[-1], iter)

                    if is_summary_image:
                        print('\tmaking images')
                        flag, valdat = valid_loader.load(args.single_batch_size)
                        if flag:
                            valid_loader.reset()
                        ret = model.predict_step(sess, valdat, summary=True)
                        summary_writer.add_summary(ret[-1], iter)

                    if is_validate:
                        print('\trunning validate')
                        losses = []
                        for i in range(50):
                            flag, valdat = valid_loader.load(args.single_batch_size)
                            if flag:
                                valid_loader.reset()
                            ret = model.validate_step(sess, valdat, summary=True)
                            losses.append(ret[0])
                        ave_loss = np.average(np.array(losses))
                        if ave_loss < bestValLoss:
                            print('\tnew best average validation loss for 50 forward passes '
                                  'was {} now {} at iteration {}'.format(
                                      bestValLoss, ave_loss, iter))
                            bestValLoss = ave_loss
                            model.saver.save(sess, os.path.join(save_best_dir, 'checkpoint'),
                                             global_step=model.global_step)

                    if check_if_should_pause(args.tag):
                        print('\tsaving model')
                        model.saver.save(sess, os.path.join(save_model_dir, 'checkpoint'),
                                         global_step=model.global_step)

                stopTraining = time.time()
                print('Training took a total of {} secs for {} total iterations'.format(
                    stopTraining - startTraining, args.max_epoch * iter_per_epoch))
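# getTotalNumberOfParams(model) is called above without a definition. The RPN3D
# variant earlier computes the same quantity inline with
# np.sum([np.prod(v.shape.as_list()) for v in tf.trainable_variables()]),
# so a matching reconstruction (the function body is my sketch) is:
def getTotalNumberOfParams(model):
    total = np.sum([np.prod(v.shape.as_list()) for v in tf.trainable_variables()])
    print('Total number of trainable parameters: {}'.format(total))
    return total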