print("cfg_file: {}".format(cfg_file)) with open(cfg_file, "r") as f: configs = json.load(f) configs["system"]["snapshot_name"] = args.cfg_file system_configs.update_config(configs["system"]) print("system config...") pprint.pprint(system_configs.full) print("loading parameters: {}".format(args.model_path)) print("building neural network...") nnet = NetworkFactory(None) print("loading parameters...") nnet.load_pretrained_params(args.model_path) nnet.cuda() nnet.eval_mode() K = configs["db"]["top_k"] aggr_weight = configs["db"]["aggr_weight"] scores_thresh = configs["db"]["scores_thresh"] center_thresh = configs["db"]["center_thresh"] suppres_ghost = True nms_kernel = 3 scales = configs["db"]["test_scales"] weight_exp = 8 categories = configs["db"]["categories"] nms_threshold = configs["db"]["nms_threshold"] max_per_image = configs["db"]["max_per_image"] nms_algorithm = {
from torch.autograd import Variable
from torchvision.models import resnet
import pytorch_to_caffe
from config import system_configs
from nnet.py_factory import NetworkFactory

if __name__ == '__main__':
    # Export the CornerNet model to Caffe (prototxt + caffemodel) via tracing.
    name = 'cornernet'
    db = 1

    # Load the experiment configuration and register it globally so that
    # NetworkFactory picks up the right settings.
    cfg_file = os.path.join(system_configs.config_dir, "CornerNet" + ".json")
    with open(cfg_file, "r") as f:
        configs = json.load(f)
    configs["system"]["snapshot_name"] = "CornerNet"
    system_configs.update_config(configs["system"])

    # Build the network and switch it to GPU inference mode.
    cornernet = NetworkFactory(db).model
    cornernet.cuda()
    cornernet.eval()

    # Trace with a dummy input at the network's expected resolution (511x511),
    # then emit the Caffe definition and weights.
    dummy = torch.ones([1, 3, 511, 511]).cuda()
    pytorch_to_caffe.trans_net(cornernet, dummy, name)
    pytorch_to_caffe.save_prototxt('{}.prototxt'.format(name))
    pytorch_to_caffe.save_caffemodel('{}.caffemodel'.format(name))
def train(training_dbs, validation_db, start_iter=0):
    """Train the network, logging losses to a Tensorboard helper.

    Args:
        training_dbs: list of training dataset instances (one per fetcher
            process).
        validation_db: validation dataset instance.
        start_iter: iteration to resume from (0 = train from scratch).
    """
    # hyper-parameters come from the experiment json via system_configs
    learning_rate = system_configs.learning_rate
    max_iteration = system_configs.max_iter
    pretrained_model = system_configs.pretrain
    snapshot = system_configs.snapshot
    val_iter = system_configs.val_iter
    display = system_configs.display
    decay_rate = system_configs.decay_rate
    stepsize = system_configs.stepsize

    # getting the size of each database
    training_size = len(training_dbs[0].db_inds)
    validation_size = len(validation_db.db_inds)

    # multiprocessing queues carrying batches produced by the fetcher
    # processes (worker processes cannot return values directly)
    training_queue = Queue(system_configs.prefetch_size)
    validation_queue = Queue(5)

    # queue.Queue counterparts holding pinned (page-locked) batches,
    # filled by the pin_memory threads below
    pinned_training_queue = queue.Queue(system_configs.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # load the sampling function named after the db, e.g. sample.coco
    data_file = "sample.{}".format(training_dbs[0].data)
    sample_data = importlib.import_module(data_file).sample_data

    # spawn fetcher processes (last arg True = training-time augmentation)
    training_tasks = init_parallel_jobs(training_dbs, training_queue,
                                        sample_data, True)
    if val_iter:
        validation_tasks = init_parallel_jobs([validation_db],
                                              validation_queue, sample_data,
                                              False)

    # semaphores used purely as shutdown signals: acquire() drops the count
    # to 0 here; release() at the end lets the pin threads terminate
    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True  # dies with the main thread
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    print("building model...")
    nnet = NetworkFactory(training_dbs[0])

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        # resuming: decay the lr as if start_iter iterations had already run
        learning_rate /= (decay_rate**(start_iter // stepsize))
        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    # defining tensorboard writer
    tensorboard = Tensorboard('logs')

    print("training start...")
    nnet.cuda()
    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout, ncols=80):
            training = pinned_training_queue.get(block=True)
            # nnet.train returns an indexable of five loss tensors, in the
            # order named below
            out_train = nnet.train(**training)
            if display and iteration % display == 0:
                for idX, eleX in enumerate(
                        ["training", "focal", "pull", "push", "regr"]):
                    print("{} loss at iteration {}: {}".format(
                        eleX, iteration, out_train[idX].item()))
                    tensorboard.log_scalar('training/{} loss'.format(eleX),
                                           out_train[idX].item(), iteration)
            del out_train

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("validation loss at iteration {}: {}".format(
                    iteration, validation_loss.item()))
                tensorboard.log_scalar('validation/loss',
                                       validation_loss.item(), iteration)
                # every other validation pass, also run full keypoint
                # detection on the validation subset
                if iteration % (val_iter * 2) == 0:
                    kp_detection(validation_db, nnet, "./cache/", debug=False,
                                 subset_val=True, TB_obj=tensorboard,
                                 TB_iter=iteration)
                nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # closing tensorboard writer
    tensorboard.close()

    # sending signal to kill the thread
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    # NOTE(review): validation_tasks is only bound when val_iter is truthy;
    # this loop would raise NameError otherwise — confirm val_iter is always set
    for training_task in training_tasks:
        training_task.terminate()
    for validation_task in validation_tasks:
        validation_task.terminate()
def train(training_dbs, validation_db, start_iter=0, debug=False):
    """Train the network (validation path disabled in this variant).

    Args:
        training_dbs: list of training dataset instances.
        validation_db: validation dataset instance (currently unused — the
            validation machinery is commented out below).
        start_iter: iteration to resume from (0 = train from scratch).
        debug: forwarded to init_parallel_jobs.
    """
    learning_rate = system_configs.learning_rate
    max_iteration = system_configs.max_iter
    pretrained_model = system_configs.pretrain
    snapshot = system_configs.snapshot
    # val_iter = system_configs.val_iter
    display = system_configs.display
    decay_rate = system_configs.decay_rate
    stepsize = system_configs.stepsize

    # getting the size of each database
    training_size = len(training_dbs[0].db_inds)
    # validation_size = len(validation_db.db_inds)

    # queues storing data for training (multiprocessing side)
    training_queue = Queue(system_configs.prefetch_size)
    # validation_queue = Queue(5)

    # queues storing pinned data for training (threading side)
    pinned_training_queue = queue.Queue(system_configs.prefetch_size)
    # pinned_validation_queue = queue.Queue(5)

    # load data sampling function, e.g. sample.coco
    data_file = "sample.{}".format(training_dbs[0].data)
    sample_data = importlib.import_module(data_file).sample_data

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(training_dbs, training_queue,
                                        sample_data, True, debug)
    # if val_iter:
    #     validation_tasks = init_parallel_jobs([validation_db], validation_queue, sample_data, False)

    # semaphore used as a shutdown signal for the pin-memory thread
    training_pin_semaphore = threading.Semaphore()
    # validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    # validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True  # dies with the main thread
    training_pin_thread.start()

    # validation_pin_args = (validation_queue, pinned_validation_queue, validation_pin_semaphore)
    # validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args)
    # validation_pin_thread.daemon = True
    # validation_pin_thread.start()

    print("building model...")
    nnet = NetworkFactory(training_dbs[0])

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        # resuming: decay the lr as if start_iter iterations had already run
        learning_rate /= (decay_rate**(start_iter // stepsize))
        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    print("training start...")
    nnet.cuda()
    # nnet.eval_mode()
    # nnet.calculate_bboxes(cfg_file, 0)
    nnet.train_mode()

    # running average of the training loss, reset every 1000 iterations
    avg_loss = AverageMeter()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout, ncols=1):
            training = pinned_training_queue.get(block=True)
            training_loss = nnet.train(**training)
            avg_loss.update(training_loss.item())
            if display and iteration % display == 0:
                print("training loss at iteration {}: {:.6f} ({:.6f})".format(
                    iteration, training_loss.item(), avg_loss.avg))
            del training_loss

            # validation disabled in this variant:
            # if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
            #     nnet.eval_mode()
            #     nnet.calculate_bboxes(cfg_file, iteration)
            #     nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            # every 1000 iterations also write a "latest" snapshot (-1) and
            # reset the running loss average
            if iteration % 1000 == 0:
                nnet.save_params(-1)
                avg_loss = AverageMeter()

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # sending signal to kill the thread
    training_pin_semaphore.release()
    # validation_pin_semaphore.release()

    # terminating data fetching processes
    for training_task in training_tasks:
        training_task.terminate()
def train(training_dbs, validation_db, start_iter=0):
    """Train CenterNet (entry point invoked from train.py).

    Args:
        training_dbs: list of MSCOCO dataset instances (e.g. 4 copies on the
            "trainval2014" split), one per fetcher process.
        validation_db: a single MSCOCO instance configured by the experiment
            json (e.g. CenterNet-104.json).
        start_iter: iteration to resume from; comes from args.start_iter and
            is normally 0.
    """
    # All hyper-parameters below come from the experiment json (e.g.
    # CenterNet-104.json) via config.py; the values noted are that file's
    # defaults.
    learning_rate = system_configs.learning_rate  # 0.00025
    max_iteration = system_configs.max_iter  # 480000
    pretrained_model = system_configs.pretrain  # None
    snapshot = system_configs.snapshot  # 5000
    val_iter = system_configs.val_iter  # 500
    display = system_configs.display  # 5
    decay_rate = system_configs.decay_rate  # 10
    stepsize = system_configs.stepsize  # 450000

    # getting the size of each database
    training_size = len(training_dbs[0].db_inds)
    validation_size = len(validation_db.db_inds)

    # torch.multiprocessing queues: worker processes cannot return values,
    # so fetched batches are handed back through these queues.
    training_queue = Queue(
        system_configs.prefetch_size)  # prefetch_size = 6 in the json
    validation_queue = Queue(5)  # at most 5 queued batches

    # queue.Queue counterparts shared with the pin-memory threads
    pinned_training_queue = queue.Queue(system_configs.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # load data sampling function: "sample.coco" resolves to sample/coco.py,
    # and sample_data is the batch-building function defined there
    data_file = "sample.{}".format(training_dbs[0].data)  # sample.coco
    sample_data = importlib.import_module(data_file).sample_data

    # Spawn one torch.multiprocessing.Process per training db. Each worker
    # runs sample_data (data augmentation, heatmap / offset-regression
    # ground-truth generation, ...) and pushes network-ready batches into
    # training_queue.
    training_tasks = init_parallel_jobs(training_dbs, training_queue,
                                        sample_data, True)
    if val_iter:
        # validation workers run with augmentation disabled (last arg False);
        # their batches go to validation_queue
        validation_tasks = init_parallel_jobs([validation_db],
                                              validation_queue, sample_data,
                                              False)

    # threading.Semaphore(value=1): acquire() decrements the counter,
    # release() increments it; a thread blocks in acquire() while the
    # counter is 0. Here the semaphores are used purely as shutdown
    # signals for the pin-memory threads: acquiring now drops the count
    # to 0, and the release() calls at the end of training let the
    # daemon threads exit.
    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    # pin_memory moves batches from the multiprocessing queue into the
    # pinned queue (page-locked memory, ready for fast GPU transfer),
    # one batch at a time
    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    # daemon threads are killed automatically when the main thread exits,
    # so this transfer thread stays alive for the whole training run
    # without needing an explicit join
    training_pin_thread.daemon = True
    training_pin_thread.start()

    # same pin-memory pipeline for the validation batches
    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    print("building model...")
    nnet = NetworkFactory(training_dbs[0])

    if pretrained_model is not None:
        # CenterNet-104.json sets pretrain to null, so this is normally skipped
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        # resuming: decay the lr as if start_iter iterations had already run
        learning_rate /= (decay_rate**(start_iter // stepsize))
        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)  # fresh run: configured lr (0.00025)

    print("training start...")
    nnet.cuda()
    nnet.train_mode()
    # stdout_to_tqdm routes print() through the tqdm progress bar
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout, ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss, focal_loss, pull_loss, push_loss, regr_loss = nnet.train(
                **training)
            #training_loss, focal_loss, pull_loss, push_loss, regr_loss, cls_loss = nnet.train(**training)

            if display and iteration % display == 0:
                print("training loss at iteration {}: {}".format(
                    iteration, training_loss.item()))
                print("focal loss at iteration {}: {}".format(
                    iteration, focal_loss.item()))
                print("pull loss at iteration {}: {}".format(
                    iteration, pull_loss.item()))
                print("push loss at iteration {}: {}".format(
                    iteration, push_loss.item()))
                print("regr loss at iteration {}: {}".format(
                    iteration, regr_loss.item()))
                #print("cls loss at iteration {}: {}\n".format(iteration, cls_loss.item()))
            del training_loss, focal_loss, pull_loss, push_loss, regr_loss  #, cls_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("validation loss at iteration {}: {}".format(
                    iteration, validation_loss.item()))
                nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # sending signal to kill the thread
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    # NOTE(review): validation_tasks is only bound when val_iter is truthy;
    # this loop would raise NameError otherwise — confirm val_iter is always set
    for training_task in training_tasks:
        training_task.terminate()
    for validation_task in validation_tasks:
        validation_task.terminate()
def eval():
    """Evaluate the VOC detector and write per-class detection files + mAP.

    Side effects:
        - creates args.eval_dir if missing
        - writes det_test_<class>.txt per class (image_id prob x1 y1 x2 y2)
        - writes all.txt with per-class AP and the mean AP
        - prints progress and the same AP summary to stdout
    """
    eval_path = pathlib.Path(args.eval_dir)
    eval_path.mkdir(exist_ok=True)
    # fix: close the label file (the original open() handle was leaked)
    with open(args.label_file) as label_file:
        class_names = [name.strip() for name in label_file.readlines()]

    print("loading parameters at iteration: {}".format(args.testiter))
    print("building neural network...")
    nnet = NetworkFactory(db=None, dataset="voc")
    print("loading parameters...")
    nnet.load_params(args.testiter)
    nnet.cuda()
    nnet.eval_mode()

    predictor = importlib.import_module("test.voc").kp_detection_oneImage
    dataset = VOCDataset(args.dataset, is_test=True)
    true_case_stat, all_gb_boxes, all_difficult_cases = group_annotation_by_class(
        dataset)

    # collect all detections into one [total_boxes, 7] tensor:
    # columns are [image_index, label, prob, x1+1, y1+1, x2+1, y2+1]
    results = []
    for i in range(len(dataset)):
        print("process image", i)
        image = dataset.get_image(i)  # original [h, w, c]
        boxes, labels, probs = predictor(
            nnet, image)  # [num_box, 4], [num_box], [num_box]
        boxes = torch.from_numpy(boxes)
        labels = torch.from_numpy(labels)
        probs = torch.from_numpy(probs)
        indexes = torch.ones(labels.size(0), 1,
                             dtype=torch.float32) * i  # [num_box, 1], value=i
        results.append(
            torch.cat(
                [
                    indexes.reshape(-1, 1),
                    labels.reshape(-1, 1).float(),
                    probs.reshape(-1, 1),
                    boxes + 1.0  # matlab's indexes start from 1
                ],
                dim=1))
    results = torch.cat(results)

    # one prediction file per foreground class
    for class_index, class_name in enumerate(class_names):
        if class_index == 0:
            continue  # ignore background
        prediction_path = eval_path / f"det_test_{class_name}.txt"
        with open(prediction_path, "w") as f:
            sub = results[results[:, 1] == class_index, :]
            for i in range(sub.size(0)):  # num_boxes in this class
                prob_box = sub[i, 2:].numpy()
                image_id = dataset.ids[int(sub[i, 0])]
                print(image_id + " " + " ".join([str(v) for v in prob_box]),
                      file=f)

    # fix: use a context manager so all.txt is flushed and closed
    # (the original handle was never closed)
    aps = []
    print("\n\nAverage Precision Per-class:")
    with open(os.path.join(args.eval_dir, 'all.txt'), 'w') as f:
        f.write("Average Precision Per-class:\n\n")
        for class_index, class_name in enumerate(class_names):
            if class_index == 0:
                continue
            prediction_path = eval_path / f"det_test_{class_name}.txt"
            ap = compute_average_precision_per_class(
                true_case_stat[class_index], all_gb_boxes[class_index],
                all_difficult_cases[class_index], prediction_path,
                args.mAP_iou_threshold, args.use_2007_metric)
            aps.append(ap)
            print(f"{class_name}: {ap}")
            f.write(f"{class_name}: {ap}\n")

        print(f"\nAverage Precision Across All Classes:{sum(aps)/len(aps)}")
        f.write(f"\nAverage Precision Across All Classes:{sum(aps)/len(aps)}")
def train(training_dbs, validation_db, start_iter=0):
    """Train the network, logging losses to TensorBoard via SummaryWriter.

    Args:
        training_dbs: list of training dataset instances.
        validation_db: validation dataset instance.
        start_iter: iteration to resume from (0 = train from scratch).
    """
    learning_rate = system_configs.learning_rate
    max_iteration = system_configs.max_iter
    pretrained_model = system_configs.pretrain
    snapshot = system_configs.snapshot
    val_iter = system_configs.val_iter
    display = system_configs.display
    decay_rate = system_configs.decay_rate
    stepsize = system_configs.stepsize

    # TensorBoard writer for the scalar loss curves
    # NOTE(review): writer.close() is never called; buffered events may be
    # lost on exit — confirm whether this is acceptable
    writer = SummaryWriter(max_queue=5)

    # getting the size of each database
    training_size = len(training_dbs[0].db_inds)
    validation_size = len(validation_db.db_inds)

    # multiprocessing queues carrying batches from the fetcher processes
    training_queue = Queue(system_configs.prefetch_size)
    validation_queue = Queue(5)

    # queue.Queue counterparts holding pinned batches for the GPU
    pinned_training_queue = queue.Queue(system_configs.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # load the sampling function named after the db, e.g. sample.coco
    data_file = "sample.{}".format(training_dbs[0].data)
    sample_data = importlib.import_module(data_file).sample_data

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(training_dbs, training_queue,
                                        sample_data, True)
    if val_iter:
        validation_tasks = init_parallel_jobs([validation_db],
                                              validation_queue, sample_data,
                                              False)

    # semaphores used as shutdown signals for the pin-memory threads
    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True  # dies with the main thread
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    print("building model...")
    nnet = NetworkFactory(training_dbs[0])

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        # resuming: decay the lr as if start_iter iterations had already run
        learning_rate /= (decay_rate**(start_iter // stepsize))
        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    print("training start...")
    nnet.cuda()
    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout, ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss, focal_loss, pull_loss, push_loss, regr_loss = nnet.train(
                **training)
            #training_loss, focal_loss, pull_loss, push_loss, regr_loss, cls_loss = nnet.train(**training)

            if display and iteration % display == 0:
                print("training loss at iteration {}: {}".format(
                    iteration, training_loss.item()))
                print("focal loss at iteration {}: {}".format(
                    iteration, focal_loss.item()))
                print("pull loss at iteration {}: {}".format(
                    iteration, pull_loss.item()))
                print("push loss at iteration {}: {}".format(
                    iteration, push_loss.item()))
                print("regr loss at iteration {}: {}".format(
                    iteration, regr_loss.item()))
                #print("cls loss at iteration {}: {}\n".format(iteration, cls_loss.item()))
                # TensorBoard scalars, logged only on display iterations
                writer.add_scalar('training_loss', training_loss.item(),
                                  iteration)
                writer.add_scalar('focal_loss', focal_loss.item(), iteration)
                writer.add_scalar('pull_loss', pull_loss.item(), iteration)
                writer.add_scalar('push_loss', push_loss.item(), iteration)
                writer.add_scalar('regr_loss', regr_loss.item(), iteration)
            del training_loss, focal_loss, pull_loss, push_loss, regr_loss  #, cls_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("validation loss at iteration {}: {}".format(
                    iteration, validation_loss.item()))
                writer.add_scalar('validation_loss', validation_loss.item(),
                                  iteration)
                nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # sending signal to kill the thread
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    # NOTE(review): validation_tasks is only bound when val_iter is truthy;
    # this loop would raise NameError otherwise — confirm val_iter is always set
    for training_task in training_tasks:
        training_task.terminate()
    for validation_task in validation_tasks:
        validation_task.terminate()
def test():
    """Evaluate the building-segmentation network on the WHU test split.

    Writes binary prediction masks (255 = building) to result_dir and prints
    per-image-averaged and dataset-total IoU / recall / precision / F1,
    plus the inference FPS.
    """
    result_dir = "/data3/building_dataset/whu_building_Dataset/cropped images/test/result/"
    test_image_dir = '/data3/building_dataset/whu_building_Dataset/cropped images/test/image/'
    # label masks live next to the images
    test_label_dir = '/data3/building_dataset/whu_building_Dataset/cropped images/test/label/'

    print("building neural network...")
    nnet = NetworkFactory(test_dataloader)
    print("loading parameters...")
    # snapshot: .../MatrixNetAnchors/MatrixNetAnchors_20481.pkl
    nnet.load_params(20481)
    nnet.cuda()

    # fix: list the directory once — the original re-ran os.listdir() on
    # every iteration (O(n^2) and prone to ordering inconsistencies)
    img_names = os.listdir(test_image_dir)

    iou_totoal_mean = 0.0
    recall_mean = 0.0
    precision_mean = 0.0
    F1_mean = 0.0
    img_divide = 0  # number of images with at least one true-positive pixel
    tp_building_total = 0
    fp_building_total = 0
    fn_building_total = 0
    res = []  # per-image inference times (seconds)

    for img_name in img_names:
        imgA = cv2.imread(test_image_dir + img_name)
        imgA = cv2.resize(imgA, (512, 512))
        imgA = transform(imgA)
        imgA = imgA.cuda()
        imgA = imgA.unsqueeze(0)

        start = time.time()
        output = nnet.test(imgA)
        #output = torch.sigmoid(output)
        end = time.time()
        res.append(end - start)

        # argmax over the class channel, then invert so 1 = building
        output_np = output.cpu().detach().numpy().copy()
        output_np = np.argmax(output_np, axis=1)
        output_array = np.squeeze(output_np[0, ...])
        output_array = 1 - output_array
        output_array_save = 255 * output_array
        result_path = result_dir + img_name[:-4] + '.png'
        cv2.imwrite(result_path, output_array_save)

        # ground truth mask, binarized to {0, 1}
        gt = cv2.imread(test_label_dir + img_name, 0)
        gt = cv2.resize(gt, (512, 512))
        gt = gt / 255

        tp_building = np.sum((gt == 1) & (output_array == 1))
        fp_building = np.sum((gt != 1) & (output_array == 1))
        fn_building = np.sum((gt == 1) & (output_array != 1))
        tp_building_total = tp_building_total + tp_building
        fp_building_total = fp_building_total + fp_building
        fn_building_total = fn_building_total + fn_building

        # fix: only compute the ratios when tp > 0. The original computed
        # them first (0/0 -> NaN with runtime warnings) and then overwrote
        # the NaNs; the final values are identical.
        if tp_building == 0:
            iou_single = 0.0
            recall = 0.0
            precision = 0.0
            F1 = 0.0
        else:
            img_divide = img_divide + 1
            # denominators are all > 0 here because tp > 0
            recall = tp_building / float(tp_building + fn_building)
            precision = tp_building / float(tp_building + fp_building)
            F1 = 2 * recall * precision / (recall + precision)
            iou_single = tp_building / float(tp_building + fp_building +
                                             fn_building)

        iou_totoal_mean = iou_totoal_mean + iou_single
        recall_mean = recall_mean + recall
        precision_mean = precision_mean + precision
        F1_mean = F1_mean + F1

    # per-image averages over images that had at least one true positive
    iou_totoal_mean = np.divide(iou_totoal_mean, float(img_divide))
    recall_mean = np.divide(recall_mean, float(img_divide))
    precision_mean = np.divide(precision_mean, float(img_divide))
    F1_mean = np.divide(F1_mean, float(img_divide))
    print("iou=%f" % iou_totoal_mean)
    print("recall=%f" % recall_mean)
    print("precision=%f" % precision_mean)
    print("F1_mean=%f" % F1_mean)

    # dataset-level metrics from the pooled pixel counts
    iou_total = np.divide(
        tp_building_total,
        float(tp_building_total + fp_building_total + fn_building_total))
    recall_total = np.divide(tp_building_total,
                             float(tp_building_total + fn_building_total))
    precision_total = np.divide(tp_building_total,
                                float(tp_building_total + fp_building_total))
    F1_total = np.divide(2 * recall_total * precision_total,
                         (recall_total + precision_total))
    print("iou_total=%f" % iou_total)
    print("recall_total=%f" % recall_total)
    print("precision_total=%f" % precision_total)
    print("F1_mean_total=%f" % F1_total)

    # idiom: builtin sum() instead of a manual accumulation loop
    time_sum = sum(res)
    print("FPS: %f" % (1.0 / (time_sum / len(res))))
def train(start_iter=20150):
    """Epoch-style training loop for the MatrixNetAnchors segmentation model.

    Args:
        start_iter: iteration (epoch counter) to resume from; the default of
            20150 resumes from an existing snapshot.

    NOTE(review): the training loop below iterates test_dataloader, not
    train_dataloader — this looks like a bug (training on the test split);
    confirm against the module-level dataloader definitions.
    """
    learning_rate = system_configs.learning_rate
    max_iteration = system_configs.max_iter
    pretrained_model = system_configs.pretrain
    snapshot = system_configs.snapshot
    val_iter = system_configs.val_iter
    display = system_configs.display
    decay_rate = system_configs.decay_rate
    stepsize = system_configs.stepsize
    #vis = visdom.Visdom()
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print("building model...")
    nnet = NetworkFactory(train_dataloader)
    #nnet = nnet.cuda()
    #nnet = nn.DataParallel(nnet).cuda()
    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        # adapt the pretrained checkpoint, then load the modified copy
        change_feature(pretrained_model)
        nnet.load_pretrained_params(
            "./MatrixNetAnchorsResnet50_48LayerRange_640isize/nnet/MatrixNetAnchors/MatrixNetAnchors_50_modified.pkl"
        )
        #params = torch.load(pretrained_model)
        #nnet.load_state_dict({k.replace('module.',''):v for k,v in params['state_dict'].items()})

    if start_iter:
        # resuming: lr decay on resume is intentionally disabled here
        #learning_rate /= (decay_rate ** (start_iter // stepsize))
        #print(learning_rate)
        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    print("training start...")
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    nnet.cuda()
    with stdout_to_tqdm() as save_stdout:
        # each "iteration" here is a full pass over the dataloader (an epoch)
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout, ncols=80):
            loss_total = 0
            nnet.train_mode()
            # NOTE(review): iterates test_dataloader — see docstring
            for index, (ls1, ls_msk) in enumerate(test_dataloader):
                training_loss = nnet.train(ls1, ls_msk)
                #print(training_loss)
                loss_total = loss_total + training_loss

            # evaluation pass over the same dataloader, no gradients
            test_loss = 0
            nnet.eval_mode()
            with torch.no_grad():
                for index, (ls1, ls_msk) in enumerate(test_dataloader):
                    test_iter_loss = nnet.validate(ls1, ls_msk)
                    test_loss = test_loss + test_iter_loss
            print('epoch train loss = %f, epoch test loss = %f' %
                  (loss_total / len(test_dataloader),
                   test_loss / len(test_dataloader)))

            if display and iteration % display == 0:
                print("training loss at iteration {}: {}".format(
                    iteration, loss_total.item()))

            test_loss_iter = test_loss / len(test_dataloader)
            del loss_total
            del test_loss

            # save extra snapshots when the eval loss drops below a threshold
            if test_loss_iter < 0.0009:
                nnet.save_params(iteration)

            if iteration % snapshot == 0:
                nnet.save_params(iteration)
                #test()

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)
def train(training_dbs, validation_db, start_iter=0):
    """Train the network; hyper-parameters are read from the experiment json.

    Args:
        training_dbs: list of training dataset instances.
        validation_db: validation dataset instance.
        start_iter: iteration to resume from (0 = train from scratch).
    """
    # read hyper-parameters from the json config
    learning_rate = system_configs.learning_rate  # learning rate
    max_iteration = system_configs.max_iter  # max number of iterations
    pretrained_model = system_configs.pretrain  # pretrained model path
    snapshot = system_configs.snapshot  # save a snapshot every `snapshot` iterations
    val_iter = system_configs.val_iter  # validate every `val_iter` iterations
    display = system_configs.display  # print losses every `display` iterations
    decay_rate = system_configs.decay_rate  # lr decay factor
    stepsize = system_configs.stepsize  # iterations between lr decays

    # getting the size of each database
    training_size = len(training_dbs[0].db_inds)
    validation_size = len(validation_db.db_inds)

    # multiprocessing queues: worker processes cannot return values, so
    # fetched batches are exchanged through these bounded queues
    training_queue = Queue(
        system_configs.prefetch_size
    )  # holds at most prefetch_size prefetched batches
    validation_queue = Queue(5)  # holds at most 5

    # queue.Queue counterparts holding pinned data, shared with the
    # pin-memory threads below
    pinned_training_queue = queue.Queue(system_configs.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # load data sampling function (module named after the db, e.g. sample.coco)
    data_file = "sample.{}".format(training_dbs[0].data)
    sample_data = importlib.import_module(
        data_file).sample_data  # the batch-building function

    # allocate processes for parallel reading: init_parallel_jobs spawns
    # workers that call prefetch_data / sample_data (augmentation, ground
    # truth generation, ...)
    training_tasks = init_parallel_jobs(training_dbs, training_queue,
                                        sample_data, True)
    if val_iter:  # validation workers run without augmentation (False)
        validation_tasks = init_parallel_jobs([validation_db],
                                              validation_queue, sample_data,
                                              False)

    # semaphores (initial count 1) used to gate the pin-memory threads
    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()  # drop the count to 0 — released at shutdown
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)  # argument tuple
    training_pin_thread = threading.Thread(
        target=pin_memory, args=training_pin_args)  # pin-memory thread
    training_pin_thread.daemon = True  # exits with the main thread
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    # daemon threads need no join; they die when the main thread exits
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    print("building model...")
    nnet = NetworkFactory(training_dbs[0])  # build the network object

    if pretrained_model is not None:  # optionally start from a pretrained model
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:  # resuming (skipped when start_iter is 0)
        learning_rate /= (decay_rate**(start_iter // stepsize)
                          )  # decayed lr as of start_iter
        nnet.load_params(start_iter)  # load the snapshot saved at start_iter
        nnet.set_lr(learning_rate)  # apply the recomputed lr
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)  # fresh run: use the configured lr directly

    print("training start...")
    nnet.cuda()  # move the network to GPU
    nnet.train_mode()  # training mode
    with stdout_to_tqdm(
    ) as save_stdout:  # route print() through the tqdm progress bar
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout, ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss, focal_loss, pull_loss, push_loss, regr_loss = nnet.train(
                **training)
            #training_loss, focal_loss, pull_loss, push_loss, regr_loss, cls_loss = nnet.train(**training)

            if display and iteration % display == 0:  # print losses every `display` iterations
                print("training loss at iteration {}: {}".format(
                    iteration, training_loss.item()))
                print("focal loss at iteration {}: {}".format(
                    iteration, focal_loss.item()))
                print("pull loss at iteration {}: {}".format(
                    iteration, pull_loss.item()))
                print("push loss at iteration {}: {}".format(
                    iteration, push_loss.item()))
                print("regr loss at iteration {}: {}".format(
                    iteration, regr_loss.item()))
                #print("cls loss at iteration {}: {}\n".format(iteration, cls_loss.item()))
            del training_loss, focal_loss, pull_loss, push_loss, regr_loss  #, cls_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()  # switch to eval for validation
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("validation loss at iteration {}: {}".format(
                    iteration, validation_loss.item()))
                nnet.train_mode()  # back to training mode afterwards

            if iteration % snapshot == 0:  # periodic checkpoint
                nnet.save_params(iteration)

            if iteration % stepsize == 0:  # periodic lr decay
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)  # apply the decayed lr

    # signal the pin-memory threads to exit
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminate the data-fetching processes
    # NOTE(review): validation_tasks exists only when val_iter is truthy;
    # this loop would raise NameError otherwise — confirm val_iter is always set
    for training_task in training_tasks:
        training_task.terminate()
    for validation_task in validation_tasks:
        validation_task.terminate()
def test(db, split, testiter, debug=False, suffix=None):
    """Run CornerNet detection on every image in ``db._image_dir`` and display results.

    Loads the parameters saved at iteration ``testiter`` (or the final training
    iteration when ``testiter`` is None), then for each image file: decodes
    corner keypoints at every configured test scale (plus a horizontal flip),
    rescales and merges the detections, applies (merging) soft-NMS per
    category, caps the result at ``max_per_image`` boxes, draws boxes with
    confidence >= 0.3, and shows the image in an OpenCV window. Pressing 'q'
    during the 3-second display stops the loop early.

    Args:
        db:       dataset wrapper exposing COCO metadata (``_coco``,
                  ``_cat_ids``), ``_image_dir``, ``configs`` and normalization
                  stats (``mean``/``std``).
        split:    split name; only used to build ``result_dir``.
        testiter: checkpoint iteration to load; None -> ``system_configs.max_iter``.
        debug:    when True, restricts ``db_inds`` to the first 100 entries.
        suffix:   optional extra sub-directory appended to ``result_dir``.
    """
    result_dir = system_configs.result_dir
    result_dir = os.path.join(result_dir, str(testiter), split)

    # Collect category names from the COCO metadata (index 0 is skipped —
    # presumably the background slot; TODO confirm against db._cat_ids layout).
    class_name = []
    for i in range(1, len(db._coco.cats)):
        # if db._coco.cats[i] is None:
        #     continue
        # else:
        ind = db._cat_ids[i]
        class_name.append(db._coco.cats[ind]['name'])

    if suffix is not None:
        result_dir = os.path.join(result_dir, suffix)
    make_dirs([result_dir])

    # Fall back to the final training iteration when no checkpoint is given.
    test_iter = system_configs.max_iter if testiter is None else testiter
    print("loading parameters at iteration: {}".format(test_iter))

    print("building neural network...")
    nnet = NetworkFactory(db)
    print("loading parameters...")
    nnet.load_params(test_iter)

    # test_file = "test.{}".format(db.data)
    # testing = importlib.import_module(test_file).testing

    nnet.cuda()
    nnet.eval_mode()  # inference mode: no gradient updates

    debug_dir = os.path.join(result_dir, "debug")
    if not os.path.exists(debug_dir):
        os.makedirs(debug_dir)

    # NOTE(review): db_inds is computed here but the loop below iterates the
    # raw directory listing instead — db_inds appears unused in this function.
    if db.split != "trainval":
        db_inds = db.db_inds[:100] if debug else db.db_inds
    else:
        db_inds = db.db_inds[:100] if debug else db.db_inds[:5000]

    # Decoding / post-processing hyper-parameters from the dataset config.
    K = db.configs["top_k"]
    ae_threshold = db.configs["ae_threshold"]
    nms_kernel = db.configs["nms_kernel"]
    scales = db.configs["test_scales"]
    weight_exp = db.configs["weight_exp"]
    merge_bbox = db.configs["merge_bbox"]
    categories = db.configs["categories"]
    nms_threshold = db.configs["nms_threshold"]
    max_per_image = db.configs["max_per_image"]
    nms_algorithm = {
        "nms": 0,
        "linear_soft_nms": 1,
        "exp_soft_nms": 2
    }[db.configs["nms_algorithm"]]

    img_name = os.listdir(db._image_dir)
    for i in range(0, len(img_name)):
        top_bboxes = {}
        # for ind in tqdm(range(0, num_images), ncols=80, desc="locating kps"):
        db_ind = i + 1
        # image_id = db.image_ids(db_ind)
        image_id = img_name[i]
        image_file = db._image_dir + '/' + img_name[i]

        image = cv2.imread(image_file)
        height, width = image.shape[0:2]

        detections = []
        # Multi-scale inference: each scale is padded so the input dimensions
        # are of the form k*128 - 1 (the `| 127` trick), matching the 4x output
        # stride of the network.
        for scale in scales:
            new_height = int(height * scale)
            new_width = int(width * scale)
            new_center = np.array([new_height // 2, new_width // 2])

            inp_height = new_height | 127
            inp_width = new_width | 127

            images = np.zeros((1, 3, inp_height, inp_width), dtype=np.float32)
            ratios = np.zeros((1, 2), dtype=np.float32)
            borders = np.zeros((1, 4), dtype=np.float32)
            sizes = np.zeros((1, 2), dtype=np.float32)

            out_height, out_width = (inp_height + 1) // 4, (inp_width + 1) // 4
            height_ratio = out_height / inp_height
            width_ratio = out_width / inp_width

            resized_image = cv2.resize(image, (new_width, new_height))
            resized_image, border, offset = crop_image(resized_image, new_center, [inp_height, inp_width])

            resized_image = resized_image / 255.
            normalize_(resized_image, db.mean, db.std)

            images[0] = resized_image.transpose((2, 0, 1))  # HWC -> CHW
            borders[0] = border
            sizes[0] = [int(height * scale), int(width * scale)]
            ratios[0] = [height_ratio, width_ratio]

            # Batch the image with its horizontal flip for test-time augmentation.
            images = np.concatenate((images, images[:, :, :, ::-1]), axis=0)
            images = torch.from_numpy(images)

            dets = kp_decode(nnet, images, K, ae_threshold=ae_threshold, kernel=nms_kernel)
            dets = dets.reshape(2, -1, 8)
            # Un-flip the x-coordinates of the flipped copy's detections.
            dets[1, :, [0, 2]] = out_width - dets[1, :, [2, 0]]
            dets = dets.reshape(1, -1, 8)

            _rescale_dets(dets, ratios, borders, sizes)
            dets[:, :, 0:4] /= scale
            detections.append(dets)

        detections = np.concatenate(detections, axis=1)

        classes = detections[..., -1]
        classes = classes[0]
        detections = detections[0]

        # reject detections with negative scores
        keep_inds = (detections[:, 4] > -1)
        detections = detections[keep_inds]
        classes = classes[keep_inds]

        top_bboxes[image_id] = {}
        for j in range(categories):
            keep_inds = (classes == j)
            top_bboxes[image_id][j + 1] = detections[keep_inds][:, 0:7].astype(np.float32)
            # soft_nms(_merge) modify their argument in place.
            if merge_bbox:
                soft_nms_merge(top_bboxes[image_id][j + 1], Nt=nms_threshold, method=nms_algorithm, weight_exp=weight_exp)
            else:
                soft_nms(top_bboxes[image_id][j + 1], Nt=nms_threshold, method=nms_algorithm)
            top_bboxes[image_id][j + 1] = top_bboxes[image_id][j + 1][:, 0:5]

        # Keep only the max_per_image highest-scoring boxes across categories.
        scores = np.hstack([
            top_bboxes[image_id][j][:, -1] for j in range(1, categories + 1)
        ])
        if len(scores) > max_per_image:
            kth = len(scores) - max_per_image
            thresh = np.partition(scores, kth)[kth]
            for j in range(1, categories + 1):
                keep_inds = (top_bboxes[image_id][j][:, -1] >= thresh)
                top_bboxes[image_id][j] = top_bboxes[image_id][j][keep_inds]

        # result_json = os.path.join(result_dir, "results.json")
        detections = db.convert_to_list(top_bboxes)
        print('demo for {}'.format(image_id))

        img = cv2.imread(image_file)
        box = []
        if detections is not None:
            # NOTE(review): this inner loop reuses `i`, shadowing the outer
            # image index; harmless in Python's for (the outer iterator
            # reassigns it), but confusing — worth renaming.
            for i in range(len(detections)):
                name = db._coco.cats[detections[i][1]]['name']  # db._coco.cats[ind]['name']
                confi = detections[i][-1]
                if confi < 0.3:  # confidence threshold for display only
                    continue
                for j in range(0, 4):
                    box.append(detections[i][j + 2])
                cv2.rectangle(img, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])), (0, 255, 255), 1)
                # cv2.putText(img, name[0] + ' ' + '{:.3f}'.format(confi), (int(box[0]), int(box[1] - 10)),
                #             cv2.FONT_ITALIC, 1, (0, 0, 255), 1)
                # Clear the scratch box list before the next detection.
                while (box):
                    box.pop(-1)

        cv2.imshow('Detecting image...', img)
        # timer.total_time = 0
        # Show for up to 3 s; 'q' aborts the whole demo loop.
        if cv2.waitKey(3000) & 0xFF == ord('q'):
            break
        print(detections)
def train(training_dbs, validation_db, start_iter=0, freeze=False):
    """Train the network with asynchronous data prefetching and DETR-style logging.

    Worker processes push sampled batches into a multiprocessing queue; daemon
    threads pin those batches into thread-local queues; the main loop consumes
    pinned batches and logs losses through ``utils.MetricLogger``. Parameters
    are snapshotted every ``snapshot`` iterations and the learning rate decays
    by ``decay_rate`` every ``stepsize`` iterations.

    Args:
        training_dbs:  list of training dataset wrappers (one per reader task).
        validation_db: validation dataset wrapper.
        start_iter:    iteration to resume from; 0 starts fresh.
        freeze:        NOTE(review): accepted but never used in this body.
    """
    learning_rate = system_configs.learning_rate
    max_iteration = system_configs.max_iter
    pretrained_model = system_configs.pretrain
    snapshot = system_configs.snapshot
    val_iter = system_configs.val_iter
    display = system_configs.display
    decay_rate = system_configs.decay_rate
    stepsize = system_configs.stepsize
    batch_size = system_configs.batch_size

    # getting the size of each database
    training_size = len(training_dbs[0].db_inds)
    validation_size = len(validation_db.db_inds)

    # queues storing data for training
    training_queue = Queue(system_configs.prefetch_size)  # 5
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_configs.prefetch_size)  # 5
    pinned_validation_queue = queue.Queue(5)

    # load data sampling function
    data_file = "sample.{}".format(training_dbs[0].data)  # "sample.coco"
    sample_data = importlib.import_module(data_file).sample_data
    # print(type(sample_data)) # function

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(training_dbs, training_queue, sample_data)
    # NOTE(review): validation_tasks is only bound when val_iter is truthy,
    # but it is iterated unconditionally at cleanup below — val_iter == 0
    # would raise NameError there; verify configs always set val_iter > 0.
    if val_iter:
        validation_tasks = init_parallel_jobs([validation_db], validation_queue, sample_data)

    # Semaphores start released; acquire() here so release() at the end acts
    # as the shutdown signal for the pin_memory threads.
    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue, training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args)
    training_pin_thread.daemon = True  # don't block interpreter exit
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue, validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    print("building model...")
    nnet = NetworkFactory(flag=True)

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        # Recompute the decayed learning rate for the resume point.
        learning_rate /= (decay_rate**(start_iter // stepsize))
        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    print("training start...")
    nnet.cuda()
    nnet.train_mode()

    header = None
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    metric_logger.add_meter(
        'class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}'))

    with stdout_to_tqdm() as save_stdout:
        for iteration in metric_logger.log_every(tqdm(range(
                start_iter + 1, max_iteration + 1), file=save_stdout, ncols=67),
                print_freq=10, header=header):
            training = pinned_training_queue.get(block=True)
            viz_split = 'train'
            # Save visualizations only on display iterations.
            save = True if (display and iteration % display == 0) else False
            (set_loss, loss_dict) \
                = nnet.train(iteration, save, viz_split, **training)
            (loss_dict_reduced, loss_dict_reduced_unscaled,
             loss_dict_reduced_scaled, loss_value) = loss_dict
            metric_logger.update(loss=loss_value,
                                 **loss_dict_reduced_scaled,
                                 **loss_dict_reduced_unscaled)
            metric_logger.update(class_error=loss_dict_reduced['class_error'])
            metric_logger.update(lr=learning_rate)

            del set_loss  # free the graph/tensor before the next batch

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                viz_split = 'val'
                save = True
                validation = pinned_validation_queue.get(block=True)
                (val_set_loss, val_loss_dict) \
                    = nnet.validate(iteration, save, viz_split, **validation)
                (loss_dict_reduced, loss_dict_reduced_unscaled,
                 loss_dict_reduced_scaled, loss_value) = val_loss_dict
                print('[VAL LOG]\t[Saving training and evaluating images...]')
                metric_logger.update(loss=loss_value,
                                     **loss_dict_reduced_scaled,
                                     **loss_dict_reduced_unscaled)
                metric_logger.update(class_error=loss_dict_reduced['class_error'])
                metric_logger.update(lr=learning_rate)
                nnet.train_mode()  # back to training after validation

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

            # Once per (approximate) epoch, aggregate stats across processes.
            if iteration % (training_size // batch_size) == 0:
                metric_logger.synchronize_between_processes()
                print("Averaged stats:", metric_logger)

    # sending signal to kill the thread
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    for training_task in training_tasks:
        training_task.terminate()
    for validation_task in validation_tasks:
        validation_task.terminate()
def train(training_dbs, validation_db, start_iter=0):
    """Train a CornerNet model with asynchronous data prefetching.

    Worker processes sample batches into a multiprocessing queue, daemon
    threads pin the batches into thread queues, and the main loop consumes
    them: losses are printed every ``display`` iterations, validation runs
    every ``val_iter`` iterations, parameters are saved every ``snapshot``
    iterations, and the learning rate is divided by ``decay_rate`` every
    ``stepsize`` iterations.

    Args:
        training_dbs:  list of training dataset wrappers (one per reader task).
        validation_db: validation dataset wrapper.
        start_iter:    iteration to resume from; 0 starts from scratch.
    """
    learning_rate = system_configs.learning_rate    # base learning rate
    max_iteration = system_configs.max_iter         # total training iterations
    pretrained_model = system_configs.pretrain      # optional pretrained weights path
    snapshot = system_configs.snapshot              # save params every `snapshot` iters
    val_iter = system_configs.val_iter              # validate every `val_iter` iters
    display = system_configs.display                # print losses every `display` iters
    decay_rate = system_configs.decay_rate          # learning-rate decay factor
    stepsize = system_configs.stepsize              # decay interval in iterations

    # size of each database
    training_size = len(training_dbs[0].db_inds)
    validation_size = len(validation_db.db_inds)

    # multiprocessing queues holding raw sampled batches
    training_queue = Queue(system_configs.prefetch_size)
    validation_queue = Queue(5)

    # thread queues holding pinned (page-locked) batches ready for the GPU
    pinned_training_queue = queue.Queue(system_configs.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # load the data sampling function for this dataset type
    data_file = "sample.{}".format(training_dbs[0].data)
    sample_data = importlib.import_module(data_file).sample_data

    # allocate worker processes for parallel reading
    training_tasks = init_parallel_jobs(training_dbs, training_queue,
                                        sample_data, True)
    if val_iter:
        validation_tasks = init_parallel_jobs([validation_db], validation_queue,
                                              sample_data, False)
    else:
        # FIX: previously unbound when val_iter was 0, causing a NameError at
        # the cleanup loop below.
        validation_tasks = []

    # Semaphores start in the released state; acquiring here lets release()
    # at the end act as the shutdown signal for the pin_memory threads.
    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True  # don't block interpreter exit
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    # build the model
    print("building model...")
    nnet = NetworkFactory(training_dbs[0])

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        # recompute the decayed learning rate for the resume point
        learning_rate /= (decay_rate**(start_iter // stepsize))
        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    # start training
    print("training start...")
    nnet.cuda()
    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout,
                              ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss, focal_loss, pull_loss, push_loss, regr_loss = nnet.train(
                **training)

            if display and iteration % display == 0:
                print("training loss at iteration {}: {}".format(
                    iteration, training_loss.item()))
                print("focal loss at iteration {}: {}".format(
                    iteration, focal_loss.item()))
                print("pull loss at iteration {}: {}".format(
                    iteration, pull_loss.item()))
                print("push loss at iteration {}: {}".format(
                    iteration, push_loss.item()))
                print("regr loss at iteration {}: {}".format(
                    iteration, regr_loss.item()))

            # release loss tensors before the next batch
            del training_loss, focal_loss, pull_loss, push_loss, regr_loss

            # periodic validation
            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("validation loss at iteration {}: {}".format(
                    iteration, validation_loss.item()))
                nnet.train_mode()  # back to training mode

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # send the shutdown signal to the pin_memory threads
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminate the data prefetching processes
    for training_task in training_tasks:
        training_task.terminate()
    for validation_task in validation_tasks:
        validation_task.terminate()
def train(training_dbs, validation_db, validation_db_2, tb, suffix, cfg_file, es, start_iter):
    """Train with TensorBoard logging, optional early stopping, and per-epoch COCO eval.

    Same prefetch pipeline as the other `train` variants (worker processes ->
    multiprocessing queue -> pin_memory threads -> thread queue). Additionally:
    times two warm-up/inference passes before training, writes every loss to
    the TensorBoard writer ``tb``, runs ``kp_detection_train`` on
    ``validation_db_2`` once per epoch to log mAP/mAR, and — when ``es`` is
    True — delegates checkpointing to ``EarlyStopping`` instead of periodic
    snapshots.

    Args:
        training_dbs:    list of training dataset wrappers.
        validation_db:   validation dataset for loss computation.
        validation_db_2: validation dataset for per-epoch COCO evaluation.
        tb:              TensorBoard ``SummaryWriter``-like object.
        suffix:          run identifier used in the result directory path.
        cfg_file:        config name forwarded to EarlyStopping for saving.
        es:              enable early stopping (disables periodic snapshots).
        start_iter:      iteration to resume from; 0 starts fresh.
    """
    learning_rate = system_configs.learning_rate
    max_iteration = system_configs.max_iter
    pretrained_model = system_configs.pretrain
    snapshot = system_configs.snapshot
    val_iter = system_configs.val_iter
    display = system_configs.display
    decay_rate = system_configs.decay_rate
    stepsize = system_configs.stepsize

    # getting the size of each database
    training_size = len(training_dbs[0].db_inds)
    validation_size = len(validation_db.db_inds)
    #validation_2_size = len(validation_db_2.db_inds)

    # queues storing data for training
    training_queue = Queue(system_configs.prefetch_size)
    validation_queue = Queue(5)
    #validation_2_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_configs.prefetch_size)
    pinned_validation_queue = queue.Queue(5)
    #pinned_validation_2_queue = queue.Queue(5)

    # load data sampling function
    data_file = "sample.{}".format(training_dbs[0].data)
    sample_data = importlib.import_module(data_file).sample_data

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(training_dbs, training_queue,
                                        sample_data, True)
    # NOTE(review): validation_tasks is only bound when val_iter is truthy but
    # is iterated unconditionally at cleanup — val_iter == 0 would NameError.
    if val_iter:
        validation_tasks = init_parallel_jobs([validation_db], validation_queue,
                                              sample_data, False)
        #validation_2_tasks = init_parallel_jobs([validation_db_2], validation_2_queue, sample_data, False)

    # Semaphores start released; acquire() now so release() at the end acts
    # as the shutdown signal for the pin_memory threads.
    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    #validation_2_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()
    #validation_2_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    # validation_2_pin_args = (validation_2_queue, pinned_validation_2_queue, validation_2_pin_semaphore)
    # validation_2_pin_thread = threading.Thread(target=pin_memory, args=validation_2_pin_args)
    # validation_2_pin_thread.daemon = True
    # validation_2_pin_thread.start()

    print("building model...")
    nnet = NetworkFactory(training_dbs[0])  #, suffix)
    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        # recompute the decayed learning rate for the resume point
        learning_rate /= (decay_rate**(start_iter // stepsize))
        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    if es:
        early_stopping = EarlyStopping(patience=100, verbose=True)

    print("training start...")
    nnet.cuda()
    #nnet.cpu()
    #if suffix == 104:
    #    net = model_104(training_dbs[0])
    #    tb.add_graph(net, torch.rand(2, 3, 511, 511))#, torch.FloatTensor(training_dbs[0].db_inds))
    #elif suffix == 52:
    #    net = model_52(training_dbs[0])
    #    dummy_input = torch.randn(2, 3, 511, 511)
    #    tb.add_graph(net, dummy_input)
    #else:
    #    return
    #tb.close()

    ##### Model's Warm-up #####
    # First pass includes CUDA/context warm-up cost, hence timed separately.
    nnet.eval_mode()
    input = cv2.imread(training_dbs[0].image_file(0))
    start_time = time.time()
    detections = kp_detection(input, nnet, score_min=0.5)
    end_time = time.time()
    infer_time = end_time - start_time
    print("\n##################################################")
    print("Warm-up + Inference Time: " + str(infer_time * 1000) + "ms")
    print("##################################################")
    ###########################

    ##### Model's Inference Time #####
    # Second pass on the same image measures steady-state inference time.
    input = cv2.imread(training_dbs[0].image_file(0))
    start_time = time.time()
    detections = kp_detection(input, nnet, score_min=0.5)
    end_time = time.time()
    infer_time = end_time - start_time
    print("\n##################################################")
    print("Inference Time: " + str(infer_time * 1000) + "ms")
    print("##################################################")
    ##################################

    result_dir = system_configs.result_dir
    result_dir = os.path.join(result_dir, str("Training_Validation"),
                              str("val2017"), str(suffix))
    #if suffix is not None:
    #    result_dir = os.path.join(result_dir, suffix)
    make_dirs([result_dir])

    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout,
                              ncols=80):
            training = pinned_training_queue.get(block=True)
            #start_time = time.time()
            training_loss, focal_loss, pull_loss, push_loss, regr_loss = nnet.train(
                **training)
            #end_time = time.time()
            #infer_time = end_time - start_time
            #training_loss, focal_loss, pull_loss, push_loss, regr_loss, cls_loss = nnet.train(**training)
            #print("\nTotal Time per Iteration:" + str(infer_time) + "ms")
            #tb.add_scalar('Total Time (ms) vs Iteration', infer_time * 1000, iteration)

            if display and iteration % display == 0:
                print("\ntraining loss at iteration {}: {}".format(
                    iteration, training_loss.item()))
                print("focal loss at iteration {}: {}".format(
                    iteration, focal_loss.item()))
                print("pull loss at iteration {}: {}".format(
                    iteration, pull_loss.item()))
                print("push loss at iteration {}: {}".format(
                    iteration, push_loss.item()))
                print("regr loss at iteration {}: {}".format(
                    iteration, regr_loss.item()))
                #print("cls loss at iteration {}: {}\n".format(iteration, cls_loss.item()))

                # mirror every displayed loss to TensorBoard
                tb.add_scalar('Training Loss vs Iteration',
                              training_loss.item(), iteration)
                tb.add_scalar('Focal Loss vs Iteration',
                              focal_loss.item(), iteration)
                tb.add_scalar('Pull Loss vs Iteration',
                              pull_loss.item(), iteration)
                tb.add_scalar('Push Loss vs Iteration',
                              push_loss.item(), iteration)
                tb.add_scalar('Offset Loss vs Iteration',
                              regr_loss.item(), iteration)
                #tb.add_scalar('Class Loss vs Iteration', cls_loss.item(), iteration)

            # release loss tensors before the next batch
            del training_loss, focal_loss, pull_loss, push_loss, regr_loss  #, cls_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("\n##################################################")
                print("validation loss at iteration {}: {}".format(
                    iteration, validation_loss.item()))
                print("##################################################")
                tb.add_scalar('Validation Loss vs Iteration',
                              validation_loss.item(), iteration)
                if es:
                    # EarlyStopping tracks the best validation loss and saves
                    # the model itself.
                    early_stopping(validation_loss, iteration, nnet, cfg_file)
                nnet.train_mode()

            # iterations per epoch (integer division by batch size)
            epoch = len(training_dbs[0].db_inds) // system_configs.batch_size
            #print(epoch)
            if iteration % epoch == 0:  # Enter every epoch
                # Full COCO-style evaluation on the second validation set.
                nnet.eval_mode()
                stats = kp_detection_train(validation_db_2, nnet, result_dir)
                # stats follows the COCOeval 12-element summary layout.
                map_avg = stats[0]
                map_50 = stats[1]
                map_75 = stats[2]
                map_small = stats[3]
                map_medium = stats[4]
                map_large = stats[5]
                mar_1 = stats[6]
                mar_10 = stats[7]
                mar_100 = stats[8]
                mar_small = stats[9]
                mar_medium = stats[10]
                mar_large = stats[11]
                tb.add_scalar('Average mAP vs Epoch', map_avg, iteration / epoch)
                tb.add_scalar('mAP (IoU 0.5) vs Epoch', map_50, iteration / epoch)
                tb.add_scalar('mAP (IoU 0.75) vs Epoch', map_75, iteration / epoch)
                tb.add_scalar('mAP (Area = Small) vs Epoch', map_small,
                              iteration / epoch)
                tb.add_scalar('mAP (Area = Medium) vs Epoch', map_medium,
                              iteration / epoch)
                tb.add_scalar('mAP (Area = Large) vs Epoch', map_large,
                              iteration / epoch)
                tb.add_scalar('mAR (Max Detection = 1) vs Epoch', mar_1,
                              iteration / epoch)
                tb.add_scalar('mAR (Max Detection = 10) vs Epoch', mar_10,
                              iteration / epoch)
                tb.add_scalar('mAR (Max Detection = 100) vs Epoch', mar_100,
                              iteration / epoch)
                tb.add_scalar('mAR (Area = Small) vs Epoch', mar_small,
                              iteration / epoch)
                tb.add_scalar('mAR (Area = Medium) vs Epoch', mar_medium,
                              iteration / epoch)
                tb.add_scalar('mAR (Area = Large) vs Epoch', mar_large,
                              iteration / epoch)
                nnet.train_mode()

            if es and early_stopping.early_stop:
                print("Early stopping")
                break

            # periodic snapshots only when early stopping is not managing saves
            if not es:
                if iteration % snapshot == 0:
                    nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # sending signal to kill the thread
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    for training_task in training_tasks:
        training_task.terminate()
    for validation_task in validation_tasks:
        validation_task.terminate()
def train(training_dbs, validation_db, start_iter=0):
    """Train with AzureML metric logging and resume-from-latest support.

    Variant of the prefetch-based training loop that logs losses to the
    AzureML ``Run`` context, validates by sampling directly from
    ``validation_db`` (no validation worker processes), and supports
    ``start_iter == -1`` to resume from the newest snapshot found in
    ``system_configs.snapshot_dir``.

    Args:
        training_dbs:  list of training dataset wrappers (one per reader task).
        validation_db: validation dataset wrapper.
        start_iter:    iteration to resume from; 0 starts fresh, -1 resumes
                       from the latest snapshot on disk.
    """
    learning_rate = system_configs.learning_rate
    max_iteration = system_configs.max_iter
    pretrained_model = system_configs.pretrain
    snapshot = system_configs.snapshot
    val_iter = system_configs.val_iter
    display = system_configs.display
    decay_rate = system_configs.decay_rate
    stepsize = system_configs.stepsize
    val_ind = 0  # rolling index into the validation set

    print("building model...")
    nnet = NetworkFactory(training_dbs[0])

    # getting the size of each database
    training_size = len(training_dbs[0].db_inds)
    validation_size = len(validation_db.db_inds)

    # queue storing raw sampled data for training
    training_queue = Queue(32)

    # queue storing pinned data ready for the GPU
    pinned_training_queue = queue.Queue(32)

    # load data sampling function for this dataset type
    data_file = "sample.{}".format(training_dbs[0].data)
    sample_data = importlib.import_module(data_file).sample_data

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(training_dbs, training_queue,
                                        sample_data, True)

    # Semaphore starts released; acquire() now so release() at the end acts
    # as the shutdown signal for the pin_memory thread.
    training_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    # AzureML run context used for train/val loss metric logging
    run = Run.get_context()

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        if start_iter == -1:
            print("training starts from the latest iteration")
            save_list = os.listdir(system_configs.snapshot_dir)
            # NOTE(review): lexicographic sort picks "latest" by string order
            # ("9000" > "10000") — assumes zero-padded snapshot names; verify
            # the snapshot naming scheme.
            save_list.sort(reverse=True)
            if len(save_list) > 0:
                target_save = save_list[0]
                # first number embedded in the filename is the iteration
                start_iter = int(re.findall(r'\d+', target_save)[0])
                learning_rate /= (decay_rate ** (start_iter // stepsize))
                nnet.load_params(start_iter)
            else:
                start_iter = 0
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    print("training start...")
    nnet.cuda()
    nnet.train_mode()

    # ensure the AzureML output directory exists
    if not os.path.exists('./outputs'):
        os.makedirs('./outputs')
        print('outputs file created')
    else:
        print(os.listdir('./outputs'))

    error_count = 0
    for iteration in tqdm(range(start_iter + 1, max_iteration + 1)):
        try:
            training = pinned_training_queue.get(block=True)
        # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt /
        # SystemExit; narrowed to Exception while keeping the best-effort
        # retry behavior.
        except Exception:
            print('Error when extracting data')
            error_count += 1
            # give up after more than 10 failed fetches
            if error_count > 10:
                print('failed')
                time.sleep(1)
                break
            continue

        training_loss = nnet.train(**training)

        if display and iteration % display == 0:
            print("training loss at iteration {}: {}".format(
                iteration, training_loss.item()))
            run.log('train_loss', training_loss.item())

        if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
            nnet.eval_mode()
            # sample a validation batch directly (no augmentation)
            validation, val_ind = sample_data(validation_db, val_ind,
                                              data_aug=False)
            validation_loss = nnet.validate(**validation)
            print("validation loss at iteration {}: {}".format(
                iteration, validation_loss.item()))
            run.log('val_loss', validation_loss.item())
            nnet.train_mode()

        if iteration % snapshot == 0:
            nnet.save_params(iteration)

        if iteration % stepsize == 0:
            learning_rate /= decay_rate
            nnet.set_lr(learning_rate)

    # sending signal to kill the thread
    training_pin_semaphore.release()

    # terminating data fetching processes
    for training_task in training_tasks:
        training_task.terminate()