Example #1
def train(training_dbs, validation_db, start_iter=0):
    learning_rate = system_configs.learning_rate
    max_iteration = system_configs.max_iter
    pretrained_model = system_configs.pretrain
    snapshot = system_configs.snapshot
    val_iter = system_configs.val_iter
    display = system_configs.display
    decay_rate = system_configs.decay_rate
    stepsize = system_configs.stepsize

    # getting the size of each database
    training_size = len(training_dbs[0].db_inds)
    validation_size = len(validation_db.db_inds)

    # queues storing data for training
    training_queue = Queue(system_configs.prefetch_size)
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_configs.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # load data sampling function
    data_file = "sample.{}".format(training_dbs[0].data)
    sample_data = importlib.import_module(data_file).sample_data

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(training_dbs, training_queue,
                                        sample_data, True)
    if val_iter:
        validation_tasks = init_parallel_jobs([validation_db],
                                              validation_queue, sample_data,
                                              False)

    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    print("building model...")
    nnet = NetworkFactory(training_dbs[0])

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        learning_rate /= (decay_rate**(start_iter // stepsize))

        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    # defining tensorboard writer
    tensorboard = Tensorboard('logs')

    print("training start...")
    nnet.cuda()
    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout,
                              ncols=80):
            training = pinned_training_queue.get(block=True)
            out_train = nnet.train(**training)

            if display and iteration % display == 0:
                for idX, eleX in enumerate(
                    ["training", "focal", "pull", "push", "regr"]):
                    print("{} loss at iteration {}: {}".format(
                        eleX, iteration, out_train[idX].item()))
                    tensorboard.log_scalar('training/{} loss'.format(eleX),
                                           out_train[idX].item(), iteration)

            del out_train

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("validation loss at iteration {}: {}".format(
                    iteration, validation_loss.item()))
                tensorboard.log_scalar('validation/loss',
                                       validation_loss.item(), iteration)
                if iteration % (val_iter * 2) == 0:
                    kp_detection(validation_db,
                                 nnet,
                                 "./cache/",
                                 debug=False,
                                 subset_val=True,
                                 TB_obj=tensorboard,
                                 TB_iter=iteration)
                nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # closing tensorboard writer
    tensorboard.close()

    # sending signal to kill the thread
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    for training_task in training_tasks:
        training_task.terminate()
    for validation_task in validation_tasks:
        validation_task.terminate()
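
Example #1 logs through a Tensorboard helper (constructed as Tensorboard('logs'), with log_scalar and close methods) that is defined outside the snippet. A minimal sketch, assuming it is a thin wrapper over torch.utils.tensorboard.SummaryWriter; only the class and method names come from the calls above, everything else is an assumption:

# Hypothetical sketch of the Tensorboard wrapper used in Example #1.
from torch.utils.tensorboard import SummaryWriter


class Tensorboard:
    def __init__(self, log_dir):
        self.writer = SummaryWriter(log_dir=log_dir)

    def log_scalar(self, tag, value, step):
        # backs calls like tensorboard.log_scalar('training/focal loss', v, iteration)
        self.writer.add_scalar(tag, value, step)

    def close(self):
        self.writer.close()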
Example #2
File: train.py Project: yawudede/CPNDet
def train(training_dbs, validation_db, start_iter=0):
    learning_rate = system_configs.learning_rate
    max_iteration = system_configs.max_iter
    pretrained_model = system_configs.pretrain
    snapshot = system_configs.snapshot
    val_iter = system_configs.val_iter
    display = system_configs.display
    decay_rate = system_configs.decay_rate
    stepsize = system_configs.stepsize

    # getting the size of each database
    training_size = len(training_dbs[0].db_inds)
    validation_size = len(validation_db.db_inds)

    # queues storing data for training
    training_queue = Queue(system_configs.prefetch_size)
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_configs.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # load data sampling function
    data_file = "sample.{}".format(training_dbs[0].data)
    sample_data = importlib.import_module(data_file).sample_data

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(training_dbs, training_queue,
                                        sample_data, True)
    if val_iter:
        validation_tasks = init_parallel_jobs([validation_db],
                                              validation_queue, sample_data,
                                              False)

    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    print("building model...")
    nnet = NetworkFactory(training_dbs[0])

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        learning_rate /= (decay_rate**(start_iter // stepsize))
        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    print("training start...")
    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout,
                              ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss, focal_loss, grouping_loss, region_loss, regr_loss = nnet.train(
                **training)
            if display and iteration % display == 0:
                print(
                    "iter {}, all: {:.4f}, focal: {:.4f}, grouping:{:.4f}, region: {:.4f}, regr: {:.4f}"
                    .format(iteration, training_loss.item(), focal_loss.item(),
                            grouping_loss.item(), region_loss.item(),
                            regr_loss.item()))

            del training_loss, focal_loss, grouping_loss, region_loss, regr_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("validation loss at iteration {}: {}".format(
                    iteration, validation_loss.item()))
                nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # sending signal to kill the thread
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    for training_task in training_tasks:
        training_task.terminate()
    for validation_task in validation_tasks:
        validation_task.terminate()
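
Every example starts a pin_memory thread with (queue, pinned_queue, semaphore) but never shows the function. A minimal sketch, assuming the CornerNet-style worker: drain the multiprocessing queue, pin each tensor so host-to-GPU copies can be asynchronous, and exit once the main thread releases the shutdown semaphore (the release() calls at the end of each example). The "xs"/"ys" field names are an assumption about what sample_data produces.

# Sketch of the assumed pin_memory worker; not shown in these examples.
def pin_memory(data_queue, pinned_data_queue, semaphore):
    while True:
        data = data_queue.get()

        # pin every tensor in the batch ("xs"/"ys" layout is an assumption)
        data["xs"] = [x.pin_memory() for x in data["xs"]]
        data["ys"] = [y.pin_memory() for y in data["ys"]]

        pinned_data_queue.put(data)

        # the main thread holds the semaphore for the whole run; a successful
        # non-blocking acquire means it was released, i.e. time to shut down
        if semaphore.acquire(blocking=False):
            return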
Example #3
def train(training_dbs, validation_db, start_iter=0):
    # read hyperparameters from the json config
    learning_rate = system_configs.learning_rate  # learning rate
    max_iteration = system_configs.max_iter  # maximum number of iterations
    pretrained_model = system_configs.pretrain  # pretrained model
    snapshot = system_configs.snapshot  # save the model every snapshot iterations
    val_iter = system_configs.val_iter  # validate every val_iter iterations
    display = system_configs.display  # print the loss every display iterations
    decay_rate = system_configs.decay_rate  # learning-rate decay factor
    stepsize = system_configs.stepsize  # interval (in iterations) between decays

    # getting the size of each database
    training_size = len(training_dbs[0].db_inds)
    validation_size = len(validation_db.db_inds)

    # queues storing data for training
    training_queue = Queue(system_configs.prefetch_size)
    # prefetch_size is the amount of data to prefetch; Queue is the
    # multiprocessing queue, holding at most prefetch_size entries
    validation_queue = Queue(5)  # queue holds at most 5 entries

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_configs.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # load data sampling function
    data_file = "sample.{}".format(training_dbs[0].data)  #
    sample_data = importlib.import_module(
        data_file).sample_data  # 将data_file导入,

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(training_dbs, training_queue,
                                        sample_data, True)
    # init_parallel_jobs spawns worker processes that call prefetch_data to
    # prefetch batches, which in turn calls sample_data; these steps cover data
    # augmentation, generation of the various ground truths, and so on
    if val_iter:  # data augmentation is disabled for validation
        validation_tasks = init_parallel_jobs([validation_db],
                                              validation_queue, sample_data,
                                              False)

    training_pin_semaphore = threading.Semaphore()  # semaphore, usable to cap thread concurrency
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()  # take the semaphore so its counter drops to zero
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)  # argument tuple
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)  # create the memory-pinning thread
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    # daemon must be set before start() and defaults to False; True means the
    # main thread does not wait for this thread, which dies when the main thread exits
    validation_pin_thread.daemon = True
    validation_pin_thread.start()  # start the worker thread

    print("building model...")
    nnet = NetworkFactory(training_dbs[0])  # build the network object

    if pretrained_model is not None:  # check whether a pretrained model is used
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)  # load the pretrained parameters

    if start_iter:  # skipped when start_iter is 0
        learning_rate /= (decay_rate**(start_iter // stepsize))  # recompute the learning rate from start_iter

        nnet.load_params(start_iter)  # load the checkpoint for start_iter, i.e. resume training from that point
        nnet.set_lr(learning_rate)  # push the learning rate computed above into the model
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)  # training from scratch: use the learning rate from the config file

    print("training start...")
    nnet.cuda()  # move the network to the GPU
    nnet.train_mode()  # training mode
    # tqdm is a Python progress-bar library; wrapping a long loop as
    # tqdm(iterator) adds a progress display
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout,
                              ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss, focal_loss, pull_loss, push_loss, regr_loss = nnet.train(
                **training)
            #training_loss, focal_loss, pull_loss, push_loss, regr_loss, cls_loss = nnet.train(**training)

            if display and iteration % display == 0:  # print the training loss every display iterations
                print("training loss at iteration {}: {}".format(
                    iteration, training_loss.item()))
                print("focal loss at iteration {}:    {}".format(
                    iteration, focal_loss.item()))
                print("pull loss at iteration {}:     {}".format(
                    iteration, pull_loss.item()))
                print("push loss at iteration {}:     {}".format(
                    iteration, push_loss.item()))
                print("regr loss at iteration {}:     {}".format(
                    iteration, regr_loss.item()))
                #print("cls loss at iteration {}:      {}\n".format(iteration, cls_loss.item()))

            del training_loss, focal_loss, pull_loss, push_loss, regr_loss  #, cls_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:  # print the validation loss every val_iter iterations
                nnet.eval_mode()  # eval mode
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("validation loss at iteration {}: {}".format(
                    iteration, validation_loss.item()))
                nnet.train_mode()  # switch back to training mode after validation

            if iteration % snapshot == 0:  # save the model every snapshot iterations
                nnet.save_params(iteration)

            if iteration % stepsize == 0:  # decay the learning rate every stepsize iterations
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)  # push the decayed learning rate into the model

    # sending signal to kill the thread
    training_pin_semaphore.release()  # signal the pin threads to exit
    validation_pin_semaphore.release()

    # terminating data fetching processes
    for training_task in training_tasks:
        training_task.terminate()
    for validation_task in validation_tasks:
        validation_task.terminate()
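
The stdout_to_tqdm context manager that every loop enters is also defined elsewhere. A plausible sketch, assuming it swaps sys.stdout for a file-like object that routes writes through tqdm.write so the print statements inside the loop do not corrupt the progress bar, and yields the original stream for tqdm(file=...):

import sys
from contextlib import contextmanager

from tqdm import tqdm


class TqdmFile(object):
    """File-like object that routes writes through tqdm.write."""

    def __init__(self, file):
        self.file = file

    def write(self, x):
        if len(x.rstrip()) > 0:  # skip the bare newlines print() emits
            tqdm.write(x, file=self.file)

    def flush(self):
        getattr(self.file, "flush", lambda: None)()


@contextmanager
def stdout_to_tqdm():
    save_stdout = sys.stdout
    try:
        sys.stdout = TqdmFile(sys.stdout)
        yield save_stdout  # handed to tqdm(..., file=save_stdout) above
    finally:
        sys.stdout = save_stdout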
Example #4
def train(training_dbs, validation_db, start_iter=0):
    # train.py --> here
    # training_dbs = [MSCOCO x 4], using the dataset specified by "trainval2014"
    # validation_db is an MSCOCO instance; its configs are first read from CenterNet-104.json
    # start_iter comes from args.start_iter; it should be 0

    learning_rate = system_configs.learning_rate  # 0.00025
    max_iteration = system_configs.max_iter  # 480000
    pretrained_model = system_configs.pretrain  # None
    snapshot = system_configs.snapshot  # 5000
    val_iter = system_configs.val_iter  # 500
    display = system_configs.display  # 5
    decay_rate = system_configs.decay_rate  # 10
    stepsize = system_configs.stepsize  # 450000
    # for all the hyperparameters above, check CenterNet-104.json first,
    # then check config.py

    # getting the size of each database
    training_size = len(training_dbs[0].db_inds)
    validation_size = len(validation_db.db_inds)

    # queues storing data for training
    training_queue = Queue(system_configs.prefetch_size)
    # prefetch_size = 6; you can find this number in CenterNet.json
    validation_queue = Queue(5)
    # Queue lets the torch.multiprocessing workers share and exchange data;
    # since a torch.multiprocessing worker can't return a value, results are
    # passed back through the Queue

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_configs.prefetch_size)
    pinned_validation_queue = queue.Queue(5)
    # queue.Queue is how threads share data

    # load data sampling function
    data_file = "sample.{}".format(training_dbs[0].data)
    # "sample.coco"; there is a coco.py in the sample directory

    sample_data = importlib.import_module(data_file).sample_data
    # importlib.import_module(data_file) imports sample/coco.py, which defines
    # a function named sample_data; so sample_data here is that function

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(training_dbs, training_queue,
                                        sample_data, True)
    # training_dbs is a list of four MSCOCO instances; an MSCOCO instance loads the annotation data
    # training_queue is a queue of capacity 6
    # sample_data is a function
    # four workers, each loading a batch of data: images, heatmaps, locations in the flattened image, and the fractional parts of keypoints
    # training_tasks is a list of torch.multiprocessing.Process objects, so once
    # each Process is start()ed, the original images and annotation files are
    # processed into the format matching the network's input shape;
    # data for training is stored in training_queue

    if val_iter:
        validation_tasks = init_parallel_jobs([validation_db],
                                              validation_queue, sample_data,
                                              False)
        # data for validation will be stored in validation_queue.

    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    # class threading.Semaphore([value])
    # value is an internal counter; it defaults to 1, and a value below 0 raises
    # ValueError. Semaphores can be used to cap the number of concurrent threads.
    # Here the semaphore uses the default value, so it is 1.
    # Semaphore is a class in Python's built-in threading module. It manages a
    # counter: each acquire() call decrements it and each release() call
    # increments it. The counter defaults to 1 and can never go below 0; when it
    # is 0, a thread calling acquire() waits until release() is called.
    # This property can therefore be used to limit the number of threads.
    # Example:
    # from threading import Thread, Semaphore
    # import time
    #
    #
    # def test(a):
    #     # print the thread's name
    #     print(t.name)
    #     print(a)
    #     time.sleep(2)
    #     # release the semaphore
    #     sem.release()
    #
    # # set the counter to 5
    # sem = Semaphore(5)
    # for i in range(10):
    #     # acquire a semaphore slot
    #     sem.acquire()
    #     t = Thread(target=test, args=(i, ))
    #     t.start()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    # training_queue holds images, heatmaps, and location regressions
    # pinned_training_queue is empty at this point; it is filled later by the pin_memory function
    # training_pin_semaphore is a counter
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    # Python's Thread class represents an activity that runs in a separate
    # thread of control; one way to specify the activity is to pass a callable
    # object to the constructor:
    # https://blog.csdn.net/drdairen/article/details/60962439
    # target is a function
    # args holds the inputs for that function
    # pin_memory moves each batch into pinned (page-locked) memory so it can be
    # transferred to the GPU quickly, one batch at a time
    training_pin_thread.daemon = True
    # a daemon thread fits the case where you need something running for the
    # whole program, e.g. monitoring other services or sending heartbeats: once
    # created you never have to manage it; it exits when the main thread exits.
    # So this line keeps the thread that feeds batches toward the GPU, batch by
    # batch, alive for the whole training stage
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()
    # the four lines above feed validation data toward the GPU batch by batch

    print("building model...")
    nnet = NetworkFactory(training_dbs[0])

    if pretrained_model is not None:  # CenterNet-104.json sets pretrain to None, so this block is skipped
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:  # start_iter is 0
        learning_rate /= (decay_rate**(start_iter // stepsize))

        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)  # set the learning rate to 0.00025
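    # Worked example of the resume arithmetic above (hypothetical resume at
    # iteration 450000 with the values listed at the top of this function):
    #   0.00025 / 10 ** (450000 // 450000) = 2.5e-05,
    # the same rate the stepwise decay at the bottom of the loop would have
    # reached by that point.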

    print("training start...")
    nnet.cuda()
    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout,
                              ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss, focal_loss, pull_loss, push_loss, regr_loss = nnet.train(
                **training)
            #training_loss, focal_loss, pull_loss, push_loss, regr_loss, cls_loss = nnet.train(**training)

            if display and iteration % display == 0:
                print("training loss at iteration {}: {}".format(
                    iteration, training_loss.item()))
                print("focal loss at iteration {}:    {}".format(
                    iteration, focal_loss.item()))
                print("pull loss at iteration {}:     {}".format(
                    iteration, pull_loss.item()))
                print("push loss at iteration {}:     {}".format(
                    iteration, push_loss.item()))
                print("regr loss at iteration {}:     {}".format(
                    iteration, regr_loss.item()))
                #print("cls loss at iteration {}:      {}\n".format(iteration, cls_loss.item()))

            del training_loss, focal_loss, pull_loss, push_loss, regr_loss  #, cls_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("validation loss at iteration {}: {}".format(
                    iteration, validation_loss.item()))
                nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # sending signal to kill the thread
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    for training_task in training_tasks:
        training_task.terminate()
    for validation_task in validation_tasks:
        validation_task.terminate()
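
init_parallel_jobs and prefetch_data, which the comments in this example describe, are defined outside the snippet. A minimal sketch consistent with that description: one torch.multiprocessing.Process per database, each looping over sample_data and pushing finished batches into the shared queue (the exact sample_data signature is an assumption):

import traceback

from torch.multiprocessing import Process


def prefetch_data(db, queue, sample_data, data_aug):
    ind = 0
    while True:
        try:
            # assumed signature: returns one batch plus the next read index
            data, ind = sample_data(db, ind, data_aug=data_aug)
            queue.put(data)
        except Exception:
            traceback.print_exc()


def init_parallel_jobs(dbs, queue, fn, data_aug):
    tasks = [Process(target=prefetch_data, args=(db, queue, fn, data_aug))
             for db in dbs]
    for task in tasks:
        task.daemon = True  # die with the main process
        task.start()
    return tasks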
Example #5
def train(start_iter=20150):
    learning_rate = system_configs.learning_rate
    max_iteration = system_configs.max_iter
    pretrained_model = system_configs.pretrain
    snapshot = system_configs.snapshot
    val_iter = system_configs.val_iter
    display = system_configs.display
    decay_rate = system_configs.decay_rate
    stepsize = system_configs.stepsize

    #vis = visdom.Visdom()

    #  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("building model...")
    nnet = NetworkFactory(train_dataloader)
    #nnet = nnet.cuda()
    #nnet = nn.DataParallel(nnet).cuda()
    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        change_feature(pretrained_model)
        nnet.load_pretrained_params(
            "./MatrixNetAnchorsResnet50_48LayerRange_640isize/nnet/MatrixNetAnchors/MatrixNetAnchors_50_modified.pkl"
        )
        #params = torch.load(pretrained_model)
        #nnet.load_state_dict({k.replace('module.',''):v for k,v in params['state_dict'].items()})

    if start_iter:
        #learning_rate /= (decay_rate ** (start_iter // stepsize))
        #print(learning_rate)
        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    print("training start...")
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    nnet.cuda()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout,
                              ncols=80):
            loss_total = 0
            nnet.train_mode()
            for index, (ls1, ls_msk) in enumerate(train_dataloader):
                training_loss = nnet.train(ls1, ls_msk)
                #print(training_loss)
                loss_total = loss_total + training_loss

            test_loss = 0
            nnet.eval_mode()
            with torch.no_grad():
                for index, (ls1, ls_msk) in enumerate(test_dataloader):
                    test_iter_loss = nnet.validate(ls1, ls_msk)
                    test_loss = test_loss + test_iter_loss

            print('epoch train loss = %f, epoch test loss = %f' %
                  (loss_total / len(train_dataloader),
                   test_loss / len(test_dataloader)))

            if display and iteration % display == 0:
                print("training loss at iteration {}: {}".format(
                    iteration, loss_total.item()))

            test_loss_iter = test_loss / len(test_dataloader)
            del loss_total
            del test_loss

            if test_loss_iter < 0.0009:
                nnet.save_params(iteration)

            if iteration % snapshot == 0:
                nnet.save_params(iteration)
                #test()

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)
Example #6
def train(training_dbs, validation_db, start_iter=0, debug=False):
    learning_rate    = system_configs.learning_rate
    max_iteration    = system_configs.max_iter
    pretrained_model = system_configs.pretrain
    snapshot         = system_configs.snapshot
    # val_iter         = system_configs.val_iter
    display          = system_configs.display
    decay_rate       = system_configs.decay_rate
    stepsize         = system_configs.stepsize

    training_size   = len(training_dbs[0].db_inds)


    training_queue   = Queue(system_configs.prefetch_size)
    # validation_queue = Queue(5)

    pinned_training_queue   = queue.Queue(system_configs.prefetch_size)
    # pinned_validation_queue = queue.Queue(5)


    data_file   = "sample.{}".format(training_dbs[0].data)

    sample_data = importlib.import_module(data_file).sample_data

    training_tasks   = init_parallel_jobs(
        training_dbs, training_queue, sample_data, True, debug)
    # if val_iter:
    #     validation_tasks = init_parallel_jobs([validation_db], validation_queue, sample_data, False)

    training_pin_semaphore   = threading.Semaphore()
    # validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    # validation_pin_semaphore.acquire()

    training_pin_args   = (training_queue, pinned_training_queue, training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    # validation_pin_args   = (validation_queue, pinned_validation_queue, validation_pin_semaphore)
    # validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args)
    # validation_pin_thread.daemon = True
    # validation_pin_thread.start()

    print("building model...")

    nnet = NetworkFactory(training_dbs[0])

    # if pretrained_model is not None:
    #     if not os.path.exists(pretrained_model):
    #         raise ValueError("pretrained model does not exist")
    #     print("loading from pretrained model")
    #     nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        learning_rate /= (decay_rate ** (start_iter // stepsize))

        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    print("training start...")
    nnet.cuda()
    nnet.train_mode()
    avg_loss = AverageMeter()

    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1), file=save_stdout, ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss = nnet.train(**training)
            avg_loss.update(training_loss.item())

            if display and iteration % display == 0:
                print("training loss at iteration {}: {:.6f} ({:.6f})".format(
                    iteration, training_loss.item(), avg_loss.avg))
            del training_loss

            # if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
            #     nnet.eval_mode()
            #     validation = pinned_validation_queue.get(block=True)
            #     validation_loss = nnet.validate(**validation)
            #     print("validation loss at iteration {}: {}".format(iteration, validation_loss.item()))
            #     nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % 100 == 0:
                nnet.save_params(-1)
                avg_loss = AverageMeter()

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # sending signal to kill the thread
    training_pin_semaphore.release()
    # validation_pin_semaphore.release()

    # terminating data fetching processes
    for training_task in training_tasks:
        training_task.terminate()
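
This example smooths the displayed loss with an AverageMeter that it replaces every 100 iterations; the class itself is not shown. A minimal sketch, assuming the usual sum/count meter behind the update() calls and the avg attribute used above:

class AverageMeter:
    """Running mean of a scalar; reset by constructing a fresh instance."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def update(self, value, n=1):
        self.sum += value * n
        self.count += n

    @property
    def avg(self):
        return self.sum / max(self.count, 1)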
Example #7
File: train.py Project: zhu-del/LSTR
def train(training_dbs, validation_db, start_iter=0, freeze=False):
    learning_rate = system_configs.learning_rate
    max_iteration = system_configs.max_iter
    pretrained_model = system_configs.pretrain
    snapshot = system_configs.snapshot
    val_iter = system_configs.val_iter
    display = system_configs.display
    decay_rate = system_configs.decay_rate
    stepsize = system_configs.stepsize
    batch_size = system_configs.batch_size

    # getting the size of each database
    training_size = len(training_dbs[0].db_inds)
    validation_size = len(validation_db.db_inds)

    # queues storing data for training
    training_queue = Queue(system_configs.prefetch_size)  # 5
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_configs.prefetch_size)  # 5
    pinned_validation_queue = queue.Queue(5)

    # load data sampling function
    data_file = "sample.{}".format(training_dbs[0].data)  # "sample.coco"
    sample_data = importlib.import_module(data_file).sample_data
    # print(type(sample_data)) # function

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(training_dbs, training_queue,
                                        sample_data)
    if val_iter:
        validation_tasks = init_parallel_jobs([validation_db],
                                              validation_queue, sample_data)

    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    print("building model...")
    nnet = NetworkFactory(flag=True)

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        learning_rate /= (decay_rate**(start_iter // stepsize))

        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    print("training start...")
    nnet.cuda()
    nnet.train_mode()
    header = None
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    metric_logger.add_meter(
        'class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}'))

    with stdout_to_tqdm() as save_stdout:
        for iteration in metric_logger.log_every(tqdm(range(
                start_iter + 1, max_iteration + 1),
                                                      file=save_stdout,
                                                      ncols=67),
                                                 print_freq=10,
                                                 header=header):

            training = pinned_training_queue.get(block=True)
            viz_split = 'train'
            save = True if (display and iteration % display == 0) else False
            (set_loss, loss_dict) \
                = nnet.train(iteration, save, viz_split, **training)
            (loss_dict_reduced, loss_dict_reduced_unscaled,
             loss_dict_reduced_scaled, loss_value) = loss_dict
            metric_logger.update(loss=loss_value,
                                 **loss_dict_reduced_scaled,
                                 **loss_dict_reduced_unscaled)
            metric_logger.update(class_error=loss_dict_reduced['class_error'])
            metric_logger.update(lr=learning_rate)

            del set_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                viz_split = 'val'
                save = True
                validation = pinned_validation_queue.get(block=True)
                (val_set_loss, val_loss_dict) \
                    = nnet.validate(iteration, save, viz_split, **validation)
                (loss_dict_reduced, loss_dict_reduced_unscaled,
                 loss_dict_reduced_scaled, loss_value) = val_loss_dict
                print('[VAL LOG]\t[Saving training and evaluating images...]')
                metric_logger.update(loss=loss_value,
                                     **loss_dict_reduced_scaled,
                                     **loss_dict_reduced_unscaled)
                metric_logger.update(
                    class_error=loss_dict_reduced['class_error'])
                metric_logger.update(lr=learning_rate)
                nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

            if iteration % (training_size // batch_size) == 0:
                metric_logger.synchronize_between_processes()
                print("Averaged stats:", metric_logger)

    # sending signal to kill the thread
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    for training_task in training_tasks:
        training_task.terminate()
    for validation_task in validation_tasks:
        validation_task.terminate()
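
The utils.MetricLogger and utils.SmoothedValue helpers in this example follow the pattern of DETR's util/misc.py. A minimal sketch of SmoothedValue, covering only what the fmt strings above rely on (the latest value in a sliding window); the real class also tracks medians and global averages:

from collections import deque


class SmoothedValue:
    """Keeps a window of recent values for smoothed display."""

    def __init__(self, window_size=20, fmt="{value:.4f}"):
        self.deque = deque(maxlen=window_size)
        self.fmt = fmt

    def update(self, value, n=1):
        self.deque.append(value)

    @property
    def value(self):
        return self.deque[-1]

    def __str__(self):
        return self.fmt.format(value=self.value)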
Example #8
def train(training_dbs, validation_db, start_iter=0):
    learning_rate = system_configs.learning_rate  # learning rate
    max_iteration = system_configs.max_iter  # maximum number of iterations
    pretrained_model = system_configs.pretrain  # pretrained model
    snapshot = system_configs.snapshot  # save parameters every snapshot iterations
    val_iter = system_configs.val_iter  # validate every val_iter iterations
    display = system_configs.display  # print the loss every display iterations
    decay_rate = system_configs.decay_rate  # learning-rate decay factor
    stepsize = system_configs.stepsize  # interval between learning-rate decays
    # get the size of each database
    training_size = len(training_dbs[0].db_inds)
    validation_size = len(validation_db.db_inds)
    # queues storing data for training
    training_queue = Queue(system_configs.prefetch_size)
    validation_queue = Queue(5)
    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_configs.prefetch_size)
    pinned_validation_queue = queue.Queue(5)
    # load the data-sampling function
    data_file = "sample.{}".format(training_dbs[0].data)
    sample_data = importlib.import_module(data_file).sample_data
    # allocate resources for parallel reading
    training_tasks = init_parallel_jobs(training_dbs, training_queue,
                                        sample_data, True)
    if val_iter:
        validation_tasks = init_parallel_jobs([validation_db],
                                              validation_queue, sample_data,
                                              False)  # parallel workers for the validation set
    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()
    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()
    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()
    # build the model
    print("building model...")
    nnet = NetworkFactory(training_dbs[0])

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        learning_rate /= (decay_rate**(start_iter // stepsize))  # recompute the learning rate from the current starting point

        nnet.load_params(start_iter)  # load the parameters saved at the starting iteration
        nnet.set_lr(learning_rate)  # set the learning rate
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)  # set the learning rate
    # start training
    print("training start...")
    nnet.cuda()  # move to GPU
    nnet.train_mode()  # training mode
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout,
                              ncols=80):
            training = pinned_training_queue.get(block=True)  # training data
            training_loss, focal_loss, pull_loss, push_loss, regr_loss = nnet.train(
                **training)  # compute the losses
            if display and iteration % display == 0:
                print("training loss at iteration {}: {}".format(
                    iteration, training_loss.item()))
                print("focal loss at iteration {}:    {}".format(
                    iteration, focal_loss.item()))
                print("pull loss at iteration {}:     {}".format(
                    iteration, pull_loss.item()))
                print("push loss at iteration {}:     {}".format(
                    iteration, push_loss.item()))
                print("regr loss at iteration {}:     {}".format(
                    iteration, regr_loss.item()))
                #print("cls loss at iteration {}:      {}\n".format(iteration, cls_loss.item()))
            del training_loss, focal_loss, pull_loss, push_loss, regr_loss  # free the loss tensors
            # validation
            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()  # switch to eval mode
                validation = pinned_validation_queue.get(block=True)  # validation data
                validation_loss = nnet.validate(**validation)  # validation loss
                print("validation loss at iteration {}: {}".format(
                    iteration, validation_loss.item()))
                nnet.train_mode()  # switch back to training mode
            if iteration % snapshot == 0:
                nnet.save_params(iteration)  # save parameters
            if iteration % stepsize == 0:
                learning_rate /= decay_rate  # decay the learning rate
                nnet.set_lr(learning_rate)

    # after training, signal the pin threads to exit
    training_pin_semaphore.release()
    validation_pin_semaphore.release()
    # terminate the data-prefetching processes
    for training_task in training_tasks:
        training_task.terminate()
    for validation_task in validation_tasks:
        validation_task.terminate()
Example #9
def train(training_dbs, validation_db, validation_db_2, tb, suffix, cfg_file,
          es, start_iter):
    learning_rate = system_configs.learning_rate
    max_iteration = system_configs.max_iter
    pretrained_model = system_configs.pretrain
    snapshot = system_configs.snapshot
    val_iter = system_configs.val_iter
    display = system_configs.display
    decay_rate = system_configs.decay_rate
    stepsize = system_configs.stepsize

    # getting the size of each database
    training_size = len(training_dbs[0].db_inds)
    validation_size = len(validation_db.db_inds)
    #validation_2_size = len(validation_db_2.db_inds)

    # queues storing data for training
    training_queue = Queue(system_configs.prefetch_size)
    validation_queue = Queue(5)
    #validation_2_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_configs.prefetch_size)
    pinned_validation_queue = queue.Queue(5)
    #pinned_validation_2_queue = queue.Queue(5)

    # load data sampling function
    data_file = "sample.{}".format(training_dbs[0].data)
    sample_data = importlib.import_module(data_file).sample_data

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(training_dbs, training_queue,
                                        sample_data, True)
    if val_iter:
        validation_tasks = init_parallel_jobs([validation_db],
                                              validation_queue, sample_data,
                                              False)
        #validation_2_tasks = init_parallel_jobs([validation_db_2], validation_2_queue, sample_data, False)

    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    #validation_2_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()
    #validation_2_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    # validation_2_pin_args   = (validation_2_queue, pinned_validation_2_queue, validation_2_pin_semaphore)
    # validation_2_pin_thread = threading.Thread(target=pin_memory, args=validation_2_pin_args)
    # validation_2_pin_thread.daemon = True
    # validation_2_pin_thread.start()

    print("building model...")
    nnet = NetworkFactory(training_dbs[0])  #, suffix)

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        learning_rate /= (decay_rate**(start_iter // stepsize))

        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    if es:
        early_stopping = EarlyStopping(patience=100, verbose=True)

    print("training start...")
    nnet.cuda()
    #nnet.cpu()

    #if suffix == 104:
    #    net = model_104(training_dbs[0])
    #    tb.add_graph(net, torch.rand(2, 3, 511, 511))#, torch.FloatTensor(training_dbs[0].db_inds))
    #elif suffix == 52:
    #    net = model_52(training_dbs[0])
    #    dummy_input = torch.randn(2, 3, 511, 511)
    #    tb.add_graph(net, dummy_input)
    #else:
    #    return
    #tb.close()

    ##### Model's Warm-up #####
    nnet.eval_mode()
    input = cv2.imread(training_dbs[0].image_file(0))
    start_time = time.time()
    detections = kp_detection(input, nnet, score_min=0.5)
    end_time = time.time()
    infer_time = end_time - start_time
    print("\n##################################################")
    print("Warm-up + Inference Time: " + str(infer_time * 1000) + "ms")
    print("##################################################")
    ###########################

    ##### Model's Inference Time #####
    input = cv2.imread(training_dbs[0].image_file(0))
    start_time = time.time()
    detections = kp_detection(input, nnet, score_min=0.5)
    end_time = time.time()
    infer_time = end_time - start_time
    print("\n##################################################")
    print("Inference Time: " + str(infer_time * 1000) + "ms")
    print("##################################################")
    ##################################

    result_dir = system_configs.result_dir
    result_dir = os.path.join(result_dir, str("Training_Validation"),
                              str("val2017"), str(suffix))

    #if suffix is not None:
    #    result_dir = os.path.join(result_dir, suffix)

    make_dirs([result_dir])

    nnet.train_mode()

    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout,
                              ncols=80):
            training = pinned_training_queue.get(block=True)
            #start_time = time.time()
            training_loss, focal_loss, pull_loss, push_loss, regr_loss = nnet.train(
                **training)
            #end_time = time.time()
            #infer_time = end_time - start_time
            #training_loss, focal_loss, pull_loss, push_loss, regr_loss, cls_loss = nnet.train(**training)

            #print("\nTotal Time per Iteration:" + str(infer_time) + "ms")
            #tb.add_scalar('Total Time (ms) vs Iteration', infer_time * 1000, iteration)

            if display and iteration % display == 0:
                print("\ntraining loss at iteration {}: {}".format(
                    iteration, training_loss.item()))
                print("focal loss at iteration {}:    {}".format(
                    iteration, focal_loss.item()))
                print("pull loss at iteration {}:     {}".format(
                    iteration, pull_loss.item()))
                print("push loss at iteration {}:     {}".format(
                    iteration, push_loss.item()))
                print("regr loss at iteration {}:     {}".format(
                    iteration, regr_loss.item()))
                #print("cls loss at iteration {}:      {}\n".format(iteration, cls_loss.item()))

            tb.add_scalar('Training Loss vs Iteration', training_loss.item(),
                          iteration)
            tb.add_scalar('Focal Loss vs Iteration', focal_loss.item(),
                          iteration)
            tb.add_scalar('Pull Loss vs Iteration', pull_loss.item(),
                          iteration)
            tb.add_scalar('Push Loss vs Iteration', push_loss.item(),
                          iteration)
            tb.add_scalar('Offset Loss vs Iteration', regr_loss.item(),
                          iteration)
            #tb.add_scalar('Class Loss vs Iteration', cls_loss.item(), iteration)

            del training_loss, focal_loss, pull_loss, push_loss, regr_loss  #, cls_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("\n##################################################")
                print("validation loss at iteration {}: {}".format(
                    iteration, validation_loss.item()))
                print("##################################################")
                tb.add_scalar('Validation Loss vs Iteration',
                              validation_loss.item(), iteration)

                if es:
                    early_stopping(validation_loss, iteration, nnet, cfg_file)

                nnet.train_mode()

            epoch = len(training_dbs[0].db_inds) // system_configs.batch_size
            #print(epoch)

            if iteration % epoch == 0:  # Enter every epoch
                nnet.eval_mode()
                stats = kp_detection_train(validation_db_2, nnet, result_dir)
                map_avg = stats[0]
                map_50 = stats[1]
                map_75 = stats[2]
                map_small = stats[3]
                map_medium = stats[4]
                map_large = stats[5]
                mar_1 = stats[6]
                mar_10 = stats[7]
                mar_100 = stats[8]
                mar_small = stats[9]
                mar_medium = stats[10]
                mar_large = stats[11]
                tb.add_scalar('Average mAP vs Epoch', map_avg,
                              iteration / epoch)
                tb.add_scalar('mAP (IoU 0.5) vs Epoch', map_50,
                              iteration / epoch)
                tb.add_scalar('mAP (IoU 0.75) vs Epoch', map_75,
                              iteration / epoch)
                tb.add_scalar('mAP (Area = Small) vs Epoch', map_small,
                              iteration / epoch)
                tb.add_scalar('mAP (Area = Medium) vs Epoch', map_medium,
                              iteration / epoch)
                tb.add_scalar('mAP (Area = Large) vs Epoch', map_large,
                              iteration / epoch)
                tb.add_scalar('mAR (Max Detection = 1) vs Epoch', mar_1,
                              iteration / epoch)
                tb.add_scalar('mAR (Max Detection = 10) vs Epoch', mar_10,
                              iteration / epoch)
                tb.add_scalar('mAR (Max Detection = 100) vs Epoch', mar_100,
                              iteration / epoch)
                tb.add_scalar('mAR (Area = Small) vs Epoch', mar_small,
                              iteration / epoch)
                tb.add_scalar('mAR (Area = Medium) vs Epoch', mar_medium,
                              iteration / epoch)
                tb.add_scalar('mAR (Area = Large) vs Epoch', mar_large,
                              iteration / epoch)
                nnet.train_mode()

            if es and early_stopping.early_stop:
                print("Early stopping")
                break

            if not es:
                if iteration % snapshot == 0:
                    nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # sending signal to kill the thread
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    for training_task in training_tasks:
        training_task.terminate()
    for validation_task in validation_tasks:
        validation_task.terminate()
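
Finally, the EarlyStopping helper in Example #9 is constructed with patience=100, called as early_stopping(validation_loss, iteration, nnet, cfg_file), and exposes an early_stop flag. A rough sketch, assuming the common patience-based pattern; the cfg_file handling is a guess:

class EarlyStopping:
    """Stop once validation loss has not improved for patience rounds."""

    def __init__(self, patience=100, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss, iteration, nnet, cfg_file):
        # cfg_file is accepted to match the call site; a real implementation
        # might use it to name the checkpoint file
        loss = val_loss.item()
        if self.best_loss is None or loss < self.best_loss:
            self.best_loss = loss
            self.counter = 0
            nnet.save_params(iteration)  # snapshot the best model so far
            if self.verbose:
                print("validation loss improved to {:.6f}".format(loss))
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True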