Example #1
0
def time(solver, nccl):
    """Instrument a solver with per-layer and per-step timing.

    Installs forward/backward hooks that time every layer, solver
    callbacks that time the full step and the NCCL all-reduce, and
    logs a timing report every ``display`` iterations.
    """
    layer_count = len(solver.net.layers)
    fprop = [caffe.Timer() for _ in range(layer_count)]
    bprop = [caffe.Timer() for _ in range(layer_count)]
    total = caffe.Timer()
    allrd = caffe.Timer()
    display = solver.param.display

    def show_time():
        # Only emit the report on display-aligned iterations.
        if solver.iter % display != 0:
            return
        lines = ['']
        for idx in range(layer_count):
            lines.append('forw %3d %8s : %.2f' %
                         (idx, solver.net._layer_names[idx], fprop[idx].ms))
        for idx in reversed(range(layer_count)):
            lines.append('back %3d %8s : %.2f' %
                         (idx, solver.net._layer_names[idx], bprop[idx].ms))
        lines.append('solver total: %.2f' % total.ms)
        lines.append('allreduce: %.2f' % allrd.ms)
        caffe.log('\n'.join(lines) + '\n')

    solver.net.before_forward(lambda layer: fprop[layer].start())
    solver.net.after_forward(lambda layer: fprop[layer].stop())
    solver.net.before_backward(lambda layer: bprop[layer].start())
    solver.net.after_backward(lambda layer: bprop[layer].stop())
    # Time the whole step; the all-reduce timer starts when the step ends.
    solver.add_callback(lambda: total.start(),
                        lambda: (total.stop(), allrd.start()))
    solver.add_callback(nccl)
    solver.add_callback(lambda: '', lambda: (allrd.stop(), show_time()))
Example #2
0
def time(net, iters):
    """Run ``iters`` forward/backward passes over ``net`` and log timings.

    Installs forward/backward hooks that time every layer, drives the net
    for ``iters`` iterations, then logs a per-layer and total timing report.

    Args:
        net: a ``caffe.Net`` exposing the before/after forward/backward
            callback registration API.
        iters: number of forward+backward iterations to time.
    """
    fprop = []
    bprop = []
    total = caffe.Timer()
    for _ in range(len(net.layers)):
        fprop.append(caffe.Timer())
        bprop.append(caffe.Timer())

    def show_time():
        s = '\n'
        for i in range(len(net.layers)):
            s += 'forw %3d %8s ' % (i, net._layer_names[i])
            s += ': %.2f\n' % fprop[i].ms
        # Backward pass runs layers in reverse order; report it the same way.
        for i in range(len(net.layers) - 1, -1, -1):
            s += 'back %3d %8s ' % (i, net._layer_names[i])
            s += ': %.2f\n' % bprop[i].ms
        s += 'solver total: %.2f\n' % total.ms
        caffe.log(s)

    net.before_forward(lambda layer: fprop[layer].start())
    net.after_forward(lambda layer: fprop[layer].stop())
    net.before_backward(lambda layer: bprop[layer].start())
    net.after_backward(lambda layer: bprop[layer].stop())
    total.start()
    # `range` instead of Python-2-only `xrange`: identical for iteration
    # and keeps this function runnable under Python 3.
    for _ in range(iters):
        net.forward()
        net.backward()
    total.stop()
    show_time()
Example #3
0
def run_train(solver, max_epoch, max_tol, output_blob, label_blob):
    """Train ``solver`` epoch by epoch with validation-based early stopping.

    Steps the solver one iteration at a time while tracking a running mean
    of the training loss. At each epoch boundary it runs validation,
    snapshots the model (renamed to epoch-based filenames), and stops when
    the validation average has not improved for ``max_tol`` consecutive
    epochs or ``max_epoch`` epochs have completed.

    Args:
        solver: caffe solver whose net's first layer appears to expose
            ``epoch``, ``processed_num``, ``num_samples`` and ``au_names``
            (presumably a custom data layer) — TODO confirm.
        max_epoch: maximum number of epochs to run.
        max_tol: non-improving epochs tolerated before early stopping.
        output_blob: blob name forwarded to ``run_validation``.
        label_blob: blob name forwarded to ``run_validation``.
    """
    # Time each solver step so the progress line can report seconds/iter.
    timer = caffe.Timer()
    solver.add_callback(lambda: timer.start(), lambda: timer.stop())
    # Only blobs with a positive loss weight contribute to the training
    # loss. NOTE(review): `iteritems` means this is Python 2 code.
    loss_weights = [(n, w)
                    for n, w in solver.net.blob_loss_weights.iteritems()
                    if w > 0.0]
    snapshot_prefix = solver.param.snapshot_prefix
    model_dir = os.path.dirname(snapshot_prefix)
    # Per-epoch summaries are logged to train.log next to the snapshots.
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler(os.path.join(model_dir, 'train.log'),
                                  mode='w')
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s',
                                  datefmt='%Y-%m-%d %H:%M:%S')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    # Record which AUs this model trains on; the first two characters of
    # each name are stripped (presumably an 'AU' prefix — verify).
    f_info = open(os.path.join(model_dir, 'info.txt'), 'w')
    f_info.write('AUs:' +
                 ','.join([au[2:]
                           for au in solver.net.layers[0].au_names]) + '\n')

    epoch = 0
    it = 0        # total solver iterations performed
    last_it = 0   # iteration count at the start of the current epoch
    tol = 0       # consecutive epochs without validation improvement
    train_loss = 0.0
    best_epoch = -1
    best = -np.inf  # best validation average seen so far
    while True:
        solver.step(1)
        # Weighted sum of all active loss blobs for this step.
        loss = np.sum([
            solver.net.blobs[n].data[...].copy() * w for n, w in loss_weights
        ])
        # Running mean of the loss over the current epoch's iterations.
        train_loss = (train_loss * (it - last_it) + loss) / (it + 1 - last_it)
        message = '===Epoch<{}>...prog: {}/{}, train_loss: {}, speed: {:.3f}s/iter      \r'
        sys.stdout.flush()
        sys.stdout.write(
            message.format(solver.net.layers[0].epoch,
                           solver.net.layers[0].processed_num,
                           solver.net.layers[0].num_samples, train_loss,
                           timer.ms / 1000.0))

        # Epoch boundary: the data layer's own epoch counter has advanced
        # past the epoch we are tracking locally.
        if epoch < solver.net.layers[0].epoch:
            average, results, au_names = run_validation(
                solver, output_blob, label_blob)
            # Snapshot, then rename caffe's iteration-named files to
            # epoch-named ones (solver.step(1) made the solver iter it+1).
            solver.snapshot()
            os.rename(
                snapshot_prefix + '_iter_{}.solverstate'.format(it + 1),
                os.path.join(model_dir, 'epoch{}.solverstate'.format(epoch)))
            os.rename(
                snapshot_prefix + '_iter_{}.caffemodel'.format(it + 1),
                os.path.join(model_dir, 'epoch{}.caffemodel'.format(epoch)))

            # Early-stopping bookkeeping on the validation average.
            if average > best:
                print('GOOD!\n')
                best = average
                best_epoch = epoch
                tol = 0
            else:
                tol += 1

            # Per-AU validation table: one "<name> <score>" line per AU.
            val_string = '{:<5} {:<.3f}\n' * len(au_names)
            val_list = [e for t in zip(au_names, results) for e in t]
            val_string = val_string.format(*val_list)
            print('Validation average: {}, tol: {}/{}'.format(
                average, tol, max_tol))
            print(val_string)

            logger.info('Epoch<{}> ends'.format(epoch))
            message = 'train_loss: {}, validation_average: {}, tol: {}/{}'
            logger.info(message.format(train_loss, average, tol, max_tol))
            logger.info('Validation results: \n' + val_string)

            # Stop when patience is exhausted or the last epoch finished.
            if tol >= max_tol or epoch == (max_epoch - 1):
                print('\n\n\n===== Training finish =====')
                message = 'End at epoch {}, best: {}, best_epoch: {}\n'
                print(message.format(epoch, best, best_epoch))
                logger.info('Training process ends')
                logger.info(message.format(epoch, best, best_epoch))
                break

            # Reset per-epoch accumulators for the next epoch.
            epoch += 1
            train_loss = 0.0
            last_it = it + 1
            handler.flush()
        it += 1
    f_info.write('Best epoch:{}\n'.format(best_epoch))
    f_info.write('Best validation average:{}\n'.format(best))
    f_info.close()
    return