示例#1
0
    def testNonParallelModel(self):
        """Check that data input workers feed a single, non-parallel model.

        Registers input workers for the "data" and "label" blobs, runs the
        model net 500 times and verifies that every fetched batch has 32 rows
        whose first three data columns equal the corresponding label. Finally
        stops the "unittest" coordinator and checks that no coordinators are
        left registered.
        """
        # Passed to init_data_input_workers and asserted as the leading
        # dimension of every fetched blob below.
        batch_size = 32

        workspace.ResetWorkspace()

        model = model_helper.ModelHelper(name="test")
        old_seq_id = data_workers.global_coordinator._fetcher_id_seq
        coordinator = data_workers.init_data_input_workers(
            model,
            ["data", "label"],
            dummy_fetcher,
            batch_size,
            2,
            input_source_name="unittest"
        )
        new_seq_id = data_workers.global_coordinator._fetcher_id_seq
        self.assertEqual(new_seq_id, old_seq_id + 2)

        coordinator.start()
        try:
            workspace.RunNetOnce(model.param_init_net)
            workspace.CreateNet(model.net)

            for _i in range(500):
                with timeout_guard.CompleteInTimeOrDie(5):
                    workspace.RunNet(model.net.Proto().name)

                data = workspace.FetchBlob("data")
                labels = workspace.FetchBlob("label")

                self.assertEqual(data.shape[0], labels.shape[0])
                self.assertEqual(data.shape[0], batch_size)

                for j in range(batch_size):
                    self.assertEqual(labels[j], data[j, 0])
                    self.assertEqual(labels[j], data[j, 1])
                    self.assertEqual(labels[j], data[j, 2])
        finally:
            # Stop the workers even when an assertion above fails; otherwise
            # they keep running against the shared global coordinator.
            coordinator.stop_coordinator("unittest")
        self.assertEqual(coordinator._coordinators, [])
示例#2
0
    def _test_create_blobs_queue_db(self, add_blobs_fun):
        """Read batches back from a BlobsQueueDB and check their contents.

        A blobs queue with room for every sample is created and wrapped in a
        BlobsQueueDB reader. ``add_blobs_fun(queue, num_samples)`` is called to
        fill the queue, then a TensorProtosDBInput net is run once per batch.
        Each batch must hold ``batch_size`` images named ``foo<index>`` (as
        UTF-8 bytes, numbered consecutively across batches) and labels equal
        to 1. The queue is closed after the last batch.
        """
        total_samples = 10000
        per_batch = 10

        setup_net = core.Net('init_net')
        read_net = core.Net('test_create_blobs_queue_db')
        blobs_queue = setup_net.CreateBlobsQueue(
            [], 'queue', capacity=total_samples)
        db_reader = setup_net.CreateBlobsQueueDB(
            [blobs_queue],
            'blobs_queue_db_reader',
            value_blob_index=0,
            timeout_secs=0.1,
        )
        workspace.RunNetOnce(setup_net)

        # Let the caller populate the queue before any read is attempted.
        add_blobs_fun(blobs_queue, total_samples)

        read_net.TensorProtosDBInput(
            [db_reader], ['image', 'label'], batch_size=per_batch)
        workspace.CreateNet(read_net)

        shutdown_net = core.Net('close_net')
        shutdown_net.CloseBlobsQueue([blobs_queue], [])

        for batch_no in range(total_samples // per_batch):
            print("Running net, iteration {}".format(batch_no))
            with timeout_guard.CompleteInTimeOrDie(2.0):
                workspace.RunNet(read_net)

            fetched_images = workspace.FetchBlob('image')
            fetched_labels = workspace.FetchBlob('label')
            self.assertEqual(per_batch, len(fetched_images))
            self.assertEqual(per_batch, len(fetched_labels))

            # Sample numbering continues across batches.
            first_sample = batch_no * per_batch
            for sample_no, image in enumerate(fetched_images, first_sample):
                expected = "foo{}".format(sample_no).encode('utf-8')
                self.assertEqual(expected, image)
            for label in fetched_labels:
                self.assertEqual(1, label)

        workspace.RunNetOnce(shutdown_net)
示例#3
0
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
    best_accuracy,
):
    '''
    Run one epoch of the trainer.

    Runs int(args.epoch_size / total_batch_size / num_shards) iterations of
    the training net, logging throughput, loss and accuracy of the first
    training device after each one. If test_model is given, the test net is
    then run args.test_iters times and the '/accuracy' blob is averaged over
    all test devices. A summary record is written through explog.log.

    Args:
        args: options object; reads num_epochs, epoch_size and test_iters.
        epoch: index of the epoch being run.
        train_model: model whose net is trained; reads _device_prefix and
            _devices.
        test_model: model used for evaluation, or None to skip testing
            (test accuracy is then reported as -1).
        total_batch_size: batch size used for iteration and image counts.
        num_shards: number of shards the epoch is divided across.
        expname: unused in this function.
        explog: object whose log() method receives the epoch summary.
        best_accuracy: best test accuracy seen so far.

    Returns:
        Tuple (epoch + 1, best_accuracy), where best_accuracy is raised to
        this epoch's test accuracy if that is higher.

    Raises:
        AssertionError: if the final training loss is not below 40.

    TODO: add checkpointing here.
    '''
    # TODO: add loading from checkpoint
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
    train_prefix = "{}_{}".format(
        train_model._device_prefix,
        train_model._devices[0])
    fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
    train_fmt = "Training loss: {}, accuracy: {}"
    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 600.0 if i == 0 else 60.0
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

        log.info(fmt.format(i + 1, epoch_iters, epoch, total_batch_size / dt))
        accuracy = workspace.FetchBlob(train_prefix + '/accuracy')
        loss = workspace.FetchBlob(train_prefix + '/loss')
        log.info(train_fmt.format(loss, accuracy))

    # Index of the last iteration. Computed instead of reusing the loop
    # variable, which is unbound when epoch_iters is 0.
    last_iter = max(epoch_iters - 1, 0)

    num_images = epoch * epoch_iters * total_batch_size
    accuracy = workspace.FetchBlob(train_prefix + '/accuracy')
    loss = workspace.FetchBlob(train_prefix + '/loss')
    learning_rate = workspace.FetchBlob(
        data_parallel_model.GetLearningRateBlobNames(train_model)[0]
    )
    if test_model is not None:
        # Run args.test_iters iterations of testing
        test_accuracy = 0
        ntests = 0
        for _ in range(args.test_iters):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                # ndarray.item() replaces np.asscalar, which was removed
                # from NumPy.
                test_accuracy += workspace.FetchBlob(
                    "{}_{}".format(test_model._device_prefix, g) + '/accuracy'
                ).item()
                ntests += 1
        # No test runs (zero iterations or no devices): report -1, the same
        # placeholder used when there is no test model.
        test_accuracy = test_accuracy / ntests if ntests else -1
    else:
        test_accuracy = -1
    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy

    explog.log(
        input_count=num_images,
        batch_count=(last_iter + epoch * epoch_iters),
        additional_values={
            'accuracy': accuracy,
            'loss': loss,
            'learning_rate': learning_rate,
            'epoch': epoch,
            'test_accuracy': test_accuracy,
            'best_accuracy': best_accuracy,
        }
    )
    assert loss < 40, "Exploded gradients :("

    # TODO: add checkpointing
    return epoch + 1, best_accuracy
示例#4
0
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    batch_size,
    num_shards,
    expname,
    explog,
):
    """
    Run one training epoch, optionally followed by 100 test iterations.

    In multi-label mode (args.multi_label) the '/prob' and '/label' blobs of
    the first training GPU are accumulated and AUC / mAP are computed with
    metric.mean_ap_metric every args.display_iter iterations, keeping only
    the most recent 4096 rows afterwards. Otherwise the '/accuracy' blob is
    reported. Metrics that do not apply to the active mode are logged as -1.

    Args:
        args: options object; reads num_epochs, epoch_size, multi_label,
            num_labels (multi-label mode only) and display_iter.
        epoch: index of the epoch being run.
        train_model: model whose net is trained; reads _devices.
        test_model: model used for evaluation, or None to skip testing.
        batch_size: batch size used for iteration and clip counts.
        num_shards: number of shards the epoch is divided across.
        expname: unused in this function.
        explog: object whose log() method receives the epoch summary.

    Returns:
        epoch + 1.

    Raises:
        AssertionError: if the final training loss is not below 40.
    """
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / batch_size / num_shards)
    train_prefix = "gpu_{}".format(train_model._devices[0])

    # Placeholders for metrics that are not produced in the active mode.
    accuracy = -1
    mean_auc = -1
    mean_ap = -1

    if args.multi_label:
        # np.float64 replaces the removed np.float alias (same dtype).
        accumulated_prob = np.empty(shape=[0, args.num_labels],
                                    dtype=np.float64)
        accumulated_label = np.empty(shape=[0, args.num_labels],
                                     dtype=np.int32)
    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 6000.0 if i == 0 else 600.0
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1
        if args.multi_label:
            prob = workspace.FetchBlob(train_prefix + '/prob')
            label = workspace.FetchBlob(train_prefix + '/label')
            accumulated_prob = np.concatenate((accumulated_prob, prob), axis=0)
            accumulated_label = np.concatenate((accumulated_label, label),
                                               axis=0)

        if i % args.display_iter == 0:
            fmt = "Finished iteration {}/{} of epoch {} ({:.2f} clips/sec)"
            log.info(fmt.format(i, epoch_iters, epoch, batch_size / dt))
            loss = workspace.FetchBlob(train_prefix + '/loss')
            if args.multi_label:
                mean_auc, mean_ap, _, _ = \
                    metric.mean_ap_metric(accumulated_prob, accumulated_label)
                train_msg = \
                    "Training loss: {}, AUC: {}, mAP: {}".format(
                        np.mean(loss), mean_auc, mean_ap
                    )
                # Bound memory: keep only the most recent 4096 rows.
                if accumulated_label.shape[0] > 4096:
                    accumulated_prob = accumulated_prob[-4096:, :]
                    accumulated_label = accumulated_label[-4096:, :]
            else:
                accuracy = workspace.FetchBlob(train_prefix + '/accuracy')
                train_msg = "Training loss: {}, accuracy: {}".format(
                    loss, accuracy)

            log.info(train_msg)

    # Index of the last iteration. Computed instead of reusing the loop
    # variable, which is unbound when epoch_iters is 0.
    last_iter = max(epoch_iters - 1, 0)

    num_clips = epoch * epoch_iters * batch_size
    loss = workspace.FetchBlob(train_prefix + '/loss')
    learning_rate = workspace.FetchBlob(train_prefix + '/LR')
    if args.multi_label:
        loss = np.mean(loss)
    else:
        # Fetch the training accuracy here, alongside the loss, so the value
        # logged as 'accuracy' is never a blob read from a test device.
        accuracy = workspace.FetchBlob(train_prefix + '/accuracy')

    if test_model is not None:
        # Run 100 iters of testing
        ntests = 0
        test_accuracy = 0
        test_mean_auc = 0
        test_mean_ap = 0
        if args.multi_label:
            all_prob = np.empty(shape=[0, args.num_labels], dtype=np.float64)
            all_label = np.empty(shape=[0, args.num_labels], dtype=np.int32)
        for _ in range(0, 100):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                test_prefix = "gpu_{}".format(g)
                if args.multi_label:
                    prob = workspace.FetchBlob(test_prefix + '/prob')
                    label = workspace.FetchBlob(test_prefix + '/label')
                    all_prob = np.concatenate((all_prob, prob), axis=0)
                    all_label = np.concatenate((all_label, label), axis=0)
                else:
                    # ndarray.item() replaces np.asscalar, which was removed
                    # from NumPy.
                    test_accuracy += workspace.FetchBlob(
                        test_prefix + '/accuracy').item()
                ntests += 1
        if args.multi_label:
            test_mean_auc, test_mean_ap, _, _ = \
                metric.mean_ap_metric(all_prob, all_label)
            # Report the test metrics, not the training ones.
            log.info("Test AUC: {}, mAP: {}".format(
                test_mean_auc, test_mean_ap))
        else:
            # No test devices: report the -1 placeholder instead of dividing
            # by zero.
            test_accuracy = test_accuracy / ntests if ntests else -1
            log.info("Test accuracy: {}".format(test_accuracy))
    else:
        test_accuracy = (-1)
        test_mean_auc = (-1)
        test_mean_ap = (-1)

    explog.log(input_count=num_clips,
               batch_count=(last_iter + epoch * epoch_iters),
               additional_values={
                   'accuracy': accuracy,
                   'train_AUC': mean_auc,
                   'train_mAP': mean_ap,
                   'loss': loss,
                   'learning_rate': learning_rate,
                   'epoch': epoch,
                   'test_accuracy': test_accuracy,
                   'test_mean_auc': test_mean_auc,
                   'test_mean_ap': test_mean_ap,
               })
    assert loss < 40, "Exploded gradients :("

    return epoch + 1
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    expname,
    explog,
):
    '''
    Run one epoch of the trainer.

    Runs int(args.epoch_size / total_batch_size) iterations of the training
    net. Whenever the running image count is a multiple of
    total_batch_size * 20 (and it is not the first iteration of the epoch),
    the loss, accuracy and learning rate of the first training GPU are read,
    the test net (if any) is run 5 times to average the '/accuracy' blob over
    all test devices, and a record is written through explog.log.

    Args:
        args: options object; reads epoch_size.
        epoch: index of the epoch being run.
        train_model: model whose net is trained; reads _devices.
        test_model: model used for evaluation, or None to skip testing
            (test accuracy is then reported as -1).
        total_batch_size: batch size used for iteration and image counts.
        expname: unused in this function.
        explog: object whose log() method receives the progress records.

    Returns:
        epoch + 1.

    Raises:
        AssertionError: if a recorded training loss is not below 40.

    TODO: add checkpointing here.
    '''
    # TODO: add loading from checkpoint
    epoch_iters = int(args.epoch_size / total_batch_size)
    # Number of images between two progress records (loop invariant).
    record_freq = total_batch_size * 20
    for i in range(epoch_iters):
        log.info("Start iteration {}/{} of epoch {}".format(
            i,
            epoch_iters,
            epoch,
        ))

        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 600.0 if i == 0 else 60.0
        with timeout_guard.CompleteInTimeOrDie(timeout):
            workspace.RunNet(train_model.net.Proto().name)

        num_images = (i + epoch * epoch_iters) * total_batch_size

        # Report progress, compute train and test accuracies.
        if num_images % record_freq == 0 and i > 0:
            prefix = "gpu_{}".format(train_model._devices[0])
            accuracy = workspace.FetchBlob(prefix + '/accuracy')
            loss = workspace.FetchBlob(prefix + '/loss')
            learning_rate = workspace.FetchBlob(prefix + '/LR')

            if test_model is not None:
                test_accuracy = 0
                ntests = 0
                # Run 5 iters of testing
                for _ in range(5):
                    workspace.RunNet(test_model.net.Proto().name)
                    for g in test_model._devices:
                        # ndarray.item() replaces np.asscalar, which was
                        # removed from NumPy.
                        test_accuracy += workspace.FetchBlob(
                            "gpu_{}".format(g) + '/accuracy').item()
                        ntests += 1
                # No test devices: report the -1 placeholder instead of
                # dividing by zero.
                test_accuracy = test_accuracy / ntests if ntests else -1
            else:
                test_accuracy = -1

            explog.log(input_count=num_images,
                       batch_count=(i + epoch * epoch_iters),
                       additional_values={
                           'accuracy': accuracy,
                           'loss': loss,
                           'learning_rate': learning_rate,
                           'epoch': epoch,
                           'test_accuracy': test_accuracy,
                       })
            assert loss < 40, "Exploded gradients :("

    # TODO: add checkpointing
    return epoch + 1
示例#6
0
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
):
    """
    Run one training epoch followed by an optional 100-iteration test pass.

    Runs int(args.epoch_size / total_batch_size / num_shards) iterations of
    the training net, logging throughput, loss and accuracy of the first
    training device after each one. If test_model is given, the test net is
    run 100 times and the '/accuracy' blob is averaged over all test devices.
    A summary record is written through explog.log.

    Args:
        args: options object; reads num_epochs and epoch_size.
        epoch: index of the epoch being run.
        train_model: model whose net is trained; reads _device_prefix and
            _devices.
        test_model: model used for evaluation, or None to skip testing
            (test accuracy is then reported as -1).
        total_batch_size: batch size used for iteration and image counts.
        num_shards: number of shards the epoch is divided across.
        expname: unused in this function.
        explog: object whose log() method receives the epoch summary.

    Returns:
        epoch + 1.

    Raises:
        AssertionError: if the final training loss is not below 40.
    """
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
    train_prefix = "{}_{}".format(train_model._device_prefix,
                                  train_model._devices[0])
    fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
    train_fmt = "Training loss: {}, accuracy: {}"
    for i in range(epoch_iters):
        # Allow the first iteration ten times longer than later ones.
        timeout = 600.0 if i == 0 else 60.0
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

        log.info(fmt.format(i + 1, epoch_iters, epoch, total_batch_size / dt))
        accuracy = workspace.FetchBlob(train_prefix + '/accuracy')
        loss = workspace.FetchBlob(train_prefix + '/loss')
        log.info(train_fmt.format(loss, accuracy))

    # Index of the last iteration. Computed instead of reusing the loop
    # variable, which is unbound when epoch_iters is 0.
    last_iter = max(epoch_iters - 1, 0)

    num_images = epoch * epoch_iters * total_batch_size
    accuracy = workspace.FetchBlob(train_prefix + '/accuracy')
    loss = workspace.FetchBlob(train_prefix + '/loss')
    learning_rate = workspace.FetchBlob(
        data_parallel_model.GetLearningRateBlobNames(train_model)[0])
    if test_model is not None:
        # Run 100 iters of testing
        test_accuracy = 0
        ntests = 0
        for _ in range(0, 100):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                # ndarray.item() replaces np.asscalar, which was removed
                # from NumPy.
                test_accuracy += workspace.FetchBlob(
                    "{}_{}".format(test_model._device_prefix, g) +
                    '/accuracy').item()
                ntests += 1
        # No test devices: report the -1 placeholder instead of dividing by
        # zero.
        test_accuracy = test_accuracy / ntests if ntests else -1
    else:
        test_accuracy = -1

    explog.log(input_count=num_images,
               batch_count=(last_iter + epoch * epoch_iters),
               additional_values={
                   'accuracy': accuracy,
                   'loss': loss,
                   'learning_rate': learning_rate,
                   'epoch': epoch,
                   'test_accuracy': test_accuracy,
               })
    assert loss < 40, "Exploded gradients :("
    return epoch + 1
示例#7
0
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
):
    '''
    Run one epoch of the trainer and report throughput statistics.

    Runs int(args.epoch_size / total_batch_size / num_shards) iterations of
    the training net. The first 10 iterations are excluded from timing;
    afterwards, every args.notify_frequency iterations the current and
    maximum images/sec plus the mean and median iteration time are logged
    together with the loss and accuracy of the first training device. If
    test_model is given, the test net is run 100 times and the '/accuracy'
    blob is averaged over all test devices. A summary record is written
    through explog.log and also printed.

    Args:
        args: options object; reads num_epochs, epoch_size and
            notify_frequency.
        epoch: index of the epoch being run.
        train_model: model whose net is trained; reads _device_prefix and
            _devices.
        test_model: model used for evaluation, or None to skip testing
            (test accuracy is then reported as -1).
        total_batch_size: batch size used for iteration and image counts.
        num_shards: number of shards the epoch is divided across.
        expname: unused in this function.
        explog: object whose log() method receives the epoch summary.

    Returns:
        epoch + 1.

    Raises:
        AssertionError: if the final training loss is not below 40.

    TODO: add checkpointing here.
    '''
    # TODO: add loading from checkpoint
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
    ts = time.time()
    # Number of initial iterations excluded from the timing statistics.
    drop = 10
    # Highest throughput seen so far (named so it does not shadow max()).
    max_speed = 0.0
    # Per-iteration durations of all iterations after the dropped ones.
    spans = []
    updateEvery = args.notify_frequency
    train_prefix = "{}_{}".format(train_model._device_prefix,
                                  train_model._devices[0])

    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 3600
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1
            if i > drop:
                spans.append(dt)
        # ignore the first 10 iterations
        if i == drop:
            # reset timer
            ts = time.time()
        if (i - drop) % updateEvery == 0 and i > drop:
            fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec), max = {:.2f}, avg = {:.2f}, median = {}. medthru = {}, avgthru = {}"
            te = time.time()
            td = te - ts
            currSpeed = updateEvery * total_batch_size / td
            max_speed = max(max_speed, currSpeed)
            log.info(
                fmt.format(i + 1, epoch_iters, epoch, currSpeed, max_speed,
                           np.mean(spans), np.median(spans),
                           1. / np.median(spans), 1. / np.mean(spans)))
            ts = time.time()

            accuracy = workspace.FetchBlob(train_prefix + '/accuracy')
            loss = workspace.FetchBlob(train_prefix + '/loss')
            train_fmt = "Training loss: {}, accuracy: {}"
            log.info(train_fmt.format(loss, accuracy))

    # Index of the last iteration. Computed instead of reusing the loop
    # variable, which is unbound when epoch_iters is 0.
    last_iter = max(epoch_iters - 1, 0)

    num_images = epoch * epoch_iters * total_batch_size
    accuracy = workspace.FetchBlob(train_prefix + '/accuracy')
    loss = workspace.FetchBlob(train_prefix + '/loss')
    learning_rate = workspace.FetchBlob(
        data_parallel_model.GetLearningRateBlobNames(train_model)[0])
    if test_model is not None:
        # Run 100 iters of testing
        test_accuracy = 0
        ntests = 0
        for _ in range(0, 100):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                # ndarray.item() replaces np.asscalar, which was removed
                # from NumPy.
                test_accuracy += workspace.FetchBlob(
                    "{}_{}".format(test_model._device_prefix, g) +
                    '/accuracy').item()
                ntests += 1
        # No test devices: report the -1 placeholder instead of dividing by
        # zero.
        test_accuracy = test_accuracy / ntests if ntests else -1
    else:
        test_accuracy = -1

    explog.log(input_count=num_images,
               batch_count=(last_iter + epoch * epoch_iters),
               additional_values={
                   'accuracy': accuracy,
                   'loss': loss,
                   'learning_rate': learning_rate,
                   'epoch': epoch,
                   'test_accuracy': test_accuracy,
               })
    assert loss < 40, "Exploded gradients :("

    # TODO: add checkpointing
    print("accuracy = %s. test_acc = %s. loss = %s" %
          (accuracy, test_accuracy, loss))
    return epoch + 1
示例#8
0
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    batch_size,
    num_shards,
    expname,
    explog,
):
    """
    Run one training epoch followed by an optional 100-iteration test pass.

    Runs int(args.epoch_size / batch_size / num_shards) iterations of the
    training net and, every args.display_iter iterations, logs throughput,
    loss, learning rate and accuracy of the first training GPU. If test_model
    is given, the test net is run 100 times and the '/accuracy' blob is
    averaged over all test devices. A summary record is written through
    explog.log.

    Args:
        args: options object; reads num_epochs, epoch_size and display_iter.
        epoch: index of the epoch being run.
        train_model: model whose net is trained; reads _devices.
        test_model: model used for evaluation, or None to skip testing
            (test accuracy is then reported as -1).
        batch_size: batch size used for iteration and clip counts.
        num_shards: number of shards the epoch is divided across.
        expname: unused in this function.
        explog: object whose log() method receives the epoch summary.

    Returns:
        epoch + 1.

    Raises:
        AssertionError: if the final training loss is not below 40.
    """
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / batch_size / num_shards)
    train_prefix = "gpu_{}".format(train_model._devices[0])

    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 6000.0 if i == 0 else 600.0
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

        if i % args.display_iter == 0:
            fmt = "Finished iteration {}/{} of epoch {} ({:.2f} clips/sec)"
            log.info(fmt.format(i, epoch_iters, epoch, batch_size / dt))
            loss = workspace.FetchBlob(train_prefix + '/loss')
            accuracy = workspace.FetchBlob(train_prefix + '/accuracy')
            learning_rate = workspace.FetchBlob(train_prefix + '/LR')
            train_msg = "Training loss: {}, lr: {}, accuracy: {}".format(
                loss, learning_rate, accuracy)
            log.info(train_msg)

    # Index of the last iteration. Computed instead of reusing the loop
    # variable, which is unbound when epoch_iters is 0.
    last_iter = max(epoch_iters - 1, 0)

    num_clips = epoch * epoch_iters * batch_size
    loss = workspace.FetchBlob(train_prefix + '/loss')
    # Fetch the training accuracy alongside the loss so the value logged as
    # 'accuracy' is never a blob read from a test device.
    accuracy = workspace.FetchBlob(train_prefix + '/accuracy')
    learning_rate = workspace.FetchBlob(train_prefix + '/LR')
    if test_model is not None:
        # Run 100 iters of testing
        ntests = 0
        test_accuracy = 0
        for _ in range(0, 100):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                test_prefix = "gpu_{}".format(g)
                # ndarray.item() replaces np.asscalar, which was removed
                # from NumPy.
                test_accuracy += workspace.FetchBlob(
                    test_prefix + '/accuracy').item()
                ntests += 1
        # No test devices: report the -1 placeholder instead of dividing by
        # zero.
        test_accuracy = test_accuracy / ntests if ntests else -1
        log.info("Test accuracy: {}".format(test_accuracy))
    else:
        test_accuracy = -1

    explog.log(input_count=num_clips,
               batch_count=(last_iter + epoch * epoch_iters),
               additional_values={
                   'accuracy': accuracy,
                   'loss': loss,
                   'learning_rate': learning_rate,
                   'epoch': epoch,
                   'test_accuracy': test_accuracy,
               })
    assert loss < 40, "Exploded gradients :("

    return epoch + 1
def RunEpoch(args,
             epoch,
             train_model,
             test_model,
             total_batch_size,
             num_shards,
             explog,
             plt_kernel):
    '''
    Run one epoch of the trainer, or an LFW verification pass.

    The mode is selected by args.test_data_type:

    * 'VAL': run int(args.epoch_size / total_batch_size / num_shards)
      training iterations, collecting the loss and accuracy of the first
      training device after each one, then average the '/accuracy' blob of
      the test net over 100 runs and write a record through explog.log.
    * 'LFW' (requires args.load_model_path): run the test net over the pairs
      listed in ../dataset/lfw_pairs.txt, gather the '/fc5' embeddings and
      log the result of lfw.evaluate.
    * 'MEGAFACE' (requires args.load_model_path): not implemented.

    Args:
        args: options object; reads test_data_type, num_epochs, epoch_size,
            load_model_path and feature_dim.
        epoch: index of the epoch being run.
        train_model: model whose net is trained; reads _device_prefix and
            _devices.
        test_model: model used for evaluation; may be None in 'VAL' mode
            (test accuracy is then reported as -1).
        total_batch_size: batch size used for iteration and image counts.
        num_shards: number of shards the epoch is divided across.
        explog: object whose log() method receives the epoch summary.
        plt_kernel: passed to display_activation_map when DEBUG_TRAINING is
            set.

    Returns:
        Tuple (epoch + 1, epoch_loss, epoch_accuracy). The two lists hold the
        per-iteration training loss and accuracy and are empty in every mode
        other than 'VAL'.

    Raises:
        AssertionError: if the final training loss is not below 40, or if
            the number of LFW test images is not a multiple of
            total_batch_size.

    TODO: add checkpointing here.
    '''
    # TODO: add loading from checkpoint

    # Bound up front so the final return works in every mode; previously only
    # the 'VAL' branch defined them.
    epoch_loss = []
    epoch_accuracy = []

    if args.test_data_type == 'VAL':
        log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
        epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
        train_prefix = "{}_{}".format(
            train_model._device_prefix,
            train_model._devices[0])
        fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
        train_fmt = "Training loss: {}, accuracy: {}"
        for i in range(epoch_iters):
            # This timeout is required (temporarily) since CUDA-NCCL
            # operators might deadlock when synchronizing between GPUs.
            timeout = 600.0 if i == 0 else 60.0
            with timeout_guard.CompleteInTimeOrDie(timeout):
                t1 = time.time()
                workspace.RunNet(train_model.net.Proto().name)
                t2 = time.time()
                dt = t2 - t1

            log.info(fmt.format(i + 1, epoch_iters, epoch, total_batch_size / dt))
            accuracy = workspace.FetchBlob(train_prefix + '/accuracy')
            loss = workspace.FetchBlob(train_prefix + '/loss')
            log.info(train_fmt.format(loss, accuracy))
            epoch_loss.append(loss)
            epoch_accuracy.append(accuracy)

        # Index of the last iteration. Computed instead of reusing the loop
        # variable, which is unbound when epoch_iters is 0.
        last_iter = max(epoch_iters - 1, 0)

        num_images = epoch * epoch_iters * total_batch_size
        accuracy = workspace.FetchBlob(train_prefix + '/accuracy')
        loss = workspace.FetchBlob(train_prefix + '/loss')
        learning_rate = workspace.FetchBlob(
            data_parallel_model.GetLearningRateBlobNames(train_model)[0]
        )
        if test_model is not None:
            # Run 100 iters of testing
            test_accuracy = 0
            ntests = 0
            for _ in range(0, 100):
                workspace.RunNet(test_model.net.Proto().name)
                for g in test_model._devices:
                    # ndarray.item() replaces np.asscalar, which was removed
                    # from NumPy.
                    test_accuracy += workspace.FetchBlob(
                        "{}_{}".format(test_model._device_prefix, g) + '/accuracy'
                    ).item()
                    ntests += 1
            # No test devices: report the -1 placeholder instead of dividing
            # by zero.
            test_accuracy = test_accuracy / ntests if ntests else -1
        else:
            test_accuracy = -1

        explog.log(
            input_count=num_images,
            batch_count=(last_iter + epoch * epoch_iters),
            additional_values={
                'accuracy': accuracy,
                'loss': loss,
                'learning_rate': learning_rate,
                'epoch': epoch,
                'test_accuracy': test_accuracy,
            }
        )
        assert loss < 40, "Exploded gradients :("
        if DEBUG_TRAINING:
            # NOTE(review): this dereferences test_model, which the code
            # above allows to be None -- confirm DEBUG_TRAINING is only used
            # with a test model.
            device_name = "{}_{}".format(test_model._device_prefix, test_model._devices[0])
            display_activation_map(plt_kernel, channel=0, batch_num=16, device_name=device_name)
            plt.pause(0.001)

    # lfw verification test
    elif args.test_data_type == 'LFW' and args.load_model_path is not None:
        lfw_pairs = os.path.join(os.path.abspath('../dataset'), 'lfw_pairs.txt')
        if not os.path.exists(lfw_pairs):
            log.error('There is no lfw_pairs.txt in folder dataset/lfw!!!')
        else:
            actual_issame = lfw.get_issame_list(lfw.read_pairs(lfw_pairs))
            # Two images per pair.
            num_test_images = len(actual_issame) * 2
            assert num_test_images % total_batch_size == 0, \
                'The number of lfw test images must be interger multiple of the test bach size'
            num_batches = num_test_images // total_batch_size
            emb_array = np.zeros((num_test_images, args.feature_dim))
            for _ in range(0, num_batches):
                workspace.RunNet(test_model.net.Proto().name)
                for g in test_model._devices:
                    device_prefix = '{}_{}'.format(test_model._device_prefix, g)
                    label = workspace.FetchBlob(device_prefix + '/label')
                    embedding = workspace.FetchBlob(device_prefix + '/fc5')
                    # The fetched labels are used as row indices into the
                    # embedding table.
                    emb_array[label] = embedding

            _, _, test_accuracy, test_val, val_std, far = lfw.evaluate(emb_array,
                                                                       actual_issame,
                                                                       nrof_folds=10)
            log.info('Accuracy: %1.3f+-%1.3f' % (np.mean(test_accuracy), np.std(test_accuracy)))
            log.info('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (test_val, val_std, far))

    # megaface verification test
    elif args.test_data_type == 'MEGAFACE' and args.load_model_path is not None:
        pass

    return epoch + 1, epoch_loss, epoch_accuracy
示例#10
0
def run_training_net(self):
    """Run the training net once inside a 2000-second timeout guard."""
    guard = timeout_guard.CompleteInTimeOrDie(2000.0)
    with guard:
        net_name = self.train_model.net.Proto().name
        workspace.RunNet(net_name)
示例#11
0
 def run_testing_net(self):
     """Run the test net once inside a 2000-second timeout guard.

     Does nothing when no test model is configured.
     """
     if self.test_model is not None:
         with timeout_guard.CompleteInTimeOrDie(2000.0):
             net_name = self.test_model.net.Proto().name
             workspace.RunNet(net_name)
    def buildModelAndTrain(self, opts):
        """Build the model, run the full training loop and return the outputs.

        The method first logs the options and the source code of the
        user-supplied hook methods, then prepares the model and loads a
        checkpoint. For every epoch and every iteration it calls the
        per-iteration hooks around self.run_training_net(). Every
        opts['epoch_iter']['num_train_iteration_per_test'] iterations it
        records timing, updates the training metrics, runs the test net
        opts['epoch_iter']['num_test_iter'] times, updates the test metrics
        and logs a summary line. A checkpoint is written after each epoch.

        Args:
            opts: nested options mapping; this method reads
                opts['epoch_iter']['num_train_iteration_per_test'] and
                opts['epoch_iter']['num_test_iter'].

        Returns:
            Whatever self.assembleAllOutputs() returns.
        """
        # Log the configuration and the source of every overridable hook so a
        # run can be reconstructed from its log.
        log.info('in buildModelAndTrain, trainer_input: {}'.format(str(opts)))
        log.info("check type self: {}".format(type(self)))
        log.info("check self dir: {}".format(dir(self)))
        log.info("check self get_input_dataset methods: {}".format(
            inspect.getsource(self.get_input_dataset)))
        log.info("check self gen_input_builder_fun method: {}".format(
            inspect.getsource(self.gen_input_builder_fun)))
        log.info("check self gen_forward_pass_builder_fun method: {}".format(
            inspect.getsource(self.gen_forward_pass_builder_fun)))
        # Only one of the two is logged: the parameter-update builder when it
        # is set, otherwise the optimizer builder.
        if self.gen_param_update_builder_fun is not None:
            log.info(
                "check self gen_param_update_builder_fun method: {}".format(
                    inspect.getsource(self.gen_param_update_builder_fun)))
        else:
            log.info("check self gen_optimizer_fun method: {}".format(
                inspect.getsource(self.gen_optimizer_fun)))
        log.info("check self assembleAllOutputs method: {}".format(
            inspect.getsource(self.assembleAllOutputs)))

        # Model setup; the order of these calls is preserved as written.
        self.get_model_input_fun()

        self.init_model()

        self.planning_output()

        self.prep_data_parallel_models()

        self.loadCheckpoint()

        for epoch in self.list_of_epochs():

            log.info("start training epoch {}".format(epoch))

            self.fun_per_epoch_b4RunNet(epoch)

            for epoch_iter in self.list_of_epoch_iters():

                # The measured interval covers the training run and the two
                # surrounding per-iteration hooks.
                self.iter_start_time = time.time()

                self.fun_per_iter_b4RunNet(epoch, epoch_iter)
                self.run_training_net()
                self.fun_per_iter_aftRunNetB4Test(epoch, epoch_iter)

                self.iter_end_time = time.time()

                # Periodic evaluation and reporting.
                if (epoch_iter %
                        opts['epoch_iter']['num_train_iteration_per_test'] == 0
                    ):
                    secs_per_train = (self.iter_end_time -
                                      self.iter_start_time)
                    self.secs_per_train.append(secs_per_train)

                    sample_trained = self.total_batch_size
                    samples_per_sec = sample_trained / secs_per_train
                    self.samples_per_sec.append(samples_per_sec)

                    # Training progress expressed as a fractional epoch.
                    self.fract_epoch = (
                        epoch + float(epoch_iter) / self.epoch_iterations)
                    self.record_epochs.append(self.fract_epoch)

                    # Training metrics: add one sample and record the value.
                    for key in self.metrics:
                        metric = self.metrics[key]
                        if not metric['is_train']:
                            continue
                        metric['calculator'].Add()
                        metric['output'].append(metric['calculator'].Compute())

                    # NOTE(review): self.test_model is dereferenced here
                    # without a None check.
                    self.test_loop_start_time = time.time()
                    for _test_iter in range(
                            0, opts['epoch_iter']['num_test_iter']):
                        timeout = 2000.0
                        with timeout_guard.CompleteInTimeOrDie(timeout):
                            workspace.RunNet(self.test_model.net.Proto().name)
                        # Test metrics: add one sample per test iteration.
                        for key in self.metrics:
                            metric = self.metrics[key]
                            if metric['is_train']:
                                continue
                            metric['calculator'].Add()
                    self.test_loop_end_time = time.time()
                    self.sec_per_test_loop = \
                        self.test_loop_end_time - self.test_loop_start_time

                    # Test metrics are computed once, after the whole test
                    # loop.
                    for metric in self.metrics.values():
                        if metric['is_train']:
                            continue
                        metric['output'].append(metric['calculator'].Compute())

                    # NOTE(review): num_epochs is read from self.opts while
                    # the other options come from the opts argument -- confirm
                    # both refer to the same mapping.
                    logStr = 'epoch:{}/{} iter:{}/{} secs_per_train:{} '.format(
                        self.fract_epoch,
                        self.opts['epoch_iter']['num_epochs'], epoch_iter,
                        self.epoch_iterations, secs_per_train)
                    logStr += 'samples_per_sec:{} loop {} tests takes {} sec'.format(
                        samples_per_sec, opts['epoch_iter']['num_test_iter'],
                        self.sec_per_test_loop)
                    # Here `metric` is the metric's key and `value` its entry.
                    for metric, value in self.metrics.items():
                        logStr += ' {}:{} '.format(metric, value['output'][-1])
                    log.info('Iter Stats: {}'.format(logStr))

                self.fun_per_iter_aftRunNetAftTest(epoch, epoch_iter)

            self.checkpoint(epoch)

            self.fun_per_epoch_aftRunNet(epoch)

        self.fun_conclude_operator()

        self.createMetricsPlotsModelsOutputs()

        return self.assembleAllOutputs()
示例#13
0
    def testInputOrder(self):
        """Check that two models sharing blob names each see ordered data.

        A train and a validation model are fed through the same input blob
        names by separate worker sets. Each fetcher stamps an increasing
        counter into the first element of every blob, so consecutive net runs
        must observe values that grow by exactly one.
        """
        workspace.ResetWorkspace()
        # One counter per fetcher id; each produced batch carries the current
        # value and then increments it.
        self.counters = {0: 0, 1: 1}

        def dummy_fetcher_rnn_ordered1(fetcher_id, batch_size):
            # Hardcoding some input blobs
            steps = 20
            rows = batch_size
            width = 33
            data = np.zeros((steps, rows, width))
            data[0][0][0] = self.counters[fetcher_id]
            label = np.random.randint(rows, size=(steps, rows))
            label[0][0] = self.counters[fetcher_id]
            seq_lengths = np.random.randint(rows, size=(rows))
            seq_lengths[0] = self.counters[fetcher_id]
            self.counters[fetcher_id] += 1
            return [data, label, seq_lengths]

        def read_heads():
            # First element of each input blob, in data/label/seq order.
            return (
                workspace.FetchBlob('data2')[0][0][0],
                workspace.FetchBlob('label2')[0][0],
                workspace.FetchBlob('seq_lengths2')[0],
            )

        blob_names = ["data2", "label2", "seq_lengths2"]

        workspace.ResetWorkspace()
        model = model_helper.ModelHelper(name="rnn_test_order")

        coordinator = data_workers.init_data_input_workers(
            model,
            input_blob_names=list(blob_names),
            fetch_fun=dummy_fetcher_rnn_ordered1,
            batch_size=32,
            max_buffered_batches=1000,
            num_worker_threads=1,
            dont_rebatch=True,
            input_source_name='train')
        coordinator.start()

        val_model = model_helper.ModelHelper(name="rnn_test_order_val")
        coordinator1 = data_workers.init_data_input_workers(
            val_model,
            input_blob_names=list(blob_names),
            fetch_fun=dummy_fetcher_rnn_ordered1,
            batch_size=32,
            max_buffered_batches=1000,
            num_worker_threads=1,
            dont_rebatch=True,
            input_source_name='val')
        coordinator1.start()

        workspace.RunNetOnce(model.param_init_net)
        workspace.CreateNet(model.net)
        workspace.CreateNet(val_model.net)

        # Wait until the first coordinator reports at least 900 inputs.
        while coordinator._coordinators[0]._state._inputs < 900:
            time.sleep(0.01)

        with timeout_guard.CompleteInTimeOrDie(5):
            for current_model in (model, val_model):
                print(current_model.net.Proto().name)
                workspace.RunNet(current_model.net.Proto().name)
                previous = read_heads()

                # Run few rounds
                for _round in range(10):
                    workspace.RunNet(current_model.net.Proto().name)
                    current = read_heads()
                    for seen, before in zip(current, previous):
                        self.assertEqual(seen, before + 1)
                    previous = current

            time.sleep(0.2)

            self.assertTrue(coordinator.stop())
示例#14
0
def RunEpoch(args, epoch, train_model, test_model, explog,
             elapsed_training_time):
    """
    Run one training epoch on the training model, then measure accuracy on the test model.

    :param args: the script's parameters; this function reads epoch_count,
        epoch_size, test_epoch_size, batch_size, num_shards,
        per_device_optimization and target_accuracy
    :param epoch: the current epoch index (reported as ``epoch + 1``)
    :param train_model: the model on which training will be performed
    :param test_model: the model on which testing will be performed, or None
        to skip testing (both test accuracies are then reported as 0)
    :param explog: the experiment log object; its ``log()`` method receives
        this epoch's summary values
    :param elapsed_training_time: training time, in seconds, accumulated
        before this epoch
    :return: tuple ``(elapsed_training_time, on_target)`` where the first item
        includes the time spent running the training net during this epoch
        and the second tells whether the top-1 test accuracy reached
        ``args.target_accuracy``
    """
    log.info("Starting epoch {}/{}".format(epoch + 1, args.epoch_count))
    epoch_iters = int(args.epoch_size / args.batch_size / args.num_shards)
    test_epoch_iters = int(args.test_epoch_size / args.batch_size /
                           args.num_shards)
    # Training metrics are only read from the first device of the model.
    prefix = "{}_{}".format(train_model._device_prefix,
                            train_model._devices[0])

    total_time = 0.

    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 600 if i == 0 else 300
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1
            total_time += dt

        # Log the time it took to run the current batch
        fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
        log.info(
            fmt.format(i + 1, epoch_iters, epoch + 1, args.batch_size / dt))

        # Get the accuracy and loss for this particular device
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')

        # Write the training loss and accuracy for this batch
        log.info("Training loss: {}, accuracy: {}".format(loss, accuracy))

    # Compute the total number of images processed up to and including this
    # epoch; get the final accuracy and loss of the epoch
    num_images = (epoch + 1) * epoch_iters * args.batch_size
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')

    try:
        learning_rate = workspace.FetchBlob(
            (prefix if args.per_device_optimization else '') +
            data_parallel_model.GetLearningRateBlobNames(train_model)[0])
    except AttributeError:
        log.error(
            "The learning rate could not be found on this peer; this is likely due to the "
            "--per_device_optimization=True option.")
        learning_rate = 'unknown'

    # Prepare the parameters required for testing
    test_accuracy = 0
    test_accuracy_top5 = 0
    if test_model is not None:

        ntests = 0
        for _ in range(test_epoch_iters):
            workspace.RunNet(test_model.net.Proto().name)

            # Aggregate the accuracy across all the devices involved in testing
            for g in test_model._devices:
                device_prefix = "{}_{}".format(test_model._device_prefix, g)
                # ndarray.item() replaces the removed np.asscalar() helper.
                test_accuracy += np.asarray(
                    workspace.FetchBlob(device_prefix + '/accuracy')).item()
                test_accuracy_top5 += np.asarray(
                    workspace.FetchBlob(
                        device_prefix + '/accuracy_top5')).item()
                ntests += 1

        # Compute the average test_accuracy and the average top-5 test accuracy across
        # a test epoch, and across all devices involved in it. With no test
        # iterations or no devices there is nothing to average, so the
        # accuracies stay at 0 instead of raising ZeroDivisionError.
        if ntests:
            test_accuracy /= ntests
            test_accuracy_top5 /= ntests

    # Log the results to stdout, update total training time
    elapsed_training_time += total_time
    on_target = test_accuracy >= args.target_accuracy
    summary_fmt = (
        "Finished testing on epoch {}. Obtained:\n"
        "Accuracy (Local - Training): {}\n"
        "Loss (Local - Training): {}\n"
        "Top-1 Acc: {}\n"
        "Top-5 Acc: {}\n"
        "On target: {}\n"
        " Elapsed training time: {}")
    log.info(summary_fmt.format(epoch + 1, accuracy, loss, test_accuracy,
                                test_accuracy_top5, on_target,
                                elapsed_training_time))

    # Log this epoch's results
    explog.log(input_count=num_images,
               batch_count=((epoch + 1) * epoch_iters),
               additional_values={
                   'accuracy': accuracy,
                   'loss': loss,
                   'learning_rate': learning_rate,
                   'epoch': epoch + 1,
                   'top1_test_accuracy': test_accuracy,
                   'top5_test_accuracy': test_accuracy_top5,
                   'target_accuracy': args.target_accuracy,
                   'on_target': on_target,
                   'elapsed_training_time': elapsed_training_time,
               })

    # Sanity check on the last fetched training loss.
    assert loss < 40, "Exploded gradients"

    return elapsed_training_time, on_target
示例#15
0
def _mean_test_accuracy(test_model, num_iters):
    '''
    Run the test net num_iters times and return the '/accuracy' blob value
    averaged over every run and every device; 0 when nothing was sampled.
    '''
    total = 0
    samples = 0
    for _ in range(num_iters):
        workspace.RunNet(test_model.net.Proto().name)
        for g in test_model._devices:
            # ndarray.item() replaces the removed np.asscalar() helper.
            total += np.asarray(
                workspace.FetchBlob("gpu_{}".format(g) + '/accuracy')).item()
            samples += 1
    return total / samples if samples else 0


def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
):
    '''
    Run one epoch of the trainer.

    Every 20 training iterations the averaged training loss/accuracy are
    logged and appended to the r_loss / r_train_accuracy collectors (defined
    outside this function); if a test model is given, a 20-iteration test
    accuracy is appended to r_test_accuracy as well. After the epoch a
    100-iteration test pass is run and the summary is written to explog.

    args: script parameters; num_epochs and epoch_size are read here.
    epoch: index of the current epoch.
    train_model: model whose net is run for training.
    test_model: model whose net is run for testing, or None to skip testing
        (the reported test accuracy is then -1).
    total_batch_size: batch size used to derive the iteration count and the
        images/sec figure.
    num_shards: number of shards the epoch is divided across.
    expname: unused in this function.
    explog: experiment log object; its log() method receives the summary.

    Returns the index of the next epoch (epoch + 1).

    TODO: add checkpointing here.
    '''
    # TODO: add loading from checkpoint
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
    train_accuracy = 0
    train_loss = 0
    display_count = 20
    # Training metrics are only read from the first device of the model.
    prefix = "gpu_{}".format(train_model._devices[0])
    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 600.0 if i == 0 else 60.0
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1
        train_accuracy += workspace.FetchBlob(prefix + '/accuracy')
        train_loss += workspace.FetchBlob(prefix + '/loss')
        if (i + 1) % display_count == 0:
            fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
            log.info(
                fmt.format(i + 1, epoch_iters, epoch, total_batch_size / dt))
            train_fmt = "Training loss: {}, accuracy: {}"
            log.info(
                train_fmt.format(train_loss / display_count,
                                 train_accuracy / display_count))

            r_train_accuracy.append(train_accuracy / display_count)
            r_loss.append(train_loss / display_count)

            # Restart the running sums for the next display window.
            train_accuracy = 0
            train_loss = 0

            # test_model is optional (see the end-of-epoch handling below),
            # so the periodic evaluation must tolerate None as well.
            if test_model is not None:
                r_test_accuracy.append(_mean_test_accuracy(test_model, 20))

    num_images = epoch * epoch_iters * total_batch_size
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')
    # NOTE(review): the learning-rate blob name is hard-coded; presumably it
    # matches the optimizer configured by the caller - confirm.
    learning_rate = workspace.FetchBlob('SgdOptimizer_0_lr_gpu0')
    if test_model is not None:
        # Run 100 iters of testing
        test_accuracy = _mean_test_accuracy(test_model, 100)
        r_test_accuracy.append(test_accuracy)
    else:
        test_accuracy = -1
    test_fmt = "Testing accuracy: {}"
    log.info(test_fmt.format(test_accuracy))

    # Index of the last training iteration of this epoch. Derived from
    # epoch_iters rather than the loop variable so it is defined even when
    # the loop body never ran.
    last_iter = max(epoch_iters - 1, 0)
    explog.log(input_count=num_images,
               batch_count=(last_iter + epoch * epoch_iters),
               additional_values={
                   'accuracy': accuracy,
                   'loss': loss,
                   'learning_rate': learning_rate,
                   'epoch': epoch,
                   'test_accuracy': test_accuracy,
               })
    # Sanity check on the last fetched training loss.
    assert loss < 40, "Exploded gradients :("

    # TODO: add checkpointing
    return epoch + 1