Example #1
def cpu_sk(self):
    """ Sinkhorn Knopp optimization on CPU
        * stores activations to RAM
        * does matrix-vector multiplies on CPU
        * slower than GPU
    """
    # 1. aggregate inputs:
    N = len(self.pseudo_loader.dataset)
    if self.num_heads == 1:
        self.PS = np.zeros((N, self.num_clusters_per_head), dtype=self.dtype)
    else:
        self.PS_pre = np.zeros((N, self.presize), dtype=self.dtype)
    aggreg_start = now = time.time()
    l_dl = len(self.pseudo_loader)
    batch_time = MovingAverage(intertia=0.9)
    self.model.headcount = 1
    for batch_idx, (data, _, _selected) in enumerate(self.pseudo_loader):
        data = data.to(self.device)
        mass = data.size(0)
        if self.num_heads == 1:
            p = nn.functional.softmax(self.model(data), 1)
            self.PS[_selected, :] = p.detach().cpu().numpy().astype(self.dtype)
        else:
            p = self.model(data)
            self.PS_pre[_selected, :] = p.detach().cpu().numpy().astype(self.dtype)
        batch_time.update(time.time() - now)
        now = time.time()
        if batch_idx % 50 == 0:
            print(f"Aggregating batch {batch_idx:03}/{l_dl}, speed: {mass / batch_time.avg:04.1f}Hz",
                  end='\r', flush=True)
    self.model.headcount = self.num_heads
    print("Aggreg of outputs  took {0:.2f} min".format((time.time() - now) / 60.), flush=True)

    # 2. solve label assignment via sinkhorn-knopp:
    if self.num_heads == 1:
        optimize_L_sk(self, nh=0)
    else:
        for nh in range(self.num_heads):
            print(f"computing head {nh} ", end="\r", flush=True)
            tl = getattr(self.model, f"top_layer{nh:d}")
            time_mat = time.time()

            # clear memory
            try:
                del self.PS
            except AttributeError:
                pass

            # apply last FC layer (a matmul and adding of bias)
            self.PS = (self.PS_pre @ tl.weight.cpu().numpy().T.astype(self.dtype)
                       + tl.bias.cpu().numpy().astype(self.dtype))
            print(f"matmul took {(time.time() - time_mat) / 60:.2f}min", flush=True)
            self.PS = py_softmax(self.PS, 1)
            optimize_L_sk(self, nh=nh)
    return
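For reference, `py_softmax` and `optimize_L_sk` are defined elsewhere in this repo. As a rough sketch of what the Sinkhorn-Knopp step does (assuming uniform marginals over samples and clusters, i.e. the equal-size-cluster setting this method targets), the core loop alternates row and column normalizations of the (N x K) matrix. The functions below are illustrative, not the repo's implementation:

import numpy as np

def py_softmax(x, axis=1):
    # numerically stable softmax along `axis`
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def sinkhorn_knopp_labels(PS, n_iters=50, eps=1e-12):
    # Alternately rescale columns and rows of the (N x K) matrix toward
    # uniform marginals (each cluster gets mass 1/K, each sample mass 1/N),
    # then take a hard assignment per sample.
    N, K = PS.shape
    PS = PS / PS.sum()                              # start from a joint distribution
    for _ in range(n_iters):
        PS /= PS.sum(axis=0, keepdims=True) + eps   # each column ...
        PS /= K                                     # ... gets mass 1/K
        PS /= PS.sum(axis=1, keepdims=True) + eps   # each row ...
        PS /= N                                     # ... gets mass 1/N
    return PS.argmax(axis=1)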
Example #2
def aggreg_multi_gpu(model,
                     dataloader,
                     hc,
                     dim,
                     TYPE=torch.float64,
                     model_gpus=1):
    """"Accumulate activations and save them on multiple GPUs
        * this function assumes the model is on the first `model_gpus` GPUs
          so that it can write the activations on the remaining ones
        * it splits the activations evenly between the remaining GPUs
    """
    # number of gpus to store
    ngpu_store = torch.cuda.device_count() - model_gpus

    # number of batches in DL
    l_dl = len(dataloader)

    # number of batches each gpu gets
    batches_per_gpu = l_dl // ngpu_store

    # number of data each gpu gets
    points_per_gpu = batches_per_gpu * dataloader.batch_size

    # empty array of indices that we need to keep track of
    indices = torch.empty(len(dataloader.dataset), dtype=torch.long)

    # set up matrix PS: (N x K) when using one head, otherwise N x D, where D is the dim before the last FC layer.
    PS = [
        torch.empty(points_per_gpu, dim, device='cuda:' + str(i), dtype=TYPE)
        for i in range(model_gpus, model_gpus + ngpu_store - 1)
    ]
    # accommodate the remainder on the last storage GPU
    PS.append(
        torch.empty(len(dataloader.dataset) -
                    (ngpu_store - 1) * points_per_gpu,
                    dim,
                    device='cuda:' + str(model_gpus + ngpu_store - 1),
                    dtype=TYPE))

    # slice sizes, i.e. how many activations will be on the gpus
    slices = [qq.shape[0] for qq in PS]
    print("slice sizes: ", slices, flush=True)
    batch_time = MovingAverage(intertia=0.9)
    now = time.time()
    st = 0
    softmax = torch.nn.Softmax(dim=1).to('cuda:0')

    # switch the model so it returns the last-FC output for one head and the pre-last activations for multiple heads
    model.headcount = 1
    for batch_idx, (data, _, _selected) in enumerate(dataloader):
        data = data.to(torch.device('cuda:0'))
        mass = data.size(0)
        en = st + mass
        # j keeps track of which part of PS we're writing to
        j = min((batch_idx // batches_per_gpu), ngpu_store - 1)
        subs = j * points_per_gpu
        if hc == 1:
            p = softmax(model(data)).detach().to(TYPE)
            # when using one head: save softmax (N x K) matrix:
            PS[j][st - subs:en - subs, :].copy_(p)
        else:
            # when using multiple heads: save pre-last-layer activations (N x D) matrix
            PS[j][st - subs:en - subs, :].copy_(model(data).detach())
        indices[st:en].copy_(_selected)
        st = en
        batch_time.update(time.time() - now)
        now = time.time()
        if batch_idx % 50 == 0:
            print(
                f"Aggregating batch {batch_idx:03}/{l_dl}, speed: {mass / batch_time.avg:04.1f}Hz. To rGPU {j + 1}",
                end='\r',
                flush=True)
    torch.cuda.synchronize()  # just in case
    return PS, indices
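`MovingAverage` is not shown in these snippets; note that `intertia` is the keyword they actually pass (the misspelling is preserved so the calls above keep working). A minimal sketch consistent with the `.update(value)` / `.avg` usage, assuming an exponential moving average (Examples #3 and #5 use same-named classes with `.add()`-style interfaces, so this is only an assumption about this repo's util):

class MovingAverage:
    # Exponential moving average; a sketch, not the repo's util.
    def __init__(self, intertia=0.9):
        self.intertia = intertia
        self.avg = None

    def update(self, value):
        if self.avg is None:        # first observation seeds the average
            self.avg = value
        else:
            self.avg = self.intertia * self.avg + (1 - self.intertia) * value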
Example #3
def train(max_iter, snapshot, dataset, setname, mu, lr, bs, tfmodel_folder,
          conv5, model_name, stop_iter, pre_emb=False):
    iters_per_log = 100
    data_folder = './' + dataset + '/' + setname + '_batch/'
    data_prefix = dataset + '_' + setname
    snapshot_file = os.path.join(tfmodel_folder, dataset + '_iter_%d.tfmodel')
    if not os.path.isdir(tfmodel_folder):
        os.makedirs(tfmodel_folder)

    cls_loss_avg = 0
    avg_accuracy_all, avg_accuracy_pos, avg_accuracy_neg = 0, 0, 0
    decay = 0.99
    vocab_size = 8803 if dataset == 'referit' else 12112
    emb_name = 'referit' if dataset == 'referit' else 'Gref'

    if pre_emb:
        print("Use pretrained Embeddings.")
        model = get_segmentation_model(model_name, mode='train',
                                       vocab_size=vocab_size, start_lr=lr,
                                       batch_size=bs, conv5=conv5, emb_name=emb_name)
    else:
        model = get_segmentation_model(model_name, mode='train',
                                       vocab_size=vocab_size, start_lr=lr,
                                       batch_size=bs, conv5=conv5)

    weights = './data/weights/deeplab_resnet_init.ckpt'
    print("Loading pretrained weights from {}".format(weights))
    load_var = {var.op.name: var for var in tf.global_variables()
                if var.name.startswith('res') or var.name.startswith('bn') or var.name.startswith('conv1')}

    snapshot_loader = tf.train.Saver(load_var)
    snapshot_saver = tf.train.Saver(max_to_keep=4)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    snapshot_loader.restore(sess, weights)

    im_h, im_w, num_steps = model.H, model.W, model.num_steps
    text_batch = np.zeros((bs, num_steps), dtype=np.float32)
    image_batch = np.zeros((bs, im_h, im_w, 3), dtype=np.float32)
    mask_batch = np.zeros((bs, im_h, im_w, 1), dtype=np.float32)
    valid_idx_batch = np.zeros((bs, 1), dtype=np.int32)

    reader = data_reader.DataReader(data_folder, data_prefix)

    # timing
    last_time = time.time()
    time_avg = MovingAverage()
    for n_iter in range(max_iter):

        for n_batch in range(bs):
            batch = reader.read_batch(is_log=(n_batch == 0 and n_iter % iters_per_log == 0))
            text = batch['text_batch']
            im = batch['im_batch'].astype(np.float32)
            mask = np.expand_dims(batch['mask_batch'].astype(np.float32), axis=2)

            im = im[:, :, ::-1]
            im -= mu

            text_batch[n_batch, ...] = text
            image_batch[n_batch, ...] = im
            mask_batch[n_batch, ...] = mask

            for idx in range(text.shape[0]):
                if text[idx] != 0:
                    valid_idx_batch[n_batch, :] = idx
                    break

        _, cls_loss_val, lr_val, scores_val, label_val = sess.run([model.train_step,
                                                                   model.cls_loss,
                                                                   model.learning_rate,
                                                                   model.pred,
                                                                   model.target],
                                                                  feed_dict={
                                                                      model.words: text_batch,
                                                                      model.im: image_batch,
                                                                      model.target_fine: mask_batch,
                                                                      model.valid_idx: valid_idx_batch
                                                                  })
        cls_loss_avg = decay * cls_loss_avg + (1 - decay) * cls_loss_val

        # Accuracy
        accuracy_all, accuracy_pos, accuracy_neg = compute_accuracy(scores_val, label_val)
        avg_accuracy_all = decay * avg_accuracy_all + (1 - decay) * accuracy_all
        avg_accuracy_pos = decay * avg_accuracy_pos + (1 - decay) * accuracy_pos
        avg_accuracy_neg = decay * avg_accuracy_neg + (1 - decay) * accuracy_neg

        # timing
        cur_time = time.time()
        elapsed = cur_time - last_time
        last_time = cur_time

        if n_iter % iters_per_log == 0:
            print('iter = %d, loss (cur) = %f, loss (avg) = %f, lr = %f'
                  % (n_iter, cls_loss_val, cls_loss_avg, lr_val))
            print('iter = %d, accuracy (cur) = %f (all), %f (pos), %f (neg)'
                  % (n_iter, accuracy_all, accuracy_pos, accuracy_neg))
            print('iter = %d, accuracy (avg) = %f (all), %f (pos), %f (neg)'
                  % (n_iter, avg_accuracy_all, avg_accuracy_pos, avg_accuracy_neg))
            time_avg.add(elapsed)
            print('iter = %d, cur time = %.5f, avg time = %.5f, model_name: %s' % (n_iter, elapsed, time_avg.get_avg(), model_name))

        # Save snapshot
        if (n_iter + 1) % snapshot == 0 or (n_iter + 1) >= max_iter:
            snapshot_saver.save(sess, snapshot_file % (n_iter + 1))
            print('snapshot saved to ' + snapshot_file % (n_iter + 1))
        if (n_iter + 1) >= stop_iter:
            print('stop training at iter ' + str(stop_iter))
            break

    print('Optimization done.')
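`compute_accuracy` is not defined in this snippet. Assuming `scores_val` holds per-pixel logits and `label_val` binary masks (which is how `model.pred` and `model.target` are used above), a plausible sketch returns overall, positive-region, and negative-region accuracy:

import numpy as np

def compute_accuracy(scores, labels):
    # Sketch of the all/pos/neg accuracies logged above; assumes logits
    # (score > 0 means foreground) and {0, 1} target masks.
    pred = (scores > 0)
    labels = (labels > 0.5)
    correct = (pred == labels)
    acc_all = correct.mean()
    pos, neg = labels, ~labels
    acc_pos = correct[pos].mean() if pos.any() else 0.0
    acc_neg = correct[neg].mean() if neg.any() else 0.0
    return acc_all, acc_pos, acc_neg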
Example #4
    def optimize_epoch(self,
                       model,
                       criterion,
                       optimizer,
                       loader,
                       epoch,
                       is_validation=False):
        top1 = []
        top5 = []
        loss_value = []
        for i in range(len(model.probes)):
            top1.append(TotalAverage())
            top5.append(TotalAverage())
            loss_value.append(TotalAverage())
        batch_time = MovingAverage(intertia=0.9)
        now = time.time()

        if is_validation is False:
            model.train()
            lr = self.lr_schedule(epoch)
            for pg in optimizer.param_groups:
                pg['lr'] = lr
            print(f"Starting epoch {epoch} with learning rate {lr}")
        else:
            model.eval()
        for iter, (input, label) in enumerate(loader):
            input = input.to('cuda:0')
            label = label.to('cuda:0')
            mass = input.size(0)
            total_loss = None
            if args.data in ['Imagenet', 'Places'] and is_validation and args.tencrops:
                # fold the ten crops into the batch dimension
                bs, ncrops, c, h, w = input.size()
                input = input.view(-1, c, h, w)

            predictions = model(input)
            if args.data in ['Imagenet', 'Places'] and is_validation and args.tencrops:
                # average predictions over the ten crops
                predictions = [
                    torch.squeeze(p.view(bs, ncrops, -1).mean(1))
                    for p in predictions
                ]
            for i, prediction in enumerate(predictions):
                loss = criterion(prediction, label)
                if total_loss is None:
                    total_loss = loss
                else:
                    total_loss = total_loss + loss
                top1_, top5_ = accuracy(prediction, label, topk=(1, 5))
                top1[i].update(top1_.item(), mass)
                top5[i].update(top5_.item(), mass)
                loss_value[i].update(loss.item(), mass)

            if is_validation is False:
                optimizer.zero_grad()
                total_loss.backward()
                optimizer.step()

            batch_time.update(time.time() - now)
            now = time.time()

        top1_str = 'top1 val' if is_validation else 'top1 train'
        top5_str = 'top5 val' if is_validation else 'top5 train'
        writer.add_scalars(
            top1_str,
            {f"depth_{k+1}": top1[k].avg
             for k in range(len(model.probes))}, epoch)
        writer.add_scalars(
            top5_str,
            {f"depth_{k+1}": top5[k].avg
             for k in range(len(model.probes))}, epoch)
        writer.add_scalars('losses', {
            f"depth_{k+1}": loss_value[k].avg
            for k in range(len(model.probes))
        }, epoch)
        print('VAL:' if is_validation else 'TRAIN:')
        for i in range(len(model.probes)):
            print(
                f" [{i}] t1:{top1[i].avg:04.2f} loss:{loss_value[i].avg:.2f}",
                end='')
        print()

        return {
            "loss": [x.avg for x in loss_value],
            "top1": [x.avg for x in top1],
            "top5": [x.avg for x in top5]
        }
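`TotalAverage` tracks a mass-weighted running mean, given the `update(value, mass)` / `.avg` calls above; a minimal sketch under that assumption:

class TotalAverage:
    # Mass-weighted running average; a sketch, not the repo's class.
    def __init__(self):
        self.total = 0.0
        self.mass = 0

    def update(self, value, mass=1):
        self.total += value * mass
        self.mass += mass

    @property
    def avg(self):
        return self.total / max(self.mass, 1)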
Example #5
def train(method, environment, resume, episodes, lr, lr_episodes, min_lr,
          eval_only, replay_width, batch_size, gamma, update_rate,
          save_interval):

    history = History(method + '_' + environment,
                      ['steps', 'avg_reward', 'loss'], resume is not None)
    history.flush()
    memory = ReplayMemory(replay_width)
    game = Game(name=environments_to_names[environment],
                memory=memory,
                render=False)
    init_state, state_shape = game.get_state(True)
    n_actions = game.env.action_space.n
    agent_cls = agent_factory[method]
    agent = agent_cls(state_shape,
                      n_actions,
                      environment,
                      episodes,
                      update_rate,
                      step_size=lr_episodes,
                      lr=lr,
                      save_interval=save_interval)

    # resume from a ckpt
    if resume is not None:
        agent.load(resume)

    avg_reward = MovingAverage(100)
    avg_loss = MovingAverage(100)

    log.info(f'Training for {episodes} episodes, starting ...')

    # main training loop
    for i in range(episodes):
        state = game.reset()
        done = False
        loss = None
        while not done:
            state = game.state
            action = agent.select_action(state)

            transition, done = game.step(int(action.to('cpu').numpy()))

            if len(memory) > batch_size:
                batched = memory.sample(batch_size)
                loss = agent.train(batched, batch_size, gamma, i)
                avg_loss.add(loss)
        reward = game.rewards
        # agent.save_best(reward)
        agent.save()
        agent.scheduler.step()
        avg_reward.add(reward)

        # moving averages
        text = [
            f'steps: {agent.step_cnt}',
            f'game epochs: {i}/{episodes}',
            f'train loss: {float(avg_loss):.5}',
            f'avg reward: {float(avg_reward):.5}',
            # f'best reward: {float(agent.best_reward):.5}',
            f'reward: {float(reward):.5}',
            f'epsilon: {agent.epsilon:.3}',
        ]
        log.info(', '.join(text), update=True)
        if agent.step_cnt % save_interval == 0:
            history.record({
                'steps': agent.step_cnt,
                'avg_reward': float(avg_reward),
                'loss': float(avg_loss),
            })

    game.env.close()
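`ReplayMemory` is external to this snippet; the loop only relies on `len(memory)` and `memory.sample(batch_size)`, with `game.step` presumably pushing transitions into it. A minimal sketch of that interface:

import random
from collections import deque

class ReplayMemory:
    # Fixed-capacity FIFO buffer with uniform sampling; a sketch of the
    # interface used above, not the actual class.
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size):
        return random.sample(list(self.buffer), batch_size)

    def __len__(self):
        return len(self.buffer)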
Example #6
    def optimize_epoch(self,
                       model,
                       criterion,
                       optimizer,
                       loader,
                       epoch,
                       is_validation=False):
        top1 = []
        top5 = []
        loss_value = []
        top1.append(TotalAverage())
        top5.append(TotalAverage())
        loss_value.append(TotalAverage())
        batch_time = MovingAverage(intertia=0.9)
        now = time.time()

        if is_validation is False:
            model.train()
            lr = self.lr_schedule(epoch)
            for pg in optimizer.param_groups:
                pg['lr'] = lr
            print("Starting epoch %s" % epoch)
        else:
            model.eval()
        l_dl = len(loader)
        for iter, q in enumerate(loader):
            if len(q) == 3:
                input, label, _s = q
            else:
                input, label = q
            input = input.to(self.dev)
            label = label.to(self.dev)
            mass = input.size(0)
            if is_validation and args.tencrops:
                # fold the ten crops into the batch and average their predictions
                bs, ncrops, c, h, w = input.size()
                input = input.view(-1, c, h, w)
                predictions = model(input)
                predictions = torch.squeeze(
                    predictions.view(bs, ncrops, -1).mean(1))
            else:
                predictions = model(input)

            loss = criterion(predictions, label)
            top1_, top5_ = accuracy(predictions, label, topk=(1, 5))
            top1[0].update(top1_.item(), mass)
            top5[0].update(top5_.item(), mass)
            loss_value[0].update(loss.item(), mass)

            if is_validation is False:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            batch_time.update(time.time() - now)
            now = time.time()
            if iter % 50 == 0:
                print(
                    f"{'V' if is_validation else 'T'} Loss: {loss_value[0].avg:03.3f} "
                    f"Top1: {top1[0].avg:03.1f} Top5: {top5[0].avg:03.1f} "
                    f"{epoch: 3}/{iter:05}/{l_dl:05} Freq: {mass / batch_time.avg:04.1f}Hz:",
                    end='\r',
                    flush=True)
        if is_validation:
            print("validation")
            print("val-top1: %s" % top1[0].avg)
            print("val-top5: %s" % top5[0].avg)
        if self.writer:
            str_ = 'LP/val' if is_validation else 'LP/train'
            self.writer.add_scalar(f'{str_}/top1', top1[0].avg, epoch)
            self.writer.add_scalar(f'{str_}/top5', top5[0].avg, epoch)
            self.writer.add_scalar(f'{str_}/Freq', mass / batch_time.avg,
                                   epoch)

        return {
            "loss": [x.avg for x in loss_value],
            "top1": [x.avg for x in top1],
            "top5": [x.avg for x in top1]
        }
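The `accuracy(prediction, label, topk=(1, 5))` helper used in Examples #4 and #6 is not shown; it presumably follows the usual top-k precision recipe (returning percentages, which the callers read with `.item()`). A sketch:

import torch

def accuracy(output, target, topk=(1,)):
    # Standard top-k precision: count targets that appear among the k
    # highest-scoring classes, returned as percentages per k.
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()                                      # (maxk, batch)
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res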
Example #7
def train(max_iter,
          snapshot,
          dataset,
          data_dir,
          setname,
          mu,
          lr,
          bs,
          tfmodel_folder,
          conv5,
          model_name,
          stop_iter,
          last_iter,
          pre_emb=False,
          finetune=False,
          pretrain_path='',
          emb_dir=''):
    global args
    iters_per_log = 100
    data_folder = os.path.join(data_dir, dataset + '/' + setname + '_batch/')
    data_prefix = dataset + '_' + setname
    snapshot_file = os.path.join(tfmodel_folder, dataset + '_finetune')
    if not os.path.isdir(tfmodel_folder):
        os.makedirs(tfmodel_folder)

    cls_loss_avg = 0
    avg_accuracy_all, avg_accuracy_pos, avg_accuracy_neg = 0, 0, 0
    decay = 0.99
    vocab_size = 8803 if dataset == 'referit' else 1917498
    emb_name = dataset

    if pre_emb:
        print("Use pretrained Embeddings.")
        model = get_segmentation_model(model_name,
                                       mode='train',
                                       vocab_size=vocab_size,
                                       start_lr=lr,
                                       batch_size=bs,
                                       conv5=conv5,
                                       emb_name=emb_name,
                                       emb_dir=emb_dir,
                                       freeze_bn=args.freeze_bn,
                                       is_aug=args.is_aug)
    else:
        model = get_segmentation_model(model_name,
                                       mode='train',
                                       vocab_size=vocab_size,
                                       start_lr=lr,
                                       batch_size=bs,
                                       conv5=conv5)
    if finetune:
        weights = pretrain_path
        snapshot_loader = tf.train.Saver()
    else:
        weights = './data/weights/deeplab_resnet_init.ckpt'
        print("Loading pretrained weights from {}".format(weights))
        load_var = {
            var.op.name: var
            for var in tf.global_variables()
            if var.name.startswith('res') or var.name.startswith('bn')
            or var.name.startswith('conv1') or var.name.startswith('Adam')
        }
        snapshot_loader = tf.train.Saver(load_var)

    snapshot_saver = tf.train.Saver(max_to_keep=4)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    snapshot_loader.restore(sess, weights)
    # Log tensorboard
    train_writer = tf.summary.FileWriter(args.log_dir + '/train', sess.graph)

    im_h, im_w, num_steps = model.H, model.W, model.num_steps
    text_batch = np.zeros((bs, num_steps), dtype=np.float32)
    image_batch = np.zeros((bs, im_h, im_w, 3), dtype=np.float32)
    mask_batch = np.zeros((bs, im_h, im_w, 1), dtype=np.float32)
    seq_len_batch = np.zeros(bs, dtype=np.int32)
    valid_idx_batch = np.zeros(bs, dtype=np.int32)

    if dataset == 'refvos':
        reader = data_reader_refvos.DataReader(im_dir=args.im_dir,
                                               mask_dir=args.mask_dir,
                                               train_metadata=args.meta)
    else:
        raise ValueError('no data reader is wired up here for dataset ' + dataset)

    # timing
    last_time = time.time()
    time_avg = MovingAverage()
    meanIoU = 0
    last_epoch = (last_iter * bs) // reader.num_batch
    for n_iter in range(last_iter + 1, max_iter):
        for n_batch in range(bs):
            batch = reader.read_batch(
                is_log=(n_batch == 0 and n_iter % iters_per_log == 0))
            text = batch['text_batch']
            im = batch['im_batch'].astype(np.float32)
            mask = np.expand_dims(batch['mask_batch'].astype(np.float32),
                                  axis=2)
            seq_len = batch['seq_length']
            im = im[:, :, ::-1]
            im -= mu

            text_batch[n_batch, ...] = text
            image_batch[n_batch, ...] = im
            mask_batch[n_batch, ...] = mask
            seq_len_batch[n_batch] = seq_len

        _, train_step, summary = sess.run(
            [
                model.train,
                model.train_step,
                model.merged,
            ],
            feed_dict={
                model.words: text_batch,
                model.im: image_batch,
                model.target_fine: mask_batch,
                model.seq_len: seq_len_batch,
            })
        # (the per-iteration loss/accuracy/meanIoU logging from Example #3 is
        # commented out in this finetuning variant and omitted here)
        # timing
        cur_time = time.time()
        elapsed = cur_time - last_time
        last_time = cur_time
        train_writer.add_summary(summary, train_step)

        # Save snapshot
        if (n_iter * bs // reader.num_batch > last_epoch):
            last_epoch += 1
            snapshot_saver.save(sess, snapshot_file, global_step=train_step)
            print('snapshot saved at iteration {}'.format(n_iter))
        if (n_iter + 1) % snapshot == 0 or (n_iter + 1) >= max_iter:
            snapshot_saver.save(sess, snapshot_file, global_step=train_step)
            print('snapshot saved at iteration {}'.format(n_iter))
        if (n_iter + 1) >= stop_iter:
            print('stop training at iter ' + str(stop_iter))
            break

    print('Optimization done.')
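The epoch-boundary snapshot above fires whenever the cumulative sample count `n_iter * bs` crosses a multiple of `reader.num_batch` (treating that attribute as the number of training samples). A tiny worked example with hypothetical numbers:

# hypothetical numbers: batch size 16, 32000 samples per epoch
bs, num_samples = 16, 32000
last_epoch = 0
for n_iter in range(1, 6001):
    if n_iter * bs // num_samples > last_epoch:   # crossed an epoch boundary
        last_epoch += 1
        print(f'snapshot at iter {n_iter}')       # fires at 2000, 4000, 6000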
Example #8
    def __init__(self):
        self.avg_reward = MovingAverage(100)
        self.avg_loss = MovingAverage(100)
Example #9
    def optimize_epoch(self, model, optimizer, loader, epoch, validation=False):
        print(f"Starting epoch {epoch}, validation: {validation} " + "=" * 30)
        loss_value = AverageMeter()
        rotacc_value = AverageMeter()

        # house keeping
        if not validation:
            model.train()
            lr = self.lr_schedule(epoch)
            for pg in optimizer.param_groups:
                pg['lr'] = lr
        else:
            model.eval()

        XE = torch.nn.CrossEntropyLoss().to(self.dev)
        l_dl = len(loader)
        now = time.time()
        batch_time = MovingAverage(intertia=0.9)
        for iter, (data, label, selected) in enumerate(loader):
            now = time.time()

            if not validation:
                niter = epoch * len(loader.dataset) + iter * args.batch_size
            data = data.to(self.dev)
            mass = data.size(0)
            where = np.arange(mass, dtype=int) * 4
            data = data.view(mass * 4, 3, data.size(3), data.size(4))
            rotlabel = torch.tensor(range(4)).view(-1, 1).repeat(mass, 1).view(-1).to(self.dev)
            #################### train CNN ###########################################
            if not validation:
                final = model(data)
                if args.onlyrot:
                    loss = torch.Tensor([0]).to(self.dev)
                else:
                    if args.hc == 1:
                        loss = XE(final[0][where], self.L[selected])
                    else:
                        loss = torch.mean(
                            torch.stack([XE(final[k][where], self.L[k, selected]) for k in range(args.hc)]))
                rotloss = XE(final[-1], rotlabel)
                pred = torch.argmax(final[-1], 1)

                total_loss = loss + rotloss
                optimizer.zero_grad()
                total_loss.backward()
                optimizer.step()
                correct = (pred == rotlabel).to(torch.float)
                rotacc = correct.sum() / float(mass)
            else:
                final = model(data)
                pred = torch.argmax(final[-1], 1)
                correct = (pred == rotlabel.cuda()).to(torch.float)
                rotacc = correct.sum() / float(mass)
                total_loss = torch.Tensor([0])
                loss = torch.Tensor([0])
                rotloss = torch.Tensor([0])
            rotacc_value.update(rotacc.item(), mass)
            loss_value.update(total_loss.item(), mass)

            batch_time.update(time.time() - now)
            now = time.time()
            print(
                f"Loss: {loss_value.avg:03.3f}, RotAcc: {rotacc_value.avg:03.3f} | {epoch: 3}/{iter:05}/{l_dl:05} Freq: {mass / batch_time.avg:04.1f}Hz:",
                end='\r', flush=True)

            # every few iter logging
            if iter % args.logiter == 0:
                if not validation:
                    print(niter, f" Loss: {loss.item():.3f}", flush=True)
                    with torch.no_grad():
                        if not args.onlyrot:
                            pred = torch.argmax(final[0][where], dim=1)
                            pseudoloss = XE(final[0][where], pred)
                    if not args.onlyrot:
                        self.writer.add_scalar('Pseudoloss', pseudoloss.item(), niter)
                    self.writer.add_scalar('lr', self.lr_schedule(epoch), niter)
                    self.writer.add_scalar('Loss', loss.item(), niter)
                    self.writer.add_scalar('RotLoss', rotloss.item(), niter)
                    self.writer.add_scalar('RotAcc', rotacc.item(), niter)

                    if iter > 0:
                        self.writer.add_scalar('Freq(Hz)', mass / (time.time() - now), niter)

        # end of epoch logging
        if self.writer and (epoch % self.log_interval == 0):
            write_conv(self.writer, model, epoch)
            if validation:
                print('val Rot-Acc: ', rotacc_value.avg)
                self.writer.add_scalar('val Rot-Acc', rotacc_value.avg, epoch)

        files.save_checkpoint_all(self.checkpoint_dir, model, args.arch,
                                  optimizer, self.L, epoch, lowest=False)
        return {'loss': loss_value.avg}
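Example #9 assumes the loader delivers each image together with its four 90-degree rotations (hence the view to `mass * 4` and the repeating 0-3 `rotlabel`). A sketch of how a dataset wrapper might build that block with `torch.rot90` (an assumption about the pipeline, not the repo's loader):

import torch

def expand_rotations(img):
    # Stack the four 90-degree rotations of a (C, H, W) image into a
    # (4, C, H, W) tensor; batching these yields the (mass, 4, C, H, W)
    # layout that Example #9 reshapes to (mass * 4, C, H, W).
    return torch.stack([torch.rot90(img, k, dims=(1, 2)) for k in range(4)])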