Example #1
def multi_infer(params):

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = torch.load(params['init_model'], map_location=device)
    print('Load model', params['init_model'])
    model = model.to(device)
    model.eval()

    infer_loader = get_iwildcam_loader(params, mode='infer')
    infer_loader = data_prefetcher(infer_loader)
    y_preds, y_scores, y_ids = [], [], []
    logits_preds = []
    t1 = time()
    print('Begin to infer')
    with torch.no_grad():
        inputs, labels, ids = infer_loader.next()
        i = 0
        while inputs is not None:  # loop over batches of images
            _, concat_logits, _, _, _ = model(inputs)  # vector
            output = torch.nn.functional.softmax(concat_logits, dim=-1)
            output = output.cpu().detach().numpy()
            logits_preds.extend(output)  # "logits" here are post-softmax probabilities
            y_preds.extend(np.argmax(output, axis=1))  # list[class_id]
            y_scores.extend(np.max(output, axis=1))
            y_ids.extend(ids)  # image_name: list[xxx_00000x.jpg]

            if (i + 1) % params['print_step'] == 0:
                print("iter: %d,  time_cost_per_iter: %.4f s" %
                      (i, (time() - t1) / params['print_step']))
                t1 = time()
            i += 1
            inputs, labels, ids = infer_loader.next()

    O_ids = list(map(lambda x: x.split('_')[0], y_ids))
    # 'Id': [xxx_000.jpg, yyy_000.jpg],'O_Id': [xxx, yyy], 'Class': [class_id], 'Score': [0.1]
    pred_df = {'Id': y_ids, 'O_Id': O_ids, 'Class': y_preds, 'Score': y_scores}
    pred_df = pd.DataFrame(pred_df)
    save_path = params['save_pred_dir'] + params['init_model'].split(
        '/')[-1].split('.')[0] + '_preds.csv'
    pred_df.to_csv(save_path, index=False)
    print("Save {} to {}".format(
        params['init_model'].split('/')[-1].split('.')[0] + '_preds.csv',
        save_path))

    logits_df = {
        'Id': y_ids,
        'Class': y_preds,
        'Logits': list(logits_preds)
    }  # logits vector, used for model embeddings
    logits_df = pd.DataFrame(logits_df)
    save_path = params['save_pred_dir'] + params['init_model'].split(
        '/')[-1].split('.')[0] + '_logits.csv'
    logits_df.to_csv(save_path, index=False)
    print("Save {} to {}".format(
        params['init_model'].split('/')[-1].split('.')[0] + '_logits.csv',
        save_path))
    print('pred done', pred_df.shape)
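
Every example on this page leans on a data_prefetcher helper that is not shown here: its .next() returns an (inputs, labels, ids) tuple and returns None fields once the loader is exhausted, which is what the "while inputs is not None" loops test. Below is a minimal sketch of such a wrapper, assuming a CUDA device and an underlying DataLoader that yields (inputs, labels, ids) batches; it is an illustration, not the repository's actual implementation.

import torch

class data_prefetcher:
    # Minimal sketch, not the repository's implementation: wraps a DataLoader,
    # copies the next batch to the GPU on a side CUDA stream, and returns
    # (None, None, None) once the loader is exhausted.
    def __init__(self, loader, label_type='float'):
        self.loader = iter(loader)
        self.label_type = label_type
        self.stream = torch.cuda.Stream()
        self._preload()

    def _preload(self):
        try:
            self.next_inputs, self.next_labels, self.next_ids = next(self.loader)
        except StopIteration:
            self.next_inputs = self.next_labels = self.next_ids = None
            return
        with torch.cuda.stream(self.stream):
            self.next_inputs = self.next_inputs.cuda(non_blocking=True)
            labels = self.next_labels.cuda(non_blocking=True)
            self.next_labels = labels.long() if self.label_type == 'long' else labels.float()

    def next(self):
        # wait for the async copy, hand out the batch, start prefetching the next one
        torch.cuda.current_stream().wait_stream(self.stream)
        inputs, labels, ids = self.next_inputs, self.next_labels, self.next_ids
        if inputs is not None:
            self._preload()
        return inputs, labels, ids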
Example #2
def infer(params):

    model = torch.load(params['init_model'])
    print('load model', params['init_model'])
    model = model.to(device)
    model.eval()

    infer_loader = get_iwildcam_loader(params, mode='infer')
    infer_loader = data_prefetcher(infer_loader)
    y_preds, y_ids = [], []
    logits_preds = []
    t1 = time()
    print('begin to infer')
    with torch.no_grad():
        inputs, labels, ids = infer_loader.next()
        i = 0
        while inputs is not None:
            output = model(inputs)
            output = torch.nn.functional.softmax(output, dim=-1)
            output = output.cpu().detach().numpy()
            logits_preds.extend(output)
            y_preds.extend(np.argmax(output, axis=1))
            y_ids.extend(ids)

            if (i + 1) % params['print_step'] == 0:
                print("iter: %d,  time_cost_per_iter: %.4f s" %
                      (i, (time() - t1) / params['print_step']))
                t1 = time()
            i += 1
            inputs, labels, ids = infer_loader.next()

    pred_df = {'Id': y_ids, 'Predicted': y_preds}
    pred_df = pd.DataFrame(pred_df)
    save_path = params['save_pred_dir'] + params['init_model'].split(
        '/')[-1].split('.')[0] + '_preds.csv'
    pred_df.to_csv(save_path, index=False)

    logits_df = {
        'Id': y_ids,
        'Predicted': y_preds,
        'Logits': list(logits_preds)
    }
    logits_df = pd.DataFrame(logits_df)
    save_path = params['save_pred_dir'] + params['init_model'].split(
        '/')[-1].split('.')[0] + '_logits.csv'
    logits_df.to_csv(save_path, index=False)

    print('pred done', pred_df.shape)

    return pred_df
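
A hypothetical invocation of infer(): the dictionary keys mirror the ones the function reads above, while the paths and values are placeholders rather than the repository's actual layout.

params = {
    'init_model': './checkpoints/model_best.pkl',  # placeholder path to a torch.save'd model
    'save_pred_dir': './preds/',                   # output directory for the CSVs
    'print_step': 40,                              # log timing every N iterations
    # ...plus whatever get_iwildcam_loader() expects (data dir, batch size, ...)
}
pred_df = infer(params)
print(pred_df.head())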
Example #3
def multi_inferv2(params):

	model = torch.load(params['init_model'])
	print('=> Load model', params['init_model'])
	model = model.cuda()
	model.eval()

	infer_loader = get_iwildcam_loader(params, mode=params['mode'])
	infer_loader = data_prefetcher(infer_loader)
	y_preds, y_scores, y_ids = [], [], []
	logits_preds = []
	t1 = time()
	print('=> Begin to infer')
	with torch.no_grad():
		inputs, labels, ids = infer_loader.next()
		i = 0
		while inputs is not None: # loop over batches of images
			output = model(inputs) # vector
			output = torch.nn.functional.softmax(output, dim=-1)
			output = output.cpu().detach().numpy()
			logits_preds.extend(output)
			y_preds.extend(np.argmax(output, axis=1)) # list[class_id]
			y_scores.extend(np.max(output, axis=1))
			y_ids.extend(ids) # image_name: list[xxx_00000x.jpg]

			if (i+1) % params['print_step'] == 0:
				print("iter: %d,  time_cost_per_iter: %.4f s" % (i, (time() - t1)/params['print_step']))
				t1 = time()
			i += 1
			inputs, labels, ids = infer_loader.next()

	O_ids = list(map(lambda x: x.split('_')[0], y_ids))
	# 'Id': [xxx.jpg, yyy.jpg], 'Class': [class_id], 'Score': [0.1]
	print("=> Pred Data Len: {}".format(len(y_ids)))
	pred_df = {'Id': y_ids, 'Class': y_preds, 'Score': y_scores}
	pred_df = pd.DataFrame(pred_df)
	save_path = os.path.join(params['save_pred_dir'], params['init_model'].split('/')[-1].split('.')[0]+'_preds.csv')
	pred_df.to_csv(save_path, index=False)
	print("=> Save {} to {}".format(params['init_model'].split('/')[-1].split('.')[0]+'_preds.csv', save_path))

	logits_df = {'Id': y_ids, 'Class': y_preds, 'Logits': list(logits_preds)} # logits vector, used for model embeddings
	logits_df = pd.DataFrame(logits_df)
	save_path = os.path.join(params['save_pred_dir'], params['init_model'].split('/')[-1].split('.')[0]+'_logits.csv')
	logits_df.to_csv(save_path, index=False)
	print("=> Save {} to {}".format(params['init_model'].split('/')[-1].split('.')[0]+'_logits.csv', save_path))
Example #4
def train(params):

    if params['init_model'] is not None:
        model = torch.load(params['init_model'])
        print('load model', params['init_model'])
    else:
        model = create_model(params['Net'],
                             pretrained=params['pretrained'],
                             num_classes=params['num_classes'],
                             drop_rate=params['drop_rate'],
                             global_pool='avg',
                             bn_tf=False,
                             bn_momentum=0.99,
                             bn_eps=1e-3,
                             checkpoint_path=params['init_model'],
                             in_chans=3)

    optimizer = get_optimizer(params, model)
    param_num = sum([p.data.nelement() for p in model.parameters()])
    print("Number of model parameters: {} M".format(param_num / 1024 / 1024))
    model = model.to(device)
    model.train()

    if params['lr_schedule']:
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=params['lr_decay_epochs'], gamma=0.2)
    if params['loss'] == 'ce' or params['loss'] == 'cross_entropy':
        criterion = cross_entropy().to(device)
        label_type = 'float'
    elif params['loss'] == 'focal':
        criterion = focal_loss(gamma=1.0, alpha=1.0).to(device)
        label_type = 'long'
    else:
        raise ValueError('unsupported loss: {}'.format(params['loss']))
    train_data_loader, dev_data_loader = get_iwildcam_loader(
        params, mode=params['mode'])

    train_log = []
    dev_log = []
    best_acc, best_f1, best_epoch = 0, 0, 0
    t1 = time()
    print('begin to train')
    use_onehot = params['loss'] != 'focal'
    for epoch in range(params['epochs']):
        train_loader = data_prefetcher(train_data_loader, label_type)
        inputs, labels, ids = train_loader.next()
        i = 0
        while inputs is not None:
            mixup_now = np.random.random() < params['aug_proba']
            if params['mixup'] and mixup_now:
                inputs, labels_a, labels_b, lam = mixup_data(
                    inputs, labels, params['mixup_alpha'])

            optimizer.zero_grad()
            output = model(inputs)
            if params['mixup'] and mixup_now:
                loss = mixup_criterion(criterion, output, labels_a, labels_b,
                                       lam)
            else:
                loss = criterion(output, labels)
            loss.backward()
            optimizer.step()

            if i % params['print_step'] == 0:
                preds = np.argmax(output.cpu().detach().numpy(), axis=1)
                if use_onehot:
                    targets = np.argmax(labels.cpu().detach().numpy(), axis=1)
                else:
                    targets = labels.cpu().detach().numpy()
                acc = metrics.accuracy_score(targets, preds)
                loss_val = loss.cpu().detach().numpy()
                f1 = metrics.f1_score(targets, preds, average='macro')
                train_log.append([epoch, i, loss_val, acc, f1])
                print(
                    "epoch: %d, iter: %d, train_loss: %.4f, train_acc: %.4f, train_f1: %.4f, time_cost_per_iter: %.4f s"
                    % (epoch, i, loss_val, acc, f1,
                       (time() - t1) / params['print_step']))
                with open(params['log_dir'] + 'train.tsv', 'a') as f:
                    f.write('%05d\t%05d\t%f\t%f\t%f\n' %
                            (epoch, i, loss_val, acc, f1))
                t1 = time()

            if (i + 1) % params['save_step'] == 0:
                save_model_path = os.path.join(params['save_dir'],
                                               'model_%d_%d.pkl' % (epoch, i))
                torch.save(model, save_model_path)
                print('save model to', save_model_path)

            if (i + 1) % params['eval_step'] == 0:
                t2 = time()
                model.eval()
                data_loader = data_prefetcher(dev_data_loader, label_type)
                loss_val, acc, f1 = evaluate(model, data_loader, criterion,
                                             use_onehot)
                model.train()
                dev_log.append([epoch, i, loss_val, acc, f1])

                if f1 > best_f1:
                    best_acc, best_f1, best_epoch = acc, f1, epoch
                print('[Evaluation] -------------------------------')
                print(
                    "epoch: %d, test acc: %.4f, f1-score: %.4f, loss: %.4f, best-f1-score: %.4f, eval_time: %.4f s"
                    % (epoch, acc, f1, loss_val, best_f1, time() - t2))
                print('[Evaluation] -------------------------------')

                with open(params['log_dir'] + 'eval.tsv', 'a') as f:
                    f.write('%05d\t%05d\t%f\t%f\t%f\n' %
                            (epoch, i, loss_val, acc, f1))

            inputs, labels, ids = train_loader.next()
            i += 1

        if params['lr_schedule']:
            scheduler.step()

    return model
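
train() above calls mixup_data() and mixup_criterion(), which are not shown on this page. The sketch below follows the standard mixup recipe (Zhang et al., 2018) these helpers presumably implement, assuming tensor-valued (e.g. one-hot) labels; it is an assumption, not the repository's code.

import numpy as np
import torch

def mixup_data(inputs, labels, alpha=1.0):
    # Blend each batch with a shuffled copy of itself using a Beta(alpha, alpha) weight.
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0
    index = torch.randperm(inputs.size(0), device=inputs.device)
    mixed_inputs = lam * inputs + (1 - lam) * inputs[index]
    return mixed_inputs, labels, labels[index], lam

def mixup_criterion(criterion, output, labels_a, labels_b, lam):
    # The loss is the same convex combination, applied to the two target sets.
    return lam * criterion(output, labels_a) + (1 - lam) * criterion(output, labels_b)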
Example #5
def main(cfg, kcross=-1, K=-1):
    tensorboard_dir = os.path.join(cfg.SAVE_DIR, "tb_event")
    if not os.path.exists(cfg.SAVE_DIR):
        os.makedirs(cfg.SAVE_DIR)
    else:
        print(
            "This directory already exists; remember to double-check your configs"
        )
        if not click.confirm(
                "\033[1;31;40mContinue and override the former directory?\033[0m",
                default=False,
        ):
            exit(0)
        if tensorboard_dir is not None and os.path.exists(tensorboard_dir):
            shutil.rmtree(tensorboard_dir)
    print("=> output model will be saved in {}".format(cfg.SAVE_DIR))
    tb_writer = SummaryWriter(tensorboard_dir)

    model = create_model(
        cfg.NET.TYPE,
        pretrained=cfg.NET.PRETRAINED,
        num_classes=cfg.NUM_CLASSES,
        drop_rate=cfg.NET.DROP_RATE,
        global_pool='avg',
        bn_tf=False,
        bn_momentum=0.99,
        bn_eps=1e-3,
        checkpoint_path=cfg.INIT_MODEL if cfg.INIT_MODEL != "" else None,
        in_chans=3)
    print(model)

    optimizer = get_optimizer(cfg, model)
    # use torchvision.models
    # model = models.__dict__[params['Net']](num_classes=params['num_classes'])

    param_num = sum([p.data.nelement() for p in model.parameters()])
    print("=> Number of model parameters: {} M".format(param_num / 1024 /
                                                       1024))

    model = model.cuda()
    # summary(model, (3, cfg.INPUT_SIZE[0], cfg.INPUT_SIZE[1]))
    model = DataParallel(model)

    train_data_loader, dev_data_loader = get_iwildcam_loader(
        cfg, mode=cfg.MODE)  # dataloaders for train/eval

    if cfg.TRAIN.LR_SCHEDULE == 'Step':  # True
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=cfg.TRAIN.LR_DECAY_EPOCHS, gamma=0.2)
    elif cfg.TRAIN.LR_SCHEDULE == 'Cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=(cfg.TRAIN.EPOCHS // 9) + 1, eta_min=1e-06)
    else:
        raise NotImplementedError("Only 'Step' and 'Cosine' lr schedules are supported")

    best_acc, best_f1, best_epoch, start_epoch = 0, 0, 0, 1
    # ------ Begin Resume -------
    if cfg.RESUME:
        load_ckpt(cfg.SAVE_DIR)  # read history parameters from json
        ckpt = torch.load(
            cfg.INIT_MODEL,
            map_location="cuda")  # already specify in load_params()
        print('=> Load checkpoint from ', cfg.INIT_MODEL)
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        scheduler.load_state_dict(ckpt['scheduler'])
        start_epoch = ckpt['epoch'] + 1
        # best_acc = ckpt['best_acc']
        best_f1 = ckpt['best_f1']
        best_epoch = ckpt['best_epoch']

    if cfg.LOSS.LOSS_TYPE == 'CE':
        criterion = cross_entropy(func_type='softmax').to(device)
        if cfg.LOSS.WEIGHT_PER_CLS:
            CE = torch.nn.CrossEntropyLoss(weight=torch.from_numpy(
                cfg.LOSS.WEIGHT_PER_CLS).float().to(device))
        label_type = 'float'
    elif cfg.LOSS.LOSS_TYPE == 'Sigmoid_CE':
        criterion = cross_entropy(func_type='sigmoid').to(device)
        label_type = 'float'
    elif cfg.LOSS.LOSS_TYPE == 'Focal':
        criterion = focal_loss(gamma=1.0, alpha=1.0).to(device)
        label_type = 'long'
    elif cfg.LOSS.LOSS_TYPE == 'CB_loss':  # FIXME: unverified implementation; scored low
        criterion = cb_loss(cfg.LOSS.SAMPLES_PER_CLS, cfg.NUM_CLASSES,
                            'softmax').to(device)
        label_type = 'float'
    else:
        raise NotImplementedError("Not accessible loss type for: {}".format(
            cfg.LOSS.LOSS_TYPE))

    t0 = time.time()
    t1 = time.time()
    print('[INFO] Begin to train')
    use_onehot = cfg.LOSS.LOSS_TYPE != 'Focal'
    for epoch in range(start_epoch, cfg.TRAIN.EPOCHS + 1):
        print('=> Current Lr {:.5e}'.format(optimizer.param_groups[0]['lr']))
        if cfg.TRAIN.LR_SCHEDULE:
            scheduler.step()

        if cfg.LOSS.CLASS_WEIGHT:
            train_loss, train_acc, train_f1 = \
             train(train_data_loader, model, CE, optimizer, epoch, cfg, label_type, use_onehot)
        else:
            train_loss, train_acc, train_f1 = \
             train(train_data_loader, model, criterion, optimizer, epoch, cfg, label_type, use_onehot)

        val_acc, val_f1 = validate(dev_data_loader, model, criterion, cfg,
                                   label_type, use_onehot)
        # TODO: this should also be done with the ProgressMeter
        print('=> [Epoch-{}] * Acc {:.3f} F1 {:.3f}'.format(
            epoch, val_acc, val_f1))

        is_best = val_f1 > best_f1
        best_f1 = max(val_f1, best_f1)
        best_epoch = epoch if is_best else best_epoch

        tb_writer.add_scalar('train_loss', train_loss, epoch)
        tb_writer.add_scalar('val_metrics/val_acc', val_acc, epoch)
        tb_writer.add_scalar('val_metrics/val_f1-score', val_f1, epoch)

        save_model_path = os.path.join(cfg.SAVE_DIR,
                                       'model_{:03d}.pkl'.format(epoch))
        torch.save(
            {
                'state_dict': model.state_dict(),
                'scheduler': scheduler.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
                # 'best_acc': best_acc,
                'best_f1': best_f1,
                'best_epoch': best_epoch,
            },
            save_model_path)
        print('=> save model to', save_model_path)

    print("=> Train is over, Time cost: {:.1f} hours...".format(
        (time.time() - t0) / 3600))

    source = 'model_{:03d}.pkl'.format(best_epoch)
    source_path = os.path.join(cfg.SAVE_DIR, source)
    target = 'model_best.pkl'
    target_path = os.path.join(cfg.SAVE_DIR, target)
    try:
        shutil.copy(source_path, target_path)
        print("Save best model to {}: [Epoch: {:d} / f1-score: {:.4f}]".format(
            target_path, best_epoch, best_f1))
    except IOError as e:
        print("Unable to copy file. %s" % e)
    except:
        print("Unexpected error:", sys.exc_info())

    # ---- Delete Useless ckpt
    ckpts = sorted(name for name in os.listdir(cfg.SAVE_DIR)
                   if name.startswith('model'))
    ckpts = ckpts[:-1]
    print("=> Start to clean checkpoint from {} to {}".format(
        ckpts[0], ckpts[-1]))
    for name in ckpts:
        os.remove(os.path.join(cfg.SAVE_DIR, name))

    if cfg.CROSS_VALIDATION:
        ksave_path = os.path.join(cfg.SAVE_DIR, 'kcross_model')
        if not os.path.exists(ksave_path):
            os.makedirs(ksave_path)
        kmodel_path = os.path.join(ksave_path, 'kcross_{}.pkl'.format(kcross))
        shutil.copy(target_path, kmodel_path)
        print("=> Save K-best model to {}...".format(kmodel_path))
Example #6
def multi_infer(cfg):

    model = create_model(
        cfg.NET.TYPE,
        pretrained=cfg.NET.PRETRAINED,
        num_classes=cfg.NUM_CLASSES,
        drop_rate=cfg.NET.DROP_RATE,
        global_pool='avg',
        bn_tf=False,
        bn_momentum=0.99,
        bn_eps=1e-3,
        checkpoint_path=cfg.INIT_MODEL if cfg.INIT_MODEL != "" else None,
        in_chans=3)
    print(model)

    # model = torch.load(cfg.INIT_MODEL, map_location="cuda" if torch.cuda.is_available() else "cpu")
    checkpoint = torch.load(
        cfg.INIT_MODEL,
        map_location="cuda" if torch.cuda.is_available() else "cpu")
    print('Load model', cfg.INIT_MODEL)
    state_dict = checkpoint['state_dict']
    for k in list(state_dict.keys()):
        if k.startswith('module'):
            state_dict[k[len("module."):]] = state_dict[k]
            del state_dict[k]
    msg = model.load_state_dict(state_dict, strict=False)

    model = model.to("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()

    infer_loader = get_iwildcam_loader(cfg, mode='infer')
    infer_loader = data_prefetcher(infer_loader)
    y_preds, y_scores, y_ids = [], [], []
    logits_preds = []
    t1 = time()
    print('Begin to infer')
    with torch.no_grad():
        inputs, labels, ids = infer_loader.next()
        i = 0
        while inputs is not None:  # loop over batches of images
            output = model(inputs)  # vector
            output = torch.nn.functional.softmax(output, dim=-1)
            output = output.cpu().detach().numpy()  # prob
            logits_preds.extend(output)
            y_preds.extend(np.argmax(output, axis=1))  # list[class_id]
            y_scores.extend(np.max(output, axis=1))
            y_ids.extend(ids)  # image_name: list[xxx_00000x.jpg]

            if (i + 1) % 40 == 0:
                print("iter: %d,  time_cost_per_iter: %.4f s" %
                      (i, (time() - t1) / 40))
                t1 = time()
            i += 1
            inputs, labels, ids = infer_loader.next()

    O_ids = list(map(lambda x: x.split('_')[0], y_ids))
    # 'Id': [xxx_000.jpg, yyy_000.jpg],'O_Id': [xxx, yyy], 'Class': [class_id], 'Score': [0.1]
    pred_df = {'Id': y_ids, 'O_Id': O_ids, 'Class': y_preds, 'Score': y_scores}
    pred_df = pd.DataFrame(pred_df)
    save_path = os.path.join(
        cfg.SAVE_PRED_DIR,
        cfg.INIT_MODEL.split('/')[-1].split('.')[0] + '_preds.csv')
    pred_df.to_csv(save_path, index=False)
    print("=> Save {} to {}".format(
        cfg.INIT_MODEL.split('/')[-1].split('.')[0] + '_preds.csv', save_path))

    logits_df = {
        'Id': y_ids,
        'Class': y_preds,
        'Logits': list(logits_preds)
    }  # logits vector, used for model embeddings
    logits_df = pd.DataFrame(logits_df)
    save_path = os.path.join(
        cfg.SAVE_PRED_DIR,
        cfg.INIT_MODEL.split('/')[-1].split('.')[0] + '_logits.csv')
    logits_df.to_csv(save_path, index=False)
    print("=> Save {} to {}".format(
        cfg.INIT_MODEL.split('/')[-1].split('.')[0] + '_logits.csv',
        save_path))
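
The O_Id column exists because each original image contributes several crops (Id values like xxx_000001.jpg share the prefix xxx). A plausible post-processing step, not shown in the original, that collapses the per-crop CSV to one prediction per source image:

import pandas as pd

pred_df = pd.read_csv('model_best_preds.csv')  # placeholder path for the CSV written above
# average each class's score over an image's crops, then keep the top-scoring class
agg = (pred_df.groupby(['O_Id', 'Class'])['Score'].mean()
              .reset_index()
              .sort_values('Score', ascending=False)
              .drop_duplicates('O_Id'))
print(agg.head())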
Example #7
def multi_inferv2(params):
    # read parameters saved by the training script (parameters.json)
    with open(os.path.join(params['save_pred_dir'],
                           'parameters.json')) as file:
        train_params = json.load(file)
    params['backbone'] = train_params['backbone']
    params['CAT_NUM'] = train_params['CAT_NUM']
    params['PROPOSAL_NUM'] = train_params['PROPOSAL_NUM']

    ckpt = torch.load(params['init_model'])
    print('=> Load model', params['init_model'])
    model = NTS.attention_net(params,
                              CAT_NUM=params['CAT_NUM'],
                              topN=params['PROPOSAL_NUM'])
    model.load_state_dict(ckpt['state_dict'])
    model = model.cuda()
    model.eval()

    infer_loader = get_iwildcam_loader(params, mode=params['mode'])
    infer_loader = data_prefetcher(infer_loader)
    y_preds, y_scores, y_ids = [], [], []
    logits_preds = []
    t1 = time()
    print('=> Begin to infer')
    with torch.no_grad():
        inputs, labels, ids = infer_loader.next()
        i = 0
        while inputs is not None:  # loop over batches of images
            _, concat_logits, _, _, _ = model(inputs)
            output = torch.nn.functional.softmax(concat_logits, dim=-1)
            output = output.cpu().detach().numpy()
            logits_preds.extend(output)  # "logits" here are post-softmax probabilities
            y_preds.extend(np.argmax(output, axis=1))  # list[class_id]
            y_scores.extend(np.max(output, axis=1))
            y_ids.extend(ids)  # image_name: list[xxx_00000x.jpg]

            if (i + 1) % params['print_step'] == 0:
                print("iter: %d,  time_cost_per_iter: %.4f s" %
                      (i, (time() - t1) / params['print_step']))
                t1 = time()
            i += 1
            inputs, labels, ids = infer_loader.next()

    O_ids = list(map(lambda x: x.split('_')[0], y_ids))
    # 'Id': [xxx.jpg, yyy.jpg], 'Class': [class_id], 'Score': [0.1]
    print("=> Pred Data Len: {}".format(len(y_ids)))
    pred_df = {'Id': y_ids, 'Class': y_preds, 'Score': y_scores}
    pred_df = pd.DataFrame(pred_df)
    save_path = params['save_pred_dir'] + params['init_model'].split(
        '/')[-1].split('.')[0] + '_preds.csv'
    pred_df.to_csv(save_path, index=False)
    print("=> Save {} to {}".format(
        params['init_model'].split('/')[-1].split('.')[0] + '_preds.csv',
        save_path))

    logits_df = {
        'Id': y_ids,
        'Class': y_preds,
        'Logits': list(logits_preds)
    }  # logits vector, used for model embeddings
    logits_df = pd.DataFrame(logits_df)
    save_path = params['save_pred_dir'] + params['init_model'].split(
        '/')[-1].split('.')[0] + '_logits.csv'
    logits_df.to_csv(save_path, index=False)
    print("=> Save {} to {}".format(
        params['init_model'].split('/')[-1].split('.')[0] + '_logits.csv',
        save_path))
    print('pred done', pred_df.shape)
Example #8
def main(cfg):
	tensorboard_dir = os.path.join(cfg.SAVE_DIR, "tb_event")
	if not os.path.exists(cfg.SAVE_DIR):
		os.makedirs(cfg.SAVE_DIR)
	else:
		print("This directory has already existed, Please remember to modify your configs")
		if not click.confirm(
			"\033[1;31;40mContinue and override the former directory?\033[0m",
			default=False,
			):
			exit(0)
		if tensorboard_dir is not None and os.path.exists(tensorboard_dir):
			shutil.rmtree(tensorboard_dir)
	print("=> output model will be saved in {}".format(cfg.SAVE_DIR))
	tb_writer = SummaryWriter(tensorboard_dir)

	model = NTS.attention_net(cfg, CAT_NUM=cfg.NET.CAT_NUM, topN=cfg.NET.PROPOSAL_NUM)
	print(model)

	# special for NTS
	raw_parameters = list(model.pretrained_model.parameters())
	part_parameters = list(model.proposal_net.parameters())
	concat_parameters = list(model.concat_net.parameters())
	partcls_parameters = list(model.partcls_net.parameters())
	
	raw_optimizer = torch.optim.SGD(raw_parameters, lr=cfg.TRAIN.LR, momentum=0.9, weight_decay=cfg.TRAIN.WEIGHT_DECAY)
	concat_optimizer = torch.optim.SGD(concat_parameters, lr=cfg.TRAIN.LR, momentum=0.9, weight_decay=cfg.TRAIN.WEIGHT_DECAY)
	part_optimizer = torch.optim.SGD(part_parameters, lr=cfg.TRAIN.LR, momentum=0.9, weight_decay=cfg.TRAIN.WEIGHT_DECAY)
	partcls_optimizer = torch.optim.SGD(partcls_parameters, lr=cfg.TRAIN.LR, momentum=0.9, weight_decay=cfg.TRAIN.WEIGHT_DECAY)

	param_num = sum([p.data.nelement() for p in model.parameters()])
	print("Number of model parameters: {} M".format(param_num / 1024 / 1024))

	model = model.cuda()
	model = DataParallel(model)
	model.train()

	train_data_loader, dev_data_loader = get_iwildcam_loader(cfg, mode=cfg.MODE) # dataloaders for train/eval; original referenced an undefined 'params'

	if params['lr_schedule'] == "Step":# True
		# scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=params['lr_decay_epochs'], gamma=0.2)
		schedulers = [MultiStepLR(raw_optimizer, milestones=cfg.TRAIN.LR_DECAY_EPOCHS, gamma=0.1),
                    MultiStepLR(concat_optimizer, milestones=cfg.TRAIN.LR_DECAY_EPOCHS, gamma=0.1),
                    MultiStepLR(part_optimizer, milestones=cfg.TRAIN.LR_DECAY_EPOCHS, gamma=0.1),
                    MultiStepLR(partcls_optimizer, milestones=cfg.TRAIN.LR_DECAY_EPOCHS, gamma=0.1)]
	elif params['lr_schedule'] == "Cosine":
		schedulers = [CosineAnnealingLR(raw_optimizer, T_max=(cfg.TRAIN.EPOCHS // 9) + 1, eta_min=1e-06),
                    CosineAnnealingLR(concat_optimizer, T_max=(cfg.TRAIN.EPOCHS // 9) + 1, eta_min=1e-06),
                    CosineAnnealingLR(part_optimizer, T_max=(cfg.TRAIN.EPOCHS // 9) + 1, eta_min=1e-06),
                    CosineAnnealingLR(partcls_optimizer, T_max=(cfg.TRAIN.EPOCHS // 9) + 1, eta_min=1e-06)
		]
	
	best_acc, best_f1, best_iter, best_epoch, start_epoch = 0, 0, 0, 0, 1
	# ------ Begin Resume -------
	if cfg.RESUME:
		load_ckpt(cfg.SAVE_DIR) # read history parameters from json
		ckpt = torch.load(cfg.INIT_MODEL, map_location="cuda") # already specified in load_params()
		print('=> Load checkpoint from ', cfg.INIT_MODEL)
		model.load_state_dict(ckpt['state_dict'])
		raw_optimizer.load_state_dict(ckpt['raw_optimizer'])
		part_optimizer.load_state_dict(ckpt['part_optimizer'])
		concat_optimizer.load_state_dict(ckpt['concat_optimizer'])
		partcls_optimizer.load_state_dict(ckpt['partcls_optimizer'])
		# optimizer.load_state_dict(ckpt['optimizer'])
		for sch, state in zip(schedulers, ckpt['schedulers']):  # original loaded an undefined 'scheduler' from a misspelled key
			sch.load_state_dict(state)
		start_epoch = ckpt['epoch'] + 1
		# best_acc = ckpt['best_acc']
		best_f1 = ckpt['best_f1']
		best_epoch = ckpt['best_epoch']

	if cfg.LOSS.LOSS_TYPE == 'CE':
		criterion = cross_entropy(func_type='softmax').to(device)
		if cfg.LOSS.WEIGHT_PER_CLS:
			CE = torch.nn.CrossEntropyLoss(weight=torch.from_numpy(cfg.LOSS.WEIGHT_PER_CLS).float().to(device))
		label_type = 'float'
	elif cfg.LOSS.LOSS_TYPE == 'Sigmoid_CE':
		criterion = cross_entropy(func_type='sigmoid').to(device)
		label_type = 'float'		
	elif cfg.LOSS.LOSS_TYPE == 'Focal':
		criterion = focal_loss(gamma=1.0, alpha=1.0).to(device)
		label_type = 'long'
	elif cfg.LOSS.LOSS_TYPE == 'CB_loss': # FIXME: unverified implementation; scored low
		criterion = cb_loss(cfg.LOSS.SAMPLES_PER_CLS, cfg.NUM_CLASSES, 'softmax').to(device)
		label_type = 'float'
	else:
		raise NotImplementedError("Not accessible loss type for: {}".format(cfg.LOSS.LOSS_TYPE))

	t0 = time()
	t1 = time()
	it = 0
	dev_log = []  # evaluation history; was undefined in the original
	print('[INFO] Begin to train')
	use_onehot = cfg.LOSS.LOSS_TYPE != 'Focal'
	for epoch in range(start_epoch, cfg.TRAIN.EPOCHS + 1):
		print('=> Current Lr {:.5e}'.format(raw_optimizer.param_groups[0]['lr']))  # original referenced an undefined 'optimizer'
		if cfg.TRAIN.LR_SCHEDULE:
			for scheduler in schedulers:
				scheduler.step()

		train_loader = data_prefetcher(train_data_loader, label_type)
		inputs, labels, ids = train_loader.next() # ids are unused
		i = 0
		batch_time = AverageMeter('Time', ':6.3f')
		data_time = AverageMeter('Data', ':6.3f')
		losses = AverageMeter('Loss', ':.4e')
		train_acc = AverageMeter('Acc', ':6.2f')
		train_f1 = AverageMeter('F1', ':6.2f')
		progress = ProgressMeter(
			len(train_data_loader),
			[batch_time, data_time, losses, train_acc, train_f1],
			prefix="Epoch: [{}]".format(epoch))

		while inputs is not None:
			bs = inputs.size(0)
			# mixup_now = np.random.random() < cfg.AUG.AUG_PROBA # mixup with probability 0.5
			# if cfg.AUG.MIXUP and mixup_now:
			# 	inputs, labels_a, labels_b, lam = mixup_data(inputs, labels, cfg.AUG.MIXUP_ALPHA)

			# optimizer.zero_grad()
			raw_optimizer.zero_grad()
			part_optimizer.zero_grad()
			concat_optimizer.zero_grad()
			partcls_optimizer.zero_grad()

			raw_logits, concat_logits, part_logits, _, top_n_prob = model(inputs)
			# TODO: implement NTS with mixup; the original mixup branch referenced an
			# undefined 'mixup_now' and never set 'total_loss', so it is removed here.
			part_loss = NTS.list_loss(
				part_logits.view(bs * cfg.NET.PROPOSAL_NUM, -1),
				labels.max(axis=1)[1].unsqueeze(1).repeat(1, cfg.NET.PROPOSAL_NUM).view(-1)).view(bs, cfg.NET.PROPOSAL_NUM)
			raw_loss = criterion(raw_logits, labels)
			concat_loss = criterion(concat_logits, labels)
			rank_loss = NTS.ranking_loss(top_n_prob, part_loss, proposal_num=cfg.NET.PROPOSAL_NUM)

			CE = torch.nn.CrossEntropyLoss()
			partcls_loss = CE(
				part_logits.view(bs * cfg.NET.PROPOSAL_NUM, -1),
				labels.max(axis=1)[1].unsqueeze(1).repeat(1, cfg.NET.PROPOSAL_NUM).view(-1))
				# part_logits: (bs, PROPOSAL_NUM, num_classes) -> (bs * PROPOSAL_NUM, num_classes)
				# labels: (bs * PROPOSAL_NUM,)
			total_loss = raw_loss + rank_loss + concat_loss + partcls_loss

			total_loss.backward()

			raw_optimizer.step()
			part_optimizer.step()
			concat_optimizer.step()
			partcls_optimizer.step()

			if i % cfg.PRINT_STEP == 0:
				preds = np.argmax(concat_logits.cpu().detach().numpy(), axis=1) # argmax on logits
				if use_onehot:
					targets = np.argmax(labels.cpu().detach().numpy(), axis=1)
				else:
					targets = labels.cpu().detach().numpy()
				acc = metrics.accuracy_score(targets, preds)
				loss = concat_loss
				loss_val = loss.item()
				f1 = metrics.f1_score(targets, preds, average='macro')
				# train_log.append([epoch,i, loss_val, acc, f1])
				# print("epoch: %d, iter: %d, train_loss: %.4f, train_acc: %.4f, train_f1: %.4f, lr_rate: %.1e, time_cost_per_iter: %.4f s" % ( \
				# 	epoch, i, loss_val, acc, f1, (raw_optimizer.param_groups[0]['lr']), (time() - t1)/params['print_step']))
				tb_writer.add_scalar('train_loss', loss_val, it)
				# with open(params['log_dir'] + 'train.tsv', 'a') as f:
				# 	f.write('%05d\t%05d\t%f\t%f\t%f\n' % (epoch, i, loss_val, acc, f1))
				t1 = time()

			if (i+1) % cfg.EVAL_STEP == 0:  # assumes cfg.EVAL_STEP; original used an undefined params['eval_step']
				t2 = time()
				model.eval()
				data_loader = data_prefetcher(dev_data_loader, label_type)
				loss_val, acc, f1 = evaluate(model, data_loader, criterion, use_onehot)
				model.train()
				dev_log.append([epoch, i, acc, f1])

				if f1 > best_f1:
					best_acc, best_f1, best_iter, best_epoch = acc, f1, i, epoch
				print('[Evaluation] -------------------------------')
				print("epoch: %d, test acc: %.4f, f1-score: %.4f, best-f1-score: %.4f, eval_time: %.4f s" % (
					epoch, acc, f1, best_f1, time()-t2))
				print('[Evaluation] -------------------------------')
				tb_writer.add_scalar('val_metrics/val_acc', acc, it)
				tb_writer.add_scalar('val_metrics/val_f1-score', f1, it)
				tb_writer.add_scalar('val_metrics/val_loss', loss_val, it)
				with open(os.path.join(cfg.SAVE_DIR, 'eval.tsv'), 'a') as f:  # assumes logs live in SAVE_DIR; original used an undefined params['log_dir']
					f.write('%05d\t%05d\t%f\t%f\n' % (epoch, i, acc, f1))

				save_model_path = os.path.join(cfg.SAVE_DIR, 'model_%d_%d.pkl' % (epoch, i))
				# torch.save(model, save_model_path) # FIXME: this is bad for multi-gpu, use below instead
				torch.save({
					'state_dict': model.module.state_dict(),
					'schedulers': [s.state_dict() for s in schedulers],
					'raw_optimizer': raw_optimizer.state_dict(),
					'part_optimizer': part_optimizer.state_dict(),
					'concat_optimizer': concat_optimizer.state_dict(),
					'partcls_optimizer': partcls_optimizer.state_dict(),
					'epoch': epoch,
					'best_f1': best_f1,
					'best_epoch': best_epoch,
					}, save_model_path)
				print('[INFO]save model to', save_model_path)

			inputs, labels, ids = train_loader.next()
			i += 1
			it += 1

	print("[INFO]Train is over, Time cost: %.1f hours..." % ((time()-t0) / 3600))
	# copy best_f1 model to model_best.pkl
	source = 'model_%d_%d.pkl' % (best_epoch, best_iter)
	source_path = os.path.join(params['save_dir'], source)
	target = 'model_best.pkl'
	target_path = os.path.join(params['save_dir'], target)
	try:
		shutil.copy(source_path, target_path)
		print("Save best model to {}: [epoch-iter: {:d}-{:d}/ f1-score: {:.4f}]".format(target_path, best_epoch, best_iter, best_f1))
	except IOError as e:
		print("Unable to copy file. %s" % e)
	except:
		print("Unexpected error:", sys.exc_info())

	# ---- Delete useless checkpoints
	ckpts = sorted(name for name in os.listdir(cfg.SAVE_DIR) if name.startswith('model'))
	ckpts = ckpts[:-1]
	print("=> Start to clean checkpoint from {} to {}".format(ckpts[0], ckpts[-1]))
	for name in ckpts:
		os.remove(os.path.join(cfg.SAVE_DIR, name))
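
All of these scripts pair label_type='float' with a custom cross_entropy criterion, which implies one-hot (or mixup-blended) soft targets rather than class indices. Below is a minimal sketch of what such a criterion could look like under that reading; the repository's actual implementation is not shown on this page.

import torch
import torch.nn.functional as F

class cross_entropy(torch.nn.Module):
    # Sketch of a soft-target criterion: with func_type='softmax' the loss is
    # -sum(target * log_softmax(logits)) per sample, averaged over the batch,
    # which reduces to ordinary cross-entropy when targets are one-hot.
    def __init__(self, func_type='softmax'):
        super().__init__()
        self.func_type = func_type

    def forward(self, logits, targets):
        if self.func_type == 'softmax':
            return -(targets * F.log_softmax(logits, dim=-1)).sum(dim=-1).mean()
        # 'sigmoid' variant: independent binary cross-entropy per class
        return F.binary_cross_entropy_with_logits(logits, targets)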