Example #1
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1
    # raw_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/raw_pseudo_tst_df.csv')
    # half_opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/half_opt_pseudo_tst_df.csv')
    # opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/opt_pseudo_tst_df.csv')
    # raw_pseudo_df2 = pd.read_csv('./mnt/inputs/pseudos/top2_e121_e125_e126_e127_e128_e129/raw_pseudo_tst_df.csv')
    # half_opt_pseudo_df2 = pd.read_csv('./mnt/inputs/pseudos/top2_e121_e125_e126_e127_e128_e129/half_opt_pseudo_tst_df.csv')
    # opt_pseudo_df2 = pd.read_csv('./mnt/inputs/pseudos/top2_e121_e125_e126_e127_e128_e129/opt_pseudo_tst_df.csv')

    # clean texts
    # trn_df = clean_data(trn_df, ['question_title', 'question_body', 'answer'])

    # load additional tokens
    # with open('./mnt/inputs/nes_info/trn_over_10_vocab.pkl', 'rb') as fin:
    #     additional_tokens = pickle.load(fin)

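    # split with GroupKFold on the label-encoded question body so rows that
    # share a question never end up in both the training and validation folds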
    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(
                    histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' '))))
        ).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]  # + additional_tokens

        # fold_trn_df = pd.concat([fold_trn_df, raw_pseudo_df, opt_pseudo_df, half_opt_pseudo_df, raw_pseudo_df2, opt_pseudo_df2, half_opt_pseudo_df2], axis=0)

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        # fobj = BCEWithLogitsLoss()
        # fobj = FocalLossKaggle(gamma=2)
        fobj = MarginRankingLoss()
        state_dict = BertModel.from_pretrained(MODEL_PRETRAIN).state_dict()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=len(LABEL_COL),
            config_path=MODEL_CONFIG_PATH,
            state_dict=state_dict,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
            cat_last_layer_num=1,
            do_ratio=0.2,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
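            # wrap with DataParallel for this epoch's multi-GPU training; the
            # plain module is recovered via model.module before checkpointing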
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader,
                                       DEVICE)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader, DEVICE, mode='valid')

            scheduler.step()
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [
                    trn_loss,
                ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(val_loss)
            else:
                histories['val_loss'][fold] = [
                    val_loss,
                ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(val_metric)
            else:
                histories['val_metric'][fold] = [
                    val_metric,
                ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(val_metric_raws)
            else:
                histories['val_metric_raws'][fold] = [
                    val_metric_raws,
                ]

            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}', logger)
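            # move back to CPU and unwrap DataParallel so the checkpoint stores
            # the raw module (no 'module.' prefix in the state_dict keys)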
            model = model.to('cpu')
            model = model.module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                val_y_preds,
                val_y_trues,
                val_qa_ids,
                fold,
                epoch,
                val_loss,
                val_metric,
            )
        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(
                histories["val_metric"][fold])])
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer,
                                      clean=False)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
Example #2
def main(args):
    # Select the hardware device to use for inference.
    if torch.cuda.is_available():
        device = torch.device('cuda', torch.cuda.current_device())
        torch.backends.cudnn.benchmark = True
    else:
        device = torch.device('cpu')

    # Disable gradient calculations by default.
    torch.set_grad_enabled(False)

    # create checkpoint dir
    os.makedirs(args.checkpoint, exist_ok=True)

    if args.arch == 'hg1':
        model = hg1(pretrained=False)
    elif args.arch == 'hg2':
        model = hg2(pretrained=False)
    elif args.arch == 'hg8':
        model = hg8(pretrained=False)
    else:
        raise Exception('unrecognised model architecture: ' + args.arch)

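    # wrap the model so each batch is split across all visible GPUs, then move
    # it to the selected device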
    model = DataParallel(model).to(device)

    optimizer = RMSprop(model.parameters(),
                        lr=args.lr,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay)

    best_acc = 0

    # optionally resume from a checkpoint
    title = 'mpii ' + args.arch
    if args.resume:
        assert os.path.isfile(args.resume)
        print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        args.start_epoch = checkpoint['epoch']
        best_acc = checkpoint['best_acc']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' (epoch {})".format(
            args.resume, checkpoint['epoch']))
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                        title=title,
                        resume=True)
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(
            ['Epoch', 'LR', 'Train Loss', 'Val Loss', 'Train Acc', 'Val Acc'])

    # create data loader
    train_dataset = Mpii(args.image_path, is_train=True)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.train_batch,
                              shuffle=True,
                              num_workers=args.workers,
                              pin_memory=True)

    val_dataset = Mpii(args.image_path, is_train=False)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.test_batch,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=True)

    # train and eval
    lr = args.lr
    for epoch in range(args.start_epoch, args.epochs):
        lr = adjust_learning_rate(optimizer, epoch, lr, args.schedule,
                                  args.gamma)
        print('\nEpoch: %d | LR: %.8f' % (epoch + 1, lr))

        # train for one epoch
        train_loss, train_acc = do_training_epoch(train_loader, model, device,
                                                  optimizer)

        # evaluate on validation set
        valid_loss, valid_acc, predictions = do_validation_epoch(
            val_loader, model, device, False)

        # append logger file
        logger.append(
            [epoch + 1, lr, train_loss, valid_loss, train_acc, valid_acc])

        # remember best acc and save checkpoint
        is_best = valid_acc > best_acc
        best_acc = max(valid_acc, best_acc)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            },
            predictions,
            is_best,
            checkpoint=args.checkpoint,
            snapshot=args.snapshot)

    logger.close()
    logger.plot(['Train Acc', 'Val Acc'])
    savefig(os.path.join(args.checkpoint, 'log.eps'))
Example #3
def train_model(train_dataset, train_num_each, val_dataset, val_num_each):
    num_train = len(train_dataset)
    num_val = len(val_dataset)

    train_useful_start_idx = get_useful_start_idx(sequence_length,
                                                  train_num_each)

    val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each)

    num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu
    num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu
    # num_train_we_use = 4
    # num_val_we_use = 800

    train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use]
    val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use]

    train_idx = []
    for i in range(num_train_we_use):
        for j in range(sequence_length):
            train_idx.append(train_we_use_start_idx[i] + j)

    val_idx = []
    for i in range(num_val_we_use):
        for j in range(sequence_length):
            val_idx.append(val_we_use_start_idx[i] + j)

    num_train_all = len(train_idx)
    num_val_all = len(val_idx)

    print('num train start idx : {:6d}'.format(len(train_useful_start_idx)))
    print('last idx train start: {:6d}'.format(train_useful_start_idx[-1]))
    print('num of train dataset: {:6d}'.format(num_train))
    print('num of train we use : {:6d}'.format(num_train_we_use))
    print('num of all train use: {:6d}'.format(num_train_all))
    print('num valid start idx : {:6d}'.format(len(val_useful_start_idx)))
    print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1]))
    print('num of valid dataset: {:6d}'.format(num_val))
    print('num of valid we use : {:6d}'.format(num_val_we_use))
    print('num of all valid use: {:6d}'.format(num_val_all))

    train_loader = DataLoader(train_dataset,
                              batch_size=train_batch_size,
                              sampler=train_idx,
                              num_workers=workers,
                              pin_memory=False)
    val_loader = DataLoader(val_dataset,
                            batch_size=val_batch_size,
                            sampler=val_idx,
                            num_workers=workers,
                            pin_memory=False)
    model = multi_gru()
    if use_gpu:
        model = model.cuda()

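    # DataParallel scatters each batch along dim 0 across the available GPUs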
    model = DataParallel(model)
    criterion_1 = nn.BCEWithLogitsLoss(size_average=False)
    criterion_2 = nn.CrossEntropyLoss(size_average=False)
    sig_f = nn.Sigmoid()

    if multi_optim == 0:
        if optimizer_choice == 0:
            optimizer = optim.SGD(model.parameters(),
                                  lr=learning_rate,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                                       step_size=sgd_adjust_lr,
                                                       gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif multi_optim == 1:
        if optimizer_choice == 0:
            optimizer = optim.SGD([
                {
                    'params': model.module.share.parameters()
                },
                {
                    'params': model.module.gru.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc.parameters(),
                    'lr': learning_rate
                },
            ],
                                  lr=learning_rate / 10,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                                       step_size=sgd_adjust_lr,
                                                       gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam([
                {
                    'params': model.module.share.parameters()
                },
                {
                    'params': model.module.gru.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc.parameters(),
                    'lr': learning_rate
                },
            ],
                                   lr=learning_rate / 10)

    best_model_wts = copy.deepcopy(model.state_dict())
    best_val_accuracy_1 = 0.0
    best_val_accuracy_2 = 0.0  # judge by accu2
    correspond_train_acc_1 = 0.0
    correspond_train_acc_2 = 0.0

    record_np = np.zeros([epochs, 8])

    for epoch in range(epochs):
        # np.random.seed(epoch)
        np.random.shuffle(train_we_use_start_idx)
        train_idx = []
        for i in range(num_train_we_use):
            for j in range(sequence_length):
                train_idx.append(train_we_use_start_idx[i] + j)

        train_loader = DataLoader(train_dataset,
                                  batch_size=train_batch_size,
                                  sampler=train_idx,
                                  num_workers=workers,
                                  pin_memory=False)

        model.train()
        train_loss_1 = 0.0
        train_loss_2 = 0.0
        train_corrects_1 = 0
        train_corrects_2 = 0

        train_start_time = time.time()
        for data in train_loader:
            inputs, labels_1, labels_2 = data
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels_1 = Variable(labels_1.cuda())
                labels_2 = Variable(labels_2.cuda())
            else:
                inputs = Variable(inputs)
                labels_1 = Variable(labels_1)
                labels_2 = Variable(labels_2)

            optimizer.zero_grad()

            outputs_1, outputs_2 = model.forward(inputs)

            _, preds_2 = torch.max(outputs_2.data, 1)

            sig_out = outputs_1.data.cpu()
            sig_out = sig_f(sig_out)
            preds_1 = torch.ByteTensor(sig_out > 0.5)
            preds_1 = preds_1.long()
            train_corrects_1 += torch.sum(preds_1 == labels_1.data.cpu())
            labels_1 = Variable(labels_1.data.float())
            loss_1 = criterion_1(outputs_1, labels_1)

            loss_2 = criterion_2(outputs_2, labels_2)
            loss = loss_1 + loss_2
            loss.backward()
            optimizer.step()

            train_loss_1 += loss_1.data[0]
            train_loss_2 += loss_2.data[0]
            train_corrects_2 += torch.sum(preds_2 == labels_2.data)

        train_elapsed_time = time.time() - train_start_time
        train_accuracy_1 = train_corrects_1 / num_train_all / 7
        train_accuracy_2 = train_corrects_2 / num_train_all
        train_average_loss_1 = train_loss_1 / num_train_all / 7
        train_average_loss_2 = train_loss_2 / num_train_all

        # begin eval

        model.eval()
        val_loss_1 = 0.0
        val_loss_2 = 0.0
        val_corrects_1 = 0
        val_corrects_2 = 0

        val_start_time = time.time()
        for data in val_loader:
            inputs, labels_1, labels_2 = data
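            # keep only the phase label of the last frame in each sequence;
            # outputs_2 is sliced the same way further below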
            labels_2 = labels_2[(sequence_length - 1)::sequence_length]
            if use_gpu:
                inputs = Variable(inputs.cuda(), volatile=True)
                labels_1 = Variable(labels_1.cuda(), volatile=True)
                labels_2 = Variable(labels_2.cuda(), volatile=True)
            else:
                inputs = Variable(inputs, volatile=True)
                labels_1 = Variable(labels_1, volatile=True)
                labels_2 = Variable(labels_2, volatile=True)

            if crop_type == 0 or crop_type == 1:
                outputs_1, outputs_2 = model.forward(inputs)
            elif crop_type == 5:
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                outputs_1, outputs_2 = model.forward(inputs)
                outputs_1 = outputs_1.view(5, -1, 7)
                outputs_1 = torch.mean(outputs_1, 0)
                outputs_2 = outputs_2.view(5, -1, 7)
                outputs_2 = torch.mean(outputs_2, 0)
            elif crop_type == 10:
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                outputs_1, outputs_2 = model.forward(inputs)
                outputs_1 = outputs_1.view(10, -1, 7)
                outputs_1 = torch.mean(outputs_1, 0)
                outputs_2 = outputs_2.view(10, -1, 7)
                outputs_2 = torch.mean(outputs_2, 0)

            outputs_2 = outputs_2[sequence_length - 1::sequence_length]
            _, preds_2 = torch.max(outputs_2.data, 1)

            sig_out = outputs_1.data.cpu()
            sig_out = sig_f(sig_out)
            preds_1 = torch.ByteTensor(sig_out > 0.5)
            preds_1 = preds_1.long()
            val_corrects_1 += torch.sum(preds_1 == labels_1.data.cpu())
            labels_1 = Variable(labels_1.data.float())
            loss_1 = criterion_1(outputs_1, labels_1)
            val_loss_1 += loss_1.data[0]

            loss_2 = criterion_2(outputs_2, labels_2)
            val_loss_2 += loss_2.data[0]
            val_corrects_2 += torch.sum(preds_2 == labels_2.data)

        val_elapsed_time = time.time() - val_start_time
        val_accuracy_1 = val_corrects_1 / (num_val_all * 7)
        val_accuracy_2 = val_corrects_2 / num_val_we_use
        val_average_loss_1 = val_loss_1 / (num_val_all * 7)
        val_average_loss_2 = val_loss_2 / num_val_we_use

        print('epoch: {:4d}'
              ' train time: {:2.0f}m{:2.0f}s'
              ' train loss_1: {:4.4f}'
              ' train accu_1: {:.4f}'
              ' valid time: {:2.0f}m{:2.0f}s'
              ' valid loss_1: {:4.4f}'
              ' valid accu_1: {:.4f}'.format(
                  epoch, train_elapsed_time // 60, train_elapsed_time % 60,
                  train_average_loss_1, train_accuracy_1,
                  val_elapsed_time // 60, val_elapsed_time % 60,
                  val_average_loss_1, val_accuracy_1))
        print('epoch: {:4d}'
              ' train time: {:2.0f}m{:2.0f}s'
              ' train loss_2: {:4.4f}'
              ' train accu_2: {:.4f}'
              ' valid time: {:2.0f}m{:2.0f}s'
              ' valid loss_2: {:4.4f}'
              ' valid accu_2: {:.4f}'.format(
                  epoch, train_elapsed_time // 60, train_elapsed_time % 60,
                  train_average_loss_2, train_accuracy_2,
                  val_elapsed_time // 60, val_elapsed_time % 60,
                  val_average_loss_2, val_accuracy_2))

        if optimizer_choice == 0:
            if sgd_adjust_lr == 0:
                exp_lr_scheduler.step()
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler.step(val_average_loss_1 + val_average_loss_2)

        if val_accuracy_2 > best_val_accuracy_2 and val_accuracy_1 > 0.95:
            best_val_accuracy_2 = val_accuracy_2
            best_val_accuracy_1 = val_accuracy_1
            correspond_train_acc_1 = train_accuracy_1
            correspond_train_acc_2 = train_accuracy_2
            best_model_wts = copy.deepcopy(model.state_dict())
        elif val_accuracy_2 == best_val_accuracy_2 and val_accuracy_1 > 0.95:
            if val_accuracy_1 > best_val_accuracy_1:
                correspond_train_acc_1 = train_accuracy_1
                correspond_train_acc_2 = train_accuracy_2
                best_model_wts = copy.deepcopy(model.state_dict())
            elif val_accuracy_1 == best_val_accuracy_1:
                if train_accuracy_2 > correspond_train_acc_2:
                    correspond_train_acc_2 = train_accuracy_2
                    correspond_train_acc_1 = train_accuracy_1
                    best_model_wts = copy.deepcopy(model.state_dict())
                elif train_accuracy_2 == correspond_train_acc_2:
                    if train_accuracy_1 > correspond_train_acc_1:
                        correspond_train_acc_1 = train_accuracy_1
                        best_model_wts = copy.deepcopy(model.state_dict())

        record_np[epoch, 0] = train_accuracy_1
        record_np[epoch, 1] = train_accuracy_2
        record_np[epoch, 2] = train_average_loss_1
        record_np[epoch, 3] = train_average_loss_2
        record_np[epoch, 4] = val_accuracy_1
        record_np[epoch, 5] = val_accuracy_2
        record_np[epoch, 6] = val_average_loss_1
        record_np[epoch, 7] = val_average_loss_2

    print('best accuracy_1: {:.4f} cor train accu_1: {:.4f}'.format(
        best_val_accuracy_1, correspond_train_acc_1))
    print('best accuracy_2: {:.4f} cor train accu_2: {:.4f}'.format(
        best_val_accuracy_2, correspond_train_acc_2))
    save_val_1 = int("{:4.0f}".format(best_val_accuracy_1 * 10000))
    save_val_2 = int("{:4.0f}".format(best_val_accuracy_2 * 10000))
    save_train_1 = int("{:4.0f}".format(correspond_train_acc_1 * 10000))
    save_train_2 = int("{:4.0f}".format(correspond_train_acc_2 * 10000))
    public_name = "cnn_gru" \
                  + "_epoch_" + str(epochs) \
                  + "_length_" + str(sequence_length) \
                  + "_opt_" + str(optimizer_choice) \
                  + "_mulopt_" + str(multi_optim) \
                  + "_flip_" + str(use_flip) \
                  + "_crop_" + str(crop_type) \
                  + "_batch_" + str(train_batch_size) \
                  + "_train1_" + str(save_train_1) \
                  + "_train2_" + str(save_train_2) \
                  + "_val1_" + str(save_val_1) \
                  + "_val2_" + str(save_val_2)
    model_name = public_name + ".pth"
    torch.save(best_model_wts, model_name)

    record_name = public_name + ".npy"
    np.save(record_name, record_np)
Example #4
# define model
net = model.attention_net(topN=PROPOSAL_NUM)
if resume:
    ckpt = torch.load(resume)
    net.load_state_dict(ckpt['net_state_dict'])
    start_epoch = ckpt['epoch'] + 1
creterion = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(net.parameters(),
                            lr=LR,
                            momentum=0.9,
                            weight_decay=WD)
# optimizer = torch.optim.Adam(net.parameters(), lr=LR, weight_decay=WD)
schedulers = [MultiStepLR(optimizer, milestones=[60, 100], gamma=0.1)]
net = net.cuda()
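# move the network to the GPU before wrapping it with DataParallel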
net = DataParallel(net)

for epoch in range(start_epoch, 500):
    for scheduler in schedulers:
        scheduler.step()

    # begin training
    _print('--' * 50)
    net.train()
    total_tmp = 0
    raw_tmp = 0
    rank_tmp = 0
    concat_tmp = 0
    partcls_tmp = 0
    for i, data in enumerate(trainloader):
        img, label = data[0].cuda(), data[1].cuda()
    print('{} train iters per epoch:'.format(len(trainloader)))

Example #5
    test_dataset = Dataset(opt.test_root, opt.test_pd_root, opt.test_list, phase='test', input_shape=opt.input_shape)
    testloader = data.DataLoader(test_dataset,
                                  batch_size=opt.train_batch_size,
                                  shuffle=False,
                                  num_workers=opt.num_workers)
    criterion = LossFunction()

    embedding_net=Unet_down()
    regression_net=Unet_up()

    model = AlignmentNet(embedding_net, regression_net)

    if opt.finetune:
        model = DataParallel(model)
        load_model(model, opt.load_model_path)
        model.load_state_dict(torch.load(opt.load_model_path))
        model.to(torch.device("cuda"))
    else:
        model.to(device)
        model = DataParallel(model)

    if opt.optimizer == 'sgd':
        optimizer = torch.optim.SGD([{'params': model.parameters()}],
                                    lr=opt.lr, weight_decay=opt.weight_decay)
    else:
        optimizer = torch.optim.Adam([{'params': model.parameters()}],
                                     lr=opt.lr, weight_decay=opt.weight_decay)
    scheduler = StepLR(optimizer, step_size=opt.lr_step, gamma=0.1)
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False,
                        help='model configuration file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='vocabulary file')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='raw training corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw corpus first')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='number of training epochs')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='learning rate')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='number of warmup steps')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='steps between loss reports')
    parser.add_argument('--stride', default=768, type=int, required=False, help='window stride over the training data')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed-precision training')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='number of pieces to split the training corpus into')
    parser.add_argument('--min_length', default=128, type=int, required=False, help='minimum article length to keep')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='model output directory')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='pretrained model to start training from')
    parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str, required=False, help='TensorBoard log directory')
    parser.add_argument('--segment', action='store_true', help='tokenize Chinese at the word level')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json", type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe", type=str, help="vocab.bpe")

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # 此处设置程序使用哪些显卡

    model_config = pytorch_transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = n_ctx
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the tokenized dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print('building files')
        build_files(data_path=raw_data_path, tokenized_data_path=tokenized_data_path, num_pieces=num_pieces,
                    full_tokenizer=full_tokenizer, min_length=min_length)
        print('files built')

    if not args.pretrained_model:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                                          t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

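    # wrap with DataParallel only when several GPUs are visible; the multi_gpu
    # flag is checked later to average the per-GPU losses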
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True
    print('starting training')
    overall_step = 0
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point: start_point + n_ctx])
                start_point += stride
            start_point -= stride
            last = tokens[start_point + n_ctx:]
            last.extend(full_tokenizer.convert_tokens_to_ids(['[PAD]']) * (n_ctx - len(last)))
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last

                #  prepare data
                batch = samples[step * batch_size: (step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass
                outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
                loss, logits = outputs[:2]

                #  get loss
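                # DataParallel returns one loss per GPU; reduce to a scalar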
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                #  optimizer step
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                    overall_step += 1
                    if (overall_step + 1) % log_step == 0:
                        tb_writer.add_scalar('loss', loss.item(), overall_step)
                if (overall_step + 1) % log_step == 0:
                    print('now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'.format(
                        datetime.now().hour,
                        datetime.now().minute,
                        (step + 1) // gradient_accumulation,
                        piece_num,
                        epoch + 1,
                        running_loss * gradient_accumulation / log_step))
                    running_loss = 0
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
Example #7
def main(args: argparse.Namespace):
    logger = CompleteLogger(args.log, args.phase)
    print(args)

    if args.seed is not None:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    cudnn.benchmark = True

    # Data loading code
    train_transform = utils.get_train_transform(args.height,
                                                args.width,
                                                args.train_resizing,
                                                random_horizontal_flip=True,
                                                random_color_jitter=False,
                                                random_gray_scale=False,
                                                random_erasing=False)
    val_transform = utils.get_val_transform(args.height, args.width)
    print("train_transform: ", train_transform)
    print("val_transform: ", val_transform)

    working_dir = osp.dirname(osp.abspath(__file__))
    source_root = osp.join(working_dir, args.source_root)
    target_root = osp.join(working_dir, args.target_root)

    # source dataset
    source_dataset = datasets.__dict__[args.source](
        root=osp.join(source_root, args.source.lower()))
    sampler = RandomMultipleGallerySampler(source_dataset.train,
                                           args.num_instances)
    train_source_loader = DataLoader(convert_to_pytorch_dataset(
        source_dataset.train,
        root=source_dataset.images_dir,
        transform=train_transform),
                                     batch_size=args.batch_size,
                                     num_workers=args.workers,
                                     sampler=sampler,
                                     pin_memory=True,
                                     drop_last=True)
    train_source_iter = ForeverDataIterator(train_source_loader)
    val_loader = DataLoader(convert_to_pytorch_dataset(
        list(set(source_dataset.query) | set(source_dataset.gallery)),
        root=source_dataset.images_dir,
        transform=val_transform),
                            batch_size=args.batch_size,
                            num_workers=args.workers,
                            shuffle=False,
                            pin_memory=True)

    # target dataset
    target_dataset = datasets.__dict__[args.target](
        root=osp.join(target_root, args.target.lower()))
    train_target_loader = DataLoader(convert_to_pytorch_dataset(
        target_dataset.train,
        root=target_dataset.images_dir,
        transform=train_transform),
                                     batch_size=args.batch_size,
                                     num_workers=args.workers,
                                     shuffle=True,
                                     pin_memory=True,
                                     drop_last=True)
    train_target_iter = ForeverDataIterator(train_target_loader)
    test_loader = DataLoader(convert_to_pytorch_dataset(
        list(set(target_dataset.query) | set(target_dataset.gallery)),
        root=target_dataset.images_dir,
        transform=val_transform),
                             batch_size=args.batch_size,
                             num_workers=args.workers,
                             shuffle=False,
                             pin_memory=True)

    # create model
    num_classes = source_dataset.num_train_pids
    backbone = utils.get_model(args.arch)
    pool_layer = nn.Identity() if args.no_pool else None
    model = ReIdentifier(backbone,
                         num_classes,
                         finetune=args.finetune,
                         pool_layer=pool_layer).to(device)
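    # wrap with DataParallel; the underlying module is reached through
    # model.module below (e.g. when building the optimizer)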
    model = DataParallel(model)

    # define optimizer and lr scheduler
    optimizer = Adam(model.module.get_parameters(base_lr=args.lr,
                                                 rate=args.rate),
                     args.lr,
                     weight_decay=args.weight_decay)
    lr_scheduler = WarmupMultiStepLR(optimizer,
                                     args.milestones,
                                     gamma=0.1,
                                     warmup_factor=0.1,
                                     warmup_steps=args.warmup_steps)

    # resume from the best checkpoint
    if args.phase != 'train':
        checkpoint = torch.load(logger.get_checkpoint_path('best'),
                                map_location='cpu')
        model.load_state_dict(checkpoint)

    # analysis the model
    if args.phase == 'analysis':
        # plot t-SNE
        utils.visualize_tsne(source_loader=val_loader,
                             target_loader=test_loader,
                             model=model,
                             filename=osp.join(logger.visualize_directory,
                                               'analysis', 'TSNE.pdf'),
                             device=device)
        # visualize ranked results
        visualize_ranked_results(test_loader,
                                 model,
                                 target_dataset.query,
                                 target_dataset.gallery,
                                 device,
                                 visualize_dir=logger.visualize_directory,
                                 width=args.width,
                                 height=args.height,
                                 rerank=args.rerank)
        return

    if args.phase == 'test':
        print("Test on source domain:")
        validate(val_loader,
                 model,
                 source_dataset.query,
                 source_dataset.gallery,
                 device,
                 cmc_flag=True,
                 rerank=args.rerank)
        print("Test on target domain:")
        validate(test_loader,
                 model,
                 target_dataset.query,
                 target_dataset.gallery,
                 device,
                 cmc_flag=True,
                 rerank=args.rerank)
        return

    # define loss function
    criterion_ce = CrossEntropyLossWithLabelSmooth(num_classes).to(device)
    criterion_triplet = SoftTripletLoss(margin=args.margin).to(device)

    # start training
    best_val_mAP = 0.
    best_test_mAP = 0.
    for epoch in range(args.epochs):
        # print learning rate
        print(lr_scheduler.get_lr())

        # train for one epoch
        train(train_source_iter, train_target_iter, model, criterion_ce,
              criterion_triplet, optimizer, epoch, args)

        # update learning rate
        lr_scheduler.step()

        if (epoch + 1) % args.eval_step == 0 or (epoch == args.epochs - 1):

            # evaluate on validation set
            print("Validation on source domain...")
            _, val_mAP = validate(val_loader,
                                  model,
                                  source_dataset.query,
                                  source_dataset.gallery,
                                  device,
                                  cmc_flag=True)

            # remember best mAP and save checkpoint
            torch.save(model.state_dict(),
                       logger.get_checkpoint_path('latest'))
            if val_mAP > best_val_mAP:
                shutil.copy(logger.get_checkpoint_path('latest'),
                            logger.get_checkpoint_path('best'))
            best_val_mAP = max(val_mAP, best_val_mAP)

            # evaluate on test set
            print("Test on target domain...")
            _, test_mAP = validate(test_loader,
                                   model,
                                   target_dataset.query,
                                   target_dataset.gallery,
                                   device,
                                   cmc_flag=True,
                                   rerank=args.rerank)
            best_test_mAP = max(test_mAP, best_test_mAP)

    # evaluate on test set
    model.load_state_dict(torch.load(logger.get_checkpoint_path('best')))
    print("Test on target domain:")
    _, test_mAP = validate(test_loader,
                           model,
                           target_dataset.query,
                           target_dataset.gallery,
                           device,
                           cmc_flag=True,
                           rerank=args.rerank)
    print("test mAP on target = {}".format(test_mAP))
    print("oracle mAP on target = {}".format(best_test_mAP))
    logger.close()
Example #8
    def __init__(self, model_name, batch_size, gpu_memory):
        super().__init__(batch_size, gpu_memory)
        if model_name in [
                'pt_vgg', 'pt_resnet', 'pt_inception', 'pt_densenet'
        ]:
            model = model_class_dict[model_name](pretrained=True)
            self.mean = np.reshape([0.485, 0.456, 0.406], [1, 3, 1, 1])
            self.std = np.reshape([0.229, 0.224, 0.225], [1, 3, 1, 1])
            model = DataParallel(model.cuda())
        else:
            model = model_class_dict[model_name]()
            if model_name in ['pt_post_avg_cifar10', 'pt_post_avg_imagenet']:
                # checkpoint = torch.load(model_path_dict[model_name])
                self.mean = np.reshape([0.485, 0.456, 0.406], [1, 3, 1, 1])
                self.std = np.reshape([0.229, 0.224, 0.225], [1, 3, 1, 1])
            else:
                model = DataParallel(model).cuda()
                checkpoint = torch.load(model_path_dict[model_name] + '.pth')
                self.mean = np.reshape([0.485, 0.456, 0.406], [1, 3, 1, 1])
                self.std = np.reshape([0.225, 0.225, 0.225], [1, 3, 1, 1])
                model.load_state_dict(checkpoint)
                model.float()
        self.mean, self.std = self.mean.astype(np.float32), self.std.astype(
            np.float32)

        model.eval()
        self.model = model
Example #9
# training parameters
BATCH_SIZE = 100
LR = 0.0002
EPOCHS = 20

# data_loader
IMG_SIZE = 32
'''
Create the networks
'''
Net_G = Generator(depth=128)
Net_D = Discriminator(depth=128)
Net_G.weight_init(mean=0.0, std=0.02)
Net_D.weight_init(mean=0.0, std=0.02)

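# wrap both networks with DataParallel; they are moved to the GPU below only
# when more than one GPU is configured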
Net_G = DataParallel(Net_G)
Net_D = DataParallel(Net_D)
if GPU_NUMS > 1:
    Net_G.cuda()
    Net_D.cuda()
'''
Load and preprocess the data
'''
transform = Compose([
    Scale(IMG_SIZE),
    ToTensor(),
    Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
])
train_loader = torch.utils.data.DataLoader(
    # MNIST('data', train=True, download=True, transform=transform),
    MNISTDataSet('../ganData/mnist.npz', train=True, transform=transform),
Example #10
def main():
    global args
    args = parser.parse_args()

    bestLoss = 1000

    torch.manual_seed(0)
    torch.cuda.set_device(0)

    model = import_module(args.model)
    config, net, loss, get_pbb = model.get_model()
    start_epoch = args.start_epoch
    save_dir = args.save_dir

    if args.resume:
        print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(save_dir + 'detector_' + args.resume)
        start_epoch = checkpoint['epoch']
        net.load_state_dict(checkpoint['state_dict'])

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir, 'log')
    if args.test != 1:
        sys.stdout = Logger(logfile)
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(save_dir, f))
    n_gpu = setgpu(args.gpu)
    args.n_gpu = n_gpu
    net = net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
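    # wrap the detector with DataParallel so batches are split across the GPUs
    # made visible by setgpu above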
    net = DataParallel(net)
    datadir = config_training['preprocess_result_path']

    luna_data = np.load(
        '/home/jiancong/LungNodule_DL/detector/luna_folds/luna_fold6.npy')
    luna_train = luna_data[1]
    luna_test = luna_data[0]

    if args.test == 1:
        print("start test")
        margin = 32
        sidelen = 144

        split_comber = SplitComb(sidelen, config['max_stride'],
                                 config['stride'], margin, config['pad_value'])
        dataset = LungNodule3Ddetector(datadir,
                                       luna_test,
                                       config,
                                       phase='test',
                                       split_comber=split_comber)
        test_loader = DataLoader(dataset,
                                 batch_size=1,
                                 shuffle=False,
                                 num_workers=args.workers,
                                 collate_fn=collate,
                                 pin_memory=False)

        test(test_loader, net, get_pbb, save_dir, config)
        return

    dataset = LungNodule3Ddetector(datadir, luna_train, config, phase='train')
    train_loader = DataLoader(dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.workers,
                              pin_memory=True)

    dataset = LungNodule3Ddetector(datadir, luna_test, config, phase='val')
    val_loader = DataLoader(dataset,
                            batch_size=16,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=True)

    optimizer = torch.optim.SGD(net.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=args.weight_decay)

    def get_lr(epoch):
        if epoch <= args.epochs * 0.2:
            lr = args.lr
        elif epoch <= args.epochs * 0.4:
            lr = 0.1 * args.lr
        elif epoch <= args.epochs * 0.6:
            lr = 0.05 * args.lr
        else:
            lr = 0.01 * args.lr
        return lr

    for epoch in range(start_epoch, args.epochs + 1):
        train(train_loader, net, loss, epoch, optimizer, get_lr, save_dir)
        print("finsihed epoch {}".format(epoch))
        valiloss = validate(val_loader, net, loss)

        if bestLoss > valiloss:
            bestLoss = valiloss
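            # save the unwrapped module's weights on CPU so the checkpoint can
            # be reloaded without DataParallel or a GPU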
            state_dict = net.module.state_dict()
            for key in state_dict.keys():
                state_dict[key] = state_dict[key].cpu()

            torch.save(
                {
                    'epoch': epoch + 1,
                    'save_dir': save_dir,
                    'state_dict': state_dict,
                    'args': args
                }, os.path.join(save_dir, 'detector_%03d.ckpt' % epoch))
            print("save model on epoch %d" % epoch)
Example #11
File: 0.py Project: tkkcc/tnrd
    for epoch in range(o.epoch):
        for i in tqdm(d):
            g, y, k, s = [x.to(o.device) for x in i]
            x = y
            optimizer.zero_grad()
            out = m(x)
            log("out", out)
            loss = npsnr(out, g)
            loss.backward()
            optimizer.step()
            losss.append(loss.detach().item())
            assert not isnan(losss[-1])
            print("stage", stage, "epoch", epoch + 1)
            log("loss", mean(losss[-5:]))
            num += 1
            # if num > (o.epoch * iter_num - 4):
            if num % 50 == 1:
                show(torch.cat((y[0, 0], g[0, 0], out[0, 0]), 1),
                     # save=f"save/{stage:02}{epoch:02}.png",
                     )
    plt.clf()
    plt.plot(range(len(losss)), losss)
    plt.xlabel("batch")
    plt.ylabel("loss")
    plt.title(f"{iter_num} iter x {o.epoch} epoch")
    plt.savefig(f"save/{stage:02}loss.png")


m = DataParallel(M()).to(o.device)
train(m)
Exemplo n.º 12
0
def main():
    args = parser.parse_args()

    log_out_dir = os.path.join(RESULT_DIR, 'logs', args.out_dir,
                               'fold%d' % args.fold)
    if not os.path.exists(log_out_dir):
        os.makedirs(log_out_dir)
    log = Logger()
    log.open(os.path.join(log_out_dir, 'log.train.txt'), mode='a')

    model_out_dir = os.path.join(RESULT_DIR, 'models', args.out_dir,
                                 'fold%d' % args.fold)
    log.write(">> Creating directory if it does not exist:\n>> '{}'\n".format(
        model_out_dir))
    if not os.path.exists(model_out_dir):
        os.makedirs(model_out_dir)

    # set cuda visible device
    if not args.all_gpus:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    cudnn.benchmark = True
    # cudnn.enabled = False

    # set random seeds
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    np.random.seed(0)

    model_params = {}
    model_params['architecture'] = args.arch
    model_params['num_classes'] = args.num_classes
    model_params['in_channels'] = args.in_channels
    if 'efficientnet' in args.arch:
        model_params['image_size'] = args.img_size
        model_params['encoder'] = args.effnet_encoder

    model = init_network(model_params)

    if args.load_state_dict_path is not None:
        if args.load_state_dict_path == 'use-img-level-densenet-ckpt':
            model_dir = '../output/models/densenet121_1024_all_data__obvious_neg__gradaccum_20__start_lr_3e6'
            pretrained_ckpt_path = os.path.join(f'{model_dir}',
                                                f'fold{args.fold}',
                                                'final.pth')
        else:
            pretrained_ckpt_path = args.load_state_dict_path
        init_pretrained = torch.load(pretrained_ckpt_path)
        model.load_state_dict(init_pretrained['state_dict'])

    if args.all_gpus:
        model = DataParallel(model)
    model.cuda()

    # define loss function (criterion)
    try:
        criterion = eval(args.loss)().cuda()
    except Exception:
        raise RuntimeError("Loss {} not available!".format(args.loss))

    start_epoch = 0
    best_loss = 1e5
    best_epoch = 0
    best_focal = float('inf')

    # define scheduler
    try:
        scheduler = eval(args.scheduler)(
            scheduler_lr_multiplier=args.scheduler_lr_multiplier,
            scheduler_epoch_offset=args.scheduler_epoch_offset)
    except Exception:
        raise RuntimeError("Scheduler {} not available!".format(args.scheduler))
    optimizer = scheduler.schedule(model, start_epoch, args.epochs)[0]
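    # the scheduler object both builds the optimizer here and later adjusts its
    # learning rate once per epoch via scheduler.step(model, epoch, args.epochs)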

    # optionally resume from a checkpoint
    if args.resume:
        args.resume = os.path.join(model_out_dir, args.resume)
        if os.path.isfile(args.resume):
            # load checkpoint weights and update model and optimizer
            log.write(">> Loading checkpoint:\n>> '{}'\n".format(args.resume))

            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            best_epoch = checkpoint['best_epoch']
            best_focal = checkpoint['best_map']
            model.load_state_dict(checkpoint['state_dict'])

            optimizer_fpath = args.resume.replace('.pth', '_optim.pth')
            if os.path.exists(optimizer_fpath):
                log.write(">> Loading checkpoint:\n>> '{}'\n".format(
                    optimizer_fpath))
                optimizer.load_state_dict(
                    torch.load(optimizer_fpath)['optimizer'])
            log.write(">>>> loaded checkpoint:\n>>>> '{}' (epoch {})\n".format(
                args.resume, checkpoint['epoch']))
        else:
            log.write(">> No checkpoint found at '{}'\n".format(args.resume))

    # Data loading code
    train_transform = train_multi_augment2

    with open('../input/imagelevel_folds_obvious_staining_5.pkl', 'rb') as f:
        folds = pickle.load(f)
    fold = args.fold
    trn_img_paths, val_img_paths = folds[fold]

    train_df = get_train_df_ohe(clean_from_duplicates=True)
    basepath_2_ohe_vector = {
        img: vec
        for img, vec in zip(train_df['img_base_path'],
                            train_df.iloc[:, 2:].values)
    }

    public_hpa_df_17 = get_public_df_ohe(clean_from_duplicates=True)
    public_basepath_2_ohe_vector = {
        img_path: vec
        for img_path, vec in zip(public_hpa_df_17['img_base_path'],
                                 public_hpa_df_17.iloc[:, 2:].values)
    }
    basepath_2_ohe_vector.update(public_basepath_2_ohe_vector)

    available_paths = set(
        np.concatenate((train_df['img_base_path'].values,
                        public_hpa_df_17['img_base_path'].values)))
    trn_img_paths = [path for path in trn_img_paths if path in available_paths]
    val_img_paths = [path for path in val_img_paths if path in available_paths]
    labels_df = pd.read_hdf(args.cell_level_labels_path)

    # modifying minor class labels
    cherrypicked_mitotic_spindle = pd.read_csv(
        '../input/mitotic_cells_selection.csv')

    cherrypicked_mitotic_spindle_img_cell = set(
        cherrypicked_mitotic_spindle[['ID', 'cell_i']].apply(tuple,
                                                             axis=1).values)

    cherrypicked_mitotic_spindle_img_cell = {
        (img, cell_i - 1)
        for img, cell_i in cherrypicked_mitotic_spindle_img_cell
    }
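    # (the selection CSV appears to use 1-based cell indices, hence the shift
    #  above to the 0-based indexing used by labels_df)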

    class_names = get_class_names()
    mitotic_spindle_class_i = class_names.index('Mitotic spindle')

    if args.include_nn_mitotic:
        cherrypicked_mitotic_spindle_based_on_nn = pd.read_csv(
            '../input/mitotic_pos_nn_added.csv')
        cherrypicked_mitotic_spindle_img_cell.update(
            set(cherrypicked_mitotic_spindle_based_on_nn[[
                'ID', 'cell_i'
            ]].apply(tuple, axis=1).values))
        print('len cherrypicked_mitotic_spindle_img_cell',
              len(cherrypicked_mitotic_spindle_img_cell))
    mitotic_bool_idx = labels_df.index.isin(
        cherrypicked_mitotic_spindle_img_cell)

    def modify_label(labels, idx, val):
        labels[idx] = val
        return labels

    labels_df.loc[mitotic_bool_idx, 'image_level_pred'] = labels_df.loc[
        mitotic_bool_idx, 'image_level_pred'].map(
            lambda x: modify_label(x, mitotic_spindle_class_i, 1))

    if args.include_nn_mitotic:
        cherrypicked_not_mitotic_spindle_based_on_nn = pd.read_csv(
            '../input/mitotic_neg_nn_added.csv')
        cherrypicked_not_mitotic_spindle_based_on_nn = set(
            cherrypicked_not_mitotic_spindle_based_on_nn[[
                'ID', 'cell_i'
            ]].apply(tuple, axis=1).values)
        not_mitotic_bool_idx = labels_df.index.isin(
            cherrypicked_not_mitotic_spindle_based_on_nn)
        labels_df.loc[not_mitotic_bool_idx,
                      'image_level_pred'] = labels_df.loc[
                          not_mitotic_bool_idx,
                          'image_level_pred'].map(lambda x: modify_label(
                              x, mitotic_spindle_class_i, 0))

    if args.ignore_negative:
        raise NotImplementedError

    if args.upsample_minorities:
        cells_to_upsample = list(cherrypicked_mitotic_spindle_img_cell)
        aggresome_class_i = class_names.index('Aggresome')
        confident_aggresome_indices = list(
            labels_df.index[labels_df['image_level_pred'].map(
                lambda x: x[aggresome_class_i] > 0.9)])
        print('confident_aggresome_indices len',
              len(confident_aggresome_indices))
        print('confident_aggresome_indices[:5]',
              confident_aggresome_indices[:5])
        cells_to_upsample += confident_aggresome_indices
    else:
        cells_to_upsample = None
    train_dataset = ProteinDatasetCellSeparateLoading(
        trn_img_paths,
        labels_df=labels_df,
        cells_to_upsample=cells_to_upsample,
        img_size=args.img_size,
        in_channels=args.in_channels,
        transform=train_transform,
        basepath_2_ohe=basepath_2_ohe_vector,
        normalize=args.normalize,
        target_raw_img_size=args.target_raw_img_size)
    train_loader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=args.batch_size,
        drop_last=False,
        num_workers=args.workers,
        pin_memory=True,
    )

    # valid_dataset = ProteinDatasetCellLevel(val_img_paths,
    #                                         labels_df=labels_df,
    #                                         img_size=args.img_size,
    #                                         batch_size=64,
    #                                         is_trainset=True,
    #                                         in_channels=args.in_channels)

    valid_dataset = ProteinDatasetCellSeparateLoading(
        val_img_paths,
        labels_df=labels_df,
        img_size=args.img_size,
        in_channels=args.in_channels,
        basepath_2_ohe=basepath_2_ohe_vector,
        normalize=args.normalize,
        target_raw_img_size=args.target_raw_img_size)
    valid_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=args.batch_size,
                              drop_last=False,
                              num_workers=args.workers,
                              pin_memory=True)

    log.write('** start training here! **\n')
    log.write('\n')
    log.write(
        'epoch    iter      rate     |  train_loss/acc  |    valid_loss/acc/map/focal     |best_epoch/best_focal|  min \n'
    )
    log.write(
        '-----------------------------------------------------------------------------------------------------------------\n'
    )
    start_epoch += 1

    if args.eval_at_start:
        with torch.no_grad():
            valid_loss, valid_acc, val_focal, val_map_score = validate(
                valid_loader, model, criterion, -1, log)
        print('\r', end='', flush=True)
        log.write(
            '%5.1f   %5d    %0.6f   |  %0.4f  %0.4f  |    %0.4f  %6.4f %6.4f %6.1f  |    %6.4f  %6.4f   | %3.1f min \n' % \
            (-1, -1, -1, -1, -1, valid_loss, valid_acc, val_map_score, val_focal,
                   best_epoch, best_focal, -1))

    for epoch in range(start_epoch, args.epochs + 1):
        end = time.time()

        # set manual seeds per epoch
        np.random.seed(epoch)
        torch.manual_seed(epoch)
        torch.cuda.manual_seed_all(epoch)

        # adjust learning rate for each epoch
        lr_list = scheduler.step(model, epoch, args.epochs)
        lr = lr_list[0]

        # train for one epoch on train set
        iter, train_loss, train_acc = train(
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            clipnorm=args.clipnorm,
            lr=lr,
            agg_steps=args.gradient_accumulation_steps)

        with torch.no_grad():
            valid_loss, valid_acc, val_focal, val_map_score = validate(
                valid_loader, model, criterion, epoch, log)

        # remember best loss and save checkpoint
        is_best = val_focal < best_focal
        best_loss = min(valid_loss, best_loss)
        best_epoch = epoch if is_best else best_epoch
        best_focal = val_focal if is_best else best_focal

        print('\r', end='', flush=True)
        log.write('%5.1f   %5d    %0.6f   |  %0.4f  %0.4f  |    %0.4f  %6.4f %6.4f  %6.1f |  %6.4f  %6.4f | %3.1f min \n' % \
                  (epoch, iter + 1, lr, train_loss, train_acc, valid_loss, valid_acc, val_map_score, val_focal,
                   best_epoch, best_focal, (time.time() - end) / 60))

        save_model(model,
                   is_best,
                   model_out_dir,
                   optimizer=optimizer,
                   epoch=epoch,
                   best_epoch=best_epoch,
                   best_map=best_focal)
Exemplo n.º 13
0
def train(args):
    print('start training...')
    model, model_file = create_model(args)
    train_loader, val_loader = get_train_val_loaders(batch_size=args.train_batch_size, val_batch_size=args.val_batch_size)
    frame_loader, _ = get_frame_train_loader(batch_size=args.frame_batch_size)
    #model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0)

    if args.optim == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0001)
    elif args.optim == 'RAdam':
        optimizer = RAdam(model.parameters(), lr=args.lr, weight_decay=0.0001)
    else:
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=0.0001)

    if args.lrs == 'plateau':
        lr_scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=args.factor, patience=args.patience, min_lr=args.min_lr)
    else:
        lr_scheduler = CosineAnnealingLR(optimizer, args.t_max, eta_min=args.min_lr)

    model = model.cuda()
    if torch.cuda.device_count() > 1:
        model_name = model.name
        model = DataParallel(model)
        model.name = model_name

    #model=model.train()

    best_f2 = 0.
    best_key = 'top1'

    print('epoch |    lr     |       %        |  loss  |  avg   |  loss  |  top1   |  top10  |  best  | time |  save |')

    if not args.no_first_val:
        val_metrics = validate(args, model, val_loader)
        print('val   |           |                |        |        | {:.4f} | {:.4f} | {:.4f} | {:.4f} |       |        |'.format(
            val_metrics['valid_loss'], val_metrics['top1'], val_metrics['top10'], val_metrics[best_key] ))

        best_f2 = val_metrics[best_key]

    if args.val:
        return

    model.train()

    if args.lrs == 'plateau':
        lr_scheduler.step(best_f2)
    else:
        lr_scheduler.step()


    #for epoch in range(args.start_epoch, args.num_epochs):
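    # get_batch cycles a DataLoader indefinitely: when the underlying iterator
    # is exhausted it is re-created, the epoch counter is incremented and the
    # batch index restarts from zero.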
    def get_batch(loader, iterator=None, epoch=0, batch_idx=0):
        ret_epoch = epoch
        ret_batch_idx = batch_idx + 1
        if iterator is None:
            iterator = loader.__iter__()
        try:
            b = iterator.__next__()
        except StopIteration:
            iterator = loader.__iter__()
            b = iterator.__next__()
            ret_epoch += 1
            ret_batch_idx = 0
        return b, iterator, ret_epoch, ret_batch_idx

    frame_epoch = args.start_epoch
    train_epoch = 0
    frame_iter = frame_loader.__iter__()
    train_iter = train_loader.__iter__()
    train_step = 0
    frame_batch_idx = -1
    train_batch_idx = -1


    while frame_epoch <= args.num_epochs:
        frame_loss = 0.
        train_loss = 0.
        current_lr = get_lrs(optimizer)
        bg = time.time()

        def train_batch(rgb, audio, labels):
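            # one optimization step on a single mini-batch; `criterion` is
            # assumed to be defined at module level in the original script.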
            output = model(rgb, audio)
            
            loss = criterion(output, labels)
            batch_size = rgb.size(0)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            return loss.item()


        for i in range(200):
            batch, frame_iter, frame_epoch, frame_batch_idx = get_batch(frame_loader, frame_iter, frame_epoch, frame_batch_idx)
            rgb, audio, labels = batch[0].cuda(), batch[2].cuda(), batch[4].cuda()
            
            loss_val = train_batch(rgb, audio, labels)
            frame_loss += loss_val
            print('\r F{:4d} | {:.7f} | {:06d}/{} | {:.4f} | {:.4f} |'.format(
                frame_epoch, float(current_lr[0]), args.frame_batch_size*(frame_batch_idx+1), frame_loader.num, loss_val, frame_loss/(i+1)), end='')
        print('')
        for i in range(100):
            batch, train_iter, train_epoch, train_batch_idx = get_batch(train_loader, train_iter, train_epoch, train_batch_idx)
            rgb, audio, labels = [x.cuda() for x in batch]
            
            loss_val = train_batch(rgb, audio, labels)
            train_loss += loss_val
            print('\r T{:4d} | {:.7f} | {:06d}/{} | {:.4f} | {:.4f} |'.format(
                train_epoch, float(current_lr[0]), args.train_batch_size*(train_batch_idx+1), train_loader.num, loss_val, train_loss/(i+1)), end='')


        if train_step > 0 and train_step % args.iter_val == 0:
            if isinstance(model, DataParallel):
                torch.save(model.module.state_dict(), model_file+'_latest')
            else:
                torch.save(model.state_dict(), model_file+'_latest')

            val_metrics = validate(args, model, val_loader)
            
            _save_ckp = ''
            if args.always_save or val_metrics[best_key] > best_f2:
                best_f2 = val_metrics[best_key]
                if isinstance(model, DataParallel):
                    torch.save(model.module.state_dict(), model_file)
                else:
                    torch.save(model.state_dict(), model_file)
                _save_ckp = '*'
            print(' {:.4f} | {:.4f} | {:.4f} | {:.4f} | {:.2f} |  {:4s} |'.format(
                val_metrics['valid_loss'], val_metrics['top1'], val_metrics['top10'], best_f2,
                (time.time() - bg) / 60, _save_ckp))

            model.train()
            if args.lrs == 'plateau':
                lr_scheduler.step(best_f2)
            else:
                lr_scheduler.step()
            current_lr = get_lrs(optimizer)
    
        train_step += 1
Exemplo n.º 14
0
def main():
    global args
    args = parser.parse_args()
    
    
    torch.manual_seed(0)
    torch.cuda.set_device(0)

    model = import_module(args.model)
    config, net, loss, get_pbb = model.get_model()
    start_epoch = args.start_epoch
    save_dir = args.save_dir
    
    if args.resume:
        checkpoint = torch.load(args.resume)
        if start_epoch == 0:
            start_epoch = checkpoint['epoch'] + 1
        if not save_dir:
            save_dir = checkpoint['save_dir']
        else:
            save_dir = os.path.join('results',save_dir)
        net.load_state_dict(checkpoint['state_dict'])
    else:
        if start_epoch == 0:
            start_epoch = 1
        if not save_dir:
            exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
            save_dir = os.path.join('results', args.model + '-' + exp_id)
        else:
            save_dir = os.path.join('results',save_dir)
    
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir,'log')
    if args.test!=1:
        sys.stdout = Logger(logfile)
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f,os.path.join(save_dir,f))
    n_gpu = setgpu(args.gpu)
    args.n_gpu = n_gpu
    net = net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
    net = DataParallel(net)
    datadir = config_training['preprocess_result_path']
    
    if args.test == 1:
        margin = 32
        sidelen = 144

        split_comber = SplitComb(sidelen,config['max_stride'],config['stride'],margin,config['pad_value'])
        dataset = data.DataBowl3Detector(
            datadir,
            'full.npy',
            config,
            phase='test',
            split_comber=split_comber)
        test_loader = DataLoader(
            dataset,
            batch_size = 1,
            shuffle = False,
            num_workers = args.workers,
            collate_fn = data.collate,
            pin_memory=False)
        
        test(test_loader, net, get_pbb, save_dir,config)
        return

    #net = DataParallel(net)
    
    dataset = data.DataBowl3Detector(
        datadir,
        'kaggleluna_full.npy',
        config,
        phase = 'train')
    train_loader = DataLoader(
        dataset,
        batch_size = args.batch_size,
        shuffle = True,
        num_workers = args.workers,
        pin_memory=True)

    dataset = data.DataBowl3Detector(
        datadir,
        'valsplit.npy',
        config,
        phase = 'val')
    val_loader = DataLoader(
        dataset,
        batch_size = args.batch_size,
        shuffle = False,
        num_workers = args.workers,
        pin_memory=True)

    optimizer = torch.optim.SGD(
        net.parameters(),
        args.lr,
        momentum = 0.9,
        weight_decay = args.weight_decay)
    
    def get_lr(epoch):
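        # piecewise-constant decay: the base LR for the first half of training,
        # then 10% of it up to 80% of the epochs, and 1% afterwards.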
        if epoch <= args.epochs * 0.5:
            lr = args.lr
        elif epoch <= args.epochs * 0.8:
            lr = 0.1 * args.lr
        else:
            lr = 0.01 * args.lr
        return lr
    

    for epoch in range(start_epoch, args.epochs + 1):
        train(train_loader, net, loss, epoch, optimizer, get_lr, args.save_freq, save_dir)
        validate(val_loader, net, loss)
Exemplo n.º 15
0
def train_linear(epochs, batch_size, dev_ids, learning_rate=0.001, save_file=None, show_batch=True):

    # get data and dataloader
    voice_data = VoiceData()
    dataloader = DataLoader(voice_data,
                            shuffle=True,
                            batch_size=batch_size)

    # get device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # declare model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        model = LinMod().to(device)
        model = DataParallel(model, device_ids=dev_ids)
    else:
        model = LinMod()
       
    # training mode
    model.train()

    # declare training methodology
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    track_epoch_time = []
    for e in range(epochs):
        start = time.time()
        print("Starting Epoch", e)
        # Loop over minibatches
        for i, (x, y) in enumerate(dataloader):

            # move to device
            x = x.to(device)
            y = y.to(device)

            # zero gradient
            optimizer.zero_grad()

            # forward
            y_ = model(x)
            loss = criterion(y_,y)
            loss.backward()
            optimizer.step() 

            if show_batch:
                print("batch {}/{} \t loss: {}".format(i, len(voice_data)/batch_size, float(loss)), end='\r')

        epoch_time = time.time()-start
        print("\nEpoch-time: ", epoch_time)
        track_epoch_time.append(epoch_time)

        # save file
        if save_file: 
            torch.save(model, save_file)

    track_epoch_time = track_epoch_time[1:]
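    # (the first epoch is dropped above, presumably to keep one-time warm-up
    #  costs such as CUDA initialisation and data caching out of the average)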
    total_epoch_time = sum(track_epoch_time)
    avg_epoch_time = total_epoch_time/len(track_epoch_time)

    return({'GPUs':len(dev_ids), 'batch_size':batch_size, 'epoch_time':avg_epoch_time})
Exemplo n.º 16
0
class MSG_GAN:
    """ Unconditional TeacherGAN

        args:
            depth: depth of the GAN (will be used for each generator and discriminator)
            latent_size: latent size of the manifold used by the GAN
            use_eql: whether to use the equalized learning rate
            use_ema: whether to use exponential moving averages.
            ema_decay: value of ema decay. Used only if use_ema is True
            device: device to run the GAN on (GPU / CPU)
    """
    def __init__(self,
                 depth=7,
                 latent_size=512,
                 use_eql=True,
                 use_ema=True,
                 ema_decay=0.999,
                 th_low=0.45,
                 th_high=0.8,
                 dis_optimize_always=False,
                 device=th.device("cpu")):
        """ constructor for the class """
        from torch.nn import DataParallel

        self.gen = Generator(depth, latent_size, use_eql=use_eql).to(device)

        # Parallelize them if required:
        if device == th.device("cuda"):
            self.gen = DataParallel(self.gen)
            self.dis = Discriminator(depth,
                                     latent_size,
                                     use_eql=use_eql,
                                     gpu_parallelize=True).to(device)
        else:
            self.dis = Discriminator(depth, latent_size,
                                     use_eql=use_eql).to(device)

        # state of the object
        self.use_ema = use_ema
        self.ema_decay = ema_decay
        self.th_low = th_low
        self.th_high = th_high
        self.dis_optimize_always = dis_optimize_always
        self.use_eql = use_eql
        self.latent_size = latent_size
        self.depth = depth
        self.device = device

        if self.use_ema:
            from MSG_GAN.CustomLayers import update_average

            # create a shadow copy of the generator
            self.gen_shadow = copy.deepcopy(self.gen)

            # updater function:
            self.ema_updater = update_average

            # initialize the gen_shadow weights equal to the
            # weights of gen
            self.ema_updater(self.gen_shadow, self.gen, beta=0)

        # by default the generator and discriminator are in eval mode
        self.gen.eval()
        self.dis.eval()
        if self.use_ema:
            self.gen_shadow.eval()

    def generate_samples(self, num_samples):
        """
        generate samples using this gan
        :param num_samples: number of samples to be generated
        :return: generated samples tensor: list[ Tensor(B x H x W x C)]
        """
        noise = th.randn(num_samples, self.latent_size).to(self.device)
        generated_images = self.gen(noise)

        # reshape the generated images
        generated_images = list(
            map(lambda x: (x.detach().permute(0, 2, 3, 1) / 2) + 0.5,
                generated_images))

        return generated_images

    def optimize_discriminator(self, dis_optim, noise, real_batch, loss_fn,
                               gen_loss):
        """
        performs one step of weight update on discriminator using the batch of data
        :param dis_optim: discriminator optimizer
        :param noise: input noise of sample generation
        :param real_batch: real samples batch
                           should contain a list of tensors at different scales
        :param loss_fn: loss function to be used (object of GANLoss)
        :return: current loss
        """

        # generate a batch of samples
        fake_samples = self.gen(noise)
        fake_samples = list(map(lambda x: x.detach(), fake_samples))

        loss = loss_fn.dis_loss(real_batch, fake_samples)
        dis_loss = loss.item()
        # optimize discriminator
        # From http://blog.otoro.net/2016/04/01/generating-large-images-from-latent-vectors
        # "...calculate D’s loss function first, and only perform gradient descent on D if G’s loss function is less
        # than some upper bound (so it is relatively not that weak against D in the first place),
        # and also if D’s loss function is greater than some lower bound (so that it is not relatively that strong
        # versus G). We have tried to use an upper bound of 0.80 and a lower bound of 0.45."

        # print('gen_loss={} < self.th_high={} : {} and dis_loss={} > self.th_low={} : {}'.format(
        #     gen_loss, self.th_high, gen_loss < self.th_high, dis_loss, self.th_low, dis_loss > self.th_low
        # ))

        if (gen_loss < self.th_high
                and dis_loss > self.th_low) or self.dis_optimize_always:
            # print('Condition met to run loss.backward() for discriminator')
            dis_optim.zero_grad()
            loss.backward()
            dis_optim.step()

        return loss.item()

    def optimize_generator(self, gen_optim, noise, real_batch, loss_fn):
        """
        performs one step of weight update on generator using the batch of data
        :param gen_optim: generator optimizer
        :param noise: input noise of sample generation
        :param real_batch: real samples batch
                           should contain a list of tensors at different scales
        :param loss_fn: loss function to be used (object of GANLoss)
        :return: current loss
        """

        # generate a batch of samples
        fake_samples = self.gen(noise)

        loss = loss_fn.gen_loss(real_batch, fake_samples)

        # optimize generator
        gen_optim.zero_grad()
        loss.backward()
        gen_optim.step()

        # if self.use_ema is true, apply the moving average here:
        if self.use_ema:
            self.ema_updater(self.gen_shadow, self.gen, self.ema_decay)

        return loss.item()

    def create_grid(self, samples, img_files):
        """
        utility function to create a grid of GAN samples
        :param samples: generated samples for storing list[Tensors]
        :param img_files: list of names of files to write
        :return: None (saves multiple files)
        """
        from torchvision.utils import save_image
        from torch.nn.functional import interpolate
        from numpy import sqrt, power

        # dynamically adjust the colour of the images
        samples = [
            Generator.adjust_dynamic_range(sample) for sample in samples
        ]

        # resize the samples to have same resolution:
        for i in range(len(samples)):
            samples[i] = interpolate(samples[i],
                                     scale_factor=power(2, self.depth - 1 - i))
        # save the images:
        for sample, img_file in zip(samples, img_files):
            save_image(sample,
                       img_file,
                       nrow=int(sqrt(sample.shape[0])),
                       normalize=True,
                       scale_each=True,
                       padding=0)

    def train(self,
              data,
              gen_optim,
              dis_optim,
              loss_fn,
              normalize_latents=True,
              start=1,
              num_epochs=12,
              feedback_factor=10,
              checkpoint_factor=1,
              data_percentage=100,
              num_samples=36,
              log_dir=None,
              sample_dir="./samples",
              save_dir="./models",
              save_real=False):
        """
        Method for training the network
        :param data: pytorch dataloader which iterates over images
        :param gen_optim: Optimizer for generator.
                          please wrap this inside a Scheduler if you want to
        :param dis_optim: Optimizer for discriminator.
                          please wrap this inside a Scheduler if you want to
        :param loss_fn: Object of GANLoss
        :param normalize_latents: whether to normalize the latent vectors during training
        :param start: starting epoch number
        :param num_epochs: total number of epochs to run for (ending epoch number)
                           note this is absolute and not relative to start
        :param feedback_factor: number of logs generated and samples generated
                                during training per epoch
        :param checkpoint_factor: save model after these many epochs
        :param data_percentage: amount of data to be used
        :param num_samples: number of samples to be drawn for feedback grid
        :param log_dir: path to directory for saving the loss.log file
        :param sample_dir: path to directory for saving generated samples' grids
        :param save_dir: path to directory for saving the trained models
        :return: None (writes multiple files to disk)
        """

        from torch.nn.functional import avg_pool2d

        # turn the generator and discriminator into train mode
        self.gen.train()
        self.dis.train()

        assert isinstance(gen_optim, th.optim.Optimizer), \
            "gen_optim is not an Optimizer"
        assert isinstance(dis_optim, th.optim.Optimizer), \
            "dis_optim is not an Optimizer"

        print("Starting the training process ... ")

        # create fixed_input for debugging
        fixed_input = th.randn(num_samples, self.latent_size).to(self.device)
        if normalize_latents:
            fixed_input = (fixed_input /
                           fixed_input.norm(dim=-1, keepdim=True) *
                           (self.latent_size**0.5))
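            # (the normalisation projects the latent batch onto a hypersphere
            #  of radius sqrt(latent_size), matching what is done per batch in
            #  the training loop below)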

        # create a global time counter
        global_time = time.time()
        global_step = 0

        # See http://blog.otoro.net/2016/04/01/generating-large-images-from-latent-vectors/ and comments in
        # optimize_discriminator() above
        gen_loss = 0

        for epoch in range(start, num_epochs + 1):
            start_time = timeit.default_timer()  # record time at the start of epoch

            print("\nEpoch: %d" % epoch)
            total_batches = len(data)

            limit = int((data_percentage / 100) * total_batches)

            for (i, batch) in enumerate(data, 1):

                # extract current batch of data for training
                images = batch.to(self.device)
                extracted_batch_size = images.shape[0]

                # create a list of downsampled images from the real images:
                images = [images] + [
                    avg_pool2d(images, int(np.power(2, i)))
                    for i in range(1, self.depth)
                ]
                images = list(reversed(images))

                # sample some random latent points
                gan_input = th.randn(extracted_batch_size,
                                     self.latent_size).to(self.device)

                # normalize them if asked
                if normalize_latents:
                    gan_input = (gan_input /
                                 gan_input.norm(dim=-1, keepdim=True) *
                                 (self.latent_size**0.5))

                # optimize the discriminator:
                dis_loss = self.optimize_discriminator(dis_optim, gan_input,
                                                       images, loss_fn,
                                                       gen_loss)

                # optimize the generator:
                gen_loss = self.optimize_generator(gen_optim, gan_input,
                                                   images, loss_fn)

                # provide a loss feedback
                if i % (
                        int(limit / feedback_factor) + 1
                ) == 0 or i == 1:  # Avoid div by 0 error on small training sets
                    elapsed = time.time() - global_time
                    elapsed = str(datetime.timedelta(seconds=elapsed))
                    print("Elapsed [%s] batch: %d  d_loss: %f  g_loss: %f" %
                          (elapsed, i, dis_loss, gen_loss))

                    # also write the losses to the log file:
                    if log_dir is not None:
                        log_file = os.path.join(log_dir, "loss.log")
                        os.makedirs(os.path.dirname(log_file), exist_ok=True)
                        with open(log_file, "a") as log:
                            log.write(
                                str(global_step) + "\t" + str(dis_loss) +
                                "\t" + str(gen_loss) + "\n")

                    # create a grid of samples and save it
                    reses = [
                        str(int(np.power(2, dep))) + "_x_" +
                        str(int(np.power(2, dep)))
                        for dep in range(2, self.depth + 2)
                    ]
                    gen_img_files = [
                        os.path.join(
                            sample_dir, res,
                            "gen_" + str(epoch) + "_" + str(i) + ".png")
                        for res in reses
                    ]

                    # Make sure all the required directories exist
                    # otherwise make them
                    os.makedirs(sample_dir, exist_ok=True)
                    for gen_img_file in gen_img_files:
                        os.makedirs(os.path.dirname(gen_img_file),
                                    exist_ok=True)

                    dis_optim.zero_grad()
                    gen_optim.zero_grad()
                    with th.no_grad():
                        self.create_grid(
                            self.gen(fixed_input) if not self.use_ema else
                            self.gen_shadow(fixed_input), gen_img_files)

                    # create a grid of real images and save it
                    if save_real:
                        real_img_files = [
                            os.path.join(
                                sample_dir, res,
                                "real_" + str(epoch) + "_" + str(i) + ".png")
                            for res in reses
                        ]
                        # Make sure all the required directories exist
                        # otherwise make them
                        os.makedirs(sample_dir, exist_ok=True)
                        for real_img_file in real_img_files:
                            os.makedirs(os.path.dirname(real_img_file),
                                        exist_ok=True)

                        self.create_grid(images, real_img_files)

                # increment the global_step:
                global_step += 1

                if i > limit:
                    break

            # calculate the time required for the epoch
            stop_time = timeit.default_timer()
            print("Time taken for epoch: %.3f secs" % (stop_time - start_time))

            if epoch % checkpoint_factor == 0 or epoch == 1 or epoch == num_epochs:
                os.makedirs(save_dir, exist_ok=True)
                gen_save_file = os.path.join(save_dir,
                                             "GAN_GEN_" + str(epoch) + ".pth")
                dis_save_file = os.path.join(save_dir,
                                             "GAN_DIS_" + str(epoch) + ".pth")
                gen_optim_save_file = os.path.join(
                    save_dir, "GAN_GEN_OPTIM_" + str(epoch) + ".pth")
                dis_optim_save_file = os.path.join(
                    save_dir, "GAN_DIS_OPTIM_" + str(epoch) + ".pth")

                th.save(self.gen.state_dict(), gen_save_file)
                th.save(self.dis.state_dict(), dis_save_file)
                th.save(gen_optim.state_dict(), gen_optim_save_file)
                th.save(dis_optim.state_dict(), dis_optim_save_file)

                if self.use_ema:
                    gen_shadow_save_file = os.path.join(
                        save_dir, "GAN_GEN_SHADOW_" + str(epoch) + ".pth")
                    th.save(self.gen_shadow.state_dict(), gen_shadow_save_file)

        print("Training completed ...")

        # return the generator and discriminator back to eval mode
        self.gen.eval()
        self.dis.eval()
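
# Added illustration (not part of the original snippet): a minimal, hypothetical
# sketch of how MSG_GAN might be driven. The image DataLoader `image_loader`
# and the GANLoss object `loss_fn` are assumed to be provided by the caller.
def _example_msg_gan_usage(image_loader, loss_fn):
    device = th.device("cuda" if th.cuda.is_available() else "cpu")
    gan = MSG_GAN(depth=7, latent_size=512, device=device)
    # one optimizer per network, as expected by MSG_GAN.train()
    gen_optim = th.optim.Adam(gan.gen.parameters(), lr=0.003, betas=(0, 0.99))
    dis_optim = th.optim.Adam(gan.dis.parameters(), lr=0.003, betas=(0, 0.99))
    gan.train(image_loader, gen_optim, dis_optim, loss_fn, num_epochs=12)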
Exemplo n.º 17
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False,
                        help='model configuration file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='vocabulary file')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='raw training corpus')
    parser.add_argument('--tokenized_data_path0', default='data/tokenized/', type=str, required=False,
                        help='where the tokenized corpus is stored')
    parser.add_argument('--tokenized_data_path1', default='data/tokenized/', type=str, required=False,
                        help='where the tokenized corpus is stored')
    parser.add_argument('--raw', action='store_true', help='whether to tokenize the raw corpus first')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='number of training epochs')
    parser.add_argument('--batch_size', default=64, type=int, required=False, help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='learning rate')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='number of warm-up steps')
    parser.add_argument('--log_step', default=1, type=int, required=False,
                        help='report the loss every this many steps; set it to an integer multiple of gradient_accumulation')
    parser.add_argument('--stride', default=768, type=int, required=False, help='window stride used when sampling the training data')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed-precision training')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='number of pieces to split the training corpus into')
    parser.add_argument('--min_length', default=128, type=int, required=False, help='minimum article length to include')
    parser.add_argument('--max_length', default=256, type=int, required=False, help='maximum article length to include')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='model output path')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='checkpoint to start training from')
    parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str, required=False, help='Tensorboard path')
    parser.add_argument('--segment', action='store_true', help='tokenize Chinese at the word level')
    parser.add_argument('--bpe_token', action='store_true', help='use subword (BPE) tokenization')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json", type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe", type=str, help="vocab.bpe")
    parser.add_argument('--max_steps_perEpoch_perPiece', default=1000000, type=int, required=False)
    parser.add_argument('--steps_savemodel', default=10000, type=int, required=False, help='save the model every this many steps')
    parser.add_argument('--padding', action='store_true', help='whether inputs are padded to a fixed length')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    #os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # 此处设置程序使用哪些显卡

    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    log_step = args.log_step
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    output_dir = args.output_dir
    assert log_step % gradient_accumulation == 0
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))
    multi_gpu = False
    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True
    print('starting training')
    step_loss = 0
    running_loss = 10
    loss_ = 10
    iter0 = iterData(args.tokenized_data_path0, rate=0.045, batch_size=batch_size, epochs=epochs)
    iter1 = iterData(args.tokenized_data_path1, rate=1.0, batch_size=batch_size, epochs=epochs)
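    # iterData presumably streams batches from the two tokenized corpora; `rate`
    # appears to control what fraction of each corpus is sampled per pass, so
    # corpus 0 is subsampled to ~4.5% while corpus 1 is used in full.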
    step = 0
    epoch0 = -1
    while True:
        data0 = next(iter0)
        data1 = next(iter1)
        if data0=='__STOP__' or data1=='__STOP__':
            break
        epoch, epochs, idx_file0, nb_files0, batch_inputs0 = data0
        epoch, epochs, idx_file1, nb_files1, batch_inputs1 = data1
        batch_inputs = batch_inputs1+batch_inputs0
        random.shuffle(batch_inputs)
        batch_inputs = torch.tensor(batch_inputs).long().to(device)
        #  forward pass
        outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
        loss, logits = outputs[:2]
        #  get loss
        if multi_gpu:
            loss = loss.mean()
        if gradient_accumulation > 1:
            loss = loss / gradient_accumulation
        #  loss backward
        if fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        #  optimizer step
        if (step + 1) % gradient_accumulation == 0:
            running_loss += loss.item()
            optimizer.step()
            optimizer.zero_grad()
            step_loss += 1
            #scheduler.step()
        if (step + 1) % log_step == 0:
            loss_ = running_loss * gradient_accumulation / (log_step / gradient_accumulation)
            print('now time: {}:{}. step: {}, progress-innerEpoch: {}/{}, progress-outerEpoch: {}/{}, loss {}'.format(
                    datetime.now().hour,
                    datetime.now().minute,
                    step+1,
                    str(idx_file0+1)+':'+str(idx_file1+1),
                    str(nb_files0)+':'+str(nb_files1),
                    epoch + 1,
                    epochs,
                    loss_))
            running_loss = 0
        if step%args.steps_savemodel==0:
            print('saving model for epoch {}'.format(epoch + 1))
            output_dir_ = output_dir + 'model_epoch{}_step{}_loss-{}'.format(epoch + 1, step,'%0.2f'%loss_)
            if not os.path.exists(output_dir_):
                os.mkdir(output_dir_)
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(output_dir_)
        step += 1
        if epoch!=epoch0:
            if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
                os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(epoch + 1))
            epoch0 = epoch
            print('epoch {} finished'.format(epoch + 1))
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
    print('training finished')
Exemplo n.º 18
0
def train_model(train_dataset, train_num_each, val_dataset, val_num_each):
    num_train = len(train_dataset)
    num_val = len(val_dataset)

    train_idx = [i for i in range(num_train)]
    np.random.seed(0)
    np.random.shuffle(train_idx)
    val_idx = [i for i in range(num_val)]

    print('num of train dataset: {:6d}'.format(num_train))
    print('num of valid dataset: {:6d}'.format(num_val))

    train_loader = DataLoader(train_dataset,
                              batch_size=train_batch_size,
                              sampler=train_idx,
                              num_workers=workers,
                              pin_memory=False)
    val_loader = DataLoader(val_dataset,
                            batch_size=val_batch_size,
                            sampler=val_idx,
                            num_workers=workers,
                            pin_memory=False)
    # model = models.resnet50(pretrained=True)
    # num_ftrs = model.fc.in_features
    # model.fc = nn.Linear(num_ftrs, 7)
    model = multi_resnet()
    if use_gpu:
        model = model.cuda()
    model = DataParallel(model)
    criterion = nn.BCEWithLogitsLoss(size_average=False)
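    # multi_optim == 0: one learning rate for every parameter;
    # multi_optim == 1: two parameter groups, the shared backbone at lr 1e-4 and
    # the fc1 head at the larger lr 1e-3.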
    if multi_optim == 0:
        if optimizer_choice == 0:
            optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
            exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam(model.parameters())
    elif multi_optim == 1:
        if optimizer_choice == 0:
            optimizer = optim.SGD([
                {
                    'params': model.module.share.parameters()
                },
                {
                    'params': model.module.fc1.parameters(),
                    'lr': 1e-3
                },
            ],
                                  lr=1e-4,
                                  momentum=0.9)

            exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam([
                {
                    'params': model.module.share.parameters()
                },
                {
                    'params': model.module.fc1.parameters(),
                    'lr': 1e-3
                },
            ],
                                   lr=1e-4)

    best_model_wts = copy.deepcopy(model.state_dict())
    best_val_accuracy = 0.0
    correspond_train_acc = 0.0

    all_info = []
    all_train_accuracy = []
    all_train_loss = []
    all_val_accuracy = []
    all_val_loss = []

    sig_f = nn.Sigmoid()
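    # multi-label setup: logits are passed through a sigmoid and thresholded at
    # 0.5, so correctness is counted per label, hence the divisions by 7 (the
    # number of label columns) when computing accuracy and average loss below.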
    for epoch in range(epochs):
        train_idx = [i for i in range(num_train)]
        np.random.seed(0)
        np.random.shuffle(train_idx)

        train_loader = DataLoader(train_dataset,
                                  batch_size=train_batch_size,
                                  sampler=train_idx,
                                  num_workers=workers,
                                  pin_memory=False)
        model.train()
        train_loss = 0.0
        train_corrects = 0
        train_start_time = time.time()
        for data in train_loader:
            inputs, labels_1, labels_2 = data
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels = Variable(labels_1.cuda())
            else:
                inputs = Variable(inputs)
                labels = Variable(labels_1)
            optimizer.zero_grad()  # if the optimizer was built from net.parameters(), this has the same effect as net.zero_grad()

            outputs = model.forward(inputs)

            sig_out = outputs.data.cpu()
            sig_out = sig_f(sig_out)

            predict = torch.ByteTensor(sig_out > 0.5)
            predict = predict.long()
            train_corrects += torch.sum(predict == labels.data.cpu())
            # print(train_corrects)
            labels = Variable(labels.data.float())
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.data[0]
            # print(train_corrects)
        train_elapsed_time = time.time() - train_start_time
        train_accuracy = train_corrects / num_train / 7
        train_average_loss = train_loss / num_train / 7

        model.eval()
        val_loss = 0.0
        val_corrects = 0
        val_start_time = time.time()
        for data in val_loader:
            inputs, labels_1, labels_2 = data
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels = Variable(labels_1.cuda())
            else:
                inputs = Variable(inputs)
                labels = Variable(labels_1)

            outputs = model.forward(inputs)

            sig_out = outputs.data.cpu()
            sig_out = sig_f(sig_out)

            predict = torch.ByteTensor(sig_out > 0.5)
            predict = predict.long()
            val_corrects += torch.sum(predict == labels.data.cpu())
            labels = Variable(labels.data.float())
            loss = criterion(outputs, labels)
            val_loss += loss.data[0]
            # print(val_corrects)
        val_elapsed_time = time.time() - val_start_time
        val_accuracy = val_corrects / num_val / 7
        val_average_loss = val_loss / num_val / 7
        print('epoch: {:4d}'
              ' train in: {:2.0f}m{:2.0f}s'
              ' train loss: {:4.4f}'
              ' train accu: {:.4f}'
              ' valid in: {:2.0f}m{:2.0f}s'
              ' valid loss: {:4.4f}'
              ' valid accu: {:.4f}'.format(epoch, train_elapsed_time // 60,
                                           train_elapsed_time % 60,
                                           train_average_loss, train_accuracy,
                                           val_elapsed_time // 60,
                                           val_elapsed_time % 60,
                                           val_average_loss, val_accuracy))

        all_train_loss.append(train_average_loss)
        all_train_accuracy.append(train_accuracy)
        all_val_loss.append(val_average_loss)
        all_val_accuracy.append(val_accuracy)

        if optimizer_choice == 0:
            exp_lr_scheduler.step(val_average_loss)

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            correspond_train_acc = train_accuracy
            best_model_wts = copy.deepcopy(model.state_dict())
        elif val_accuracy == best_val_accuracy:
            if train_accuracy > correspond_train_acc:
                correspond_train_acc = train_accuracy
                best_model_wts = copy.deepcopy(model.state_dict())

    print('best accuracy: {:.4f} cor train accu: {:.4f}'.format(
        best_val_accuracy, correspond_train_acc))

    save_val = int("{:4.0f}".format(best_val_accuracy * 10000))
    save_train = int("{:4.0f}".format(correspond_train_acc * 10000))
    model_name = "tool" \
                 + "_epoch_" + str(epochs) \
                 + "_opt_" + str(optimizer_choice) \
                 + "_mulopt_" + str(multi_optim) \
                 + "_flip_" + str(use_flip) \
                 + "_crop_" + str(crop_type) \
                 + "_batch_" + str(train_batch_size) \
                 + "_train_" + str(save_train) \
                 + "_val_" + str(save_val) \
                 + ".pth"
    torch.save(best_model_wts, model_name)
    all_info.append(all_train_accuracy)
    all_info.append(all_train_loss)
    all_info.append(all_val_accuracy)
    all_info.append(all_val_loss)

    record_name = "tool" \
                  + "_epoch_" + str(epochs) \
                  + "_opt_" + str(optimizer_choice) \
                  + "_mulopt_" + str(multi_optim) \
                  + "_flip_" + str(use_flip) \
                  + "_crop_" + str(crop_type) \
                  + "_batch_" + str(train_batch_size) \
                  + "_train_" + str(save_train) \
                  + "_val_" + str(save_val) \
                  + ".pkl"
    with open(record_name, 'wb') as f:
        pickle.dump(all_info, f)
    print()
Exemplo n.º 19
0
class UNetTrainer(object):
    """UNet trainer"""
    def __init__(self,
                 start_epoch=0,
                 save_dir='',
                 resume="",
                 devices_num=2,
                 num_classes=2,
                 color_dim=1):

        self.net = UNet(color_dim=color_dim, num_classes=num_classes)
        self.start_epoch = start_epoch if start_epoch != 0 else 1
        self.save_dir = os.path.join('../models', save_dir)
        self.loss = CrossEntropyLoss()
        self.num_classes = num_classes

        if resume:
            checkpoint = torch.load(resume)
            if self.start_epoch == 0:
                self.start_epoch = checkpoint['epoch'] + 1
            if not self.save_dir:
                self.save_dir = checkpoint['save_dir']
            self.net.load_state_dict(checkpoint['state_dir'])

        if not os.path.exists(self.save_dir):
            os.makedirs(self.save_dir)

        self.net.cuda()
        self.loss.cuda()
        if devices_num == 2:
            self.net = DataParallel(self.net, device_ids=[0, 1])
        #self.loss = DataParallel(self.loss, device_ids=[0, 1])

    def train(self,
              train_loader,
              val_loader,
              lr=0.001,
              weight_decay=1e-4,
              epochs=200,
              save_freq=10):

        self.logfile = os.path.join(self.save_dir, 'log')
        sys.stdout = Logger(self.logfile)
        self.epochs = epochs
        self.lr = lr

        optimizer = torch.optim.Adam(
            self.net.parameters(),
            #lr,
            #momentum=0.9,
            weight_decay=weight_decay)

        for epoch in range(self.start_epoch, epochs + 1):
            self.train_(train_loader, epoch, optimizer, save_freq)
            self.validate_(val_loader, epoch)

    def train_(self, data_loader, epoch, optimizer, save_freq=10):
        start_time = time.time()

        self.net.train()
        #lr = self.get_lr(epoch)

        #for param_group in optimizer.param_groups:
        #    param_group['lr'] = lr

        metrics = []

        for i, (data, target) in enumerate(tqdm(data_loader)):
            data_t, target_t = data, target
            data = data.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

            output = self.net(data)  # UNet output

            output = output.transpose(1, 3).transpose(1, 2).contiguous().view(
                -1, self.num_classes)
            target = target.view(-1)
            loss_output = self.loss(output, target)

            optimizer.zero_grad()
            loss_output.backward()  # backpropagate the loss
            optimizer.step()

            loss_output = loss_output.item()  # scalar loss value
            acc = accuracy(output, target)
            metrics.append([loss_output, acc])

            if i == 0:
                batch_size = data.size(0)
                _, output = output.data.max(dim=1)
                output = output.view(batch_size, 1, 1, 320, 480).cpu()  # predicted mask
                data_t = data_t[0, 0].unsqueeze(0).unsqueeze(0)  # original image
                target_t = target_t[0].unsqueeze(0)  # ground-truth mask
                t = torch.cat([output[0].float(), data_t,
                               target_t.float()], 0)  # concatenate the three images along dim 0
                #show_list = []
                #for j in range(10):
                #    show_list.append(data_t[j, 0].unsqueeze(0).unsqueeze(0))
                #    show_list.append(target_t[j].unsqueeze(0))
                #    show_list.append(output[j].float())
                #
                #t = torch.cat(show_list, 0)
                torchvision.utils.save_image(t,
                                             "temp_image/%02d_train.jpg" %
                                             epoch,
                                             nrow=3)

            #if i == 20:
            #    break

        if epoch % save_freq == 0:
            if 'module' in dir(self.net):
                state_dict = self.net.module.state_dict()
            else:
                state_dict = self.net.state_dict()

            for key in state_dict.keys():
                state_dict[key] = state_dict[key].cpu()

            torch.save(
                {
                    'epoch': epoch,
                    'save_dir': self.save_dir,
                    'state_dir': state_dict
                }, os.path.join(self.save_dir, '%03d.ckpt' % epoch))

        end_time = time.time()

        metrics = np.asarray(metrics, np.float32)
        self.print_metrics(metrics, 'Train', end_time - start_time, epoch)

    def validate_(self, data_loader, epoch):
        start_time = time.time()

        self.net.eval()
        metrics = []
        for i, (data, target) in enumerate(data_loader):
            data_t, target_t = data, target
            data = data.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

            # inference only: no gradients needed
            with torch.no_grad():
                output = self.net(data)
                output = output.transpose(1, 3).transpose(1, 2).contiguous().view(
                    -1, self.num_classes)
                target = target.view(-1)
                loss_output = self.loss(output, target)

            loss_output = loss_output.item()
            acc = accuracy(output, target)
            metrics.append([loss_output, acc])

            if i == 0:
                batch_size = data.size(0)
                _, output = output.data.max(dim=1)
                output = output.view(batch_size, 1, 1, 320, 480).cpu()
                data_t = data_t[0, 0].unsqueeze(0).unsqueeze(0)
                target_t = target_t[0].unsqueeze(0)
                t = torch.cat([output[0].float(), data_t, target_t.float()], 0)
                #    show_list = []
                #    for j in range(10):
                #        show_list.append(data_t[j, 0].unsqueeze(0).unsqueeze(0))
                #        show_list.append(target_t[j].unsqueeze(0))
                #        show_list.append(output[j].float())
                #
                #    t = torch.cat(show_list, 0)
                torchvision.utils.save_image(t,
                                             "temp_image/%02d_val.jpg" % epoch,
                                             nrow=3)
            #if i == 10:
            #    break

        end_time = time.time()

        metrics = np.asarray(metrics, np.float32)
        self.print_metrics(metrics, 'Validation', end_time - start_time)

    def print_metrics(self, metrics, phase, time, epoch=-1):
        """metrics: [loss, acc]
        """
        if epoch != -1:
            print("Epoch: {}".format(epoch))
        print(phase)
        print('loss %2.4f, accuracy %2.4f, time %2.2f' %
              (np.mean(metrics[:, 0]), np.mean(metrics[:, 1]), time))
        if phase != 'Train':
            print()  # blank line between phases

    def get_lr(self, epoch):
        if epoch <= self.epochs * 0.5:
            lr = self.lr
        elif epoch <= self.epochs * 0.8:
            lr = 0.1 * self.lr
        else:
            lr = 0.01 * self.lr
        return lr

    def save_py_files(self, path):
        """copy .py files in exps dir, cfgs dir and current dir into
           save_dir, and keep the files structure
        """
        #exps dir
        pyfiles = [f for f in os.listdir(path) if f.endswith('.py')]
        path = "/".join(path.split('/')[-2:])
        exp_save_path = os.path.join(self.save_dir, path)
        mkdir(exp_save_path)
        for f in pyfiles:
            shutil.copy(os.path.join(path, f), os.path.join(exp_save_path, f))
        #current dir
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(self.save_dir, f))
        #cfgs dir
        shutil.copytree('./cfgs', os.path.join(self.save_dir, 'cfgs'))
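
# A minimal usage sketch for the UNetTrainer above; `build_loader` is a
# hypothetical helper assumed to yield (image, mask) batches matching the
# 320x480 resolution used by the trainer's visualisation code.
if __name__ == '__main__':
    trainer = UNetTrainer(start_epoch=0,
                          save_dir='unet_exp',
                          resume='',
                          devices_num=1,
                          num_classes=2,
                          color_dim=1)
    train_loader = build_loader(split='train')  # hypothetical helper
    val_loader = build_loader(split='val')      # hypothetical helper
    trainer.train(train_loader, val_loader, lr=1e-3, epochs=50, save_freq=10)
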
Exemplo n.º 20
0
        correct += pred.eq(target.data).to('cpu').sum()

    print('Accuracy: %d %%' % (100 * correct / len(test_loader.dataset)))


if __name__ == '__main__':

    model = models.resnet50(pretrained=True)  # resnet50
    #model = models.resnet101(pretrained=True) # resnet101
    num_features = model.fc.in_features
    model.fc = nn.Linear(num_features, 1222)

    #print(model)
    #model.cuda()
    model.to('cuda')
    model = DataParallel(model)

    traindir = ('/faces_83/train_images')
    testdir = ('/faces_83/test_images')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
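
    # A sketch (an assumption, not the truncated remainder of the original
    # snippet) of how an ImageFolder dataset like the one above is typically
    # wrapped in a DataLoader for training.
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=64,
                                               shuffle=True,
                                               num_workers=4,
                                               pin_memory=True)
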
Exemplo n.º 21
0
    def restore_checkpoint(self):
        """
        Restore from the last checkpoint if available. Otherwise, configure this trainer from scratch.
        """
        # Check if there exists any checkpoints.
        chkpt_path = self.last_checkpoint
        if chkpt_path:
            # reload configuration from the checkpoint
            self._config = TrainerConfig.from_pretrained(str(chkpt_path))
            self._logger.info("TrainerConfig at [%s] is restored.", chkpt_path)

            # Recover random number generator states
            self.set_seed()  # Set seed before restoring RNG
            random_path = Path(chkpt_path, 'random.pt')
            random_states = torch.load(random_path)
            numpy.random.set_state(random_states['numpy'])
            random.setstate(random_states['random'])
            self.trainset.set_rng_state(random_states['trainset'])

            torch.set_rng_state(random_states['torch']['cpu'])
            if torch.cuda.is_available():
                torch.cuda.set_rng_state_all(random_states['torch']['cuda'])

            # Record that the RNG is restored.
            self._logger.info(
                "State of random number generator is restored from [%s]",
                random_path)
            self._random_restored = True

            # Recover the trainer's internal states
            internal_states = torch.load(Path(chkpt_path, 'internal.pt'))
            for key, value in internal_states.items():
                if hasattr(self, key):
                    setattr(self, key, value)
        else:
            self.set_seed()  # Set seed.

        # Build/restore model
        self._config.model.set_chkpt_path(chkpt_path)
        self._module = Solver.from_pretrained(config=self._config.model)
        self._module_init = {
            id(p): p.clone()
            for p in self._module.parameters()
        }
        self._module.to(self.main_device)
        self._logger.info("A network at [%s] is restored.", chkpt_path)

        # Compute the epoch/step information
        self._minibatch_per_epoch = len(self.trainset)
        self._step_per_epoch = int(
            math.ceil(self._minibatch_per_epoch /
                      self._config.gradient_accumulation_steps))
        self._steps_to_go = self._step_per_epoch * self._config.epoch
        self._logger.info("Steps / Epoch = %5d", self._step_per_epoch)
        self._logger.info("We will run %3d epoch(s) or %6d step(s)",
                          self._config.epoch, self._steps_to_go)
        self._logger.info(
            "Per a single step, %2d gradient(s) will be accumulated. (Total %2d mini-batch(es)/epoch)",
            self._config.gradient_accumulation_steps,
            self._minibatch_per_epoch)
        self._logger.info(
            "We will report TRAINING loss/accuracy for every %3d epoch(s)",
            self._config.epoch_report)
        self._logger.info(
            "We will report DEV ACC. and save CHKPTs for every %3d epoch(s)",
            self._config.epoch_chkpt)

        # Restore the number of steps that were passed before
        if chkpt_path:
            self._epoch = int(chkpt_path.name)
            self._logger.info("Attempt to restore from the checkpoint [%s]",
                              chkpt_path)
            self._logger.info("Resume training from epoch %s", self._epoch)

        # Classify parameters to form parameter groups to build optimizer
        no_w_decay = {'bias', 'norm', 'Norm', '_embedding'}
        parameters = [((2 if 'text_model.model.embeddings' in n else
                        (1 if 'text_model' in n else 0),
                        any(t in n for t in no_w_decay)), p)
                      for n, p in self._module.named_parameters()]
        parameters = groupby(sorted(parameters, key=lambda t: t[0]),
                             key=lambda t: t[0])

        # Build optimizer groups
        optimizer_grouped_parameters = []
        for (encoder_type_flag, is_without_wd), group in parameters:
            group = {'params': [p for _, p in group]}

            if is_without_wd:
                group['weight_decay'] = 0.0

            if encoder_type_flag == 2 and self._config.fix_encoder_embedding:
                group['lr'] = 0.0
            elif encoder_type_flag == 1:
                group['lr'] = self._config.optimizer.kwargs[
                    'lr'] * self._config.lr_multiplier_encoder

            optimizer_grouped_parameters.append(group)

        # Build optimizer before restoration
        self._optimizer = self._config.optimizer.build(
            optimizer_grouped_parameters)
        self._logger.info("We will use the following optimizer: %s",
                          self._optimizer)

        # Restore the optimizer if available.
        if chkpt_path:
            # Check if saved optimizer exists
            optimizer_file = Path(chkpt_path, 'optimizer.pt')
            if optimizer_file.is_file():
                self._optimizer.load_state_dict(torch.load(optimizer_file))
                self._logger.info(
                    "An optimizer for module at [%s] is restored.",
                    optimizer_file)

        # Specify warmup strategy if warmup value is not negative
        warmup_steps = int(self._step_per_epoch * self._config.epoch_warmup)
        if warmup_steps >= 0:
            # Build scheduler before restoration
            self._scheduler = get_linear_schedule_with_warmup(
                self._optimizer,
                num_warmup_steps=warmup_steps,
                num_training_steps=self._steps_to_go)
            self._logger.info(
                "We will use linear scheduling: warm up %s epochs or %s steps",
                self._config.epoch_warmup, warmup_steps)

            # Restore the scheduler if available
            if chkpt_path:
                # Check if saved scheduler exists
                scheduler_file = Path(chkpt_path, 'scheduler.pt')
                if scheduler_file.is_file():
                    self._scheduler.load_state_dict(torch.load(scheduler_file))
                    self._logger.info(
                        "A scheduler for module at [%s] is restored.",
                        scheduler_file)

        # Log the threshold of gradient clipping.
        if self._config.gradient_clip > 0:
            self._logger.info("We will use gradient clipping at %.3f",
                              self._config.gradient_clip)
        else:
            self._logger.info("We will not use gradient clipping")

        # Log the structure of the network.
        parameters_size = sum(p.numel() for p in self._module.parameters())
        disk_space = sum(
            required_space_param(p) for p in self._module.parameters())
        self._logger.info('==== [Network Structure] ====\n%s',
                          str(self._module))
        self._logger.info(
            'There are %12d parameters in a network. Required space for checkpointing is %.3fMB.',
            parameters_size, disk_space / 1048576)

        # Wrap data parallel if we can use more than one GPU
        if len(self.device_order) > 1 and not self.disable_dataparallel:
            self._module = DataParallel(self._module,
                                        device_ids=self.device_order,
                                        output_device=self.device_order[0])
            self._logger.info(
                "We identified [%s] devices for parallel training",
                len(self.device_order))
        else:
            self._logger.info("We don't use DataParallel.")

        # Set answer checker
        self._answer_checker = AnswerChecker(
            is_expression_type=_unwrap_parallel(
                self._module).is_expression_type,
            logger=self._logger)
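
# A self-contained sketch of the parameter-grouping idea used in the method
# above: parameters are keyed (here only by a "no weight decay" flag for
# brevity), sorted, grouped with itertools.groupby, and turned into optimizer
# parameter groups. The toy model and hyper-parameters are assumptions.
import torch
from itertools import groupby


class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(4, 8)
        self.norm = torch.nn.LayerNorm(8)


model = ToyModel()
no_w_decay = ('bias', 'norm')

keyed = [(any(t in n for t in no_w_decay), p)
         for n, p in model.named_parameters()]
keyed = groupby(sorted(keyed, key=lambda t: t[0]), key=lambda t: t[0])

groups = []
for is_without_wd, group in keyed:
    group = {'params': [p for _, p in group]}
    if is_without_wd:
        group['weight_decay'] = 0.0  # bias / normalisation weights: no decay
    groups.append(group)

optimizer = torch.optim.AdamW(groups, lr=1e-3, weight_decay=0.01)
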
Exemplo n.º 22
0
def train(args):
    # gpu init
    multi_gpus = False
    if len(args.gpus.split(',')) > 1:
        multi_gpus = True
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # log init
    save_dir = os.path.join(
        args.save_dir, args.model_pre + args.backbone.upper() + '_' +
        datetime.now().strftime('%Y%m%d_%H%M%S'))
    if os.path.exists(save_dir):
        raise NameError('model dir exists!')
    os.makedirs(save_dir)
    logging = init_log(save_dir)
    _print = logging.info

    # dataset loader
    transform = transforms.Compose([
        transforms.ToTensor(),  # range [0, 255] -> [0.0,1.0]
        transforms.Normalize(mean=(0.5, 0.5, 0.5),
                             std=(0.5, 0.5,
                                  0.5))  # range [0.0, 1.0] -> [-1.0,1.0]
    ])
    # training dataset
    trainset = CASIAWebFace(args.train_root,
                            args.train_file_list,
                            transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=8,
                                              drop_last=False)
    # test dataset
    lfwdataset = LFW(args.lfw_test_root,
                     args.lfw_file_list,
                     transform=transform)
    lfwloader = torch.utils.data.DataLoader(lfwdataset,
                                            batch_size=128,
                                            shuffle=False,
                                            num_workers=4,
                                            drop_last=False)

    # define backbone and margin layer
    if args.backbone == 'MobileFace':
        net = MobileFaceNet(feature_dim=args.feature_dim)
    elif args.backbone == 'Res50':
        net = ResNet50()
    elif args.backbone == 'Res101':
        net = ResNet101()
    elif args.backbone == 'Res50_IR':
        net = SEResNet_IR(50, feature_dim=args.feature_dim, mode='ir')
    elif args.backbone == 'SERes50_IR':
        net = SEResNet_IR(50, feature_dim=args.feature_dim, mode='se_ir')
    elif args.backbone == 'SphereNet':
        net = SphereNet(num_layers=64, feature_dim=args.feature_dim)
    else:
        print(args.backbone, ' is not available!')

    if args.margin_type == 'ArcFace':
        margin = ArcMarginProduct(args.feature_dim,
                                  trainset.class_nums,
                                  s=args.scale_size)
    elif args.margin_type == 'CosFace':
        pass  # not implemented in this example
    elif args.margin_type == 'SphereFace':
        pass  # not implemented in this example
    elif args.margin_type == 'InnerProduct':
        margin = InnerProduct(args.feature_dim, trainset.class_nums)
    else:
        print(args.margin_type, 'is not available!')

    if args.resume:
        print('resume the model parameters from: ', args.net_path,
              args.margin_path)
        net.load_state_dict(torch.load(args.net_path)['net_state_dict'])
        margin.load_state_dict(torch.load(args.margin_path)['net_state_dict'])

    # define optimizers for different layers
    criterion_classi = torch.nn.CrossEntropyLoss().to(device)
    optimizer_classi = optim.SGD([{
        'params': net.parameters(),
        'weight_decay': 5e-4
    }, {
        'params': margin.parameters(),
        'weight_decay': 5e-4
    }],
                                 lr=0.1,
                                 momentum=0.9,
                                 nesterov=True)
    scheduler_classi = lr_scheduler.MultiStepLR(optimizer_classi,
                                                milestones=[35, 60, 85],
                                                gamma=0.1)

    criterion_center = AgentCenterLoss(trainset.class_nums, args.feature_dim,
                                       args.scale_size).to(device)
    optimizer_center = optim.SGD(criterion_center.parameters(), lr=0.5)
    scheduler_center = lr_scheduler.MultiStepLR(optimizer_center,
                                                milestones=[35, 60, 85],
                                                gamma=0.1)

    if multi_gpus:
        net = DataParallel(net).to(device)
        margin = DataParallel(margin).to(device)
    else:
        net = net.to(device)
        margin = margin.to(device)

    best_lfw_acc = 0.0
    best_lfw_iters = 0
    total_iters = 0
    for epoch in range(1, args.total_epoch + 1):
        # note: stepping the schedulers at the start of the epoch follows the
        # pre-1.1 PyTorch convention; newer versions expect scheduler.step()
        # after the optimizer steps of the epoch
        scheduler_classi.step()
        scheduler_center.step()
        # train model
        _print('Train Epoch: {}/{} ...'.format(epoch, args.total_epoch))
        net.train()

        if args.plot:
            all_features, all_labels = [], []

        since = time.time()
        for data in trainloader:
            img, label = data[0].to(device), data[1].to(device)
            feature = net(img)
            output = margin(feature)
            loss_classi = criterion_classi(output, label)
            loss_center = criterion_center(feature, label)
            total_loss = loss_classi + loss_center * args.weight_center

            optimizer_classi.zero_grad()
            optimizer_center.zero_grad()
            total_loss.backward()
            optimizer_classi.step()

            # by doing so, weight_cent would not impact on the learning of centers
            #for param in criterion_center.parameters():
            #    param.grad.data *= (1. / args.weight_center)
            optimizer_center.step()

            total_iters += 1
            if args.plot:
                feat = feature.data.cpu().numpy()
                #for i in range(feat.shape[0]):
                #    feat[i] = feat[i] / np.sqrt((np.dot(feat[i], feat[i])))
                all_features.append(feat)
                all_labels.append(label.data.cpu().numpy())

            # print train information
            if total_iters % 10 == 0:
                # current training accuracy
                _, predict = torch.max(output.data, 1)
                total = label.size(0)
                correct = (np.array(predict.cpu()) == np.array(
                    label.data.cpu())).sum()
                time_cur = (time.time() - since) / 10
                since = time.time()
                print(
                    "Iters: {:0>6d}/[{:0>2d}], loss_classi: {:.4f}, loss_center: {:.4f}, train_accuracy: {:.4f}, time: {:.2f} s/iter, learning rate: {}"
                    .format(total_iters, epoch, loss_classi.item(),
                            loss_center.item(), correct / total, time_cur,
                            scheduler_classi.get_lr()[0]))
            # save model
            if total_iters % args.save_freq == 0:
                msg = 'Saving checkpoint: {}'.format(total_iters)
                _print(msg)
                if multi_gpus:
                    net_state_dict = net.module.state_dict()
                    margin_state_dict = margin.module.state_dict()
                else:
                    net_state_dict = net.state_dict()
                    margin_state_dict = margin.state_dict()

                if not os.path.exists(save_dir):
                    os.mkdir(save_dir)
                torch.save(
                    {
                        'iters': total_iters,
                        'net_state_dict': net_state_dict
                    },
                    os.path.join(save_dir, 'Iter_%06d_net.ckpt' % total_iters))
                torch.save(
                    {
                        'iters': total_iters,
                        'net_state_dict': margin_state_dict
                    },
                    os.path.join(save_dir,
                                 'Iter_%06d_margin.ckpt' % total_iters))
                #torch.save({
                #    'iters': total_iters,
                #    'net_state_dict': criterion_center.state_dict()},
                #    os.path.join(save_dir, 'Iter_%06d_center.ckpt' % total_iters))

            # test accuracy
            if total_iters % args.test_freq == 0:

                # test model on lfw
                net.eval()
                getFeatureFromTorch('./result/cur_lfw_result.mat', net, device,
                                    lfwdataset, lfwloader)
                lfw_accs = evaluation_10_fold('./result/cur_lfw_result.mat')
                _print('LFW Ave Accuracy: {:.4f}'.format(
                    np.mean(lfw_accs) * 100))
                if best_lfw_acc < np.mean(lfw_accs) * 100:
                    best_lfw_acc = np.mean(lfw_accs) * 100
                    best_lfw_iters = total_iters

                net.train()

        if args.plot:
            all_features = np.concatenate(all_features, 0)
            all_labels = np.concatenate(all_labels, 0)
            plot_features(all_features, all_labels, trainset.class_nums, epoch,
                          save_dir)
    _print('Finally Best Accuracy: LFW: {:.4f} in iters: {}'.format(
        best_lfw_acc, best_lfw_iters))
    print('finishing training')
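
# A hedged sketch of the argparse setup that a train(args) entry point like the
# one above would need. Every attribute below is read somewhere in the function,
# but the defaults and help text are assumptions, not values from the original.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description='face recognition training')
    parser.add_argument('--gpus', type=str, default='0', help='comma-separated GPU ids')
    parser.add_argument('--save_dir', type=str, default='./checkpoints')
    parser.add_argument('--model_pre', type=str, default='')
    parser.add_argument('--backbone', type=str, default='MobileFace')
    parser.add_argument('--margin_type', type=str, default='ArcFace')
    parser.add_argument('--feature_dim', type=int, default=512)
    parser.add_argument('--scale_size', type=float, default=32.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--total_epoch', type=int, default=100)
    parser.add_argument('--weight_center', type=float, default=1.0)
    parser.add_argument('--train_root', type=str, default='')
    parser.add_argument('--train_file_list', type=str, default='')
    parser.add_argument('--lfw_test_root', type=str, default='')
    parser.add_argument('--lfw_file_list', type=str, default='')
    parser.add_argument('--resume', action='store_true')
    parser.add_argument('--net_path', type=str, default='')
    parser.add_argument('--margin_path', type=str, default='')
    parser.add_argument('--save_freq', type=int, default=3000)
    parser.add_argument('--test_freq', type=int, default=3000)
    parser.add_argument('--plot', action='store_true')
    return parser.parse_args()
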
Exemplo n.º 23
0
class Trainer(object):
    """
    Trainer class
    """
    def __init__(self,
                 chkpt_path: str,
                 config: TrainerConfig,
                 train: str,
                 test: str,
                 dev: str = None,
                 disable_dataparallel: bool = False):
        """
        Instantiate trainer

        :param str chkpt_path: Path to checkpoint the model, optimizer and scheduler
        :param TrainerConfig config: Configuration instance for Trainer
        :param str train: Path to the JSON-lines file which contains the training set
        :param str test: Path to the JSON-lines file which contains the evaluation set
        :param str dev: Path to the JSON-lines file which contains the development set (optional)
        :param bool disable_dataparallel:
            True if module should not be parallelized across different GPU devices. False by default.
        """
        # Register configuration
        self._config = config
        self.disable_dataparallel = disable_dataparallel

        # Prepare internal states
        self._best_on_dev = 0.0  #: Best score on the development set
        self._ema_on_dev = None  #: Exponential Moving Average score on the development set.
        self._random_restored = False  #: Whether the RNG state was restored or not

        # Epoch & step information
        self._epoch = 0
        self._steps_to_go = 0
        self._step_per_epoch = 0
        self._minibatch_per_epoch = 0

        # Dictionary that records the last performance metrics
        self._last_performances = {}
        self._last_metrics = {}

        # Prepare checkpointing
        self._chkpt_path = Path(chkpt_path)
        if not self._chkpt_path.exists():
            self._chkpt_path.mkdir(parents=True)

        # Logging file handler
        file_handler = logging.FileHandler(filename=Path(
            chkpt_path, 'train.log'),
                                           encoding='UTF-8')
        file_handler.setFormatter(
            logging.Formatter(
                '[%(asctime)s] %(levelname)s %(name)s: %(message)s',
                datefmt='%m/%d/%Y %H:%M:%S'))
        file_handler.setLevel(logging.INFO)

        # Set the logger
        self._logger = logging.getLogger(self.__class__.__name__ +
                                         '_%s' % id(self))
        self._logger.addHandler(file_handler)
        self._logger.setLevel(logging.INFO)

        # If DEBUG is on, turn on the anomaly detection
        if 'DEBUG' in ENV:
            torch.autograd.set_detect_anomaly(True)

        # Prepare Tensorboard if available.
        try:
            from tensorboardX import SummaryWriter
            self._writer = SummaryWriter(logdir=str(self._chkpt_path),
                                         flush_secs=30)
        except ImportError:
            self._writer = None

        # Prepare data-parallel if available.
        if torch.cuda.is_available():
            devices = get_available_device_count()
            cuda_keys = list(range(devices))
            random.shuffle(cuda_keys)

            self.main_device = torch.device('cuda', cuda_keys[0])
            self.device_order = cuda_keys
        else:
            self.main_device = torch.device('cpu')
            self.device_order = [self.main_device]
        self._logger.info(
            "We will use [%s] device as a main device for training, with ordering [%s]",
            self.main_device, self.device_order)

        # Read the datasets
        self.set_seed(
        )  #: Set seed before loading the datasets (because of shuffling in training set)
        self.trainset, self.devset, self.evalset = self._config.read_datasets(
            train=train, dev=dev, test=test)
        self._trainit = iter(self.trainset)

        # Log dataset statistics
        self._logger.info('From %s, we loaded %s mini-batch(es)', train,
                          len(self.trainset))
        self._logger.info('From %s, we loaded %s mini-batch(es)', dev,
                          len(self.devset))
        self._logger.info('From %s, we loaded %s mini-batch(es)', test,
                          len(self.evalset))
        self.trainset.print_item_statistics(self._logger)

        # Build or restore module
        self._module = None
        self._module_init = {}
        self._optimizer = None
        self._answer_checker = None
        self.restore_checkpoint()

    @property
    def checkpoints(self) -> List[Path]:
        """
        :rtype: List[Path]
        :return: List of checkpointed steps (dictionaries)
        """
        checkpoints = sorted(Path(self._chkpt_path).glob('*'))
        checkpoints = [
            x for x in checkpoints if x.is_dir() and x.name.isnumeric()
        ]
        return checkpoints

    @property
    def last_checkpoint(self) -> Path:
        """
        :rtype: Path
        :return: The last checkpoint if one exists; otherwise None
        """
        return self.checkpoints[-1] if len(self.checkpoints) else None

    @property
    def current_epoch(self) -> int:
        """
        :rtype: int
        :return: Current epoch index
        """
        return self._epoch

    @property
    def is_done(self) -> bool:
        """
        :rtype: bool
        :return: True if the trainer has already reached the specified maximum epoch.
        """
        return self._epoch == self._config.epoch

    def close(self):
        """
        Close and clean-up the trainer.
        """
        if self._writer is not None:
            # Close the TensorboardX
            self._writer.close()
            self._writer = None
        if self._answer_checker is not None:
            # Kill the answer checker child processes
            self._answer_checker.close()
            self._answer_checker = None

    def rotate_checkpoint(self, max_item: int = 10):
        """
        Rotate checkpoints

        :param int max_item: Maximum number of allowed checkpoints
        """
        # Check if we should delete older checkpoint(s)
        if len(self.checkpoints) <= max_item:
            return

        for chkpt in self.checkpoints[:-max_item]:
            # Remove old checkpoints
            self._logger.info("Deleting old checkpoint [%s]", chkpt)
            shutil.rmtree(chkpt)

    def checkpoint(self):
        """
        Make a checkpoint
        """
        # Build the directory-name format so that lexicographic ordering of
        # directory names matches the ordering of epoch indices.
        directory_format = '%%0%dd' % int(
            math.ceil(math.log10(self._config.epoch + 1)))
        # If directory exists, exit the method.
        output_dir = Path(self._chkpt_path, directory_format % self._epoch)
        if output_dir.exists():
            return

        # Prepare the directory for checkpointing
        self._logger.info("Save checkpoint to [%s]", output_dir)
        output_dir.mkdir(parents=True)

        # Save the all RNG states used in this trainer.
        torch.save(
            {
                'numpy': numpy.random.get_state(),
                'random': random.getstate(),
                'trainset': self.trainset.get_rng_state(),
                'torch': {
                    'cpu':
                    torch.get_rng_state(),
                    'cuda':
                    torch.cuda.get_rng_state_all()
                    if torch.cuda.is_available() else None
                }
            }, Path(output_dir, 'random.pt'))

        # Save Trainer's internal states
        torch.save(
            {
                '_best_on_dev': self._best_on_dev,
                '_ema_on_dev': self._ema_on_dev,
                '_last_performances': self._last_performances,
                '_last_metrics': self._last_metrics
            }, Path(output_dir, 'internal.pt'))

        # Save the model
        _unwrap_parallel(self._module).save_pretrained(output_dir)
        # Save the optimizer
        torch.save(self._optimizer.state_dict(),
                   Path(output_dir, 'optimizer.pt'))

        # Save the scheduler if available.
        if hasattr(self, '_scheduler'):
            torch.save(self._scheduler.state_dict(),
                       Path(output_dir, 'scheduler.pt'))

        # Write configuration that has been used.
        self._config.save_pretrained(output_dir)
        # Rotate checkpoints.
        self.rotate_checkpoint()

    def restore_checkpoint(self):
        """
        Restore from the last checkpoint if available. Otherwise, configure this trainer from scratch.
        """
        # Check if there exists any checkpoints.
        chkpt_path = self.last_checkpoint
        if chkpt_path:
            # reload configuration from the checkpoint
            self._config = TrainerConfig.from_pretrained(str(chkpt_path))
            self._logger.info("TrainerConfig at [%s] is restored.", chkpt_path)

            # Recover random number generator states
            self.set_seed()  # Set seed before restoring RNG
            random_path = Path(chkpt_path, 'random.pt')
            random_states = torch.load(random_path)
            numpy.random.set_state(random_states['numpy'])
            random.setstate(random_states['random'])
            self.trainset.set_rng_state(random_states['trainset'])

            torch.set_rng_state(random_states['torch']['cpu'])
            if torch.cuda.is_available():
                torch.cuda.set_rng_state_all(random_states['torch']['cuda'])

            # Record that the RNG is restored.
            self._logger.info(
                "State of random number generator is restored from [%s]",
                random_path)
            self._random_restored = True

            # Recover the trainer's internal states
            internal_states = torch.load(Path(chkpt_path, 'internal.pt'))
            for key, value in internal_states.items():
                if hasattr(self, key):
                    setattr(self, key, value)
        else:
            self.set_seed()  # Set seed.

        # Build/restore model
        self._config.model.set_chkpt_path(chkpt_path)
        self._module = Solver.from_pretrained(config=self._config.model)
        self._module_init = {
            id(p): p.clone()
            for p in self._module.parameters()
        }
        self._module.to(self.main_device)
        self._logger.info("A network at [%s] is restored.", chkpt_path)

        # Compute the epoch/step information
        self._minibatch_per_epoch = len(self.trainset)
        self._step_per_epoch = int(
            math.ceil(self._minibatch_per_epoch /
                      self._config.gradient_accumulation_steps))
        self._steps_to_go = self._step_per_epoch * self._config.epoch
        self._logger.info("Steps / Epoch = %5d", self._step_per_epoch)
        self._logger.info("We will run %3d epoch(s) or %6d step(s)",
                          self._config.epoch, self._steps_to_go)
        self._logger.info(
            "Per a single step, %2d gradient(s) will be accumulated. (Total %2d mini-batch(es)/epoch)",
            self._config.gradient_accumulation_steps,
            self._minibatch_per_epoch)
        self._logger.info(
            "We will report TRAINING loss/accuracy for every %3d epoch(s)",
            self._config.epoch_report)
        self._logger.info(
            "We will report DEV ACC. and save CHKPTs for every %3d epoch(s)",
            self._config.epoch_chkpt)

        # Restore the number of steps that were passed before
        if chkpt_path:
            self._epoch = int(chkpt_path.name)
            self._logger.info("Attempt to restore from the checkpoint [%s]",
                              chkpt_path)
            self._logger.info("Resume training from epoch %s", self._epoch)

        # Classify parameters to form parameter groups to build optimizer
        no_w_decay = {'bias', 'norm', 'Norm', '_embedding'}
        parameters = [((2 if 'text_model.model.embeddings' in n else
                        (1 if 'text_model' in n else 0),
                        any(t in n for t in no_w_decay)), p)
                      for n, p in self._module.named_parameters()]
        parameters = groupby(sorted(parameters, key=lambda t: t[0]),
                             key=lambda t: t[0])

        # Build optimizer groups
        optimizer_grouped_parameters = []
        for (encoder_type_flag, is_without_wd), group in parameters:
            group = {'params': [p for _, p in group]}

            if is_without_wd:
                group['weight_decay'] = 0.0

            if encoder_type_flag == 2 and self._config.fix_encoder_embedding:
                group['lr'] = 0.0
            elif encoder_type_flag == 1:
                group['lr'] = self._config.optimizer.kwargs[
                    'lr'] * self._config.lr_multiplier_encoder

            optimizer_grouped_parameters.append(group)

        # Build optimizer before restoration
        self._optimizer = self._config.optimizer.build(
            optimizer_grouped_parameters)
        self._logger.info("We will use the following optimizer: %s",
                          self._optimizer)

        # Restore the optimizer if available.
        if chkpt_path:
            # Check if saved optimizer exists
            optimizer_file = Path(chkpt_path, 'optimizer.pt')
            if optimizer_file.is_file():
                self._optimizer.load_state_dict(torch.load(optimizer_file))
                self._logger.info(
                    "An optimizer for module at [%s] is restored.",
                    optimizer_file)

        # Specify warmup strategy if warmup value is not negative
        warmup_steps = int(self._step_per_epoch * self._config.epoch_warmup)
        if warmup_steps >= 0:
            # Build scheduler before restoration
            self._scheduler = get_linear_schedule_with_warmup(
                self._optimizer,
                num_warmup_steps=warmup_steps,
                num_training_steps=self._steps_to_go)
            self._logger.info(
                "We will use linear scheduling: warm up %s epochs or %s steps",
                self._config.epoch_warmup, warmup_steps)

            # Restore the scheduler if available
            if chkpt_path:
                # Check if saved scheduler exists
                scheduler_file = Path(chkpt_path, 'scheduler.pt')
                if scheduler_file.is_file():
                    self._scheduler.load_state_dict(torch.load(scheduler_file))
                    self._logger.info(
                        "A scheduler for module at [%s] is restored.",
                        scheduler_file)

        # Log the threshold of gradient clipping.
        if self._config.gradient_clip > 0:
            self._logger.info("We will use gradient clipping at %.3f",
                              self._config.gradient_clip)
        else:
            self._logger.info("We will not use gradient clipping")

        # Log the structure of the network.
        parameters_size = sum(p.numel() for p in self._module.parameters())
        disk_space = sum(
            required_space_param(p) for p in self._module.parameters())
        self._logger.info('==== [Network Structure] ====\n%s',
                          str(self._module))
        self._logger.info(
            'There are %12d parameters in a network. Required space for checkpointing is %.3fMB.',
            parameters_size, disk_space / 1048576)

        # Wrap data parallel if we can use more than one GPU
        if len(self.device_order) > 1 and not self.disable_dataparallel:
            self._module = DataParallel(self._module,
                                        device_ids=self.device_order,
                                        output_device=self.device_order[0])
            self._logger.info(
                "We identified [%s] devices for parallel training",
                len(self.device_order))
        else:
            self._logger.info("We don't use DataParallel.")

        # Set answer checker
        self._answer_checker = AnswerChecker(
            is_expression_type=_unwrap_parallel(
                self._module).is_expression_type,
            logger=self._logger)

    def set_seed(self):
        """
        Set the random seeds
        """
        if self._random_restored:
            # Ignore seed setting when state of rng was restored.
            return

        seed = self._config.seed
        self._logger.info("Seed for random number generation = %s", seed)

        random.seed(seed)
        numpy.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    def get_evaluation_output(self, key: str):
        """
        Get the evaluation output of specified key.

        :param str key: metric key to read
        :return: metric value of specified key
        """
        return self._last_performances[key]

    def get_metrics(self) -> dict:
        """
        :return: The latest metric dictionary.
        """
        return self._last_metrics

    def run_a_chkpt_iter(self):
        """
        Run epochs until checkpointing
        """

        try:
            accumulated_values = {}

            for _ in range(self._config.epoch_chkpt):
                # For each epoch (at most the number of checkpointing epoch)
                self._epoch += 1

                all_grad_applied = True
                for batch_step in range(self._minibatch_per_epoch):
                    # For each minibatch
                    self._module.eval()
                    self._module.zero_grad()

                    # Load a minibatch
                    batch = next(self._trainit)
                    batch = self._load_batch(batch)

                    # Execute training
                    self._module.train()
                    reported_values = self._step(**batch)
                    reported_values['Loss/generate'] = reported_values[
                        'total_loss']
                    reported_values['total_loss'].backward()
                    all_grad_applied = False

                    # Accumulate statistics and update gradient
                    _accumulate_stats(reported_values, accumulated_values)
                    if (batch_step +
                            1) % self._config.gradient_accumulation_steps == 0:
                        self._update_grad()
                        all_grad_applied = True
                else:
                    # for-else clause: after the minibatch loop finishes, apply
                    # any gradients that have not been applied yet
                    if not all_grad_applied:
                        self._update_grad()

                if self._config.epoch_report > 0 and self._epoch % self._config.epoch_report == 0:
                    # Log metrics
                    if self._writer is not None:
                        for name, val in accumulated_values.items():
                            self._writer.add_scalar(name,
                                                    sum(val) / len(val),
                                                    self._epoch)
                        # Report current optimizer status
                        self._report_optimizer()

                    accumulated_values.clear()

            # Evaluate current result on development set
            self.evaluate()
            self.checkpoint()
        except Exception as e:
            self._logger.error('Exception occurred!', exc_info=e)
            raise e

    def train(self):
        """
        Do full-length training (until the maximum epoch)
        """
        # Set seed
        self.set_seed()

        # Prepare estimated time calculator class
        eta = ExpectedTimeToFinishCalculator(self._config.epoch,
                                             current=self._epoch)
        while self._epoch < self._config.epoch:
            self.run_a_chkpt_iter()
            eta_time = eta.step(increase=self._config.epoch_chkpt)
            self._logger.info('Expected time to finish: %s', eta_time)

        # Evaluate performance on the evaluation set
        try:
            self.evaluate(is_development=False)
        except Exception as e:
            self._logger.error('Exception occurred!', exc_info=e)
            raise e
        finally:
            # Remove old checkpoints and close Tensorboard writer
            self.rotate_checkpoint(1)

    def _update_grad(self):
        """
        Update accumulated gradients
        """
        if self._config.gradient_clip > 0:
            # If clipping threshold is set, then clip the gradient
            torch.nn.utils.clip_grad_norm_(self._module.parameters(),
                                           self._config.gradient_clip)

        if self._config.gradient_normalize:
            # If normalizing gradient is set, then normalize the gradient
            _normalize_gradients(*self._module.parameters())

        # Apply optimizer & scheduler
        self._optimizer.step()
        if hasattr(self, '_scheduler'):
            self._scheduler.step()

        # Reset the gradient
        self._module.zero_grad()

    def _load_batch(self,
                    batch: ProblemInstance,
                    is_training=True,
                    max_len=0) -> dict:
        """
        Load a batch instance into a dictionary that can be fed into the model.

        :param ProblemInstance batch: A mini-batch
        :param bool is_training: True if this batch is used for training. True by default.
        :param int max_len: Maximum length of equation to be generated. 0 by default (i.e. depends on the current batch)
        :rtype: dict
        :return: Dictionary representing mini-batch
        """
        # Prepare dictionary
        batch_dict = {
            'max_numbers':
            max(len(numbers) for numbers in batch.text.number_value),
            IN_TXT: batch.text.token,
            IN_TPAD: batch.text.pad,
            IN_TNUM: batch.text.number
        }

        # Retrieve information about the target field
        required_field = _unwrap_parallel(self._module).required_field
        # Get equation in terms of the target field
        equation = getattr(batch, required_field)
        if is_training:
            # If this is training, then directly provide target equation for teacher-forcing
            batch_dict[IN_EQN] = equation
        else:
            # Otherwise, just provide information about maximum length of generation & arity of operators
            batch_dict['max_len'] = max(equation.shape[-2], max_len) + 1
            if required_field.startswith('tuple'):
                batch_dict['function_arities'] = getattr(
                    self.evalset, required_field + '_field').function_arities

        if not isinstance(self._module, DataParallel):
            # If we applied data parallel, then move the value to the main device
            batch_dict = {
                k: v.to(self.main_device) if isinstance(v, torch.Tensor) else v
                for k, v in batch_dict.items()
            }

        # Returned value is a dict.
        return batch_dict

    def _step(self, training: bool = True, **kwargs):
        """
        Execute forward computation of the module

        :param bool training: True if this execution is for training. True by default.
        :param kwargs: Keyword arguments to execute the module.
        :return: Result of execution.
            - If training is True, return value will be a dictionary mapping from string to accuracy/loss Tensors.
            - Otherwise, return value will be a LongTensor indicating the generated tokens
        """
        result = self._module(**kwargs)
        if training and type(result) is dict:
            return {k: v.mean() for k, v in result.items()}
        else:
            return result

    def _report_optimizer(self):
        """
        Report the current state of the optimizer
        """
        # Classify parameters by their types
        param_type = {
            id(p): ('Enc' if 'text_model.' in n else 'Dec') +
            ('Embed' if '_embedding' in n else 'Trans')
            for n, p in _unwrap_parallel(self._module).named_parameters()
        }
        # Dictionary for accumulating parameter information
        param_states = {
            key: {
                'weight_norm': [],
                'acc_update': []
            }
            for key in set(param_type.values())
        }

        with torch.no_grad():
            # Without using gradients, accumulate information about weight and gradient
            for gid, group in enumerate(self._optimizer.param_groups):
                for p in group['params']:
                    id_p = id(p)
                    states = param_states[param_type[id_p]]
                    w_init = self._module_init[id_p]

                    w_elem = p.numel()
                    w_norm = p.norm(2).item() / w_elem
                    delta_norm = (w_init -
                                  p.clone().cpu()).norm(2).item() / w_elem

                    states['weight_norm'].append(w_norm)
                    states['acc_update'].append(delta_norm)

        # Write accumulated results
        if self._writer:
            for part, states in param_states.items():
                prefix = 'Optimizer_%s/%%s' % part

                for key, val in states.items():
                    if not len(val):
                        continue

                    # Track average & histograms
                    val = numpy.array(val)
                    self._writer.add_scalar(prefix % key, val.mean(),
                                            self._epoch)
                    self._writer.add_scalar(prefix % (key + '_std'), val.std(),
                                            self._epoch)

    def _check_equation(self, checker: AnswerChecker, outputs: torch.Tensor,
                        batch: ProblemInstance):
        """
        Verify whether the generated equation is correct.

        :param AnswerChecker checker: AnswerChecker instance to compute equation and check answer
        :param torch.Tensor outputs:
            LongTensor containing generated equations.
            - If the model should generate op-tokens, Shape = [B, M, T], where B = batch size, M = beams, and T = length
            - Otherwise, Shape = [B, M, T, 1+2A], where A = maximum arity.
        :param ProblemInstance batch: Mini-batch containing the gold equations and expected answers
        :rtype: list
        :return: List of (index, gold equation, generated beams, per-beam correctness) tuples
        """
        # Retrieve size information
        batch_sz, beam_sz = outputs.shape[:2]

        # Get the target field information
        required_field = _unwrap_parallel(self._module).required_field
        # Retrieve the target field
        field = getattr(self.evalset, required_field + '_field')
        # Recover string representation of gold set and generated beams
        golds = field.convert_ids_to_equations(getattr(batch, required_field))
        beams = [
            field.convert_ids_to_equations(outputs[i]) for i in range(batch_sz)
        ]

        outputs = []
        for i in range(batch_sz):
            # For each batch, retrieve information about written numbers and expected answer tuples
            numbers = batch.text.number_value[i]
            expected = batch.expected[i]

            # Test whether the produced equation in each beam is correct
            results = [
                checker.check(beam, numbers, expected) for beam in beams[i]
            ]
            # Record outputs: (index, goldset output, generated output, correctness)
            outputs.append((i, golds[i], beams[i], results))

        return outputs

    def evaluate(self, is_development: bool = True):
        """
        Evaluate the current model.

        :param bool is_development: True if current evaluation is done on development set. True by default.
        """
        # Shortcut for beam size
        beam_size = self._config.model.beam_size
        # Accumulator for output
        accumulator = []

        # Define log storage for information
        set_type = 'Dev' if is_development else 'Test'
        errored_path = Path(self._chkpt_path, 'error_sample_%s.log' % set_type)
        correct_path = Path(self._chkpt_path,
                            'correct_sample_%s.log' % set_type)
        result_path = Path(self._chkpt_path, 'results.csv')

        # Check whether we should write header or not.
        first_result_output = not result_path.exists()

        # Open file handlers
        errored_fp = errored_path.open('w+t', encoding='UTF-8')
        correct_fp = correct_path.open('w+t', encoding='UTF-8')
        result_fp = result_path.open('a+t', encoding='UTF-8')

        # Set module as evaluation phase
        self._module.eval()

        # Load dataset
        dataset = self.devset if is_development else self.evalset
        max_len = 0 if is_development else MEM_MAX
        for batch in dataset:
            # For each batch item, load it and produce outputs
            kwargs = self._load_batch(batch,
                                      is_training=False,
                                      max_len=max_len)
            outputs = self._step(**kwargs, training=False, beam=beam_size)

            # Convert text into string (for printing purpose)
            texts = dataset.problem_field.convert_ids_to_string(
                batch.text.token)

            # Check the result and print the result for each item.
            for i, gold, beams, results in self._check_equation(
                    self._answer_checker, outputs, batch):
                # Record the best output of the beam search results
                result_dict = {
                    'Index':
                    batch.index[i],
                    'Error':
                    str(type(results[0][2])),
                    'correct':
                    results[0][0],
                    'error_1_Parse':
                    results[0][2] is not None,
                    'error_2_Empty':
                    len(results[0][1]) == 0 and results[0][2] is None,
                    'error_3_Match':
                    not results[0][0] and len(results[0][1]) > 0
                    and results[0][2] is None,
                    'correct_in_beam':
                    any(r[0] for r in results)
                }

                # Accumulate the test result.
                accumulator.append(result_dict)

                # Select appropriate file handler
                fp = errored_fp if not result_dict['correct'] else correct_fp
                # Write problem & result
                fp.writelines([
                    '[Q] ', batch.index[i], '\n', texts[i], '\n',
                    '---------------------------------------\n',
                    '[EXPECTED]\t%s\n' % ' '.join(gold),
                    '---ANSWER:\t%s\n' % batch.expected[i],
                    '---------------------------------------\n'
                ])
                fp.writelines([
                    '[BEAM#%3d]\t%s\n'
                    '---ANSWER:\t%s\n%s' %
                    (b, ' '.join(beam), res[1],
                     '' if res[2] is None else '----ERROR:\t%s %s\n' %
                     (type(res[2]), str(res[2])))
                    for b, (beam, res) in enumerate(zip(beams, results))
                ])
                fp.write('\n')

        # Close file handlers
        errored_fp.close()
        correct_fp.close()

        # Write CSV results
        sorted_keys = sorted(accumulator[0].keys())
        # Write CSV header
        if first_result_output:
            _write_csv_line(result_fp, 'Set', 'GlobalStep', 'Beam',
                            *sorted_keys)

        # Write CSV results
        for values in accumulator:
            _write_csv_line(result_fp, set_type, self._epoch, beam_size,
                            *[values[key] for key in sorted_keys])

        # Close CSV handler
        result_fp.close()

        # Average metric across items (correctness & errors)
        metric_dict = {}
        for key in sorted_keys:
            value = [item[key] for item in accumulator]

            if type(value[0]) is not str:
                average = sum(value) / len(value)

                # Write accumulated results
                self._logger.info('Evaluating on %s (beam %s): %s = %.6f',
                                  set_type, beam_size, key, average)
                metric_dict[set_type + '/' + key] = average

        # Reset the dataset (since dataset reached EOF)
        dataset.reset()

        # Write exponential moving average & maximum value into metric dict
        if is_development:
            self._best_on_dev = max(self._best_on_dev,
                                    metric_dict['Dev/correct'])
            if self._ema_on_dev is None:
                self._ema_on_dev = metric_dict['Dev/correct']
            else:
                self._ema_on_dev = metric_dict[
                    'Dev/correct'] * 0.6 + self._ema_on_dev * 0.4

            metric_dict['Dev/correct_max'] = self._best_on_dev
            metric_dict['Dev/correct_ema'] = self._ema_on_dev

        # Record last output
        self._last_performances[set_type] = [
            item['correct']
            for item in sorted(accumulator, key=lambda d: d['Index'])
        ]
        self._last_metrics.update(metric_dict)
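
# Note: _write_csv_line is referenced above but not shown. A minimal sketch of
# what such a helper could look like (a hypothetical stand-in; the original may
# use the csv module or different quoting):
def _write_csv_line(fp, *values):
    # Join the values with commas and terminate the row with a newline.
    fp.write(','.join(str(v) for v in values) + '\n')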
Exemplo n.º 24
0
def train(args):
    # gpu init
    multi_gpus = False
    best_lfw_acc = 0.0
    best_lfw_iters = 0
    best_agedb30_acc = 0.0
    best_agedb30_iters = 0
    best_cfp_fp_acc = 0.0
    best_cfp_fp_iters = 0
    if len(args.gpus.split(',')) > 1:
        multi_gpus = True
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # log init
    save_dir = os.path.join(
        args.save_dir,
        args.backbone.upper() + datetime.now().date().strftime('%Y%m%d'))
    if not os.path.exists(save_dir):
        #raise NameError('model dir exists!')
        os.makedirs(save_dir)
    logging = init_log(save_dir)
    _print = logging.info

    # define backbone and margin layer
    if args.backbone == 'MobileFace':
        net = MobileFaceNet(512).to(config.device)
    elif args.backbone == 'MNasMobile':
        net = MnasNet(512).to(config.device)
    elif args.backbone == 'ProxyNas':
        net = ProxyNas(512).to(config.device)
    elif args.backbone == 'SERes50_IR':
        net = SE_IR(50, 0.6, 'ir_se').to(config.device)
    elif args.backbone == 'IR_50':
        net = SE_IR(50, 0.6, 'ir').to(config.device)
    else:
        raise ValueError(args.backbone + ' is not available!')
    summary(net.to(config.device), (3, 112, 112))
    # define transform
    if args.backbone == 'ProxyNas':
        transform = transforms.Compose([
            transforms.Resize((112, 112)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])
    else:
        # dataset loader
        transform = transforms.Compose([
            transforms.Resize((112, 112)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),  # range [0, 255] -> [0.0,1.0]
            transforms.Normalize(mean=(0.5, 0.5, 0.5),
                                 std=(0.5, 0.5,
                                      0.5))  # range [0.0, 1.0] -> [-1.0,1.0]
        ])

    # training dataset
    trainset = VGG_FP(config=config, transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=config.batch_size,
                                              shuffle=True,
                                              num_workers=8,
                                              drop_last=False)
    num_iter = len(trainset) // config.batch_size
    numclass = trainset.class_nums

    if args.has_test:

        lfwdataset = LFW(config=config, transform=transform)
        lfwloader = torch.utils.data.DataLoader(lfwdataset,
                                                batch_size=config.batch_size,
                                                shuffle=False,
                                                num_workers=8,
                                                drop_last=False)
        agedbdataset = AgeDB30(config=config, transform=transform)
        agedbloader = torch.utils.data.DataLoader(agedbdataset,
                                                  batch_size=config.batch_size,
                                                  shuffle=False,
                                                  num_workers=8,
                                                  drop_last=False)
        cfpfpdataset = CFP_FP(config=config, transform=transform)
        cfpfploader = torch.utils.data.DataLoader(cfpfpdataset,
                                                  batch_size=config.batch_size,
                                                  shuffle=False,
                                                  num_workers=8,
                                                  drop_last=False)

    if args.margin_type == 'ArcFace':
        margin = ArcMarginProduct(512, numclass, s=args.scale_size)
    elif args.margin_type == 'CosFace':
        # CosFace margin is not implemented in this script
        pass
    elif args.margin_type == 'SphereFace':
        # SphereFace margin is not implemented in this script
        pass
    else:
        raise ValueError(args.margin_type + ' is not available!')
    if args.resume:
        print('resume the model parameters from: ', args.net_path,
              args.margin_path)
        net.load_state_dict(torch.load(args.net_path)['net_state_dict'])
        margin.load_state_dict(torch.load(args.margin_path)['net_state_dict'])

    # define optimizers for different layer
    criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer_ft = optim.SGD([{
        'params': net.parameters(),
        'weight_decay': 5e-4
    }, {
        'params': margin.parameters(),
        'weight_decay': 5e-4
    }],
                             lr=0.001,
                             momentum=0.9,
                             nesterov=True)
    exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer_ft,
                                                milestones=config.milestones,
                                                gamma=0.1)

    if multi_gpus:
        net = DataParallel(net).to(device)
        margin = DataParallel(margin).to(device)
    else:
        net = net.to(device)
        margin = margin.to(device)

    total_iters = 1
    vis = Visualizer(env=args.backbone)
    start_epoch = total_iters // num_iter
    if args.resume:
        total_iters = args.resume
        # recompute the starting epoch from the restored iteration count
        start_epoch = total_iters // num_iter
        with open('result/log_vis_train.txt', 'r') as fw:
            for line in fw.readlines():
                nodes = line.split(':')
                vis.plot_curves({'softmax loss': float(nodes[1])},
                                iters=float(nodes[0]),
                                title='train loss',
                                xlabel='iters',
                                ylabel='train loss')
                vis.plot_curves({'train accuracy': float(nodes[2])},
                                iters=float(nodes[0]),
                                title='train accuracy',
                                xlabel='iters',
                                ylabel='train accuracy')
        with open('result/log_vis_test.txt', 'r') as fw2:
            for line in fw2.readlines():
                nodes = line.split(':')
                vis.plot_curves(
                    {
                        'lfw': float(nodes[1]),
                        'agedb-30': float(nodes[2]),
                        'cfp-fp': float(nodes[3])
                    },
                    iters=float(nodes[0]),
                    title='test accuracy',
                    xlabel='iters',
                    ylabel='test accuracy')

    for epoch in range(1, args.total_epoch + 1):
        exp_lr_scheduler.step()
        if epoch < start_epoch:
            continue
        # train model
        _print('Train Epoch: {}/{} ...'.format(epoch, args.total_epoch))
        net.train()
        log_vis_train = open('result/log_vis_train.txt', 'a')
        log_vis_test = open('result/log_vis_test.txt', 'a')

        since = time.time()
        for data in trainloader:
            img, label = data[0].to(device), data[1].to(device)
            optimizer_ft.zero_grad()

            raw_logits = net(img)
            output = margin(raw_logits, label)
            total_loss = criterion(output, label)
            total_loss.backward()
            optimizer_ft.step()
            # print train information
            if total_iters % 200 == 0:
                # current training accuracy
                _, predict = torch.max(output.data, 1)
                total = label.size(0)
                correct = (predict == label).sum().item()
                # 200 iterations have elapsed since the last report
                time_cur = (time.time() - since) / 200
                since = time.time()
                vis.plot_curves({'softmax loss': total_loss.item()},
                                iters=total_iters,
                                title='train loss',
                                xlabel='iters',
                                ylabel='train loss')
                vis.plot_curves({'train accuracy': correct / total},
                                iters=total_iters,
                                title='train accuracy',
                                xlabel='iters',
                                ylabel='train accuracy')
                log_vis_train.write("%d:%f:%f\n" %
                                    (total_iters, total_loss.item(),
                                     (correct / total)))

                print(
                    "Iters: {:0>6d}/[{:0>2d}], loss: {:.4f}, train_accuracy: {:.4f}, time: {:.2f} s/iter, learning rate: {}"
                    .format(total_iters, epoch, total_loss.item(),
                            correct / total, time_cur,
                            exp_lr_scheduler.get_lr()[0]))

            # save model
            if total_iters % args.save_freq == 0:
                msg = 'Saving checkpoint: {}'.format(total_iters)
                _print(msg)
                if multi_gpus:
                    net_state_dict = net.module.state_dict()
                    margin_state_dict = margin.module.state_dict()
                else:
                    net_state_dict = net.state_dict()
                    margin_state_dict = margin.state_dict()
                if not os.path.exists(save_dir):
                    os.mkdir(save_dir)
                torch.save(
                    {
                        'iters': total_iters,
                        'net_state_dict': net_state_dict
                    },
                    os.path.join(save_dir, 'Iter_%06d_net.ckpt' % total_iters))
                torch.save(
                    {
                        'iters': total_iters,
                        'net_state_dict': margin_state_dict
                    },
                    os.path.join(save_dir,
                                 'Iter_%06d_margin.ckpt' % total_iters))

            # test accuracy
            if total_iters % args.test_freq == 0 and args.has_test:
                # test model on lfw
                net.eval()
                getFeatureFromTorch('./result/cur_lfw_result.mat', net, device,
                                    lfwdataset, lfwloader)
                lfw_accs = evaluation_10_fold('./result/cur_lfw_result.mat')
                _print('LFW Ave Accuracy: {:.4f}'.format(
                    np.mean(lfw_accs) * 100))
                if best_lfw_acc <= np.mean(lfw_accs) * 100:
                    best_lfw_acc = np.mean(lfw_accs) * 100
                    best_lfw_iters = total_iters

                # test model on AgeDB30
                getFeatureFromTorch('./result/cur_agedb30_result.mat', net,
                                    device, agedbdataset, agedbloader)
                age_accs = evaluation_10_fold(
                    './result/cur_agedb30_result.mat')
                _print('AgeDB-30 Ave Accuracy: {:.4f}'.format(
                    np.mean(age_accs) * 100))
                if best_agedb30_acc <= np.mean(age_accs) * 100:
                    best_agedb30_acc = np.mean(age_accs) * 100
                    best_agedb30_iters = total_iters

                # test model on CFP-FP
                getFeatureFromTorch('./result/cur_cfpfp_result.mat', net,
                                    device, cfpfpdataset, cfpfploader)
                cfp_accs = evaluation_10_fold('./result/cur_cfpfp_result.mat')
                _print('CFP-FP Ave Accuracy: {:.4f}'.format(
                    np.mean(cfp_accs) * 100))
                if best_cfp_fp_acc <= np.mean(cfp_accs) * 100:
                    best_cfp_fp_acc = np.mean(cfp_accs) * 100
                    best_cfp_fp_iters = total_iters
                _print(
                    'Current Best Accuracy: LFW: {:.4f} in iters: {}, AgeDB-30: {:.4f} in iters: {} and CFP-FP: {:.4f} in iters: {}'
                    .format(best_lfw_acc, best_lfw_iters, best_agedb30_acc,
                            best_agedb30_iters, best_cfp_fp_acc,
                            best_cfp_fp_iters))
                # _print('Current Best Accuracy:LFW: {:.4f} in iters: {} and CFP-FP: {:.4f} in iters: {}'.format(
                #                             best_lfw_acc, best_lfw_iters, best_cfp_fp_acc, best_cfp_fp_iters))

                vis.plot_curves(
                    {
                        'lfw': np.mean(lfw_accs),
                        'agedb-30': np.mean(age_accs),
                        'cfp-fp': np.mean(cfp_accs)
                    },
                    iters=total_iters,
                    title='test accuracy',
                    xlabel='iters',
                    ylabel='test accuracy')
                log_vis_test.write('%d:%f:%f:%f\n' %
                                   (total_iters, np.mean(lfw_accs),
                                    np.mean(cfp_accs), np.mean(age_accs)))
                net.train()
            total_iters += 1

    _print(
        'Finally Best Accuracy: LFW: {:.4f} in iters: {}, AgeDB-30: {:.4f} in iters: {} and CFP-FP: {:.4f} in iters: {}'
        .format(best_lfw_acc, best_lfw_iters, best_agedb30_acc,
                best_agedb30_iters, best_cfp_fp_acc, best_cfp_fp_iters))
    _print(
        'Finally Best Accuracy: LFW: {:.4f} in iters: {} and CFP-FP: {:.4f} in iters: {}'
        .format(best_lfw_acc, best_lfw_iters, best_cfp_fp_acc,
                best_cfp_fp_iters))
    print('finished training')
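
# Note: init_log is used above but not shown. A minimal sketch of such a helper,
# assuming it wires Python's standard logging to a file in save_dir and to the
# console (hypothetical; the real helper may differ):
import logging
import os


def init_log(save_dir):
    logger = logging.getLogger('train')
    logger.setLevel(logging.INFO)
    fmt = logging.Formatter('%(asctime)s %(message)s')
    # log to a file inside the checkpoint directory
    fh = logging.FileHandler(os.path.join(save_dir, 'train.log'))
    fh.setFormatter(fmt)
    logger.addHandler(fh)
    # and mirror everything to the console
    sh = logging.StreamHandler()
    sh.setFormatter(fmt)
    logger.addHandler(sh)
    return logger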
Exemplo n.º 25
0


    trainLoader = DataLoader(
        data_train, batch_size=64, shuffle=True,num_workers=6)
    #valLoader = DataLoader(
    #    data_val, batch_size=16, shuffle=False,num_workers=6)
    dataset_train_len=len(data_train)
    #dataset_val_len=len(data_val)
    #densenet = models.densenet121(num_classes=14)
    densenet = models.densenet121(pretrained=True)
    densenet.classifier = nn.Linear(1024,14)



    # densenet = pickle.load(open('../../../../media/data/yangliu/xrays/our_trained_densenet_epoch_14.pkl', 'rb'))
    densenet = densenet.cuda()
    densenet = DataParallel(densenet)
    
    #with open(weight_dir+'densenet_epoch_15.pkl','rb') as f:
    #    densenet = pickle.load(f)
    
    # count the trainable parameters of the network
    n_params = 0
    for param in densenet.parameters():
        n_params += param.data.nelement()
    print('Total trainable parameters: {}'.format(n_params))

    optimizer = optim.Adam(densenet.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0)
    model_ft = train_model(densenet, optimizer, num_epochs=100)
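
# Note: train_model is called above but not defined in this snippet. A minimal
# sketch of what it could look like for the 14-label chest X-ray setup, assuming
# trainLoader yields (image, multi-hot label) batches (hypothetical; the real
# training loop may differ):
import torch
import torch.nn as nn


def train_model(model, optimizer, num_epochs=100):
    criterion = nn.BCEWithLogitsLoss()  # multi-label targets, one logit per class
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for images, labels in trainLoader:
            images, labels = images.cuda(), labels.float().cuda()
            optimizer.zero_grad()
            loss = criterion(model(images), labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print('epoch {}: loss {:.4f}'.format(epoch + 1, running_loss / len(trainLoader)))
    return model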

Exemplo n.º 26
0
def main(args):
    print(f"\nModel: {args.arch}")
    # Set the N & M
    m.N = args.N
    m.M = args.M
    # Select the hardware device to use for inference.
    if torch.cuda.is_available():
        device = torch.device('cuda', torch.cuda.current_device())
        torch.backends.cudnn.benchmark = True
    else:
        device = torch.device('cpu')

    # Disable gradient calculations by default.
    torch.set_grad_enabled(False)

    # create checkpoint dir
    os.makedirs(args.checkpoint, exist_ok=True)

    # map the architecture name to its constructor
    arch_builders = {
        'hg1': hg1, 'hg2': hg2, 'hg3': hg3, 'hg4': hg4,
        'hg5': hg5, 'hg6': hg6, 'hg7': hg7, 'hg8': hg8,
    }
    if args.arch not in arch_builders:
        raise Exception('unrecognised model architecture: ' + args.arch)
    model = arch_builders[args.arch](pretrained=False)

    model = DataParallel(model).to(device)

    optimizer = RMSprop(model.parameters(),
                        lr=args.lr,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay)

    best_acc = 0

    # optionally resume from a checkpoint
    if args.resume:
        assert os.path.isfile(args.resume)
        print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        args.start_epoch = checkpoint['epoch']
        best_acc = checkpoint['best_acc']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' (epoch {})".format(
            args.resume, checkpoint['epoch']))
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), resume=True)
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(
            ['Epoch', 'LR', 'Train Loss', 'Val Loss', 'Train Acc', 'Val Acc'])

    # create data loader
    train_dataset = Mpii(args.image_path,
                         is_train=True,
                         inp_res=args.input_shape)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.train_batch,
                              shuffle=True,
                              num_workers=args.workers,
                              pin_memory=True)

    val_dataset = Mpii(args.image_path,
                       is_train=False,
                       inp_res=args.input_shape)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.test_batch,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=True)

    # train and eval
    lr = args.lr
    epoch_times = []
    end = time.time()
    f = open(f"{args.checkpoint}/epoch_times.txt", 'w')
    for epoch in trange(args.start_epoch,
                        args.epochs,
                        desc='Overall',
                        ascii=True):
        start = end
        lr = adjust_learning_rate(optimizer, epoch, lr, args.schedule,
                                  args.gamma)

        # train for one epoch
        train_loss, train_acc = do_training_epoch(train_loader,
                                                  model,
                                                  device,
                                                  Mpii.DATA_INFO,
                                                  optimizer,
                                                  acc_joints=Mpii.ACC_JOINTS)

        # evaluate on validation set
        valid_loss, valid_acc, predictions = do_validation_epoch(
            val_loader,
            model,
            device,
            Mpii.DATA_INFO,
            False,
            acc_joints=Mpii.ACC_JOINTS)

        # print metrics
        tqdm.write(
            f'[{epoch + 1:3d}/{args.epochs:3d}] lr={lr:0.2e} '
            f'train_loss={train_loss:0.4f} train_acc={100 * train_acc:0.2f} '
            f'valid_loss={valid_loss:0.4f} valid_acc={100 * valid_acc:0.2f}')

        # append logger file
        logger.append(
            [epoch + 1, lr, train_loss, valid_loss, train_acc, valid_acc])
        logger.plot_to_file(os.path.join(args.checkpoint, 'log.svg'),
                            ['Train Acc', 'Val Acc'])

        # remember best acc and save checkpoint
        is_best = valid_acc > best_acc
        best_acc = max(valid_acc, best_acc)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            },
            predictions,
            is_best,
            checkpoint=args.checkpoint,
            snapshot=args.snapshot)
        end = time.time()
        epoch_times.append(end - start)
        print(
            f"Average Epoch Time After Epoch {epoch}: {sum(epoch_times) / len(epoch_times)} sec",
            file=f)
    f.close()
    logger.close()
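
# Note: adjust_learning_rate is used above but not shown. A common form of this
# helper, sketched here under the assumption that it decays the rate by `gamma`
# whenever the epoch index appears in `schedule` (hypothetical):
def adjust_learning_rate(optimizer, epoch, lr, schedule, gamma):
    if epoch in schedule:
        lr *= gamma
        # push the decayed rate into every parameter group
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
    return lr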
Exemplo n.º 27
0
    return acc


if __name__ == '__main__':
    opt = Config()

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # get the relative paths of all test images
    identity_list = get_lfw_list(opt.lfw_test_list)
    # build the absolute paths of the images
    img_paths = [os.path.join(opt.lfw_root, each) for each in identity_list]

    if opt.backbone == 'resnet18':
        model = resnet_face18(opt.use_se)
    elif opt.backbone == 'resnet34':
        model = resnet34()
    elif opt.backbone == 'resnet50':
        model = resnet50()

    # You can easily run your operations on multiple GPUs by making your model run parallelly using DataParallel
    # https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html
    model = DataParallel(model)
    # load_model(model, opt.test_model_path)
    model.load_state_dict(torch.load(opt.test_model_path, map_location=device))
    model.to(device)
    model.eval()

    lfw_test(model, img_paths, identity_list, opt.lfw_test_list,
             opt.test_batch_size)
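
# Note: if the checkpoint was saved from a bare (non-DataParallel) model, its
# keys lack the 'module.' prefix that the DataParallel wrapper above expects.
# A small sketch of one way to normalise the keys before loading (an assumption,
# not part of the original script):
def add_module_prefix(state_dict):
    # prepend 'module.' to every key so the dict matches a DataParallel model
    return {('module.' + k if not k.startswith('module.') else k): v
            for k, v in state_dict.items()}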
Exemplo n.º 28
0
def main():
    parser = argparse.ArgumentParser("PyTorch Xview Pipeline")
    arg = parser.add_argument
    arg('--config', metavar='CONFIG_FILE', help='path to configuration file')
    arg('--workers', type=int, default=6, help='number of cpu threads to use')
    arg('--gpu',
        type=str,
        default='0',
        help='List of GPUs for parallel training, e.g. 0,1,2,3')
    arg('--output-dir', type=str, default='weights/')
    arg('--resume', type=str, default='')
    arg('--fold', type=int, default=0)
    arg('--prefix', type=str, default='classifier_')
    arg('--data-dir', type=str, default="/mnt/sota/datasets/deepfake")
    arg('--folds-csv', type=str, default='folds.csv')
    arg('--crops-dir', type=str, default='crops')
    arg('--label-smoothing', type=float, default=0.01)
    arg('--logdir', type=str, default='logs')
    arg('--zero-score', action='store_true', default=False)
    arg('--from-zero', action='store_true', default=False)
    arg('--distributed', action='store_true', default=False)
    arg('--freeze-epochs', type=int, default=0)
    arg("--local_rank", default=0, type=int)
    arg("--seed", default=777, type=int)
    arg("--padding-part", default=3, type=int)
    arg("--opt-level", default='O1', type=str)
    arg("--test_every", type=int, default=1)
    arg("--no-oversample", action="store_true")
    arg("--no-hardcore", action="store_true")
    arg("--only-changed-frames", action="store_true")

    args = parser.parse_args()
    os.makedirs(args.output_dir, exist_ok=True)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
    else:
        os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    cudnn.benchmark = True

    conf = load_config(args.config)
    model = classifiers.__dict__[conf['network']](encoder=conf['encoder'])

    model = model.cuda()
    if args.distributed:
        model = convert_syncbn_model(model)
    ohem = conf.get("ohem_samples", None)
    reduction = "mean"
    if ohem:
        reduction = "none"
    loss_fn = []
    weights = []
    for loss_name, weight in conf["losses"].items():
        loss_fn.append(losses.__dict__[loss_name](reduction=reduction).cuda())
        weights.append(weight)
    loss = WeightedLosses(loss_fn, weights)
    loss_functions = {"classifier_loss": loss}
    optimizer, scheduler = create_optimizer(conf['optimizer'], model)
    bce_best = 100
    start_epoch = 0
    batch_size = conf['optimizer']['batch_size']

    data_train = DeepFakeClassifierDataset(
        mode="train",
        oversample_real=not args.no_oversample,
        fold=args.fold,
        padding_part=args.padding_part,
        hardcore=not args.no_hardcore,
        crops_dir=args.crops_dir,
        data_path=args.data_dir,
        label_smoothing=args.label_smoothing,
        folds_csv=args.folds_csv,
        transforms=create_train_transforms(conf["size"]),
        normalize=conf.get("normalize", None))
    data_val = DeepFakeClassifierDataset(mode="val",
                                         fold=args.fold,
                                         padding_part=args.padding_part,
                                         crops_dir=args.crops_dir,
                                         data_path=args.data_dir,
                                         folds_csv=args.folds_csv,
                                         transforms=create_val_transforms(
                                             conf["size"]),
                                         normalize=conf.get("normalize", None))
    val_data_loader = DataLoader(data_val,
                                 batch_size=batch_size * 2,
                                 num_workers=args.workers,
                                 shuffle=False,
                                 pin_memory=False)
    os.makedirs(args.logdir, exist_ok=True)
    summary_writer = SummaryWriter(args.logdir + '/' +
                                   conf.get("prefix", args.prefix) +
                                   conf['encoder'] + "_" + str(args.fold))
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            state_dict = checkpoint['state_dict']
            state_dict = {k[7:]: w for k, w in state_dict.items()}
            model.load_state_dict(state_dict, strict=False)
            if not args.from_zero:
                start_epoch = checkpoint['epoch']
                if not args.zero_score:
                    bce_best = checkpoint.get('bce_best', 0)
            print("=> loaded checkpoint '{}' (epoch {}, bce_best {})".format(
                args.resume, checkpoint['epoch'], checkpoint['bce_best']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    if args.from_zero:
        start_epoch = 0
    current_epoch = start_epoch

    if conf['fp16']:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.opt_level,
                                          loss_scale='dynamic')

    snapshot_name = "{}{}_{}_{}".format(conf.get("prefix",
                                                 args.prefix), conf['network'],
                                        conf['encoder'], args.fold)

    if args.distributed:
        model = DistributedDataParallel(model, delay_allreduce=True)
    else:
        model = DataParallel(model).cuda()

    # register each block, in order to extract the blocks' feature maps
    # (access through .module because the model is wrapped in (Distributed)DataParallel)
    for name, block in model.module.encoder.blocks.named_children():
        block.register_forward_hook(hook_function)

    data_val.reset(1, args.seed)
    max_epochs = conf['optimizer']['schedule']['epochs']

    for epoch in range(start_epoch, max_epochs):
        data_train.reset(epoch, args.seed)
        train_sampler = None
        if args.distributed:
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                data_train)
            train_sampler.set_epoch(epoch)
        if epoch < args.freeze_epochs:
            print("Freezing encoder!!!")
            model.module.encoder.eval()
            for p in model.module.encoder.parameters():
                p.requires_grad = False
        else:
            model.module.encoder.train()
            for p in model.module.encoder.parameters():
                p.requires_grad = True

        train_data_loader = DataLoader(data_train,
                                       batch_size=batch_size,
                                       num_workers=args.workers,
                                       shuffle=train_sampler is None,
                                       sampler=train_sampler,
                                       pin_memory=False,
                                       drop_last=True)

        train_epoch(current_epoch, loss_functions, model, optimizer, scheduler,
                    train_data_loader, summary_writer, conf, args.local_rank,
                    args.only_changed_frames)
        model = model.eval()

        if args.local_rank == 0:
            torch.save(
                {
                    'epoch': current_epoch + 1,
                    'state_dict': model.state_dict(),
                    'bce_best': bce_best,
                }, os.path.join(args.output_dir, snapshot_name + "_last"))
            torch.save(
                {
                    'epoch': current_epoch + 1,
                    'state_dict': model.state_dict(),
                    'bce_best': bce_best,
                },
                os.path.join(args.output_dir, snapshot_name + "_{}".format(current_epoch)))
            if (epoch + 1) % args.test_every == 0:
                bce_best = evaluate_val(args,
                                        val_data_loader,
                                        bce_best,
                                        model,
                                        snapshot_name=snapshot_name,
                                        current_epoch=current_epoch,
                                        summary_writer=summary_writer)
        current_epoch += 1
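
# Note: hook_function is registered above but not shown. A minimal sketch of a
# forward hook that stashes each block's feature map for later inspection
# (hypothetical; the original hook may do something different):
feature_maps = []


def hook_function(module, inputs, output):
    # store a detached copy of the block's output feature map
    feature_maps.append(output.detach())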
Exemplo n.º 29
0
def main():
    if raw:
        build_files(data_path=RAW_DATA_PATH)
        exit(1)

    model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(
        config=model_config)
    MULTI_GPU = False
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        MULTI_GPU = True
    model.to(device)

    total_lines = 0
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            total_lines += len(f.readlines())
    total_steps = int(total_lines * EPOCHS / BATCH_SIZE)
    print('total steps = {}'.format(total_steps))
    optimizer = pytorch_transformers.AdamW(model.parameters(),
                                           lr=LR,
                                           correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=WARMUP_STEPS, t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)
    print('starting training')
    for epoch in range(EPOCHS):
        print('epoch {}'.format(epoch))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                      'r') as f:
                running_loss = 0
                sub_lines = f.readlines()
                sub_lines = [line.split()[:n_ctx] for line in sub_lines]
                random.shuffle(sub_lines)
                for step in range(len(sub_lines) // BATCH_SIZE):
                    batch = sub_lines[step * BATCH_SIZE:(step + 1) *
                                      BATCH_SIZE]
                    batch_labels = []
                    batch_inputs = []
                    for ids in batch:
                        int_ids_for_labels = [int(x) for x in ids]
                        int_ids_for_inputs = [int(x) for x in ids]
                        batch_labels.append(int_ids_for_labels)
                        batch_inputs.append(int_ids_for_inputs)
                    batch_labels = torch.tensor(batch_labels).long().to(device)
                    batch_inputs = torch.tensor(batch_inputs).long().to(device)

                    optimizer.zero_grad()
                    outputs = model(input_ids=batch_inputs,
                                    labels=batch_labels)
                    loss, logits = outputs[:2]

                    if MULTI_GPU:
                        loss = loss.mean()

                    if fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       max_grad_norm)

                    running_loss += loss.item()
                    optimizer.step()
                    # step the scheduler after the optimizer (PyTorch >= 1.1 ordering)
                    scheduler.step()
                    if (step + 1) % LOG_STEP == 0:
                        print(
                            'step {} of piece {} of epoch {}, loss {}'.format(
                                step + 1, piece_num, epoch + 1,
                                running_loss / LOG_STEP))
                        running_loss = 0
            piece_num += 1

        print('saving model for epoch {}'.format(epoch))
        os.makedirs('./model/model_epoch{}'.format(epoch + 1), exist_ok=True)
        # unwrap DataParallel before calling save_pretrained
        (model.module if MULTI_GPU else model).save_pretrained(
            './model/model_epoch{}'.format(epoch + 1))
        torch.save(scheduler.state_dict(),
                   './model/model_epoch{}/scheduler.pt'.format(epoch + 1))
        torch.save(optimizer.state_dict(),
                   './model/model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    os.makedirs('./model/final_model', exist_ok=True)
    (model.module if MULTI_GPU else model).save_pretrained('./model/final_model')
    torch.save(scheduler.state_dict(), './model/final_model/scheduler.pt')
    torch.save(optimizer.state_dict(), './model/final_model/optimizer.pt')
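
# Note on the schedule used above: WarmupLinearSchedule ramps the learning rate
# linearly from 0 to LR over WARMUP_STEPS, then decays it linearly back towards
# 0 at t_total. A rough sketch of the multiplier it applies, for intuition only
# (an assumption about the library's behaviour, not its actual code):
def warmup_linear_multiplier(step, warmup_steps, t_total):
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    # linear decay from 1.0 at the end of warmup down to 0.0 at t_total
    return max(0.0, (t_total - step) / max(1.0, t_total - warmup_steps))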
Exemplo n.º 30
0
base_params = filter(lambda p: id(p) not in ignored_params, net.parameters())

optimizer_ft = optim.SGD([
    {'params': base_params, 'weight_decay': 4e-5},
    {'params': net.linear1.parameters(), 'weight_decay': 4e-4},
    {'params': ArcMargin.weight, 'weight_decay': 4e-4},
    {'params': prelu_params, 'weight_decay': 0.0}
], lr=0.1, momentum=0.9, nesterov=True)

exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer_ft, milestones=[36, 52, 58], gamma=0.1)


net = net.cuda()
ArcMargin = ArcMargin.cuda()
if multi_gpus:
    net = DataParallel(net)
    ArcMargin = DataParallel(ArcMargin)
criterion = torch.nn.CrossEntropyLoss()


best_acc = 0.0
best_epoch = 0
for epoch in range(start_epoch, TOTAL_EPOCH+1):
    exp_lr_scheduler.step()
    # train model
    _print('Train Epoch: {}/{} ...'.format(epoch, TOTAL_EPOCH))
    net.train()

    train_total_loss = 0.0
    total = 0
    since = time.time()
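
# Note: ignored_params and prelu_params are referenced above but not defined in
# this fragment. One way they are commonly collected for this kind of per-group
# weight-decay setup (an assumption, not the original code):
ignored_params = list(map(id, net.linear1.parameters()))
prelu_params = []
for m in net.modules():
    if isinstance(m, torch.nn.PReLU):
        # PReLU slopes are trained without weight decay
        ignored_params += list(map(id, m.parameters()))
        prelu_params += list(m.parameters())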
Exemplo n.º 31
0
def make_network(configs):
    PoseNet = importNet(configs['network'])
    train_cfg = configs['train']
    config = configs['inference']

    poseNet = PoseNet(**config)

    forward_net = DataParallel(poseNet.cuda())

    def calc_loss(*args, **kwargs):
        return poseNet.calc_loss(*args, **kwargs)

    config['net'] = Trainer(forward_net, configs['inference']['keys'],
                            calc_loss)
    train_cfg['optimizer'] = torch.optim.Adam(config['net'].parameters(),
                                              train_cfg['learning_rate'])

    exp_path = os.path.join('exp', configs['opt'].exp)
    if not os.path.exists(exp_path):
        os.mkdir(exp_path)
    logger = open(os.path.join(exp_path, 'log'), 'a+')

    def make_train(batch_id, config, phase, **inputs):
        for i in inputs:
            inputs[i] = make_input(inputs[i])

        net = config['inference']['net']
        config['batch_id'] = batch_id

        if phase != 'inference':
            result = net(inputs['imgs'],
                         **{i: inputs[i]
                            for i in inputs if i != 'imgs'})

            num_loss = len(config['train']['loss'])

            ## I use the last outputs as the loss
            ## the weights of the loss are controlled by config['train']['loss']
            losses = {
                i[0]: result[-num_loss + idx] * i[1]
                for idx, i in enumerate(config['train']['loss'])
            }

            loss = 0
            toprint = '\n{}: '.format(batch_id)
            for i in losses:
                loss = loss + torch.mean(losses[i])

                my_loss = make_output(losses[i])
                my_loss = my_loss.mean(axis=0)

                if my_loss.size == 1:
                    toprint += ' {}: {}'.format(i,
                                                format(my_loss.mean(), '.8f'))
                else:
                    toprint += '\n{}'.format(i)
                    for j in my_loss:
                        toprint += ' {}'.format(format(j.mean(), '.8f'))

            logger.write(toprint)
            logger.flush()

            # fetch the optimizer before it is used in the lr-decay branch below
            optimizer = train_cfg['optimizer']
            if batch_id == 200000:
                ## decrease the learning rate after 200000 iterations
                for param_group in optimizer.param_groups:
                    param_group['lr'] = 1e-5

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            return None
        else:
            out = {}
            net = net.eval()
            result = net(**inputs)
            if not isinstance(result, (list, tuple)):
                result = [result]
            out['preds'] = [make_output(i) for i in result]
            return out

    return make_train
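
# Note: make_input and make_output are used above but not shown. Minimal
# sketches, assuming they simply move data between numpy arrays and CUDA
# tensors (hypothetical; the originals may handle more cases):
import numpy as np
import torch


def make_input(x):
    # numpy array (or tensor) -> float CUDA tensor
    if isinstance(x, np.ndarray):
        x = torch.from_numpy(x)
    return x.float().cuda()


def make_output(x):
    # tensor (possibly on GPU) -> numpy array
    return x.detach().cpu().numpy()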
Exemplo n.º 32
0
def main():
    global args
    args = parser.parse_args()

    torch.manual_seed(0)

    ##################################

    nodmodel = import_module(args.model1)
    config1, nod_net, loss, get_pbb = nodmodel.get_model()
    args.lr_stage = config1['lr_stage']
    args.lr_preset = config1['lr']

    save_dir = args.save_dir

    ##################################
    casemodel = import_module(args.model2)
    
    config2 = casemodel.config
    args.lr_stage2 = config2['lr_stage']
    args.lr_preset2 = config2['lr']
    topk = config2['topk']
    case_net = casemodel.CaseNet(topk = topk,nodulenet=nod_net)

    args.miss_ratio = config2['miss_ratio']
    args.miss_thresh = config2['miss_thresh']
    if args.debug:
        args.save_dir = 'debug'

    ################################
    start_epoch = args.start_epoch
    if args.resume:
        checkpoint = torch.load(args.resume)
        if start_epoch == 0:
            start_epoch = checkpoint['epoch'] + 1
        if not save_dir:
            save_dir = checkpoint['save_dir']
        else:
            save_dir = os.path.join('results',save_dir)
        case_net.load_state_dict(checkpoint['state_dict'])
    else:
        if start_epoch == 0:
            start_epoch = 1
        if not save_dir:
            exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
            save_dir = os.path.join('results', args.model1 + '-' + exp_id)
        else:
            save_dir = os.path.join('results',save_dir)
    if args.epochs is None:
        end_epoch = args.lr_stage2[-1]
    else:
        end_epoch = args.epochs
    ################################
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir,'log')
    if args.test1 != 1 and args.test2 != 1:
        sys.stdout = Logger(logfile)
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f,os.path.join(save_dir,f))
    ################################
    torch.cuda.set_device(0)
    #nod_net = nod_net.cuda()
    case_net = case_net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
    if not args.debug:
        case_net = DataParallel(case_net)
        nod_net = DataParallel(nod_net)
    ################################


    if args.test1 == 1:
        testsplit = np.load('full.npy')
        dataset = DataBowl3Classifier(testsplit, config2, phase = 'test')
        predlist = test_casenet(case_net,dataset).T
        anstable = np.concatenate([[testsplit],predlist],0).T
        df = pandas.DataFrame(anstable)
        df.columns = ['id', 'cancer']
        df.to_csv('allstage1.csv',index=False)
        return

    if args.test2 ==1:

        testsplit = np.load('test.npy')
        dataset = DataBowl3Classifier(testsplit, config2, phase = 'test')
        predlist = test_casenet(case_net,dataset).T
        anstable = np.concatenate([[testsplit],predlist],0).T
        df = pandas.DataFrame(anstable)
        df.columns = ['id', 'cancer']
        df.to_csv('quick',index=False)
        return
    if args.test3 == 1:
        testsplit3 = np.load('stage2.npy')
        dataset = DataBowl3Classifier(testsplit3,config2,phase = 'test')
        predlist = test_casenet(case_net,dataset).T
        anstable = np.concatenate([[testsplit3],predlist],0).T
        df = pandas.DataFrame(anstable)
        df.columns = ['id', 'cancer']
        df.to_csv('stage2_ans.csv',index=False)
        return
    print(save_dir)
    print(args.save_freq)
    trainsplit = np.load('kaggleluna_full.npy')
    valsplit = np.load('valsplit.npy')
    testsplit = np.load('test.npy')

    dataset = DataBowl3Detector(trainsplit,config1,phase = 'train')
    train_loader_nod = DataLoader(dataset,batch_size = args.batch_size,
        shuffle = True,num_workers = args.workers,pin_memory=True)

    dataset = DataBowl3Detector(valsplit,config1,phase = 'val')
    val_loader_nod = DataLoader(dataset,batch_size = args.batch_size,
        shuffle = False,num_workers = args.workers,pin_memory=True)

    optimizer = torch.optim.SGD(nod_net.parameters(),
        args.lr,momentum = 0.9,weight_decay = args.weight_decay)
    
    trainsplit = np.load('full.npy')
    dataset = DataBowl3Classifier(trainsplit,config2,phase = 'train')
    train_loader_case = DataLoader(dataset,batch_size = args.batch_size2,
        shuffle = True,num_workers = args.workers,pin_memory=True)
    
    dataset = DataBowl3Classifier(valsplit,config2,phase = 'val')
    val_loader_case = DataLoader(dataset,batch_size = max([args.batch_size2,1]),
        shuffle = False,num_workers = args.workers,pin_memory=True)

    dataset = DataBowl3Classifier(trainsplit,config2,phase = 'val')
    all_loader_case = DataLoader(dataset,batch_size = max([args.batch_size2,1]),
        shuffle = False,num_workers = args.workers,pin_memory=True)

    optimizer2 = torch.optim.SGD(case_net.parameters(),
        args.lr,momentum = 0.9,weight_decay = args.weight_decay)
    
    for epoch in range(start_epoch, end_epoch + 1):
        if epoch == start_epoch:
            lr = args.lr
            debug = args.debug
            args.lr = 0.0
            args.debug = True
            train_casenet(epoch, case_net, train_loader_case, optimizer2, args)
            args.lr = lr
            args.debug = debug
        if epoch < args.lr_stage[-1]:
            train_nodulenet(train_loader_nod, nod_net, loss, epoch, optimizer, args)
            validate_nodulenet(val_loader_nod, nod_net, loss)
        if epoch > config2['startepoch']:
            train_casenet(epoch, case_net, train_loader_case, optimizer2, args)
            val_casenet(epoch, case_net, val_loader_case, args)
            val_casenet(epoch, case_net, all_loader_case, args)

        if epoch % args.save_freq == 0:            
            state_dict = case_net.module.state_dict()
            for key in state_dict.keys():
                state_dict[key] = state_dict[key].cpu()

            torch.save({
                'epoch': epoch,
                'save_dir': save_dir,
                'state_dict': state_dict,
                'args': args},
                os.path.join(save_dir, '%03d.ckpt' % epoch))
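
# Note: Logger (assigned to sys.stdout above) is not defined in this snippet.
# A minimal sketch of a stdout "tee" with that behaviour, assuming it mirrors
# every write to both the console and the log file (hypothetical):
import sys


class Logger(object):
    def __init__(self, logfile):
        self.terminal = sys.stdout
        self.log = open(logfile, 'a')

    def write(self, message):
        # mirror the message to the console and to the log file
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        self.terminal.flush()
        self.log.flush()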
Exemplo n.º 33
0
def main():
    global args
    args = parser.parse_args()

    torch.manual_seed(0)
    torch.cuda.set_device(0)

    model = import_module(args.model)
    config, net, loss, get_pbb = model.get_model()
    start_epoch = args.start_epoch
    save_dir = args.save_dir

    if args.resume:
        checkpoint = torch.load(args.resume)
        if start_epoch == 0:
            start_epoch = checkpoint['epoch'] + 1
        if not save_dir:
            save_dir = checkpoint['save_dir']
        else:
            save_dir = os.path.join('results', save_dir)
        net.load_state_dict(checkpoint['state_dict'])
    else:
        if start_epoch == 0:
            start_epoch = 1
        if not save_dir:
            exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
            save_dir = os.path.join('results', args.model + '-' + exp_id)
        else:
            save_dir = os.path.join('results', save_dir)

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir, 'log')
    if args.test != 1:
        sys.stdout = Logger(logfile)
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(save_dir, f))
    n_gpu = setgpu(args.gpu)
    args.n_gpu = n_gpu
    net = net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
    net = DataParallel(net)
    datadir = config_training['preprocess_result_path']

    print(net)
    if args.test == 1:
        margin = 32
        sidelen = 144

        split_comber = SplitComb(sidelen, config['max_stride'],
                                 config['stride'], margin, config['pad_value'])
        dataset = data.DataBowl3Detector(datadir,
                                         'full.npy',
                                         config,
                                         phase='test',
                                         split_comber=split_comber)
        test_loader = DataLoader(dataset,
                                 batch_size=1,
                                 shuffle=False,
                                 num_workers=args.workers,
                                 collate_fn=data.collate,
                                 pin_memory=False)
        with torch.no_grad():
            test(test_loader, net, get_pbb, save_dir, config)
        return

    #net = DataParallel(net)

    dataset = data.DataBowl3Detector(datadir,
                                     'kaggleluna_full.npy',
                                     config,
                                     phase='train')
    train_loader = DataLoader(dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.workers,
                              pin_memory=True)

    dataset = data.DataBowl3Detector(datadir,
                                     'valsplit.npy',
                                     config,
                                     phase='val')
    val_loader = DataLoader(dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=True)

    optimizer = torch.optim.SGD(net.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=args.weight_decay)

    def get_lr(epoch):
        if epoch <= args.epochs * 0.5:
            lr = args.lr
        elif epoch <= args.epochs * 0.8:
            lr = 0.1 * args.lr
        else:
            lr = 0.01 * args.lr
        return lr

    for epoch in range(start_epoch, args.epochs + 1):
        train(train_loader, net, loss, epoch, optimizer, get_lr,
              args.save_freq, save_dir)
        with torch.no_grad():
            validate(val_loader, net, loss)