Example #1
train, val = split_by_individuals(glucose_file, meal_file, ratio=0.8)
glucose_train = GlucoseData(train, transform=combine_cgm_meals)
glucose_val = GlucoseData(val, transform=combine_cgm_meals)

loaders_dict = {
    'train':
    torch.utils.data.DataLoader(glucose_train,
                                batch_size=400,
                                shuffle=False,
                                drop_last=True),
    'val':
    torch.utils.data.DataLoader(glucose_val,
                                batch_size=400,
                                shuffle=False,
                                drop_last=True)
}

model = Seq2Seq()
model.to(device)

optimizer_base = RAdam(model.parameters(), lr=1e-1)
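# wrap RAdam in Lookahead (k=5 slow-weight sync steps, alpha=0.5 interpolation), a pairing often called Ranger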
optimizer = Lookahead(optimizer=optimizer_base, k=5, alpha=0.5)

model.train()

train_model(model=model,
            dataloaders=loaders_dict,
            optimizer=optimizer,
            device=device,
            num_epochs=25)
Example #2
def main(args):
    # 1. prepare data & models
    train_transforms = transforms.Compose([
        ScaleMinSideToSize((CROP_SIZE, CROP_SIZE)),
        CropCenter(CROP_SIZE),
        TransformByKeys(transforms.ToPILImage(), ("image", )),
        TransformByKeys(transforms.ToTensor(), ("image", )),
        TransformByKeys(
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
            ("image", )),
    ])

    print("Reading data...")
    train_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'train'),
                                             train_transforms,
                                             split="train")
    train_dataloader = data.DataLoader(train_dataset,
                                       batch_size=args.batch_size,
                                       num_workers=1,
                                       pin_memory=True,
                                       shuffle=True,
                                       drop_last=True)
    val_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'train'),
                                           train_transforms,
                                           split="val")
    val_dataloader = data.DataLoader(val_dataset,
                                     batch_size=args.batch_size,
                                     num_workers=1,
                                     pin_memory=True,
                                     shuffle=False,
                                     drop_last=False)

    print("Creating model...")
    device = torch.device("cuda:0") if args.gpu else torch.device("cpu")
    #model = models.resnet18(pretrained=True)
    #model.fc = nn.Linear(model.fc.in_features, 2 * NUM_PTS, bias=True)

    #model = models.densenet161(pretrained=True)
    #model.classifier = nn.Linear(model.classifier.in_features, 2 * NUM_PTS, bias=True)

    model = EfficientNet.from_pretrained('efficientnet-b4',
                                         num_classes=2 * NUM_PTS)

    if args.oldname:
        with open(f"{args.oldname}_best.pth", "rb") as fp:
            best_state_dict = torch.load(fp, map_location="cpu")
            model.load_state_dict(best_state_dict)

    model.to(device)

    #optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, amsgrad=True)
    optimizer = RAdam(model.parameters(), lr=args.learning_rate)

    loss_fn = fnn.mse_loss
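    # ReduceLROnPlateau lowers the learning rate when the loss passed to .step() stops improving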
    lr_scheduler = ReduceLROnPlateau(optimizer)

    # 2. train & validate
    print("Ready for training...")
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        train_loss = train(model,
                           train_dataloader,
                           loss_fn,
                           optimizer,
                           device=device)
        val_loss = validate(model, val_dataloader, loss_fn, device=device)
        print("Epoch #{:2}:\ttrain loss: {:5.2}\tval loss: {:5.2}\tlr: {:5.2}".
              format(epoch, train_loss, val_loss, get_lr(optimizer)))
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            with open(f"{args.name}_best.pth", "wb") as fp:
                torch.save(model.state_dict(), fp)
            os.system(
                f'cp /content/MadeCvHw1/{args.name}_best.pth "/content/drive/My Drive/MADE/CV_HW1/"'
            )
        lr_scheduler.step(train_loss)

    # 3. predict
    test_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'test'),
                                            train_transforms,
                                            split="test")
    test_dataloader = data.DataLoader(test_dataset,
                                      batch_size=args.batch_size,
                                      num_workers=1,
                                      pin_memory=True,
                                      shuffle=False,
                                      drop_last=False)

    with open(f"{args.name}_best.pth", "rb") as fp:
        best_state_dict = torch.load(fp, map_location="cpu")
        model.load_state_dict(best_state_dict)

    test_predictions = predict(model, test_dataloader, device)
    with open(f"{args.name}_test_predictions.pkl", "wb") as fp:
        pickle.dump(
            {
                "image_names": test_dataset.image_names,
                "landmarks": test_predictions
            }, fp)

    create_submission(args.data, test_predictions, f"{args.name}_submit.csv")
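
The epoch log above calls a get_lr helper that is not defined in this snippet; a minimal sketch of what it is assumed to do (report the current learning rate of the optimizer's first parameter group):

def get_lr(optimizer):
    # read the current learning rate from the first parameter group
    for param_group in optimizer.param_groups:
        return param_group["lr"]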
Example #3
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--meta_path",
        default=None,
        type=str,
        required=False,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument(
        '--classifier',
        default='guoday',
        type=str,
        required=True,
        help='classifier type, guoday or MLP or GRU_MLP or ...')
    parser.add_argument('--optimizer',
                        default='RAdam',
                        type=str,
                        required=True,
                        help='optimizer we use, RAdam or ...')
    parser.add_argument("--do_label_smoothing",
                        default='yes',
                        type=str,
                        required=True,
                        help="Whether to do label smoothing. yes or no.")
    parser.add_argument('--draw_loss_steps',
                        default=1,
                        type=int,
                        required=True,
                        help='training steps to draw loss')
    parser.add_argument('--label_name',
                        default='label',
                        type=str,
                        required=True,
                        help='label name in original train set index')

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        default='yes',
                        type=str,
                        required=True,
                        help="Whether to run training. yes or no.")
    parser.add_argument("--do_test",
                        default='yes',
                        type=str,
                        required=True,
                        help="Whether to run training. yes or no.")
    parser.add_argument("--do_eval",
                        default='yes',
                        type=str,
                        required=True,
                        help="Whether to run eval on the dev set. yes or no.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--eval_steps", default=200, type=int, help="")
    parser.add_argument("--lstm_hidden_size", default=300, type=int, help="")
    parser.add_argument("--lstm_layers", default=2, type=int, help="")
    parser.add_argument("--dropout", default=0.5, type=float, help="")

    parser.add_argument("--train_steps", default=-1, type=int, help="")
    parser.add_argument("--report_steps", default=-1, type=int, help="")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--split_num", default=3, type=int, help="text split")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    args = parser.parse_args()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                              do_lower_case=args.do_lower_case)

    # tensorboard_log_dir = args.output_dir

    # loss_now = tf.placeholder(dtype=tf.float32, name='loss_now')
    # loss_mean = tf.placeholder(dtype=tf.float32, name='loss_mean')
    # loss_now_variable = loss_now
    # loss_mean_variable = loss_mean
    # train_loss = tf.summary.scalar('train_loss', loss_now_variable)
    # dev_loss_mean = tf.summary.scalar('dev_loss_mean', loss_mean_variable)
    # merged = tf.summary.merge([train_loss, dev_loss_mean])

    config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3)
    config.hidden_dropout_prob = args.dropout

    # Prepare model
    if args.do_train == 'yes':
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, args, config=config)

        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            model = DDP(model)
        elif args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    if args.do_train == 'yes':
        print(
            '________________________now training______________________________'
        )
        # Prepare data loader

        train_examples = read_examples(os.path.join(args.data_dir,
                                                    'train.csv'),
                                       is_training=True,
                                       label_name=args.label_name)
        train_features = convert_examples_to_features(train_examples,
                                                      tokenizer,
                                                      args.max_seq_length,
                                                      args.split_num, True)
        # print('train_feature_size=', train_features.__sizeof__())
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'),
                                     dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features,
                                                   'input_mask'),
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features,
                                                    'segment_ids'),
                                       dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features],
                                 dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label)
        # print('train_data=',train_data[0])
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size //
                                      args.gradient_accumulation_steps)

        num_train_optimization_steps = args.train_steps

        # Prepare optimizer

        param_optimizer = list(model.named_parameters())

        # hack to remove the pooler, which is not used
        # and would otherwise produce None grads that break apex
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
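        # apply weight decay to every parameter except biases and LayerNorm weights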
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        if args.optimizer == 'RAdam':
            optimizer = RAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate)
        else:
            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=args.train_steps)

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        model.train()
        tr_loss = 0
        loss_batch = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        bar = tqdm(range(num_train_optimization_steps),
                   total=num_train_optimization_steps)
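        # cycle the dataloader so the fixed number of optimization steps can run past a single epoch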
        train_dataloader = cycle(train_dataloader)

        # with tf.Session() as sess:
        #     summary_writer = tf.summary.FileWriter(tensorboard_log_dir, sess.graph)
        #     sess.run(tf.global_variables_initializer())

        list_loss_mean = []
        bx = []
        eval_F1 = []
        ax = []

        for step in bar:
            batch = next(train_dataloader)
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids=input_ids,
                         token_type_ids=segment_ids,
                         attention_mask=input_mask,
                         labels=label_ids)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            loss_batch += loss.item()
            train_loss = round(
                tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1),
                4)

            bar.set_description("loss {}".format(train_loss))
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            if args.fp16:
                # model.half() is used here, so no apex-style optimizer.backward(loss) is needed
                loss.backward()
            else:
                loss.backward()

            # plot the training loss curve every draw_loss_steps documents
            if (step + 1) % int(args.draw_loss_steps /
                                (args.train_batch_size /
                                 args.gradient_accumulation_steps)) == 0:
                list_loss_mean.append(round(loss_batch, 4))
                bx.append(step + 1)
                plt.plot(bx,
                         list_loss_mean,
                         label='loss_mean',
                         linewidth=1,
                         color='b',
                         marker='o',
                         markerfacecolor='green',
                         markersize=2)
                plt.savefig(args.output_dir + '/labeled.jpg')
                loss_batch = 0

            # parameters are updated once every gradient_accumulation_steps batches.
            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify the learning rate with the special warmup BERT uses
                    # (warmup_linear and args.warmup_proportion are assumed to be defined
                    # elsewhere in the project; the fp32 path relies on the scheduler below)
                    lr_this_step = args.learning_rate * warmup_linear.get_lr(
                        global_step, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1

            # report results every eval_steps real batches (200 by default).
            if step % (args.eval_steps *
                       args.gradient_accumulation_steps) == 0 and step > 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))

            # run evaluation roughly 10 times over the course of training.
            if args.do_eval == 'yes' and (step + 1) % int(
                    num_train_optimization_steps / 10) == 0 and step > 450:
                for file in ['dev.csv']:
                    inference_labels = []
                    gold_labels = []
                    inference_logits = []
                    eval_examples = read_examples(os.path.join(
                        args.data_dir, file),
                                                  is_training=True,
                                                  label_name=args.label_name)
                    eval_features = convert_examples_to_features(
                        eval_examples, tokenizer, args.max_seq_length,
                        args.split_num, False)
                    all_input_ids = torch.tensor(select_field(
                        eval_features, 'input_ids'),
                                                 dtype=torch.long)
                    all_input_mask = torch.tensor(select_field(
                        eval_features, 'input_mask'),
                                                  dtype=torch.long)
                    all_segment_ids = torch.tensor(select_field(
                        eval_features, 'segment_ids'),
                                                   dtype=torch.long)
                    all_label = torch.tensor([f.label for f in eval_features],
                                             dtype=torch.long)

                    eval_data = TensorDataset(all_input_ids, all_input_mask,
                                              all_segment_ids, all_label)

                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", args.eval_batch_size)

                    # Run prediction for full data
                    eval_sampler = SequentialSampler(eval_data)
                    eval_dataloader = DataLoader(
                        eval_data,
                        sampler=eval_sampler,
                        batch_size=args.eval_batch_size)

                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            tmp_eval_loss = model(input_ids=input_ids,
                                                  token_type_ids=segment_ids,
                                                  attention_mask=input_mask,
                                                  labels=label_ids)
                            logits = model(input_ids=input_ids,
                                           token_type_ids=segment_ids,
                                           attention_mask=input_mask)

                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        inference_labels.append(np.argmax(logits, axis=1))
                        gold_labels.append(label_ids)
                        inference_logits.append(logits)
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    gold_labels = np.concatenate(gold_labels, 0)
                    inference_labels = np.concatenate(inference_labels, 0)
                    inference_logits = np.concatenate(inference_logits, 0)
                    model.train()
                    ###############################################
                    num_gold_0 = np.sum(gold_labels == 0)
                    num_gold_1 = np.sum(gold_labels == 1)
                    num_gold_2 = np.sum(gold_labels == 2)

                    right_0 = 0
                    right_1 = 0
                    right_2 = 0
                    error_0 = 0
                    error_1 = 0
                    error_2 = 0

                    for gold_label, inference_label in zip(
                            gold_labels, inference_labels):
                        if gold_label == inference_label:
                            if gold_label == 0:
                                right_0 += 1
                            elif gold_label == 1:
                                right_1 += 1
                            else:
                                right_2 += 1
                        elif inference_label == 0:
                            error_0 += 1
                        elif inference_label == 1:
                            error_1 += 1
                        else:
                            error_2 += 1

                    recall_0 = right_0 / (num_gold_0 + 1e-5)
                    recall_1 = right_1 / (num_gold_1 + 1e-5)
                    recall_2 = right_2 / (num_gold_2 + 1e-5)
                    precision_0 = right_0 / (error_0 + right_0 + 1e-5)
                    precision_1 = right_1 / (error_1 + right_1 + 1e-5)
                    precision_2 = right_2 / (error_2 + right_2 + 1e-5)
                    f10 = 2 * precision_0 * recall_0 / (precision_0 +
                                                        recall_0 + 1e-5)
                    f11 = 2 * precision_1 * recall_1 / (precision_1 +
                                                        recall_1 + 1e-5)
                    f12 = 2 * precision_2 * recall_2 / (precision_2 +
                                                        recall_2 + 1e-5)

                    output_dev_result_file = os.path.join(
                        args.output_dir, "dev_results.txt")
                    with open(output_dev_result_file, 'a',
                              encoding='utf-8') as f:
                        f.write('precision:' + str(precision_0) + ' ' +
                                str(precision_1) + ' ' + str(precision_2) +
                                '\n')
                        f.write('recall:' + str(recall_0) + ' ' +
                                str(recall_1) + ' ' + str(recall_2) + '\n')
                        f.write('f1:' + str(f10) + ' ' + str(f11) + ' ' +
                                str(f12) + '\n' + '\n')

                    eval_loss = eval_loss / nb_eval_steps
                    eval_accuracy = accuracy(inference_logits, gold_labels)
                    # draw loss.
                    eval_F1.append(round(eval_accuracy, 4))
                    ax.append(step)
                    plt.plot(ax,
                             eval_F1,
                             label='eval_F1',
                             linewidth=1,
                             color='r',
                             marker='o',
                             markerfacecolor='blue',
                             markersize=2)
                    for a, b in zip(ax, eval_F1):
                        plt.text(a, b, b, ha='center', va='bottom', fontsize=8)
                    plt.savefig(args.output_dir + '/labeled.jpg')

                    result = {
                        'eval_loss': eval_loss,
                        'eval_F1': eval_accuracy,
                        'global_step': global_step,
                        'loss': train_loss
                    }

                    output_eval_file = os.path.join(args.output_dir,
                                                    "eval_results.txt")
                    with open(output_eval_file, "a") as writer:
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                        writer.write('*' * 80)
                        writer.write('\n')
                    if eval_accuracy > best_acc and 'dev' in file:
                        print("=" * 80)
                        print("more accurate model arises, now best F1 = ",
                              eval_accuracy)
                        print("Saving Model......")
                        best_acc = eval_accuracy
                        # Save a trained model, only save the model it-self
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_model_file = os.path.join(
                            args.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        print("=" * 80)
                    '''
                    if (step+1) / int(num_train_optimization_steps/10) > 9.5:
                        print("=" * 80)
                        print("End of training. Saving Model......")
                        # Save a trained model, only save the model it-self
                        model_to_save = model.module if hasattr(model, 'module') else model
                        output_model_file = os.path.join(args.output_dir, "pytorch_model_final_step.bin")
                        torch.save(model_to_save.state_dict(), output_model_file)
                        print("=" * 80)
                    '''

    if args.do_test == 'yes':
        start_time = time.time()
        print(
            '___________________now testing for best eval f1 model_________________________'
        )
        try:
            del model
        except:
            pass
        gc.collect()
        args.do_train = 'no'
        model = BertForSequenceClassification.from_pretrained(os.path.join(
            args.output_dir, "pytorch_model.bin"),
                                                              args,
                                                              config=config)
        model.half()
        for layer in model.modules():
            if isinstance(layer, torch.nn.modules.batchnorm._BatchNorm):
                layer.float()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from "
                    "https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            model = DDP(model)
        elif args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        for file, flag in [('test.csv', 'test')]:
            inference_labels = []
            gold_labels = []
            eval_examples = read_examples(os.path.join(args.data_dir, file),
                                          is_training=False,
                                          label_name=args.label_name)
            eval_features = convert_examples_to_features(
                eval_examples, tokenizer, args.max_seq_length, args.split_num,
                False)
            all_input_ids = torch.tensor(select_field(eval_features,
                                                      'input_ids'),
                                         dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features,
                                                       'input_mask'),
                                          dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(
                eval_features, 'segment_ids'),
                                           dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features],
                                     dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(
                        input_ids=input_ids,
                        token_type_ids=segment_ids,
                        attention_mask=input_mask).detach().cpu().numpy()
                    # print('test_logits=', logits)
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(logits)
                gold_labels.append(label_ids)
            gold_labels = np.concatenate(gold_labels, 0)
            logits = np.concatenate(inference_labels, 0)
            if flag == 'dev':
                print(flag, accuracy(logits, gold_labels))
            elif flag == 'test':
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                df['label_2'] = logits[:, 2]
                df[['id', 'label_0', 'label_1',
                    'label_2']].to_csv(os.path.join(args.output_dir,
                                                    "sub.csv"),
                                       index=False)
                # df[['id', 'label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub.csv"), index=False)
            else:
                raise ValueError('flag not in [dev, test]')
        print('inference time used = {}s'.format(time.time() - start_time))
Example #4
def main():
    seed_everything(69)
    outPath = f"{args.results}_{args.model_name}_YpUnet_AG_hnn234"
    if not os.path.exists(outPath):
        os.makedirs(outPath)

    ts = str(datetime.datetime.now()).split(".")[0].replace(" ", "_")
    ts = ts.replace(":", "_").replace("-", "_")
    file_path = os.path.join(outPath,
                             "{}_run_{}.json".format(args.model_name, ts))
    ##############choose model##########################
    net = get_model(args.num_classes, args.model_name).to(device)

    if args.pre_train:
        net = torch.load(args.ckp)["model_state"]  # load the pretrained model
        print("loaded pre-trained model successfully")
    if torch.cuda.device_count() > 1:
        print("using multi gpu")
        net = torch.nn.DataParallel(net, device_ids=[0, 1, 2, 3])
    else:
        print('using one gpu')

    ##########hyper parameters setting#################
    # optimizer = Adam(net.parameters(), lr=args.lr)
    optimizer = RAdam(params=net.parameters(), lr=args.lr, weight_decay=0.0001)
    optimizer = Lookahead(optimizer)
    milestones = [5 + x * 30 for x in range(5)]
    scheduler_c = CyclicCosAnnealingLR(optimizer,
                                       milestones=milestones,
                                       eta_min=5e-5)
    # # scheduler_r = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', factor=0.2, patience=4, verbose=True)
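    # warm the lr up to 0.003 over the first 5 iterations, then hand off to the cyclic cosine-annealing schedule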
    scheduler = LearningRateWarmUP(optimizer=optimizer,
                                   target_iteration=5,
                                   target_lr=0.003,
                                   after_scheduler=scheduler_c)

    # criterion = FocalLoss2d().to(device)
    # criterion = BCEDiceLossWeighted().to(device)
    criterion = WeightedBceLoss().to(device)
    # criterion2 = WeightedBceLoss().to(device)

    ##########prepare dataset################################
    train_loader, val_loader = build_loader(batch_size=args.batch_size,
                                            num_workers=4)
    history = collections.defaultdict(list)
    best_miou = -100

    for epoch in range(args.num_epochs):
        print("Epoch: {}/{}".format(epoch + 1, args.num_epochs))
        # optimizer.step()
        # scheduler.step(epoch)
        ####################train####################################
        train_hist = train(train_loader, args.num_classes, device, net,
                           optimizer, criterion)
        print('loss', train_hist["loss"], 'miou', train_hist["miou"], 'fg_iou',
              train_hist["fg_iou"], 'mcc', train_hist["mcc"])

        for k, v in train_hist.items():
            history["train " + k].append(v)

        ###################### valid ##################################
        val_hist = validate(val_loader, args.num_classes, device, net,
                            scheduler, criterion)
        print('loss', val_hist["loss"], 'miou', val_hist["miou"], 'fg_iou',
              val_hist["fg_iou"], 'mcc', val_hist["mcc"])

        if val_hist["miou"] > best_miou:
            state = {
                "epoch": epoch + 1,
                "model_state": net,
                "best_miou": val_hist["miou"]
            }
            checkpoint = f'{args.model_name}_val_{val_hist["miou"]}_epoch{epoch + 1}.pth'
            torch.save(state, os.path.join(outPath, checkpoint))  # save model
            print("The model has saved successfully!")
            best_miou = val_hist["miou"]

        for k, v in val_hist.items():
            history["val " + k].append(v)

        with open(file_path, "w") as f:
            f.write(json.dumps(history))
Example #5
        shuffle=True,
        num_workers=config.data_loader_numworkers)

    #load data for testing
    test_loader = torch.utils.data.DataLoader(RoadSequenceDataset(
        file_path=config.test_path, transforms=op_tranforms),
                                              batch_size=args.test_batch_size,
                                              shuffle=False,
                                              num_workers=1)

    #load model
    model = UNet_TwoConvGRU(3, 2).to(device)
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # Adam parameters: betas=(0.9, 0.99)
    #optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.99))
    optimizer = RAdam(model.parameters(), lr=args.lr, betas=(0.9, 0.999))
    # optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)

    scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.5)
    class_weight = torch.Tensor(config.class_weight)
    criterion = torch.nn.CrossEntropyLoss(weight=class_weight).to(device)
    criterion2 = torch.nn.MSELoss().to(device)
    # best_acc = 0
    if config.pretrained_path:
        print('loading------------------')
        pretrained_dict = torch.load(config.pretrained_path)
        model_dict = model.state_dict()
        #
        pretrained_dict_1 = {
            k: v
            for k, v in pretrained_dict.items() if (k in model_dict)
Example #6
# schedule = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.333, patience=2, verbose=True)
criterion = nn.CrossEntropyLoss()
# criterion = LabelSmoothSoftmaxCE(lb_pos=0.9, lb_neg=5e-3)

'''
dataset.get_step() returns the total number of iterations over the data
'''
best_score = 0

print(max_epoch)
print('------------------start training------------------------')
for i, net in enumerate(nets):
    min_loss = 1000
    print('----------------------start net{}---------------------'.format(i))
    optimizer = RAdam(net.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
    schedule = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.333, patience=5, verbose=True)
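    # cut the learning rate to a third when the validation loss plateaus for 5 epochs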
    for step in range(max_epoch):
        # model.save_model(net, MODEL_PATH, overwrite=True)
        # break
        train_loss, train_acc = train(net, train_loader, optimizer, criterion)
        valid_loss, valid_acc = valid(net, val_loader, optimizer, criterion)
        schedule.step(valid_loss)
        # '''
        # implement your own model-saving logic here
        # '''


        if valid_loss < min_loss:
            model.save_model(net, MODEL_PATH, name='net_'+str(i)+'.pkl', overwrite=False)
Example #7
def main():
    num_epochs = 100
    train_x, train_y, valid_x, valid_y, test_x, test_y = load_data()
    model = CNN_LSTM()

    CUDA = torch.cuda.is_available()
    if CUDA:
        model = model.cuda()

    criterion = nn.CrossEntropyLoss()
    optimizer = RAdam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    # optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    # optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=1e-4,momentum=0.9)
    train_loss = []
    train_accuracy = []
    valid_loss = []
    valid_accuracy = []
    for epoch in range(num_epochs):
        start = time.time()

        # adjust_learning_rate(optimizer,epoch)
        loss_avg, accuracy_avg = train(train_x, train_y, model, criterion,
                                       optimizer)
        valid_loss_avg, valid_acc_avg = valid(valid_x, valid_y, model,
                                              criterion)

        train_loss.append(loss_avg)
        train_accuracy.append(accuracy_avg)
        valid_loss.append(valid_loss_avg)
        valid_accuracy.append(valid_acc_avg)
        stop = time.time()
        print(
            'Epoch: [%d | %d] LR: %f Train:Loss_Avg=%f Accuracy_Avg=%f Valid: Loss=%f Accuracy=%f Time: %f'
            % (epoch + 1, num_epochs, learning_rate, loss_avg, accuracy_avg,
               valid_loss_avg, valid_acc_avg, stop - start))

    test_accuracy = test(test_x, test_y, model)
    print('Test Accuracy: {:.3f} %'.format(test_accuracy))

    # torch.save(model,"base_cnn")

    if torch.cuda.is_available():
        inputs = test_x.cuda()
        labels = test_y.cuda()

    outputs = model(inputs)
    _, predicted = torch.max(outputs.data, 1)

    correct = ((predicted == labels).sum().item()) / labels.size(0)
    print('Test Accuracy: {:.3f} %'.format(100 * correct))

    print("Precision")
    print(
        precision_score(predicted.cpu().numpy(),
                        labels.cpu().numpy(),
                        average=None))
    print(
        precision_score(predicted.cpu().numpy(),
                        labels.cpu().numpy(),
                        average='weighted'))
    print("Recall")
    print(
        recall_score(predicted.cpu().numpy(),
                     labels.cpu().numpy(),
                     average=None))
    print(
        recall_score(predicted.cpu().numpy(),
                     labels.cpu().numpy(),
                     average='weighted'))

    print(
        "F1 Accuracy:",
        sk.metrics.f1_score(predicted.cpu().numpy(),
                            labels.cpu().numpy(),
                            average=None))
    print(
        "F1 Accuracy:",
        sk.metrics.f1_score(predicted.cpu().numpy(),
                            labels.cpu().numpy(),
                            average='weighted'))

    np.save("predicted", predicted.cpu().numpy())
    np.save("labels", labels.cpu().numpy())

    print("Confustion Matrix:")
    class_names = ['Start_gesture', 'Unknown']
    class_names = np.array(class_names)
    plot_confusion_matrix(predicted.cpu().numpy(),
                          labels.cpu().numpy(),
                          classes=class_names,
                          normalize=True,
                          title='Normalized confusion matrix')
    plt.savefig('foo.png')  # save before a new figure is opened, otherwise a blank canvas would be written
    plt.show()
    print()

    # Loss
    f = plt.figure(figsize=(10, 10))
    plt.plot(train_loss, label='Training Loss')
    plt.plot(valid_loss, label='Valid Loss')
    plt.legend()
    plt.show()

    # Accuracy
    f = plt.figure(figsize=(10, 10))
    plt.plot(train_accuracy, label='Training Accuracy')
    plt.plot(valid_accuracy, label='Valid Accuracy')
    plt.legend()
    plt.show()

    train_loss = np.asarray(train_loss)
    train_accuracy = np.asarray(train_accuracy)
    valid_loss = np.asarray(valid_loss)
    valid_accuracy = np.asarray(valid_accuracy)
    np.save('train_loss', train_loss)
    np.save('train_accuracy', train_accuracy)
    np.save('valid_loss', valid_loss)
    np.save('valid_accuracy', valid_accuracy)
Example #8
def train(train_loader,
          val_loader,
          model,
          device,
          experiment_name,
          epochs=20,
          optimizer_name='radam',
          lr=1e-3,
          weight_decay=0,
          unfreeze_extractor_epoch=0,
          extractor_lr=1e-4,
          log_interval=50):
    freeze_extractor = True
    model.feature_extractor.toggle_extractor(freeze=True)

    parameter_groups = [
        {
            'params': filter_params(model.parameters()),
            'lr': lr
        },
    ]

    if optimizer_name == 'adam':
        optimizer = optim.Adam(parameter_groups,
                               lr=lr,
                               betas=(0.9, 0.999),
                               weight_decay=weight_decay)
    elif optimizer_name == 'radam':
        from radam import RAdam
        optimizer = RAdam(parameter_groups,
                          lr=lr,
                          betas=(0.9, 0.999),
                          weight_decay=weight_decay)
    elif optimizer_name == 'rmsprop':
        optimizer = optim.RMSprop(parameter_groups,
                                  lr=lr,
                                  weight_decay=weight_decay)
    else:
        raise ValueError(f'Unknown optimizer {optimizer_name}')

    for epoch in range(1, epochs + 1):
        print('Learning rate is {}'.format(optimizer.param_groups[0]['lr']))

        if epoch - 1 == unfreeze_extractor_epoch:
            print('Unfreezing extractor')
            freeze_extractor = False
            model.feature_extractor.toggle_extractor(freeze=False)
            pgroup = {
                'params':
                filter_params(model.feature_extractor.extractor.parameters()),
                'lr':
                extractor_lr
            }
            optimizer.add_param_group(pgroup)

        start_time = time.time()
        train_epoch(train_loader,
                    model,
                    optimizer,
                    device,
                    epoch,
                    log_interval,
                    freeze_extractor=freeze_extractor)
        end_time = time.time()
        print(f'Epoch took {end_time-start_time:.2f} seconds')

        val_epoch(val_loader, model, device)

        save_model(model, experiment_name)

    return model
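
filter_params above is not defined in this snippet; a plausible minimal version, assuming it simply keeps the parameters that still require gradients:

def filter_params(params):
    # keep only parameters that will actually receive gradients
    return [p for p in params if p.requires_grad]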
Example #9
EGG_prior.load_state_dict(torch.load('./models/AAI_EGGAE/best_val-cosloss-ranger.pth'))
EGG_prior.eval()

STZ = FCEncoder()
STZ.cuda()
ZTE = FCDecoder()
ZTE.cuda()

DC = SimpleDiscriminator()
DC.cuda()

# encoder/decoder optimizers
# STZ_optimizer_gen = torch.optim.Adam(STZ.parameters(), lr=gen_lr)
# ZTE_optimizer = torch.optim.Adam(ZTE.parameters(), lr=gen_lr)

STZ_optimizer_gen = RAdam(STZ.parameters(), lr=gen_lr)
ZTE_optimizer = RAdam(ZTE.parameters(), lr=gen_lr)

STZ_optimizer_gen = Lookahead(STZ_optimizer_gen, alpha=0.5, k=5)
ZTE_optimizer = Lookahead(ZTE_optimizer, alpha=0.5, k=5)

# regularizing optimizer

# STZ_optimizer_enc = torch.optim.Adam(STZ.parameters(), lr=reg_lr)
# DC_optimizer = torch.optim.Adam(DC.parameters(), lr=reg_lr)

STZ_optimizer_enc = RAdam(STZ.parameters(), lr=reg_lr)
DC_optimizer = RAdam(DC.parameters(), lr=reg_lr)

STZ_optimizer_enc = Lookahead(STZ_optimizer_enc, alpha=0.5, k=5)
DC_optimizer = Lookahead(DC_optimizer, alpha=0.5, k=5)
Example #10
File: main_1.py  Project: WendaDeng/ckmn
            lr=opt.learning_rate,
            momentum=opt.momentum,
            dampening=dampening,
            weight_decay=opt.weight_decay,
            nesterov=opt.nesterov)
    elif opt.optimizer == 'adam':
        optimizer = optim.Adam(
            parameters,
            lr=opt.learning_rate,
            betas=(0.9, 0.999),
            eps=1e-8,
            weight_decay=opt.weight_decay)
    elif opt.optimizer == 'radam':
        optimizer = RAdam(
            parameters,
            lr=opt.learning_rate,
            betas=(0.9, 0.999),
            weight_decay=opt.weight_decay)

    normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    ## prepare train
    if not opt.no_train:
        temporal_transform = TemporalSegmentRandomCrop(opt.segment_number, opt.sample_duration)

        assert opt.train_crop in ['random', 'corner', 'center']
        if opt.train_crop == 'random':
            sceobj_crop_method = MultiScaleRandomCrop(opt.scales, opt.sceobj_frame_size)
        elif opt.train_crop == 'corner':
            sceobj_crop_method = MultiScaleCornerCrop(opt.scales, opt.sceobj_frame_size)
        elif opt.train_crop == 'center':
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--encoder', type=str, default='efficientnet-b0')
    parser.add_argument('--model', type=str, default='unet')
    parser.add_argument('--pretrained', type=str, default='imagenet')
    parser.add_argument('--logdir', type=str, default='../logs/')
    parser.add_argument('--exp_name', type=str)
    parser.add_argument('--data_folder', type=str, default='../input/')
    parser.add_argument('--height', type=int, default=320)
    parser.add_argument('--width', type=int, default=640)
    parser.add_argument('--batch_size', type=int, default=2)
    parser.add_argument('--accumulate', type=int, default=8)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--enc_lr', type=float, default=1e-2)
    parser.add_argument('--dec_lr', type=float, default=1e-3)
    parser.add_argument('--optim', type=str, default="radam")
    parser.add_argument('--loss', type=str, default="bcedice")
    parser.add_argument('--schedule', type=str, default="rlop")
    parser.add_argument('--early_stopping', type=bool, default=True)

    args = parser.parse_args()

    encoder = args.encoder
    model = args.model
    pretrained = args.pretrained
    logdir = args.logdir
    name = args.exp_name
    data_folder = args.data_folder
    height = args.height
    width = args.width
    bs = args.batch_size
    accumulate = args.accumulate
    epochs = args.epochs
    enc_lr = args.enc_lr
    dec_lr = args.dec_lr
    optim = args.optim
    loss = args.loss
    schedule = args.schedule
    early_stopping = args.early_stopping

    if model == 'unet':
        model = smp.Unet(encoder_name=encoder,
                         encoder_weights=pretrained,
                         classes=4,
                         activation=None)
    elif model == 'fpn':
        model = smp.FPN(
            encoder_name=encoder,
            encoder_weights=pretrained,
            classes=4,
            activation=None,
        )
    elif model == 'pspnet':
        model = smp.PSPNet(
            encoder_name=encoder,
            encoder_weights=pretrained,
            classes=4,
            activation=None,
        )
    elif model == 'linknet':
        model = smp.Linknet(
            encoder_name=encoder,
            encoder_weights=pretrained,
            classes=4,
            activation=None,
        )
    elif model == 'aspp':
        print('aspp can only be used with resnet34')
        model = aspp(num_class=4)

    preprocessing_fn = smp.encoders.get_preprocessing_fn(encoder, pretrained)
    log = os.path.join(logdir, name)

    ds = get_dataset(path=data_folder)
    prepared_ds = prepare_dataset(ds)

    train_set, valid_set = get_train_test(ds)

    train_ds = CloudDataset(df=prepared_ds,
                            datatype='train',
                            img_ids=train_set,
                            transforms=training1(h=height, w=width),
                            preprocessing=get_preprocessing(preprocessing_fn),
                            folder=data_folder)
    valid_ds = CloudDataset(df=prepared_ds,
                            datatype='train',
                            img_ids=valid_set,
                            transforms=valid1(h=height, w=width),
                            preprocessing=get_preprocessing(preprocessing_fn),
                            folder=data_folder)

    train_loader = DataLoader(train_ds,
                              batch_size=bs,
                              shuffle=True,
                              num_workers=multiprocessing.cpu_count())
    valid_loader = DataLoader(valid_ds,
                              batch_size=bs,
                              shuffle=False,
                              num_workers=multiprocessing.cpu_count())

    loaders = {
        'train': train_loader,
        'valid': valid_loader,
    }

    num_epochs = epochs

    if args.model != "aspp":
        if optim == "radam":
            optimizer = RAdam([
                {
                    'params': model.encoder.parameters(),
                    'lr': enc_lr
                },
                {
                    'params': model.decoder.parameters(),
                    'lr': dec_lr
                },
            ])
        if optim == "adam":
            optimizer = Adam([
                {
                    'params': model.encoder.parameters(),
                    'lr': enc_lr
                },
                {
                    'params': model.decoder.parameters(),
                    'lr': dec_lr
                },
            ])
        if optim == "adamw":
            optimizer = AdamW([
                {
                    'params': model.encoder.parameters(),
                    'lr': enc_lr
                },
                {
                    'params': model.decoder.parameters(),
                    'lr': dec_lr
                },
            ])
        if optim == "sgd":
            optimizer = SGD([
                {
                    'params': model.encoder.parameters(),
                    'lr': enc_lr
                },
                {
                    'params': model.decoder.parameters(),
                    'lr': dec_lr
                },
            ])
    elif args.model == 'aspp':
        if optim == "radam":
            optimizer = RAdam([
                {
                    'params': model.parameters(),
                    'lr': enc_lr
                },
            ])
        if optim == "adam":
            optimizer = Adam([
                {
                    'params': model.parameters(),
                    'lr': enc_lr
                },
            ])
        if optim == "adamw":
            optimizer = AdamW([
                {
                    'params': model.parameters(),
                    'lr': enc_lr
                },
            ])
        if optim == "sgd":
            optimizer = SGD([
                {
                    'params': model.parameters(),
                    'lr': enc_lr
                },
            ])

    scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=5)
    if schedule == "rlop":
        scheduler = ReduceLROnPlateau(optimizer, factor=0.15, patience=3)
    if schedule == "noam":
        scheduler = NoamLR(optimizer, 10)

    if loss == "bcedice":
        criterion = smp.utils.losses.BCEDiceLoss(eps=1.)
    if loss == "dice":
        criterion = smp.utils.losses.DiceLoss(eps=1.)
    if loss == "bcejaccard":
        criterion = smp.utils.losses.BCEJaccardLoss(eps=1.)
    if loss == "jaccard":
        criterion = smp.utils.losses.JaccardLoss(eps=1.)
    if loss == 'bce':
        criterion = NewBCELoss()

    callbacks = [NewDiceCallback(), CriterionCallback()]

    callbacks.append(OptimizerCallback(accumulation_steps=accumulate))
    if early_stopping:
        callbacks.append(EarlyStoppingCallback(patience=5, min_delta=0.001))

    runner = SupervisedRunner()
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        callbacks=callbacks,
        logdir=log,
        num_epochs=num_epochs,
        verbose=True,
    )
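The run above relies on the Catalyst OptimizerCallback(accumulation_steps=accumulate) to enlarge the effective batch size. For reference, here is a minimal plain-PyTorch sketch of what that accumulation amounts to; the model, loader, and criterion arguments are placeholders rather than objects from this script.

def train_with_accumulation(model, loader, optimizer, criterion, accumulate=8, device='cpu'):
    # Accumulate gradients over `accumulate` small batches before each optimizer
    # step, approximating a single batch of size batch_size * accumulate.
    model.train()
    optimizer.zero_grad()
    for step, (x, y) in enumerate(loader):
        x, y = x.to(device), y.to(device)
        loss = criterion(model(x), y) / accumulate  # scale so the summed gradient matches one large batch
        loss.backward()                             # gradients add up between optimizer steps
        if (step + 1) % accumulate == 0:
            optimizer.step()
            optimizer.zero_grad()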
Example #12
0
File: train.py  Project: chicm/clouds
def train(args):
    model, model_file = create_model(args.encoder_type, work_dir=args.work_dir)
    if torch.cuda.device_count() > 1:
        model = DataParallel(model)
    model = model.cuda()

    loaders = get_train_val_loaders(args.encoder_type,
                                    batch_size=args.batch_size,
                                    ifold=args.ifold)

    #optimizer = RAdam([
    #    {'params': model.decoder.parameters(), 'lr': args.lr},
    #    {'params': model.encoder.parameters(), 'lr': args.lr / 10.},
    #])
    if args.optim_name == 'RAdam':
        optimizer = RAdam(model.parameters(), lr=args.lr)
    elif args.optim_name == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
    elif args.optim_name == 'SGD':
        optimizer = optim.SGD(model.parameters(), momentum=0.9, lr=args.lr)

    if args.lrs == 'plateau':
        lr_scheduler = ReduceLROnPlateau(optimizer,
                                         mode='max',
                                         factor=args.factor,
                                         patience=args.patience,
                                         min_lr=args.min_lr)
    else:
        lr_scheduler = CosineAnnealingLR(optimizer,
                                         args.t_max,
                                         eta_min=args.min_lr)

    best_metrics = 0.
    best_key = 'dice'

    print(
        'epoch |    lr    |      %        |  loss  |  avg   |  dloss |  closs  |  dice  |  best  | time |  save |'
    )

    if not args.no_first_val:
        val_metrics = validate(args, model, loaders['valid'])
        print(
            'val   |          |               |        |        | {:.4f} | {:.4f} | {:.4f} | {:.4f} |        |        |'
            .format(val_metrics['dice_loss'], val_metrics['cls_loss'],
                    val_metrics['dice'], val_metrics['dice']))

        best_metrics = val_metrics[best_key]

    if args.val:
        return

    model.train()

    #if args.lrs == 'plateau':
    #    lr_scheduler.step(best_metrics)
    #else:
    #    lr_scheduler.step()
    train_iter = 0

    for epoch in range(args.num_epochs):
        train_loss = 0

        current_lr = get_lrs(optimizer)
        bg = time.time()
        for batch_idx, data in enumerate(loaders['train']):
            train_iter += 1
            img = data[0].cuda()
            mask_targets, cls_targets = data[1][0].cuda(), data[1][1].cuda()
            batch_size = img.size(0)

            outputs = model(img)
            dice_loss, cls_loss = criterion(outputs,
                                            (mask_targets, cls_targets))
            ((dice_loss + cls_loss) * batch_size).backward()

            optimizer.step()
            optimizer.zero_grad()

            train_loss += dice_loss.item() + cls_loss.item()
            print('\r {:4d} | {:.6f} | {:06d}/{} | {:.4f} | {:.4f} |'.format(
                epoch, float(current_lr[0]), args.batch_size * (batch_idx + 1),
                loaders['train'].num,
                dice_loss.item() + cls_loss.item(),
                train_loss / (batch_idx + 1)),
                  end='')

            if train_iter > 0 and train_iter % args.iter_val == 0:
                save_model(model, model_file + '_latest')
                val_metrics = validate(args, model, loaders['valid'])

                _save_ckp = ''
                if val_metrics[best_key] > best_metrics:
                    best_metrics = val_metrics[best_key]
                    save_model(model, model_file)
                    _save_ckp = '*'
                print(' {:.4f} | {:.4f} | {:.4f} | {:.4f} | {:.2f} |  {:4s} |'.
                      format(val_metrics['dice_loss'], val_metrics['cls_loss'],
                             val_metrics['dice'], best_metrics,
                             (time.time() - bg) / 60, _save_ckp))

                model.train()

                if args.lrs == 'plateau':
                    lr_scheduler.step(best_metrics)
                else:
                    lr_scheduler.step()
                current_lr = get_lrs(optimizer)
Example #13
0
    from model.cain_noca import CAIN_NoCA
    print("Building model: CAIN_NoCA")
    model = CAIN_NoCA(depth=args.depth)
else:
    raise NotImplementedError("Unknown model!")
# Just make every model to DataParallel
model = torch.nn.DataParallel(model).to(device)
#print(model)

##### Define Loss & Optimizer #####
criterion = Loss(args)

args.radam = False  # hard-coded override: the RAdam branch below never runs as written
if args.radam:
    from radam import RAdam
    optimizer = RAdam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2))
else:
    from torch.optim import Adam
    optimizer = Adam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2))
print('# of parameters: %d' % sum(p.numel() for p in model.parameters()))


# If resume, load checkpoint: model + optimizer
if args.resume:
    utils.load_checkpoint(args, model, optimizer)
    it = args.it

# Learning Rate Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5, verbose=True)
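A note on ReduceLROnPlateau, which several of these examples use: it only reacts to the value passed to step(), so with mode='min' the monitored validation loss has to be fed in explicitly each epoch. A self-contained sketch with a toy parameter and a constant placeholder metric:

import torch

param = torch.nn.Parameter(torch.zeros(1))
opt = torch.optim.Adam([param], lr=1e-3)
sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min', factor=0.5, patience=5)

for epoch in range(20):
    val_loss = 1.0        # placeholder; in practice this comes from the validation set
    sched.step(val_loss)  # the lr is halved after `patience` epochs without improvement
print(opt.param_groups[0]['lr'])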
Example #14
0
def create_custom_optimizer(tvars,
                            loss,
                            bert_init_lr,
                            task_init_lr,
                            num_train_steps,
                            num_warmup_steps,
                            use_tpu,
                            global_step=None,
                            freeze=-1,
                            task_opt='adam',
                            eps=1e-6):
    """Creates an optimizer training op."""
    if global_step is None:
        global_step = tf.train.get_or_create_global_step()

    bert_learning_rate = tf.constant(value=bert_init_lr,
                                     shape=[],
                                     dtype=tf.float32)
    task_learning_rate = tf.constant(value=task_init_lr,
                                     shape=[],
                                     dtype=tf.float32)

    # Implements linear decay of the learning rate.
    bert_learning_rate = tf.train.polynomial_decay(bert_learning_rate,
                                                   global_step,
                                                   num_train_steps,
                                                   end_learning_rate=0.0,
                                                   power=1.0,
                                                   cycle=False)
    task_learning_rate = tf.train.polynomial_decay(task_learning_rate,
                                                   global_step,
                                                   num_train_steps,
                                                   end_learning_rate=0.0,
                                                   power=1.0,
                                                   cycle=False)

    # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
    # learning rate will be `global_step/num_warmup_steps * init_lr`.
    if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

        warmup_percent_done = global_steps_float / warmup_steps_float
        bert_warmup_learning_rate = bert_init_lr * warmup_percent_done
        task_warmup_learning_rate = task_init_lr * warmup_percent_done

        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        bert_learning_rate = ((1.0 - is_warmup) * bert_learning_rate +
                              is_warmup * bert_warmup_learning_rate)

    # It is recommended that you use this optimizer for fine tuning, since this
    # is how the model was trained (note that the Adam m/v variables are NOT
    # loaded from init_checkpoint.)
    bert_optimizer = AdamWeightDecayOptimizer(
        learning_rate=bert_learning_rate,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=eps,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
    if task_opt == 'adam_weight_decay':
        task_optimizer = AdamWeightDecayOptimizer(
            learning_rate=task_learning_rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=eps)
    elif task_opt == 'adam':
        task_optimizer = tf.train.AdamOptimizer(
            learning_rate=task_learning_rate)
    elif task_opt == 'radam':
        task_optimizer = RAdam(learning_rate=task_learning_rate,
                               epsilon=1e-8,
                               beta1=0.9,
                               beta2=0.999)
    else:
        raise NotImplementedError(
            'Check optimizer. {} is invalid.'.format(task_opt))

    # tvars = tf.trainable_variables()
    bert_vars, task_vars = [], []
    for var in tvars:
        if var.name.startswith('bert'):
            can_optimize = False
            if var.name.startswith('bert/encoder/layer_') and int(
                    var.name.split('/')[2][len('layer_'):]) >= freeze:
                can_optimize = True
            if freeze == -1 or can_optimize:
                bert_vars.append(var)
        else:
            task_vars.append(var)
    print('bert:task', len(bert_vars), len(task_vars))
    grads = tf.gradients(loss, bert_vars + task_vars)
    bert_grads = grads[:len(bert_vars)]
    task_grads = grads[len(bert_vars):]

    # This is how the model was pre-trained.
    (bert_grads, _) = tf.clip_by_global_norm(bert_grads, clip_norm=1.0)
    (task_grads, _) = tf.clip_by_global_norm(task_grads, clip_norm=1.0)

    # global_step1 = tf.Print(global_step, [global_step], 'before')
    bert_train_op = bert_optimizer.apply_gradients(zip(bert_grads, bert_vars),
                                                   global_step=global_step)
    task_train_op = task_optimizer.apply_gradients(zip(task_grads, task_vars),
                                                   global_step=global_step)
    if task_opt == 'adam_weight_decay':
        new_global_step = global_step + 1
        train_op = tf.group(bert_train_op, task_train_op,
                            [global_step.assign(new_global_step)])
    else:
        train_op = tf.group(bert_train_op, task_train_op)
    return train_op
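The comments above describe the intended schedule shape: linear warmup to the initial rate over num_warmup_steps, then a power-1 polynomial decay to zero over num_train_steps. A small framework-free sketch of that shape (the numbers are arbitrary, chosen only to make the printout readable):

def lr_at_step(step, init_lr, num_train_steps, num_warmup_steps):
    # polynomial_decay with power=1.0 and end_learning_rate=0.0
    decayed = init_lr * max(0.0, 1.0 - step / num_train_steps)
    if num_warmup_steps and step < num_warmup_steps:
        # linear warmup: global_step / num_warmup_steps * init_lr
        return init_lr * step / num_warmup_steps
    return decayed

print([round(lr_at_step(s, 2e-5, 1000, 100), 7) for s in (0, 50, 100, 500, 1000)])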
def train_vae(loader,
              device,
              stats_logger,
              lr=1e-3,
              schedule_lr=False,
              latent_dims=16,
              epochs=100,
              optimizer_name='adam',
              adam_beta1=0.5,
              loss_weights=None,
              extractor_lr=1e-5,
              clip_gradients=None,
              encoder_class=vae.Encoder,
              decoder_class=vae.Decoder,
              schedule_classes=None,
              beta_schedule_class=BetaSchedule):
    """Entry point for VAE training"""
    repr_dims = loader.dataset.dims

    encoder = encoder_class(repr_dims, latent_dims)
    decoder = decoder_class(repr_dims, latent_dims)
    model = vae.VAE(encoder, decoder).to(device)

    if schedule_lr:
        # LambdaLR multiplies the initial learning rate with the value
        # returned from lambda each epoch. If we want to directly use the
        # value returned from lambda as the learning rate, we can set an
        # initial learning rate of 1.
        initial_lr = lr
        lr = 1.0

    parameter_groups = [
        {
            'params': model.parameters(),
            'lr': lr
        },
    ]

    if optimizer_name == 'adam':
        optimizer = optim.Adam(parameter_groups,
                               lr=lr,
                               betas=(adam_beta1, 0.999))
    elif optimizer_name == 'radam':
        from radam import RAdam
        optimizer = RAdam(parameter_groups, lr=lr, betas=(adam_beta1, 0.999))
    elif optimizer_name == 'rmsprop':
        optimizer = optim.RMSprop(parameter_groups, lr=lr)
    else:
        raise ValueError(f'Unknown optimizer {optimizer_name}')

    if schedule_lr:
        lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                                   LRSchedule(initial_lr))

    if loss_weights is None:
        loss_weights = {}
    else:
        assert isinstance(loss_weights, dict), \
            'Loss weights must be a dictionary `loss_name -> weight`'

    if schedule_classes is None:
        schedule_classes = {}
    else:
        assert isinstance(schedule_classes, dict), \
            'schedules_classes must be a dictionary `loss_name -> schedule_class`'
    schedule_classes['KLD'] = beta_schedule_class

    loss_schedules = {
        name: schedule_class()
        for name, schedule_class in schedule_classes.items()
    }

    print('Training VAE on features...')
    for epoch in range(1, epochs + 1):
        print('Learning rate is {}'.format(optimizer.param_groups[0]['lr']))
        for name, schedule in loss_schedules.items():
            if name == 'KLD':
                # Special case for KLD's weight (beta)
                if isinstance(schedule, BetaSchedule):
                    beta = schedule.get_beta(epoch - 1)
                    loss_weights['KLD'] = beta
                    print(f'Beta is {beta}')
            else:
                loss_weights[name] = schedule.get_beta(epoch - 1)

        if model.reg_loss.use_bayes_factor_vae0_loss:
            variances = (
                1 / model.reg_loss.log_precision.exp()).cpu().detach().numpy()
            print(variances[variances > 1])

        start_time = time.time()
        epoch_stats = train_epoch(loader, model, optimizer, device, epoch, 1,
                                  loss_weights, stats_logger, clip_gradients)
        end_time = time.time()
        print(f'Epoch took {end_time-start_time:.2f} seconds')
        stats_logger.append(epoch - 1, epoch_stats)

        if schedule_lr:
            lr_scheduler.step()

    return model
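The comment inside train_vae explains the LambdaLR convention: the value returned by the lambda is multiplied by the optimizer's initial learning rate, so setting that initial rate to 1.0 lets the lambda return the absolute rate directly. A minimal sketch of the trick, with a hypothetical exponential schedule standing in for the LRSchedule class used above:

import torch

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.Adam([param], lr=1.0)  # base lr of 1.0, as in the comment above
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 1e-3 * 0.95 ** epoch)

for epoch in range(3):
    print(epoch, optimizer.param_groups[0]['lr'])  # 0.001, 0.00095, 0.0009025
    scheduler.step()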
        attention=True,
    )

    model = model.to(device)

    # convert to half precision
    model.half()
    for layer in model.modules():
        if isinstance(layer, nn.BatchNorm2d):
            layer.float()

    # Add weights from checkpoint model if specified
    if opt.checkpoint_model:
        model.load_state_dict(torch.load(opt.checkpoint_model), )

    optimizer = RAdam(model.parameters(), lr=0.0001, eps=1e-04)

    #Stating the epoch
    for epoch in range(opt.num_epochs):
        #epoch+=104
        epoch_metrics = {"loss": [], "acc": []}
        prev_time = time.time()
        print(f"--- Epoch {epoch}---")
        for batch_i, (X, y) in enumerate(train_dataloader):
            if X.size(0) == 1:
                continue

            image_sequences = Variable(X.to(device), requires_grad=True)
            labels = Variable(y.to(device), requires_grad=False)

            image_sequences = image_sequences.half()
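The block above casts the model to half precision while keeping BatchNorm2d layers in float32, and uses a larger eps of 1e-4 for RAdam; both are common workarounds for fp16 numerical issues. A hedged helper form of the same pattern, extended to the other BatchNorm variants (a sketch, not part of the original script):

import torch.nn as nn

def to_half_keep_batchnorm(model: nn.Module) -> nn.Module:
    # Cast parameters to fp16, but keep BatchNorm in fp32 so its running
    # statistics and variance epsilon stay numerically stable.
    model.half()
    for layer in model.modules():
        if isinstance(layer, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)):
            layer.float()
    return model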
Example #17
0
if args.optimizer.lower()=='sgd':
    optimizer = optim.SGD(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower()=='sgdwm':  # elif keeps the chain exclusive so 'sgd' does not fall into the final else
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                      weight_decay=args.weight_decay)
elif args.optimizer.lower()=='adam':
    optimizer = torch.optim.Adam(net.parameters(), lr=args.lr,
                      weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'rmsprop':
    optimizer = optim.RMSprop(net.parameters(),lr=args.lr, momentum=args.momentum,
                      weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'radam':
    from radam import RAdam
    optimizer = RAdam(net.parameters(),lr=args.lr,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lars':#no tensorboardX
    from lars import LARS
    optimizer = LARS(net.parameters(), lr=args.lr,momentum=args.momentum,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lamb':
    from lamb import Lamb
    optimizer  = Lamb(net.parameters(),lr=args.lr,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'novograd':
    from novograd import NovoGrad
    optimizer = NovoGrad(net.parameters(), lr=args.lr,weight_decay=args.weight_decay)
else:
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)
# lrs = create_lr_scheduler(args.warmup_epochs, args.lr_decay)
# lr_scheduler = LambdaLR(optimizer,lrs)
def lrs(batch):
Example #18
0
def train(args):
    print('start training...')
    model, model_file = create_model(args)
    train_loader, val_loader = get_train_val_loaders(batch_size=args.batch_size, val_batch_size=args.val_batch_size)
    #train_loader, val_loader = get_frame_train_loader(batch_size=args.batch_size, val_batch_size=args.val_batch_size)
    #model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0)

    if args.optim == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0001)
    elif args.optim == 'RAdam':
        optimizer = RAdam(model.parameters(), lr=args.lr, weight_decay=0.0001)
    else:
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=0.0001)

    if args.lrs == 'plateau':
        lr_scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=args.factor, patience=args.patience, min_lr=args.min_lr)
    else:
        lr_scheduler = CosineAnnealingLR(optimizer, args.t_max, eta_min=args.min_lr)

    model = model.cuda()
    if torch.cuda.device_count() > 1:
        model_name = model.name
        model = DataParallel(model)
        model.name = model_name

    #model=model.train()

    best_f2 = 0.
    best_key = 'top1'

    print('epoch |    lr     |       %        |  loss  |  avg   |  loss  |  top1   |  top10  |  best  | time |  save |')

    if not args.no_first_val:
        val_metrics = validate(args, model, val_loader)
        print('val   |           |                |        |        | {:.4f} | {:.4f} | {:.4f} | {:.4f} |       |        |'.format(
            val_metrics['valid_loss'], val_metrics['top1'], val_metrics['top10'], val_metrics[best_key] ))

        best_f2 = val_metrics[best_key]

    if args.val:
        return

    model.train()

    if args.lrs == 'plateau':
        lr_scheduler.step(best_f2)
    else:
        lr_scheduler.step()

    train_iter = 0

    for epoch in range(args.start_epoch, args.num_epochs):
        #train_loader, val_loader = get_train_val_loaders(batch_size=args.batch_size, val_batch_size=args.val_batch_size, val_num=args.val_num)

        train_loss = 0

        current_lr = get_lrs(optimizer)
        bg = time.time()
        for batch_idx, data in enumerate(train_loader):
            train_iter += 1
            #if train_loader.seg:
            rgb, audio, labels = [x.cuda() for x in data]
            #else:
            #    rgb, audio, labels = data[0].cuda(), data[2].cuda(), data[4].cuda()
            
            output = model(rgb, audio)
            
            loss = criterion(output, labels)
            batch_size = rgb.size(0)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            #with amp.scale_loss(loss, optimizer) as scaled_loss:
            #    scaled_loss.backward()

            train_loss += loss.item()
            print('\r {:4d} | {:.7f} | {:06d}/{} | {:.4f} | {:.4f} |'.format(
                epoch, float(current_lr[0]), args.batch_size*(batch_idx+1), train_loader.num, loss.item(), train_loss/(batch_idx+1)), end='')

            if train_iter > 0 and train_iter % args.iter_val == 0:
                if isinstance(model, DataParallel):
                    torch.save(model.module.state_dict(), model_file+'_latest')
                else:
                    torch.save(model.state_dict(), model_file+'_latest')

                val_metrics = validate(args, model, val_loader)
                
                _save_ckp = ''
                if args.always_save or val_metrics[best_key] > best_f2:
                    best_f2 = val_metrics[best_key]
                    if isinstance(model, DataParallel):
                        torch.save(model.module.state_dict(), model_file)
                    else:
                        torch.save(model.state_dict(), model_file)
                    _save_ckp = '*'
                print(' {:.4f} | {:.4f} | {:.4f} | {:.4f} | {:.2f} |  {:4s} |'.format(
                    val_metrics['valid_loss'], val_metrics['top1'], val_metrics['top10'], best_f2,
                    (time.time() - bg) / 60, _save_ckp))

                model.train()
                if args.lrs == 'plateau':
                    lr_scheduler.step(best_f2)
                else:
                    lr_scheduler.step()
                current_lr = get_lrs(optimizer)
Example #19
0
    def __init__(self, log_dir, cfg):

        self.path = log_dir
        self.cfg = cfg

        if cfg.TRAIN.FLAG:
            self.model_dir = os.path.join(self.path, 'Model')
            self.log_dir = os.path.join(self.path, 'Log')
            mkdir_p(self.model_dir)
            mkdir_p(self.log_dir)
            self.writer = SummaryWriter(log_dir=self.log_dir)
            self.logfile = os.path.join(self.path, "logfile.log")
            sys.stdout = Logger(logfile=self.logfile)

        self.data_dir = cfg.DATASET.DATA_DIR
        self.max_epochs = cfg.TRAIN.MAX_EPOCHS
        self.snapshot_interval = cfg.TRAIN.SNAPSHOT_INTERVAL

        s_gpus = cfg.GPU_ID.split(',')
        self.gpus = [int(ix) for ix in s_gpus]
        self.num_gpus = len(self.gpus)

        self.batch_size = cfg.TRAIN.BATCH_SIZE
        self.lr = cfg.TRAIN.LEARNING_RATE

        torch.cuda.set_device(self.gpus[0])
        cudnn.benchmark = True

        sample = cfg.SAMPLE
        self.dataset = []
        self.dataloader = []
        self.use_feats = cfg.model.use_feats
        eval_split = cfg.EVAL if cfg.EVAL else 'val'
        train_split = cfg.DATASET.train_split
        if cfg.DATASET.DATASET == 'clevr':
            clevr_collate_fn = collate_fn
            cogent = cfg.DATASET.COGENT
            if cogent:
                print(f'Using CoGenT {cogent.upper()}')

            if cfg.TRAIN.FLAG:
                self.dataset = ClevrDataset(data_dir=self.data_dir,
                                            split=train_split + cogent,
                                            sample=sample,
                                            **cfg.DATASET.params)
                self.dataloader = DataLoader(dataset=self.dataset,
                                             batch_size=cfg.TRAIN.BATCH_SIZE,
                                             shuffle=True,
                                             num_workers=cfg.WORKERS,
                                             drop_last=True,
                                             collate_fn=clevr_collate_fn)

            self.dataset_val = ClevrDataset(data_dir=self.data_dir,
                                            split=eval_split + cogent,
                                            sample=sample,
                                            **cfg.DATASET.params)
            self.dataloader_val = DataLoader(dataset=self.dataset_val,
                                             batch_size=cfg.TEST_BATCH_SIZE,
                                             drop_last=False,
                                             shuffle=False,
                                             num_workers=cfg.WORKERS,
                                             collate_fn=clevr_collate_fn)

        elif cfg.DATASET.DATASET == 'gqa':
            if self.use_feats == 'spatial':
                gqa_collate_fn = collate_fn_gqa
            elif self.use_feats == 'objects':
                gqa_collate_fn = collate_fn_gqa_objs
            if cfg.TRAIN.FLAG:
                self.dataset = GQADataset(data_dir=self.data_dir,
                                          split=train_split,
                                          sample=sample,
                                          use_feats=self.use_feats,
                                          **cfg.DATASET.params)
                self.dataloader = DataLoader(dataset=self.dataset,
                                             batch_size=cfg.TRAIN.BATCH_SIZE,
                                             shuffle=True,
                                             num_workers=cfg.WORKERS,
                                             drop_last=True,
                                             collate_fn=gqa_collate_fn)

            self.dataset_val = GQADataset(data_dir=self.data_dir,
                                          split=eval_split,
                                          sample=sample,
                                          use_feats=self.use_feats,
                                          **cfg.DATASET.params)
            self.dataloader_val = DataLoader(dataset=self.dataset_val,
                                             batch_size=cfg.TEST_BATCH_SIZE,
                                             shuffle=False,
                                             num_workers=cfg.WORKERS,
                                             drop_last=False,
                                             collate_fn=gqa_collate_fn)

        # load model
        self.vocab = load_vocab(cfg)
        self.model, self.model_ema = mac.load_MAC(cfg, self.vocab)

        self.weight_moving_average(alpha=0)
        if cfg.TRAIN.RADAM:
            self.optimizer = RAdam(self.model.parameters(), lr=self.lr)
        else:
            self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        self.start_epoch = 0
        if cfg.resume_model:
            location = 'cuda' if cfg.CUDA else 'cpu'
            state = torch.load(cfg.resume_model, map_location=location)
            self.model.load_state_dict(state['model'])
            self.optimizer.load_state_dict(state['optim'])
            self.start_epoch = state['iter'] + 1
            state = torch.load(cfg.resume_model_ema, map_location=location)
            self.model_ema.load_state_dict(state['model'])

        if cfg.start_epoch is not None:
            self.start_epoch = cfg.start_epoch

        self.previous_best_acc = 0.0
        self.previous_best_epoch = 0
        self.previous_best_loss = 100
        self.previous_best_loss_epoch = 0

        self.total_epoch_loss = 0
        self.prior_epoch_loss = 10

        self.print_info()
        self.loss_fn = torch.nn.CrossEntropyLoss().cuda()

        self.comet_exp = Experiment(
            project_name=cfg.COMET_PROJECT_NAME,
            api_key=os.getenv('COMET_API_KEY'),
            workspace=os.getenv('COMET_WORKSPACE'),
            disabled=cfg.logcomet is False,
        )
        if cfg.logcomet:
            exp_name = cfg_to_exp_name(cfg)
            print(exp_name)
            self.comet_exp.set_name(exp_name)
            self.comet_exp.log_parameters(flatten_json_iterative_solution(cfg))
            self.comet_exp.log_asset(self.logfile)
            self.comet_exp.log_asset_data(json.dumps(cfg, indent=4),
                                          file_name='cfg.json')
            self.comet_exp.set_model_graph(str(self.model))
            if cfg.cfg_file:
                self.comet_exp.log_asset(cfg.cfg_file)

        with open(os.path.join(self.path, 'cfg.json'), 'w') as f:
            json.dump(cfg, f, indent=4)
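The resume branch above expects a checkpoint keyed 'model', 'optim', and 'iter', plus a separate EMA file keyed 'model'. A hedged sketch of the matching save side; the function name and file-naming convention are illustrative, not from the original project:

import torch

def save_checkpoint(path, model, model_ema, optimizer, epoch):
    # Mirror of the resume logic above: one file for model/optimizer/epoch,
    # one file for the EMA weights.
    torch.save({'model': model.state_dict(),
                'optim': optimizer.state_dict(),
                'iter': epoch}, path)
    torch.save({'model': model_ema.state_dict()}, path + '_ema')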
Example #20
0
def focal_loss(y_true, y_pred):
    alpha, gamma = 0.25, 2
    y_pred = K.clip(y_pred, 1e-8, 1 - 1e-8)
    return - alpha * y_true * K.log(y_pred) * (1 - y_pred)**gamma\
           - (1 - alpha) * (1 - y_true) * K.log(1 - y_pred) * y_pred**gamma


loss1 = focal_loss(a1_in, pa1)
loss1 = K.sum(loss1 * p_mask[..., 0]) / K.sum(p_mask)
loss2 = focal_loss(a2_in, pa2)
loss2 = K.sum(loss2 * p_mask[..., 0]) / K.sum(p_mask)
loss = (loss1 + loss2) * 100  # scale up by 100x for readability; it does not affect Adam's optimization

train_model.add_loss(loss)
train_model.compile(optimizer=RAdam(1e-3))


class ExponentialMovingAverage:
    """对模型权重进行指数滑动平均。
    用法:在model.compile之后、第一次训练之前使用;
    先初始化对象,然后执行inject方法。
    """
    def __init__(self, model, momentum=0.9999):
        self.momentum = momentum
        self.model = model
        self.ema_weights = [K.zeros(K.shape(w)) for w in model.weights]

    def inject(self):
        """添加更新算子到model.metrics_updates。
        """
Example #21
0
                    help='base learning rate (default: 0.1)')
args = parser.parse_args()


if args.optimizer.lower()=='adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
elif args.optimizer.lower()=='sgd':
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
elif args.optimizer.lower()=='sgdwm':
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(),lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'radam':
    optimizer = RAdam(model.parameters(),lr=args.lr)
elif args.optimizer.lower() == 'lars':#no tensorboardX
    optimizer = LARS(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'lamb':
    optimizer  = Lamb(model.parameters(),lr=args.lr)
elif args.optimizer.lower() == 'novograd':
    optimizer = NovoGrad(model.parameters(), lr=args.lr, weight_decay=0.0001)
else:
    optimizer = optim.SGD(model.parameters(), lr=1)

optname = args.optimizer if len(sys.argv)>=2 else 'sgd'

# log = open(optname+'log.txt','w+')

def lrs(batch):
    low = math.log2(1e-5)
Example #22
0
def train(opt):
    """ dataset preparation """
    if not opt.data_filtering_off:
        print('Filtering the images containing characters which are not in opt.character')
        print('Filtering the images whose label is longer than opt.batch_max_length')
        # see https://github.com/clovaai/deep-text-recognition-benchmark/blob/6593928855fb7abb999a99f428b3e4477d4ae356/dataset.py#L130

    opt.select_data = opt.select_data.split('-')
    opt.batch_ratio = opt.batch_ratio.split('-')
    train_dataset = Batch_Balanced_Dataset(opt)

    log = open(f'./saved_models/{opt.experiment_name}/log_dataset.txt', 'a')
    AlignCollate_valid = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD)
    valid_dataset, valid_dataset_log = hierarchical_dataset(root=opt.valid_data, opt=opt)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=opt.batch_size,
        shuffle=True,  # 'True' to check training progress with validation function.
        num_workers=int(opt.workers),
        collate_fn=AlignCollate_valid, pin_memory=True)
    log.write(valid_dataset_log)
    print('-' * 80)
    log.write('-' * 80 + '\n')
    log.close()
    
    """ model configuration """
    if 'CTC' in opt.Prediction:
        converter = CTCLabelConverter(opt.character)
    elif opt.Prediction == 'None':
        converter = TransformerConverter(opt.character)
    else:
        converter = AttnLabelConverter(opt.character)
    opt.num_class = len(converter.character)

    if opt.rgb:
        opt.input_channel = 3
    model = Model(opt)
    print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel,
          opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction,
          opt.SequenceModeling, opt.Prediction)

    # weight initialization
    for name, param in model.named_parameters():
        if 'localization_fc2' in name:
            print(f'Skip {name} as it is already initialized')
            continue
        try:
            if 'bias' in name:
                init.constant_(param, 0.0)
            elif 'weight' in name:
                init.kaiming_normal_(param)
        except Exception as e:  # for batchnorm.
            if 'weight' in name:
                param.data.fill_(1)
            continue

    # data parallel for multi-GPU
    # model = torch.nn.DataParallel(model).to(device)
    model = model.to(device)
    model.train()
    if opt.load_from_checkpoint:
        model.load_state_dict(torch.load(os.path.join(opt.load_from_checkpoint, 'checkpoint.pth')))
        print(f'loaded checkpoint from {opt.load_from_checkpoint}...')
    elif opt.saved_model != '':
        print(f'loading pretrained model from {opt.saved_model}')
        if opt.SequenceModeling == 'Transformer':
            fe_state = OrderedDict()
            state_dict = torch.load(opt.saved_model)
            for k, v in state_dict.items():
                if k.startswith('module.FeatureExtraction'):
                    new_k = re.sub('module.FeatureExtraction.', '', k)
                    fe_state[new_k] = state_dict[k]
            model.FeatureExtraction.load_state_dict(fe_state)
        else:
            if opt.FT:
                model.load_state_dict(torch.load(opt.saved_model), strict=False)
            else:
                model.load_state_dict(torch.load(opt.saved_model))
    if opt.freeze_fe:
        model.freeze(['FeatureExtraction'])
    print("Model:")
    print(model)

    """ setup loss """
    if 'CTC' in opt.Prediction:
        criterion = torch.nn.CTCLoss(zero_infinity=True).to(device)
    elif opt.Prediction == 'None':
        criterion = LabelSmoothingLoss(classes=converter.n_classes, padding_idx=converter.pad_idx, smoothing=0.1)
        # criterion = torch.nn.CrossEntropyLoss(ignore_index=converter.pad_idx)
    else:
        criterion = torch.nn.CrossEntropyLoss(ignore_index=0).to(device)  # ignore [GO] token = ignore index 0
    # loss averager
    loss_avg = Averager()

    # filter that only require gradient decent
    filtered_parameters = []
    params_num = []
    for p in filter(lambda p: p.requires_grad, model.parameters()):
        filtered_parameters.append(p)
        params_num.append(np.prod(p.size()))
    print('Trainable params num : ', sum(params_num))
    # [print(name, p.numel()) for name, p in filter(lambda p: p[1].requires_grad, model.named_parameters())]

    # setup optimizer
    if opt.adam:
        assert opt.adam in ['Adam', 'AdamW', 'RAdam'], 'adam optimizer must be in Adam, AdamW or RAdam'
        if opt.adam == 'Adam':
            optimizer = optim.Adam(filtered_parameters, lr=opt.lr, betas=(opt.beta1, 0.999))
        elif opt.adam == "AdamW":
            optimizer = optim.AdamW(filtered_parameters, lr=opt.lr, betas=(opt.beta1, 0.999))
        else:
            optimizer = RAdam(filtered_parameters, lr=opt.lr, betas=(opt.beta1, 0.999))
    else:
        optimizer = optim.Adadelta(filtered_parameters, lr=opt.lr, rho=opt.rho, eps=opt.eps)
    print("Optimizer:")
    print(optimizer)

    if opt.load_from_checkpoint and opt.load_optimizer_state:
        optimizer.load_state_dict(torch.load(os.path.join(opt.load_from_checkpoint, 'optimizer.pth')))
        print(f'loaded optimizer state from {os.path.join(opt.load_from_checkpoint, "optimizer.pth")}')

    """ final options """
    # print(opt)
    with open(f'./saved_models/{opt.experiment_name}/opt.txt', 'a') as opt_file:
        opt_log = '------------ Options -------------\n'
        args = vars(opt)
        for k, v in args.items():
            opt_log += f'{str(k)}: {str(v)}\n'
        opt_log += '---------------------------------------\n'
        print(opt_log)
        opt_file.write(opt_log)

    """ start training """
    start_iter = 0
    if opt.saved_model != '':
        try:
            start_iter = int(opt.saved_model.split('_')[-1].split('.')[0])
            print(f'continue to train, start_iter: {start_iter}')
        except:
            pass

    if opt.load_from_checkpoint:
        with open(os.path.join(opt.load_from_checkpoint, 'iter.json'), mode='r', encoding='utf8') as f:
            start_iter = json.load(f)
            print(f'continue to train, start_iter: {start_iter}')
            f.close()

    start_time = time.time()
    best_accuracy = -1
    best_norm_ED = -1
    # i = start_iter

    bar = tqdm(range(start_iter, opt.num_iter))
    # while(True):
    for i in bar:
        bar.set_description(f'Iter {i}: train_loss = {loss_avg.val():.5f}')
        # train part
        image_tensors, labels = train_dataset.get_batch()
        image = image_tensors.to(device)
        text, length = converter.encode(labels, batch_max_length=opt.batch_max_length)
        batch_size = image.size(0)

        if 'CTC' in opt.Prediction:
            preds = model(image, text).log_softmax(2)
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)
            preds = preds.permute(1, 0, 2)

            # (ctc_a) For PyTorch 1.2.0 and 1.3.0. To avoid ctc_loss issue, disabled cudnn for the computation of the ctc_loss
            # https://github.com/jpuigcerver/PyLaia/issues/16
            torch.backends.cudnn.enabled = False
            cost = criterion(preds, text.to(device), preds_size.to(device), length.to(device))
            torch.backends.cudnn.enabled = True

            # # (ctc_b) To reproduce our pretrained model / paper, use our previous code (below code) instead of (ctc_a).
            # # With PyTorch 1.2.0, the below code occurs NAN, so you may use PyTorch 1.1.0.
            # # Thus, the result of CTCLoss is different in PyTorch 1.1.0 and PyTorch 1.2.0.
            # # See https://github.com/clovaai/deep-text-recognition-benchmark/issues/56#issuecomment-526490707
            # cost = criterion(preds, text, preds_size, length)

        elif opt.Prediction == 'None':
            tgt_input = text['tgt_input']
            tgt_output = text['tgt_output']
            tgt_padding_mask = text['tgt_padding_mask']
            preds = model(image, tgt_input.transpose(0, 1), tgt_key_padding_mask=tgt_padding_mask,)
            cost = criterion(preds.view(-1, preds.shape[-1]), tgt_output.contiguous().view(-1))
        else:
            preds = model(image, text[:, :-1])  # align with Attention.forward
            target = text[:, 1:]  # without [GO] Symbol
            cost = criterion(preds.view(-1, preds.shape[-1]), target.contiguous().view(-1))

        model.zero_grad()
        cost.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip)  # gradient clipping with 5 (Default)
        optimizer.step()

        loss_avg.add(cost)

        # validation part
        if (i + 1) % opt.valInterval == 0:
            elapsed_time = time.time() - start_time
            # for log
            with open(f'./saved_models/{opt.experiment_name}/log_train.txt', 'a') as log:
                model.eval()
                with torch.no_grad():
                    valid_loss, current_accuracy, current_norm_ED, preds, confidence_score, labels, infer_time, length_of_data = validation(
                        model, criterion, valid_loader, converter, opt)
                model.train()

                # training loss and validation loss
                loss_log = f'[{i}/{opt.num_iter}] Train loss: {loss_avg.val():0.5f}, Valid loss: {valid_loss:0.5f}, Elapsed_time: {elapsed_time:0.5f}'
                loss_avg.reset()

                current_model_log = f'{"Current_accuracy":17s}: {current_accuracy:0.3f}, {"Current_norm_ED":17s}: {current_norm_ED:0.2f}'

                # keep best accuracy model (on valid dataset)
                if current_accuracy > best_accuracy:
                    best_accuracy = current_accuracy
                    torch.save(model.state_dict(), f'./saved_models/{opt.experiment_name}/best_accuracy.pth')
                if current_norm_ED > best_norm_ED:
                    best_norm_ED = current_norm_ED
                    torch.save(model.state_dict(), f'./saved_models/{opt.experiment_name}/best_norm_ED.pth')

                # checkpoint
                os.makedirs(f'./checkpoints/{opt.experiment_name}/', exist_ok=True)

                torch.save(model.state_dict(), f'./checkpoints/{opt.experiment_name}/checkpoint.pth')
                torch.save(optimizer.state_dict(), f'./checkpoints/{opt.experiment_name}/optimizer.pth')
                with open(f'./checkpoints/{opt.experiment_name}/iter.json', mode='w', encoding='utf8') as f:
                    json.dump(i + 1, f)
                    f.close()

                with open(f'./checkpoints/{opt.experiment_name}/checkpoint.log', mode='a', encoding='utf8') as f:
                    f.write(f'Saved checkpoint with iter={i}\n')
                    f.write(f'\tCheckpoint at: ./checkpoints/{opt.experiment_name}/checkpoint.pth')
                    f.write(f'\tOptimizer at: ./checkpoints/{opt.experiment_name}/optimizer.pth')

                best_model_log = f'{"Best_accuracy":17s}: {best_accuracy:0.3f}, {"Best_norm_ED":17s}: {best_norm_ED:0.2f}'

                loss_model_log = f'{loss_log}\n{current_model_log}\n{best_model_log}'
                print(loss_model_log)
                log.write(loss_model_log + '\n')

                # show some predicted results
                dashed_line = '-' * 80
                head = f'{"Ground Truth":25s} | {"Prediction":25s} | Confidence Score & T/F'
                predicted_result_log = f'{dashed_line}\n{head}\n{dashed_line}\n'
                for gt, pred, confidence in zip(labels[:5], preds[:5], confidence_score[:5]):
                    if 'Attn' in opt.Prediction:
                        gt = gt[:gt.find('[s]')]
                        pred = pred[:pred.find('[s]')]

                    predicted_result_log += f'{gt:25s} | {pred:25s} | {confidence:0.4f}\t{str(pred == gt)}\n'
                predicted_result_log += f'{dashed_line}'
                print(predicted_result_log)
                log.write(predicted_result_log + '\n')

        # save model per 1e+5 iter.
        if (i + 1) % 1e+5 == 0:
            torch.save(
                model.state_dict(), f'./saved_models/{opt.experiment_name}/iter_{i+1}.pth')

        # if i == opt.num_iter:
        #     print('end the training')
        #     sys.exit()
        # i += 1
        # if i == 1: break
    print('end training')
Example #23
0
                                           transforms=True)
train_dataloader = DataLoader(train_dataset,
                              batch_size=opts.batch_size,
                              drop_last=False,
                              shuffle=True)

if opts.visdom:
    vis = visdom.Visdom()
    train_loss_window = vis.line(X=torch.zeros((1, )).cpu(),
                                 Y=torch.zeros((1)).cpu(),
                                 opts=dict(xlabel='epoch',
                                           ylabel='Loss',
                                           title='Training Loss',
                                           legend=['Loss']))

optimizer = RAdam(filter(lambda p: p.requires_grad, model.parameters()),
                  lr=opts.learning_rate)
criterion = nn.BCELoss()
cudnn.benchmark = True
train_add_loss = []
train_epoch = []
one_epoch_iteration = len(train_dataloader)
early_stoping = EarlyStopping(patience=30,
                              learning_rate=opts.learning_rate,
                              verbose=True)


def adjust_learning_rate(optimizer, lr):
    for param_group in optimizer.param_groups:

        param_group['lr'] = lr
Example #24
0
def main():

    global args
    best_prec1, best_epoch = 0.0, 0

    if not os.path.exists(args.save):
        os.makedirs(args.save)

    if args.data.startswith('cifar'):
        IM_SIZE = 32
    else:
        IM_SIZE = 224

    model = getattr(models, args.arch)(args)
    n_flops, n_params = measure_model(model, IM_SIZE, IM_SIZE)    
    torch.save(n_flops, os.path.join(args.save, 'flops.pth'))
    del model

    model = getattr(models, args.arch)(args)

    if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
        model.features = torch.nn.DataParallel(model.features)
        model.cuda()
    else:
        model = torch.nn.DataParallel(model).cuda()

    criterion = nn.CrossEntropyLoss().cuda()

    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), args.lr,
                                     weight_decay=args.weight_decay)
    elif args.optimizer == 'radam':
        from radam import RAdam
        optimizer = RAdam(model.parameters(), args.lr,
                          weight_decay=args.weight_decay)
    else:
        raise NotImplementedError("Wrong optimizer.")
    

    if args.resume:
        checkpoint = load_checkpoint(args)
        if checkpoint is not None:
            args.start_epoch = checkpoint['epoch'] + 1
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])

    cudnn.benchmark = True

    train_loader, val_loader, test_loader = get_dataloaders(args)

    if args.evalmode is not None:
        state_dict = torch.load(args.evaluate_from)['state_dict']
        model.load_state_dict(state_dict)

        if args.evalmode == 'anytime':
            validate(test_loader, model, criterion)
        else:
            dynamic_evaluate(model, test_loader, val_loader, args)
        return

    scores = ['epoch\tlr\ttrain_loss\tval_loss\ttrain_prec1'
              '\tval_prec1\ttrain_prec5\tval_prec5']

    for epoch in range(args.start_epoch, args.epochs):

        train_loss, train_prec1, train_prec5, lr = train(train_loader, model, criterion, optimizer, epoch)

        val_loss, val_prec1, val_prec5 = validate(val_loader, model, criterion)

        scores.append(('{}\t{:.3f}' + '\t{:.4f}' * 6)
                      .format(epoch, lr, train_loss, val_loss,
                              train_prec1, val_prec1, train_prec5, val_prec5))

        is_best = val_prec1 > best_prec1
        if is_best:
            best_prec1 = val_prec1
            best_epoch = epoch
            print('Best var_prec1 {}'.format(best_prec1))

        model_filename = 'checkpoint_%03d.pth.tar' % epoch
        save_checkpoint({
            'epoch': epoch,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, args, is_best, model_filename, scores)

    print('Best val_prec1: {:.4f} at epoch {}'.format(best_prec1, best_epoch))

    ### Test the final model

    print('********** Final prediction results **********')
    validate(test_loader, model, criterion)

    return 
Example #25
0
def train_ccblock(model_options):
    # get train&valid datasets' paths
    if model_options.trainset_num > 1:
        train_file_paths = [
            model_options.trainset_path.format(i)
            for i in range(1, model_options.trainset_num + 1)
        ]
    else:
        train_file_paths = [model_options.trainset_path]

    # load datasets
    print(train_file_paths)
    label_paths = "/home/langruimin/BLSTM_pytorch/data/fcv/fcv_train_labels.mat"
    videoset = VideoDataset(train_file_paths, label_paths)
    print(len(videoset))

    # create model
    model = RCCAModule(1, 1)
    model_quan = Quantization(model_options.subLevel, model_options.subCenters,
                              model_options.dim)

    params_path = os.path.join(model_options.model_save_path,
                               model_options.params_filename)
    params_path_Q = os.path.join(model_options.model_save_path,
                                 model_options.Qparams_filename)
    if model_options.reload_params:
        print('Loading model params...')
        model.load_state_dict(torch.load(params_path))
        print('Done.')

    model = model.cuda()
    model_quan = model_quan.cuda()
    # optimizer
    optimizer = RAdam(model.parameters(),
                      lr=1e-3,
                      betas=(0.9, 0.999),
                      weight_decay=1e-4)
    optimizer2 = RAdam(
        model_quan.parameters(),
        lr=1e-3,  # 7e-6
        betas=(0.9, 0.999),
        weight_decay=1e-4)

    lr_C = ""
    lr_Q = ""
    # milestones = []
    # lr_schduler_C = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones, gamma=0.1, last_epoch=-1)
    # lr_schduler_Q = torch.optim.lr_scheduler.MultiStepLR(optimizer2, milestones, gamma=0.6, last_epoch=-1)

    selector = AllTripletSelector()
    triplet_loss = OnlineTripletLoss(margin=512, triplet_selector=selector)

    batch_idx = 1
    train_loss_rec = open(
        os.path.join(model_options.records_save_path,
                     model_options.train_loss_filename), 'w')
    error_ = 0.
    loss_ = 0.
    num = 0
    print("##########start train############")
    trainloader = torch.utils.data.DataLoader(videoset,
                                              batch_size=9,
                                              shuffle=True,
                                              num_workers=4,
                                              pin_memory=True)
    model.train()
    model_quan.train()

    init_train_label = np.load(
        "/home/langruimin/BLSTM_pytorch/data/fcv/init_train_labels.npy")

    for l in range(100):
        # lr_schduler_C.step(l)
        # milestones.append(l+2)
        # lr_schduler_Q.step(l)

        # training
        for i, (data, index, _, _) in enumerate(trainloader):
            data = data.to(model_options.default_dtype)
            data = data.unsqueeze(1)
            data = data.cuda()
            # cc_block
            output_ccblock_mean = torch.tanh(model(data))

            # quantization block
            Qhard, Qsoft, SoftDistortion, HardDistortion, JointCenter, error, _ = model_quan(
                output_ccblock_mean)
            Q_loss = 0.1 * SoftDistortion + HardDistortion + 0.1 * JointCenter

            tri_loss, tri_num = triplet_loss(output_ccblock_mean,
                                             init_train_label[index])

            optimizer2.zero_grad()
            Q_loss.backward(retain_graph=True)
            optimizer2.step()

            optimizer.zero_grad()
            tri_loss.backward()
            optimizer.step()

            error_ += error.item()
            loss_ += tri_loss.item()
            num += 1
            if batch_idx % model_options.disp_freq == 0:
                info = "epoch{0} Batch {1} loss:{2:.3f}  distortion:{3:.3f} " \
                    .format(l, batch_idx, loss_/ num, error_ / num)
                print(info)
                train_loss_rec.write(info + '\n')

            batch_idx += 1
        batch_idx = 0
        error_ = 0.
        loss_ = 0.
        num = 0

        if (l + 1) % model_options.save_freq == 0:
            print('epoch: ', l, 'New best model. Saving model ...')
            torch.save(model.state_dict(), params_path)
            torch.save(model_quan.state_dict(), params_path_Q)

            for param_group in optimizer.param_groups:
                lr_C = param_group['lr']
            for param_group in optimizer2.param_groups:
                lr_Q = param_group['lr']
            record_inf = "saved model at epoch {0} lr_C:{1} lr_Q:{2}".format(
                l, lr_C, lr_Q)
            train_loss_rec.write(record_inf + '\n')
        print("##########epoch done##########")

    print('train done. Saving model ...')
    torch.save(model.state_dict(), params_path)
    torch.save(model_quan.state_dict(), params_path_Q)
    print("##########train done##########")
Example #26
0
model = LeNet5(N_CLASSES).to(DEVICE)

if len(sys.argv) == 1:
    optimizer = optim.SGD(model.parameters(), lr=0.01)
elif sys.argv[1] == 'adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
elif sys.argv[1] == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=0.01)
elif sys.argv[1] == 'sgdwm':
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
elif sys.argv[1] == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(), lr=0.001, momentum=0.9)
elif sys.argv[1] == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=0.01)
elif sys.argv[1] == 'radam':
    optimizer = RAdam(model.parameters())
elif sys.argv[1] == 'lars':  #no tensorboardX
    optimizer = LARS(model.parameters(), lr=0.1, momentum=0.9)
elif sys.argv[1] == 'lamb':
    optimizer = Lamb(model.parameters())
elif sys.argv[1] == 'novograd':
    optimizer = NovoGrad(model.parameters(), lr=0.01, weight_decay=0.001)
    schedular = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                     3 * len(train_loader),
                                                     1e-4)

    def train(train_loader, model, criterion, optimizer, schedular, device):
        '''
        Function for the training step of the training loop
        '''
Example #27
0
    optimizer = optim.Adam(model.parameters(),
                           lr=args.init_lr,
                           betas=betas,
                           eps=1e-9)
elif args.optim == '1cycle':
    optimizer = optim.Adam(model.parameters(),
                           lr=args.init_lr,
                           betas=betas,
                           eps=1e-9)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer,
                                              max_lr=args.max_lr,
                                              steps_per_epoch=len(train_data),
                                              epochs=args.max_epochs)
elif args.optim == 'radam':
    optimizer = RAdam(model.parameters(),
                      lr=args.init_lr,
                      betas=betas,
                      eps=1e-9)
elif args.optim == 'schedule':
    optimizer = optim.Adam(model.parameters(),
                           lr=args.init_lr,
                           betas=betas,
                           eps=1e-9)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=args.schedule_factor,
        patience=args.schedule_patience)

# **************** TRAINING ******************
print('Training starts...')

alignment = None
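The fragment in Example #27 builds two different schedulers but is cut off before they are stepped. As a reminder of the standard PyTorch usage, OneCycleLR is stepped after every optimizer update while ReduceLROnPlateau is stepped once per epoch on a monitored metric; a hedged sketch follows, with compute_loss and evaluate as hypothetical placeholders for the real forward and validation code:

def run_training(model, optimizer, scheduler, train_loader, val_loader,
                 compute_loss, evaluate, max_epochs, optim_name):
    for epoch in range(max_epochs):
        model.train()
        for batch in train_loader:
            loss = compute_loss(model, batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if optim_name == '1cycle':
                scheduler.step()        # OneCycleLR: one step per optimizer update

        val_metric = evaluate(model, val_loader)
        if optim_name == 'schedule':
            scheduler.step(val_metric)  # ReduceLROnPlateau: one step per epoch, driven by a metric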
Example #28
def build_pipeline(
        data_dir,  
        model, 
        save_every,
        batch_size, 
        input_size, 
        output_size,
        raw, 
        labels,
        affs,
        affs_predicted,
        lr=1e-5): 

    dataset_shape = zarr.open(str(data_dir))['train/raw'].shape
    num_samples = dataset_shape[0]
    sample_size = dataset_shape[1:]

    loss = torch.nn.MSELoss()
    optimizer = RAdam(model.parameters(), lr=lr)
    
    pipeline = (
            gp.ZarrSource(
                data_dir,
                {
                    raw: 'train/raw',
                    labels: 'train/gt'
                },
                array_specs={
                    raw: gp.ArraySpec(
                        roi=gp.Roi((0, 0, 0), (num_samples,) + sample_size),
                        voxel_size=(1, 1, 1)),
                    labels: gp.ArraySpec(
                        roi=gp.Roi((0, 0, 0), (num_samples,) + sample_size),
                        voxel_size=(1, 1, 1))
                }) +
            # raw: (d=1, h, w)
            # labels: (d=1, h, w)
            gp.RandomLocation() +
            # raw: (d=1, h, w)
            # labels: (d=1, h, w)
            gp.AddAffinities(
                affinity_neighborhood=[(0, 1, 0), (0, 0, 1)],
                labels=labels,
                affinities=affs) +
            gp.Normalize(affs, factor=1.0) +
            # raw: (d=1, h, w)
            # affs: (c=2, d=1, h, w)
            Squash(dim=-3) +
            # get rid of z dim
            # raw: (h, w)
            # affs: (c=2, h, w)
            AddChannelDim(raw) +
            # raw: (c=1, h, w)
            # affs: (c=2, h, w)
            gp.PreCache() +
            gp.Stack(batch_size) +
            # raw: (b=10, c=1, h, w)
            # affs: (b=10, c=2, h, w)
            Train(
                model=model,
                loss=loss,
                optimizer=optimizer,
                inputs={'x': raw},
                target=affs,
                output=affs_predicted,
                save_every=save_every,
                log_dir='log') +
            # raw: (b=10, c=1, h, w)
            # affs: (b=10, c=2, h, w)
            # affs_predicted: (b=10, c=2, h, w)
            TransposeDims(raw, (1, 0, 2, 3)) +
            TransposeDims(affs, (1, 0, 2, 3)) +
            TransposeDims(affs_predicted, (1, 0, 2, 3)) +
            # raw: (c=1, b=10, h, w)
            # affs: (c=2, b=10, h, w)
            # affs_predicted: (c=2, b=10, h, w)
            RemoveChannelDim(raw) +
            # raw: (b=10, h, w)
            # affs: (c=2, b=10, h, w)
            # affs_predicted: (c=2, b=10, h, w)
            gp.Snapshot(
                dataset_names={
                    raw: 'raw',
                    labels: 'labels',
                    affs: 'affs',
                    affs_predicted: 'affs_predicted'
                },
                every=100) +
            gp.PrintProfilingStats(every=100)
        )
    return pipeline 
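In Example #28 the actual optimization is delegated to the Train node, which receives the model, the MSELoss and the RAdam optimizer. As a rough, hedged approximation (not gunpowder's real implementation), the per-batch work it performs amounts to an ordinary supervised step:

def train_step(model, loss_fn, optimizer, raw_batch, affs_batch):
    # raw_batch: (b, c=1, h, w) float tensor; affs_batch: (b, c=2, h, w) target affinities
    optimizer.zero_grad()
    affs_predicted = model(raw_batch)           # corresponds to inputs={'x': raw}
    loss = loss_fn(affs_predicted, affs_batch)  # MSELoss against the normalized affinities
    loss.backward()
    optimizer.step()
    return loss.item(), affs_predicted.detach()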
Example #29
def train(config, num_classes=1108):
    model = model_whale(num_classes=num_classes,
                        inchannels=6,
                        model_name=config.train.model_name,
                        pretrained=config.train.pretrained).cuda()
    if config.train.freeze:
        model.freeze()

    base_opt = RAdam(model.parameters(), lr=config.train.lr)
    optimizer = Lookahead(base_opt)
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.train.lr,  betas=(0.9, 0.99), weight_decay=0.0002)

    resultDir = config.train.result_dir
    checkPoint = join(resultDir, 'checkpoint')
    #     if not config.train.in_colab:
    #         os.makedirs(checkPoint, exist_ok=True)
    train_dataset = CustomDataset(config.train.csv_file,
                                  config.train.img_dir,
                                  transforms=transforms['train'])
    dataset_size = len(train_dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(config.train.validation_split * dataset_size))
    if config.train.shuffle_dataset:
        np.random.seed(config.train.random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.train.batch_size,
        sampler=train_sampler,
        num_workers=config.train.num_workers)
    validation_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.train.batch_size,
        sampler=valid_sampler,
        num_workers=config.train.num_workers)

    train_loss = 0.

    # load from cpk:
    if config.train.load_cpk:
        model.load_pretrain(os.path.join(
            checkPoint, '%08d_model.pth' % (config.train.start_epoch)),
                            skip=[])
        cpk = torch.load(
            os.path.join(checkPoint,
                         '%08d_optimizer.pth' % (config.train.start_epoch)))
        optimizer.load_state_dict(cpk['optimizer'])
        adjust_learning_rate(optimizer, config.train.lr)
        start_epoch = cpk['epoch']
    else:
        start_epoch = 0

    top1_batch, map5_batch = 0, 0

    for epoch in range(start_epoch + 1, config.train.epochs):
        print('Starting:', epoch, 'Iterations:', len(train_loader))
        for i, data in enumerate(train_loader):
            model.train()
            model.mode = 'train'
            images, labels = data
            images = images.cuda()
            labels = labels.cuda().long()
            global_feat, local_feat, results = data_parallel(model, images)
            model.getLoss(global_feat,
                          local_feat,
                          results,
                          labels,
                          config,
                          verbose=(i % config.loss.verbose_interval == 0))
            batch_loss = model.loss

            optimizer.zero_grad()
            batch_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           max_norm=5.0,
                                           norm_type=2)
            optimizer.step()
            results = torch.sigmoid(results)
            train_loss += batch_loss.data.cpu().numpy()
            top1_batch += accuracy(results, labels, topk=[1])[0]
            map5_batch += mapk(labels, results, k=5)

            if i % config.train.verbose_interval == 0:
                print(
                    'epoch: %03d, iter: %05d, train_loss: %f, top1_batch: %f, map5_batch: %f'
                    % (epoch, i,
                       float(train_loss / config.train.verbose_interval),
                       float(top1_batch / config.train.verbose_interval),
                       float(map5_batch / config.train.verbose_interval)))

                #                 print(f'epoch: {epoch}, iter: {i}, train_loss: {float(train_loss / config.train.verbose_interval)}, top1_batch: {float(top1_batch / config.train.verbose_interval)}, map5_batch: {float(map5_batch / config.train.verbose_interval)}')
                train_loss, top1_batch, map5_batch = 0, 0, 0

                valid_loss, top1_valid, map5_valid = valid_eval(
                    config, model, validation_loader)
                print(
                    'epoch: %03d, iter: %05d, valid_loss: %f, valid_top1_batch: %f, valid_map5_batch: %f'
                    % (epoch, i, valid_loss, top1_valid, map5_valid))


#                 print(f'epoch: {epoch}, iter: {i}, valid_loss: {valid_loss}, top1_batch: {top1_valid}, map5_batch: {map5_valid}')

        if epoch % config.train.save_period == 0:
            os.system("touch " + resultDir + "/checkpoint/%08d_model.pth" %
                      (epoch))
            os.system("touch " + resultDir + "/checkpoint/%08d_optimizer.pth" %
                      (epoch))
            time.sleep(1)
            torch.save(model.state_dict(),
                       resultDir + '/checkpoint/%08d_model.pth' % (epoch))
            torch.save({
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }, resultDir + '/checkpoint/%08d_optimizer.pth' % (epoch))
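Example #29 saves the model and optimizer in separate files and restores them through its own load_pretrain and adjust_learning_rate helpers. A hedged sketch of the same round trip with plain state_dict calls (the '%08d_*.pth' naming follows the example):

import os
import torch

def save_checkpoint(checkpoint_dir, epoch, model, optimizer):
    torch.save(model.state_dict(),
               os.path.join(checkpoint_dir, '%08d_model.pth' % epoch))
    torch.save({'optimizer': optimizer.state_dict(), 'epoch': epoch},
               os.path.join(checkpoint_dir, '%08d_optimizer.pth' % epoch))

def resume_checkpoint(checkpoint_dir, epoch, model, optimizer):
    model.load_state_dict(torch.load(
        os.path.join(checkpoint_dir, '%08d_model.pth' % epoch), map_location='cpu'))
    cpk = torch.load(os.path.join(checkpoint_dir, '%08d_optimizer.pth' % epoch))
    optimizer.load_state_dict(cpk['optimizer'])
    return cpk['epoch']  # epoch to resume from, as in the example's load_cpk branch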
Example #30
def train(args):
    # augmentations
    train_transform = Compose([

        Resize(args.img_size, args.img_size),
        Cutout(num_holes=8, max_h_size=20, max_w_size=20, fill_value=0, always_apply=False, p=0.5),
        Normalize(
                mean=[0.0692],
                std=[0.205],
            ),
        ToTensorV2()
    ])
    val_transform = Compose([
        Resize(args.img_size, args.img_size),
        Normalize(
            mean=[0.0692],
            std=[0.205],
        ),
        ToTensorV2() 
    ])

    
    # Load data
    df_train = pd.read_csv("../input/train_folds.csv")

    if args.fold == -1: 
        sys.exit()


    train = df_train[df_train['kfold']!=args.fold].reset_index(drop=True)#[:1000]
    val = df_train[df_train['kfold']==args.fold].reset_index(drop=True)#[:1000]

    train_data = ImageDataset('../input/images', train_transform, train)
    train_loader = utils.DataLoader(train_data, shuffle=True, num_workers=5, batch_size=args.batch_size, pin_memory=True)

    val_data = ImageDataset('../input/images', val_transform, val)
    val_loader = utils.DataLoader(val_data, shuffle=False, num_workers=5, batch_size=args.batch_size, pin_memory=True)   

    # create model

    device = torch.device(f"cuda:{args.gpu_n}")
    model = PretrainedCNN()
    
    
    if args.pretrain_path != "":
        model.load_state_dict(torch.load(args.pretrain_path, map_location=f"cuda:{args.gpu_n}"))
        print("weights loaded")
    model.to(device)
    
    
    
    optimizer = RAdam(model.parameters(), lr=args.start_lr)     

    opt_level = 'O1'
    model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
    loss_fn = nn.CrossEntropyLoss()
    scheduler = ReduceLROnPlateau(optimizer, mode='max', verbose=True, patience=8, factor=0.6)

    best_models = deque(maxlen=5)
    best_score = 0.99302


    for e in range(args.epoch):

        # Training:
        train_loss = []
        model.train()

        for image, target in tqdm(train_loader, ncols = 70):   
            optimizer.zero_grad()
            xs = image.to(device)
            ys = target.to(device)

            # CutMix augmentation, applied to roughly half of the batches
            if np.random.rand() < 0.5:
                images, targets = cutmix(xs, ys[:,0], ys[:,1], ys[:,2], 1.0)
                pred = model(images)  # forward the mixed images returned by cutmix
                output1 = pred[:,:168]
                output2 = pred[:,168:179]
                output3 = pred[:,179:]
                loss = cutmix_criterion(output1, output2, output3, targets)

            else:
                pred = model(xs)
                grapheme = pred[:,:168]
                vowel = pred[:,168:179]
                cons = pred[:,179:]

                loss = loss_fn(grapheme, ys[:,0]) + loss_fn(vowel, ys[:,1])+ loss_fn(cons, ys[:,2])

            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

            optimizer.step()
            train_loss.append(loss.item())

        
        #Validation    
        val_loss = []
        val_true = []
        val_pred = []
        model.eval()  
        with torch.no_grad():
            for image, target in val_loader:  # tqdm(val_loader, ncols=50)
                xs = image.to(device)
                ys = target.to(device)

                pred = model(xs)
                grapheme = pred[:,:168]
                vowel = pred[:,168:179]
                cons = pred[:,179:]

                loss = loss_fn(grapheme, ys[:,0]) + loss_fn(vowel, ys[:,1])+ loss_fn(cons, ys[:,2])
                val_loss.append(loss.item())

                grapheme = grapheme.cpu().argmax(dim=1).data.numpy()
                vowel = vowel.cpu().argmax(dim=1).data.numpy()
                cons = cons.cpu().argmax(dim=1).data.numpy()
                val_true.append(target.numpy())
                val_pred.append(np.stack([grapheme, vowel, cons], axis=1))

        val_true = np.concatenate(val_true)
        val_pred = np.concatenate(val_pred)

        val_loss = np.mean(val_loss)
        train_loss = np.mean(train_loss)
        scores = []

        for i in [0,1,2]:
            scores.append(sklearn.metrics.recall_score(val_true[:,i], val_pred[:,i], average='macro'))
        final_score = np.average(scores, weights=[2,1,1])


        print(f'Epoch: {e:03d}; train_loss: {train_loss:.05f}; val_loss: {val_loss:.05f}; ', end='')
        print(f'score: {final_score:.5f} ', end='')

    
        # Checkpoint the model. If there is a second stage (224x224), keep the best 5 checkpoints.
        if final_score > best_score:
            best_score = final_score
            state_dict = copy.deepcopy(model.state_dict()) 
            if args.save_queue==1:
                best_models.append(state_dict)
                for i, m in enumerate(best_models):
                    path = f"models/{args.exp_name}"
                    os.makedirs(path, exist_ok=True)
                    torch.save(m, join(path, f"{i}.pt"))
            else:
                path = f"models/{args.exp_name}"
                os.makedirs(path, exist_ok=True)
                torch.save(state_dict, join(path, "model.pt"))
            print('+')
        else:
            print()


        scheduler.step(final_score)
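Example #30 calls cutmix and cutmix_criterion without showing them. The helpers below are hypothetical implementations that merely match the call sites above (three classification heads, mixing ratio drawn from Beta(alpha, alpha)); they are not the author's originals:

import numpy as np
import torch
import torch.nn as nn

def cutmix(images, t1, t2, t3, alpha=1.0):
    lam = np.random.beta(alpha, alpha)
    perm = torch.randperm(images.size(0), device=images.device)
    h, w = images.size(2), images.size(3)
    cut_h, cut_w = int(h * np.sqrt(1.0 - lam)), int(w * np.sqrt(1.0 - lam))
    cy, cx = np.random.randint(h), np.random.randint(w)
    y1, y2 = np.clip(cy - cut_h // 2, 0, h), np.clip(cy + cut_h // 2, 0, h)
    x1, x2 = np.clip(cx - cut_w // 2, 0, w), np.clip(cx + cut_w // 2, 0, w)
    mixed = images.clone()
    mixed[:, :, y1:y2, x1:x2] = images[perm, :, y1:y2, x1:x2]
    lam = 1.0 - (y2 - y1) * (x2 - x1) / (h * w)  # adjust lambda to the actual patch area
    return mixed, (t1, t1[perm], t2, t2[perm], t3, t3[perm], lam)

def cutmix_criterion(out1, out2, out3, targets, loss_fn=nn.CrossEntropyLoss()):
    t1a, t1b, t2a, t2b, t3a, t3b, lam = targets
    return (lam * (loss_fn(out1, t1a) + loss_fn(out2, t2a) + loss_fn(out3, t3a))
            + (1.0 - lam) * (loss_fn(out1, t1b) + loss_fn(out2, t2b) + loss_fn(out3, t3b)))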