import os

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import models

# create_datasets, load_checkpoint, train_model, weight_train and the
# DataParallelModel/DataParallelCriterion helpers are assumed to be defined
# elsewhere in this project.


def train():
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
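    # (Note: CUDA_VISIBLE_DEVICES only takes effect if set before CUDA is
    # first initialized in this process.)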
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loaders = create_datasets(num_workers=32, batch_size=600)
    # info = pd.read_csv("./flower_data/train.csv")[["image","label"]]
    # class_weights = torch.tensor(1.0/info.groupby(["label"]).count().values.astype(np.float32))
    # del info
    models_ensemble = [
        # {"name": "vgg", "model": models.vgg16_bn(pretrained=True)},
        {"name": "resnet", "model": models.resnet50(pretrained=True)},
        # {"name": "densenet", "model": models.densenet121(pretrained=True)},
        {"name": "resnet", "model": models.resnet101(pretrained=True)},
    ]

    # model = Ensemble(models_ensemble, name="star_ensemble")
    model = load_checkpoint("ensemble_iso_star_5118.pt")

    ft, cl = model.get_parameters()  # backbone ("ft") and classifier ("cl") parameter groups
    # model = nn.DataParallel(model)
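    # DataParallelModel (project helper, presumably in the style of
    # encoding.parallel) acts like nn.DataParallel but leaves the per-GPU
    # outputs scattered so DataParallelCriterion can score them in place.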
    model = DataParallelModel(model)
    model = model.to(device)
    # weight_train is assumed to be a module-level array of per-class weights
    # (cf. the commented-out class_weights computation above).
    weight = torch.from_numpy(weight_train[0]).to(device)
    criterion = nn.NLLLoss(weight)
    criterion = DataParallelCriterion(criterion)
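    # nn.NLLLoss expects log-probabilities, so the model's forward pass is
    # assumed to end in LogSoftmax; `weight` rebalances the per-class loss.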
  
    # Lower learning rate for the pretrained backbone, higher for the classifier head.
    optimizers = [optim.Adam(ft, lr=5e-4), optim.Adam(cl, lr=5e-3)]
    # # print("")
    # # print('-' * 40)
    # # print("lr = {} bs= {}".format(lr,bs) )
    # # print('-' * 40)

    # # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_schedulers = [lr_scheduler.StepLR(optimizers[0], step_size = 1, gamma = 0.995),
                        lr_scheduler.StepLR(optimizers[1], step_size = 1, gamma = 0.992) ]


    model = train_model(model, criterion, optimizers, exp_lr_schedulers, device,
                        loaders, num_epochs=100)
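A minimal sketch of the `Ensemble` wrapper and its `get_parameters()` split that the example above relies on. The class name and constructor signature come from the commented-out line in the code; everything else (logit averaging, the LogSoftmax head, detecting the classifier via `fc`/`classifier`) is an assumption, not the project's actual implementation:

import torch
import torch.nn as nn

class Ensemble(nn.Module):
    """Hypothetical reconstruction: averages member logits, returns log-probs."""

    def __init__(self, members, name="ensemble"):
        super().__init__()
        self.name = name
        self.models = nn.ModuleList(m["model"] for m in members)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        # Average the members' logits, then emit log-probabilities so the
        # nn.NLLLoss in train() receives what it expects.
        logits = torch.stack([m(x) for m in self.models]).mean(dim=0)
        return self.log_softmax(logits)

    def get_parameters(self):
        # Split parameters into backbone ("ft") and classifier-head ("cl")
        # groups so the two Adam optimizers can use different learning rates.
        ft, cl = [], []
        for m in self.models:
            head = m.fc if hasattr(m, "fc") else m.classifier
            head_ids = {id(p) for p in head.parameters()}
            for p in m.parameters():
                (cl if id(p) in head_ids else ft).append(p)
        return ft, cl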
Example #2
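# This example assumes the usual GLUE fine-tuning scaffolding at module level:
# setup_parser, processors, output_modes, MODEL_CLASSES, WEIGHTS_NAME, logger,
# set_seed, load_and_cache_examples, train, evaluate, save_embeddings and
# DataParallelModel, plus the os, glob, logging and torch imports.
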
def main():
    args = setup_parser()
    args.final_eval = False

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. "
            "Use --overwrite_output_dir to overwrite it.".format(args.output_dir))

    # Setup CUDA, GPU & distributed training
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    # Report 0 GPUs under --no_cuda so the DataParallel wrapping below is skipped.
    args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels(args.data_dir)
    num_labels = len(label_list)
    args.num_labels = num_labels

    # Load pretrained model and tokenizer
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case)
    model = model_class.from_pretrained(args.model_name_or_path, config=config)
    model.to(args.device)
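    # Config, tokenizer and weights all come from the same checkpoint unless
    # --config_name / --tokenizer_name point elsewhere.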

    # logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)

        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)
        if args.n_gpu > 1:
            model = DataParallelModel(model)

    # Evaluation
    results = {}
    if args.do_eval:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
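            # Gather every sub-directory that contains a saved weights file
            # (WEIGHTS_NAME), e.g. output_dir/checkpoint-500/.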
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
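            # Recover the step from the "checkpoint-<step>" directory name.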
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            if args.n_gpu > 1:
                model = DataParallelModel(model)
            args.final_eval = True
            result = evaluate(args, model, tokenizer, prefix=global_step)
            result = {'{}_{}'.format(k, global_step): v
                      for k, v in result.items()}
            results.update(result)

    if args.save_embeddings:
        save_embeddings(args, model, tokenizer)

    return results
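

# Entry point, assuming this example is run as a standalone script:
if __name__ == "__main__":
    main()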