def train():
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    loaders = create_datasets(num_workers=32, batch_size=600)

    # info = pd.read_csv("./flower_data/train.csv")[["image", "label"]]
    # class_weights = torch.tensor(1.0 / info.groupby(["label"]).count().values.astype(np.float32))
    # del info

    models_ensamble = [
        # {"name": "vgg", "model": models.vgg16_bn(pretrained=True)},
        {"name": "resnet", "model": models.resnet50(pretrained=True)},
        # {"name": "densenet", "model": models.densenet121(pretrained=True)},
        {"name": "resnet", "model": models.resnet101(pretrained=True)},
    ]

    # Resume from a saved ensemble checkpoint instead of building a fresh one.
    # model = Ensemble(models_ensamble, name="star_ensemble")
    model = load_checkpoint("ensemble_iso_star_5118.pt")
    # Backbone ("ft") and classifier ("cl") parameter groups get separate optimizers below.
    ft, cl = model.get_parameters()

    # model = nn.DataParallel(model)
    model = DataParallelModel(model)
    model = model.to(device)

    # Per-class weights (weight_train) are precomputed elsewhere in the script.
    weight = torch.from_numpy(weight_train[0]).to(device)
    criterion = nn.NLLLoss(weight)
    criterion = DataParallelCriterion(criterion)

    optimizers = [optim.Adam(ft, lr=5e-4), optim.Adam(cl, lr=5e-3)]

    # print("")
    # print('-' * 40)
    # print("lr = {} bs = {}".format(lr, bs))
    # print('-' * 40)

    # Decay each optimizer's LR slightly every epoch (gamma 0.995 / 0.992).
    exp_lr_schedulers = [
        lr_scheduler.StepLR(optimizers[0], step_size=1, gamma=0.995),
        lr_scheduler.StepLR(optimizers[1], step_size=1, gamma=0.992),
    ]

    model = train_model(model, criterion, optimizers, exp_lr_schedulers, device,
                        loaders, num_epochs=100)
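
# Note (illustrative sketch, not this repo's actual code): Ensemble.get_parameters()
# above is assumed to return two parameter groups, backbone features ("ft") and
# classifier heads ("cl"), so train() can give them different Adam learning rates.
# For a single torchvision ResNet, an equivalent split could look like the
# hypothetical helper below; the real Ensemble implementation may differ.
def split_resnet_parameters(resnet):
    # Everything except the final fully connected layer is treated as backbone.
    backbone = [p for name, p in resnet.named_parameters() if not name.startswith("fc.")]
    # The final fc layer is the classifier head that gets the larger learning rate.
    classifier = list(resnet.fc.parameters())
    return backbone, classifier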

def main():
    args = setup_parser()
    args.final_eval = False

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup CUDA, GPU & distributed training
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels(args.data_dir)
    num_labels = len(label_list)
    args.num_labels = num_labels

    # Load pretrained model and tokenizer
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case)
    model = model_class.from_pretrained(args.model_name_or_path, config=config)
    model.to(args.device)

    # logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use default names for the model, you can
    # reload it using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)
        if args.n_gpu > 1:
            model = DataParallelModel(model)

    # Evaluation
    results = {}
    if args.do_eval:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            if args.n_gpu > 1:
                model = DataParallelModel(model)
            args.final_eval = True
            result = evaluate(args, model, tokenizer, prefix=global_step)
            result = dict(
                (k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    if args.save_embeddings:
        save_embeddings(args, model, tokenizer)

    return results
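
# Example invocation, assuming this module is run as a script and that
# setup_parser() defines the flags referenced in main() above. The script name
# and paths below are placeholders, not taken from the original repo:
#
#   python run_glue.py \
#       --model_type bert \
#       --model_name_or_path bert-base-uncased \
#       --task_name <glue_task> \
#       --do_train --do_eval --do_lower_case \
#       --data_dir ./data/<glue_task> \
#       --output_dir ./output/<glue_task>
if __name__ == '__main__':
    main()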