def train(
    dataset_name: str,
    model_name: str,
    expt_dir: str,
    data_folder: str,
    num_workers: int = 0,
    is_test: bool = False,
    resume_from_checkpoint: str = None,
):
    seed_everything(SEED)
    dataset_main_folder = data_folder
    vocab = Vocabulary.load(join(dataset_main_folder, "vocabulary.pkl"))

    if model_name == "code2seq":
        config_function = get_code2seq_test_config if is_test else get_code2seq_default_config
        config = config_function(dataset_main_folder)
        model = Code2Seq(config, vocab, num_workers)
        model.half()
    # elif model_name == "code2class":
    #     config_function = get_code2class_test_config if is_test else get_code2class_default_config
    #     config = config_function(dataset_main_folder)
    #     model = Code2Class(config, vocab, num_workers)
    else:
        raise ValueError(f"Model {model_name} is not supported")

    # define logger
    wandb_logger = WandbLogger(project=f"{model_name}-{dataset_name}", log_model=True, offline=True)
    wandb_logger.watch(model)

    # define model checkpoint callback
    model_checkpoint_callback = ModelCheckpoint(
        filepath=join(expt_dir, "{epoch:02d}-{val_loss:.4f}"),
        period=config.hyperparams.save_every_epoch,
        save_top_k=3,
    )

    # define early stopping callback
    early_stopping_callback = EarlyStopping(patience=config.hyperparams.patience, verbose=True, mode="min")

    # use gpu if it exists
    gpu = 1 if torch.cuda.is_available() else None

    # define learning rate logger
    lr_logger = LearningRateLogger()

    trainer = Trainer(
        max_epochs=20,
        gradient_clip_val=config.hyperparams.clip_norm,
        deterministic=True,
        check_val_every_n_epoch=config.hyperparams.val_every_epoch,
        row_log_interval=config.hyperparams.log_every_epoch,
        logger=wandb_logger,
        checkpoint_callback=model_checkpoint_callback,
        early_stop_callback=early_stopping_callback,
        resume_from_checkpoint=resume_from_checkpoint,
        gpus=gpu,
        callbacks=[lr_logger],
        reload_dataloaders_every_epoch=True,
    )

    trainer.fit(model)
    trainer.save_checkpoint(join(expt_dir, "Latest.ckpt"))
    trainer.test()
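# A minimal, hedged sketch of how `train` might be exposed as a command line
# entry point. The argparse flag names below are illustrative assumptions and
# are not taken from the original project.
if __name__ == "__main__":
    import argparse

    arg_parser = argparse.ArgumentParser(description="Train a code2seq model")
    arg_parser.add_argument("--dataset-name", type=str, required=True)
    arg_parser.add_argument("--model", type=str, default="code2seq")
    arg_parser.add_argument("--expt-dir", type=str, required=True)
    arg_parser.add_argument("--data-folder", type=str, required=True)
    arg_parser.add_argument("--num-workers", type=int, default=0)
    arg_parser.add_argument("--test", action="store_true", help="use the test config instead of the default one")
    arg_parser.add_argument("--resume", type=str, default=None, help="path to a checkpoint to resume from")
    args = arg_parser.parse_args()

    train(
        dataset_name=args.dataset_name,
        model_name=args.model,
        expt_dir=args.expt_dir,
        data_folder=args.data_folder,
        num_workers=args.num_workers,
        is_test=args.test,
        resume_from_checkpoint=args.resume,
    )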
def preprocess(problem: str, data: str, is_vocab_collected: bool, n_jobs: int):
    # Collect vocabulary from train holdout if needed
    if problem not in _config_switcher:
        raise ValueError(f"Unknown problem ({problem}) passed")
    config_function = _config_switcher[problem]
    config = config_function(data)

    vocab_path = path.join(DATA_FOLDER, config.dataset_name, "vocabulary.pkl")
    if path.exists(vocab_path):
        vocab = Vocabulary.load(vocab_path)
    else:
        vocab = collect_vocabulary(config) if is_vocab_collected else convert_vocabulary(config)
        vocab.dump(vocab_path)

    for holdout in ["train", "val", "test"]:
        convert_holdout(holdout, vocab, config, n_jobs)
def preprocess(
    problem: str,
    data: str,
    is_vocab_collected: bool,
    n_jobs: int,
    data_folder: str,
    just_test: bool,
    test_name: str,
):
    # Collect vocabulary from train holdout if needed
    if problem not in _config_switcher:
        raise ValueError(f"Unknown problem ({problem}) passed")
    config_function = _config_switcher[problem]
    config = config_function(data)

    vocab_path = path.join(data_folder, "vocabulary.pkl")
    if path.exists(vocab_path):
        vocab = Vocabulary.load(vocab_path)
    else:
        vocab = (
            collect_vocabulary(config, data_folder)
            if is_vocab_collected
            else convert_vocabulary(config, data_folder)
        )
        vocab.dump(vocab_path)

    split = ["train", "val", "test"]
    if just_test:
        split = ["test"]
    for holdout in split:
        convert_holdout(holdout, vocab, config, n_jobs, data_folder, test_name)
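# A minimal, hedged usage sketch for the extended `preprocess` above. The
# problem key, dataset identifier, folder layout, and split name are
# illustrative assumptions; valid problem keys are whatever `_config_switcher`
# defines in this project, and `run_preprocess_example` is a hypothetical helper.
def run_preprocess_example() -> None:
    preprocess(
        problem="code2seq",            # assumed key; must be present in _config_switcher
        data="java-small",             # assumed dataset identifier passed to the config function
        is_vocab_collected=False,      # False -> convert_vocabulary, True -> collect_vocabulary
        n_jobs=4,
        data_folder="data/java-small",
        just_test=False,               # set True to convert only the test holdout
        test_name="test",
    )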