def main(data_path, version, config_args, train_args, func, save_dir, pretrain_state=None):
    if pretrain_state:
        pretrain_vocab = {'itos': pretrain_state['itos'], 'stoi': pretrain_state['stoi']}
        state_dict = pretrain_state['state_dict']
    else:
        pretrain_vocab = None
        state_dict = None

    # get device
    device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'

    # load pretrain dataset
    games = open(data_path).read()

    # build datasets
    print('\nProcessing dataset...')
    train_dataset = dataset.Directory(games, version, config_args, pretrain_vocab)()

    # build model config
    mconf = model.GPTConfig(
        vocab_size=train_dataset.vocab_size,
        args_dict=config_args
    )

    # build model
    gpt_model = model.GPT(mconf)
    gpt_model = gpt_model.to(device)

    train_config = trainer.TrainerConfig(func=func,
                                         state_dict=state_dict,
                                         args_dict=train_args)

    model_trainer = trainer.Trainer(gpt_model, train_dataset, save_dir, config=train_config)
    model_trainer.train()
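
# --- Hypothetical usage sketch (not part of the original script) ---
# A minimal example of how main() might be called when finetuning from a
# pretrained checkpoint. The paths, version tag, and hyperparameter dicts are
# placeholders/assumptions; only the checkpoint layout ('itos', 'stoi',
# 'state_dict') is taken from how checkpoints are loaded elsewhere in this repo.
def _example_finetune_run():
    pretrain_state = torch.load('ckpts/pretrain_default/model.pt',  # placeholder path
                                map_location='cpu')
    main(data_path='data/games.txt',   # placeholder path
         version=1,                    # placeholder version tag
         config_args={'n_layer': 4, 'n_head': 8, 'n_embd': 256},
         train_args={'max_epochs': 2, 'batch_size': 128},
         func='finetune',
         save_dir='ckpts/finetune_default',
         pretrain_state=pretrain_state)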
device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'

# Keep the block size 128
# Why is the pretraining corpus always required (even if we're not pretraining?)
# It's because we're using it as a hack to always have the same vocabulary
# (that is, the same mapping from character to integer, and we build the
# vocab from the pretraining corpus.)
block_size = 128
text = open(args.pretrain_corpus_path).read()
pretrain_dataset = dataset.CharCorruptionDataset(text, block_size)

# We don't suggest you change these hyperparameters, as they're known to work.
# use them for both the vanilla and the synthesizer models
mconf = model.GPTConfig(pretrain_dataset.vocab_size, pretrain_dataset.block_size,
                        n_layer=4, n_head=8, n_embd=256)
"""
Don't change above here; write your code below
"""
if args.variant == 'vanilla':
    pass  # TODO [part c]: Make some model here
elif args.variant == 'synthesizer':
    pass  # TODO [part g]: Make some other model here

# From here on, your code should be identical independent of which
# variant (vanilla or synthesizer) has been chosen.
if args.function == 'pretrain':
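
# A sketch of what the TODO in [part c] above typically amounts to (an
# assumption here, though it matches the completed variant shown later in this
# document): instantiate the GPT with the fixed config and move it to the
# device, e.g.
#
#     model = model.GPT(mconf)
#     model = model.to(device)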
    assert os.path.isfile(test_file)
else:
    assert os.path.isfile(chept_ckpt)

suffix = '_with_chept' if chept_ckpt else '_score_eval'
args.save_name = comm_ckpt.split('/')[1] + suffix

# get ckpt
comm_ckpt = torch.load(comm_ckpt, map_location=torch.device(device))
comm_model_config = comm_ckpt['model_config']
comm_itos = comm_ckpt['itos']
comm_stoi = comm_ckpt['stoi']
comm_vocabs = {'itos': comm_itos, 'stoi': comm_stoi}

# build model config
comm_mconf = model.GPTConfig(vocab_size=len(comm_itos),
                             args_dict=comm_model_config.__dict__)

# load model weights
comm_model = model.GPT(comm_mconf)
comm_model = comm_model.to(device)
comm_model.load_state_dict(comm_ckpt['state_dict'])

if chept_ckpt:
    chept_ckpt = torch.load(chept_ckpt, map_location=torch.device(device))
    chept_model_config = chept_ckpt['model_config']
    chept_itos = chept_ckpt['itos']
    chept_stoi = chept_ckpt['stoi']
    chept_vocabs = {'itos': chept_itos, 'stoi': chept_stoi}
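    # Presumably the ChePT model is then built the same way as comm_model above;
    # the excerpt is truncated here, so this continuation is an assumption:
    #
    #     chept_mconf = model.GPTConfig(vocab_size=len(chept_itos),
    #                                   args_dict=chept_model_config.__dict__)
    #     chept_model = model.GPT(chept_mconf)
    #     chept_model = chept_model.to(device)
    #     chept_model.load_state_dict(chept_ckpt['state_dict'])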
device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'

# Keep the block size 128
# Why is the pretraining corpus always required (even if we're not pretraining?)
# It's because we're using it as a hack to always have the same vocabulary
# (that is, the same mapping from character to integer, and we build the
# vocab from the pretraining corpus.)
block_size = 128
text = open(args.pretrain_corpus_path).read()
pretrain_dataset = dataset.CharCorruptionDataset(text, block_size)

# We suggest you don't change these hyperparameters, as they're known to work.
# use them for both the vanilla and the synthesizer models
mconf = model.GPTConfig(pretrain_dataset.vocab_size, pretrain_dataset.block_size,
                        n_layer=4, n_head=8, n_embd=256)
"""
Don't change above here; write your code below
"""
if args.variant == 'vanilla':
    model = model.GPT(mconf)
    model = model.to(device)
elif args.variant == 'synthesizer':
    mconf = model.GPTConfig(vocab_size=pretrain_dataset.vocab_size,
                            block_size=pretrain_dataset.block_size,
                            n_layer=4, n_head=8, n_embd=256,
                            synthesizer=True)
parser.add_argument('--n_tries', type=int, default=5,
                    help='Number of retries to give ChePT')
args = parser.parse_args()

if not args.ckpt:
    ckpt_path = get_recent_ckpt('ckpts/finetune_default')
    print("\nWARNING: NO CHECKPOINT GIVEN")
    print(f"Using {ckpt_path}")
else:
    ckpt_path = args.ckpt

args.save_name = ckpt_path.split('/')[1]

# get ckpt
ckpt = torch.load(ckpt_path, map_location=torch.device(device))
model_config = ckpt['model_config']
itos = ckpt['itos']
stoi = ckpt['stoi']

# build model config
mconf = model.GPTConfig(
    vocab_size=len(itos),
    args_dict=model_config.__dict__
)

# load model weights
gpt_model = model.GPT(mconf)
gpt_model = gpt_model.to(device)
gpt_model.load_state_dict(ckpt['state_dict'])

main(gpt_model, stoi, itos, args)
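
# Example command line (hypothetical script name; --ckpt and --n_tries are the
# only flags visible in this excerpt, and --ckpt may be omitted to fall back to
# the most recent checkpoint in ckpts/finetune_default, as implemented above):
#
#     python play.py --ckpt ckpts/finetune_default/model.pt --n_tries 5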
device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'

# Keep the block size 128
# Why is the pretraining corpus always required (even if we're not pretraining?)
# It's because we're using it as a hack to always have the same vocabulary
# (that is, the same mapping from character to integer, and we build the
# vocab from the pretraining corpus.)
block_size = 128
text = open(args.pretrain_corpus_path).read()
pretrain_dataset = dataset.CharCorruptionDataset(text, block_size)

# We don't suggest you change these hyperparameters, as they're known to work.
# use them for both the vanilla and the synthesizer models
mconf = model.GPTConfig(pretrain_dataset.vocab_size, pretrain_dataset.block_size,
                        n_layer=4, n_head=8, n_embd=256)
"""
Don't change above here; write your code below
"""
if args.variant == 'vanilla':
    # [part c]:
    mconf = model.GPTConfig(pretrain_dataset.vocab_size, pretrain_dataset.block_size,
                            n_layer=4, n_head=8, n_embd=256, variant="vanilla")
elif args.variant == 'synthesizer':