def finetune_lm(train_df, config, arch, args):
    # Fine-tune the pre-trained language model on the target corpus
    if args.lang != 'en':
        tok = SentencePieceTokenizer(lang=args.lang, sp_model=SP_MODEL_LOCAL_PATH)
    else:
        tok = None

    blocks = TextBlock.from_df(args.text_col, is_lm=True, tok=tok)
    data_block = DataBlock(blocks=blocks,
                           get_x=ColReader("text"),
                           splitter=RandomSplitter(valid_pct=VAL_SIZE, seed=RANDOM_STATE))
    lm_dataloaders = data_block.dataloaders(train_df, bs=args.batch_size, backwards=args.bw)

    if args.lang == 'en':
        pretrained_filenames = None
    else:
        pretrained_filenames = [WEIGHTS_PRETRAINED_FILE, VOCAB_PRETRAINED_FILE]

    learner_lm = language_model_learner(
        lm_dataloaders, arch, config=config, path=LM_MODEL_PATH,
        pretrained=True, pretrained_fnames=pretrained_filenames).to_fp32()

    lr = find_best_lr(learner_lm)
    learner_lm = fit_with_gradual_unfreezing(learner_lm, args.epochs, lr, args)
    learner_lm.save_encoder(ENCODER_FILE_NAME)
    return lm_dataloaders
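
# A minimal sketch of what the two helpers used above might look like; the real
# `find_best_lr` and `fit_with_gradual_unfreezing` live elsewhere in this project,
# so treat these as assumptions, not the actual implementations. `find_best_lr`
# is assumed to wrap fastai's LR finder, and `fit_with_gradual_unfreezing` to
# follow the standard ULMFiT schedule (unfreeze deeper layer groups step by step).
def find_best_lr_sketch(learner):
    # valley suggestion from the LR finder (recent fastai 2.x; older releases
    # return lr_min / lr_steep instead)
    return learner.lr_find().valley


def fit_with_gradual_unfreezing_sketch(learner, epochs, lr, args):
    learner.freeze()                      # train only the newly added head first
    learner.fit_one_cycle(1, lr)
    for layer_group in (-2, -3):          # progressively unfreeze deeper groups
        learner.freeze_to(layer_group)
        learner.fit_one_cycle(1, lr / 2)
    learner.unfreeze()                    # finish with the whole model unfrozen
    learner.fit_one_cycle(max(epochs - 3, 1), lr / 10)
    return learner
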
def __init__(self, data_path: str = 'lang_model', emb_sz: int = 800, qrnn: bool = False,
             bidir: bool = False, n_layers: int = 4, n_hid: int = 2500, bs: int = 104,
             bptt: int = 67, lr: float = 0.0013, wd: float = .012, one_cycle: bool = True,
             cycle_len: int = 1) -> None:
    """
    Instantiate an AWD_LSTM language model with the given hyper-parameters.

    data_path: str    path where the databunch is loaded from
    emb_sz: int       size of word embeddings
    qrnn: bool        whether or not to use QRNN (requires cuDNN)
    bidir: bool       whether the RNN should be bi-directional
    n_layers: int     number of layers in the language model
    n_hid: int        number of hidden units per layer
    lr: float         learning rate
    wd: float         weight decay
    bptt: int         back-propagation through time; max sequence length over
                      which gradients are accumulated
    bs: int           batch size
    one_cycle: bool   whether to train with the one-cycle policy
    cycle_len: int    number of epochs per training cycle

    The hyper-parameters are stored in a fastai dict called
    `fastai.text.models.awd_lstm_lm_config`:

        {'emb_sz': 400, 'n_hid': 1150, 'n_layers': 3, 'pad_token': 1, 'qrnn': False,
         'bidir': False, 'output_p': 0.1, 'hidden_p': 0.15, 'input_p': 0.25,
         'embed_p': 0.02, 'weight_p': 0.2, 'tie_weights': True, 'out_bias': True}
    """
    self.lr, self.wd, self.one_cycle, self.cycle_len = lr, wd, one_cycle, cycle_len
    awd_lstm_lm_config.update(
        dict(emb_sz=emb_sz, qrnn=qrnn, bidir=bidir, n_layers=n_layers, n_hid=n_hid))

    # log params
    wb_handle = wandb.init(config=awd_lstm_lm_config)
    wandb.config.update({'data_path': str(data_path), 'bs': bs, 'bptt': bptt, 'lr': lr})
    self.csv_name = 'history_' + wb_handle.name
    wandb.config.update({'csvlog_save_path': self.csv_name})

    # instantiate databunch
    self.data_lm = load_data(data_path, bs=bs, bptt=bptt)

    # instantiate language model
    self.learn = language_model_learner(data=self.data_lm, arch=AWD_LSTM, pretrained=False,
                                        model_dir=Path('models_' + wb_handle.name),
                                        config=awd_lstm_lm_config)
    self.full_model_path = str(self.learn.path / self.learn.model_dir)
    wandb.config.update({'model_save_path': self.full_model_path})

    # prepare callbacks
    escb = EarlyStoppingCallback(learn=self.learn, patience=2)
    smcb = SaveModelCallback(learn=self.learn, name='best_' + wb_handle.name)
    rpcb = ReduceLROnPlateauCallback(learn=self.learn, patience=1)
    csvcb = CSVLogger(learn=self.learn, filename=self.csv_name)
    wb = WandbCallback(self.learn)
    self.callbacks = [escb, smcb, rpcb, csvcb, wb]

    self.fit()
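
# A hedged sketch of the `fit` method invoked at the end of `__init__`; the real
# method is defined elsewhere on this class, so the body below is an assumption.
# It is presumed to dispatch on `self.one_cycle` and hand the prepared callbacks
# to the fastai v1 training loop.
def fit_sketch(self):
    if self.one_cycle:
        # one-cycle policy: annealed LR schedule over `cycle_len` epochs
        self.learn.fit_one_cycle(self.cycle_len, max_lr=self.lr, wd=self.wd,
                                 callbacks=self.callbacks)
    else:
        self.learn.fit(self.cycle_len, lr=self.lr, wd=self.wd,
                       callbacks=self.callbacks)
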
                   splitter=RandomSplitter(0.1))
dls_lm = dls_lm.dataloaders(path, path=path, bs=bs, seq_len=80)
print(dls_lm.show_batch(max_n=3))

# #%%
# learn = language_model_learner(
#     dls_lm, AWD_LSTM, drop_mult=0.3,
#     metrics=[accuracy, Perplexity()]).to_fp16()
# learn.lr_find()
# print(learn.model)
# learn.fit_one_cycle(1, 2e-2, moms=(0.8, 0.7, 0.8), cbs=cbs)
# learn.save("1epoch")

# #%%
learn = language_model_learner(dls_lm, AWD_LSTM, drop_mult=0.3,
                               metrics=[accuracy, Perplexity()]).to_fp16()
learn = learn.load("1epoch")
print(learn.model)

# #%%
# learn.unfreeze()
# learn.fit_one_cycle(1, 2e-3, moms=(0.8, 0.7, 0.8), cbs=cbs)
# learn.save("1epochs_finetuned")
# learn.save_encoder("finetuned")

# #%%
# # Classification


def read_tokenized_file(f):
    return L(f.read_text().split(' '))
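
# #%%
# A hedged sketch (not from the original notebook) of how `read_tokenized_file`
# could plug into the classification DataBlock: the text files are assumed to
# already contain space-separated tokens, so the function serves directly as the
# TextBlock's `tok_tfm`, and the language-model vocab is reused so the fine-tuned
# encoder can be loaded later. The train/test folder layout and the helper name
# are illustrative assumptions.
from functools import partial


def build_clas_dls_sketch(dls_lm, path, bs=64):
    return DataBlock(
        blocks=(TextBlock(tok_tfm=read_tokenized_file, vocab=dls_lm.vocab, seq_len=80),
                CategoryBlock),
        get_items=partial(get_text_files, folders=['train', 'test']),
        get_y=parent_label,
        splitter=GrandparentSplitter(valid_name='test'),
    ).dataloaders(path, path=path, bs=bs, seq_len=80)
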
def main(gpu: Param("GPU to run on", str) = None,
         max_cpu_per_dataloader: Param("Max CPU", int, opt=True) = 8,
         bs: Param("batch size", int) = 256,
         fp16: Param("mixed precision", int, opt=True) = 0,
         use_sp_processor: Param("use sentence piece as processor", int) = 0,
         sp_model: Param("sentence piece trained model file", str) = None,
         sp_vocab: Param("sentence piece trained vocab file", str) = None,
         ):
    datetime_str = f'{datetime.now():%Y-%m-%d_%H-%M-%S%z}'
    random_seed = 0
    max_vocab = 30000
    print('max_cpu_per_dataloader', max_cpu_per_dataloader, 'bs', bs, 'fp16', fp16,
          'sp_processor', use_sp_processor, 'sp_model', sp_model, 'sp_vocab', sp_vocab)

    """## Prepare Dataset"""
    local_project_path = './data/sprot_lm/'

    # Distributed setup
    print('gpu', gpu)
    gpu = setup_distrib(gpu)
    n_gpus = num_distrib()
    if n_gpus > 0:
        workers = min(max_cpu_per_dataloader, num_cpus() // n_gpus)
    else:
        workers = min(max_cpu_per_dataloader, num_cpus())
    print(gpu, 'n_gpus', n_gpus)
    print(gpu, 'workers', workers)

    """## Prepare fastai"""
    np.random.seed(random_seed)
    if not os.path.exists(local_project_path):
        os.makedirs(local_project_path)
    print('local_project_path:', local_project_path)

    """## Tokenization"""
    tokenizer = Tokenizer(tok_func=dna_tokenizer, pre_rules=[], post_rules=[], special_cases=[])
    processor = [TokenizeProcessor(tokenizer=tokenizer, include_bos=True, include_eos=True),
                 NumericalizeProcessor(max_vocab=max_vocab)]

    df = pickle.load(open('./data/sprot_lm/sprot_sequence_taxon_anc.pickle', 'rb'))

    if use_sp_processor:
        # './data/sprot_lm/tmp/spm.model', './data/sprot_lm/tmp/spm.vocab'
        processor = [OpenFileProcessor(),
                     SPProcessor(sp_model=sp_model, sp_vocab=sp_vocab,
                                 max_sentence_len=35826, max_vocab_sz=max_vocab)]

    data_lm = (TextList.from_df(df, path=local_project_path, cols='seq_anc_tax',
                                processor=processor)
               .split_by_rand_pct(0.1, seed=random_seed)
               .label_for_lm()
               .databunch(bs=bs, num_workers=workers))
    data_lm.vocab.save(local_project_path + 'vocab_lm_sprot_seq_anc_tax-' + datetime_str + '.pickle')

    print('data_lm Training set size', len(data_lm.train_ds))
    print('data_lm Validation set size', len(data_lm.valid_ds))
    print('vocab size ', len(data_lm.vocab.itos))

    learn_lm = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.1, pretrained=False)

    if gpu is None:
        print(gpu, 'DataParallel')
        learn_lm.model = nn.DataParallel(learn_lm.model)
    else:
        print(gpu, 'to_distributed')
        learn_lm.to_distributed(gpu)

    if fp16:
        learn_lm.to_fp16()

    lr = 3e-3

    print(gpu, 'freeze')
    learn_lm.freeze()
    learn_lm.fit_one_cycle(1, lr, moms=(0.8, 0.7))
    # I don't know why multi-GPU doesn't work without first freezing

    print(gpu, 'unfreeze')
    learn_lm.unfreeze()
    learn_lm.fit_one_cycle(10, lr * 10, moms=(0.8, 0.7))
    learn_lm.save('lm-sp-anc-v1-1-' + datetime_str)
    learn_lm.save_encoder('lm-sp-ans-v1-1-enc-' + datetime_str)

    learn_lm.fit_one_cycle(10, lr, moms=(0.8, 0.7))
    learn_lm.save('lm-sp-anc-v1-2-' + datetime_str)
    learn_lm.save_encoder('lm-sp-ans-v1-2-enc-' + datetime_str)

    learn_lm.fit_one_cycle(10, lr / 10, moms=(0.8, 0.7))
    learn_lm.save('lm-sp-anc-v1-3-' + datetime_str)
    learn_lm.save_encoder('lm-sp-ans-v1-3-enc-' + datetime_str)
    learn_lm.export(file='export-lm-sp-ans-v1-3-' + datetime_str + '.pkl')

    print('Done')
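

# A hedged sketch of what `dna_tokenizer` (referenced above but defined elsewhere
# in this repository) might look like. In fastai v1, `Tokenizer(tok_func=...)`
# calls `tok_func(lang)` to build a tokenizer object exposing a `tokenizer(t)`
# method; for protein/DNA sequences a character-level split is one plausible
# choice, but the real tokenizer may differ. The class name is hypothetical.
from fastai.text import BaseTokenizer


class DNATokenizerSketch(BaseTokenizer):
    def __init__(self, lang='en'):
        super().__init__(lang=lang)

    def tokenizer(self, t):
        # split each whitespace-separated field into single residues/characters
        # when it is purely alphabetic, otherwise keep it as one token (e.g. ids)
        return [tok for word in t.split()
                for tok in (list(word) if word.isalpha() else [word])]

    def add_special_cases(self, toks):
        pass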