def tokenizer_to_fastai_args(self, sp_data_func, use_moses):
    moses_preproc = [MosesPreprocessingFunc(self.lang)] if use_moses else []
    if self.tokenizer is Tokenizers.SUBWORD or self.tokenizer is Tokenizers.BROKENSUBWORD:
        if self.base_lm_path and not (self.cache_dir / "spm.model").exists():
            # ensure we are using the same sentencepiece model as the base LM
            shutil.copy(self.base_lm_path / '..' / 'itos.pkl', self.cache_dir)
            shutil.copy(self.base_lm_path / '..' / 'spm.model', self.cache_dir)
            shutil.copy(self.base_lm_path / '..' / 'spm.vocab', self.cache_dir)
        args = get_sentencepiece(self.cache_dir, sp_data_func, vocab_size=self.max_vocab,
                                 lang=self.lang,
                                 pre_rules=moses_preproc + defaults.text_pre_rules,
                                 post_rules=defaults.text_post_rules)
    elif self.tokenizer is Tokenizers.MOSES:
        args = dict(tokenizer=Tokenizer(tok_func=BaseTokenizer, lang=self.lang,
                                        pre_rules=moses_preproc + [replace_std_toks],
                                        post_rules=[]))
    elif self.tokenizer is Tokenizers.MOSES_FA:
        args = dict(tokenizer=Tokenizer(tok_func=BaseTokenizer, lang=self.lang,
                                        pre_rules=moses_preproc + defaults.text_pre_rules,
                                        post_rules=defaults.text_post_rules))
    elif self.tokenizer is Tokenizers.FASTAI:
        args = dict()
    else:
        raise ValueError(
            f"self.tokenizer has an invalid value {self.tokenizer}; allowed values are taken from {Tokenizers}")
    return args
def tokenizer_to_fastai_args(self, trn_data_loading_func, add_moses):
    tok_func = MosesTokenizerFunc if add_moses else BaseTokenizer
    if self.tokenizer is Tokenizers.SUBWORD:
        if self.base_lm_path:
            # ensure we are using the same sentencepiece model as the base LM
            shutil.copy(self.base_lm_path / '..' / 'itos.pkl', self.cache_dir)
            shutil.copy(self.base_lm_path / '..' / 'spm.model', self.cache_dir)
            shutil.copy(self.base_lm_path / '..' / 'spm.vocab', self.cache_dir)
        args = get_sentencepiece(self.cache_dir, trn_data_loading_func,
                                 vocab_size=self.max_vocab, use_moses=add_moses,
                                 lang=self.lang)
    elif self.tokenizer is Tokenizers.MOSES:
        args = dict(tokenizer=Tokenizer(tok_func=tok_func, lang=self.lang,
                                        pre_rules=[replace_std_toks], post_rules=[]))
    elif self.tokenizer is Tokenizers.MOSES_FA:
        # use the default fastai pre/post rules
        args = dict(tokenizer=Tokenizer(tok_func=tok_func, lang=self.lang))
    elif self.tokenizer is Tokenizers.FASTAI:
        args = dict()
    else:
        raise ValueError(
            f"self.tokenizer has an invalid value {self.tokenizer}; allowed values are taken from {Tokenizers}")
    return args
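# A minimal usage sketch (not part of the original file): the dict returned by
# tokenizer_to_fastai_args is meant to be splatted into a fastai v1
# TextLMDataBunch factory, so the SUBWORD branch can inject its sentencepiece
# tokenizer/vocab while FASTAI falls through to the library defaults.
# `exp`, `trn_df` and `val_df` are hypothetical names, not defined here.
#
# args = exp.tokenizer_to_fastai_args(sp_data_func=lambda: trn_df['text'], use_moses=False)
# data_lm = TextLMDataBunch.from_df(exp.cache_dir, trn_df, val_df,
#                                   text_cols='text', bs=exp.bs, **args)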
def pretrain_lm(dir_path, lang='en', cuda_id=0, qrnn=True, subword=False,
                max_vocab=60000, bs=70, bptt=70, name='wt-103', num_epochs=10,
                bidir=False, ds_pct=1.0):
    """
    :param dir_path: the path to the directory containing the Wikipedia token files
    :param lang: the language unicode
    :param cuda_id: the id of the GPU. Uses GPU 0 by default or no GPU when run on CPU.
    :param qrnn: use a QRNN. Requires installing cupy.
    :param subword: use subword (sentencepiece) tokenization on the cleaned data
    :param max_vocab: the maximum size of the vocabulary
    :param bs: the batch size
    :param bptt: the back-propagation-through-time sequence length
    :param name: the name used for both the model and the vocabulary
    :param num_epochs: the number of training epochs
    :param bidir: whether the language model is bidirectional
    :param ds_pct: the fraction of the training and validation sets to use
    """
    results = {}
    if not torch.cuda.is_available():
        print('CUDA not available. Setting device=-1.')
        cuda_id = -1
    torch.cuda.set_device(cuda_id)

    dir_path = Path(dir_path)
    assert dir_path.exists(), f'Error: {dir_path} does not exist.'
    # unlike in train_clas, where the models location is absolute, model_dir is
    # relative to dir_path here, so it is no longer a parameter
    model_dir = dir_path / 'models'
    model_dir.mkdir(exist_ok=True)

    print('Batch size:', bs)
    print('Max vocab:', max_vocab)
    model_name = 'qrnn' if qrnn else 'lstm'
    if qrnn:
        print('Using QRNNs...')

    trn_path = dir_path / f'{lang}.wiki.train.tokens'
    val_path = dir_path / f'{lang}.wiki.valid.tokens'
    tst_path = dir_path / f'{lang}.wiki.test.tokens'
    for path_ in [trn_path, val_path, tst_path]:
        assert path_.exists(), f'Error: {path_} does not exist.'

    lm_type = contrib_data.LanguageModelType.BiLM if bidir else contrib_data.LanguageModelType.FwdLM
    if subword:
        # apply sentencepiece tokenization
        read_file(trn_path, 'train')
        read_file(val_path, 'valid')
        sp = get_sentencepiece(dir_path, trn_path, name, vocab_size=max_vocab)
        data_lm = TextLMDataBunch.from_csv(dir_path, 'train.csv', **sp,
                                           bs=bs, bptt=bptt, lm_type=lm_type)
        itos = data_lm.train_ds.vocab.itos
        stoi = data_lm.train_ds.vocab.stoi
    else:
        # read the already whitespace-separated data without any preprocessing
        trn_tok = read_whitespace_file(trn_path)
        val_tok = read_whitespace_file(val_path)
        if ds_pct < 1.0:
            trn_tok = trn_tok[:max(20, int(len(trn_tok) * ds_pct))]
            val_tok = val_tok[:max(20, int(len(val_tok) * ds_pct))]
            print(f"Limiting data sets to {ds_pct * 100}%, "
                  f"trn: {len(trn_tok)}, val: {len(val_tok)}")
        itos_fname = model_dir / f'itos_{name}.pkl'
        if not itos_fname.exists():
            # create the vocabulary from the most common training tokens
            cnt = Counter(word for sent in trn_tok for word in sent)
            itos = [o for o, c in cnt.most_common(n=max_vocab)]
            itos.insert(1, PAD)  # set pad id to 1 to conform to the fast.ai standard
            assert UNK in itos, f'Unknown words are expected to have been replaced with {UNK} in the data.'
            # save the vocabulary
            print(f"Saving vocabulary as {itos_fname}")
            results['itos_fname'] = itos_fname
            with open(itos_fname, 'wb') as f:
                pickle.dump(itos, f)
        else:
            print("Loading itos:", itos_fname)
            # the vocabulary was saved with pickle, so load it with pickle
            with open(itos_fname, 'rb') as f:
                itos = pickle.load(f)
        vocab = Vocab(itos)
        stoi = vocab.stoi
        trn_ids = np.array([[stoi.get(w, stoi[UNK]) for w in s] for s in trn_tok])
        val_ids = np.array([[stoi.get(w, stoi[UNK]) for w in s] for s in val_tok])
        data_lm = TextLMDataBunch.from_ids(path=dir_path, vocab=vocab,
                                           train_ids=trn_ids, valid_ids=val_ids,
                                           bs=bs, bptt=bptt, lm_type=lm_type)

    print('Size of vocabulary:', len(itos))
    print('First 10 words in vocab:', ', '.join([itos[i] for i in range(10)]))

    # these hyperparameters are for training on ~100M tokens (e.g. WikiText-103);
    # for training on smaller datasets, more dropout is necessary
    if qrnn:
        emb_sz, nh, nl = 400, 1550, 3
    else:
        emb_sz, nh, nl = 400, 1150, 3
    dps = np.array([0.25, 0.1, 0.2, 0.02, 0.15])
    drop_mult = 0.1
    fastai.text.learner.default_dropout['language'] = dps

    lm_learner = bilm_learner if bidir else language_model_learner
    learn = lm_learner(data_lm, bptt=bptt, emb_sz=emb_sz, nh=nh, nl=nl, pad_token=1,
                       drop_mult=drop_mult, tie_weights=True, model_dir=model_dir.name,
                       bias=True, qrnn=qrnn, clip=0.12)
    # compared to standard Adam, we set beta_1 to 0.8
    learn.opt_fn = partial(optim.Adam, betas=(0.8, 0.99))
    learn.true_wd = False
    print("true_wd:", learn.true_wd)
    learn.metrics = [accuracy_fwd, accuracy_bwd] if bidir else [accuracy]

    try:
        learn.load(f'{model_name}_{name}')
        print("Weights loaded")
    except FileNotFoundError:
        print("Starting from random weights")

    learn.fit_one_cycle(num_epochs, 5e-3, (0.8, 0.7), wd=1e-7)

    if not subword and max_vocab is None:
        # perplexity is only comparable to previous work when the unpreprocessed
        # data and the full vocabulary are used
        print(f"Validating model performance with test tokens from: {tst_path}")
        tst_tok = read_whitespace_file(tst_path)
        tst_ids = np.array([[stoi.get(w, stoi[UNK]) for w in s] for s in tst_tok])
        logloss, perplexity = validate(learn.model, tst_ids, bptt)
        print('Test logloss:', logloss.item(), 'perplexity:', perplexity.item())

    print(f"Saving models at {learn.path / learn.model_dir}")
    learn.save(f'{model_name}_{name}')
    opt_state_path = learn.path / learn.model_dir / f'{model_name}3_{name}_state.pth'
    print(f"Saving optimiser state at {opt_state_path}")
    torch.save(learn.opt.opt.state_dict(), opt_state_path)
    results['accuracy'] = learn.validate()[1]
    return results
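# A minimal usage sketch, with assumptions flagged: the data layout under
# data/wiki/en-100 is hypothetical, and exposing the function through `fire`
# mirrors how sibling scripts in this family are typically run, but is not
# confirmed by this file. A direct call works just as well:
#
#     python pretrain_lm.py data/wiki/en-100 --lang en --qrnn False --num-epochs 10
#
if __name__ == '__main__':
    import fire
    fire.Fire(pretrain_lm)  # assumed CLI wrapper around pretrain_lm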