def __init__(self, tok_func: Callable = BaseProteinTokenizer, lang: str = 'prot', n_cpus: int = None):
    self.tok_func = tok_func
    self.lang = lang
    # Default to half the available CPUs so tokenization leaves headroom for other workers.
    self.n_cpus = n_cpus or num_cpus() // 2
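# Hedged usage sketch for the __init__ above. `ProteinTokenizer` is a
# hypothetical name for the enclosing class (only the constructor is shown
# here). Leaving n_cpus unset falls back to half the available CPUs:
tok = ProteinTokenizer()            # hypothetical class name
assert tok.n_cpus == num_cpus() // 2
tok = ProteinTokenizer(n_cpus=2)    # an explicit value takes precedence
assert tok.n_cpus == 2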
def test_num_cpus_without_sched_getaffinity(os_mock):
    # Platforms without os.sched_getaffinity (e.g. macOS, Windows) should fall back to os.cpu_count().
    os_mock.sched_getaffinity = Mock(side_effect=AttributeError)
    os_mock.cpu_count = Mock(return_value=3)
    assert core.num_cpus() == 3
def test_num_cpus_with_sched_getaffinity(os_mock):
    # num_cpus() should count the CPUs this process is allowed to run on.
    os_mock.sched_getaffinity = Mock(return_value=["foo", "bar"])
    assert core.num_cpus() == 2
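# For reference, a sketch of the implementation the two tests above imply
# (assumed, not copied from core): prefer the scheduler-affinity count, and
# fall back to os.cpu_count() where sched_getaffinity does not exist.
def num_cpus() -> int:
    try:
        return len(os.sched_getaffinity(0))
    except AttributeError:
        return os.cpu_count()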
def main(gpu: Param("GPU to run on", str) = None,
         max_cpu_per_dataloader: Param("Max CPU", int, opt=True) = 8,
         bs: Param("batch size", int) = 256,
         fp16: Param("mixed precision", int, opt=True) = 0,
         use_sp_processor: Param("use sentence piece as processor", int) = 0,
         sp_model: Param("sentence piece trained model file", str) = None,
         sp_vocab: Param("sentence piece trained vocab file", str) = None,
         ):
    datetime_str = f'{datetime.now():%Y-%m-%d_%H-%M-%S%z}'
    random_seed = 0
    max_vocab = 30000
    print('max_cpu_per_dataloader', max_cpu_per_dataloader, 'bs', bs, 'fp16', fp16,
          'sp_processor', use_sp_processor, 'sp_model', sp_model, 'sp_vocab', sp_vocab)

    """## Prepare Dataset"""
    local_project_path = './data/sprot_lm/'

    #### Distributed
    print('gpu', gpu)
    gpu = setup_distrib(gpu)
    n_gpus = num_distrib()
    # Split the CPU budget across GPUs so dataloader workers don't oversubscribe cores.
    if n_gpus > 0:
        workers = min(max_cpu_per_dataloader, num_cpus() // n_gpus)
    else:
        workers = min(max_cpu_per_dataloader, num_cpus())
    print(gpu, 'n_gpus', n_gpus)
    print(gpu, 'workers', workers)

    """## Prepare fastai"""
    np.random.seed(random_seed)
    if not os.path.exists(local_project_path):
        os.makedirs(local_project_path)
    print('local_project_path:', local_project_path)

    """## Tokenization"""
    tokenizer = Tokenizer(tok_func=dna_tokenizer, pre_rules=[], post_rules=[], special_cases=[])
    processor = [TokenizeProcessor(tokenizer=tokenizer, include_bos=True, include_eos=True),
                 NumericalizeProcessor(max_vocab=max_vocab)]
    df = pickle.load(open('./data/sprot_lm/sprot_sequence_taxon_anc.pickle', 'rb'))
    if use_sp_processor:
        # e.g. './data/sprot_lm/tmp/spm.model', './data/sprot_lm/tmp/spm.vocab'
        processor = [OpenFileProcessor(),
                     SPProcessor(sp_model=sp_model, sp_vocab=sp_vocab,
                                 max_sentence_len=35826, max_vocab_sz=max_vocab)]

    data_lm = (TextList.from_df(df, path=local_project_path, cols='seq_anc_tax', processor=processor)
               .split_by_rand_pct(0.1, seed=random_seed)
               .label_for_lm()
               .databunch(bs=bs, num_workers=workers))
    data_lm.vocab.save(local_project_path + 'vocab_lm_sprot_seq_anc_tax-' + datetime_str + '.pickle')
    print('data_lm Training set size', len(data_lm.train_ds))
    print('data_lm Validation set size', len(data_lm.valid_ds))
    print('vocab size ', len(data_lm.vocab.itos))

    learn_lm = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.1, pretrained=False)
    if gpu is None:
        print(gpu, 'DataParallel')
        learn_lm.model = nn.DataParallel(learn_lm.model)
    else:
        print(gpu, 'to_distributed')
        learn_lm.to_distributed(gpu)
    if fp16:
        learn_lm.to_fp16()

    lr = 3e-3
    print(gpu, 'freeze')
    learn_lm.freeze()
    # One frozen epoch first; for unknown reasons, multi-GPU training fails without it.
    learn_lm.fit_one_cycle(1, lr, moms=(0.8, 0.7))
    print(gpu, 'unfreeze')
    learn_lm.unfreeze()
    learn_lm.fit_one_cycle(10, lr * 10, moms=(0.8, 0.7))
    learn_lm.save('lm-sp-anc-v1-1-' + datetime_str)
    learn_lm.save_encoder('lm-sp-anc-v1-1-enc-' + datetime_str)
    learn_lm.fit_one_cycle(10, lr, moms=(0.8, 0.7))
    learn_lm.save('lm-sp-anc-v1-2-' + datetime_str)
    learn_lm.save_encoder('lm-sp-anc-v1-2-enc-' + datetime_str)
    learn_lm.fit_one_cycle(10, lr / 10, moms=(0.8, 0.7))
    learn_lm.save('lm-sp-anc-v1-3-' + datetime_str)
    learn_lm.save_encoder('lm-sp-anc-v1-3-enc-' + datetime_str)
    learn_lm.export(file='export-lm-sp-anc-v1-3-' + datetime_str + '.pkl')
    print('Done')
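# Assumed CLI wiring, fastai v1 style: the Param annotations on `main` imply
# the script is driven by fastai's script utilities (the decorator would sit
# directly above `main` in the full file):
#
#   from fastai.script import call_parse, Param
#
#   @call_parse
#   def main(...): ...
#
# Example runs (the script name `train_lm.py` is hypothetical):
#   python train_lm.py --gpu 0 --bs 128 --fp16 1
#   python -m fastai.launch train_lm.py --bs 128 --fp16 1   # multi-GPU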