def test_masked_by_flag(self):
    """Smoke test: build a GenNLPMaskedDataset in masked-by-flag mode.

    Constructs a tokenizer from the project vocab file and a dataset from a
    single test-corpus page with ``masked_by_flag=True`` and
    ``only_input=True``.

    NOTE(review): depends on hard-coded absolute paths under
    /client/user1/cuongdev — the test only runs on that machine; consider
    moving the paths into a fixture/config.
    """
    batch_paths = [
        '/client/user1/cuongdev/GenImputation/data/test/electra_G1K_22_hs37d5/corpus_dir/G1K_22_hs37d5_biallelic_test.r0000.b0000.page.gz'
    ]
    tokenizer = ElectraTokenizer(
        vocab_file=
        '/client/user1/cuongdev/GenImputation/data/train/electra_G1K_22_hs37d5/data_dir/vocab.txt'
    )
    test_dataset = GenNLPMaskedDataset(batch_paths,
                                       tokenizer,
                                       masked_by_flag=True,
                                       only_input=True)
    # The original ended with a bare `pass` and asserted nothing, so the
    # test could never fail on a bad dataset; at minimum verify construction
    # succeeded and produced a usable object.
    self.assertIsNotNone(test_dataset)
def test_how_model_run(self):
    """Smoke test: run a pretrained ElectraForMaskedLM over a small batch.

    Builds an eval dataset from one corpus page, loads a checkpointed
    masked-LM model, and feeds it items 0..3 from the dataset to confirm a
    forward pass completes.

    NOTE(review): depends on hard-coded absolute paths under
    /client/user1/cuongdev — runs only where that data/checkpoint exists.
    `train_paths` is built but never used; kept for parity with the training
    configuration this test mirrors.
    """
    vocab_file = '/client/user1/cuongdev/GenImputation/data/train/electra/data_dir/vocab.txt'
    train_paths = [
        '/client/user1/cuongdev/GenImputation/data/train/electra/corpus_dir_2048/G1K_VN_chr20_biallelic_train.r0000.b0000.page.gz',
        '/client/user1/cuongdev/GenImputation/data/train/electra/corpus_dir_2048/G1K_VN_chr20_biallelic_train.r0000.b0001.page.gz',
        '/client/user1/cuongdev/GenImputation/data/train/electra/corpus_dir_2048/G1K_VN_chr20_biallelic_train.r0000.b0002.page.gz',
        '/client/user1/cuongdev/GenImputation/data/train/electra/corpus_dir_2048/G1K_VN_chr20_biallelic_train.r0000.b0003.page.gz',
        '/client/user1/cuongdev/GenImputation/data/train/electra/corpus_dir_2048/G1K_VN_chr20_biallelic_train.r0000.b0004.page.gz',
        '/client/user1/cuongdev/GenImputation/data/train/electra/corpus_dir_2048/G1K_VN_chr20_biallelic_train.r0000.b0005.page.gz',
    ]
    eval_paths = [
        '/client/user1/cuongdev/GenImputation/data/train/electra/corpus_dir_2048/G1K_VN_chr20_biallelic_train.r0000.b0007.page.gz'
    ]
    tokenizer = ElectraTokenizer(vocab_file=vocab_file)
    eval_dataset = GenNLPMaskedDataset(eval_paths,
                                       tokenizer,
                                       seed=42,
                                       masked_per=0.15)
    model = ElectraForMaskedLM.from_pretrained(
        '/client/user1/cuongdev/GenImputation/data/train/electra/checkpoints/small_2048/checkpoint-8568'
    )
    # Index the dataset directly instead of calling __getitem__ explicitly;
    # behavior is identical, this is just the idiomatic spelling.
    output = model(**eval_dataset[[0, 1, 2, 3]])
    # The original ended with `pass` and asserted nothing; at minimum verify
    # the forward pass produced an output object.
    self.assertIsNotNone(output)
# NOTE(review): this chunk was flattened onto a single physical line during
# extraction and is TRUNCATED — the final `GenNLPMaskedDataset(...)` call ends
# on a trailing comma with its closing paren outside this view, so the code is
# left byte-identical rather than re-indented by guesswork.
#
# What the fragment visibly does:
#   * reads an optional 'masked_mode' entry from `config` (masked_flag_random
#     is True when it equals 'random');
#   * resolves train/test page file paths via page_config.get_file_paths;
#   * builds OTrainingArguments, ElectraConfig, and an ElectraTokenizer from
#     `config`;
#   * then loops over `regions`, formatting per-region save/output/logging
#     dirs (suffix `detail` — defined outside this view, presumably a run
#     label; confirm against the full file) and starts constructing the
#     per-region train dataset.
# Typos 'trainning'/'Prevert' live inside runtime print strings and therefore
# must not be edited here.
if 'masked_mode' in config: masked_mode = config['masked_mode'] masked_flag_random = masked_mode == 'random' train_region_paths = page_config.get_file_paths( config[page_config.file_train_prefix], page_config.page, regions, batchs) test_region_paths = page_config.get_file_paths( config[page_config.file_test_prefix], page_config.page, regions, [0]) vocab_file = config[page_config.vocab_file] save_dir = config[page_config.save_dir] training_args = OTrainingArguments(**config[page_config.train_args]) output_dir = training_args.output_dir logging_dir = training_args.logging_dir modeling_args = ElectraConfig(**config[page_config.model_args]) tokenizer = ElectraTokenizer(vocab_file=vocab_file) seed = training_args.seed for i, region in enumerate(regions): clear_output(wait=True) print('Region {} trainning...'.format(region)) print('Prevert region {} trainning...'.format(region - 1)) save_path = save_dir.format(region) + detail prevert_path = save_dir.format(region - 1) + detail training_args.output_dir = output_dir.format(region) + detail training_args.logging_dir = logging_dir.format(region) + detail ## Train and eval data train_batch_paths = train_region_paths[i] train_dataset = GenNLPMaskedDataset(train_batch_paths, tokenizer, seed=seed,
# NOTE(review): this chunk was flattened onto a single physical line during
# extraction and is truncated at BOTH ends, so the code is left byte-identical
# rather than re-indented by guesswork. It contains three partial units:
#   1. The tail of an if/elif chain selecting a model family from
#      `args.model` — the opening `if` (presumably the 'albert' test, given
#      the first branch body) sits before this view. Each branch builds the
#      matching HuggingFace tokenizer/config and loads weights with
#      `from_pretrained(pretrained_model_name_or_path=None, config=...,
#      state_dict=torch.load(args.model))`; unmatched models raise
#      NotImplementedError.
#   2. `process_line` — complete: parses one JSON line into
#      (space-split tokens of data['text'], data['targets']).
#   3. The bare header of `retokenize(tokens_labels)` — its body is outside
#      this view.
model_type = 'albert' tokenizer = AlbertTokenizer(vocab_file=args.tokenizer) config = AlbertConfig.from_json_file(args.config) model = AlbertModel.from_pretrained(pretrained_model_name_or_path=None, config=config, state_dict=torch.load(args.model)) elif 'bert' in args.model: model_type = 'bert' tokenizer = BertTokenizer(vocab_file=args.tokenizer) config = BertConfig.from_json_file(args.config) model = BertModel.from_pretrained(pretrained_model_name_or_path=None, config=config, state_dict=torch.load(args.model)) elif 'electra' in args.model: model_type = 'electra' tokenizer = ElectraTokenizer(vocab_file=args.tokenizer) config = ElectraConfig.from_json_file(args.config) model = ElectraModel.from_pretrained( pretrained_model_name_or_path=None, config=config, state_dict=torch.load(args.model)) else: raise NotImplementedError("The model is currently not supported") def process_line(line): data = json.loads(line) tokens = data['text'].split(' ') labels = data['targets'] return tokens, labels def retokenize(tokens_labels):