Example #1
0
 def test_masked_by_flag(self):
     """Smoke test: build a GenNLPMaskedDataset with masked_by_flag=True.

     Constructing the dataset without an exception is the pass
     condition; the instance itself is not inspected further.
     """
     page_path = '/client/user1/cuongdev/GenImputation/data/test/electra_G1K_22_hs37d5/corpus_dir/G1K_22_hs37d5_biallelic_test.r0000.b0000.page.gz'
     vocab_path = '/client/user1/cuongdev/GenImputation/data/train/electra_G1K_22_hs37d5/data_dir/vocab.txt'
     tokenizer = ElectraTokenizer(vocab_file=vocab_path)
     # only_input=True: dataset yields model inputs without label tensors.
     GenNLPMaskedDataset([page_path],
                         tokenizer,
                         masked_by_flag=True,
                         only_input=True)
Example #2
0
 def test_how_model_run(self):
     """Smoke test: run a batch from the eval dataset through a pretrained
     ElectraForMaskedLM checkpoint.

     Fix: index the dataset with ``eval_dataset[[0, 1, 2, 3]]`` instead of
     calling ``__getitem__`` directly — dunder methods should be invoked via
     the corresponding operator. Dead commented-out code removed.
     """
     vocab_file = '/client/user1/cuongdev/GenImputation/data/train/electra/data_dir/vocab.txt'
     # NOTE(review): train_paths is built but unused here — presumably kept
     # for quick switching during exploration; confirm before deleting.
     train_paths = [
         '/client/user1/cuongdev/GenImputation/data/train/electra/corpus_dir_2048/G1K_VN_chr20_biallelic_train.r0000.b0000.page.gz',
         '/client/user1/cuongdev/GenImputation/data/train/electra/corpus_dir_2048/G1K_VN_chr20_biallelic_train.r0000.b0001.page.gz',
         '/client/user1/cuongdev/GenImputation/data/train/electra/corpus_dir_2048/G1K_VN_chr20_biallelic_train.r0000.b0002.page.gz',
         '/client/user1/cuongdev/GenImputation/data/train/electra/corpus_dir_2048/G1K_VN_chr20_biallelic_train.r0000.b0003.page.gz',
         '/client/user1/cuongdev/GenImputation/data/train/electra/corpus_dir_2048/G1K_VN_chr20_biallelic_train.r0000.b0004.page.gz',
         '/client/user1/cuongdev/GenImputation/data/train/electra/corpus_dir_2048/G1K_VN_chr20_biallelic_train.r0000.b0005.page.gz',
     ]
     eval_paths = [
         '/client/user1/cuongdev/GenImputation/data/train/electra/corpus_dir_2048/G1K_VN_chr20_biallelic_train.r0000.b0007.page.gz'
     ]
     tokenizer = ElectraTokenizer(vocab_file=vocab_file)
     eval_dataset = GenNLPMaskedDataset(eval_paths,
                                        tokenizer,
                                        seed=42,
                                        masked_per=0.15)
     model = ElectraForMaskedLM.from_pretrained(
         '/client/user1/cuongdev/GenImputation/data/train/electra/checkpoints/small_2048/checkpoint-8568'
     )
     # Dataset __getitem__ accepts a list of indices and returns a dict of
     # batched model inputs, unpacked as keyword arguments.
     output = model(**eval_dataset[[0, 1, 2, 3]])
Example #3
0
# Optional config override for the masking strategy.
# NOTE(review): if 'masked_mode' is absent from config, line below assumes
# `masked_mode` was already bound earlier (outside this snippet) — otherwise
# the comparison raises NameError. Confirm against the preceding context.
if 'masked_mode' in config:
    masked_mode = config['masked_mode']
masked_flag_random = masked_mode == 'random'

# Resolve per-region/per-batch corpus file paths for training and testing.
# Test uses only batch 0; training uses all batch indices in `batchs`.
train_region_paths = page_config.get_file_paths(
    config[page_config.file_train_prefix], page_config.page, regions, batchs)
test_region_paths = page_config.get_file_paths(
    config[page_config.file_test_prefix], page_config.page, regions, [0])
vocab_file = config[page_config.vocab_file]
save_dir = config[page_config.save_dir]

# Build trainer/model configuration from the config dict.
training_args = OTrainingArguments(**config[page_config.train_args])
# Keep the unformatted dir templates; they are formatted per region later.
output_dir = training_args.output_dir
logging_dir = training_args.logging_dir
modeling_args = ElectraConfig(**config[page_config.model_args])
tokenizer = ElectraTokenizer(vocab_file=vocab_file)
seed = training_args.seed

for i, region in enumerate(regions):
    clear_output(wait=True)
    print('Region {} trainning...'.format(region))
    print('Prevert region {} trainning...'.format(region - 1))
    save_path = save_dir.format(region) + detail
    prevert_path = save_dir.format(region - 1) + detail
    training_args.output_dir = output_dir.format(region) + detail
    training_args.logging_dir = logging_dir.format(region) + detail
    ## Train and eval data
    train_batch_paths = train_region_paths[i]
    train_dataset = GenNLPMaskedDataset(train_batch_paths,
                                        tokenizer,
                                        seed=seed,
        model_type = 'albert'
        tokenizer = AlbertTokenizer(vocab_file=args.tokenizer)
        config = AlbertConfig.from_json_file(args.config)
        model = AlbertModel.from_pretrained(pretrained_model_name_or_path=None,
                                            config=config,
                                            state_dict=torch.load(args.model))
    elif 'bert' in args.model:
        model_type = 'bert'
        tokenizer = BertTokenizer(vocab_file=args.tokenizer)
        config = BertConfig.from_json_file(args.config)
        model = BertModel.from_pretrained(pretrained_model_name_or_path=None,
                                          config=config,
                                          state_dict=torch.load(args.model))
    elif 'electra' in args.model:
        model_type = 'electra'
        tokenizer = ElectraTokenizer(vocab_file=args.tokenizer)
        config = ElectraConfig.from_json_file(args.config)
        model = ElectraModel.from_pretrained(
            pretrained_model_name_or_path=None,
            config=config,
            state_dict=torch.load(args.model))
    else:
        raise NotImplementedError("The model is currently not supported")

    def process_line(line):
        """Decode one JSON line into a (tokens, labels) pair.

        The record's 'text' field is split on single spaces to give the
        token list; 'targets' is returned unchanged as the labels.
        """
        record = json.loads(line)
        return record['text'].split(' '), record['targets']

    def retokenize(tokens_labels):