# f = open('word.txt', 'w', encoding='utf-8')
# for k, v in gram2id.items():
#     f.write(str(k) + '\n')
# f.close()

# Build the n-gram processor and load train/dev/test examples.
processor = NgramProcessor(dataset=dataset,
                           cat_type="length",
                           cat_num=cat_num,
                           ngram_length=ngram_length,
                           gram2id=gram2id,
                           use_ngram=args.use_ngram)
label_list = processor.get_labels()
ngram_train_examples = load_examples(data_dir, max_seq_len, tokenizer,
                                     processor, label_list, mode="train")
ngram_dev_examples = load_examples(data_dir, max_seq_len, tokenizer,
                                   processor, label_list, mode="dev")
ngram_test_examples = load_examples(data_dir, max_seq_len, tokenizer,
                                    processor, label_list, mode="test")
# ngram_dev_dataset = None
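# Illustrative addition (not in the original script): token-level training
# and evaluation loops typically need label<->id lookup tables derived from
# label_list; a minimal sketch of how they are usually built is shown here.
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}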
# Build the ZEN n-gram dictionary and load the pretrained ZEN encoder,
# keeping only the BERT backbone as a frozen feature extractor.
ngram_dict = ZenNgramDict(zen_model_path, tokenizer=tokenizer)
zen_model = ZenForTokenClassification.from_pretrained(zen_model_path,
                                                      cache_dir="caches/",
                                                      num_labels=len(label_list),
                                                      multift=False)
zen_model = zen_model.bert
zen_model.to(device)
zen_model.eval()

data_dir = os.path.join("data", dataset)
max_seq_len = 512
zen_train_dataset = load_examples(data_dir, max_seq_len, tokenizer, ngram_dict,
                                  processor, label_list, mode="train")
zen_dev_dataset = load_examples(data_dir, max_seq_len, tokenizer, ngram_dict,
                                processor, label_list, mode="dev")
zen_test_dataset = load_examples(data_dir, max_seq_len, tokenizer, ngram_dict,
                                 processor, label_list, mode="test")
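# Illustrative probe (assumption, not part of the original pipeline): since
# the ZEN encoder above is frozen and reduced to its backbone, it is
# presumably used as a feature extractor. This sketch shows one way to batch
# a loaded dataset and move a batch to the device. The tensor ordering inside
# each batch and the ZEN forward() signature depend on the local
# load_examples / ZEN version, so treat this as a template, not a drop-in call.
import torch
from torch.utils.data import DataLoader, SequentialSampler

probe_loader = DataLoader(zen_dev_dataset,
                          sampler=SequentialSampler(zen_dev_dataset),
                          batch_size=16)
with torch.no_grad():
    for batch in probe_loader:
        batch = tuple(t.to(device) for t in batch)  # assumes a TensorDataset
        # features = zen_model(...)  # fill in per your ZEN version's forward()
        break  # inspect a single batch only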