# f = open('word.txt', 'w', encoding='utf-8')
    # for k,v in gram2id.items():
    #     f.write(str(k) +'\n')
    # f.close()

    processor = NgramProcessor(dataset=dataset,
                               cat_type="length",
                               cat_num=cat_num,
                               ngram_length=ngram_length,
                               gram2id=gram2id,
                               use_ngram=args.use_ngram)
    label_list = processor.get_labels()

    ngram_train_examlpes = load_examples(data_dir,
                                         max_seq_len,
                                         tokenizer,
                                         processor,
                                         label_list,
                                         mode="train")
    ngram_dev_examlpes = load_examples(data_dir,
                                       max_seq_len,
                                       tokenizer,
                                       processor,
                                       label_list,
                                       mode="dev")
    ngram_test_examlpes = load_examples(data_dir,
                                        max_seq_len,
                                        tokenizer,
                                        processor,
                                        label_list,
                                        mode="test")
    # ngram_dev_dataset = None
Example #2
0
    ngram_dict = ZenNgramDict(zen_model_path, tokenizer=tokenizer)
    zen_model = ZenForTokenClassification.from_pretrained(
        zen_model_path,
        cache_dir="caches/",
        num_labels=len(label_list),
        multift=False)
    zen_model = zen_model.bert
    zen_model.to(device)
    zen_model.eval()
    data_dir = os.path.join("data", dataset)
    max_seq_len = 512

    zen_train_dataset = load_examples(data_dir,
                                      max_seq_len,
                                      tokenizer,
                                      ngram_dict,
                                      processor,
                                      label_list,
                                      mode="train")
    zen_dev_dataset = load_examples(data_dir,
                                    max_seq_len,
                                    tokenizer,
                                    ngram_dict,
                                    processor,
                                    label_list,
                                    mode="dev")
    zen_test_dataset = load_examples(data_dir,
                                     max_seq_len,
                                     tokenizer,
                                     ngram_dict,
                                     processor,