from fastNLP.embeddings import CNNCharEmbedding, LSTMCharEmbedding, StaticEmbedding, \
    ElmoEmbedding, StackEmbedding
from fastNLP.io.pipe.conll import Conll2003NERPipe, OntoNotesNERPipe
# TransformerCharEmbed is a repo-local module; the import path below is an
# assumption, adjust it to your project layout:
# from modules.TransformerEmbedding import TransformerCharEmbed


def load_data():
    # `dataset`, `encoding_type`, `char_type`, and `normalize_embed` are
    # module-level settings, typically parsed from the command line.
    # Replace the paths below with your local data locations.
    if dataset == 'conll2003':
        # For conll2003 the learning rate should not exceed 0.002.
        paths = {'test': "../data/conll2003/test.txt",
                 'train': "../data/conll2003/train.txt",
                 'dev': "../data/conll2003/dev.txt"}
        data = Conll2003NERPipe(encoding_type=encoding_type).process_from_file(paths)
    elif dataset == 'en-ontonotes':
        paths = '../data/en-ontonotes/english'
        data = OntoNotesNERPipe(encoding_type=encoding_type).process_from_file(paths)

    # Character-level embedding: CNN, transformer ('adatrans'/'naive'), or LSTM.
    char_embed = None
    if char_type == 'cnn':
        char_embed = CNNCharEmbedding(vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30,
                                      filter_nums=[30], kernel_sizes=[3], word_dropout=0, dropout=0.3,
                                      pool_method='max', include_word_start_end=False, min_char_freq=2)
    elif char_type in ['adatrans', 'naive']:
        char_embed = TransformerCharEmbed(vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30,
                                          word_dropout=0, dropout=0.3, pool_method='max', activation='relu',
                                          min_char_freq=2, requires_grad=True, include_word_start_end=False,
                                          char_attn_type=char_type, char_n_head=3, char_dim_ffn=60,
                                          char_scale=char_type == 'naive', char_dropout=0.15,
                                          char_after_norm=True)
    elif char_type == 'lstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30,
                                       word_dropout=0, dropout=0.3, hidden_size=100, pool_method='max',
                                       activation='relu', min_char_freq=2, bidirectional=True,
                                       requires_grad=True, include_word_start_end=False)

    # Word-level embedding: GloVe 100d, fine-tuned, lowercased lookup.
    word_embed = StaticEmbedding(vocab=data.get_vocab('words'), model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True, lower=True, word_dropout=0, dropout=0.5,
                                 only_norm_found_vector=normalize_embed)

    data.rename_field('words', 'chars')

    # Contextual embedding: frozen ELMo with a learned mix over layers.
    embed = ElmoEmbedding(vocab=data.get_vocab('chars'), model_dir_or_name='en-original', layers='mix',
                          requires_grad=False, word_dropout=0.0, dropout=0.5, cache_word_reprs=False)
    embed.set_mix_weights_requires_grad()

    # Stack ELMo, GloVe, and character embeddings into a single embedding.
    embed = StackEmbedding([embed, word_embed, char_embed], dropout=0, word_dropout=0.02)

    return data, embed
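
# Usage sketch (an assumption, not part of the original script): `load_data`
# reads module-level settings, so a driver would typically set them first.
# Model construction and training are omitted; this only shows the data step.
dataset = 'conll2003'        # or 'en-ontonotes'
encoding_type = 'bioes'
char_type = 'adatrans'       # 'cnn', 'lstm', 'adatrans', or 'naive'
normalize_embed = True

data_bundle, embedding = load_data()
print(data_bundle)           # fastNLP DataBundle: datasets and vocabularies
print(embedding.embed_size)  # total dim of the stacked ELMo+GloVe+char embedding
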
import os

from fastNLP.embeddings import StaticEmbedding, ElmoEmbedding, BertEmbedding, StackEmbedding
from fastNLP.io.pipe.conll import OntoNotesNERPipe
# Repo-local modules; the import paths below are assumptions, adjust them to
# your project layout:
# from modules.TransformerEmbedding import TransformerCharEmbed
# from modules.pipe import ENNERPipe
# from utils import generate_knowledge_api


def load_data():
    # `dataset`, `encoding_type`, `char_type`, `embed_size`, `elmo_model`,
    # `knowledge`, `feature_level`, `normalize_embed`, and `args` are
    # module-level settings, typically parsed from the command line.
    if dataset == 'ON5e':
        paths = 'data/ON5e/english'
        data = OntoNotesNERPipe(encoding_type=encoding_type).process_from_file(paths)
    else:
        paths = {"train": "data/{}/train.txt".format(dataset),
                 "dev": "data/{}/dev.txt".format(dataset),
                 "test": "data/{}/test.txt".format(dataset)}
        data = ENNERPipe(encoding_type=encoding_type).process_from_file(paths)

    # Optionally load external lexicon/knowledge features for each split.
    if knowledge:
        (train_feature_data, dev_feature_data, test_feature_data,
         feature2count, feature2id, id2feature) = generate_knowledge_api(
            os.path.join("data", dataset), "all", feature_level)
    else:
        train_feature_data = dev_feature_data = test_feature_data = None
        feature2count = feature2id = id2feature = None

    char_embed = TransformerCharEmbed(vocab=data.get_vocab('words'), embed_size=embed_size,
                                      char_emb_size=embed_size, word_dropout=0, dropout=0.3,
                                      pool_method='max', activation='relu', min_char_freq=2,
                                      requires_grad=True, include_word_start_end=False,
                                      char_attn_type=char_type, char_n_head=3, char_dim_ffn=60,
                                      char_scale=char_type == 'naive', char_dropout=0.15,
                                      char_after_norm=True)
    word_embed = StaticEmbedding(vocab=data.get_vocab('words'), model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True, lower=True, word_dropout=0, dropout=0.5,
                                 only_norm_found_vector=normalize_embed)

    data.rename_field('words', 'chars')

    # Contextual embeddings: frozen ELMo (learned layer mix) plus frozen BERT
    # (last layer, first-subword pooling).
    embed = ElmoEmbedding(vocab=data.get_vocab('chars'), model_dir_or_name=elmo_model, layers='mix',
                          requires_grad=False, word_dropout=0.0, dropout=0.5, cache_word_reprs=False)
    embed.set_mix_weights_requires_grad()
    bert_embed = BertEmbedding(vocab=data.get_vocab('chars'), model_dir_or_name=args.bert_model,
                               layers='-1', pool_method="first", word_dropout=0, dropout=0.5,
                               include_cls_sep=False, pooled_cls=True, requires_grad=False,
                               auto_truncate=False)

    # Stack ELMo, BERT, GloVe, and character embeddings into a single embedding.
    embed = StackEmbedding([embed, bert_embed, word_embed, char_embed], dropout=0, word_dropout=0.02)

    return (data, embed, train_feature_data, dev_feature_data, test_feature_data,
            feature2count, feature2id, id2feature)
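
# Usage sketch (an assumption, not part of the original script): this variant
# additionally returns the knowledge features produced by generate_knowledge_api,
# so a driver would unpack the full tuple.
(data_bundle, embedding,
 train_feature_data, dev_feature_data, test_feature_data,
 feature2count, feature2id, id2feature) = load_data()
if train_feature_data is not None:
    print(len(feature2id))   # size of the external-feature vocabulary
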