def load_data():
    paths = {
        'train': 'data/{}/train.txt'.format(dataset),
        'dev': 'data/{}/dev.txt'.format(dataset),
        'test': 'data/{}/test.txt'.format(dataset)
    }
    min_freq = 2
    data_bundle = CNNERPipe(
        bigrams=True, encoding_type=encoding_type).process_from_file(paths)

    dict_save_path = "data/{}/data.pth".format(dataset)
    context_dict, context_word2id, context_id2word = get_neighbor_for_vocab(
        data_bundle.get_vocab('chars').word2idx, glove_path, dict_save_path)
    train_feature_data, dev_feature_data, test_feature_data = build_instances(
        "data/{}".format(dataset), context_num, context_dict)

    embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/gigaword_chn.all.a2b.uni.ite50.vec',
        min_freq=1, only_norm_found_vector=normalize_embed,
        word_dropout=0.01, dropout=0.3)
    bi_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        model_dir_or_name='data/gigaword_chn.all.a2b.bi.ite50.vec',
        word_dropout=0.02, dropout=0.3, min_freq=min_freq,
        only_norm_found_vector=normalize_embed, only_train_min_freq=True)
    tencent_embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/tencent_unigram.txt',
        min_freq=min_freq, only_norm_found_vector=normalize_embed,
        word_dropout=0.01, dropout=0.3)
    bert_embed = BertEmbedding(
        vocab=data_bundle.get_vocab('chars'),
        model_dir_or_name=args.bert_model, layers='-1',
        pool_method=args.pool_method, word_dropout=0, dropout=0.5,
        include_cls_sep=False, pooled_cls=True, requires_grad=False,
        auto_truncate=False)

    embed = StackEmbedding([embed, tencent_embed, bert_embed],
                           dropout=0, word_dropout=0.02)
    return (data_bundle, embed, bi_embed, train_feature_data,
            dev_feature_data, test_feature_data, context_word2id,
            context_id2word)
def load_data():
    paths = {
        'train': 'data/{}/train.txt'.format(dataset),
        'dev': 'data/{}/dev.txt'.format(dataset),
        'test': 'data/{}/test.txt'.format(dataset)
    }
    min_freq = 1
    data_bundle = CNNERPipe(
        bigrams=True, encoding_type=encoding_type).process_from_file(paths)

    (train_feature_data, dev_feature_data, test_feature_data,
     feature2count, feature2id, id2feature) = generate_knowledge_api(
         os.path.join("data", dataset), "all", args.feature_level)

    embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/gigaword_chn.all.a2b.uni.ite50.vec',
        min_freq=1, only_norm_found_vector=normalize_embed,
        word_dropout=0.01, dropout=0.3)
    tencent_embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/tencent_unigram.txt',
        min_freq=1, only_norm_found_vector=normalize_embed,
        word_dropout=0.01, dropout=0.3)
    bi_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        model_dir_or_name='data/gigaword_chn.all.a2b.bi.ite50.vec',
        word_dropout=0.02, dropout=0.3, min_freq=min_freq,
        only_norm_found_vector=normalize_embed, only_train_min_freq=True)
    bert_embed = BertEmbedding(
        vocab=data_bundle.get_vocab('chars'),
        model_dir_or_name=args.bert_model, layers='-1',
        pool_method=args.pool_method, word_dropout=0, dropout=0.5,
        include_cls_sep=False, pooled_cls=True, requires_grad=False,
        auto_truncate=False)

    embed = StackEmbedding([embed, tencent_embed, bert_embed],
                           dropout=0, word_dropout=0.02)
    return (data_bundle, embed, bi_embed, train_feature_data,
            dev_feature_data, test_feature_data, feature2count,
            feature2id, id2feature)
def load_ner_data():
    paths = {
        'train': 'data/{}/train.txt'.format(dataset),
        'dev': 'data/{}/dev.txt'.format(dataset),
        'test': 'data/{}/test.txt'.format(dataset)
    }
    min_freq = 2
    data_bundle = CNNERPipe(
        bigrams=True, encoding_type=encoding_type).process_from_file(paths)
    # train_list = data_bundle.get_dataset('train')['raw_chars']

    embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/gigaword_chn.all.a2b.uni.ite50.vec',
        min_freq=1, only_norm_found_vector=normalize_embed,
        word_dropout=0.01, dropout=0.3)
    bi_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        model_dir_or_name='data/gigaword_chn.all.a2b.bi.ite50.vec',
        word_dropout=0.02, dropout=0.3, min_freq=min_freq,
        only_norm_found_vector=normalize_embed, only_train_min_freq=True)
    tencent_embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/tencent_unigram.txt',
        min_freq=1, only_norm_found_vector=normalize_embed,
        word_dropout=0.01, dropout=0.3)
    bert_embed = BertEmbedding(
        vocab=data_bundle.get_vocab('chars'),
        model_dir_or_name=args.bert_model, layers='-1',
        pool_method=args.pool_method, word_dropout=0, dropout=0.5,
        include_cls_sep=False, pooled_cls=True, requires_grad=False,
        auto_truncate=True)

    # embed = StackEmbedding([tencent_embed, bert_embed], dropout=0, word_dropout=0.02)
    embed = StackEmbedding([embed, tencent_embed, bert_embed],
                           dropout=0, word_dropout=0.02)
    return data_bundle, embed, bi_embed
def test_case_1(self):
    ds = DataSet([Instance(words=['hello', 'world']),
                  Instance(words=['hello', 'Jack'])])
    vocab = Vocabulary().from_dataset(ds, field_name='words')
    self.assertEqual(len(vocab), 5)  # 3 unique words + <pad> + <unk>
    cnn_embed = CNNCharEmbedding(vocab, embed_size=60)
    lstm_embed = LSTMCharEmbedding(vocab, embed_size=70)
    embed = StackEmbedding([cnn_embed, lstm_embed])
    x = torch.LongTensor([[2, 1, 0], [4, 3, 4]])
    y = embed(x)
    self.assertEqual(tuple(y.size()), (2, 3, 130))  # 60 + 70 concatenated
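# A minimal standalone sketch (not part of the test above) of why the stacked
# size is 130: StackEmbedding concatenates its sub-embeddings along the last
# dimension, so embed_size is the sum of the parts. It assumes fastNLP's
# StaticEmbedding, which builds randomly initialized vectors when
# model_dir_or_name is None and an embedding_dim is given; all sizes here are
# illustrative.
from fastNLP import DataSet, Instance, Vocabulary
from fastNLP.embeddings import StaticEmbedding, StackEmbedding
import torch

ds = DataSet([Instance(words=['hello', 'world'])])
vocab = Vocabulary().from_dataset(ds, field_name='words')

emb_a = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=50)
emb_b = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=20)
stacked = StackEmbedding([emb_a, emb_b])

x = torch.LongTensor([[2, 3]])  # a batch with one sequence of two word ids
print(stacked(x).size())        # torch.Size([1, 2, 70]) -- 50 + 20
print(stacked.embed_size)       # 70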
def load_data():
    # paths = {'test': "../data/conll2003/test.txt",
    #          'train': "../data/conll2003/train.txt",
    #          'dev': "../data/conll2003/dev.txt"}
    paths = {'test': args.test, 'train': args.train, 'dev': args.dev}
    data = Conll2003NERPipe(
        encoding_type=encoding_type).process_from_file(paths)

    char_embed = None
    if char_type == 'cnn':
        char_embed = CNNCharEmbedding(vocab=data.get_vocab('words'),
                                      embed_size=30, char_emb_size=30,
                                      filter_nums=[30], kernel_sizes=[3],
                                      word_dropout=0, dropout=0.3,
                                      pool_method='max',
                                      include_word_start_end=False,
                                      min_char_freq=2)
    elif char_type == 'lstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('words'),
                                       embed_size=30, char_emb_size=30,
                                       word_dropout=0, dropout=0.3,
                                       hidden_size=100, pool_method='max',
                                       activation='relu', min_char_freq=2,
                                       bidirectional=True, requires_grad=True,
                                       include_word_start_end=False)

    word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
                                 model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True, lower=True,
                                 word_dropout=0, dropout=0.5,
                                 only_norm_found_vector=normalize_embed)

    if char_embed is not None:
        embed = StackEmbedding([word_embed, char_embed],
                               dropout=0, word_dropout=0.02)
    else:
        # Fixed: the TokenEmbedding attribute is `word_dropout`; assigning
        # to `word_drop` had no effect.
        word_embed.word_dropout = 0.02
        embed = word_embed

    data.rename_field('words', 'chars')
    return data, embed
def load_data():
    if dataset == 'vlsp2016':
        paths = {'test': "./data_2/test.txt",
                 'train': "./data_2/train.txt",
                 'dev': "./data_2/dev.txt"}
        data = VLSP2016NERPipe(
            encoding_type=encoding_type).process_from_file(paths)

    # data.get_vocab('words').clear()
    # Extend the word vocabulary with an external word list.
    vocab = []
    with open("vocab.txt", 'r') as f:
        for word in f:
            vocab.append(word.replace("\n", ""))
    data.get_vocab('words').add_word_lst(vocab)

    char_embed = None
    if char_type == 'cnn':
        char_embed = CNNCharEmbedding(vocab=data.get_vocab('words'),
                                      embed_size=30, char_emb_size=30,
                                      filter_nums=[30], kernel_sizes=[3],
                                      word_dropout=0, dropout=0.3,
                                      pool_method='max',
                                      include_word_start_end=False,
                                      min_char_freq=2)
    elif char_type in ['adatrans', 'naive']:
        char_embed = TransformerCharEmbed(
            vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30,
            word_dropout=0, dropout=0.3, pool_method='max', activation='relu',
            min_char_freq=2, requires_grad=True, include_word_start_end=False,
            char_attn_type=char_type, char_n_head=3, char_dim_ffn=60,
            char_scale=char_type == 'naive', char_dropout=0.15,
            char_after_norm=True)
    elif char_type == 'bilstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('words'),
                                       embed_size=30, char_emb_size=30,
                                       word_dropout=0, dropout=0.3,
                                       hidden_size=100, pool_method='max',
                                       activation='relu', min_char_freq=2,
                                       bidirectional=True, requires_grad=True,
                                       include_word_start_end=False)
    elif char_type == 'lstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('words'),
                                       embed_size=30, char_emb_size=30,
                                       word_dropout=0, dropout=0.3,
                                       hidden_size=100, pool_method='max',
                                       activation='relu', min_char_freq=2,
                                       bidirectional=False, requires_grad=True,
                                       include_word_start_end=False)

    word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
                                 model_dir_or_name='word2vec',
                                 requires_grad=True, lower=True,
                                 word_dropout=0, dropout=0.5,
                                 only_norm_found_vector=normalize_embed)

    if char_embed is not None:
        embed = StackEmbedding([word_embed, char_embed],
                               dropout=0, word_dropout=0.02)
    else:
        # Fixed: the TokenEmbedding attribute is `word_dropout`, not `word_drop`.
        word_embed.word_dropout = 0.02
        embed = word_embed

    # print(data.get_dataset('train'))
    words_vocab = data.get_vocab('words')  # keep a handle to the vocab before renaming
    data.rename_field('words', 'chars')
    return data, embed, words_vocab
def load_data():
    paths = {
        'train': "../data/{}/train.txt".format(dataset),
        'test': "../data/{}/test.txt".format(dataset),
        'dev': "../data/{}/dev.txt".format(dataset)
    }
    data = WNUT_17NERPipe(
        encoding_type=encoding_type).process_from_file(paths)

    dict_save_path = "../data/{}/data.pth".format(dataset)
    context_dict, context_word2id, context_id2word = get_neighbor_for_vocab(
        data.get_vocab('words').word2idx, glove_path, dict_save_path)
    train_feature_data, dev_feature_data, test_feature_data = build_instances(
        "../data/{}".format(dataset), context_num, context_dict)

    data.rename_field('words', 'chars')
    embed = ElmoEmbedding(vocab=data.get_vocab('chars'),
                          model_dir_or_name=elmo_model, layers='mix',
                          requires_grad=False, word_dropout=0.0, dropout=0.5,
                          cache_word_reprs=False)
    embed.set_mix_weights_requires_grad()
    bert_embed = BertEmbedding(vocab=data.get_vocab('chars'),
                               model_dir_or_name=args.bert_model, layers='-1',
                               pool_method=args.pool_method, word_dropout=0,
                               dropout=0.5, include_cls_sep=False,
                               pooled_cls=True, requires_grad=False,
                               auto_truncate=False)
    embed = StackEmbedding([embed, bert_embed], dropout=0, word_dropout=0.02)

    return (data, embed, train_feature_data, dev_feature_data,
            test_feature_data, context_word2id, context_id2word)
    word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
                                 model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True, normalize=normalize,
                                 word_dropout=0.01, dropout=dropout,
                                 lower=True, min_freq=1)
    return data, char_embed, word_embed


data, char_embed, word_embed = cache()
print(data)

embed = StackEmbedding([word_embed, char_embed])
model = CNNBiLSTMCRF(embed, hidden_size=1200, num_layers=1,
                     tag_vocab=data.vocabs[Const.TARGET],
                     encoding_type=encoding_type, dropout=dropout)

callbacks = [
    GradientClipCallback(clip_value=5, clip_type='value'),
    EvaluateCallback(data.datasets['test'])
]
optimizer = SGD(model.parameters(), lr=lr, momentum=0.9)
# Decay the learning rate by a factor of 1 / (1 + 0.05 * epoch) each epoch.
scheduler = LRScheduler(
    LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
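# For context, a minimal sketch of handing the model, optimizer, scheduler,
# and callbacks above to fastNLP's Trainer. The Trainer and SpanFPreRecMetric
# arguments follow the usual fastNLP 0.5-style API; batch size, epoch count,
# and device are illustrative assumptions, not values from the original code.
from fastNLP import Trainer, BucketSampler, SpanFPreRecMetric

callbacks.append(scheduler)  # LRScheduler is a callback; it steps once per epoch

trainer = Trainer(data.datasets['train'], model,
                  optimizer=optimizer,
                  sampler=BucketSampler(),
                  batch_size=32,             # illustrative
                  n_epochs=100,              # illustrative
                  dev_data=data.datasets['dev'],
                  metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET],
                                            encoding_type=encoding_type),
                  callbacks=callbacks,
                  device=0)                  # illustrative
trainer.train()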
def load_data():
    # Replace these paths as needed.
    if dataset == 'conll2003':
        # For conll2003 the learning rate should not exceed 0.002.
        paths = {
            'test': "../data/conll2003/test.txt",
            'train': "../data/conll2003/train.txt",
            'dev': "../data/conll2003/dev.txt"
        }
        data = Conll2003NERPipe(
            encoding_type=encoding_type).process_from_file(paths)
    elif dataset == 'en-ontonotes':
        paths = '../data/en-ontonotes/english'
        data = OntoNotesNERPipe(
            encoding_type=encoding_type).process_from_file(paths)

    char_embed = None
    if char_type == 'cnn':
        char_embed = CNNCharEmbedding(vocab=data.get_vocab('words'),
                                      embed_size=30, char_emb_size=30,
                                      filter_nums=[30], kernel_sizes=[3],
                                      word_dropout=0, dropout=0.3,
                                      pool_method='max',
                                      include_word_start_end=False,
                                      min_char_freq=2)
    elif char_type in ['adatrans', 'naive']:
        char_embed = TransformerCharEmbed(
            vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30,
            word_dropout=0, dropout=0.3, pool_method='max', activation='relu',
            min_char_freq=2, requires_grad=True, include_word_start_end=False,
            char_attn_type=char_type, char_n_head=3, char_dim_ffn=60,
            char_scale=char_type == 'naive', char_dropout=0.15,
            char_after_norm=True)
    elif char_type == 'lstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('words'),
                                       embed_size=30, char_emb_size=30,
                                       word_dropout=0, dropout=0.3,
                                       hidden_size=100, pool_method='max',
                                       activation='relu', min_char_freq=2,
                                       bidirectional=True, requires_grad=True,
                                       include_word_start_end=False)

    word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
                                 model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True, lower=True,
                                 word_dropout=0, dropout=0.5,
                                 only_norm_found_vector=normalize_embed)

    data.rename_field('words', 'chars')
    embed = ElmoEmbedding(vocab=data.get_vocab('chars'),
                          model_dir_or_name='en-original', layers='mix',
                          requires_grad=False, word_dropout=0.0, dropout=0.5,
                          cache_word_reprs=False)
    embed.set_mix_weights_requires_grad()
    embed = StackEmbedding([embed, word_embed, char_embed],
                           dropout=0, word_dropout=0.02)
    return data, embed
def load_data(dataset, config):
    # Replace the dataset paths as needed.
    data = read_dataset(dataset, config)

    char_embed = None
    if config['char_type'] == 'cnn':
        char_embed = CNNCharEmbedding(vocab=data.get_vocab('chars'),
                                      embed_size=30, char_emb_size=30,
                                      filter_nums=[30], kernel_sizes=[3],
                                      word_dropout=0, dropout=0.3,
                                      pool_method='max',
                                      include_word_start_end=False,
                                      min_char_freq=2)
    elif config['char_type'] in ['adatrans', 'naive']:
        char_embed = TransformerCharEmbed(
            vocab=data.get_vocab('chars'), embed_size=30, char_emb_size=30,
            word_dropout=0, dropout=0.3, pool_method='max', activation='relu',
            min_char_freq=2, requires_grad=True, include_word_start_end=False,
            char_attn_type=config['char_type'], char_n_head=3, char_dim_ffn=60,
            char_scale=config['char_type'] == 'naive', char_dropout=0.15,
            char_after_norm=True)
    elif config['char_type'] == 'lstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('chars'),
                                       embed_size=30, char_emb_size=30,
                                       word_dropout=0, dropout=0.3,
                                       hidden_size=100, pool_method='max',
                                       activation='relu', min_char_freq=2,
                                       bidirectional=True, requires_grad=True,
                                       include_word_start_end=False)

    # Use Russian vectors for the Russian datasets, GloVe otherwise.
    word_embed = StaticEmbedding(
        vocab=data.get_vocab('chars'),
        model_dir_or_name='ru' if dataset.split('/')[-1] in config['datasets']['ru']
        else 'en-glove-6b-100d',
        requires_grad=True, lower=True, word_dropout=0, dropout=0.5,
        only_norm_found_vector=config['normalize_embed'])

    if char_embed is not None:
        embed = StackEmbedding([word_embed, char_embed],
                               dropout=0, word_dropout=0.02)
    else:
        # Fixed: the TokenEmbedding attribute is `word_dropout`, not `word_drop`.
        word_embed.word_dropout = 0.02
        embed = word_embed

    data.rename_field('words', 'chars')
    return data, embed
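# For illustration, load_data() above only reads a few keys from `config`.
# A hypothetical minimal config matching those accesses (the dataset names
# under 'datasets' and the path passed in are placeholders, and read_dataset()
# may expect additional keys):
config = {
    'char_type': 'cnn',                   # one of 'cnn', 'adatrans', 'naive', 'lstm'
    'normalize_embed': True,
    'datasets': {'ru': ['collection3']},  # placeholder dataset names
}
data, embed = load_data('data/conll2003', config)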
def load_data():
    if dataset == 'ON5e':
        paths = 'data/ON5e/english'
        data = OntoNotesNERPipe(
            encoding_type=encoding_type).process_from_file(paths)
    else:
        paths = {
            'train': "data/{}/train.txt".format(dataset),
            'dev': "data/{}/dev.txt".format(dataset),
            'test': "data/{}/test.txt".format(dataset)
        }
        data = ENNERPipe(encoding_type=encoding_type).process_from_file(paths)

    if knowledge:
        (train_feature_data, dev_feature_data, test_feature_data,
         feature2count, feature2id, id2feature) = generate_knowledge_api(
             os.path.join("data", dataset), "all", feature_level)
    else:
        (train_feature_data, dev_feature_data, test_feature_data,
         feature2count, feature2id, id2feature) = (None,) * 6

    char_embed = TransformerCharEmbed(
        vocab=data.get_vocab('words'), embed_size=embed_size,
        char_emb_size=embed_size, word_dropout=0, dropout=0.3,
        pool_method='max', activation='relu', min_char_freq=2,
        requires_grad=True, include_word_start_end=False,
        char_attn_type=char_type, char_n_head=3, char_dim_ffn=60,
        char_scale=char_type == 'naive', char_dropout=0.15,
        char_after_norm=True)
    word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
                                 model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True, lower=True,
                                 word_dropout=0, dropout=0.5,
                                 only_norm_found_vector=normalize_embed)

    data.rename_field('words', 'chars')
    embed = ElmoEmbedding(vocab=data.get_vocab('chars'),
                          model_dir_or_name=elmo_model, layers='mix',
                          requires_grad=False, word_dropout=0.0, dropout=0.5,
                          cache_word_reprs=False)
    embed.set_mix_weights_requires_grad()
    bert_embed = BertEmbedding(vocab=data.get_vocab('chars'),
                               model_dir_or_name=args.bert_model, layers='-1',
                               pool_method="first", word_dropout=0,
                               dropout=0.5, include_cls_sep=False,
                               pooled_cls=True, requires_grad=False,
                               auto_truncate=False)
    embed = StackEmbedding([embed, bert_embed, word_embed, char_embed],
                           dropout=0, word_dropout=0.02)

    return (data, embed, train_feature_data, dev_feature_data,
            test_feature_data, feature2count, feature2id, id2feature)