Example #1
def load_data():
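    # Load the NER splits, build neighbor-context features from GloVe, and construct char/bigram/BERT embeddings.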

    paths = {
        'train': 'data/{}/train.txt'.format(dataset),
        'dev': 'data/{}/dev.txt'.format(dataset),
        'test': 'data/{}/test.txt'.format(dataset)
    }
    min_freq = 2
    data_bundle = CNNERPipe(
        bigrams=True, encoding_type=encoding_type).process_from_file(paths)

    dict_save_path = "data/{}/data.pth".format(dataset)
    context_dict, context_word2id, context_id2word = get_neighbor_for_vocab(
        data_bundle.get_vocab('chars').word2idx, glove_path, dict_save_path)

    train_feature_data, dev_feature_data, test_feature_data = build_instances(
        "data/{}".format(dataset), context_num, context_dict)

    embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/gigaword_chn.all.a2b.uni.ite50.vec',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)

    bi_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        model_dir_or_name='data/gigaword_chn.all.a2b.bi.ite50.vec',
        word_dropout=0.02,
        dropout=0.3,
        min_freq=min_freq,
        only_norm_found_vector=normalize_embed,
        only_train_min_freq=True)

    tencent_embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/tencent_unigram.txt',
        min_freq=min_freq,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)

    bert_embed = BertEmbedding(vocab=data_bundle.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method=args.pool_method,
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=False)

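    # Concatenate the unigram, Tencent, and BERT character embeddings into one vector per token.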
    embed = StackEmbedding([embed, tencent_embed, bert_embed],
                           dropout=0,
                           word_dropout=0.02)

    return data_bundle, embed, bi_embed, train_feature_data, dev_feature_data, test_feature_data, context_word2id, context_id2word
Example #2
def load_data():
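    # Same pipeline as above, but features come from generate_knowledge_api rather than neighbor contexts.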
    paths = {
        'train': 'data/{}/train.txt'.format(dataset),
        "dev": 'data/{}/dev.txt'.format(dataset),
        "test": 'data/{}/test.txt'.format(dataset)
    }
    min_freq = 1
    data_bundle = CNNERPipe(
        bigrams=True, encoding_type=encoding_type).process_from_file(paths)

    train_feature_data, dev_feature_data, test_feature_data, feature2count, feature2id, id2feature = generate_knowledge_api(
        os.path.join("data", dataset), "all", args.feature_level)

    embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/gigaword_chn.all.a2b.uni.ite50.vec',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)

    tencent_embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/tencent_unigram.txt',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)

    bi_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        model_dir_or_name='data/gigaword_chn.all.a2b.bi.ite50.vec',
        word_dropout=0.02,
        dropout=0.3,
        min_freq=min_freq,
        only_norm_found_vector=normalize_embed,
        only_train_min_freq=True)

    bert_embed = BertEmbedding(vocab=data_bundle.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method=args.pool_method,
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=False)

    embed = StackEmbedding([embed, tencent_embed, bert_embed],
                           dropout=0,
                           word_dropout=0.02)

    return data_bundle, embed, bi_embed, train_feature_data, dev_feature_data, test_feature_data, feature2count, feature2id, id2feature
Example #3
def load_ner_data():
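    # Plain loader: returns only the data bundle and the stacked embeddings, without external feature data.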
    paths = {
        'train': 'data/{}/train.txt'.format(dataset),
        'dev': 'data/{}/dev.txt'.format(dataset),
        'test': 'data/{}/test.txt'.format(dataset)
    }
    min_freq = 2
    data_bundle = CNNERPipe(
        bigrams=True, encoding_type=encoding_type).process_from_file(paths)

    # train_list = data_bundle.get_dataset('train')['raw_chars']

    embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/gigaword_chn.all.a2b.uni.ite50.vec',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)

    bi_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        model_dir_or_name='data/gigaword_chn.all.a2b.bi.ite50.vec',
        word_dropout=0.02,
        dropout=0.3,
        min_freq=min_freq,
        only_norm_found_vector=normalize_embed,
        only_train_min_freq=True)

    tencent_embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/tencent_unigram.txt',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)

    bert_embed = BertEmbedding(vocab=data_bundle.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method=args.pool_method,
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=True)

    # embed = StackEmbedding([tencent_embed, bert_embed], dropout=0, word_dropout=0.02)
    embed = StackEmbedding([embed, tencent_embed, bert_embed],
                           dropout=0,
                           word_dropout=0.02)
    return data_bundle, embed, bi_embed
Example #4
def test_case_1(self):
    ds = DataSet([
        Instance(words=['hello', 'world']),
        Instance(words=['hello', 'Jack'])
    ])
    vocab = Vocabulary().from_dataset(ds, field_name='words')
    self.assertEqual(len(vocab), 5)
    cnn_embed = CNNCharEmbedding(vocab, embed_size=60)
    lstm_embed = LSTMCharEmbedding(vocab, embed_size=70)
    embed = StackEmbedding([cnn_embed, lstm_embed])
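    # StackEmbedding concatenates along the last dimension: 60 + 70 = 130.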
    x = torch.LongTensor([[2, 1, 0], [4, 3, 4]])
    y = embed(x)
    self.assertEqual(tuple(y.size()), (2, 3, 130))
Example #5
def load_data():
    # paths = {'test': "../data/conll2003/test.txt",
    #          'train': "../data/conll2003/train.txt",
    #          'dev': "../data/conll2003/dev.txt"}
    paths = {'test': args.test, 'train': args.train, 'dev': args.dev}
    data = Conll2003NERPipe(
        encoding_type=encoding_type).process_from_file(paths)
    char_embed = None
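    # Build an optional character encoder; fall back to the word embedding alone if none is selected.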
    if char_type == 'cnn':
        char_embed = CNNCharEmbedding(vocab=data.get_vocab('words'),
                                      embed_size=30,
                                      char_emb_size=30,
                                      filter_nums=[30],
                                      kernel_sizes=[3],
                                      word_dropout=0,
                                      dropout=0.3,
                                      pool_method='max',
                                      include_word_start_end=False,
                                      min_char_freq=2)
    elif char_type == 'lstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('words'),
                                       embed_size=30,
                                       char_emb_size=30,
                                       word_dropout=0,
                                       dropout=0.3,
                                       hidden_size=100,
                                       pool_method='max',
                                       activation='relu',
                                       min_char_freq=2,
                                       bidirectional=True,
                                       requires_grad=True,
                                       include_word_start_end=False)
    word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
                                 model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True,
                                 lower=True,
                                 word_dropout=0,
                                 dropout=0.5,
                                 only_norm_found_vector=normalize_embed)
    if char_embed is not None:
        embed = StackEmbedding([word_embed, char_embed],
                               dropout=0,
                               word_dropout=0.02)
    else:
        word_embed.word_dropout = 0.02  # the fastNLP attribute is word_dropout; word_drop had no effect
        embed = word_embed

    data.rename_field('words', 'chars')
    return data, embed
Example #6
def load_data():
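    # Load VLSP2016 and extend its word vocabulary from an external vocab.txt.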
    if dataset == 'vlsp2016':
        paths = {'test': "./data_2/test.txt",
                 'train': "./data_2/train.txt",
                 'dev': "./data_2/dev.txt"}
        data = VLSP2016NERPipe(encoding_type=encoding_type).process_from_file(paths)
        # data.get_vocab('words').clear()
        with open("vocab.txt", 'r') as f:
            vocab = [word.rstrip("\n") for word in f]
        data.get_vocab('words').add_word_lst(vocab)
    else:  # guard: data would otherwise be undefined below
        raise ValueError("unsupported dataset: {}".format(dataset))
    char_embed = None
    if char_type == 'cnn':
        char_embed = CNNCharEmbedding(vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30,
                                      filter_nums=[30], kernel_sizes=[3], word_dropout=0, dropout=0.3,
                                      pool_method='max', include_word_start_end=False, min_char_freq=2)
    elif char_type in ['adatrans', 'naive']:
        char_embed = TransformerCharEmbed(vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30,
                                          word_dropout=0, dropout=0.3, pool_method='max', activation='relu',
                                          min_char_freq=2, requires_grad=True, include_word_start_end=False,
                                          char_attn_type=char_type, char_n_head=3, char_dim_ffn=60,
                                          char_scale=char_type == 'naive', char_dropout=0.15,
                                          char_after_norm=True)
    elif char_type == 'bilstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30,
                                       word_dropout=0, dropout=0.3, hidden_size=100, pool_method='max',
                                       activation='relu', min_char_freq=2, bidirectional=True,
                                       requires_grad=True, include_word_start_end=False)
    elif char_type == 'lstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30,
                                       word_dropout=0, dropout=0.3, hidden_size=100, pool_method='max',
                                       activation='relu', min_char_freq=2, bidirectional=False,
                                       requires_grad=True, include_word_start_end=False)
    word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
                                 model_dir_or_name='word2vec',
                                 requires_grad=True, lower=True, word_dropout=0, dropout=0.5,
                                 only_norm_found_vector=normalize_embed)
    if char_embed is not None:
        embed = StackEmbedding([word_embed, char_embed], dropout=0, word_dropout=0.02)
    else:
        word_embed.word_dropout = 0.02  # the fastNLP attribute is word_dropout; word_drop had no effect
        embed = word_embed
    # print(data.get_dataset('train'))
    words_vocab = data.get_vocab('words')
    data.rename_field('words', 'chars')
    return data, embed, words_vocab
Example #7
def load_data():

    paths = {
        "train": "../data/{}/train.txt".format(dataset),
        "test": "../data/{}/test.txt".format(dataset),
        "dev": "../data/{}/dev.txt".format(dataset)
    }
    data = WNUT_17NERPipe(encoding_type=encoding_type).process_from_file(paths)

    dict_save_path = "../data/{}/data.pth".format(dataset)
    context_dict, context_word2id, context_id2word = get_neighbor_for_vocab(
        data.get_vocab('words').word2idx, glove_path, dict_save_path)

    train_feature_data, dev_feature_data, test_feature_data = build_instances(
        "../data/{}".format(dataset), context_num, context_dict)

    data.rename_field('words', 'chars')
    embed = ElmoEmbedding(vocab=data.get_vocab('chars'),
                          model_dir_or_name=elmo_model,
                          layers='mix',
                          requires_grad=False,
                          word_dropout=0.0,
                          dropout=0.5,
                          cache_word_reprs=False)
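    # Unfreeze only the ELMo layer-mix weights; the rest of ELMo stays frozen.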
    embed.set_mix_weights_requires_grad()
    bert_embed = BertEmbedding(vocab=data.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method=args.pool_method,
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=False)
    embed = StackEmbedding([embed, bert_embed], dropout=0, word_dropout=0.02)

    return data, embed, train_feature_data, dev_feature_data, test_feature_data, context_word2id, context_id2word
Example #8
    word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
                                 model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True,
                                 normalize=normalize,
                                 word_dropout=0.01,
                                 dropout=dropout,
                                 lower=True,
                                 min_freq=1)
    return data, char_embed, word_embed


data, char_embed, word_embed = cache()

print(data)

embed = StackEmbedding([word_embed, char_embed])
model = CNNBiLSTMCRF(embed,
                     hidden_size=1200,
                     num_layers=1,
                     tag_vocab=data.vocabs[Const.TARGET],
                     encoding_type=encoding_type,
                     dropout=dropout)

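# Clip gradients by value and evaluate on the held-out test set while training.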
callbacks = [
    GradientClipCallback(clip_value=5, clip_type='value'),
    EvaluateCallback(data.datasets['test'])
]

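# SGD with momentum and an inverse-time learning-rate decay: lr / (1 + 0.05 * epoch).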
optimizer = SGD(model.parameters(), lr=lr, momentum=0.9)
scheduler = LRScheduler(
    LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
Example #9
def load_data():
    # Replace the paths as needed.
    if dataset == 'conll2003':
        # For conll2003, the learning rate must not exceed 0.002.
        paths = {
            'test': "../data/conll2003/test.txt",
            'train': "../data/conll2003/train.txt",
            'dev': "../data/conll2003/dev.txt"
        }
        data = Conll2003NERPipe(
            encoding_type=encoding_type).process_from_file(paths)
    elif dataset == 'en-ontonotes':
        paths = '../data/en-ontonotes/english'
        data = OntoNotesNERPipe(
            encoding_type=encoding_type).process_from_file(paths)
    char_embed = None
    if char_type == 'cnn':
        char_embed = CNNCharEmbedding(vocab=data.get_vocab('words'),
                                      embed_size=30,
                                      char_emb_size=30,
                                      filter_nums=[30],
                                      kernel_sizes=[3],
                                      word_dropout=0,
                                      dropout=0.3,
                                      pool_method='max',
                                      include_word_start_end=False,
                                      min_char_freq=2)
    elif char_type in ['adatrans', 'naive']:
        char_embed = TransformerCharEmbed(vocab=data.get_vocab('words'),
                                          embed_size=30,
                                          char_emb_size=30,
                                          word_dropout=0,
                                          dropout=0.3,
                                          pool_method='max',
                                          activation='relu',
                                          min_char_freq=2,
                                          requires_grad=True,
                                          include_word_start_end=False,
                                          char_attn_type=char_type,
                                          char_n_head=3,
                                          char_dim_ffn=60,
                                          char_scale=char_type == 'naive',
                                          char_dropout=0.15,
                                          char_after_norm=True)
    elif char_type == 'lstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('words'),
                                       embed_size=30,
                                       char_emb_size=30,
                                       word_dropout=0,
                                       dropout=0.3,
                                       hidden_size=100,
                                       pool_method='max',
                                       activation='relu',
                                       min_char_freq=2,
                                       bidirectional=True,
                                       requires_grad=True,
                                       include_word_start_end=False)
    word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
                                 model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True,
                                 lower=True,
                                 word_dropout=0,
                                 dropout=0.5,
                                 only_norm_found_vector=normalize_embed)
    data.rename_field('words', 'chars')

    embed = ElmoEmbedding(vocab=data.get_vocab('chars'),
                          model_dir_or_name='en-original',
                          layers='mix',
                          requires_grad=False,
                          word_dropout=0.0,
                          dropout=0.5,
                          cache_word_reprs=False)
    embed.set_mix_weights_requires_grad()

    # Guard: char_embed stays None when char_type matches none of the branches above.
    embeds = [embed, word_embed]
    if char_embed is not None:
        embeds.append(char_embed)
    embed = StackEmbedding(embeds, dropout=0, word_dropout=0.02)

    return data, embed
Example #10
    def load_data(dataset, config):
        # Replace the paths as needed.
        data = read_dataset(dataset, config)
        char_embed = None
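        # Choose the character encoder according to config['char_type'].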
        if config['char_type'] == 'cnn':
            char_embed = CNNCharEmbedding(vocab=data.get_vocab('chars'),
                                          embed_size=30,
                                          char_emb_size=30,
                                          filter_nums=[30],
                                          kernel_sizes=[3],
                                          word_dropout=0,
                                          dropout=0.3,
                                          pool_method='max',
                                          include_word_start_end=False,
                                          min_char_freq=2)
        elif config['char_type'] in ['adatrans', 'naive']:
            char_embed = TransformerCharEmbed(
                vocab=data.get_vocab('chars'),
                embed_size=30,
                char_emb_size=30,
                word_dropout=0,
                dropout=0.3,
                pool_method='max',
                activation='relu',
                min_char_freq=2,
                requires_grad=True,
                include_word_start_end=False,
                char_attn_type=config['char_type'],
                char_n_head=3,
                char_dim_ffn=60,
                char_scale=config['char_type'] == 'naive',
                char_dropout=0.15,
                char_after_norm=True)
        elif config['char_type'] == 'lstm':
            char_embed = LSTMCharEmbedding(vocab=data.get_vocab('chars'),
                                           embed_size=30,
                                           char_emb_size=30,
                                           word_dropout=0,
                                           dropout=0.3,
                                           hidden_size=100,
                                           pool_method='max',
                                           activation='relu',
                                           min_char_freq=2,
                                           bidirectional=True,
                                           requires_grad=True,
                                           include_word_start_end=False)
        word_embed = StaticEmbedding(
            vocab=data.get_vocab('chars'),
            model_dir_or_name=('ru' if dataset.split('/')[-1] in config['datasets']['ru']
                               else 'en-glove-6b-100d'),
            requires_grad=True,
            lower=True,
            word_dropout=0,
            dropout=0.5,
            only_norm_found_vector=config['normalize_embed'])
        if char_embed is not None:
            embed = StackEmbedding([word_embed, char_embed],
                                   dropout=0,
                                   word_dropout=0.02)
        else:
            word_embed.word_dropout = 0.02  # the fastNLP attribute is word_dropout; word_drop had no effect
            embed = word_embed

        data.rename_field('words', 'chars')
        return data, embed
Example #11
def load_data():
    if dataset == 'ON5e':
        paths = 'data/ON5e/english'
        data = OntoNotesNERPipe(
            encoding_type=encoding_type).process_from_file(paths)
    else:
        paths = {
            "train": "data/{}/train.txt".format(dataset),
            "dev": "data/{}/dev.txt".format(dataset),
            "test": "data/{}/test.txt".format(dataset)
        }
        data = ENNERPipe(encoding_type=encoding_type).process_from_file(paths)

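    # Optionally build knowledge features for each split; otherwise pass None placeholders.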
    if knowledge:
        train_feature_data, dev_feature_data, test_feature_data, feature2count, feature2id, id2feature = generate_knowledge_api(
            os.path.join("data", dataset), "all", feature_level)
    else:
        train_feature_data, dev_feature_data, test_feature_data = None, None, None
        feature2count, feature2id, id2feature = None, None, None

    char_embed = TransformerCharEmbed(vocab=data.get_vocab('words'),
                                      embed_size=embed_size,
                                      char_emb_size=embed_size,
                                      word_dropout=0,
                                      dropout=0.3,
                                      pool_method='max',
                                      activation='relu',
                                      min_char_freq=2,
                                      requires_grad=True,
                                      include_word_start_end=False,
                                      char_attn_type=char_type,
                                      char_n_head=3,
                                      char_dim_ffn=60,
                                      char_scale=char_type == 'naive',
                                      char_dropout=0.15,
                                      char_after_norm=True)

    word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
                                 model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True,
                                 lower=True,
                                 word_dropout=0,
                                 dropout=0.5,
                                 only_norm_found_vector=normalize_embed)
    data.rename_field('words', 'chars')

    embed = ElmoEmbedding(vocab=data.get_vocab('chars'),
                          model_dir_or_name=elmo_model,
                          layers='mix',
                          requires_grad=False,
                          word_dropout=0.0,
                          dropout=0.5,
                          cache_word_reprs=False)
    embed.set_mix_weights_requires_grad()

    bert_embed = BertEmbedding(vocab=data.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method="first",
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=False)

    embed = StackEmbedding([embed, bert_embed, word_embed, char_embed],
                           dropout=0,
                           word_dropout=0.02)

    return data, embed, train_feature_data, dev_feature_data, test_feature_data, feature2count, feature2id, id2feature