Example #1
def load_conllized_ontonote_NER(path, embedding_path=None):
    from fastNLP.io.pipe.conll import OntoNotesNERPipe
    from fastNLP.embeddings import StaticEmbedding  # needed by the embedding branch below

    ontoNotesNERPipe = OntoNotesNERPipe(lower=True, target_pad_val=-100)
    bundle_NER = ontoNotesNERPipe.process_from_file(path)

    train_set_NER = bundle_NER.datasets['train']
    dev_set_NER = bundle_NER.datasets['dev']
    test_set_NER = bundle_NER.datasets['test']

    # record each sentence's length for padding/masking during batching
    train_set_NER.add_seq_len('words', 'seq_len')
    dev_set_NER.add_seq_len('words', 'seq_len')
    test_set_NER.add_seq_len('words', 'seq_len')

    NER_vocab = bundle_NER.get_vocab('target')
    word_vocab = bundle_NER.get_vocab('words')

    if embedding_path is not None:
        # pretrained static word vectors over the word vocabulary
        embed = StaticEmbedding(vocab=word_vocab,
                                model_dir_or_name=embedding_path,
                                word_dropout=0.01,
                                dropout=0.5,
                                lower=True)

        return (train_set_NER, dev_set_NER, test_set_NER), \
               (word_vocab, NER_vocab), embed
    else:
        # note: this branch returns the two vocabs in the opposite order
        return (train_set_NER, dev_set_NER, test_set_NER), (NER_vocab,
                                                            word_vocab)
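
A minimal usage sketch (the OntoNotes path and the GloVe name below are placeholders, not taken from the original repo):

(train, dev, test), (word_vocab, ner_vocab), embed = \
    load_conllized_ontonote_NER('/path/to/ontonotes', embedding_path='en-glove-6b-100d')
print(len(train), 'training sentences,', len(ner_vocab), 'NER tags')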
Example #2
def load_conllized_ontonote_NER_POS(path, embedding_path=None):
    from fastNLP.io.pipe.conll import OntoNotesNERPipe
    from fastNLP.embeddings import StaticEmbedding  # needed by the embedding branch below

    ontoNotesNERPipe = OntoNotesNERPipe(lower=True)
    bundle_NER = ontoNotesNERPipe.process_from_file(path)

    train_set_NER = bundle_NER.datasets['train']
    dev_set_NER = bundle_NER.datasets['dev']
    test_set_NER = bundle_NER.datasets['test']

    NER_vocab = bundle_NER.get_vocab('target')
    word_vocab = bundle_NER.get_vocab('words')

    # re-use the POS loader (defined alongside this function in the original
    # module) on the same files, and attach POS tags as an extra target field
    (train_set_POS, dev_set_POS,
     test_set_POS), (_, POS_vocab) = load_conllized_ontonote_POS(path)
    POS_vocab = POS_vocab['POS']

    train_set_NER.add_field('pos', train_set_POS['POS'], is_target=True)
    dev_set_NER.add_field('pos', dev_set_POS['POS'], is_target=True)
    test_set_NER.add_field('pos', test_set_POS['POS'], is_target=True)

    # normalize field names: 'target' -> 'ner' -> 'nerid', 'pos' -> 'posid'
    for ds in (train_set_NER, dev_set_NER, test_set_NER):
        if ds.has_field('target'):
            ds.rename_field('target', 'ner')
        if ds.has_field('pos'):
            ds.rename_field('pos', 'posid')
        if ds.has_field('ner'):
            ds.rename_field('ner', 'nerid')

    if embedding_path is not None:
        embed = StaticEmbedding(vocab=word_vocab,
                                model_dir_or_name=embedding_path,
                                word_dropout=0.01,
                                dropout=0.5,
                                lower=True)

        return (train_set_NER, dev_set_NER, test_set_NER), \
               (word_vocab, POS_vocab, NER_vocab), embed
    else:
        # note: this branch drops POS_vocab and reverses the vocab order
        return (train_set_NER, dev_set_NER, test_set_NER), (NER_vocab,
                                                            word_vocab)
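
A usage sketch, assuming load_conllized_ontonote_POS is available in the same module (the path is a placeholder):

(train, dev, test), (word_vocab, pos_vocab, ner_vocab), embed = \
    load_conllized_ontonote_NER_POS('/path/to/ontonotes', embedding_path='en-glove-6b-100d')
# each DataSet now carries 'words', 'posid' and 'nerid' fields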
Example #3
def cache():
    # encoding_type, dropout and normalize are module-level settings in the
    # original script; Const.INPUT is fastNLP's constant for the 'words' field
    data = OntoNotesNERPipe(encoding_type=encoding_type).process_from_file(
        '../../../../others/data/v4/english')
    # 30-dim character-level CNN embeddings
    char_embed = CNNCharEmbedding(vocab=data.vocabs['words'],
                                  embed_size=30,
                                  char_emb_size=30,
                                  filter_nums=[30],
                                  kernel_sizes=[3],
                                  dropout=dropout)
    # 100-dim GloVe vectors, fine-tuned during training (requires_grad=True)
    word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
                                 model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True,
                                 normalize=normalize,
                                 word_dropout=0.01,
                                 dropout=dropout,
                                 lower=True,
                                 min_freq=1)
    return data, char_embed, word_embed
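
In the original training scripts a loader like this is typically wrapped with fastNLP's cache_results decorator, so the processed bundle and embeddings are pickled once and reloaded on later runs. A sketch, with assumed values for the module-level settings:

from fastNLP import cache_results

encoding_type = 'bioes'  # assumed
dropout = 0.5            # assumed
normalize = False        # assumed

@cache_results('caches/ontonotes.pkl', _refresh=False)
def cached_load():
    return cache()

data, char_embed, word_embed = cached_load()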
Example #4
def load_data():
    # dataset, encoding_type, char_type and normalize_embed are module-level
    # settings in the original script; replace the paths below with your own
    if dataset == 'conll2003':
        # for conll2003 the learning rate should not exceed 0.002
        paths = {
            'test': "../data/conll2003/test.txt",
            'train': "../data/conll2003/train.txt",
            'dev': "../data/conll2003/dev.txt"
        }
        data = Conll2003NERPipe(
            encoding_type=encoding_type).process_from_file(paths)
    elif dataset == 'en-ontonotes':
        paths = '../data/en-ontonotes/english'
        data = OntoNotesNERPipe(
            encoding_type=encoding_type).process_from_file(paths)
    # choose a character encoder according to char_type
    char_embed = None
    if char_type == 'cnn':
        char_embed = CNNCharEmbedding(vocab=data.get_vocab('words'),
                                      embed_size=30,
                                      char_emb_size=30,
                                      filter_nums=[30],
                                      kernel_sizes=[3],
                                      word_dropout=0,
                                      dropout=0.3,
                                      pool_method='max',
                                      include_word_start_end=False,
                                      min_char_freq=2)
    elif char_type in ['adatrans', 'naive']:
        char_embed = TransformerCharEmbed(vocab=data.get_vocab('words'),
                                          embed_size=30,
                                          char_emb_size=30,
                                          word_dropout=0,
                                          dropout=0.3,
                                          pool_method='max',
                                          activation='relu',
                                          min_char_freq=2,
                                          requires_grad=True,
                                          include_word_start_end=False,
                                          char_attn_type=char_type,
                                          char_n_head=3,
                                          char_dim_ffn=60,
                                          char_scale=char_type == 'naive',
                                          char_dropout=0.15,
                                          char_after_norm=True)
    elif char_type == 'lstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('words'),
                                       embed_size=30,
                                       char_emb_size=30,
                                       word_dropout=0,
                                       dropout=0.3,
                                       hidden_size=100,
                                       pool_method='max',
                                       activation='relu',
                                       min_char_freq=2,
                                       bidirectional=True,
                                       requires_grad=True,
                                       include_word_start_end=False)
    # 100-dim GloVe vectors, fine-tuned during training
    word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
                                 model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True,
                                 lower=True,
                                 word_dropout=0,
                                 dropout=0.5,
                                 only_norm_found_vector=normalize_embed)
    # downstream embeddings expect the token field under the name 'chars'
    data.rename_field('words', 'chars')

    # ELMo contextual embeddings; layers='mix' learns a weighted sum of layers
    embed = ElmoEmbedding(vocab=data.get_vocab('chars'),
                          model_dir_or_name='en-original',
                          layers='mix',
                          requires_grad=False,
                          word_dropout=0.0,
                          dropout=0.5,
                          cache_word_reprs=False)
    embed.set_mix_weights_requires_grad()  # let the layer-mixing weights train

    # concatenate ELMo, GloVe and char embeddings into a single representation
    embed = StackEmbedding([embed, word_embed, char_embed],
                           dropout=0,
                           word_dropout=0.02)

    return data, embed
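
This function reads its configuration from module scope. A sketch of the globals it assumes (values are illustrative):

dataset = 'conll2003'   # or 'en-ontonotes'
encoding_type = 'bioes'
char_type = 'cnn'       # 'adatrans', 'naive' or 'lstm' select the other char encoders
normalize_embed = True

data, embed = load_data()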
Example #5
def load_data():
    # dataset, encoding_type, knowledge, feature_level, embed_size, char_type,
    # normalize_embed, elmo_model and args are module-level settings in the
    # original script
    if dataset == 'ON5e':
        paths = 'data/ON5e/english'
        data = OntoNotesNERPipe(
            encoding_type=encoding_type).process_from_file(paths)
    else:
        paths = {
            "train": "data/{}/train.txt".format(dataset),
            "dev": "data/{}/dev.txt".format(dataset),
            "test": "data/{}/test.txt".format(dataset)
        }
        data = ENNERPipe(encoding_type=encoding_type).process_from_file(paths)

    if knowledge:
        # external lexical features from the original repo's helper
        (train_feature_data, dev_feature_data, test_feature_data,
         feature2count, feature2id, id2feature) = generate_knowledge_api(
             os.path.join("data", dataset), "all", feature_level)
    else:
        train_feature_data = dev_feature_data = test_feature_data = None
        feature2count = feature2id = id2feature = None

    # transformer-based character encoder; char_attn_type selects the variant
    char_embed = TransformerCharEmbed(vocab=data.get_vocab('words'),
                                      embed_size=embed_size,
                                      char_emb_size=embed_size,
                                      word_dropout=0,
                                      dropout=0.3,
                                      pool_method='max',
                                      activation='relu',
                                      min_char_freq=2,
                                      requires_grad=True,
                                      include_word_start_end=False,
                                      char_attn_type=char_type,
                                      char_n_head=3,
                                      char_dim_ffn=60,
                                      char_scale=char_type == 'naive',
                                      char_dropout=0.15,
                                      char_after_norm=True)

    # 100-dim GloVe vectors, fine-tuned during training
    word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
                                 model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True,
                                 lower=True,
                                 word_dropout=0,
                                 dropout=0.5,
                                 only_norm_found_vector=normalize_embed)
    # downstream embeddings expect the token field under the name 'chars'
    data.rename_field('words', 'chars')

    # ELMo contextual embeddings; layers='mix' learns a weighted sum of layers
    embed = ElmoEmbedding(vocab=data.get_vocab('chars'),
                          model_dir_or_name=elmo_model,
                          layers='mix',
                          requires_grad=False,
                          word_dropout=0.0,
                          dropout=0.5,
                          cache_word_reprs=False)
    embed.set_mix_weights_requires_grad()  # let the layer-mixing weights train

    # frozen last-layer BERT vectors, pooled from each word's first wordpiece
    bert_embed = BertEmbedding(vocab=data.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method="first",
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=False)

    # concatenate ELMo, BERT, GloVe and char embeddings into one representation
    embed = StackEmbedding([embed, bert_embed, word_embed, char_embed],
                           dropout=0,
                           word_dropout=0.02)

    return (data, embed, train_feature_data, dev_feature_data,
            test_feature_data, feature2count, feature2id, id2feature)
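
As in the previous example, configuration lives in module scope. An illustrative sketch (all names and values below are assumptions, not from the original script):

import argparse

dataset = 'conll2003'
encoding_type = 'bioes'
knowledge = False            # skip generate_knowledge_api
feature_level = 'all'
embed_size = 30
char_type = 'adatrans'
normalize_embed = True
elmo_model = 'en-original'
args = argparse.Namespace(bert_model='en-base-cased')

data, embed, *feature_parts = load_data()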