Example #1
    # load corpus

    if args.test_file:
        with codecs.open(args.test_file, 'r', 'utf-8') as f:
            test_lines = f.readlines()
    else:
        with codecs.open(jd['test_file'], 'r', 'utf-8') as f:
            test_lines = f.readlines()

    # converting format

    test_features, test_labels = utils.read_corpus(test_lines)

    # construct dataset
    test_dataset = utils.construct_bucket_mean_vb(test_features, test_labels,
                                                  f_map, l_map, jd['caseless'])

    test_dataset_loader = [
        torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False)
        for tup in test_dataset
    ]

    # build model
    ner_model = LSTM_CRF(len(f_map),
                         len(l_map),
                         jd['embedding_dim'],
                         jd['hidden'],
                         jd['layers'],
                         jd['drop_out'],
                         large_CRF=jd['small_crf'])
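The evaluation loaders above wrap each length bucket in a plain torch DataLoader with batch size 50 and no shuffling. A minimal self-contained sketch of that batching behavior (the zero tensors are stand-ins for a real padded bucket, not the repo's data):

import torch

# One padded bucket: 120 sentences, max length 30 (dummy data).
bucket = torch.utils.data.TensorDataset(
    torch.zeros(120, 30, dtype=torch.long),   # features
    torch.zeros(120, 30, dtype=torch.long))   # labels
loader = torch.utils.data.DataLoader(bucket, 50, shuffle=False, drop_last=False)
for features, labels in loader:
    print(features.shape)  # torch.Size([50, 30]) twice, then torch.Size([20, 30])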
Example #2
                f_map = {'<eof>': 0}
            f_map, embedding_tensor, in_doc_words = utils.load_embedding_wlm(
                args.emb_file, ' ', f_map, dt_f_set, args.caseless, args.unk,
                args.embedding_dim)
            print("embedding size: '{}'".format(len(f_map)))

        l_set = functools.reduce(lambda x, y: x | y,
                                 map(lambda t: set(t), dev_labels))
        l_set = functools.reduce(lambda x, y: x | y,
                                 map(lambda t: set(t), test_labels), l_set)
        for label in l_set:
            if label not in l_map:
                l_map[label] = len(l_map)

    # construct dataset
    dataset = utils.construct_bucket_mean_vb(train_features, train_labels,
                                             f_map, l_map, args.caseless)
    dev_dataset = utils.construct_bucket_mean_vb(dev_features, dev_labels,
                                                 f_map, l_map, args.caseless)
    test_dataset = utils.construct_bucket_mean_vb(test_features, test_labels,
                                                  f_map, l_map, args.caseless)

    dataset_loader = [
        torch.utils.data.DataLoader(tup,
                                    args.batch_size,
                                    shuffle=True,
                                    drop_last=False) for tup in dataset
    ]
    dev_dataset_loader = [
        torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False)
        for tup in dev_dataset
    ]
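The functools.reduce idiom above unions the per-sentence label sets so that labels appearing only in dev or test still receive ids. A self-contained toy version of the same pattern:

import functools

l_map = {'O': 0, 'B-PER': 1}                     # built from the training set
dev_labels = [['O', 'B-LOC'], ['O']]
test_labels = [['B-ORG'], ['O', 'B-PER']]

l_set = functools.reduce(lambda x, y: x | y, map(set, dev_labels))
l_set = functools.reduce(lambda x, y: x | y, map(set, test_labels), l_set)
for label in l_set:
    if label not in l_map:
        l_map[label] = len(l_map)
print(l_map)  # 'B-LOC' and 'B-ORG' now have ids as well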
Example #3
    else:
        with codecs.open(jd['test_file'], 'r', 'utf-8') as f:
            test_lines = f.readlines()

    # converting format
    test_features, test_labels, test_bichar_features = utils.read_corpus(test_lines)

    with codecs.open(args.lexicon_test_file, 'r', 'utf-8') as f:
        lexicon_test_lines = f.readlines()
    lexicon_test_features, lexicon_feature_map = utils.read_corpus_lexicon(lexicon_test_lines, test_features,
                                                                           lexicon_f_map)
    lexicon_test_dataset = utils.padding_lexicon_bucket(lexicon_test_features, lexicon_f_map, args.gpu)

    # construct dataset
    test_dataset = utils.construct_bucket_mean_vb(test_features, test_labels, lexicon_test_dataset, f_map, l_map,
                                                  test_bichar_features, bichar_f_map, jd['caseless'])

    # build model
    ner_model = LSTM_CRF(len(f_map), len(bichar_f_map), len(lexicon_f_map),
                         len(l_map), jd['embedding_dim'], jd['hidden'],
                         jd['layers'], jd['drop_out'], args.gpu, is_bichar,
                         large_CRF=jd['small_crf'])

    ner_model.load_state_dict(checkpoint_file['state_dict'])

    if args.gpu >= 0:
        if_cuda = True
        torch.cuda.set_device(args.gpu)
        ner_model.cuda()
        packer = CRFRepack(len(l_map), True)
    else:
        if_cuda = False
        packer = CRFRepack(len(l_map), False)
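checkpoint_file here comes from a torch.load call that is cut off above the excerpt. A hedged sketch of that save/restore round trip, using a tiny stand-in module and a hypothetical path:

import torch

model = torch.nn.Linear(4, 2)  # stand-in for the real LSTM_CRF
torch.save({'state_dict': model.state_dict()}, 'checkpoint.model')
checkpoint_file = torch.load('checkpoint.model', map_location='cpu')  # CPU-safe load
model.load_state_dict(checkpoint_file['state_dict'])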
Example #4
                bichar_f_map,
                bichar_dt_f_set,
                args.caseless,
                args.unk,
                args.embedding_dim,
                shrink_to_corpus=True)
            print("embedding size: '{}'".format(len(bichar_f_map)))

    # construct dataset
    lexicon_train_dataset = utils.padding_lexicon_bucket(
        lexicon_train_features, lexicon_f_map, args.gpu)
    lexicon_dev_dataset = utils.padding_lexicon_bucket(lexicon_dev_features,
                                                       lexicon_f_map, args.gpu)

    dataset = utils.construct_bucket_mean_vb(train_features, train_labels,
                                             lexicon_train_dataset, f_map,
                                             l_map, train_bichar_features,
                                             bichar_f_map, args.caseless)
    dev_dataset = utils.construct_bucket_mean_vb(dev_features, dev_labels,
                                                 lexicon_dev_dataset, f_map,
                                                 l_map, dev_bichar_features,
                                                 bichar_f_map, args.caseless)
    lexicon_test_dataset = utils.padding_lexicon_bucket(
        lexicon_test_features, lexicon_f_map, args.gpu)
    test_dataset = utils.construct_bucket_mean_vb(test_features, test_labels,
                                                  lexicon_test_dataset, f_map,
                                                  l_map, test_bichar_features,
                                                  bichar_f_map, args.caseless)

    # build model
    print('building model')
    ner_model = LSTM_CRF(len(f_map),
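For context: in Chinese NER pipelines of this kind, "bichar" features are typically character bigrams. The helper below is a hypothetical illustration, not part of the utils module used above:

def to_bichar(chars):
    # Pair each character with its successor; pad the last one with <eof>.
    return [a + b for a, b in zip(chars, chars[1:] + ['<eof>'])]

print(to_bichar(list('南京市')))  # ['南京', '京市', '市<eof>']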
Example #5
        if not args.rand_embedding:
            print("feature size: '{}'".format(len(f_map)))
            print('loading embedding')
            if args.fine_tune:  # note: setting this flag resets f_map, so the dictionary is rebuilt from the embedding file rather than fine-tuned
                f_map = {'<eof>': 0}
            f_map, embedding_tensor, in_doc_words = utils.load_embedding_wlm(args.emb_file, ' ', f_map, dt_f_set, args.caseless, args.unk, args.embedding_dim, shrink_to_corpus=args.shrink_embedding)
            print("embedding size: '{}'".format(len(f_map)))

        l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), dev_labels))
        l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_labels), l_set)
        for label in l_set:
            if label not in l_map:
                l_map[label] = len(l_map)

    # construct dataset
    dataset = utils.construct_bucket_mean_vb(train_features, train_labels, f_map, l_map, args.caseless)
    dev_dataset = utils.construct_bucket_mean_vb(dev_features, dev_labels, f_map, l_map, args.caseless)
    test_dataset = utils.construct_bucket_mean_vb(test_features, test_labels, f_map, l_map, args.caseless)

    dataset_loader = [torch.utils.data.DataLoader(tup, args.batch_size, shuffle=True, drop_last=False) for tup in dataset]
    dev_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in dev_dataset]
    test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset]

    # build model
    print('building model')
    ner_model = LSTM_CRF(len(f_map), len(l_map), args.embedding_dim, args.hidden, args.layers, args.drop_out, large_CRF=args.small_crf)

    if args.load_check_point:
        ner_model.load_state_dict(checkpoint_file['state_dict'])
    else:
        if not args.rand_embedding:
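utils.load_embedding_wlm is repo-specific; as a rough sketch of what such a loader is assumed to do (read "word v1 v2 ..." lines, grow f_map, and stack the vectors), under the assumption of a plain-text embedding file:

import torch

def load_embeddings_sketch(path, f_map):
    vectors = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word, vec = parts[0], [float(x) for x in parts[1:]]
            if word not in f_map:        # extend the feature map in place
                f_map[word] = len(f_map)
                vectors.append(vec)
    return f_map, torch.tensor(vectors)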
Example #6
    # load corpus

    if args.test_file:
        with codecs.open(args.test_file, 'r', 'utf-8') as f:
            test_lines = f.readlines()
    else:
        with codecs.open(jd['test_file'], 'r', 'utf-8') as f:
            test_lines = f.readlines()

    # converting format

    test_features, test_labels = utils.read_corpus(test_lines)

    # construct dataset
    test_dataset = utils.construct_bucket_mean_vb(test_features, test_labels, f_map, l_map, jd['caseless'])
    
    test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset]

    # build model
    ner_model = LSTM_CRF(len(f_map), len(l_map), jd['embedding_dim'], jd['hidden'], jd['layers'], jd['drop_out'], large_CRF=jd['small_crf'])

    ner_model.load_state_dict(checkpoint_file['state_dict'])

    if args.gpu >= 0:
        if_cuda = True
        torch.cuda.set_device(args.gpu)
        ner_model.cuda()
        packer = CRFRepack(len(l_map), True)
    else:
        if_cuda = False
Example #7
                dt_f_set,
                args.caseless,
                args.unk,
                args.embedding_dim,
                shrink_to_corpus=args.shrink_embedding)
            print("embedding size: '{}'".format(
                len(f_map)))  #f_map表示预训练的词向量中所有的词

        l_set = functools.reduce(lambda x, y: x | y,
                                 map(lambda t: set(t), test_labels))
        for label in l_set:
            if label not in l_map:  # l_map already holds all training-set labels (built at line 107)
                l_map[label] = len(l_map)  # add the dev/test-set labels to l_map as well
    # construct dataset
    dataset = utils.construct_bucket_mean_vb(
        train_features, train_labels, f_map, l_map,
        args.caseless)  # f_map: words from the pretrained embeddings; l_map: every label seen in train/dev/test
    test_dataset = utils.construct_bucket_mean_vb(test_features, test_labels,
                                                  f_map, l_map, args.caseless)

    dataset_loader = [
        torch.utils.data.DataLoader(tup,
                                    args.batch_size,
                                    shuffle=True,
                                    drop_last=False) for tup in dataset
    ]
    test_dataset_loader = [
        torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False)
        for tup in test_dataset
    ]
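Since each bucket loader above is an ordinary DataLoader, evaluation code can chain them into a single stream. A self-contained toy with dummy buckets standing in for the real ones:

import itertools
import torch

buckets = [torch.utils.data.TensorDataset(torch.zeros(n, 10, dtype=torch.long))
           for n in (120, 75)]
loaders = [torch.utils.data.DataLoader(b, 50, shuffle=False, drop_last=False)
           for b in buckets]
for (batch,) in itertools.chain(*loaders):
    print(batch.shape)  # batches of 50, 50, 20 rows, then 50, 25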