else:
    with codecs.open(jd['test_file'], 'r', 'utf-8') as f:
        test_lines = f.readlines()

# converting format
test_features, test_labels, test_bichar_features = utils.read_corpus(test_lines)
with codecs.open(args.lexicon_test_file, 'r', 'utf-8') as f:
    lexicon_test_lines = f.readlines()
lexicon_test_features, lexicon_feature_map = utils.read_corpus_lexicon(
    lexicon_test_lines, test_features, lexicon_f_map)
lexicon_test_dataset = utils.padding_lexicon_bucket(
    lexicon_test_features, lexicon_f_map, args.gpu)

# construct dataset
test_dataset = utils.construct_bucket_mean_vb(
    test_features, test_labels, lexicon_test_dataset, f_map, l_map,
    test_bichar_features, bichar_f_map, jd['caseless'])

# build model
ner_model = LSTM_CRF(len(f_map), len(bichar_f_map), len(lexicon_f_map),
                     len(l_map), jd['embedding_dim'], jd['hidden'],
                     jd['layers'], jd['drop_out'], args.gpu, is_bichar,
                     large_CRF=jd['small_crf'])
ner_model.load_state_dict(checkpoint_file['state_dict'])

if args.gpu >= 0:
    if_cuda = True
    torch.cuda.set_device(args.gpu)
    ner_model.cuda()
    packer = CRFRepack(len(l_map), True)
else:
    if_cuda = False
    packer = CRFRepack(len(l_map), False)
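# The `args.gpu >= 0` branch above recurs in every script here. Below is a
# minimal, self-contained sketch of the same dispatch pattern; the helper
# name `to_device` and its arguments are illustrative, not part of this
# codebase, and the availability check is an added safeguard.
import torch
import torch.nn as nn

def to_device(model: nn.Module, gpu: int) -> bool:
    """Move `model` onto CUDA device `gpu` if requested and available; return True iff CUDA is used."""
    if gpu >= 0 and torch.cuda.is_available():
        torch.cuda.set_device(gpu)
        model.cuda()
        return True
    return False

if_cuda = to_device(nn.Linear(4, 2), gpu=-1)  # gpu=-1 keeps everything on the CPU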
# (tail of the utils.load_embedding_wlm call that loads the bichar embeddings)
        bichar_f_map, bichar_dt_f_set, args.caseless, args.unk,
        args.embedding_dim, shrink_to_corpus=True)
print("embedding size: '{}'".format(len(bichar_f_map)))

# construct dataset
lexicon_train_dataset = utils.padding_lexicon_bucket(
    lexicon_train_features, lexicon_f_map, args.gpu)
lexicon_dev_dataset = utils.padding_lexicon_bucket(
    lexicon_dev_features, lexicon_f_map, args.gpu)
dataset = utils.construct_bucket_mean_vb(
    train_features, train_labels, lexicon_train_dataset, f_map, l_map,
    train_bichar_features, bichar_f_map, args.caseless)
dev_dataset = utils.construct_bucket_mean_vb(
    dev_features, dev_labels, lexicon_dev_dataset, f_map, l_map,
    dev_bichar_features, bichar_f_map, args.caseless)
lexicon_test_dataset = utils.padding_lexicon_bucket(
    lexicon_test_features, lexicon_f_map, args.gpu)
test_dataset = utils.construct_bucket_mean_vb(
    test_features, test_labels, lexicon_test_dataset, f_map, l_map,
    test_bichar_features, bichar_f_map, args.caseless)

# build model
print('building model')
ner_model = LSTM_CRF(len(f_map), len(bichar_f_map), len(lexicon_f_map),
                     len(l_map), args.embedding_dim, args.hidden,
                     args.layers, args.drop_out, args.gpu, is_bichar,
                     large_CRF=args.small_crf)
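# The `checkpoint_file['state_dict']` lookups in these scripts assume the
# checkpoint was written by torch.save with a 'state_dict' key. A minimal,
# runnable round-trip of that convention (model and file name are toy
# illustrations, not repo artifacts):
import torch
import torch.nn as nn

toy_model = nn.Linear(3, 2)
torch.save({'state_dict': toy_model.state_dict()}, 'toy_checkpoint.pt')
checkpoint = torch.load('toy_checkpoint.pt')
toy_model.load_state_dict(checkpoint['state_dict'])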
if not args.rand_embedding:
    print("feature size: '{}'".format(len(f_map)))
    print('loading embedding')
    if args.fine_tune:  # i.e., the embedding dictionary is rebuilt rather than fine-tuned
        f_map = {'<eof>': 0}
    f_map, embedding_tensor, in_doc_words = utils.load_embedding_wlm(
        args.emb_file, ' ', f_map, dt_f_set, args.caseless, args.unk,
        args.embedding_dim, shrink_to_corpus=args.shrink_embedding)
    print("embedding size: '{}'".format(len(f_map)))

l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), dev_labels))
l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_labels), l_set)
for label in l_set:
    if label not in l_map:
        l_map[label] = len(l_map)

# construct dataset
dataset = utils.construct_bucket_mean_vb(train_features, train_labels, f_map, l_map, args.caseless)
dev_dataset = utils.construct_bucket_mean_vb(dev_features, dev_labels, f_map, l_map, args.caseless)
test_dataset = utils.construct_bucket_mean_vb(test_features, test_labels, f_map, l_map, args.caseless)
dataset_loader = [torch.utils.data.DataLoader(tup, args.batch_size, shuffle=True, drop_last=False) for tup in dataset]
dev_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in dev_dataset]
test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset]

# build model
print('building model')
ner_model = LSTM_CRF(len(f_map), len(l_map), args.embedding_dim, args.hidden,
                     args.layers, args.drop_out, large_CRF=args.small_crf)

if args.load_check_point:
    ner_model.load_state_dict(checkpoint_file['state_dict'])
else:
    if not args.rand_embedding:
# load corpus
if args.test_file:
    with codecs.open(args.test_file, 'r', 'utf-8') as f:
        test_lines = f.readlines()
else:
    with codecs.open(jd['test_file'], 'r', 'utf-8') as f:
        test_lines = f.readlines()

# converting format
test_features, test_labels = utils.read_corpus(test_lines)

# construct dataset
test_dataset = utils.construct_bucket_mean_vb(test_features, test_labels, f_map, l_map, jd['caseless'])
test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset]

# build model
ner_model = LSTM_CRF(len(f_map), len(l_map), jd['embedding_dim'], jd['hidden'],
                     jd['layers'], jd['drop_out'], large_CRF=jd['small_crf'])
ner_model.load_state_dict(checkpoint_file['state_dict'])

if args.gpu >= 0:
    if_cuda = True
    torch.cuda.set_device(args.gpu)
    ner_model.cuda()
    packer = CRFRepack(len(l_map), True)
else:
    if_cuda = False
    packer = CRFRepack(len(l_map), False)
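# The per-bucket DataLoader list above is the bucketing scheme used
# throughout these scripts: each bucket holds padded sequences of similar
# length, and evaluation simply walks the loaders in turn. A self-contained
# toy version of the same pattern (the bucket sizes and lengths are made up):
import itertools
import torch
from torch.utils.data import DataLoader, TensorDataset

buckets = [TensorDataset(torch.zeros(100, seq_len, dtype=torch.long)) for seq_len in (10, 20, 40)]
loaders = [DataLoader(b, batch_size=50, shuffle=False, drop_last=False) for b in buckets]
for (batch,) in itertools.chain.from_iterable(loaders):
    print(batch.shape)  # two batches of [50, 10], then [50, 20], then [50, 40]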
# (tail of the utils.load_embedding_wlm call that loads the word embeddings)
        dt_f_set, args.caseless, args.unk, args.embedding_dim,
        shrink_to_corpus=args.shrink_embedding)
    print("embedding size: '{}'".format(len(f_map)))  # f_map holds every word in the pre-trained embeddings

l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_labels))
for label in l_set:
    if label not in l_map:  # l_map starts with the training-set labels (built at line 107)
        l_map[label] = len(l_map)  # add the dev/test-set labels to l_map as well

# construct dataset
# f_map: words from the pre-trained embeddings; l_map: every label seen in train/dev/test
dataset = utils.construct_bucket_mean_vb(train_features, train_labels, f_map, l_map, args.caseless)
test_dataset = utils.construct_bucket_mean_vb(test_features, test_labels, f_map, l_map, args.caseless)
dataset_loader = [torch.utils.data.DataLoader(tup, args.batch_size, shuffle=True, drop_last=False) for tup in dataset]
test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset]
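# The reduce-over-sets and len()-indexing idioms above are the whole of the
# label-map construction: union the per-sentence label sets, then give each
# unseen label the next integer id. A runnable toy illustration (the labels
# are made up; sorted() is used only to make the printed ids deterministic):
import functools

toy_labels = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O']]
toy_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), toy_labels))
toy_map = {}
for label in sorted(toy_set):
    if label not in toy_map:
        toy_map[label] = len(toy_map)  # each unseen label gets the next integer id
print(toy_map)  # {'B-LOC': 0, 'B-PER': 1, 'I-PER': 2, 'O': 3}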