def build_dataloader(token_features, labels, batch_size, missing_tagspace,
                     corpus_mask_value, tag2idx, chr2idx, token2idx, caseless,
                     shuffle=False, drop_last=False):
    dataset, forw_dev, back_dev = utils.construct_bucket_mean_vb_wc(
        token_features, labels, missing_tagspace, tag2idx, chr2idx,
        token2idx, caseless, corpus_mask_value)
    dataloader = [
        torch.utils.data.DataLoader(tup, batch_size, shuffle=shuffle,
                                    drop_last=drop_last)
        for tup in dataset
    ]
    return dataloader
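# Why a list of loaders: construct_bucket_mean_vb_wc groups sentences of
# similar length so each bucket is padded only to its own maximum, and each
# bucket gets its own DataLoader (hence the list comprehension above).
# Below is a minimal self-contained sketch of that idea; the boundaries,
# padding index, and helper name bucket_by_length are illustrative
# assumptions, not the repo's actual implementation.
import torch
from torch.utils.data import DataLoader, TensorDataset

def bucket_by_length(seqs, labels, boundaries=(10, 20, 40), pad_idx=0):
    buckets = [[] for _ in range(len(boundaries) + 1)]
    for s, l in zip(seqs, labels):
        idx = sum(len(s) > b for b in boundaries)  # smallest bucket that fits
        buckets[idx].append((s, l))
    datasets = []
    for bucket in buckets:
        if not bucket:
            continue
        max_len = max(len(s) for s, _ in bucket)  # pad only to this bucket's max
        x = torch.full((len(bucket), max_len), pad_idx, dtype=torch.long)
        y = torch.full((len(bucket), max_len), pad_idx, dtype=torch.long)
        for i, (s, l) in enumerate(bucket):
            x[i, :len(s)] = torch.tensor(s)
            y[i, :len(l)] = torch.tensor(l)
        datasets.append(TensorDataset(x, y))
    return datasets

# One DataLoader per bucket, mirroring the comprehension in build_dataloader.
loaders = [DataLoader(ds, batch_size=16, shuffle=True)
           for ds in bucket_by_length([[1, 2, 3], [4] * 15, [5] * 30],
                                      [[0, 1, 0], [1] * 15, [0] * 30])]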
    c_thresholds=args.mini_count, if_shrink_w_feature=False)
f_set = {v for v in f_map}  # dict iteration yields the word keys
f_map = utils.shrink_features(f_map, train_features, args.mini_count)
dt_f_set = functools.reduce(lambda x, y: x | y,
                            map(lambda t: set(t), dev_features), f_set)
dt_f_set = functools.reduce(lambda x, y: x | y,
                            map(lambda t: set(t), test_features), dt_f_set)
f_map, embedding_tensor, in_doc_words = utils.load_embedding(
    args.emb_file, ' ', f_map, dt_f_set, args.unk,
    args.word_embedding_dim, shrink_to_corpus=args.shrink_embedding)
l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), dev_labels))
l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_labels), l_set)

print('constructing dataset')
dataset, dataset_onlycrf = utils.construct_bucket_mean_vb_wc(
    train_features, train_labels, CRF_l_map, SCRF_l_map, c_map, f_map,
    SCRF_stop_tag=SCRF_l_map['<STOP>'], ALLOW_SPANLEN=args.allowspan,
    train_set=True)
dev_dataset = utils.construct_bucket_mean_vb_wc(
    dev_features, dev_labels, CRF_l_map, SCRF_l_map, c_map, f_map,
    SCRF_stop_tag=SCRF_l_map['<STOP>'], train_set=False)
test_dataset = utils.construct_bucket_mean_vb_wc(
    test_features, test_labels, CRF_l_map, SCRF_l_map, c_map, f_map,
    SCRF_stop_tag=SCRF_l_map['<STOP>'], train_set=False)
dataset_loader = [torch.utils.data.DataLoader(tup, args.batch_size,
                                              shuffle=True, drop_last=False)
                  for tup in dataset]
dataset_loader_crf = ([torch.utils.data.DataLoader(tup, 3, shuffle=True,
                                                   drop_last=False)
                       for tup in dataset_onlycrf]
                      if dataset_onlycrf else None)
dev_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False,
                                                  drop_last=False)
                      for tup in dev_dataset]
test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False,
                                                   drop_last=False)
                       for tup in test_dataset]

print('building model')
model = ner_model(args.word_embedding_dim, args.word_hidden_dim,
                  args.word_lstm_layers, len(f_map), len(c_map),
                  args.char_embedding_dim, args.char_lstm_hidden_dim,
                  args.cnn_filter_num, args.char_lstm_layers, args.char_lstm,
                  args.dropout_ratio, args.high_way, args.highway_layers,
                  CRF_l_map['<start>'], CRF_l_map['<pad>'], len(CRF_l_map),
                  SCRF_l_map, args.scrf_dense_dim, in_doc_words,
                  args.index_embeds_dim, args.allowspan,
                  SCRF_l_map['<START>'], SCRF_l_map['<STOP>'], args.grconv)
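# The functools.reduce(lambda x, y: x | y, map(lambda t: set(t), ...)) idiom
# above simply unions per-sentence token (or label) sets into one set. A tiny
# self-contained illustration with made-up sentences:
import functools

sentences = [['EU', 'rejects', 'German', 'call'], ['German', 'Peter', 'Blackburn']]
vocab = functools.reduce(lambda x, y: x | y, map(set, sentences), set())
assert vocab == set().union(*sentences)  # an equivalent, arguably clearer spelling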
f_map, embedding_tensor, in_doc_words = utils.load_embedding_wlm(
    args.emb_file, ' ', f_map, dt_f_set, args.caseless, args.unk,
    args.word_dim, shrink_to_corpus=args.shrink_embedding)
print("embedding size: '{}'".format(len(f_map)))
for label in l_set:
    if label not in l_map:
        l_map[label] = len(l_map)

print('constructing dataset')
for i in range(file_num):
    # construct one bucketed dataset per corpus
    dataset, forw_corp, back_corp = utils.construct_bucket_mean_vb_wc(
        train_features[i], train_labels[i], l_map, char_map, f_map,
        args.caseless)
    dev_dataset, forw_dev, back_dev = utils.construct_bucket_mean_vb_wc(
        dev_features[i], dev_labels[i], l_map, char_map, f_map, args.caseless)
    test_dataset, forw_test, back_test = utils.construct_bucket_mean_vb_wc(
        test_features[i], test_labels[i], l_map, char_map, f_map,
        args.caseless)
    dataset_loader.append([
        torch.utils.data.DataLoader(tup, args.batch_size, shuffle=True,
                                    drop_last=False)
        for tup in dataset
    ])
    dev_dataset_loader.append([
        torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False)
        for tup in dev_dataset
    ])
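# The label-map loop above hands out dense indices to labels as they are
# first seen, so dev/test-only labels still get ids. Standalone illustration
# with invented labels:
l_map = {'<start>': 0, '<pad>': 1}
for label in ['B-PER', 'I-PER', 'O', 'B-PER']:
    if label not in l_map:
        l_map[label] = len(l_map)
assert l_map == {'<start>': 0, '<pad>': 1, 'B-PER': 2, 'I-PER': 3, 'O': 4}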
print("feature size: '{}'".format(len(f_map))) print('loading embedding') if args.fine_tune: # which means does not do fine-tune f_map = {'<eof>': 0} f_map, embedding_tensor, in_doc_words = utils.load_embedding_wlm(args.emb_file, ' ', f_map, dt_f_set, args.caseless, args.unk, args.word_dim, shrink_to_corpus=args.shrink_embedding) print("embedding size: '{}'".format(len(f_map))) l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), dev_labels)) l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_labels), l_set) for label in l_set: if label not in l_map: l_map[label] = len(l_map) print('constructing dataset') # construct dataset dataset, forw_corp, back_corp = utils.construct_bucket_mean_vb_wc(train_features, train_labels, l_map, c_map, f_map, args.caseless) dev_dataset, forw_dev, back_dev = utils.construct_bucket_mean_vb_wc(dev_features, dev_labels, l_map, c_map, f_map, args.caseless) test_dataset, forw_test, back_test = utils.construct_bucket_mean_vb_wc(test_features, test_labels, l_map, c_map, f_map, args.caseless) dataset_loader = [torch.utils.data.DataLoader(tup, args.batch_size, shuffle=True, drop_last=False) for tup in dataset] dev_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in dev_dataset] test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset] # build model print('building model') ner_model = LM_LSTM_CRF(len(l_map), len(c_map), args.char_dim, args.char_hidden, args.char_layers, args.word_dim, args.word_hidden, args.word_layers, len(f_map), args.drop_out, large_CRF=args.small_crf, if_highway=args.high_way, in_doc_words=in_doc_words, highway_layers = args.highway_layers) if args.load_check_point: ner_model.load_state_dict(checkpoint_file['state_dict']) else: if not args.rand_embedding:
c_map = checkpoint_file['c_map']
in_doc_words = checkpoint_file['in_doc_words']
SCRF_l_map = checkpoint_file['SCRF_l_map']
ALLOW_SPANLEN = checkpoint_file['ALLOW_SPANLEN']
with codecs.open(args.dev_file, 'r', 'utf-8') as f:
    dev_lines = f.readlines()
with codecs.open(args.test_file, 'r', 'utf-8') as f:
    test_lines = f.readlines()
dev_features, dev_labels = utils.read_corpus(dev_lines)
test_features, test_labels = utils.read_corpus(test_lines)
dev_dataset = utils.construct_bucket_mean_vb_wc(
    dev_features, dev_labels, CRF_l_map, SCRF_l_map, c_map, f_map,
    SCRF_stop_tag=SCRF_l_map['<STOP>'], train_set=False)
test_dataset = utils.construct_bucket_mean_vb_wc(
    test_features, test_labels, CRF_l_map, SCRF_l_map, c_map, f_map,
    SCRF_stop_tag=SCRF_l_map['<STOP>'], train_set=False)
dev_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False,
                                                  drop_last=False)
                      for tup in dev_dataset]
test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False,
                                                   drop_last=False)
                       for tup in test_dataset]

print('build model')
model = ner_model(jd['word_embedding_dim'], jd['word_hidden_dim'],
                  jd['word_lstm_layers'], len(f_map), len(c_map),
                  jd['char_embedding_dim'], jd['char_lstm_hidden_dim'],
                  jd['cnn_filter_num'], jd['char_lstm_layers'],
                  jd['char_lstm'], jd['dropout_ratio'], jd['high_way'],
                  jd['highway_layers'], CRF_l_map['<start>'],
                  CRF_l_map['<pad>'], len(CRF_l_map), SCRF_l_map,
                  jd['scrf_dense_dim'], in_doc_words, jd['index_embeds_dim'],
                  jd['allowspan'], SCRF_l_map['<START>'],
                  SCRF_l_map['<STOP>'], jd['grconv'])
print('load model')
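# Why f_map, c_map, and the label maps travel inside the checkpoint: at
# evaluation time the network must be rebuilt with exactly the vocabulary
# sizes it was trained with before load_state_dict can succeed. A minimal
# sketch of that round trip; the tiny Embedding and file name are
# hypothetical stand-ins.
import torch
import torch.nn as nn

f_map = {'<eof>': 0, 'the': 1, 'cat': 2}
model = nn.Embedding(len(f_map), 8)
torch.save({'state_dict': model.state_dict(), 'f_map': f_map}, 'ckpt.pt')

ckpt = torch.load('ckpt.pt')
restored = nn.Embedding(len(ckpt['f_map']), 8)  # size recovered from the saved map
restored.load_state_dict(ckpt['state_dict'])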
f_map = {'<eof>': 0}
f_map, embedding_tensor, in_doc_words = utils.load_embedding_wlm(
    args.emb_file, ' ', f_map, dt_f_set, args.caseless, args.unk,
    args.word_dim, shrink_to_corpus=args.shrink_embedding)
print("embedding size: '{}'".format(len(f_map)))
l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), dev_labels))
l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_labels), l_set)
for label in l_set:
    if label not in l_map:
        l_map[label] = len(l_map)

print('constructing dataset')
# construct dataset
dataset, forw_corp, back_corp = utils.construct_bucket_mean_vb_wc(
    train_features, train_labels, l_map, c_map, f_map, args.caseless)
dev_dataset, forw_dev, back_dev = utils.construct_bucket_mean_vb_wc(
    dev_features, dev_labels, l_map, c_map, f_map, args.caseless)
test_dataset, forw_test, back_test = utils.construct_bucket_mean_vb_wc(
    test_features, test_labels, l_map, c_map, f_map, args.caseless)
co_dataset, _, _ = utils.construct_bucket_mean_vb_wc(
    co_features, co_labels, l_map, c_map, f_map, args.caseless)
dataset_loader = [torch.utils.data.DataLoader(tup, args.batch_size,
                                              shuffle=True, drop_last=False)
                  for tup in dataset]
dev_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False,
                                                  drop_last=False)
                      for tup in dev_dataset]
test_dataset_loader = [torch.utils.data.DataLoader(tup, 1, shuffle=False,
                                                   drop_last=False)
                       for tup in test_dataset]
co_dataset_loader = [torch.utils.data.DataLoader(tup, args.batch_size,
                                                 shuffle=True, drop_last=False)
                     for tup in co_dataset]

# string match part
word_to_id, emb_tensor, emb = utils.string_match_embedding(args.emb_file,
                                                           args.word_dim)

# build model
print('building model')
def read_dataset(self, file_dict, dataset_name, *args, **kwargs):
    print('loading corpus')
    self.file_num = len(self.args.train_file)
    for i in range(self.file_num):
        with codecs.open(self.args.train_file[i], 'r', 'utf-8') as f:
            lines0 = f.readlines()
        lines0 = lines0[0:2000]  # cap each corpus at 2000 lines
        # print(len(lines0))
        self.lines.append(lines0)
    for i in range(self.file_num):
        with codecs.open(self.args.dev_file[i], 'r', 'utf-8') as f:
            dev_lines0 = f.readlines()
        dev_lines0 = dev_lines0[0:2000]
        self.dev_lines.append(dev_lines0)
    for i in range(self.file_num):
        with codecs.open(self.args.test_file[i], 'r', 'utf-8') as f:
            test_lines0 = f.readlines()
        test_lines0 = test_lines0[0:2000]
        self.test_lines.append(test_lines0)

    for i in range(self.file_num):
        dev_features0, dev_labels0 = utils.read_corpus(self.dev_lines[i])
        test_features0, test_labels0 = utils.read_corpus(self.test_lines[i])
        self.dev_features.append(dev_features0)
        self.test_features.append(test_features0)
        self.dev_labels.append(dev_labels0)
        self.test_labels.append(test_labels0)

        if self.args.output_annotation:  # NEW
            test_word0, test_word_tag0 = utils.read_features(self.test_lines[i])
            self.test_word.append(test_word0)
            self.test_word_tag.append(test_word_tag0)
            # print(len(self.test_word), len(self.test_labels))

        if self.args.load_check_point:
            if os.path.isfile(self.args.load_check_point):
                print("loading checkpoint: '{}'".format(self.args.load_check_point))
                self.checkpoint_file = torch.load(self.args.load_check_point)
                self.args.start_epoch = self.checkpoint_file['epoch']
                self.f_map = self.checkpoint_file['f_map']
                self.l_map = self.checkpoint_file['l_map']
                c_map = self.checkpoint_file['c_map']
                self.in_doc_words = self.checkpoint_file['in_doc_words']
                self.train_features, self.train_labels = utils.read_corpus(self.lines[i])
            else:
                print("no checkpoint found at: '{}'".format(self.args.load_check_point))
        else:
            print('constructing coding table')
            train_features0, train_labels0, self.f_map, self.l_map, self.char_count = \
                utils.generate_corpus_char(self.lines[i], self.f_map, self.l_map,
                                           self.char_count,
                                           c_thresholds=self.args.mini_count,
                                           if_shrink_w_feature=False)
            self.train_features.append(train_features0)
            self.train_labels.append(train_labels0)
            self.train_features_tot += train_features0

    shrink_char_count = [k for (k, v) in iter(self.char_count.items())
                         if v >= self.args.mini_count]
    self.char_map = {shrink_char_count[ind]: ind
                     for ind in range(0, len(shrink_char_count))}
    self.char_map['<u>'] = len(self.char_map)   # unk for char
    self.char_map[' '] = len(self.char_map)     # concat for char
    self.char_map['\n'] = len(self.char_map)    # eof for char

    f_set = {v for v in self.f_map}  # dict iteration yields the word keys
    dt_f_set = f_set
    self.f_map = utils.shrink_features(self.f_map, self.train_features_tot,
                                       self.args.mini_count)
    l_set = set()
    for i in range(self.file_num):
        dt_f_set = functools.reduce(lambda x, y: x | y,
                                    map(lambda t: set(t), self.dev_features[i]),
                                    dt_f_set)
        dt_f_set = functools.reduce(lambda x, y: x | y,
                                    map(lambda t: set(t), self.test_features[i]),
                                    dt_f_set)
        l_set = functools.reduce(lambda x, y: x | y,
                                 map(lambda t: set(t), self.dev_labels[i]), l_set)
        l_set = functools.reduce(lambda x, y: x | y,
                                 map(lambda t: set(t), self.test_labels[i]), l_set)

    if not self.args.rand_embedding:
        print("feature size: '{}'".format(len(self.f_map)))
        print('loading embedding')
        if self.args.fine_tune:  # despite the name, this branch means the embedding is NOT fine-tuned
            self.f_map = {'<eof>': 0}
        self.f_map, self.embedding_tensor, self.in_doc_words = utils.load_embedding_wlm(
            self.args.emb_file, ' ', self.f_map, dt_f_set, self.args.caseless,
            self.args.unk, self.args.word_dim,
            shrink_to_corpus=self.args.shrink_embedding)
        print("embedding size: '{}'".format(len(self.f_map)))

    for label in l_set:
        if label not in self.l_map:
            self.l_map[label] = len(self.l_map)

    print('constructing dataset')
    for i in range(self.file_num):
        # construct one bucketed dataset per corpus
        dataset, forw_corp, back_corp = utils.construct_bucket_mean_vb_wc(
            self.train_features[i], self.train_labels[i], self.l_map,
            self.char_map, self.f_map, self.args.caseless)
        dev_dataset, forw_dev, back_dev = utils.construct_bucket_mean_vb_wc(
            self.dev_features[i], self.dev_labels[i], self.l_map,
            self.char_map, self.f_map, self.args.caseless)
        test_dataset, forw_test, back_test = utils.construct_bucket_mean_vb_wc(
            self.test_features[i], self.test_labels[i], self.l_map,
            self.char_map, self.f_map, self.args.caseless)
        self.dataset_loader.append([
            torch.utils.data.DataLoader(tup, self.args.batch_size,
                                        shuffle=True, drop_last=False)
            for tup in dataset
        ])
        self.dev_dataset_loader.append([
            torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False)
            for tup in dev_dataset
        ])
        self.test_dataset_loader.append([
            torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False)
            for tup in test_dataset
        ])
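# The shrink_char_count block above is a frequency cutoff: characters seen
# fewer than mini_count times are dropped, survivors get dense indices, and
# the special symbols are appended last. Standalone sketch with invented
# counts:
char_count = {'a': 5, 'b': 1, 'c': 3}
mini_count = 2
kept = [c for c, n in char_count.items() if n >= mini_count]
char_map = {c: i for i, c in enumerate(kept)}
char_map['<u>'] = len(char_map)   # unknown character
char_map[' '] = len(char_map)     # word-separator character
char_map['\n'] = len(char_map)    # end-of-sentence character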
# load corpus
if args.test_file:
    with codecs.open(args.test_file, 'r', 'utf-8') as f:
        test_lines = f.readlines()
else:
    with codecs.open(jd['test_file'], 'r', 'utf-8') as f:
        test_lines = f.readlines()

# converting format
test_features, test_labels = utils.read_corpus(test_lines)

# construct dataset
test_dataset, forw_test, back_test = utils.construct_bucket_mean_vb_wc(
    test_features, test_labels, l_map, c_map, f_map, jd['caseless'])
test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False,
                                                   drop_last=False)
                       for tup in test_dataset]

# build model
ner_model = LM_LSTM_CRF(len(l_map), len(c_map), jd['char_dim'],
                        jd['char_hidden'], jd['char_layers'], jd['word_dim'],
                        jd['word_hidden'], jd['word_layers'], len(f_map),
                        jd['drop_out'], large_CRF=jd['small_crf'],
                        if_highway=jd['high_way'], in_doc_words=in_doc_words,
                        highway_layers=jd['highway_layers'])
ner_model.load_state_dict(checkpoint_file['state_dict'])

if args.gpu >= 0:
    if_cuda = True
    torch.cuda.set_device(args.gpu)
    ner_model.cuda()
    packer = CRFRepack_WC(len(l_map), True)
else:
    if_cuda = False
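# The same place-on-device pattern in isolation, using the newer
# torch.device API instead of set_device/.cuda(); the Linear layer is a
# hypothetical stand-in for ner_model.
import torch
import torch.nn as nn

model = nn.Linear(4, 2)  # stand-in for LM_LSTM_CRF
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)  # equivalent effect to set_device + .cuda() when a GPU exists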