def initialize_detector(self):
    """Load every resource the detector needs.

    Loads the kenlm statistical language model, word/char frequency
    dictionaries, the custom confusion set, custom segmentation
    dictionaries, the tokenizer, and the BERT masked-LM model.
    Sets ``self.initialized_detector`` to True on success.

    Raises:
        ImportError: if the optional ``kenlm`` dependency is missing.
    """
    t1 = time.time()
    try:
        import kenlm
    except ImportError:
        # BUG FIX: the concatenated message parts were missing separating
        # spaces, producing run-together sentences in the error text.
        raise ImportError(
            'mypycorrector dependencies are not fully installed, '
            'they are required for statistical language model. '
            'Please use "pip install kenlm" to install it. '
            'if you are Win, Please install kenlm in cgwin.')
    self.lm = kenlm.Model(self.language_model_path)
    logger.debug('Loaded language model: %s, spend: %s s' %
                 (self.language_model_path, str(time.time() - t1)))
    # Word / character frequency dictionaries
    t2 = time.time()
    self.word_freq = self.load_word_freq_dict(self.word_freq_path)
    self.char_freq = self.load_char_freq_dict(self.char_freq_path)
    t3 = time.time()
    logger.debug(
        'Loaded word freq, char freq file: %s, size: %d, spend: %s s' %
        (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
    # Custom confusion set
    self.custom_confusion = self._get_custom_confusion_dict(
        self.custom_confusion_path)
    t4 = time.time()
    logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                 (self.custom_confusion_path, len(self.custom_confusion),
                  str(t4 - t3)))
    # Custom segmentation dictionaries
    self.custom_word_freq = self.load_word_freq_dict(
        self.custom_word_freq_path)
    self.person_names = self.load_word_freq_dict(self.person_name_path)
    self.place_names = self.load_word_freq_dict(self.place_name_path)
    self.stopwords = self.load_word_freq_dict(self.stopwords_path)
    # Merge segmentation dictionaries and custom dictionaries
    self.custom_word_freq.update(self.person_names)
    self.custom_word_freq.update(self.place_names)
    self.custom_word_freq.update(self.stopwords)
    self.word_freq.update(self.custom_word_freq)
    t5 = time.time()
    # BUG FIX: this log line previously reported custom_confusion_path
    # instead of the custom word-frequency file that was actually loaded.
    logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                 (self.custom_word_freq_path, len(self.custom_word_freq),
                  str(t5 - t4)))
    self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                               custom_word_freq_dict=self.custom_word_freq,
                               custom_confusion_dict=self.custom_confusion)
    # BERT pretrained model
    t6 = time.time()
    self.bert_tokenizer = BertTokenizer(vocab_file=self.bert_model_vocab)
    self.MASK_TOKEN = "[MASK]"
    self.MASK_ID = self.bert_tokenizer.convert_tokens_to_ids(
        [self.MASK_TOKEN])[0]
    # Prepare model
    self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
    logger.debug("Loaded model ok, path: %s, spend: %.3f s." %
                 (self.bert_model_dir, time.time() - t6))
    self.initialized_detector = True
def main(args):
    # Evaluation entry point: select the dataset-specific utilities, load the
    # test split and a trained MGDST checkpoint, then run model_evaluation.
    if args.dataset == 'sim-R':
        from BERTDST_utils.simR_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'sim-M':
        from BERTDST_utils.simM_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'DSTC2':
        from BERTDST_utils.DSTC2_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'WOZ2.0':
        from BERTDST_utils.WOZ_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'MultiWOZ2.1':
        # MultiWOZ exports no static SLOT list; it is derived from the ontology.
        from BERTDST_utils.MultiWOZ_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, OP, make_slot_meta
        ontology = json.load(open(args.ontology_data_path))
        SLOT, ontology = make_slot_meta(ontology)
    slot_meta = SLOT
    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)
    # data_scale fixed at 1.0: always evaluate on the full test split.
    data = prepare_dataset(1.0, args.test_data_path, tokenizer, slot_meta,
                           args.test_size_window, args.max_seq_length,
                           args.test_MG)
    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = 0.1
    op2id = OP
    model = MGDST(model_config, len(op2id), len(slot_meta))
    ckpt = torch.load(args.model_ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)
    model.eval()
    # NOTE(review): `device` is not defined here — presumably a module-level
    # global; confirm against the rest of the file.
    model.to(device)
    model_evaluation(make_turn_label, postprocessing, state_equal, OP, model,
                     data, tokenizer, slot_meta, 0, args.test_size_window,
                     args.test_MG)
def run(self):
    """Stage the pretrained BERT archive and vocab, then build the
    tokenizer, load the news data and launch training."""
    remote_helper.get_remote_date(
        "https://www.flyai.com/m/chinese_base.zip")
    before_vocab_dir = os.path.join(os.getcwd(), 'vocab.txt')
    after_vocab_dir = os.path.join(args.bert_model_dir, 'vocab.txt')
    logger.info('>before_vocab_dir:{}'.format(before_vocab_dir))
    logger.info('>after_vocab_dir:{}'.format(after_vocab_dir))
    shutil.copyfile(before_vocab_dir, after_vocab_dir)
    if not os.path.exists(self.arguments.output_dir):
        os.mkdir(self.arguments.output_dir)
    # Effective per-step batch shrinks when gradients are accumulated.
    self.arguments.BATCH = (self.arguments.BATCH //
                            self.arguments.gradient_accumulation_steps)
    # Data preparation / tokenizer selection.
    # BUG FIX: previously a throwaway BertTokenizer(bert_vocab_file) instance
    # was constructed only to call the classmethod from_pretrained() on it;
    # call the classmethod on the class directly.
    tokenizer = BertTokenizer.from_pretrained(
        self.arguments.bert_model_dir,
        do_lower_case=self.arguments.do_lower_case)
    # Fetch data: news texts / category labels.
    train_news, train_category, dev_news, dev_category = self.generate()
    self.train(Net=Net,
               train_category=train_category,
               dev_category=dev_category,
               train_news=train_news,
               dev_news=dev_news,
               tokenizer=tokenizer)
def load_tokenizer(self):
    """Instantiate the tokenizer matching the configured model family."""
    cfg = self.model_configuration
    if cfg.is_xlnet:
        self.tokenizer = XLNetTokenizer.from_pretrained(
            cfg.bert_model, do_lower_case=cfg.do_lower)
    elif cfg.is_scibert:
        # SciBERT ships a plain vocab file rather than a published hub name.
        self.tokenizer = BertTokenizer(cfg.vocab_file,
                                       do_lower_case=cfg.do_lower)
    else:
        self.tokenizer = BertTokenizer.from_pretrained(
            cfg.bert_model, do_lower_case=cfg.do_lower)
def initialize_bert_detector(self):
    """Load the BERT masked-LM model and its tokenizer for error detection."""
    start = time.time()
    self.bert_tokenizer = BertTokenizer(vocab_file=self.bert_model_vocab)
    self.MASK_TOKEN = "[MASK]"
    # Resolve the id of the [MASK] token once; it is reused for every probe.
    self.MASK_ID = self.bert_tokenizer.convert_tokens_to_ids(
        [self.MASK_TOKEN])[0]
    # Prepare model
    self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
    logger.debug("Loaded model ok, path: %s, spend: %.3f s." %
                 (self.bert_model_dir, time.time() - start))
    self.initialized_bert_detector = True
def fit(self, tokens):
    """Create the wordpiece tokenizer; *tokens* is not consumed here.

    NOTE: We allow the model to use default: do_basic_tokenize.
    This potentially splits tokens into more tokens apart from subtokens:
    eg. Mr.Doe -> Mr . D ##oe (Note that . is not preceded by ##)
    We take this into account when creating the token_flags in
    function text_to_token_flags.

    Returns self so calls can be chained.
    """
    self.tokenizer = BertTokenizer(
        self.bert_filename,
        # do_basic_tokenize=self.do_basic_tokenize,
        do_lower_case=self.do_lower_case)
    return self
def init_params():
    """Resolve the task processor and tokenizer from the global ``args``.

    Returns:
        tuple: (processor, tokenizer).

    Raises:
        ValueError: for an unknown task name or unsupported model type.
    """
    processors = {"sentiment_analysis": SentiAnalysisProcessor}
    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    if args.model_type == 'bert':
        tokenizer = BertTokenizer(vocab_file=args.VOCAB_FILE)
    elif args.model_type == 'xlnet':
        tokenizer = XLNetTokenizer.from_pretrained(
            os.path.join(args.ROOT_DIR, args.xlnet_model),
            do_lower_case=args.do_lower_case)
    else:
        # BUG FIX: an unrecognised model type previously fell through and
        # raised UnboundLocalError on the return statement below; fail with
        # an explicit, descriptive error instead.
        raise ValueError("Unsupported model type: %s" % (args.model_type))
    return processor, tokenizer
def __init__(self,
             pretrained_model=None,
             vocab_file=None,
             do_lower_case=True,
             max_len=None,
             do_basic_tokenize=True,
             never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
    """Wrap a BertTokenizer built either from a hub name or a vocab file."""
    if pretrained_model:
        tokenizer = BertTokenizer.from_pretrained(pretrained_model)
        # Cased checkpoints must not be lower-cased by the basic tokenizer.
        if "uncased" not in pretrained_model:
            tokenizer.basic_tokenizer.do_lower_case = False
    else:
        tokenizer = BertTokenizer(vocab_file, do_lower_case,
                                  do_basic_tokenize)
    self.tokenizer = tokenizer
    self.vocab_size = len(tokenizer.vocab)
    self.never_split = never_split
def __init__(self, args):
    """Set up a BERT BPE tokenizer backed by pytorch_transformers 1.0.0."""
    try:
        from pytorch_transformers import BertTokenizer
        from pytorch_transformers.tokenization_utils import clean_up_tokenization
    except ImportError:
        raise ImportError(
            'Please install 1.0.0 version of pytorch_transformers'
            'with: pip install pytorch-transformers')
    if 'bpe_vocab_file' in args:
        # An explicit vocab file takes precedence over published checkpoints.
        self.bert_tokenizer = BertTokenizer(
            args.bpe_vocab_file, do_lower_case=not args.bpe_cased)
    else:
        checkpoint = 'bert-base-cased' if args.bpe_cased else 'bert-base-uncased'
        self.bert_tokenizer = BertTokenizer.from_pretrained(checkpoint)
    self.clean_up_tokenization = clean_up_tokenization
def main(args):
    """Evaluate a trained TransformerDST checkpoint on the test split."""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    ontology = json.load(
        open(os.path.join(args.data_root, args.ontology_data)))
    slot_meta, _ = make_slot_meta(ontology)
    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)
    data = prepare_dataset(os.path.join(args.data_root, args.test_data),
                           tokenizer, slot_meta, args.n_history,
                           args.max_seq_length, args.op_code)
    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = 0.1
    op2id = OP_SET[args.op_code]
    model = TransformerDST(model_config, len(op2id), len(domain2id),
                           op2id['update'])
    ckpt = torch.load(args.model_ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)
    model.eval()
    model.to(device)
    if args.eval_all:
        # Sweep the (gt_op, gt_p_state, gt_gen) flag combinations in exactly
        # the order the original hand-written call sequence used.
        flag_combos = [
            (False, False, False), (False, False, True),
            (False, True, False), (False, True, True),
            (True, False, False), (True, True, False),
            (True, False, True), (True, True, True),
        ]
        for gt_op, gt_p_state, gt_gen in flag_combos:
            model_evaluation(model, data, tokenizer, slot_meta, 0,
                             args.op_code, gt_op, gt_p_state, gt_gen)
    else:
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         args.gt_op, args.gt_p_state, args.gt_gen)
def __init__(self, model_directory, vocab_file, lower=False):
    """Load a BERT masked-LM model, its tokenizer and special-token ids."""
    # Load pre-trained model (weights)
    self.model = BertForMaskedLM.from_pretrained(model_directory)
    self.model.eval()
    self.cuda = torch.cuda.is_available()
    if self.cuda:
        self.model = self.model.cuda()
    # Load pre-trained model tokenizer (vocabulary)
    self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                   do_lower_case=lower)
    self.CLS = '[CLS]'
    self.SEP = '[SEP]'
    self.MASK = '[MASK]'
    # Resolve the three special tokens to vocabulary ids.
    to_ids = self.tokenizer.convert_tokens_to_ids
    self.mask_id = to_ids([self.MASK])[0]
    self.sep_id = to_ids([self.SEP])[0]
    self.cls_id = to_ids([self.CLS])[0]
def init(self):
    """Restore the fine-tuned classifier from disk and build its tokenizer."""
    bert_config = BertConfig(self.args.output_config_file)
    if os.path.exists(self.args.output_model_file):
        # Model-specific hyper-parameters must be re-attached to the config
        # before the network can be rebuilt.
        if self.args.model_name == 'BertCNNPlus':
            bert_config.filter_num = self.args.filter_num
            bert_config.filter_sizes = [
                int(val) for val in self.args.filter_sizes.split()
            ]
        elif self.args.model_name == 'BertRCNN':
            bert_config.rnn_hidden_size = self.args.rnn_hidden_size
            bert_config.num_layers = self.args.num_layers
            bert_config.bidirectional = self.args.bidirectional
            bert_config.dropout = self.args.dropout
        else:
            pass
        self.model = Net(config=bert_config)
        self.model.load_state_dict(torch.load(self.args.output_model_file))
        self.model.to(DEVICE)
        # BUG FIX: previously a throwaway BertTokenizer(bert_vocab_file)
        # instance was constructed only to call the classmethod
        # from_pretrained() on it; call the classmethod directly.
        self.tokenizer = BertTokenizer.from_pretrained(
            self.args.bert_model_dir,
            do_lower_case=self.args.do_lower_case)
def __init__(self):
    """Collect inference-time paths/hyper-parameters and load the idiom
    index mappings used by the tester."""
    # BUG FIX: use_gpu was computed and assigned twice with the identical
    # expression; keep a single assignment.
    self.use_gpu = t.cuda.is_available()
    self.vocab_root = "../kernel/vocab.txt"
    self.bert_config_root = "../kernel/bert_config.json"
    self.pretrained_bert_root = "../kernel/chr_idiombert.bin"
    self.raw_test_data_root = "../data/test.txt"
    self.test_ans_root = "../kernel/dev_ans.csv"
    self.idiom_vocab_root = "../kernel/idiomList.txt"
    self.prob_file = "../kernel/prob.csv"
    self.data_root = "../kernel/"
    self.split_test_data_root = "../kernel/split_test_data.json"
    self.tokenizer = BertTokenizer(vocab_file=self.vocab_root)
    self.num_workers = 4
    self.test_batch_size = 512
    self.max_seq_length = 128
    # index <-> idiom lookup tables, serialized with pickle
    with open(self.data_root + "idiom2index", mode="rb") as f1:
        self.idiom2index = pickle.load(f1)
    with open(self.data_root + "index2idiom", mode="rb") as f2:
        self.index2idiom = pickle.load(f2)
    self.hidden_dropout_prob = 0.5
    self.device = t.device("cuda" if t.cuda.is_available() else "cpu")
def main(args):
    # Training entry point for MGDST: seeds all RNGs, builds the dataset,
    # trains with separate encoder/decoder optimizers, evaluates per epoch,
    # and finally tests with the best checkpoint.
    def worker_init_fn(worker_id):
        # Give each DataLoader worker a distinct, reproducible numpy seed.
        np.random.seed(args.random_seed + worker_id)
    if args.dataset == 'sim-R':
        from BERTDST_utils.simR_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'sim-M':
        from BERTDST_utils.simM_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'DSTC2':
        from BERTDST_utils.DSTC2_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'WOZ2.0':
        from BERTDST_utils.WOZ_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'MultiWOZ2.1':
        # MultiWOZ exports no static SLOT list; derive it from the ontology.
        from BERTDST_utils.MultiWOZ_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, OP, make_slot_meta
        ontology = json.load(open(args.ontology_data_path))
        SLOT, ontology = make_slot_meta(ontology)
    n_gpu = 0
    if torch.cuda.is_available():
        n_gpu = torch.cuda.device_count()
    # Seed every RNG source for reproducibility.
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    rng = random.Random(args.random_seed)
    torch.manual_seed(args.random_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed(args.random_seed)
        torch.cuda.manual_seed_all(args.random_seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
    slot_meta = SLOT
    op2id = OP
    print(op2id)
    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)
    # Train split may be sub-sampled by train_scale; dev/test always use 1.0.
    train_data_raw = prepare_dataset(data_scale=args.train_scale,
                                     data_path=args.train_data_path,
                                     tokenizer=tokenizer,
                                     slot_meta=slot_meta,
                                     size_window=args.train_size_window,
                                     max_seq_length=args.max_seq_length,
                                     multi_granularity=args.train_MG,
                                     data_type='train')
    train_data = MultiWozDataset(train_data_raw, tokenizer, slot_meta,
                                 args.max_seq_length, rng,
                                 args.word_dropout)
    print("# train examples %d" % len(train_data_raw))
    dev_data_raw = prepare_dataset(data_scale=1.0,
                                   data_path=args.dev_data_path,
                                   tokenizer=tokenizer,
                                   slot_meta=slot_meta,
                                   size_window=args.test_size_window,
                                   max_seq_length=args.max_seq_length,
                                   multi_granularity=args.test_MG,
                                   data_type='dev')
    print("# dev examples %d" % len(dev_data_raw))
    test_data_raw = prepare_dataset(data_scale=1.0,
                                    data_path=args.test_data_path,
                                    tokenizer=tokenizer,
                                    slot_meta=slot_meta,
                                    size_window=args.test_size_window,
                                    max_seq_length=args.max_seq_length,
                                    multi_granularity=args.test_MG,
                                    data_type='test')
    print("# test examples %d" % len(test_data_raw))
    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
    model_config.hidden_dropout_prob = args.hidden_dropout_prob
    model = MGDST(model_config, len(op2id), len(slot_meta))
    ckpt = torch.load(args.bert_ckpt_path, map_location='cpu')
    # Remap old-style checkpoint keys (bert. prefix, gamma/beta LayerNorm
    # names) and drop the MLM head ('cls.') weights.
    ckpt1 = {
        k.replace('bert.', '').replace('gamma', 'weight').replace('beta', 'bias'): v
        for k, v in ckpt.items() if 'cls.' not in k
    }
    model.encoder.bert.load_state_dict(ckpt1)
    #model.encoder.bert.from_pretrained(args.bert_ckpt_path)
    # NOTE(review): `device` is not defined locally — presumably a
    # module-level global; confirm against the rest of the file.
    model.to(device)
    num_train_steps = int(
        len(train_data_raw) / args.batch_size * args.n_epochs)
    # Weight decay is disabled for biases and LayerNorm parameters.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    enc_param_optimizer = list(model.encoder.named_parameters())
    enc_optimizer_grouped_parameters = [{
        'params': [
            p for n, p in enc_param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [
            p for n, p in enc_param_optimizer
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    # Separate optimizers/schedulers for encoder and decoder.
    enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr)
    enc_scheduler = WarmupLinearSchedule(enc_optimizer,
                                         int(num_train_steps * args.enc_warmup),
                                         t_total=num_train_steps)
    dec_param_optimizer = list(model.decoder.parameters())
    dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr)
    dec_scheduler = WarmupLinearSchedule(dec_optimizer,
                                         int(num_train_steps * args.dec_warmup),
                                         t_total=num_train_steps)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size,
                                  collate_fn=train_data.collate_fn,
                                  num_workers=args.num_workers,
                                  worker_init_fn=worker_init_fn)
    loss_fnc = nn.CrossEntropyLoss()
    best_score = {'epoch': 0, 'joint_acc': 0, 'op_acc': 0, 'final_slot_f1': 0}
    total_step = 0
    for epoch in range(args.n_epochs):
        batch_loss = []
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch = [
                b.to(device) if not isinstance(b, int) else b for b in batch
            ]
            input_ids, input_mask, segment_ids, op_ids, gen_ids = batch
            state_scores, span_scores = model(input_ids=input_ids,
                                              token_type_ids=segment_ids,
                                              attention_mask=input_mask)
            loss_state = loss_fnc(
                state_scores.contiguous().view(-1, len(op2id)),
                op_ids.contiguous().view(-1))
            try:
                loss_span = masked_cross_entropy_for_value(
                    span_scores.contiguous(), gen_ids.contiguous(),
                    tokenizer.vocab['[PAD]'])
            except Exception as e:
                # NOTE(review): if this branch fires on the very first batch,
                # loss_span is unbound in the expression below — verify.
                print(e)
            # Fixed weighting between state-op loss and span loss.
            loss = loss_state * 0.8 + loss_span * 0.2
            batch_loss.append(loss.item())
            loss.backward()
            enc_optimizer.step()
            enc_scheduler.step()
            dec_optimizer.step()
            dec_scheduler.step()
            model.zero_grad()
            total_step += 1
            if step % 100 == 0:
                print("[%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, span_loss : %.3f" \
                      % (epoch+1, args.n_epochs, step, len(train_dataloader),
                         np.mean(batch_loss), loss_state.item(), loss_span.item()))
                batch_loss = []
        if (epoch + 1) % args.eval_epoch == 0:
            print('total_step: ', total_step)
            eval_res = model_evaluation(make_turn_label, postprocessing,
                                        state_equal, OP, model, dev_data_raw,
                                        tokenizer, slot_meta, epoch + 1,
                                        args.test_size_window, args.test_MG)
            # Keep the checkpoint with the best joint accuracy so far.
            if eval_res['joint_acc'] > best_score['joint_acc']:
                best_score = eval_res
                model_to_save = model.module if hasattr(model, 'module') else model
                save_path = os.path.join(
                    args.save_dir,
                    'model_best_gran[%s]_scale[%s]_seed[%s].bin' %
                    (str(args.train_size_window), str(args.train_scale),
                     args.random_seed))
                torch.save(model_to_save.state_dict(), save_path)
            print("Best Score : ", best_score)
            print("\n")
            # Early stopping once no improvement for `patience` epochs.
            if epoch > args.patience_start_epoch and best_score[
                    'epoch'] + args.patience < epoch:
                print("out of patience...")
                break
    print("Test using best model...")
    best_epoch = best_score['epoch']
    ckpt_path = os.path.join(
        args.save_dir, 'model_best_gran[%s]_scale[%s]_seed[%s].bin' %
        (str(args.train_size_window), str(args.train_scale),
         args.random_seed))
    model = MGDST(model_config, len(op2id), len(slot_meta))
    ckpt = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)
    model.to(device)
    model_evaluation(make_turn_label, postprocessing, state_equal, OP, model,
                     test_data_raw, tokenizer, slot_meta, best_epoch,
                     args.test_size_window, args.test_MG)
def load(self, filename):
    """Re-create the wordpiece tokenizer from *filename*; returns self."""
    tokenizer = BertTokenizer(
        filename,
        # do_basic_tokenize=self.do_basic_tokenize,
        do_lower_case=self.do_lower_case)
    self.tokenizer = tokenizer
    return self
def __init__(self, vocab_path, do_lower_case, min_freq_words=None):
    """Hold a vocab-file-based BertTokenizer.

    min_freq_words is accepted but not used in this constructor.
    """
    self.tokenizer = BertTokenizer(vocab_file=vocab_path,
                                   do_lower_case=do_lower_case)
def main(args):
    # Training entry point for SomDST: seeds all RNGs, prepares the MultiWOZ
    # data, trains with separate encoder/decoder optimizers, evaluates per
    # epoch, and finally tests the best checkpoint under all ground-truth
    # flag combinations.
    def worker_init_fn(worker_id):
        # Give each DataLoader worker a distinct, reproducible numpy seed.
        np.random.seed(args.random_seed + worker_id)
    n_gpu = 0
    if torch.cuda.is_available():
        n_gpu = torch.cuda.device_count()
    # Seed every RNG source for reproducibility.
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    rng = random.Random(args.random_seed)
    torch.manual_seed(args.random_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed(args.random_seed)
        torch.cuda.manual_seed_all(args.random_seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
    ontology = json.load(open(args.ontology_data))
    slot_meta, ontology = make_slot_meta(ontology)
    op2id = OP_SET[args.op_code]
    print(op2id)
    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)
    train_data_raw = prepare_dataset(data_path=args.train_data_path,
                                     tokenizer=tokenizer,
                                     slot_meta=slot_meta,
                                     n_history=args.n_history,
                                     max_seq_length=args.max_seq_length,
                                     op_code=args.op_code)
    train_data = MultiWozDataset(train_data_raw,
                                 tokenizer,
                                 slot_meta,
                                 args.max_seq_length,
                                 rng,
                                 ontology,
                                 args.word_dropout,
                                 args.shuffle_state,
                                 args.shuffle_p)
    print("# train examples %d" % len(train_data_raw))
    dev_data_raw = prepare_dataset(data_path=args.dev_data_path,
                                   tokenizer=tokenizer,
                                   slot_meta=slot_meta,
                                   n_history=args.n_history,
                                   max_seq_length=args.max_seq_length,
                                   op_code=args.op_code)
    print("# dev examples %d" % len(dev_data_raw))
    test_data_raw = prepare_dataset(data_path=args.test_data_path,
                                    tokenizer=tokenizer,
                                    slot_meta=slot_meta,
                                    n_history=args.n_history,
                                    max_seq_length=args.max_seq_length,
                                    op_code=args.op_code)
    print("# test examples %d" % len(test_data_raw))
    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
    model_config.hidden_dropout_prob = args.hidden_dropout_prob
    model = SomDST(model_config, len(op2id), len(domain2id), op2id['update'],
                   args.exclude_domain)
    if not os.path.exists(args.bert_ckpt_path):
        args.bert_ckpt_path = download_ckpt(args.bert_ckpt_path,
                                            args.bert_config_path, 'assets')
    ckpt = torch.load(args.bert_ckpt_path, map_location='cpu')
    model.encoder.bert.load_state_dict(ckpt)
    # re-initialize added special tokens ([SLOT], [NULL], [EOS])
    model.encoder.bert.embeddings.word_embeddings.weight.data[1].normal_(mean=0.0, std=0.02)
    model.encoder.bert.embeddings.word_embeddings.weight.data[2].normal_(mean=0.0, std=0.02)
    model.encoder.bert.embeddings.word_embeddings.weight.data[3].normal_(mean=0.0, std=0.02)
    # NOTE(review): `device` is not defined locally — presumably a
    # module-level global; confirm against the rest of the file.
    model.to(device)
    num_train_steps = int(len(train_data_raw) / args.batch_size * args.n_epochs)
    # Weight decay is disabled for biases and LayerNorm parameters.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    enc_param_optimizer = list(model.encoder.named_parameters())
    enc_optimizer_grouped_parameters = [
        {'params': [p for n, p in enc_param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    # Separate optimizers/schedulers for encoder and decoder.
    enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr)
    enc_scheduler = WarmupLinearSchedule(enc_optimizer,
                                         int(num_train_steps * args.enc_warmup),
                                         t_total=num_train_steps)
    dec_param_optimizer = list(model.decoder.parameters())
    dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr)
    dec_scheduler = WarmupLinearSchedule(dec_optimizer,
                                         int(num_train_steps * args.dec_warmup),
                                         t_total=num_train_steps)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size,
                                  collate_fn=train_data.collate_fn,
                                  num_workers=args.num_workers,
                                  worker_init_fn=worker_init_fn)
    loss_fnc = nn.CrossEntropyLoss()
    best_score = {'epoch': 0, 'joint_acc': 0, 'op_acc': 0, 'final_slot_f1': 0}
    for epoch in range(args.n_epochs):
        batch_loss = []
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch = [b.to(device) if not isinstance(b, int) else b for b in batch]
            input_ids, input_mask, segment_ids, state_position_ids, op_ids,\
            domain_ids, gen_ids, max_value, max_update = batch
            # Randomly feed gold generation targets to the decoder
            # (scheduled teacher forcing).
            if rng.random() < args.decoder_teacher_forcing:  # teacher forcing
                teacher = gen_ids
            else:
                teacher = None
            domain_scores, state_scores, gen_scores = model(input_ids=input_ids,
                                                            token_type_ids=segment_ids,
                                                            state_positions=state_position_ids,
                                                            attention_mask=input_mask,
                                                            max_value=max_value,
                                                            op_ids=op_ids,
                                                            max_update=max_update,
                                                            teacher=teacher)
            loss_s = loss_fnc(state_scores.view(-1, len(op2id)), op_ids.view(-1))
            loss_g = masked_cross_entropy_for_value(gen_scores.contiguous(),
                                                    gen_ids.contiguous(),
                                                    tokenizer.vocab['[PAD]'])
            loss = loss_s + loss_g
            # The auxiliary domain-classification loss is optional.
            if args.exclude_domain is not True:
                loss_d = loss_fnc(domain_scores.view(-1, len(domain2id)),
                                  domain_ids.view(-1))
                loss = loss + loss_d
            batch_loss.append(loss.item())
            loss.backward()
            enc_optimizer.step()
            enc_scheduler.step()
            dec_optimizer.step()
            dec_scheduler.step()
            model.zero_grad()
            if step % 100 == 0:
                if args.exclude_domain is not True:
                    print("[%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f, dom_loss : %.3f" \
                          % (epoch+1, args.n_epochs, step, len(train_dataloader),
                             np.mean(batch_loss), loss_s.item(), loss_g.item(), loss_d.item()))
                else:
                    print("[%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f" \
                          % (epoch+1, args.n_epochs, step, len(train_dataloader),
                             np.mean(batch_loss), loss_s.item(), loss_g.item()))
                batch_loss = []
        if (epoch+1) % args.eval_epoch == 0:
            eval_res = model_evaluation(model, dev_data_raw, tokenizer,
                                        slot_meta, epoch+1, args.op_code)
            # Keep the checkpoint with the best joint accuracy so far.
            if eval_res['joint_acc'] > best_score['joint_acc']:
                best_score = eval_res
                model_to_save = model.module if hasattr(model, 'module') else model
                save_path = os.path.join(args.save_dir, 'model_best.bin')
                torch.save(model_to_save.state_dict(), save_path)
            print("Best Score : ", best_score)
            print("\n")
    print("Test using best model...")
    best_epoch = best_score['epoch']
    ckpt_path = os.path.join(args.save_dir, 'model_best.bin')
    model = SomDST(model_config, len(op2id), len(domain2id), op2id['update'],
                   args.exclude_domain)
    ckpt = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)
    model.to(device)
    # Evaluate every combination of ground-truth op / previous-state /
    # generation flags for the ablation table.
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch,
                     args.op_code, is_gt_op=False, is_gt_p_state=False, is_gt_gen=False)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch,
                     args.op_code, is_gt_op=False, is_gt_p_state=False, is_gt_gen=True)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch,
                     args.op_code, is_gt_op=False, is_gt_p_state=True, is_gt_gen=False)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch,
                     args.op_code, is_gt_op=False, is_gt_p_state=True, is_gt_gen=True)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch,
                     args.op_code, is_gt_op=True, is_gt_p_state=False, is_gt_gen=False)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch,
                     args.op_code, is_gt_op=True, is_gt_p_state=True, is_gt_gen=False)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch,
                     args.op_code, is_gt_op=True, is_gt_p_state=False, is_gt_gen=True)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch,
                     args.op_code, is_gt_op=True, is_gt_p_state=True, is_gt_gen=True)
def __init__(self, vocab_path, do_lower_case):
    """Hold a BertTokenizer built from an on-disk vocab file."""
    self.tokenizer = BertTokenizer(vocab_file=vocab_path,
                                   do_lower_case=do_lower_case)
def get_tokenizer(vocab_file=None):
    """Build a cased BertTokenizer from *vocab_file* (default: VOCAB_FILE).

    Raises:
        FileNotFoundError: if the vocabulary file does not exist.
    """
    vocab_file = VOCAB_FILE if vocab_file is None else vocab_file
    # BUG FIX: os.path.isfile() was previously called and its result
    # discarded (dead code); actually fail fast when the file is missing.
    if not os.path.isfile(vocab_file):
        raise FileNotFoundError(vocab_file)
    tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=False)
    return tokenizer
def main(log_in_file, lm_path, lm_type, data_path, usegpu, n_fold, total_step,
         eval_every, early_stop, lr, weight_decay, lr_decay_in_layers,
         wd_decay_in_layers, max_length, max_title_rate, content_head_rate,
         batch_size, lr_scheduler_type, input_pattern, clean_method,
         warmup_rate, classifier_dropout, classifier_active, seed):
    # K-fold fine-tuning of a BERT/XLNet sentiment classifier with per-layer
    # learning-rate/weight-decay decay, averaging test predictions per fold.
    # Snapshot all CLI arguments for logging before more locals are created.
    arg_name_value_pairs = deepcopy(locals())
    prefix = time.strftime('%Y%m%d_%H%M')
    logger = logging.getLogger('default')
    formatter = logging.Formatter("%(asctime)s %(message)s")
    if log_in_file:
        handler1 = logging.FileHandler(prefix + '.log')
        handler1.setFormatter(formatter)
        handler1.setLevel(logging.DEBUG)
        logger.addHandler(handler1)
    handler2 = logging.StreamHandler()
    handler2.setFormatter(formatter)
    handler2.setLevel(logging.DEBUG)
    logger.addHandler(handler2)
    logger.setLevel(logging.DEBUG)
    for arg_name, arg_value in arg_name_value_pairs.items():
        logger.info(f'{arg_name}: {arg_value}')
    # The tokenizer and padding tokens are shared via module globals.
    global tokenizer
    if lm_type == 'bert':
        tokenizer = BertTokenizer(os.path.join(lm_path, 'vocab.txt'))
    else:
        tokenizer = XLNetTokenizer(os.path.join(lm_path, 'spiece.model'))
    global PAD, PAD_t, CLS_t, SEP_t
    PAD_t = '<pad>'
    CLS_t = '<cls>'
    SEP_t = '<sep>'
    PAD = tokenizer.convert_tokens_to_ids([PAD_t])[0]
    logger.info(f'padding token is {PAD}')
    processed_train = preprocess(
        os.path.join(data_path, 'Train_DataSet.csv'),
        os.path.join(data_path, 'Train_DataSet_Label.csv'), tokenizer,
        max_length, input_pattern, clean_method, max_title_rate,
        content_head_rate, logger)
    processed_test = preprocess(os.path.join(data_path, 'Test_DataSet.csv'),
                                False, tokenizer, max_length, input_pattern,
                                clean_method, max_title_rate,
                                content_head_rate, logger)
    logger.info('seed everything and create model')
    seed_everything(seed)
    # Weight decay is disabled for biases and LayerNorm parameters.
    no_decay = ['.bias', 'LayerNorm.bias', 'LayerNorm.weight']
    if lm_type == 'xlnet':
        model = XLNetForSequenceClassification.from_pretrained(
            lm_path, num_labels=3, summary_last_dropout=classifier_dropout)
        if classifier_active == 'relu':
            model.sequence_summary.activation = nn.ReLU()
        if usegpu:
            model = model.cuda()
        # Layer names ordered bottom-to-top for per-layer LR decay below.
        model_layer_names = [
            'transformer.mask_emb', 'transformer.word_embedding.weight'
        ]
        model_layer_names += [
            f'transformer.layer.{i}.' for i in range(model.config.n_layer)
        ]
        model_layer_names += ['sequence_summary.summary', 'logits_proj']
    else:
        model = BertForSequenceClassification.from_pretrained(
            lm_path, num_labels=3, hidden_dropout_prob=classifier_dropout)
        if classifier_active == 'relu':
            model.bert.pooler.activation = nn.ReLU()
        if usegpu:
            model = model.cuda()
        model_layer_names = ['bert.embeddings']
        model_layer_names += [
            'bert.encoder.layer.{}.'.format(i)
            for i in range(model.config.num_hidden_layers)
        ]
        model_layer_names += ['bert.pooler', 'classifier']
    # Layers closer to the output (reversed order) get the largest LR; deeper
    # layers are geometrically decayed by lr_decay_in_layers.
    # NOTE(review): the duplicated `optimizer = optimizer =` is kept verbatim;
    # it is harmless but should be a single assignment.
    optimizer = optimizer = AdamW([{
        'params': [
            p for n, p in model.named_parameters()
            if layer_name in n and not any(nd in n for nd in no_decay)
        ],
        'lr': lr * (lr_decay_in_layers**i),
        'weight_decay': weight_decay * (wd_decay_in_layers**i)
    } for i, layer_name in enumerate(model_layer_names[::-1])] + [{
        'params': [
            p for n, p in model.named_parameters()
            if layer_name in n and any(nd in n for nd in no_decay)
        ],
        'lr': lr * (lr_decay_in_layers**i),
        'weight_decay': .0
    } for i, layer_name in enumerate(model_layer_names[::-1])])
    if lr_scheduler_type == 'linear':
        lr_scheduler = WarmupLinearSchedule(optimizer,
                                            warmup_steps=warmup_rate,
                                            t_total=total_step)
    elif lr_scheduler_type == 'constant':
        lr_scheduler = WarmupConstantSchedule(optimizer,
                                              warmup_steps=warmup_rate)
    else:
        raise ValueError
    # Snapshot pristine model/optimizer states so every fold restarts clean.
    model_state_0 = deepcopy(model.state_dict())
    optimizer_state_0 = deepcopy(optimizer.state_dict())
    test_iter = get_data_iter(processed_test, batch_size * 4,
                              collect_test_func, shuffle=False)
    pred = np.zeros((len(processed_test), 3))
    val_scores = []
    for fold_idx, (train_idx, val_idx) in enumerate(
            KFold(n_splits=n_fold, shuffle=True,
                  random_state=seed).split(processed_train)):
        # Reset parameters and optimizer, and rebuild the scheduler,
        # at the start of each fold.
        model.load_state_dict(model_state_0)
        optimizer.load_state_dict(optimizer_state_0)
        if lr_scheduler_type == 'linear':
            lr_scheduler = WarmupLinearSchedule(optimizer,
                                                warmup_steps=warmup_rate,
                                                t_total=total_step)
        elif lr_scheduler_type == 'constant':
            lr_scheduler = WarmupConstantSchedule(optimizer,
                                                  warmup_steps=warmup_rate)
        else:
            raise ValueError
        train_iter = get_data_iter([processed_train[i] for i in train_idx],
                                   batch_size, collect_func)
        val_iter = get_data_iter([processed_train[i] for i in val_idx],
                                 batch_size * 4, collect_func, shuffle=False)
        best_model, best_score = training(model=model,
                                          optimizer=optimizer,
                                          lr_scheduler=lr_scheduler,
                                          train_iter=train_iter,
                                          val_iter=val_iter,
                                          total_step=total_step,
                                          tokenizer=tokenizer,
                                          usegpu=usegpu,
                                          eval_every=eval_every,
                                          logger=logger,
                                          early_stop=early_stop,
                                          fold_idx=fold_idx)
        model.load_state_dict(best_model)
        val_scores.append(best_score)
        # Accumulate this fold's test probabilities for ensembling.
        pred += predict(model, test_iter, usegpu)
    logger.info(f'average: {np.mean(val_scores):.6f}')
    pred = pred / n_fold
    prob_df = pd.DataFrame()
    submit = pd.DataFrame()
    submit['id'] = [i['id'] for i in processed_test]
    submit['label'] = pred.argmax(-1)
    prob_df['id'] = [i['id'] for i in processed_test]
    prob_df['0'] = pred[:, 0]
    prob_df['1'] = pred[:, 1]
    prob_df['2'] = pred[:, 2]
    submit.to_csv(f'submit_{prefix}.csv', index=False)
    prob_df.to_csv(f'probability_{prefix}.csv', index=False)
def main():
    """CLI entry point: evaluate saved language models on a problem set.

    Dispatches on --model: 'wordrnn' evaluates every checkpoint found in
    --dir, 'lm1b' loads the billion-word LM from TF checkpoints, and any
    other value is used as a key into MODEL_CLASSES (pretrained
    transformers). Writes args.json and log.txt into --log-dir and
    optionally saves per-checkpoint predictions as CSV.
    """
    parser = argparse.ArgumentParser()
    # model
    parser.add_argument('--model', type=str, default='wordrnn')
    parser.add_argument('--dir', type=str, default=None)
    parser.add_argument('--tokenizer', type=str, default='nltk',
                        help='Only effective when model set to wordrnn')
    parser.add_argument('--criterion', type=str, default='full')
    # data
    parser.add_argument('--set', type=str, default='msr')
    parser.add_argument('--partition', type=str, default='va')
    parser.add_argument('--no-move-cached', action='store_true')
    parser.add_argument('--log-dir', type=str, default='train/noname')
    parser.add_argument('--save-pred', action='store_true')
    args = parser.parse_args()

    problem_set = ProblemSet.load(args.set)
    examples = problem_set.get_examples(args.partition)
    logger.info("Evaluating models saved in {} on {}-{}".format(
        args.dir, args.set, args.partition))

    # Persist the run configuration and mirror logger output into a file.
    if not os.path.exists(args.log_dir):
        logger.info("Creating directory at {}".format(args.log_dir))
        os.makedirs(args.log_dir)
    args_path = os.path.join(args.log_dir, 'args.json')
    with open(args_path, 'w') as f:
        logger.info("Saving arguments at {}".format(args_path))
        json.dump(vars(args), f, indent=2)
    log_path = os.path.join(args.log_dir, 'log.txt')
    file_handler = logging.FileHandler(log_path, mode='w')
    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)

    model_type = args.model.lower()
    if model_type == 'wordrnn':
        # Rebuild the RNN exactly as trained, using the args.json saved
        # alongside the checkpoints.
        args_path = osp.join(args.dir, 'args.json')
        with open(args_path, 'r') as f:
            arg_dict = json.load(f)
        vocab_path = osp.join(args.dir, 'vocab.txt')
        vocab = load_vocab(vocab_path)
        # NOTE(review): a --tokenizer value other than nltk/wordpiece leaves
        # `tokenizer` unbound and raises below — confirm this is intended.
        if args.tokenizer.lower() == 'nltk':
            tokenizer = NLTKTokenizer(vocab, arg_dict['lower'])
        elif args.tokenizer.lower() == 'wordpiece':
            tokenizer = BertTokenizer(vocab_path, arg_dict['lower'])
        model = WordRNN(len(vocab), len(vocab), arg_dict['rnncell'],
                        arg_dict['emsize'], arg_dict['outsize'],
                        arg_dict['nhid'], arg_dict['nlayers'],
                        arg_dict['bidirec'], arg_dict.get('autoenc', False),
                        arg_dict['decoder_bias'])
        logger.info(model)
        # Evaluate every checkpoint, oldest first (sorted by mtime).
        ckpt_paths = glob.glob(osp.join(args.dir, '*.pt'))
        ckpt_paths.sort(key=osp.getmtime)
        for path in ckpt_paths:
            model.load_state_dict(torch.load(path))
            direction = 'autoenc' if model.autoenc else (
                'bidirec' if model.bidirec else 'forward')
            evaluate(examples, model, tokenizer, direction, args.criterion,
                     str(osp.basename(path.split('.')[0])))
            if args.save_pred:
                save_fn = osp.basename(path).replace('.pt', '.csv')
                save_preds(examples, osp.join(args.log_dir, save_fn))
    elif model_type == 'lm1b':
        lm1b_dir = settings['lm1b_dir']
        # Wrap each example's context in explicit sentence-boundary tokens.
        for e in examples:
            e.context[0] = ' '.join(['<S>', e.context[0]])
            e.context[-1] = ' '.join([e.context[-1], '</S>'])
        vocab = load_vocab(osp.join(lm1b_dir, 'vocab-2016-09-10.txt'))
        special_tokens = ['<S>', '</S>', '<UNK>']
        tokenizer = BaseTokenizer(vocab, False, '<UNK>', special_tokens)
        # Map the 800k-entry output vocabulary onto the reduced input vocab;
        # unknown tokens fall back to <UNK>.
        in_vocab = load_vocab(osp.join(lm1b_dir, args.dir, 'vocab.txt'))
        out_to_in = [in_vocab['<UNK>']] * 800000
        for i, token in tokenizer.ids_to_tokens.items():
            out_to_in[i] = in_vocab.get(token, in_vocab['<UNK>'])
        tf_path = osp.join(lm1b_dir, 'ckpt-*')
        npy_path = osp.join(lm1b_dir, args.dir, 'embeddings.npy')
        model = LM1B.from_tf(tf_path, npy_path, out_to_in, 8)
        logger.info(model)
        evaluate(examples, model, tokenizer, 'forward', args.criterion)
        if args.save_pred:
            save_preds(examples, osp.join(args.log_dir, 'preds.csv'))
    else:
        # Pretrained transformer: prefer a previously-materialized local
        # directory, otherwise fall back to the hub identifier in --dir.
        cache_dir = settings['pretrans_dir']
        bert_dir = osp.join(settings['pretrans_dir'], args.dir)
        model_or_dir = bert_dir if osp.exists(bert_dir) else args.dir
        config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
        config = config_class.from_pretrained(model_or_dir,
                                              cache_dir=cache_dir)
        tokenizer = tokenizer_class.from_pretrained(
            model_or_dir, cache_dir=cache_dir,
            max_len=config.max_position_embeddings,
            do_lower_case='-uncased' in model_or_dir)
        model = model_class.from_pretrained(model_or_dir,
                                            cache_dir=cache_dir,
                                            config=config)
        # direction defaults to 'forward'; only 'bert' is scored 'autoenc'.
        direction = 'forward'
        if model_type == 'bert':
            direction = 'autoenc'
        evaluate(examples, model, tokenizer, direction, args.criterion)
        if args.save_pred:
            save_preds(examples, osp.join(args.log_dir, 'preds.csv'))
        # Move the freshly-downloaded weights/config/vocab from the cache
        # into a named directory so later runs skip the download.
        if not args.no_move_cached and not osp.exists(bert_dir):
            logger.info("Creating directory at {}".format(bert_dir))
            os.mkdir(bert_dir)
            model_url = model.pretrained_model_archive_map[model_or_dir]
            model_path = osp.join(bert_dir, WEIGHTS_NAME)
            move_cached(model_url, cache_dir, model_path)
            config_url = model.config.pretrained_config_archive_map[
                model_or_dir]
            config_path = osp.join(bert_dir, CONFIG_NAME)
            move_cached(config_url, cache_dir, config_path)
            for k, url_map in tokenizer.pretrained_vocab_files_map.items():
                vocab_path = osp.join(bert_dir,
                                      tokenizer.vocab_files_names[k])
                move_cached(url_map[model_or_dir], cache_dir, vocab_path)
# --- NER preprocessing fragment: re-tokenize word-level data to wordpieces ---
print("Len= ", len(tag2idx))
# Invert the tag->index map so predicted indices can be decoded to tag names.
tag2name = {tag2idx[key]: key for key in tag2idx.keys()}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

vocabulary = "bert_models/vocab.txt"
max_len = 45  # maximum sequence length used downstream
# do_lower_case=False — presumably a cased vocabulary; confirm against vocab file.
tokenizer = BertTokenizer(vocab_file=vocabulary, do_lower_case=False)

tokenized_texts = []
word_piece_labels = []
i_inc = 0
for word_list, label in (zip(sentences, labels)):
    temp_lable = []  # sic: original spelling kept for consistency with later code
    temp_token = []
    # Add [CLS] at the front
    temp_lable.append('[CLS]')
    temp_token.append('[CLS]')
    for word, lab in zip(word_list, label):
        token_list = tokenizer.tokenize(word)
def main(args):
    """Evaluate saved TransformerDST checkpoints on the cached test set.

    --load_epoch is a '-'-separated list of epoch numbers; each checkpoint
    is expected at args.save_dir/model.e{epoch}.bin. Requires pre-built
    train.pt / test.pt under args.data_root.
    """
    assert args.use_one_optim is True
    if args.recover_e > 0:
        raise NotImplementedError("This option is from my oldest code version. "
                                  "I have not checked it for this code version.")

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
        print("### mkdir {:}".format(args.save_dir))

    def worker_init_fn(worker_id):
        # Give each DataLoader worker a distinct, reproducible numpy seed.
        np.random.seed(args.random_seed + worker_id)

    n_gpu = 0
    if torch.cuda.is_available() and (not args.use_cpu):
        n_gpu = torch.cuda.device_count()
        device = torch.device('cuda')
        print("### Device: {:}".format(device))
    else:
        print("### Use CPU (Debugging)")
        device = torch.device("cpu")

    if args.random_seed < 0:
        print("### Pick a random seed")
        args.random_seed = random.sample(list(range(1, 100000)), 1)[0]

    # Seed every RNG involved so the run is reproducible.
    print("### Random Seed: {:}".format(args.random_seed))
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    rng = random.Random(args.random_seed)
    torch.manual_seed(args.random_seed)
    if n_gpu > 0:
        if args.random_seed >= 0:
            torch.cuda.manual_seed(args.random_seed)
            torch.cuda.manual_seed_all(args.random_seed)
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)

    ontology = json.load(open(args.ontology_data))
    slot_meta, ontology = make_slot_meta(ontology)
    op2id = OP_SET[args.op_code]
    print(op2id)
    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)

    train_path = os.path.join(args.data_root, "train.pt")
    # NOTE(review): only the first 5000 train examples are loaded, and the
    # train set is not used elsewhere in this function — looks like leftover
    # debug code; confirm before relying on the printed count.
    train_data_raw = torch.load(train_path)[:5000]
    print("# train examples %d" % len(train_data_raw))

    test_path = os.path.join(args.data_root, "test.pt")
    test_data_raw = torch.load(test_path)
    print("# test examples %d" % len(test_data_raw))

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
    model_config.hidden_dropout_prob = args.hidden_dropout_prob
    # Segment-type vocabulary size the checkpoints were trained with
    # (4 rather than BERT's default 2) — confirm against training config.
    type_vocab_size = 4
    dec_config = args
    model = TransformerDST(model_config, dec_config, len(op2id),
                           len(domain2id), op2id['update'],
                           tokenizer.convert_tokens_to_ids(['[MASK]'])[0],
                           tokenizer.convert_tokens_to_ids(['[SEP]'])[0],
                           tokenizer.convert_tokens_to_ids(['[PAD]'])[0],
                           tokenizer.convert_tokens_to_ids(['-'])[0],
                           type_vocab_size, args.exclude_domain)

    test_epochs = [int(e) for e in args.load_epoch.strip().lower().split('-')]
    for best_epoch in test_epochs:
        print("### Epoch {:}...".format(best_epoch))
        sys.stdout.flush()
        ckpt_path = os.path.join(args.save_dir,
                                 'model.e{:}.bin'.format(best_epoch))
        ckpt = torch.load(ckpt_path, map_location='cpu')
        model.load_state_dict(ckpt)
        model.to(device)

        # Train-set evaluation intentionally disabled (kept for reference):
        # eval_res = model_evaluation(model, train_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
        #                             use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only,
        #                             no_dial=args.no_dial, n_gpu=n_gpu,
        #                             is_gt_op=False, is_gt_p_state=False, is_gt_gen=False)
        # print("### Epoch {:} Train Score : ".format(best_epoch), eval_res)
        # print('\n'*2)
        # sys.stdout.flush()

        eval_res = model_evaluation(model, test_data_raw, tokenizer,
                                    slot_meta, best_epoch, args.op_code,
                                    use_full_slot=args.use_full_slot,
                                    use_dt_only=args.use_dt_only,
                                    no_dial=args.no_dial, n_gpu=n_gpu,
                                    is_gt_op=False, is_gt_p_state=False,
                                    is_gt_gen=False)
        print("### Epoch {:} Test Score : ".format(best_epoch), eval_res)
        print('\n'*2)
        sys.stdout.flush()
def test():
    """Train the span/match/category/polarity ABSA model for 15 epochs,
    validating after each epoch.

    Side effects: saves the best checkpoints to ./save_model/best.model and
    ./save_model/best_match.model and accumulates per-epoch metrics in the
    local `statistic` dict.

    Fixes vs the previous version:
      * the predicted-polarity concatenation mistakenly included the
        ground-truth single-opinion polarity list (total_gt_...) instead of
        the predicted one, corrupting the reported polarity macro-F1;
      * the 'category:' key typo in epoch_statistic is now 'category',
        consistent with the other metric keys.
    """
    # torch.autograd.set_detect_anomaly(True)
    load_save_model = False
    lr = 1e-5
    batch_size = 4
    gpu = True
    torch.manual_seed(0)
    device = torch.device('cpu')
    if gpu:
        device = torch.device('cuda')

    tokenizer = BertTokenizer(vocab_file='publish/vocab.txt', max_len=512)
    dataset, known_token = load_dataset('TRAIN/Train_reviews.csv',
                                        'TRAIN/Train_labels.csv', tokenizer)
    train_dataset, validate_dataset = split_dataset(dataset,
                                                    'TRAIN/shuffle.idx', 0.97)
    bert_pretraining = convert_tf_checkpoint_to_pytorch(
        './publish/bert_model.ckpt', './publish/bert_config.json')
    model = Model(bert_pretraining.bert)
    # tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', cache_dir='bert-base-chinese')

    train_dataset = Dataset(train_dataset)
    train_dataloader = torch_data.DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn(tokenizer)
    )
    validate_dataset = Dataset(validate_dataset)
    validate_dataloader = torch_data.DataLoader(
        dataset=validate_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn(tokenizer)
    )

    model = model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0.01)
    statistic = {
        'best_f1': -100,
        'best_f1_epoch': None,
        'best_match_f1': -100,
        'best_match_epoch': None,
        'epoch_detail': []
    }
    if load_save_model:
        model.load_state_dict(torch.load('./save_model/best.model'))

    for epoch in range(15):
        print(str(epoch) + '------------------------------------------------------------------')
        # Running sums of each loss component over the epoch.
        accum_total_loss = 0
        accum_seq_labeling_loss = 0
        accum_match_loss = 0
        accum_category_loss = 0
        accum_polarity_loss = 0
        model.train()
        pbar = tqdm()
        try:
            for step, (batch_X, len_X, mask, gather_idx, targets) in enumerate(train_dataloader):
                batch_X = batch_X.to(device)
                mask = mask.to(device)
                # tokenizer.decode(list(batch_X[0].cpu().numpy())).replace(' ', '')
                scores, gather_idx = model(batch_X, len_X, mask, gather_idx)
                loss = model.loss(scores, targets, mask)
                optimizer.zero_grad()
                loss[0].backward()
                optimizer.step()
                accum_total_loss += loss[0].cpu().detach().numpy()
                accum_seq_labeling_loss += loss[1].cpu().detach().numpy()
                # loss[2] is a plain int (0) when the batch produced no
                # aspect/opinion pairs to match — skip accumulation then.
                if type(loss[2]) is not int:
                    accum_match_loss += loss[2].cpu().detach().numpy()
                accum_category_loss += loss[3].cpu().detach().numpy()
                accum_polarity_loss += loss[4].cpu().detach().numpy()
                pbar.update(batch_size)
                pbar.set_description('step: %d, total loss: %f, seq loss: %f, match loss: %f, category loss: %f, polarity loss: %f' % \
                    (step, accum_total_loss / (step + 1), accum_seq_labeling_loss / (step + 1), accum_match_loss / (step + 1),\
                     accum_category_loss / (step + 1), accum_polarity_loss / (step + 1)))
        except KeyboardInterrupt:
            pbar.close()
            raise
        pbar.close()
        optimizer.zero_grad()
        loss_statistic = {
            'total_loss': accum_total_loss / (step + 1),
            'seq_loss': accum_seq_labeling_loss / (step + 1),
            'match_loss': accum_match_loss / (step + 1),
            'category_loss': accum_category_loss / (step + 1),
            'polarity_loss': accum_polarity_loss / (step + 1)
        }

        # ---- validation: collect per-batch ground-truth / predicted targets ----
        model.eval()
        total_gt_seq_target = []
        total_gt_match_target = []
        total_gt_single_aspect_category_target = []
        total_gt_single_opinion_category_target = []
        total_gt_cross_category_target = []
        total_gt_single_aspect_polarity_target = []
        total_gt_single_opinion_polarity_target = []
        total_gt_cross_polarity_target = []
        total_pred_seq_target = []
        total_pred_match_target = []
        total_pred_single_aspect_category_target = []
        total_pred_single_opinion_category_target = []
        total_pred_cross_category_target = []
        total_pred_single_aspect_polarity_target = []
        total_pred_single_opinion_polarity_target = []
        total_pred_cross_polarity_target = []
        pbar = tqdm()
        try:
            for step, (batch_X, len_X, mask, gather_idx, targets) in enumerate(validate_dataloader):
                batch_X = batch_X.to(device)
                mask = mask.to(device)
                scores, gather_idx = model(batch_X, len_X, mask, gather_idx)
                (pred_seq_target, pred_match_target,
                 pred_single_aspect_category_target,
                 pred_single_opinion_category_target,
                 pred_cross_category_target,
                 pred_single_aspect_polarity_target,
                 pred_single_opinion_polarity_target,
                 pred_cross_polarity_target) = model.infer(scores, mask)
                (seq_target, match_target, single_aspect_category_target,
                 single_opinion_category_target, cross_category_target,
                 single_aspect_polarity_target,
                 single_opinion_polarity_target, cross_polarity_target) = targets
                total_pred_seq_target.append(pred_seq_target.view(-1).cpu().detach().numpy())
                total_gt_seq_target.append(seq_target.view(-1).cpu().detach().numpy())
                # Per-example targets may be None when the example has no
                # corresponding span type; only collect present ones so gt
                # and pred lists stay aligned.
                for b in range(len(pred_match_target)):
                    if pred_match_target[b] is not None:
                        assert match_target[b].numel() != 0
                        total_pred_match_target.append(pred_match_target[b].view(-1).cpu().detach().numpy())
                        total_gt_match_target.append(match_target[b].view(-1).cpu().detach().numpy())
                    if pred_single_aspect_category_target[b] is not None:
                        total_pred_single_aspect_category_target.append(pred_single_aspect_category_target[b].cpu().detach().numpy())
                        total_gt_single_aspect_category_target.append(single_aspect_category_target[b].cpu().detach().numpy())
                    if pred_single_opinion_category_target[b] is not None:
                        total_pred_single_opinion_category_target.append(pred_single_opinion_category_target[b].cpu().detach().numpy())
                        total_gt_single_opinion_category_target.append(single_opinion_category_target[b].cpu().detach().numpy())
                    if pred_cross_category_target[b] is not None:
                        total_pred_cross_category_target.append(pred_cross_category_target[b].view(-1).cpu().detach().numpy())
                        total_gt_cross_category_target.append(cross_category_target[b].view(-1).cpu().detach().numpy())
                    if pred_single_aspect_polarity_target[b] is not None:
                        total_pred_single_aspect_polarity_target.append(pred_single_aspect_polarity_target[b].cpu().detach().numpy())
                        total_gt_single_aspect_polarity_target.append(single_aspect_polarity_target[b].cpu().detach().numpy())
                    if pred_single_opinion_polarity_target[b] is not None:
                        total_pred_single_opinion_polarity_target.append(pred_single_opinion_polarity_target[b].cpu().detach().numpy())
                        total_gt_single_opinion_polarity_target.append(single_opinion_polarity_target[b].cpu().detach().numpy())
                    if pred_cross_polarity_target[b] is not None:
                        total_pred_cross_polarity_target.append(pred_cross_polarity_target[b].view(-1).cpu().detach().numpy())
                        total_gt_cross_polarity_target.append(cross_polarity_target[b].view(-1).cpu().detach().numpy())
                pbar.update(batch_size)
                pbar.set_description('step: %d' % step)
        except KeyboardInterrupt:
            pbar.close()
            raise
        pbar.close()

        # ---- flatten everything into 1-D arrays for the metric functions ----
        total_gt_seq_target = np.concatenate(total_gt_seq_target)
        total_gt_match_target = np.concatenate(total_gt_match_target)
        total_gt_single_aspect_category_target = np.concatenate(total_gt_single_aspect_category_target)
        total_gt_single_opinion_category_target = np.concatenate(total_gt_single_opinion_category_target)
        total_gt_cross_category_target = np.concatenate(total_gt_cross_category_target)
        total_gt_single_aspect_polarity_target = np.concatenate(total_gt_single_aspect_polarity_target)
        total_gt_single_opinion_polarity_target = np.concatenate(total_gt_single_opinion_polarity_target)
        total_gt_cross_polarity_target = np.concatenate(total_gt_cross_polarity_target)
        total_pred_seq_target = np.concatenate(total_pred_seq_target)
        total_pred_match_target = np.concatenate(total_pred_match_target)
        total_pred_single_aspect_category_target = np.concatenate(total_pred_single_aspect_category_target)
        total_pred_single_opinion_category_target = np.concatenate(total_pred_single_opinion_category_target)
        total_pred_cross_category_target = np.concatenate(total_pred_cross_category_target)
        total_pred_single_aspect_polarity_target = np.concatenate(total_pred_single_aspect_polarity_target)
        total_pred_single_opinion_polarity_target = np.concatenate(total_pred_single_opinion_polarity_target)
        total_pred_cross_polarity_target = np.concatenate(total_pred_cross_polarity_target)

        total_gt_category_target = np.concatenate(
            (total_gt_single_aspect_category_target,
             total_gt_single_opinion_category_target,
             total_gt_cross_category_target))
        total_pred_category_target = np.concatenate(
            (total_pred_single_aspect_category_target,
             total_pred_single_opinion_category_target,
             total_pred_cross_category_target))
        total_gt_polarity_target = np.concatenate(
            (total_gt_single_aspect_polarity_target,
             total_gt_single_opinion_polarity_target,
             total_gt_cross_polarity_target))
        # FIX: previously concatenated total_gt_single_opinion_polarity_target
        # (ground truth) into the *predicted* polarity array, inflating the
        # polarity F1; use the predicted list so pred mirrors gt.
        total_pred_polarity_target = np.concatenate(
            (total_pred_single_aspect_polarity_target,
             total_pred_single_opinion_polarity_target,
             total_pred_cross_polarity_target))

        seq_metric = seq_f1(total_pred_seq_target, total_gt_seq_target)
        match_f1 = f1_score(total_gt_match_target, total_pred_match_target)
        match_p = precision_score(total_gt_match_target, total_pred_match_target)
        match_r = recall_score(total_gt_match_target, total_pred_match_target)
        category_f1 = seq_f1(total_pred_category_target, total_gt_category_target, 'macro')
        polarity_f1 = seq_f1(total_pred_polarity_target, total_gt_polarity_target, 'macro')
        print('Others: %f, B_A: %f, I_A: %f, B_O: %f, I_O: %f, ' % tuple(seq_metric), end='')
        print('match: %f, ' % match_f1, end='')
        print('match p: %f, ' % match_p, end='')
        print('match r: %f, ' % match_r, end='')
        print('category: %f, ' % category_f1, end='')
        print('polarity: %f, ' % polarity_f1, end='')

        epoch_statistic = {
            'seq_metric': tuple(seq_metric),
            'seq': 'Others: %f, B_A: %f, I_A: %f, B_O: %f, I_O: %f, ' % tuple(seq_metric),
            'match': match_f1,
            'match_p': match_p,
            'match_r': match_r,
            # FIX: key was 'category:' (stray colon), inconsistent with the
            # other metric keys.
            'category': category_f1,
            'polarity': polarity_f1,
            'loss': loss_statistic
        }
        # 5 seq-tag F1s + match + category + polarity = 8 components.
        avg_f1 = (np.sum(seq_metric) + match_f1 + category_f1 + polarity_f1) / 8
        print('avg: %f' % avg_f1)
        if avg_f1 > statistic['best_f1']:
            statistic['best_f1'] = avg_f1
            statistic['best_f1_epoch'] = epoch
            torch.save(model.state_dict(), 'save_model/best.model')
        if match_f1 > statistic['best_match_f1']:
            statistic['best_match_f1'] = match_f1
            statistic['best_match_epoch'] = epoch
            torch.save(model.state_dict(), 'save_model/best_match.model')
        statistic['epoch_detail'].append(epoch_statistic)
def main(args):
    """Train TransformerDST on MultiWOZ-style data and periodically evaluate.

    Builds (or loads cached) train/dev/test tensors under args.data_root,
    initializes the BERT encoder from args.bert_ckpt_path (widening the
    token-type embedding to 4 rows), then trains for args.n_epochs, saving
    a checkpoint each epoch and evaluating on dev/test from epoch 8 on.

    Fix vs the previous version: the decoder-optimizer recovery passed the
    optimizer object itself to load_state_dict instead of the recovered
    state dict (`dec_recover`).
    """
    assert args.use_one_optim is True
    if args.use_cls_only:
        args.no_dial = True
    print("### use_cls_only: {:}".format(args.use_cls_only))
    print("### no_dial: {:}".format(args.no_dial))

    if args.recover_e > 0:
        raise NotImplementedError("This option is from my oldest code version. "
                                  "I have not checked it for this code version.")

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
        print("### mkdir {:}".format(args.save_dir))

    def worker_init_fn(worker_id):
        # Give each DataLoader worker a distinct, reproducible numpy seed.
        np.random.seed(args.random_seed + worker_id)

    n_gpu = 0
    if torch.cuda.is_available() and (not args.use_cpu):
        n_gpu = torch.cuda.device_count()
        device = torch.device('cuda')
        print("### Device: {:}".format(device))
    else:
        print("### Use CPU (Debugging)")
        device = torch.device("cpu")

    if args.random_seed < 0:
        print("### Pick a random seed")
        args.random_seed = random.sample(list(range(0, 100000)), 1)[0]

    # Seed every RNG involved so the run is reproducible.
    print("### Random Seed: {:}".format(args.random_seed))
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    rng = random.Random(args.random_seed)
    torch.manual_seed(args.random_seed)
    if n_gpu > 0:
        if args.random_seed >= 0:
            torch.cuda.manual_seed(args.random_seed)
            torch.cuda.manual_seed_all(args.random_seed)
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)

    ontology = json.load(open(args.ontology_data))
    slot_meta, ontology = make_slot_meta(ontology)
    op2id = OP_SET[args.op_code]
    print(op2id)
    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)

    train_path = os.path.join(args.data_root, "train.pt")
    dev_path = os.path.join(args.data_root, "dev.pt")
    test_path = os.path.join(args.data_root, "test.pt")

    # Build each split once and cache it as a .pt file.
    if not os.path.exists(test_path):
        test_data_raw = prepare_dataset(data_path=args.test_data_path,
                                        tokenizer=tokenizer,
                                        slot_meta=slot_meta,
                                        n_history=args.n_history,
                                        max_seq_length=args.max_seq_length,
                                        op_code=args.op_code)
        torch.save(test_data_raw, test_path)
    else:
        test_data_raw = torch.load(test_path)
    print("# test examples %d" % len(test_data_raw))

    if not os.path.exists(train_path):
        train_data_raw = prepare_dataset(data_path=args.train_data_path,
                                         tokenizer=tokenizer,
                                         slot_meta=slot_meta,
                                         n_history=args.n_history,
                                         max_seq_length=args.max_seq_length,
                                         op_code=args.op_code)
        torch.save(train_data_raw, train_path)
    else:
        train_data_raw = torch.load(train_path)

    train_data = MultiWozDataset(train_data_raw,
                                 tokenizer,
                                 slot_meta,
                                 args.max_seq_length,
                                 rng,
                                 ontology,
                                 args.word_dropout,
                                 args.shuffle_state,
                                 args.shuffle_p,
                                 pad_id=tokenizer.convert_tokens_to_ids(['[PAD]'])[0],
                                 slot_id=tokenizer.convert_tokens_to_ids(['[SLOT]'])[0],
                                 decoder_teacher_forcing=args.decoder_teacher_forcing,
                                 use_full_slot=args.use_full_slot,
                                 use_dt_only=args.use_dt_only,
                                 no_dial=args.no_dial,
                                 use_cls_only=args.use_cls_only)
    print("# train examples %d" % len(train_data_raw))

    if not os.path.exists(dev_path):
        dev_data_raw = prepare_dataset(data_path=args.dev_data_path,
                                       tokenizer=tokenizer,
                                       slot_meta=slot_meta,
                                       n_history=args.n_history,
                                       max_seq_length=args.max_seq_length,
                                       op_code=args.op_code)
        torch.save(dev_data_raw, dev_path)
    else:
        dev_data_raw = torch.load(dev_path)
    print("# dev examples %d" % len(dev_data_raw))

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
    model_config.hidden_dropout_prob = args.hidden_dropout_prob
    # 4 segment types instead of BERT's 2; rows 2/3 are seeded from row 0
    # below and then re-initialized.
    type_vocab_size = 4
    dec_config = args
    model = TransformerDST(model_config, dec_config, len(op2id),
                           len(domain2id), op2id['update'],
                           tokenizer.convert_tokens_to_ids(['[MASK]'])[0],
                           tokenizer.convert_tokens_to_ids(['[SEP]'])[0],
                           tokenizer.convert_tokens_to_ids(['[PAD]'])[0],
                           tokenizer.convert_tokens_to_ids(['-'])[0],
                           type_vocab_size, args.exclude_domain)

    if not os.path.exists(args.bert_ckpt_path):
        args.bert_ckpt_path = download_ckpt(args.bert_ckpt_path,
                                            args.bert_config_path, 'assets')

    # Widen the pretrained token-type embedding from 2 to 4 rows in place,
    # copying row 0 into the new rows so load_state_dict succeeds.
    state_dict = torch.load(args.bert_ckpt_path, map_location='cpu')
    _k = 'embeddings.token_type_embeddings.weight'
    print("config.type_vocab_size != state_dict[bert.embeddings.token_type_embeddings.weight] ({0} != {1})".format(
        type_vocab_size, state_dict[_k].shape[0]))
    state_dict[_k].resize_(type_vocab_size, state_dict[_k].shape[1])
    state_dict[_k].data[2, :].copy_(state_dict[_k].data[0, :])
    state_dict[_k].data[3, :].copy_(state_dict[_k].data[0, :])
    model.bert.load_state_dict(state_dict)
    print("\n### Done Load BERT")
    sys.stdout.flush()

    # re-initialize added special tokens ([SLOT], [NULL], [EOS])
    model.bert.embeddings.word_embeddings.weight.data[1].normal_(mean=0.0, std=0.02)
    model.bert.embeddings.word_embeddings.weight.data[2].normal_(mean=0.0, std=0.02)
    model.bert.embeddings.word_embeddings.weight.data[3].normal_(mean=0.0, std=0.02)
    # re-initialize seg-2, seg-3
    model.bert.embeddings.token_type_embeddings.weight.data[2].normal_(mean=0.0, std=0.02)
    model.bert.embeddings.token_type_embeddings.weight.data[3].normal_(mean=0.0, std=0.02)

    model.to(device)

    num_train_steps = int(len(train_data_raw) / args.batch_size * args.n_epochs)

    if args.use_one_optim:
        # Single AdamW over all parameters; no weight decay on bias/LayerNorm.
        print("### Use One Optim")
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.enc_lr)
        scheduler = WarmupLinearSchedule(optimizer,
                                         int(num_train_steps * args.enc_warmup),
                                         t_total=num_train_steps)
    else:
        # Separate optimizers: encoder (BERT) vs everything else.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        enc_param_optimizer = list(model.bert.named_parameters())  # TODO: For BERT only
        print('### Optim BERT: {:}'.format(len(enc_param_optimizer)))
        enc_optimizer_grouped_parameters = [
            {'params': [p for n, p in enc_param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in enc_param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr)
        enc_scheduler = WarmupLinearSchedule(enc_optimizer,
                                             int(num_train_steps * args.enc_warmup),
                                             t_total=num_train_steps)

        dec_param_optimizer = list(model.named_parameters())  # TODO: For other parameters
        print('### Optim All: {:}'.format(len(dec_param_optimizer)))
        dec_param_optimizer = [p for (n, p) in dec_param_optimizer
                               if 'bert' not in n]
        print('### Optim OTH: {:}'.format(len(dec_param_optimizer)))
        dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr)
        dec_scheduler = WarmupLinearSchedule(dec_optimizer,
                                             int(num_train_steps * args.dec_warmup),
                                             t_total=num_train_steps)

    # NOTE: currently unreachable — args.recover_e > 0 raises
    # NotImplementedError at the top of this function.
    if args.recover_e > 0:
        model_recover, enc_recover, dec_recover = load(args, str(args.recover_e))
        print("### Recover Model E{:}".format(args.recover_e))
        sys.stdout.flush()
        model.load_state_dict(model_recover)
        print("### Recover Optim E{:}".format(args.recover_e))
        sys.stdout.flush()
        enc_optimizer.load_state_dict(enc_recover)
        # FIX: was dec_optimizer.load_state_dict(dec_optimizer) — passed the
        # optimizer object instead of the recovered state dict.
        dec_optimizer.load_state_dict(dec_recover)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size,
                                  collate_fn=train_data.collate_fn,
                                  num_workers=args.num_workers,
                                  worker_init_fn=worker_init_fn)

    loss_fnc = nn.CrossEntropyLoss()
    best_score = {'epoch': 0, 'joint_acc': 0, 'op_acc': 0, 'final_slot_f1': 0}

    start_time = time.time()
    for epoch in range(args.n_epochs):
        batch_loss = []
        model.train()
        for step, batch in enumerate(train_dataloader):
            # Move tensors to the device; leave ints/dicts/lists/ndarrays as-is.
            batch = [b.to(device)
                     if (not isinstance(b, int)) and (not isinstance(b, dict))
                     and (not isinstance(b, list)) and (not isinstance(b, np.ndarray))
                     else b
                     for b in batch]
            input_ids_p, segment_ids_p, input_mask_p, \
                state_position_ids, op_ids, domain_ids, input_ids_g, \
                segment_ids_g, position_ids_g, input_mask_g, \
                masked_pos, masked_weights, lm_label_ids, id_n_map, \
                gen_max_len, n_total_pred = batch

            domain_scores, state_scores, loss_g = model(
                input_ids_p, segment_ids_p, input_mask_p, state_position_ids,
                input_ids_g, segment_ids_g, position_ids_g, input_mask_g,
                masked_pos, masked_weights, lm_label_ids, id_n_map,
                gen_max_len, only_pred_op=args.only_pred_op, n_gpu=n_gpu)

            # Average the generation loss over predicted positions; when
            # nothing was predicted, treat it as zero.
            if n_total_pred > 0:
                loss_g = loss_g.sum() / n_total_pred
            else:
                loss_g = 0

            loss_s = loss_fnc(state_scores.view(-1, len(op2id)), op_ids.view(-1))

            if args.only_pred_op:
                loss = loss_s
            else:
                loss = loss_s + loss_g

            if args.exclude_domain is not True:
                loss_d = loss_fnc(domain_scores.view(-1, len(domain2id)),
                                  domain_ids.view(-1))
                loss = loss + loss_d

            batch_loss.append(loss.item())

            loss.backward()
            if args.use_one_optim:
                optimizer.step()
                scheduler.step()
            else:
                enc_optimizer.step()
                enc_scheduler.step()
                dec_optimizer.step()
                dec_scheduler.step()
            model.zero_grad()

            if step % 100 == 0:
                # loss_g may be a tensor or the int 0 (no predictions).
                try:
                    loss_g = loss_g.item()
                except AttributeError:
                    loss_g = loss_g
                if args.exclude_domain is not True:
                    print("time %.1f min, [%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f, dom_loss : %.3f" \
                          % ((time.time()-start_time)/60, epoch+1, args.n_epochs, step, len(train_dataloader),
                             np.mean(batch_loss), loss_s.item(), loss_g, loss_d.item()))
                else:
                    print("time %.1f min, [%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f" \
                          % ((time.time()-start_time)/60, epoch+1, args.n_epochs, step, len(train_dataloader),
                             np.mean(batch_loss), loss_s.item(), loss_g))
                sys.stdout.flush()
                batch_loss = []

        if args.use_one_optim:
            save(args, epoch + 1, model, optimizer)
        else:
            save(args, epoch + 1, model, enc_optimizer, dec_optimizer)

        # Evaluate on dev every eval_epoch epochs, starting at epoch 8.
        if ((epoch+1) % args.eval_epoch == 0) and (epoch+1 >= 8):
            eval_res = model_evaluation(model, dev_data_raw, tokenizer,
                                        slot_meta, epoch+1, args.op_code,
                                        use_full_slot=args.use_full_slot,
                                        use_dt_only=args.use_dt_only,
                                        no_dial=args.no_dial,
                                        use_cls_only=args.use_cls_only,
                                        n_gpu=n_gpu)
            print("### Epoch {:} Score : ".format(epoch+1), eval_res)
            if eval_res['joint_acc'] > best_score['joint_acc']:
                best_score = eval_res
            print("### Best Joint Acc: {:} ###".format(best_score['joint_acc']))
            print('\n')

            if epoch+1 >= 8:  # To speed up
                eval_res_test = model_evaluation(model, test_data_raw, tokenizer,
                                                 slot_meta, epoch + 1, args.op_code,
                                                 use_full_slot=args.use_full_slot,
                                                 use_dt_only=args.use_dt_only,
                                                 no_dial=args.no_dial,
                                                 use_cls_only=args.use_cls_only,
                                                 n_gpu=n_gpu)
                print("### Epoch {:} Test Score : ".format(epoch + 1), eval_res_test)
token_idx = torch.tensor(bert_tokenizer.convert_tokens_to_ids(tokens)) sep_idx = tokens.index('[SEP]') segment_idx = token_idx * 0 segment_idx[(sep_idx + 1):] = 1 mask = (token_idx != 0) return token_idx.unsqueeze(0), segment_idx.unsqueeze(0), mask.unsqueeze(0) if __name__ == '__main__': args = parser.parse_args() assert os.path.exists(args.bert_model), '{} does not exist'.format(args.bert_model) assert os.path.exists(args.bert_vocab), '{} does not exist'.format(args.bert_vocab) assert args.topk > 0, '{} should be positive'.format(args.topk) print('Initialize BERT vocabulary from {}...'.format(args.bert_vocab)) bert_tokenizer = BertTokenizer(vocab_file=args.bert_vocab) print('Initialize BERT model from {}...'.format(args.bert_model)) config = BertConfig.from_json_file('./bert-base-uncased/config.json') bert_model = BertForMaskedLM.from_pretrained('./bert-base-uncased/pytorch_model.bin', config = config) while True: message = input('Enter your message: ').strip() tokens = bert_tokenizer.tokenize(message) if len(tokens) == 0: continue if tokens[0] != CLS: tokens = [CLS] + tokens if tokens[-1] != SEP: tokens.append(SEP) token_idx, segment_idx, mask = to_bert_input(tokens, bert_tokenizer) with torch.no_grad():
def main():
    """Run aspect/opinion extraction inference over the test reviews and write
    predictions to a CSV file.

    Each output row is ``review_id,aspect,opinion,category,polarity``; a review
    with no detected aspect or opinion terms gets a single all-'_' placeholder
    row. Reads model/vocab/checkpoint files from hard-coded paths; writes to
    ``test.csv``. No return value.
    """
    pred_file_path = 'test.csv'
    load_save_model = True
    batch_size = 8
    gpu = True
    torch.manual_seed(0)
    # Honour the gpu flag: everything below uses `device`, so the model must
    # be moved with .to(device) rather than an unconditional .cuda().
    device = torch.device('cuda') if gpu else torch.device('cpu')

    tokenizer = BertTokenizer(vocab_file='publish/vocab.txt', max_len=512)
    # Only the known-token vocabulary from the training set is needed here.
    _, known_token = load_dataset('TRAIN/Train_reviews.csv',
                                  'TRAIN/Train_labels.csv', tokenizer)
    dataset = load_review_dataset('TRAIN/TEST/Test_reviews.csv')
    dataset = Dataset(list(dataset.items()))
    dataloader = torch_data.DataLoader(dataset=dataset,
                                       batch_size=batch_size,
                                       shuffle=False,
                                       collate_fn=test_collate_fn(
                                           tokenizer, known_token))

    bert_pretraining = convert_tf_checkpoint_to_pytorch(
        './publish/bert_model.ckpt', './publish/bert_config.json')
    model = Model(bert_pretraining.bert)
    model = model.to(device)  # was model.cuda(); broke CPU-only runs
    if load_save_model:
        model.load_state_dict(torch.load('./save_model/best.model'))

    def decode_span(ids_row, span_idx):
        # Decode the token ids at span_idx into text, dropping the spaces the
        # tokenizer inserts between word pieces.
        token_ids = list(ids_row[span_idx].cpu().detach().numpy())
        return tokenizer.decode(token_ids).replace(' ', '')

    pbar = tqdm()
    model.eval()
    # Context manager guarantees the prediction file is flushed and closed
    # even if inference raises part-way through.
    with open(pred_file_path, mode='w', encoding='utf-8') as pred_file, \
            torch.no_grad():
        for step, (batch_X, len_X, mask, batch_idx,
                   origin_batch_X) in enumerate(dataloader):
            batch_X = batch_X.to(device)
            mask = mask.to(device)
            scores, gather_idx = model(batch_X, len_X, mask, None)
            (pred_seq_target, pred_match_target,
             pred_single_aspect_category_target,
             pred_single_opinion_category_target,
             pred_cross_category_target,
             pred_single_aspect_polarity_target,
             pred_single_opinion_polarity_target,
             pred_cross_polarity_target) = model.infer(scores, mask)

            label = []
            aspect_idx, opinion_idx = gather_idx
            for b in range(batch_X.shape[0]):
                _aspect_idx, _opinion_idx = aspect_idx[b], opinion_idx[b]
                if len(_aspect_idx) == 0 and len(_opinion_idx) == 0:
                    # Nothing extracted for this review: emit a placeholder row.
                    label.append((batch_idx[b], '_', '_', '_', '_'))
                # Track which spans participate in at least one matched
                # aspect-opinion pair; unmatched ones are emitted singly below.
                _aspect_cross = [False] * len(_aspect_idx)
                _opinion_cross = [False] * len(_opinion_idx)

                # Matched (aspect, opinion) pairs.
                for i in range(len(_aspect_idx)):
                    for j in range(len(_opinion_idx)):
                        if pred_match_target[b][i, j] != 1:
                            continue
                        _aspect_cross[i] = True
                        _opinion_cross[j] = True
                        category = ID2CATEGORY[
                            pred_cross_category_target[b][i, j]]
                        polarity = ID2POLARITY[
                            pred_cross_polarity_target[b][i, j]]
                        aspect = decode_span(origin_batch_X[b], _aspect_idx[i])
                        opinion = decode_span(origin_batch_X[b],
                                              _opinion_idx[j])
                        # NOTE: the original also decoded char offsets
                        # (aspect_beg/end, opinion_beg/end) here but never
                        # used them — dropped as dead (and costly) work.
                        label.append((batch_idx[b], aspect, opinion,
                                      category, polarity))

                # Aspects with no matching opinion.
                for i in range(len(_aspect_idx)):
                    if not _aspect_cross[i]:
                        category = ID2CATEGORY[
                            pred_single_aspect_category_target[b][i]]
                        polarity = ID2POLARITY[
                            pred_single_aspect_polarity_target[b][i]]
                        aspect = decode_span(origin_batch_X[b], _aspect_idx[i])
                        label.append((batch_idx[b], aspect, '_',
                                      category, polarity))

                # Opinions with no matching aspect.
                for j in range(len(_opinion_idx)):
                    if not _opinion_cross[j]:
                        category = ID2CATEGORY[
                            pred_single_opinion_category_target[b][j]]
                        polarity = ID2POLARITY[
                            pred_single_opinion_polarity_target[b][j]]
                        opinion = decode_span(origin_batch_X[b],
                                              _opinion_idx[j])
                        label.append((batch_idx[b], '_', opinion,
                                      category, polarity))

            for _label in label:
                pred_file.write(','.join(str(x) for x in _label) + '\n')
            pbar.update(batch_size)
            pbar.set_description('step: %d' % step)
    pbar.close()
def tag_sent(text):
    """Tag a sentence with a fine-tuned BERT token classifier.

    Tokenizes *text* with NLTK + WordPiece, wraps it in [CLS]/[SEP], runs the
    classifier, and maps the argmax label ids back to tag names via
    se_data/tags.txt.

    Args:
        text: raw sentence string.

    Returns:
        Tuple of (wordpiece tokens without [CLS]/[SEP], predicted tags for
        those tokens).

    NOTE(review): sentences whose wordpiece length exceeds max_len are
    truncated on the model side only, so the returned token and tag lists can
    disagree in length for very long inputs — preserved from the original,
    confirm whether callers rely on it.
    """
    num_tags = 24   # depends on the labelling scheme
    max_len = 45
    vocabulary = "bert_models/vocab.txt"
    bert_out_address = 'bert/model'
    tokenizer = BertTokenizer(vocab_file=vocabulary, do_lower_case=False)
    model = BertForTokenClassification.from_pretrained(bert_out_address,
                                                       num_labels=num_tags)

    # tags.txt maps "TAG  index" per line; build both directions of the map.
    with open('se_data/tags.txt') as f:
        lines = f.readlines()
    tag2idx = {}
    for line in lines:
        parts = line.split()
        tag2idx[parts[0].strip()] = int(parts[1].strip())
    tag2name = {idx: name for name, idx in tag2idx.items()}

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    if torch.cuda.is_available():
        model.cuda()
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)
    model.eval()

    # WordPiece-tokenize each NLTK word and frame with BERT special tokens.
    temp_token = ['[CLS]']
    for word in nltk.word_tokenize(text):
        temp_token.extend(tokenizer.tokenize(word))
    temp_token.append('[SEP]')
    tokenized_texts = [temp_token]

    input_ids = pad_sequences(
        [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
        maxlen=max_len, dtype="long", truncating="post", padding="post")
    # id 0 is the pad token, so a nonzero id marks a real position.
    attention_masks = [[int(i > 0) for i in ii] for ii in input_ids]

    tr_inputs = torch.tensor(input_ids).to(device)
    tr_masks = torch.tensor(attention_masks).to(device)

    # Inference only — no_grad avoids building the autograd graph.
    # (segment ids were previously built but token_type_ids=None is passed,
    # so that dead computation was removed.)
    with torch.no_grad():
        outputs = model(tr_inputs, token_type_ids=None,
                        attention_mask=tr_masks)
    logits = outputs[0]
    logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
    logits = logits.detach().cpu().numpy()

    tags_t = [tag2name[t] for t in logits[0]]
    c = len(tokenized_texts[0])
    trimmed_tags = tags_t[:c]
    # Strip the [CLS]/[SEP] positions from both tokens and tags.
    return tokenized_texts[0][1:-1], trimmed_tags[1:-1]