def run(): """train the model""" # set the logger utils.set_logger(config.log_dir) logging.info("device: {}".format(config.device)) # 处理数据,分离文本和标签 processor = Processor(config) processor.process() logging.info("--------Process Done!--------") # 分离出验证集 word_train, word_dev, label_train, label_dev = load_dev('train') # build dataset train_dataset = NERDataset(word_train, label_train, config) dev_dataset = NERDataset(word_dev, label_dev, config) logging.info("--------Dataset Build!--------") # get dataset size train_size = len(train_dataset) # build data_loader train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=train_dataset.collate_fn) dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=dev_dataset.collate_fn) logging.info("--------Get Dataloader!--------") # Prepare model device = config.device model = BertNER.from_pretrained(config.roberta_model, num_labels=len(config.label2id)) model.to(device) # Prepare optimizer if config.full_fine_tuning: # model.named_parameters(): [bert, classifier, crf] bert_optimizer = list(model.bert.named_parameters()) classifier_optimizer = list(model.classifier.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in bert_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': config.weight_decay}, {'params': [p for n, p in bert_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}, {'params': [p for n, p in classifier_optimizer if not any(nd in n for nd in no_decay)], 'lr': config.learning_rate * 5, 'weight_decay': config.weight_decay}, {'params': [p for n, p in classifier_optimizer if any(nd in n for nd in no_decay)], 'lr': config.learning_rate * 5, 'weight_decay': 0.0}, {'params': model.crf.parameters(), 'lr': config.learning_rate * 5} ] # only fine-tune the head classifier else: param_optimizer = list(model.classifier.named_parameters()) optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer]}] optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate, correct_bias=False) train_steps_per_epoch = train_size // config.batch_size scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=(config.epoch_num // 10) * train_steps_per_epoch, num_training_steps=config.epoch_num * train_steps_per_epoch) # Train the model logging.info("--------Start Training!--------") train(train_loader, dev_loader, model, optimizer, scheduler, config.model_dir)
def simple_run():
    """train without k-fold"""
    # set the logger
    utils.set_logger(config.log_dir)
    # set the gpu to the id given on the command line
    if config.gpu != '':
        device = torch.device(f"cuda:{config.gpu}")
    else:
        device = torch.device("cpu")
    logging.info("device: {}".format(device))
    # process the data: split text from labels
    processor = Processor(config)
    processor.data_process()
    # build the vocabulary
    vocab = Vocabulary(config)
    vocab.get_vocab()
    # split off the dev set
    word_train, word_dev, label_train, label_dev = dev_split(config.train_dir)
    # simple run without k-fold
    run(word_train, label_train, word_dev, label_dev, vocab, device)
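# --- Aside: dev_split() is defined elsewhere in the project. Judging from
# k_fold_run() below, the training file is an .npz with "words"/"labels"
# arrays, so a plausible sketch looks like the following; the dev_ratio
# default is an assumption, not something this excerpt confirms.
import numpy as np
from sklearn.model_selection import train_test_split

def dev_split(dataset_dir, dev_ratio=0.1):
    """Hypothetical sketch: split the training .npz into train/dev arrays."""
    data = np.load(dataset_dir, allow_pickle=True)
    words, labels = data["words"], data["labels"]
    # returns word_train, word_dev, label_train, label_dev
    return train_test_split(words, labels, test_size=dev_ratio, random_state=0)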
def k_fold_run():
    """train with k-fold"""
    # set the logger
    utils.set_logger(config.log_dir)
    # set the gpu to the id given on the command line
    if config.gpu != '':
        device = torch.device(f"cuda:{config.gpu}")
    else:
        device = torch.device("cpu")
    logging.info("device: {}".format(device))
    # process the data: split text from labels
    processor = Processor(config)
    processor.data_process()
    # build the vocabulary
    vocab = Vocabulary(config)
    vocab.get_vocab()
    # load the full training set; the dev set is split off per fold
    data = np.load(config.train_dir, allow_pickle=True)
    words = data["words"]
    labels = data["labels"]
    kf = KFold(n_splits=config.n_split)
    kf_data = kf.split(words, labels)
    kf_index = 0
    total_test_loss = 0
    total_f1 = 0
    for train_index, dev_index in kf_data:
        kf_index += 1
        word_train = words[train_index]
        label_train = labels[train_index]
        word_dev = words[dev_index]
        label_dev = labels[dev_index]
        test_loss, f1 = run(word_train, label_train, word_dev, label_dev, vocab, device, kf_index)
        total_test_loss += test_loss
        total_f1 += f1
    average_test_loss = float(total_test_loss) / config.n_split
    average_f1 = float(total_f1) / config.n_split
    logging.info("Average test loss: {} , average f1 score: {}".format(
        average_test_loss, average_f1))
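# --- Aside: words[train_index] above relies on "words"/"labels" having been
# saved as NumPy object arrays; plain Python lists would not accept an index
# array. A tiny standalone check with made-up data:
import numpy as np
from sklearn.model_selection import KFold

toy_words = np.array([list("abc"), list("de"), list("fgh"), list("ij")], dtype=object)
toy_labels = np.array([["O"] * 3, ["O"] * 2, ["O"] * 3, ["O"] * 2], dtype=object)
for train_index, dev_index in KFold(n_splits=2).split(toy_words):
    print(len(toy_words[train_index]), len(toy_words[dev_index]))  # 2 2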
    print(tags.size())  # torch.Size([1, 1, 512])
    tags = tags.squeeze(0).cpu().numpy().tolist()
    preds = tags[0][1:-1]  # drop the [CLS] and [SEP] positions
    label_entities = get_entities(preds, id2label)
    json_d = {}
    json_d['tag_seq'] = ' '.join([id2label[x] for x in preds])
    json_d['entities'] = label_entities
    print(tokens[1:-1])
    print(json_d['tag_seq'].split(' ')[:input_len])
    print(len(tokens[1:-1]))
    print(len(json_d['tag_seq'].split(' ')[:input_len]))


if __name__ == '__main__':
    processor = Processor()
    label_list = processor.get_labels()
    # map labels to ids and back
    id2label = {i: label for i, label in enumerate(label_list)}
    label2id = {label: i for i, label in enumerate(label_list)}
    num_labels = len(label_list)
    # s = '常建良,男,1963年出生,工科学士,高级工程师,北京物资学院客座副教授。'
    s = ['1', '9', '6', '6', '年', '出', '生', ',', '汉', '族', ',', '中', '共', '党', '员', ',',
         '本', '科', '学', '历', ',', '工', '程', '师', '、', '美', '国', '项', '目', '管', '理',
         '协', '会', '注', '册', '会', '员', '(', 'P', 'M', 'I', 'M', 'e', 'm', 'b', 'e', 'r', ')',
         '、', '注', '册', '项', '目', '管', '理', '专', '家', '(', 'P', 'M', 'P', ')', '、',
         '项', '目', '经', '理', '。']
    tokenizer = BertTokenizer.from_pretrained('./bert_pretrain/vocab.txt')
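# --- Aside: get_entities() comes from the project's utilities and turns the
# predicted tag sequence into (type, start, end) spans. A simplified,
# hypothetical re-implementation for plain BIO tags (the real helper may also
# support BIOS/BMES schemes):
def bio_to_spans(tags):
    """Collect (entity_type, start, end) spans from BIO tags (end inclusive)."""
    spans, start, etype = [], None, None
    for i, tag in enumerate(tags):
        if tag.startswith("B-"):
            if etype is not None:
                spans.append((etype, start, i - 1))
            etype, start = tag[2:], i
        elif tag.startswith("I-") and etype == tag[2:]:
            continue
        else:  # "O" or an I- tag that does not continue the current entity
            if etype is not None:
                spans.append((etype, start, i - 1))
            etype, start = None, None
    if etype is not None:
        spans.append((etype, start, len(tags) - 1))
    return spans

print(bio_to_spans(["O", "B-TITLE", "I-TITLE", "O", "B-ORG"]))
# [('TITLE', 1, 2), ('ORG', 4, 4)]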
def pharmacy_counting():
    start_time = time.time()
    processor = Processor(input_file, output_file)
    processor.process()
    print("program executed: %s" % (time.time() - start_time))
        config.embedding_dir, binary=False, encoding='utf-8')
    vocab_size = len(vocab) + 1
    embed_size = config.embedding_size
    weight = torch.zeros(vocab_size, embed_size)
    cnt = 0
    for i in range(len(word2vec_model.index_to_key)):
        try:
            index = vocab.word_id(word2vec_model.index_to_key[i])
        except Exception:
            # skip words that are not in the task vocabulary
            continue
        cnt += 1
        weight[index, :] = torch.from_numpy(word2vec_model.get_vector(
            vocab.id_word(vocab.word_id(word2vec_model.index_to_key[i]))))
    logging.info("--------Pretrained Embedding Loaded ! ({}/{})--------".format(cnt, len(vocab)))
    return weight


if __name__ == "__main__":
    from data_process import Processor
    from Vocabulary import Vocabulary
    processor = Processor(config)
    processor.data_process()
    # build the vocabulary
    vocab = Vocabulary(config)
    vocab.get_vocab()
    matrix, emb_vocab, size, l = load_embedding_manually(config.embedding_dir)
    print(emb_vocab['i2w'][4])  # 大
    print(vocab.word_id(emb_vocab['i2w'][4]))  # 15
    w = embedding(vocab)
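# --- Aside: the weight matrix returned above is typically handed to an
# nn.Embedding layer. A sketch of the usual wiring; padding_idx=0 is an
# assumption about what the extra "+1" row in vocab_size is for, and the
# placeholder shape below stands in for the real (vocab_size, embed_size).
import torch
from torch import nn

pretrained = torch.zeros(10, 128)  # placeholder for the real weight matrix
embed_layer = nn.Embedding.from_pretrained(pretrained, freeze=False, padding_idx=0)
print(embed_layer(torch.tensor([[1, 2, 3]])).shape)  # torch.Size([1, 3, 128])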