import codecs
import json
import math
from collections import Counter

import numpy as np

import data_utils


def get_ngram_idf(train_data_path, devtest_data_path, to_file, threshold=1):
    print("==" * 40)
    print("==" * 40)
    with open(train_data_path, "r", encoding="utf-8") as f:
        train_microblogs = json.load(f)
    with open(devtest_data_path, "r", encoding="utf-8") as f:
        devtest_microblogs = json.load(f)
    all_microblogs = train_microblogs + devtest_microblogs
    total = len(all_microblogs)
    # Count, for each word, the number of documents it appears in.
    IDF_Counter = Counter()
    vocab = {}
    for microblog in all_microblogs:
        for word in set(microblog["response"]):
            IDF_Counter[word] += 1
            data_utils.set_dict_key_value(vocab, word)
    # Drop words whose document frequency is below the threshold.
    data_utils.removeItemsInDict(vocab, threshold)
    # Smoothed IDF: log(N / (df + 1)).
    dict_idf = {}
    for word in vocab:
        dict_idf[word] = math.log(total / float(IDF_Counter[word] + 1))
    data_utils.save_params(dict_idf, to_file)
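# Worked example of the smoothed IDF above (illustrative numbers only):
# with total = 100 microblogs and a word appearing in 9 of them,
# idf = log(100 / (9 + 1)) = log(10) ≈ 2.3026.
assert abs(math.log(100 / float(9 + 1)) - 2.302585) < 1e-5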
def get_entropy(idf_dic, from_file, to_file):
    # Despite the name, this scores each line by the mean IDF of its words.
    with codecs.open(from_file, 'r', encoding='utf8') as f1:
        dict_entropy = {}
        for line in f1:
            line_list = line.strip().split(" ")
            entropy = 0.0
            for word in line_list:
                # Words filtered out of the IDF dict (below threshold) contribute 0.
                entropy += idf_dic.get(word, 0.0)
            dict_entropy[line.strip()] = entropy / len(line_list)
        data_utils.save_params(dict_entropy, to_file)
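# A minimal sketch wiring the two helpers above together. The paths are
# hypothetical placeholders, and data_utils.load_params is assumed to be
# the read counterpart of the data_utils.save_params calls used above.
def idf_entropy_pipeline_example():
    get_ngram_idf("data/train.json", "data/devtest.json",
                  "output/idf.pkl", threshold=2)
    idf_dic = data_utils.load_params("output/idf.pkl")
    # Scores each whitespace-tokenized line of the input file by its mean IDF.
    get_entropy(idf_dic, "data/candidates.txt", "output/entropy.pkl")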
def init_embedding(self):
    self.word_embed_file = self.config.word_embed_file
    self.word_dim = self.config.word_dim
    self.threshold = self.config.threshold
    self.we_file = self.config.we_file
    self.w2i_file = self.config.w2i_file
    self.r2i_file = self.config.r2i_file
    self.u2i_file = self.config.u2i_file
    if self.config.init:
        # First run: build the vocabularies, load pretrained word
        # embeddings, and cache everything to disk.
        self.utter_vocab, self.res_vocab, self.vocab = self.build_vocab()
        self.embed = data_utils.load_word_embedding(
            self.vocab, self.word_embed_file, self.config, self.word_dim)
        data_utils.save_params(self.vocab, self.w2i_file)
        data_utils.save_params(self.res_vocab, self.r2i_file)
        data_utils.save_params(self.utter_vocab, self.u2i_file)
        data_utils.save_params(self.embed, self.we_file)
    else:
        # Subsequent runs: reload the cached vocabularies and embeddings.
        self.embed = data_utils.load_params(self.we_file)
        self.vocab = data_utils.load_params(self.w2i_file)
        self.res_vocab = data_utils.load_params(self.r2i_file)
        self.utter_vocab = data_utils.load_params(self.u2i_file)
    self.embed = self.embed.astype(np.float32)
    print("vocab size: %d" % len(self.vocab), "we shape: ", self.embed.shape)
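# The config.init flag above switches between building + caching
# (init=True) and reloading the cache (init=False); a typical workflow
# (hypothetical config object) flips it off after the first run:
#
#   config.init = True    # first run: build vocab, write w2i/r2i/u2i/we files
#   config.init = False   # later runs: reuse the cached files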
def init_embedding(self):
    self.word_embed_file = self.config.word_embed_file
    self.word_dim = self.config.word_dim
    self.char_dim = self.config.char_dim
    self.ner_dim = self.config.ner_dim
    self.pos_dim = self.config.pos_dim
    self.threshold = self.config.threshold
    self.we_file = self.config.we_file
    self.w2i_file = self.config.w2i_file
    self.c2i_file = self.config.c2i_file
    self.n2i_file = self.config.n2i_file
    self.p2i_file = self.config.p2i_file
    # the char_embed always init
    if self.config.init:
        self.word_vocab, self.char_vocab, self.ner_vocab, self.pos_vocab = self.build_vocab()
        self.embed = data_utils.load_word_embedding(
            self.word_vocab, self.word_embed_file, self.config, self.word_dim)
        data_utils.save_params(self.word_vocab, self.w2i_file)
        data_utils.save_params(self.char_vocab, self.c2i_file)
        data_utils.save_params(self.ner_vocab, self.n2i_file)
        data_utils.save_params(self.pos_vocab, self.p2i_file)
        data_utils.save_params(self.embed, self.we_file)
    else:
        self.embed = data_utils.load_params(self.we_file)
        self.word_vocab = data_utils.load_params(self.w2i_file)
        self.char_vocab = data_utils.load_params(self.c2i_file)
        self.ner_vocab = data_utils.load_params(self.n2i_file)
        self.pos_vocab = data_utils.load_params(self.p2i_file)
    self.embed = self.embed.astype(np.float32)
    self.char_embed = np.array(np.random.uniform(
        -0.25, 0.25, (len(self.char_vocab), self.char_dim)), dtype=np.float32)
    self.ner_embed = np.array(np.random.uniform(
        -0.25, 0.25, (len(self.ner_vocab), self.ner_dim)), dtype=np.float32)
    self.pos_embed = np.array(np.random.uniform(
        -0.25, 0.25, (len(self.pos_vocab), self.pos_dim)), dtype=np.float32)
    print("vocab size: %d" % len(self.word_vocab), "we shape: ", self.embed.shape)
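# A minimal config sketch for the tagging variant of init_embedding above,
# covering exactly the attributes the method reads. This is a hedged
# example: the real project presumably has its own Config class, and every
# path below is a hypothetical placeholder.
from types import SimpleNamespace

example_config = SimpleNamespace(
    word_embed_file="embeddings/glove.300d.txt",
    word_dim=300, char_dim=50, ner_dim=25, pos_dim=25,
    threshold=1, init=True,
    we_file="cache/we.pkl", w2i_file="cache/w2i.pkl",
    c2i_file="cache/c2i.pkl", n2i_file="cache/n2i.pkl",
    p2i_file="cache/p2i.pkl",
)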
# Assumed imports for the two training routines below. AdamW and
# WarmupLinearSchedule follow the pytorch-transformers 1.x API; the
# project-local helpers (get_examples, get_examples_from_motherfile,
# convert_examples_to_features, create_dataset, evaluate_model, get_batch,
# save_params, and the model classes) are assumed to be imported elsewhere.
import logging
import os
import random
import sys
from timeit import default_timer as timer  # assumed alias for timer() below

import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler
from pytorch_transformers import AdamW, WarmupLinearSchedule


def train(self, output_dir, train_batch_size, gradient_accumulation_steps, seed,
          epochs, data_path, pretrained_path, valid_path, no_cuda=False,
          dropout=0.3, weight_decay=0.01, warmup_proportion=0.1,
          learning_rate=5e-5, adam_epsilon=1e-8, max_seq_length=128,
          squeeze=True, max_grad_norm=1.0, eval_batch_size=32,
          epoch_save_model=False, model_name='BERT', embedding_path=None,
          split_train_data=False, motherfile=False):
    if os.path.exists(output_dir) and os.listdir(output_dir):
        raise ValueError("Output directory (%s) already exists and is not empty."
                         % output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO,
                        filename=os.path.join(output_dir, "log.txt"))
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
    logger = logging.getLogger(__name__)
    if gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: %d, should be >= 1"
                         % gradient_accumulation_steps)
    train_batch_size = train_batch_size // gradient_accumulation_steps
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if motherfile:
        print(data_path)
        train_examples, train_label_list = get_examples_from_motherfile(data_path, 'train')
        val_examples, val_label_list = get_examples_from_motherfile(data_path, 'test')
        train_label_list.extend(val_label_list)
        label_list = list(set(train_label_list))
    elif split_train_data:
        # 60/20/20 train/validation/evaluation split of a single file.
        examples, label_list = get_examples(data_path, 'train')
        random.shuffle(examples)
        train_examples = examples[0:int(len(examples) * 0.6)]
        val_examples = examples[int(len(examples) * 0.6):int(len(examples) * 0.8)]
        eval_examples = examples[int(len(examples) * 0.8):]
    else:
        train_examples, label_list = get_examples(data_path, 'train')
    # Add one for the IGNORE label.
    num_labels = len(label_list) + 1
    num_train_optimization_steps = int(
        len(train_examples) / train_batch_size / gradient_accumulation_steps) * epochs
    hidden_size = 300 if pretrained_path is None else 768 if 'base' in pretrained_path else 1024
    device = 'cuda:0' if (torch.cuda.is_available() and not no_cuda) else 'cpu'
    logger.info(device)
    print(pretrained_path)
    if model_name == 'HERBERT':
        model = AutoTokenizerForTokenClassification(
            pretrained_path=pretrained_path, n_labels=num_labels,
            hidden_size=hidden_size, dropout_p=dropout, device=device)
    elif model_name == 'BERT_MULTILINGUAL':
        model = BertBaseMultilingualCased(
            pretrained_path=pretrained_path, n_labels=num_labels,
            hidden_size=hidden_size, dropout_p=dropout, device=device)
    elif model_name == 'Reformer':
        model = Reformer(n_labels=num_labels, hidden_size=512, dropout=dropout,
                         device=device, max_seq_length=max_seq_length,
                         batch_size=train_batch_size)
    else:
        model = XLMRForTokenClassification(
            pretrained_path=pretrained_path, n_labels=num_labels,
            hidden_size=hidden_size, dropout=dropout, device=device)
    model.to(device)
    # Exclude biases and layer-norm weights from weight decay.
    no_decay = ['bias', 'final_layer_norm.weight']
    params = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in params if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in params if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    warmup_steps = int(warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)
    train_features = convert_examples_to_features(
        train_examples, label_list, max_seq_length, model.encode_word)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)
    train_data = create_dataset(train_features)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(
        train_data, sampler=train_sampler, batch_size=train_batch_size)
    if not split_train_data and not motherfile:
        # Both the split and motherfile modes already provide val_examples.
        val_examples, _ = get_examples(valid_path, 'valid')
    val_features = convert_examples_to_features(
        val_examples, label_list, max_seq_length, model.encode_word)
    val_data = create_dataset(val_features)
    best_val_f1 = 0.0
    for epoch_no in range(1, epochs + 1):
        logger.info("Epoch %d" % epoch_no)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        model.train()
        steps = len(train_dataloader)
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, label_ids, l_mask, valid_ids = batch
            loss = model(input_ids, label_ids, l_mask, valid_ids)
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if step % 5 == 0:
                logger.info('Step = %d/%d; Loss = %.4f'
                            % (step + 1, steps, tr_loss / (step + 1)))
            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()
        logger.info("\nTesting on validation set...")
        f1, report = evaluate_model(model, val_data, label_list,
                                    eval_batch_size, device)
        print(report)
        if f1 > best_val_f1:
            best_val_f1 = f1
            logger.info("\nFound better f1=%.4f on validation set. Saving model\n" % f1)
            logger.info("%s\n" % report)
            torch.save(model.state_dict(),
                       open(os.path.join(output_dir, 'model.pt'), 'wb'))
            save_params(output_dir, dropout, num_labels, label_list)
        if epoch_save_model:
            epoch_output_dir = os.path.join(output_dir, "e%03d" % epoch_no)
            os.makedirs(epoch_output_dir)
            torch.save(model.state_dict(),
                       open(os.path.join(epoch_output_dir, 'model.pt'), 'wb'))
            save_params(epoch_output_dir, dropout, num_labels, label_list)
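# Hypothetical invocation of train() above; `trainer` stands in for
# whatever object the method is defined on, and all paths and values are
# illustrative placeholders.
#
# trainer.train(
#     output_dir="runs/xlmr_ner",
#     train_batch_size=32,
#     gradient_accumulation_steps=4,  # weights update every 4 batches
#     seed=42,
#     epochs=5,
#     data_path="data/train",
#     pretrained_path="xlm-roberta-base",  # 'base' in the path -> hidden_size 768
#     valid_path="data/valid",
# )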
def train(self, model, x_train, y_train, label_map, epochs, train_batch_size,
          seed, x_valid, y_valid, gradient_accumulation_steps, output_dir,
          max_seq_length=128, weight_decay=0.01, warmup_proportion=0.1,
          learning_rate=0.01, adam_epsilon=1e-8, no_cuda=False,
          max_grad_norm=1.0, eval_batch_size=32, epoch_save_model=False,
          dropout=0.2, save=True, logger=None):
    if not logger:
        logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
        logger = logging.getLogger(__name__)
    num_train_optimization_steps = int(
        len(x_train) / train_batch_size / gradient_accumulation_steps) * epochs
    params = list(model.named_parameters())
    no_decay = ['bias', 'final_layer_norm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in params if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in params if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    warmup_steps = int(warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)
    if save and os.path.exists(output_dir) and os.listdir(output_dir):
        raise ValueError("Output directory (%s) already exists and is not empty."
                         % output_dir)
    if save and not os.path.exists(output_dir):
        os.makedirs(output_dir)
    device = 'cuda:1' if (torch.cuda.is_available() and not no_cuda) else 'cpu'
    logger.info(device)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    model.to(device)
    best_val_f1 = 0.0
    best_precision = 0.0
    best_recall = 0.0
    epoch_times = []
    # Ceil division so the final partial batch is included
    # (the original `len(x_train) % train_batch_size` was a bug).
    steps = (len(x_train) + train_batch_size - 1) // train_batch_size
    for epoch_no in range(1, epochs + 1):
        start = timer()
        logger.info("Epoch %d" % epoch_no)
        tr_loss = 0
        model.train()
        for step in range(0, steps):
            div = int(step * train_batch_size)
            if len(x_train) > div + train_batch_size:
                input_ids, label_ids, l_mask, valid_ids = get_batch(
                    x_train[div:div + train_batch_size],
                    y_train[div:div + train_batch_size],
                    device=device, embed_method=model.encode_word,
                    max_seq_length=max_seq_length, label_map=label_map,
                    batch_size=train_batch_size)
            else:
                # Final partial batch.
                input_ids, label_ids, l_mask, valid_ids = get_batch(
                    x_train[div:], y_train[div:],
                    device=device, embed_method=model.encode_word,
                    max_seq_length=max_seq_length, label_map=label_map,
                    batch_size=train_batch_size)
            loss = model(input_ids, label_ids, l_mask, valid_ids)
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            tr_loss += loss.item()
            if step % 5 == 0:
                logger.info('Step = %d/%d; Loss = %.4f'
                            % (step + 1, steps, tr_loss / (step + 1)))
            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()
        epoch_times.append(timer() - start)
        logger.info("\nTesting on validation set...")
        f1, report, precision, recall = self.evaluate_model(
            model, x_valid, y_valid, label_map, eval_batch_size, device,
            max_seq_length)
        print(report)
        if f1 > best_val_f1:
            best_val_f1 = f1
            best_precision = precision
            best_recall = recall
            logger.info("\nFound better f1=%.4f on validation set. Saving model\n" % f1)
            print(report)
            if save:
                torch.save(model.state_dict(),
                           open(os.path.join(output_dir, 'model.pt'), 'wb'))
                save_params(output_dir, dropout, len(label_map.keys()),
                            list(label_map.keys()))
        if save and epoch_save_model:
            epoch_output_dir = os.path.join(output_dir, "e%03d" % epoch_no)
            os.makedirs(epoch_output_dir)
            torch.save(model.state_dict(),
                       open(os.path.join(epoch_output_dir, 'model.pt'), 'wb'))
            save_params(epoch_output_dir, dropout, len(label_map.keys()),
                        list(label_map.keys()))
    print("Avg. epoch time")
    print(np.mean(epoch_times))
    return best_val_f1, best_precision, best_recall
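# Quick sanity check of the ceil-division batch count used in the training
# loop above (pure illustration, no project code needed): 1000 examples at
# batch size 32 give 31 full batches plus one partial batch of 8.
n_examples, batch_size = 1000, 32
steps = (n_examples + batch_size - 1) // batch_size
assert steps == 32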