def __repr__(self):
    Print('Configuration settings:', 'information')
    Print('--------------------------------', 'information')
    ret = ''
    for key in self.parser:
        ret += key + '\t\t' + str(self.parser[key]) + '\n'
    return ret
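# `Print` is used throughout this module but is not defined in this section.
# A minimal sketch, assuming it is a thin wrapper around `print` that
# color-codes messages by level (the level names match the calls below; the
# colors themselves are assumptions):
def Print(message, level='information'):
    colors = {
        'information': '\033[94m',  # blue
        'success': '\033[92m',      # green
        'warning': '\033[93m',      # yellow
        'error': '\033[91m',        # red
    }
    # fall back to uncolored output for unknown levels
    print(colors.get(level, '') + message + '\033[0m')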
def __init__(self, parser, config_file):
    self.parser = None
    if parser:
        self.parser = parser
        if config_file:
            self.save_to_json_file(config_file)
        else:
            Print('A configuration file is required', 'information')
    # the parsed arguments are serialized to json, then read back, so that
    # `self.parser` ends up as a plain dict (see the dict-style access in
    # `__repr__` and elsewhere)
    if config_file:
        Print(config_file + ' loading', 'information')
        self.load_from_json_file(config_file)
    else:
        Print('But no file was found!', 'warning')
def get_index_word(self, idx):
    """ Simple `idx2word[idx]`. """
    if idx >= self.word_size:
        Print('No idx ' + str(idx) + ' in alphabet ' + self.name, 'error')
    else:
        return self.idx2word[idx]
def get_word_index(self, word):
    """ Simple `word2idx[word]`. """
    if word not in self.word2idx:
        Print('No word ' + word + ' in alphabet ' + self.name, 'error')
    else:
        return self.word2idx[word]
def __init__(self, input_size, embed_dim, dropout_prob, pre_build_embedding,
             idx2word, bert_dir):
    super(Glove_Bert_Embedding, self).__init__()
    self.input_size, self.embed_dim = input_size, embed_dim
    # Glove branch: an embedding table initialized from pre-built vectors
    self.embedding = nn.Embedding(input_size, self.embed_dim)
    self.embedding.weight.data.copy_(torch.from_numpy(pre_build_embedding))
    self.dropout = nn.Dropout(dropout_prob)
    # BERT branch: a torch-saved model loaded from disk, kept in eval mode
    Print('Building Bert model...', 'information')
    # self.bert_model = BertModel.from_pretrained('bert-base-uncased')
    with open(bert_dir, 'rb') as f:
        self.bert_model = torch.load(f)
    self.bert_model.eval()
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.idx2word = idx2word
    Print('Word embedding model built.', 'success')
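# Construction sketch (hypothetical values: `reader` is assumed to be the
# Reader whose `build_pre_embedding` appears below, 'HP_dropout' is an assumed
# config key, and 'sources/bert.pkl' an assumed path to a torch-saved
# BertModel):
#
#     embed_layer = Glove_Bert_Embedding(
#         input_size=reader.word_dict.word_size,
#         embed_dim=reader.config.parser['word_embed_dim'],
#         dropout_prob=reader.config.parser['HP_dropout'],
#         pre_build_embedding=reader.build_pre_embedding(),
#         idx2word=reader.word_dict.idx2word,
#         bert_dir='sources/bert.pkl')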
def save_to_json_file(self, config_file):
    r""" Save configuration as json. """
    if config_file is None:
        Print('No config file found.', 'error')
        return
    with open(config_file, 'w') as f:
        json.dump(vars(self.parser.parse_args()), f)
def load_from_json_file(self, config_file):
    r""" Load configuration from json. """
    if config_file is None:
        Print('No config file found.', 'error')
        return
    with open(config_file, 'r') as f:
        self.parser = json.load(f)
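# Usage sketch (hypothetical names; `arg_parser` is an argparse.ArgumentParser
# and 'sources/config.json' an assumed path):
#
#     arg_parser = argparse.ArgumentParser()
#     arg_parser.add_argument('--batch_size', type=int, default=16)
#     config = Config(arg_parser, 'sources/config.json')  # saves, then reloads
#     print(config)                 # __repr__ lists every key/value pair
#     config.parser['batch_size']   # after loading, `parser` is a plain dict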
def add_word(self, word, ignore_existed_word=True):
    """ Add a word to the dictionary. """
    if word in self.word2idx:
        if not ignore_existed_word:
            Print('word ' + word + ' already exists in alphabet ' + self.name,
                  'warning')
    else:
        self.word2idx[word] = self.word_size
        self.idx2word[self.word_size] = word
        self.word_size += 1
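# Usage sketch of the alphabet API above (`alphabet` is a hypothetical
# instance of this class):
#
#     alphabet.add_word('hello')               # no-op if it already exists
#     idx = alphabet.get_word_index('hello')
#     assert alphabet.get_index_word(idx) == 'hello'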
def build_word_dict(self, file_type):
    """ Build word dictionary from file_type.

    Args:
        file_type: Valid in range ['train', 'test', 'dev'].
    """
    file_name = self.config.parser[file_type + '_dir']
    with open(file_name, 'r') as f:
        for line in f.readlines():
            contents = line.strip().split(' ')
            # an empty line is the separator between two batches;
            # a lone doc id starts a new batch whose `docid` is that id;
            # otherwise the line holds a word and its tag separated by a blank
            if len(contents) >= 2:
                word, _ = contents[0], contents[1]
                self.word_dict.add_word(word)
    Print(f'word dict from {file_name} is added', 'success')
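# Data format sketch, inferred from the parsing above: one token and its tag
# per line separated by a single blank, an empty line between two batches, and
# a single-field doc-id line opening each batch (values here are illustrative
# only):
#
#     doc_0042
#     EU B-ORG
#     rejects O
#     German B-MISC
#
#     doc_0043
#     ...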
def train(reader, model):
    print(reader.config)
    optim_choice = reader.config.parser['optimizer']
    # switch optimizer
    if optim_choice.lower() == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              reader.config.parser['HP_learning_rate'],
                              momentum=reader.config.parser['momentum'],
                              weight_decay=reader.config.parser['HP_l2'])
    elif optim_choice.lower() == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(),
                                  reader.config.parser['HP_learning_rate'],
                                  weight_decay=reader.config.parser['HP_l2'])
    else:
        # default adam
        Print('optimizer is not found, using Adam by default', 'warning')
        optimizer = optim.Adam(model.parameters(),
                               reader.config.parser['HP_learning_rate'],
                               weight_decay=reader.config.parser['HP_l2'])
    best_dev = -1000000000
    train_doc_idx, train_sentence_list_idx, train_tag_idx = \
        reader.read_from_file('train')
    Print('Start training model...', 'information')
    for epoch in range(reader.config.parser['iteration']):
        Print(f'Epoch: {epoch + 1} / {reader.config.parser["iteration"]}',
              'information')
        # decay the learning rate each epoch when the optimizer is SGD
        if optim_choice.lower() == 'sgd':
            optimizer = lr_decay(optimizer, epoch,
                                 reader.config.parser['HP_lr_decay'],
                                 reader.config.parser['HP_learning_rate'])
        # randomly shuffle the training data
        train_ids = np.arange(len(train_doc_idx))
        np.random.shuffle(train_ids)
        model.train()
        batch_size = reader.config.parser['batch_size']
        batch_nums = len(train_doc_idx) // batch_size
        tqdm_iter = tqdm.tqdm(range(batch_nums))
        tqdm_iter.set_description('training on batch', refresh=False)
        loss_sum = 0
        for batch_id in tqdm_iter:
            model.zero_grad()
            start = batch_id * batch_size
            end = start + batch_size
            # the following three are the training inputs
            train_doc = [train_doc_idx[idx] for idx in train_ids[start:end]]
            train_sentence_list = [
                train_sentence_list_idx[idx] for idx in train_ids[start:end]
            ]
            train_tag = [train_tag_idx[idx] for idx in train_ids[start:end]]
            word_seq_tensor, word_seq_len, seq_order_recovery,\
                sentence_tensor_list, tag_seq_tensor, seq_mask =\
                batchify_generation(train_doc, train_sentence_list,
                                    train_tag, True)
            loss, tag_seq = model.forward(sentence_tensor_list,
                                          word_seq_tensor, word_seq_len,
                                          tag_seq_tensor, seq_mask)
            loss.backward()
            optimizer.step()
            loss_sum += loss.item()
            '''
            if epoch % 200 == 0:
                # generate dev data
                dev_word_seq_tensor, dev_sentence_tensor_list, dev_tag_seq_tensor =\
                    reader.read_from_file('dev')
                dev_word_seq_tensor, dev_word_seq_len, dev_seq_order_recovery,\
                    dev_sentence_tensor_list, dev_tag_seq_tensor, dev_seq_mask =\
                    batchify_generation(dev_word_seq_tensor,
                                        dev_sentence_tensor_list,
                                        dev_tag_seq_tensor, False)
                # develop
                loss, _ = model.forward(dev_sentence_tensor_list,
                                        dev_word_seq_tensor, dev_word_seq_len,
                                        dev_tag_seq_tensor, dev_seq_mask)
                Print(f'dev loss {loss.item()} on batch {batch_id + 1}',
                      'information')
                if loss.item() < best_dev:
                    best_dev = loss.item()
                    torch.save(model.state_dict(), 'sources/model.pkl')
                    Print('Model saved at "sources/model.pkl"', 'success')
                del dev_word_seq_tensor, dev_word_seq_len,\
                    dev_seq_order_recovery, dev_sentence_tensor_list,\
                    dev_tag_seq_tensor, dev_seq_mask
            '''
        Print(f'average loss {loss_sum / batch_nums: .4f}', 'information')
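# `lr_decay` is called above but not defined in this section. A minimal
# sketch, assuming the common NCRF++-style schedule that anneals the initial
# learning rate by a fixed decay rate each epoch (the signature is inferred
# from the call site):
def lr_decay(optimizer, epoch, decay_rate, init_lr):
    lr = init_lr / (1 + decay_rate * epoch)
    # write the decayed rate into every parameter group
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer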
def build_pre_embedding(self, use_saved_embed=False):
    """ Build word embedding from pre-trained Glove model by default.

    For a word not in the pre-trained Glove model, we use a random
    vector as its embedding. This should run after building the word
    dictionary. If `embedding_save_dir` is set in the configuration
    json file, the embedding is read directly from it.
    """
    if use_saved_embed and \
            self.config.parser['embedding_save_dir'] != '':
        Print(
            f'reading saved embedding file from '
            f'{self.config.parser["embedding_save_dir"]}',
            'information'
        )
        with open(self.config.parser['embedding_save_dir'], 'rb') as f:
            pretrain_embed = pickle.load(f)
    else:
        if self.config.parser['embed_dir'] is None:
            Print('Pre-trained embedding file not available.', 'error')
            return
        embed_file = self.config.parser['embed_dir']
        # load the pre-trained Glove model and store it as a dict
        pretrain_embed = {}
        with open(embed_file, 'r', encoding='utf-8') as f:
            tqdm_iter = tqdm.tqdm(f.readlines())
            tqdm_iter.set_description('read from pre-trained file',
                                      refresh=False)
            for line in tqdm_iter:
                embed_content = line.strip().split()
                word, embed_content = embed_content[0], embed_content[1:]
                if self.config.parser['word_embed_dim'] < 0:
                    # infer the embedding dimension from the first line
                    self.config.parser['word_embed_dim'] = len(embed_content)
                elif self.config.parser['word_embed_dim'] != len(embed_content):
                    # invalid embedding word
                    continue
                embed_content = np.array([float(x) for x in embed_content])
                pretrain_embed[word] = embed_content
        if self.config.parser['embedding_save_dir'] != '':
            with open(self.config.parser['embedding_save_dir'], 'wb') as f:
                pickle.dump(pretrain_embed, f)
            Print(
                f'pre-trained embedding dictionary is saved at '
                f'{self.config.parser["embedding_save_dir"]}',
                'success'
            )
    embed_dim = self.config.parser['word_embed_dim']
    # use the pre-trained vector if the word (or its lowercase form) is
    # found, otherwise generate one uniformly at random
    self.embedding = np.empty([self.word_dict.word_size, embed_dim])
    scale = np.sqrt(3 / embed_dim)
    perfect_match, case_match, not_match = 0, 0, 0
    for word, index in self.word_dict.word2idx.items():
        if word in pretrain_embed:
            self.embedding[index, :] = self.norm2one(pretrain_embed[word]) \
                if self.config.parser['norm_word_embed'] \
                else pretrain_embed[word]
            perfect_match += 1
        elif word.lower() in pretrain_embed:
            self.embedding[index, :] = \
                self.norm2one(pretrain_embed[word.lower()]) \
                if self.config.parser['norm_word_embed'] \
                else pretrain_embed[word.lower()]
            case_match += 1
        else:
            # not found
            self.embedding[index, :] = np.random.uniform(
                -scale, scale, [embed_dim])
            not_match += 1
    Print(
        f'Pre-trained embedding loaded from {self.config.parser["embed_dir"]},\n'
        f'pre-trained words: {len(pretrain_embed)}, perfect match {perfect_match},\n'
        f'case match {case_match}, not match {not_match},\n'
        f'oov {not_match / self.word_dict.word_size}',
        'success'
    )
    return self.embedding
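# `norm2one` is used above but not defined in this section. A minimal sketch,
# assuming it rescales a vector to unit L2 norm (the usual normalization when
# `norm_word_embed` is set):
def norm2one(self, vec):
    root_sum_square = np.sqrt(np.sum(np.square(vec)))
    return vec / root_sum_square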