Example #1
 def __repr__(self):
     Print('Configuration settings:', 'information')
     Print('--------------------------------', 'information')
     ret = ''
     for key in self.parser:
         ret += key + '\t\t' + str(self.parser[key]) + '\n'
     return ret
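All of these examples call a `Print` helper that is never shown. A minimal sketch of one plausible implementation, assuming it is just a leveled console logger (the real helper may colorize or route output differently):

def Print(message, level='information'):
    # Assumed behavior: tag each message with its level
    # ('information', 'warning', 'error', 'success').
    print(f'[{level.upper()}] {message}')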
Example #2
 def __init__(self, parser, config_file):
     self.parser = None
     if parser:
         self.parser = parser
         if config_file:
             self.save_to_json_file(config_file)
     else:
         Print('A configuration file is required', 'information')
         if config_file:
             Print(config_file + ' loading', 'information')
             self.load_from_json_file(config_file)
         else:
             Print('But no file was found!', 'warning')
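A hypothetical usage of this constructor, assuming the enclosing class is named `Configuration` (the class name and the `sources/config.json` path are illustrative) and that `save_to_json_file`/`load_from_json_file` are the methods from Examples #6 and #7:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', type=int, default=32)

# Build from a parser and persist the parsed arguments as json.
config = Configuration(parser, 'sources/config.json')

# Or rebuild a configuration from a previously saved file alone.
config = Configuration(None, 'sources/config.json')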
Example #3
 def get_index_word(self, idx):
     """
     Simple `idx2word[idx]`.
     """
     if idx >= self.word_size:
         Print(f'No idx {idx} in alphabet {self.name}', 'error')
     else:
         return self.idx2word[idx]
Example #4
 def get_word_index(self, word):
     """
     Simple `word2idx[word]`.
     """
     if word not in self.word2idx:
         Print('No word ' + word + ' in alphabet ' + self.name, 'error')
     else:
         return self.word2idx[word]
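A hypothetical round trip through these two lookups, assuming an alphabet class (here called `Alphabet`, an illustrative name) whose constructor initializes `name`, `word2idx`, `idx2word`, and `word_size`, and which also exposes the `add_word` method from Example #8:

alphabet = Alphabet('words')              # hypothetical constructor
alphabet.add_word('hello')
idx = alphabet.get_word_index('hello')    # -> 0
word = alphabet.get_index_word(idx)       # -> 'hello'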
Example #5
    def __init__(self, input_size, embed_dim, dropout_prob,
                 pre_build_embedding, idx2word, bert_dir):
        super(Glove_Bert_Embedding, self).__init__()
        self.input_size, self.embed_dim =\
            input_size, embed_dim
        self.embedding = nn.Embedding(input_size, self.embed_dim)
        self.embedding.weight.data.copy_(torch.from_numpy(pre_build_embedding))
        self.dropout = nn.Dropout(dropout_prob)

        Print('Building Bert model...', 'information')
        # self.bert_model = BertModel.from_pretrained('bert-base-uncased')
        with open(bert_dir, 'rb') as f:
            self.bert_model = torch.load(f)
        self.bert_model.eval()
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.idx2word = idx2word
        Print('Word embedding model built.', 'success')
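A hypothetical construction of this module, assuming `pre_build_embedding` is an `(input_size, embed_dim)` numpy matrix such as the one returned by `build_pre_embedding` in Example #11, `idx2word` is the alphabet's index-to-word dict, and `bert_dir` points at a pickled `BertModel` (the path and sizes below are illustrative):

import numpy as np

vocab_size, embed_dim = 10000, 100
pre_build_embedding = np.random.uniform(-0.1, 0.1, (vocab_size, embed_dim))
embedder = Glove_Bert_Embedding(vocab_size, embed_dim, dropout_prob=0.5,
                                pre_build_embedding=pre_build_embedding,
                                idx2word=idx2word,
                                bert_dir='sources/bert.pkl')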
Example #6
 def save_to_json_file(self, config_file):
     r"""
     Save configuration as json.
     """
     if config_file is None:
         Print('No config file found.', 'error')
         return
     with open(config_file, 'w') as f:
         json.dump(vars(self.parser.parse_args()), f)
Example #7
 def load_from_json_file(self, config_file):
     r"""
     Load configuration from json.
     """
     if config_file is None:
         Print('No config file found.', 'error')
         return
     with open(config_file, 'r') as f:
         self.parser = json.load(f)
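A hypothetical save/load round trip, again assuming the `Configuration` class from Example #2. Note that after loading, `self.parser` is a plain dict rather than an `argparse` parser, which is why the other examples index it as `config.parser[key]`:

config = Configuration(parser, None)
config.save_to_json_file('sources/config.json')

restored = Configuration(None, 'sources/config.json')
print(restored.parser['batch_size'])   # parser is now a plain dict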
Example #8
 def add_word(self, word, ignore_existed_word=True):
     """
     Add a word to the dictionary.
     """
     if word in self.word2idx:
         if not ignore_existed_word:
             Print('word ' + word + ' already exists in alphabet ' +
                   self.name, 'warning')
     else:
         self.word2idx[word] = self.word_size
         self.idx2word[self.word_size] = word
         self.word_size += 1
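A short sketch of the duplicate handling above, assuming `word2idx` and `idx2word` start as empty dicts and `word_size` starts at 0:

alphabet.add_word('hello')                              # word_size becomes 1
alphabet.add_word('hello')                              # silently ignored
alphabet.add_word('hello', ignore_existed_word=False)   # warns; index stays 0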
Example #9
    def build_word_dict(self, file_type):
        """
        Build word dictionary from file_type.

        Args:
            file_type: Valid in range ['train', 'test', 'dev'].
        """
        file_name = self.config.parser[file_type + '_dir']
        with open(file_name, 'r') as f:
            for line in f.readlines():
                contents = line.strip().split(' ')
                # an empty line is a separator between two batches
                # a bare doc id starts a new batch whose `docid` is that id
                # otherwise the line holds a word and its tag separated by a blank
                if len(contents) >= 2:
                    word, _ = contents[0], contents[1]
                    self.word_dict.add_word(word)
        Print(f'word dict built from {file_name}', 'success')
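The file parsed here looks CoNLL-style: one `word tag` pair per line, a bare doc id opening each batch, and a blank line separating batches. A hypothetical end-to-end call, assuming `reader.config.parser['train_dir']` points at the file written below (the path and contents are illustrative):

with open('sources/train.txt', 'w') as f:
    f.write('doc_001\n')        # doc id line: starts a new batch
    f.write('John B-PER\n')     # word / tag pairs
    f.write('runs O\n')
    f.write('\n')               # blank line: batch separator

reader.build_word_dict('train')  # reads reader.config.parser['train_dir']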
Example #10
def train(reader, model):
    print(reader.config)
    optim_choice = reader.config.parser['optimizer']

    # switch optimizer
    if optim_choice.lower() == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              reader.config.parser['HP_learning_rate'],
                              momentum=reader.config.parser['momentum'],
                              weight_decay=reader.config.parser['HP_l2'])
    elif optim_choice.lower() == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(),
                                  reader.config.parser['HP_learning_rate'],
                                  weight_decay=reader.config.parser['HP_l2'])
    else:  # default adam
        Print('optimizer is not found, using Adam by default', 'warning')
        optimizer = optim.Adam(model.parameters(),
                               reader.config.parser['HP_learning_rate'],
                               weight_decay=reader.config.parser['HP_l2'])

    best_dev = float('inf')  # best (lowest) dev loss seen so far

    train_doc_idx, train_sentence_list_idx, train_tag_idx =\
        reader.read_from_file('train')

    Print('Start training model...', 'information')
    for epoch in range(reader.config.parser['iteration']):
        Print(f'Epoch: {epoch + 1} / {reader.config.parser["iteration"]}',
              'information')

        # update learning rate if optimizer is 'SGD'
        if optim_choice.lower() == 'sgd':
            optimizer = lr_decay(optimizer, epoch,
                                 reader.config.parser['HP_lr_decay'],
                                 reader.config.parser['HP_learning_rate'])
        # random shuffle train data
        train_ids = np.arange(len(train_doc_idx))
        np.random.shuffle(train_ids)

        model.train()
        batch_size = reader.config.parser['batch_size']
        batch_nums = len(train_doc_idx) // batch_size
        tqdm_iter = tqdm.tqdm(range(batch_nums))
        tqdm_iter.set_description('training on batch', refresh=False)
        loss_sum = 0
        for batch_id in tqdm_iter:
            model.zero_grad()
            start = batch_id * batch_size
            end = start + batch_size

            # the following three are train inputs
            train_doc = [train_doc_idx[idx] for idx in train_ids[start:end]]
            train_sentence_list = [
                train_sentence_list_idx[idx] for idx in train_ids[start:end]
            ]
            train_tag = [train_tag_idx[idx] for idx in train_ids[start:end]]

            word_seq_tensor, word_seq_len, seq_order_recovery, sentence_tensor_list, tag_seq_tensor,\
                seq_mask = batchify_generation(
                    train_doc, train_sentence_list, train_tag, True)
            loss, tag_seq = model.forward(sentence_tensor_list,
                                          word_seq_tensor, word_seq_len,
                                          tag_seq_tensor, seq_mask)

            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            '''
            if epoch % 200 == 0:
                # generate dev data
                dev_word_seq_tensor, dev_sentence_tensor_list, dev_tag_seq_tensor =\
                    reader.read_from_file('dev')
                dev_word_seq_tensor, dev_word_seq_len, dev_seq_order_recovery, dev_sentence_tensor_list, dev_tag_seq_tensor,\
                    dev_seq_mask = batchify_generation(
                        dev_word_seq_tensor, dev_sentence_tensor_list, dev_tag_seq_tensor, False)
                
                # develop
                loss, _ = model.forward(dev_sentence_tensor_list, dev_word_seq_tensor,
                                        dev_word_seq_len, dev_tag_seq_tensor, dev_seq_mask)
                Print(f'dev loss {loss.item()} on batch {batch_id + 1}', 'information')
                if loss.item() < best_dev:
                    best_dev = loss.item()
                    torch.save(model.state_dict(), 'sources/model.pkl')
                    Print('Model saved at "sources/model.pkl"', 'success')
                
                del dev_word_seq_tensor, dev_word_seq_len, dev_seq_order_recovery,\
                    dev_sentence_tensor_list, dev_tag_seq_tensor, dev_seq_mask
            '''
        Print(f'average loss {loss_sum / batch_nums:.4f}', 'information')
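`lr_decay` is called above but not shown. A minimal sketch, assuming the usual NCRF++-style schedule where the learning rate decays as `init_lr / (1 + decay_rate * epoch)` (the real helper may differ):

def lr_decay(optimizer, epoch, decay_rate, init_lr):
    # Assumed schedule: lr = init_lr / (1 + decay_rate * epoch).
    lr = init_lr / (1 + decay_rate * epoch)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer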
Example #11
    def build_pre_embedding(self, use_saved_embed=False):
        """
        Build word embedding from pre-trained Glove model by default.
        For words not in the pre-trained Glove model,
        a random vector is used as the embedding.

        This should be called after building the word dictionary.

        If `embedding_save_dir` is set in the configuration json file,
        the embedding is simply read from it.
        """

        if use_saved_embed and\
            self.config.parser['embedding_save_dir'] != '':
            Print(
                f'reading saved embedding file from '\
                f'{self.config.parser["embedding_save_dir"]}',
                'information'
            )
            with open(self.config.parser['embedding_save_dir'], 'rb') as f:
                pretrain_embed = pickle.load(f)
        else:
            if self.config.parser['embed_dir'] is None:
                Print('Pre-trained embedding file not available.', 'error')
                return

            embed_file = self.config.parser['embed_dir']

            # load in pre-trained Glove model, save it as a dict
            pretrain_embed = {}
            with open(embed_file, 'r', encoding='utf-8') as f:
                tqdm_iter = tqdm.tqdm(f.readlines())
                tqdm_iter.set_description('read from pre-trained file', False)
                for line in tqdm_iter:
                    embed_content = line.strip().split()
                    word, embed_content = embed_content[0], embed_content[1:]
                    if self.config.parser['word_embed_dim'] < 0:
                        self.config.parser['word_embed_dim'] = len(
                            embed_content)
                    elif self.config.parser['word_embed_dim'] != len(
                            embed_content):
                        # invalid embedding word
                        continue
                    embed_content = np.array([float(x) for x in embed_content])
                    pretrain_embed[word] = embed_content

            if self.config.parser['embedding_save_dir'] != '':
                with open(self.config.parser['embedding_save_dir'], 'wb') as f:
                    pickle.dump(pretrain_embed, f)
                Print(
                    f'pre-trained embedding dictionary is saved at '\
                    f'{self.config.parser["embedding_save_dir"]}',
                    'success'
                )

        embed_dim = self.config.parser['word_embed_dim']

        # build embedding if find it in pre-trained model
        # else randomly generate one.
        self.embedding = np.empty([self.word_dict.word_size, embed_dim])
        scale = np.sqrt(3 / embed_dim)
        perfect_match, case_match, not_match = 0, 0, 0
        for word, index in self.word_dict.word2idx.items():
            if word in pretrain_embed:
                self.embedding[index, :] = self.norm2one(pretrain_embed[word]) \
                    if self.config.parser['norm_word_embed'] else pretrain_embed[word]
                perfect_match += 1
            elif word.lower() in pretrain_embed:
                self.embedding[index, :] = self.norm2one(pretrain_embed[word.lower()]) \
                    if self.config.parser['norm_word_embed'] else pretrain_embed[word.lower()]
                case_match += 1
            else:
                # not found
                self.embedding[index, :] = np.random.uniform(
                    -scale, scale, [embed_dim])
                not_match += 1
        Print(
            f'Pre-trained embedding loaded in from {self.config.parser["embed_dir"]},\n'\
            f'pre-train words: {len(pretrain_embed)}, perfect match {perfect_match},\n'\
            f'case match {case_match}, not match {not_match},\n'\
            f'oov {not_match / self.word_dict.word_size}', 'success'
        )
        return self.embedding
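`self.norm2one` is used above but not shown. A minimal sketch, assuming it rescales a vector to unit Euclidean norm, a common normalization for pre-trained embeddings:

import numpy as np

def norm2one(vec):
    # Assumed behavior: divide the vector by its L2 norm.
    root_sum_square = np.sqrt(np.sum(np.square(vec)))
    return vec / root_sum_square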