Example #3
def main():
    word_piece = WordpieceTokenizer(bert_vocab)
    #get dataset
    #train_dataset, test_dataset = get_dataset(word_vocab, char_vocab)
    #vocab_size, embedding_size,char_vocab_size, char_embedding_size, num_filter, ngram_filter_size, num_classes
    train_dataset, test_dataset = get_dataset_bert(word_vocab, char_vocab,
                                                   word_piece)
    model = FastText(vocab_size=len(word_vocab), embedding_size=128,
                     char_vocab_size=len(char_vocab), char_embedding_size=50,
                     num_filter=200, ngram_filter_size=[3], num_classes=2,
                     bert_weight_path=bert_weight_path)
    model.to('cuda')
    optimizer = optim.Adam(model.parameters())
    loss_function = nn.CrossEntropyLoss()
    train(model,
          optimizer,
          loss_function,
          train_dataset,
          test_dataset,
          bert=True,
          is_print_size=True)
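
The snippet above relies on module-level globals (bert_vocab, word_vocab, char_vocab, bert_weight_path) defined elsewhere in the project. As a minimal sketch, assuming the pytorch_pretrained_bert package these examples appear to use, bert_vocab could be prepared for WordpieceTokenizer like this; the vocab path is a placeholder, not taken from the original project:

# Sketch only: build the token -> id dict that WordpieceTokenizer expects.
# 'vocab.txt' is a placeholder path.
from pytorch_pretrained_bert.tokenization import WordpieceTokenizer, load_vocab

bert_vocab = load_vocab('vocab.txt')              # OrderedDict: token -> id
word_piece = WordpieceTokenizer(vocab=bert_vocab)
print(word_piece.tokenize('unaffable'))           # e.g. ['un', '##aff', '##able']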
Example #4
    def __init__(self, data):
        super(WordRep, self).__init__()
        print("build word representation...")
        self.gpu = data.HP_gpu
        self.batch_size = data.HP_batch_size

        self.embedding_dim = data.word_emb_dim
        self.drop = nn.Dropout(data.HP_dropout)
        self.word_embedding = nn.Embedding(data.word_alphabet.size(),
                                           self.embedding_dim)
        if data.pretrain_word_embedding is not None:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(data.pretrain_word_embedding))
        else:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(data.word_alphabet.size(),
                                          self.embedding_dim)))

        # self.hiddentoEmbdim = nn.Linear(768, self.embedding_dim)

        # bert feature
        self.word_alphabet = data.word_alphabet
        self.use_bert = data.use_bert

        if self.use_bert:
            # Load pre-trained model (weights)
            self.bert_model = BertModel.from_pretrained('bert-base-uncased')
            self.bert_model.eval()
            # Load pre-trained model tokenizer (vocabulary)
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            self.wpiecetokenizer = WordpieceTokenizer(self.tokenizer.vocab)

        if self.gpu:
            self.drop = self.drop.cuda()
            # self.hiddentoEmbdim = self.hiddentoEmbdim.cuda()
            self.word_embedding = self.word_embedding.cuda()
            if self.use_bert:
                self.bert_model = self.bert_model.cuda()
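
A minimal illustration (not part of WordRep) of how the tokenizer pair built above is typically applied: a word is split into WordPiece sub-tokens and mapped to BERT vocabulary ids before being fed to bert_model.

# Sketch only: splitting a word into WordPieces and mapping them to ids,
# mirroring self.tokenizer / self.wpiecetokenizer above.
from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert.tokenization import WordpieceTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
wpiecetokenizer = WordpieceTokenizer(tokenizer.vocab)
sub_tokens = wpiecetokenizer.tokenize('puppeteer')        # e.g. ['puppet', '##eer']
token_ids = tokenizer.convert_tokens_to_ids(sub_tokens)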
Example #5
class MyBertTokenizer(BertTokenizer):
    def __init__(self, vocab_file, do_lower_case=False, max_len=None):
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'."
                "To load the vocabulary from a Google pretrained "
                "model use "
                "`tokenizer = "
                "BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                    vocab_file))

        self.vocab = tokenization.load_vocab(vocab_file)
        self.ids_to_tokens = OrderedDict([(ids, tok)
                                          for tok, ids in self.vocab.items()])
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.max_len = max_len if max_len is not None else int(1e12)

    def tokenize(self, text):
        orig_tokens = tokenization.whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)
        return split_tokens
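
A hedged usage sketch for the class above; 'vocab.txt' stands in for a real BERT vocabulary file, and convert_tokens_to_ids is inherited from BertTokenizer:

# Sketch only: instantiating the class above with a placeholder vocab path.
my_tokenizer = MyBertTokenizer('vocab.txt')
tokens = my_tokenizer.tokenize('jim henson was a puppeteer')
ids = my_tokenizer.convert_tokens_to_ids(tokens)   # inherited BertTokenizer method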
Example #6
# Load the vocabulary for the pre-trained model tokenizer
EN_VOCAB = './bert-large-cased-vocab.txt'

CN_VOCAB = './vocab.txt'

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_text = tokenizer.tokenize(text2)
print(tokenized_text)

# tokenizer = BasicTokenizer.from_pretrained(VOCAB)
basicTokenizer = BasicTokenizer()
tokenized_text = basicTokenizer.tokenize(text2)
print(tokenized_text)

# tokenizer = WordpieceTokenizer.from_pretrained(VOCAB)
# WordpieceTokenizer expects a token -> id dict rather than a file path, so the
# vocab file is loaded first (load_vocab comes from pytorch_pretrained_bert.tokenization)
wordpieceTokenizer = WordpieceTokenizer(load_vocab(CN_VOCAB))
tokenized_text = wordpieceTokenizer.tokenize(text2)
print(tokenized_text)

# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
# assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet',
#                           '##eer', '[SEP]']

# Convert the tokens to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define the sentence A / B (segment) indices
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

# Convert the inputs to PyTorch tensors
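
The fragment stops at the comment above. A hedged sketch of the step it leads up to, following the standard pytorch_pretrained_bert masked-LM recipe; torch and BertForMaskedLM are assumed to be imported, which the original fragment does not show:

# Sketch only: convert the inputs to tensors and predict the masked token.
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

masked_lm = BertForMaskedLM.from_pretrained('bert-base-uncased')
masked_lm.eval()
with torch.no_grad():
    predictions = masked_lm(tokens_tensor, segments_tensors)
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
print(predicted_token)   # 'henson' in the canonical masked-sentence example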
Example #7
class Text2Vec(object):
    def __init__(self):
        super(Text2Vec, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.tokenizer.vocab)
        self.model = BertModel.from_pretrained('bert-base-uncased')
        self.model.eval()  # disable dropout so feature extraction is deterministic

        # Load pre-computed static BERT token vectors into a token -> vector dict.
        self.bert_word2vec = []
        with open('bert-base-uncased.30522.768d.vec', 'r') as f:
            for line in f:
                self.bert_word2vec.append(line.split())

        bert_word2vec_ = {}
        for e in self.bert_word2vec[1:]:  # skip the .vec header line
            bert_word2vec_[e[0]] = np.array(e[1:]).astype('float32')
        self.bert_word2vec = bert_word2vec_
        
    def _subword(self, sent):
        # Map each WordPiece to the index of the original word it belongs to.
        index = 0
        sub_ids = []
        for w in sent:
            if w[:2] != '##':
                sub_ids.append(index)
                index += 1
            else:
                sub_ids.append(index - 1)
        return sub_ids
    
    def _merge_emb(self, sub_ids, embs):
        # Average the embeddings of all WordPieces that share the same word index.
        assert len(sub_ids) == len(embs)
        d = {}
        for i in range(sub_ids[-1] + 1):
            d[i] = []

        for index, emb in zip(sub_ids, embs):
            d[index].append(emb)

        merged_emb = []
        for i in range(sub_ids[-1] + 1):
            merged_emb.append(torch.mean(torch.stack(d[i], dim=0), dim=0))

        return merged_emb
        
    def text2vec(self, text, only_emb=False):
        tokenized_sent = self.wordpiece_tokenizer.tokenize(text.lower())
        subword_ids = self._subword(tokenized_sent)

        if only_emb:
            # Static vectors: look every WordPiece up in the pre-computed table.
            embs = []
            for token in tokenized_sent:
                # Keys of bert_word2vec are plain str tokens; the original
                # token.encode('utf-8') lookup only worked under Python 2.
                embs.append(self.bert_word2vec[token])
            embs = np.array(embs)

            d = {}
            for i in range(subword_ids[-1] + 1):
                d[i] = []

            for index, emb in zip(subword_ids, embs):
                d[index].append(emb)

            merged_emb = []
            for i in range(subword_ids[-1] + 1):
                merged_emb.append(np.mean(np.stack(d[i], axis=0), axis=0))

            embs = np.array(merged_emb)

        else:
            # Contextual vectors: run the sentence through the full BERT model.
            indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_sent)
            segments_ids = [0] * len(indexed_tokens)

            # Convert inputs to PyTorch tensors
            tokens_tensor = torch.tensor([indexed_tokens])
            segments_tensors = torch.tensor([segments_ids])

            # Predict hidden states features for each layer
            encoded_layers, _ = self.model(tokens_tensor, segments_tensors)
            # s1_reps = torch.mean(encoded_layers[-2][0], 1)

            embs = encoded_layers[-2][0]

            embs = self._merge_emb(subword_ids, embs)
            embs = torch.stack(embs, dim=0).cpu().detach().numpy()
        return embs
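
A hedged usage sketch for Text2Vec; it assumes the hard-coded 'bert-base-uncased.30522.768d.vec' file from __init__ is present on disk:

# Sketch only: contextual vectors, one 768-dim row per whitespace word.
t2v = Text2Vec()
vecs = t2v.text2vec('jim henson was a puppeteer')
print(vecs.shape)            # expected (5, 768): 5 words, BERT-base hidden size

# Static (non-contextual) vectors from the pre-computed table instead:
static_vecs = t2v.text2vec('jim henson was a puppeteer', only_emb=True)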