def main():
    word_piece = WordpieceTokenizer(bert_vocab)
    # get dataset
    # train_dataset, test_dataset = get_dataset(word_vocab, char_vocab)
    train_dataset, test_dataset = get_dataset_bert(word_vocab, char_vocab, word_piece)
    # vocab_size, embedding_size, char_vocab_size, char_embedding_size,
    # num_filter, ngram_filter_size, num_classes
    model = FastText(vocab_size=len(word_vocab), embedding_size=128,
                     char_vocab_size=len(char_vocab), char_embedding_size=50,
                     num_filter=200, ngram_filter_size=[3], num_classes=2,
                     bert_weight_path=bert_weight_path)
    model.to('cuda')
    optimizer = optim.Adam(model.parameters())
    loss_function = nn.CrossEntropyLoss()
    train(model, optimizer, loss_function, train_dataset, test_dataset,
          bert=True, is_print_size=True)
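# WordpieceTokenizer expects a token -> id dict, not a file path, so the
# `bert_vocab` above is presumably built from a vocab.txt file. A minimal
# hypothetical sketch (the helper name and vocab path are assumptions, not
# from the original snippet); pytorch_pretrained_bert.tokenization.load_vocab
# does the same thing.
import collections

def load_bert_vocab(vocab_file):
    """Read a BERT vocab.txt (one wordpiece per line) into an OrderedDict."""
    vocab = collections.OrderedDict()
    with open(vocab_file, 'r', encoding='utf-8') as f:
        for index, line in enumerate(f):
            vocab[line.strip()] = index
    return vocab

bert_vocab = load_bert_vocab('bert-base-uncased-vocab.txt')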
def __init__(self, data):
    super(WordRep, self).__init__()
    print("build word representation...")
    self.gpu = data.HP_gpu
    self.batch_size = data.HP_batch_size
    self.embedding_dim = data.word_emb_dim
    self.drop = nn.Dropout(data.HP_dropout)
    self.word_embedding = nn.Embedding(data.word_alphabet.size(), self.embedding_dim)
    if data.pretrain_word_embedding is not None:
        self.word_embedding.weight.data.copy_(
            torch.from_numpy(data.pretrain_word_embedding))
    else:
        self.word_embedding.weight.data.copy_(
            torch.from_numpy(self.random_embedding(data.word_alphabet.size(),
                                                   self.embedding_dim)))
    # self.hiddentoEmbdim = nn.Linear(768, self.embedding_dim)

    # bert feature
    self.word_alphabet = data.word_alphabet
    self.use_bert = data.use_bert
    if self.use_bert:
        # Load pre-trained model (weights)
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
        self.bert_model.eval()
        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.wpiecetokenizer = WordpieceTokenizer(self.tokenizer.vocab)
    if self.gpu:
        self.drop = self.drop.cuda()
        # self.hiddentoEmbdim = self.hiddentoEmbdim.cuda()
        self.word_embedding = self.word_embedding.cuda()
        if self.use_bert:
            self.bert_model = self.bert_model.cuda()
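# Hedged sketch (not part of the original class) of how the pieces built in
# __init__ are typically combined at feature-extraction time: wordpiece-split
# each word, run BERT, then keep the first piece's vector per word so the
# feature sequence stays aligned with the word sequence. The method name is
# an assumption.
def bert_features_for_words(self, words):
    tokens, first_piece_idx = ['[CLS]'], []
    for w in words:
        first_piece_idx.append(len(tokens))
        tokens.extend(self.wpiecetokenizer.tokenize(w.lower()))
    tokens.append('[SEP]')
    ids = torch.tensor([self.tokenizer.convert_tokens_to_ids(tokens)])
    if self.gpu:
        ids = ids.cuda()
    with torch.no_grad():
        encoded_layers, _ = self.bert_model(ids)
    return encoded_layers[-1][0][first_piece_idx]  # (len(words), 768)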
class MyBertTokenizer(BertTokenizer):

    def __init__(self, vocab_file, do_lower_case=False, max_len=None):
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. "
                "To load the vocabulary from a Google pretrained model use "
                "`tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
        self.vocab = tokenization.load_vocab(vocab_file)
        self.ids_to_tokens = OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.max_len = max_len if max_len is not None else int(1e12)

    def tokenize(self, text):
        orig_tokens = tokenization.whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)
        return split_tokens
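# Minimal usage sketch, assuming a local 'vocab.txt' in BERT WordPiece format.
# Note the overridden tokenize() skips BasicTokenizer entirely, so the input
# must already be whitespace-split (and lower-cased if the vocab is uncased).
tokenizer = MyBertTokenizer('vocab.txt')
print(tokenizer.tokenize('who was jim henson ?'))
# out-of-vocabulary words come back as '##'-prefixed wordpieces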
# Load the pre-trained model tokenizer (vocabulary)
EN_VOCAB = './bert-large-cased-vocab.txt'
CN_VOCAB = './vocab.txt'

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_text = tokenizer.tokenize(text2)
print(tokenized_text)

# tokenizer = BasicTokenizer.from_pretrained(VOCAB)
basicTokenizer = BasicTokenizer()
tokenized_text = basicTokenizer.tokenize(text2)
print(tokenized_text)

# tokenizer = WordpieceTokenizer.from_pretrained(VOCAB)
# WordpieceTokenizer takes a token -> id dict, not a file path,
# so load the vocabulary file first.
from pytorch_pretrained_bert.tokenization import load_vocab
wordpieceTokenizer = WordpieceTokenizer(vocab=load_vocab(CN_VOCAB))
tokenized_text = wordpieceTokenizer.tokenize(text2)
print(tokenized_text)

# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
# assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]',
#                           'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']

# Convert tokens to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define the sentence A / B segment indices
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
# Convert inputs to PyTorch tensors
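# Hedged continuation of the step the final comment announces, following the
# pytorch-pretrained-bert README pattern; everything below is an assumption,
# not part of the original snippet.
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Predict the masked token back with BertForMaskedLM
from pytorch_pretrained_bert import BertForMaskedLM
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()
with torch.no_grad():
    predictions = model(tokens_tensor, segments_tensors)
predicted_index = torch.argmax(predictions[0, masked_index]).item()
print(tokenizer.convert_ids_to_tokens([predicted_index])[0])  # e.g. 'henson'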
class Text2Vec(object):

    def __init__(self):
        super(Text2Vec, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.tokenizer.vocab)
        self.model = BertModel.from_pretrained('bert-base-uncased')
        self.model.eval()  # inference only
        # Load pre-computed static BERT word vectors into a token -> vector dict.
        self.bert_word2vec = []
        bert_word2vec_ = {}
        with open('bert-base-uncased.30522.768d.vec', 'r') as f:
            for e in f.readlines():
                self.bert_word2vec.append(e.split())
        for e in self.bert_word2vec[1:]:  # skip the header line
            bert_word2vec_[e[0]] = np.array(e[1:]).astype('float32')
        self.bert_word2vec = bert_word2vec_

    def _subword(self, sent):
        # Map each wordpiece to the index of the word it belongs to:
        # '##' pieces share the index of the preceding piece.
        index = 0
        sub_ids = []
        for w in sent:
            if w[:2] != '##':
                sub_ids.append(index)
                index += 1
            else:
                sub_ids.append(index - 1)
        return sub_ids

    def _merge_emb(self, sub_ids, embs):
        # Average the wordpiece embeddings that belong to the same word.
        assert len(sub_ids) == len(embs)
        d = {}
        for i in range(sub_ids[-1] + 1):
            d[i] = []
        for index, emb in zip(sub_ids, embs):
            d[index].append(emb)
        merged_emb = []
        for i in range(sub_ids[-1] + 1):
            merged_emb.append(torch.mean(torch.stack(d[i], dim=0), dim=0))
        return merged_emb

    def text2vec(self, text, only_emb=False):
        tokenized_sent = self.wordpiece_tokenizer.tokenize(text.lower())
        subword_ids = self._subword(tokenized_sent)
        if only_emb:
            # Static embeddings: look up each wordpiece in the .vec table
            # (keys are str in Python 3, so no encoding is needed).
            embs = []
            for token in tokenized_sent:
                embs.append(self.bert_word2vec[token])
            embs = np.array(embs)
            d = {}
            for i in range(subword_ids[-1] + 1):
                d[i] = []
            for index, emb in zip(subword_ids, embs):
                d[index].append(emb)
            merged_emb = []
            for i in range(subword_ids[-1] + 1):
                merged_emb.append(np.mean(np.stack(d[i], axis=0), axis=0))
            embs = np.array(merged_emb)
        else:
            # Contextual embeddings from the second-to-last encoder layer.
            indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_sent)
            segments_ids = [0] * len(indexed_tokens)
            # Convert inputs to PyTorch tensors
            tokens_tensor = torch.tensor([indexed_tokens])
            segments_tensors = torch.tensor([segments_ids])
            # Predict hidden states features for each layer
            encoded_layers, _ = self.model(tokens_tensor, segments_tensors)
            # s1_reps = torch.mean(encoded_layers[-2][0], 1)
            embs = encoded_layers[-2][0]
            embs = self._merge_emb(subword_ids, embs)
            embs = torch.stack(embs, dim=0).cpu().detach().numpy()
        return embs
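# Usage sketch: with only_emb=True the vectors come from the static .vec
# lookup table; otherwise from BERT's second-to-last encoder layer. Either
# way the result is one 768-d row per word, with '##' wordpieces averaged
# back into their parent word. The example sentence is illustrative only.
t2v = Text2Vec()
vecs = t2v.text2vec('the puppeteer smiled')
print(vecs.shape)  # (3, 768): 'puppet' and '##eer' merged into one row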