def __init__(self, embedder, encoding, bert_model = 'bert-base-chinese'):
    """Build a word-segmentation model around a pretrained BERT encoder.

    Args:
        embedder: object that extracts a per-position vector from BERT
            hidden states; must provide embedding_width().
        encoding: label encoding; must provide domain_size().
        bert_model: name of the pretrained BERT checkpoint to load.
    """
    super(BertForWordSegmentation, self).__init__()
    self.embedder = embedder
    self.encoding = encoding
    # Chinese text is case-irrelevant, but keep casing off explicitly.
    self.tokenizer = BertTokenizer.from_pretrained(bert_model,
                                                   do_lower_case=False)
    # output_hidden_states=True so the embedder can combine layers.
    bert = BertModel.from_pretrained(bert_model, output_hidden_states=True)
    self.model = cudaify(bert)
    head = DropoutClassifier(self.embedder.embedding_width(),
                             self.encoding.domain_size())
    self.classifier = cudaify(head)
def __init__(self, embedder):
    """Build a binary word-segmentation model on multilingual BERT.

    Args:
        embedder: object that extracts a per-position vector from BERT
            hidden states; must provide embedding_width().
    """
    super(BertForWordSegmentation, self).__init__()
    self.embedder = embedder
    checkpoint = 'bert-base-multilingual-cased'
    self.tokenizer = BertTokenizer.from_pretrained(checkpoint,
                                                   do_lower_case=False)
    # output_hidden_states=True so the embedder can combine layers.
    self.model = cudaify(
        BertModel.from_pretrained(checkpoint, output_hidden_states=True))
    # Two output classes: segment boundary vs. not.
    self.classifier = cudaify(
        DropoutClassifier(self.embedder.embedding_width(), 2))
def batch_iter(self):
    """Yield (evidence, response) tensor pairs of exactly self.batch_size.

    Currently doesn't yield the incomplete last batch: any trailing
    items that do not fill a whole batch are silently dropped.
    """
    evidence, responses = [], []
    for idx in self.desired_ids:
        evidence.append(self.inst_ds[idx].get_embedding('embed'))
        responses.append(self.inst_ds.sense_id(self.inst_ds[idx].sense))
        if len(evidence) == self.batch_size:
            yield (cudaify(torch.tensor(evidence)),
                   cudaify(torch.tensor(responses)))
            # Start accumulating the next batch from scratch.
            evidence, responses = [], []
def create_and_train_net(training_data, test_data):
    """Train a fresh DropoutClassifier(1536, 2, 200) on the given tensors.

    Args:
        training_data: training tensor (moved to GPU via cudaify).
        test_data: evaluation tensor (moved to GPU via cudaify).

    Returns:
        Whatever train_net returns for the trained classifier.
    """
    training_data = cudaify(training_data)
    test_data = cudaify(test_data)
    print("training size:", training_data.shape)
    print("testing size:", test_data.shape)
    net = cudaify(DropoutClassifier(1536, 2, 200))
    # Batcher is pinned to non-shuffling mode (third argument False).
    batcher = lambda x, y: tensor_batcher(x, y, False)
    return train_net(net, training_data, test_data, batcher,
                     batch_size=96,
                     n_epochs=12,
                     learning_rate=0.001,
                     verbose=True)
def elmo_vectorize(positions, vectors):
    """Embed one target token per sentence with ELMo.

    Args:
        positions: list of ints, the index of the target token in each
            sentence (one per sentence).
        vectors: list of tokenized sentences (lists of token strings),
            as expected by batch_to_ids.

    Returns:
        A detached CPU tensor of shape (batch, embedding_dim) holding the
        layer-averaged ELMo vector of each sentence's target token.
    """
    elmo_ids = batch_to_ids(vectors)
    elmo_ids = cudaify(elmo_ids)
    outputs_dict = elmo(elmo_ids)
    positions = cudaify(torch.tensor(positions))
    representations = outputs_dict["elmo_representations"]
    # Average across however many representation layers ELMo returns,
    # instead of hard-coding 3 (keeps working if N_REPRESENTATIONS changes).
    representations_avged = sum(representations) / len(representations)
    # Build a gather index that selects the target position across the
    # full embedding dimension: (batch, 1, embedding_dim).
    gather_index = torch.stack(
        [positions] * representations_avged.shape[2], dim=1).unsqueeze(1)
    positioned = representations_avged.gather(1, gather_index).squeeze(1).cpu()
    return positioned.detach()
def embed_elmo_avg_right(positions, vectors):
    """Embed each target token as the mean of its ELMo vector and its
    right neighbor's.

    Args:
        positions: list of ints, target token index per sentence.
        vectors: list of tokenized sentences (lists of token strings).

    Returns:
        A detached CPU tensor of shape (batch, embedding_dim).
    """
    print("vectorizing")
    elmo_ids = batch_to_ids(vectors)
    elmo_ids = cudaify(elmo_ids)
    outputs_dict = elmo(elmo_ids)
    positions = cudaify(torch.tensor(positions))
    representations = outputs_dict["elmo_representations"]
    # Average across however many representation layers ELMo returns,
    # instead of hard-coding 3.
    representations_avged = sum(representations) / len(representations)
    # Clamp the neighbor index to the actual sequence length rather than
    # the hard-coded 510, so short batches cannot gather past the end;
    # a token at the last position averages with itself.
    max_index = representations_avged.shape[1] - 1
    next_positions = (positions + 1).clamp(0, max_index)
    # Embedding width taken from the tensor (was hard-coded 1024).
    embed_dim = representations_avged.shape[2]
    gather_index = torch.cat(
        [torch.stack([positions, next_positions], dim=1).unsqueeze(2)]
        * embed_dim, dim=2)
    # Gather the two rows (self, right neighbor) and average them.
    positioned = representations_avged.gather(1, gather_index).mean(1).cpu()
    return positioned.detach()
def embed_elmo_concat_left(positions, vectors):
    """Embed each target token as the concatenation of its left neighbor's
    ELMo vector and its own.

    Args:
        positions: list of ints, target token index per sentence.
        vectors: list of tokenized sentences (lists of token strings).

    Returns:
        A detached CPU tensor of shape (batch, 2 * embedding_dim).
    """
    print("vectorizing")
    elmo_ids = batch_to_ids(vectors)
    elmo_ids = cudaify(elmo_ids)
    outputs_dict = elmo(elmo_ids)
    positions = cudaify(torch.tensor(positions))
    representations = outputs_dict["elmo_representations"]
    # Average across however many representation layers ELMo returns,
    # instead of hard-coding 3.
    representations_avged = sum(representations) / len(representations)
    # Clamp against the real sequence length (was hard-coded 510); a token
    # at position 0 concatenates with itself.
    max_index = representations_avged.shape[1] - 1
    prev_positions = (positions - 1).clamp(0, max_index)
    # Embedding width taken from the tensor (was hard-coded 1024).
    embed_dim = representations_avged.shape[2]
    gather_index = torch.cat(
        [torch.stack([prev_positions, positions], dim=1).unsqueeze(2)]
        * embed_dim, dim=2)
    positioned = representations_avged.gather(1, gather_index)
    # Flatten the (left, self) pair into one concatenated vector per row.
    positioned = positioned.reshape(positioned.shape[0], -1).cpu()
    return positioned.detach()
def create_and_train_net(net, training_data, test_data, verbose):
    """Train the given network and return its best test accuracy.

    Args:
        net: the classifier module to train (moved to GPU via cudaify).
        training_data: training tensor.
        test_data: evaluation tensor.
        verbose: if truthy, print dataset sizes and pass verbosity on
            to train_net.

    Returns:
        The best accuracy achieved during training.
    """
    training_data = cudaify(training_data)
    test_data = cudaify(test_data)
    if verbose:
        print("training size:", training_data.shape)
        print("testing size:", test_data.shape)
    model = cudaify(net)
    # train_net returns (best_net, best_acc); only the accuracy is needed.
    _, top_accuracy = train_net(model, training_data, test_data,
                                tensor_batcher,
                                batch_size=2,
                                n_epochs=10,
                                learning_rate=0.001,
                                verbose=verbose)
    return top_accuracy
def forward(self, input_tokens, labels = None):
    """Run BERT over one tokenized sentence and classify each position.

    Args:
        input_tokens: list of token strings for a single sentence,
            WITHOUT [CLS]/[SEP] (they are added here).
        labels: optional list of integer labels, one per input token.
            The final label is dropped before the loss (comment below
            says it is always 1).

    Returns:
        (result, loss): classifier logits for the processed positions,
        and the cross-entropy loss (None when labels is None).
    """
    # Wrap the sentence in BERT's special tokens: [CLS] tokens... [SEP].
    bert_tokens = []
    bert_tokens.append("[CLS]")
    bert_tokens += input_tokens
    bert_tokens.append("[SEP]")
    indexed_tokens = self.tokenizer.convert_tokens_to_ids(bert_tokens)
    # Batch of one sentence: shape (1, len(indexed_tokens)).
    tokens_tensor = cudaify(torch.tensor([indexed_tokens]))
    outputs = self.model(tokens_tensor)
    # outputs[2]: presumably the all-layer hidden states (the model is
    # constructed with output_hidden_states=True elsewhere in this file)
    # — TODO confirm against the installed transformers version.
    pooled_output = outputs[2]
    processed_list = []
    assert(len(indexed_tokens) == pooled_output[0].shape[1])
    # Indices 1 .. len-3: skips [CLS] (index 0), [SEP] (index len-1),
    # and the last real token (index len-2).  This matches the label
    # truncation below, which drops the final (always-1) label.
    for i in range(1, len(indexed_tokens) - 2):
        # The embedder turns the hidden states at position i into one
        # feature vector of width embedding_width().
        processed_list.append(self.embedder(pooled_output, i))
    processed_tensor = cudaify(torch.cat(processed_list, 0))
    result = self.classifier(processed_tensor)
    loss = None
    if labels is not None:
        loss_fct = torch.nn.CrossEntropyLoss()
        y = cudaify(torch.LongTensor(labels[:(len(labels) - 1)])) # final label is always 1, hence ignored
        loss = loss_fct(result, y)
    return result, loss
# -*- coding: utf-8 -*- from allennlp.modules.elmo import Elmo, batch_to_ids from util import cudaify import torch from IPython.core.debugger import set_trace options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json" weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5" N_REPRESENTATIONS = 3 elmo = Elmo(options_file, weight_file, N_REPRESENTATIONS) elmo = cudaify(elmo) from allennlp.commands.elmo import ElmoEmbedder embedder = ElmoEmbedder() def elmo_vectorize_instance(instance): vectors = embedder.embed_sentence(instance.tokens) vectors = sum(vectors) / 3 embedding = [float(f) for f in vectors[instance.pos]] instance.add_embedding("elmo", embedding) return instance def elmo_vectorize(positions, vectors):