def test_long_sequence_splitting(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased", max_length=4)
    text = ["AllenNLP", "is", "great"]
    tokens = tokenizer.tokenize(" ".join(["[CLS]"] + text + ["[SEP]"]))
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    assert len(expected_ids) == 7  # just to make sure it's what we're expecting
    cls_id, sep_id = expected_ids[0], expected_ids[-1]
    expected_ids = (
        expected_ids[:3]
        + [sep_id, cls_id]
        + expected_ids[3:5]
        + [sep_id, cls_id]
        + expected_ids[5:]
    )

    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices([Token(word) for word in text], vocab)
    assert indexed["token_ids"] == expected_ids
    # [CLS] allen ##nl [SEP] [CLS] ##p is [SEP] [CLS] great [SEP]
    assert indexed["segment_concat_mask"] == [1] * len(expected_ids)
    # allennlp is great
    assert indexed["mask"] == [1] * len(text)
    # [CLS] allen ##nl ##p is great [SEP]
    assert indexed["wordpiece_mask"] == [1] * 7
def test_bert(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    indexer = PretrainedTransformerMismatchedIndexer("bert-base-cased")
    text = ["AllenNLP", "is", "great"]
    tokens = tokenizer.tokenize(" ".join(["[CLS]"] + text + ["[SEP]"]))
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)

    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices([Token(word) for word in text], vocab)
    assert indexed["token_ids"] == expected_ids
    assert indexed["mask"] == [1] * len(text)
    # Hardcoding a few things because we know how BERT tokenization works
    assert indexed["offsets"] == [(1, 3), (4, 4), (5, 5)]
    assert indexed["wordpiece_mask"] == [1] * len(expected_ids)

    keys = indexed.keys()
    assert indexer.get_empty_token_list() == {key: [] for key in keys}

    max_length = 10
    padding_lengths = {key: max_length for key in keys}
    padded_tokens = indexer.as_padded_tensor_dict(indexed, padding_lengths)
    for key in keys:
        padding_length = max_length - len(indexed[key])
        padding = (0, 0) if key == "offsets" else 0
        expected_value = indexed[key] + ([padding] * padding_length)
        assert len(padded_tokens[key]) == max_length
        if key == "offsets":
            expected_value = [list(t) for t in expected_value]
        assert padded_tokens[key].tolist() == expected_value
def tmp():
    config = "configs/bert_pretrain.jsonnet"
    serialization_dir = "models"
    output_dir = "bert_out"
    tokenizer_conllu_path = "data/coptic/converted/train"

    documents = read_conllu_files(tokenizer_conllu_path)
    sentences = []
    for document in documents:
        for sentence in document:
            sentences.append(" ".join([t['form'] for t in sentence]))

    print("Training tokenizer...")
    os.environ["TOKENIZER_PATH"] = output_dir
    t = train_bert_tokenizer(sentences, serialize_path=output_dir, vocab_size=6000)

    tok = PretrainedTransformerTokenizer("./bert_out/")
    idx = PretrainedTransformerMismatchedIndexer("./bert_out/")
    vocab = Vocabulary()
    vocab.set_from_file("bert_out/vocab.txt", oov_token="[UNK]", is_padded=True)

    s = tok.tokenize(sentences[1])
    i = idx.tokens_to_indices(s, vocab)
    print(i)
    print(t)
def test_tokenized_sentences_dataset():
    sentences = [
        "this is a simple sentence .".split(),
        "Claudio Monteverdi ( 15 May 1567 – 29 November 1643 ) was an Italian composer .".split(),
        "string player and maestro di cappella .".split(),
        "A composer of both secular and sacred music , and a pioneer in the development of opera , he is considered a transitional figure .".split(),
        "Between the Renaissance and the Baroque periods of music history .".split(),
        "He was a court musician in Mantua ( c. 1590 – 1613 ) , and then maestro di cappella at St Mark's Basilica in the Republic of Venice .".split(),
        "His surviving music includes nine books of madrigals , in the tradition of earlier".split(),
    ]
    indexer = PretrainedTransformerMismatchedIndexer("bert-large-cased")
    dataset = TokenizedSentencesDataset(sentences, indexer)
    dataset.index_with(Vocabulary())

    iterator = get_bucket_iterator(dataset, 1000, is_trainingset=False)
    for elem in iterator:
        print(elem)

    model = MultilayerPretrainedTransformerMismatchedEmbedder("bert-large-cased", [-1])
    iterator = get_bucket_iterator(dataset, 1000, is_trainingset=False)
    for batch in iterator:
        print(model(**batch["tokens"]["tokens"]))
def test_end_to_end_for_first_sub_token_embedding(self, sub_token_mode: str):
    token_indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased")

    sentence1 = ["A", ",", "AllenNLP", "sentence", "."]
    sentence2 = ["AllenNLP", "is", "open", "source", "NLP", "library"]
    tokens1 = [Token(word) for word in sentence1]
    tokens2 = [Token(word) for word in sentence2]
    vocab = Vocabulary()

    params = Params({
        "token_embedders": {
            "bert": {
                "type": "pretrained_transformer_mismatched",
                "model_name": "bert-base-uncased",
                "sub_token_mode": sub_token_mode,
            }
        }
    })
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    assert tokens["bert"]["mask"].tolist() == [
        [True, True, True, True, True, False],
        [True, True, True, True, True, True],
    ]
    assert tokens["bert"]["offsets"].tolist() == [
        [[1, 1], [2, 2], [3, 5], [6, 6], [7, 7], [0, 0]],
        [[1, 3], [4, 4], [5, 5], [6, 6], [7, 8], [9, 9]],
    ]

    # Attention mask
    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, max(len(sentence1), len(sentence2)), 768)
    assert not torch.isnan(bert_vectors).any()
def __init__(self,
             my_device=torch.device('cuda:2'),
             model_name='roberta.hdf5',
             model_path=current_directory_path + '/external_pretrained_models/'):
    self.answ = "UNKNOWN ERROR"
    self.model_name = model_name
    self.model_path = model_path
    self.first_object = ''
    self.second_object = ''
    self.predicates = ''
    self.aspects = ''
    cuda_device = my_device
    # We can't use a set because a span object is a dict, and dicts are unhashable;
    # add_span() is used instead to keep the list free of duplicates.
    self.spans = []

    try:
        print(self.model_path + self.model_name)
        print(model_path + "vocab_dir")
        vocab = Vocabulary.from_files(model_path + "vocab_dir")

        BERT_MODEL = 'google/electra-base-discriminator'
        embedder = PretrainedTransformerMismatchedEmbedder(model_name=BERT_MODEL)
        text_field_embedder = BasicTextFieldEmbedder({'tokens': embedder})
        seq2seq_encoder = PassThroughEncoder(input_dim=embedder.get_output_dim())
        print("encoder loaded")

        self.indexer = PretrainedTransformerMismatchedIndexer(model_name=BERT_MODEL)
        print("indexer loaded")

        self.model = SimpleTagger(text_field_embedder=text_field_embedder,
                                  vocab=vocab,
                                  encoder=seq2seq_encoder,
                                  calculate_span_f1=True,
                                  label_encoding='IOB1').cuda(device=cuda_device)
        self.model.load_state_dict(torch.load(self.model_path + self.model_name))
        print("model loaded")

        self.reader = Conll2003DatasetReader(token_indexers={'tokens': self.indexer})
        print("reader loaded")
    except Exception as e:
        print("exception while mapping to GPU in extractor:", e)
        raise RuntimeError("Init extractor: can't map to GPU. Maybe it is OOM")

    try:
        self.predictor = SentenceTaggerPredictor(self.model, self.reader)
    except Exception as e:
        print("exception while creating predictor:", e)
        raise RuntimeError("Init extractor: can't create predictor")
def load_dataset(self, data_folder):
    corpus = ColumnCorpus(data_folder,
                          {0: 'text', 1: 'ner'},
                          train_file='train.txt',
                          test_file='test.txt',
                          dev_file='dev.txt')  # We do not need a dev set

    indexer = PretrainedTransformerMismatchedIndexer(model_name=self.config.BERT_MODEL_TYPE)
    self.reader = GeniaDatasetReader(token_indexers={'tokens': indexer}, tag_label='ner')
    self.train_dataset_for_voc = self.reader.read(data_folder + '/train.txt')

    # Creating tag dictionaries
    self.idx2tag, self.tag2idx = make_bert_tag_dict_from_flair_corpus(corpus)
    self.tags = list(set(tag.split('-')[1] for tag in self.idx2tag if len(tag.split('-')) > 1))
    print('Tags:', self.tags)

    # Convert into the format suitable for training
    self.X_train, self.y_train = prepare_corpus(corpus.train)
    self.X_test, self.y_test = prepare_corpus(corpus.test)

    # Convert into the format suitable for visualization
    # self.y_train_dict = convert_y_to_dict_format(self.X_train, self.y_train)
    self.X_helper = create_helper(self.X_train)
    print("self.X_train[:1]:")
    print(self.X_train[:1])
    print("self.X_helper.head():")
    print(self.X_helper.head())
    self.y_seed_dict = [None for _ in range(len(self.X_helper))]
def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer] = None,
    domain_identifier: str = None,
    bert_model_name: str = None,
    with_spans: bool = False,
    mismatched_tokens: bool = False,
    random_sample: bool = False,
    random_seed: int = None,
    limit: int = -1,
    print_violations: bool = False,
    label_namespace: str = "labels",
    **kwargs,
) -> None:
    super().__init__(**kwargs)
    self._token_indexers = token_indexers or {
        "tokens": PretrainedTransformerIndexer(model_name=bert_model_name)
    }
    if mismatched_tokens:
        self._token_indexers = {
            "tokens": PretrainedTransformerMismatchedIndexer(model_name=bert_model_name)
        }
    self._dummy_indexer = {"dummy_tokens": SingleIdTokenIndexer()}
    self._domain_identifier = domain_identifier
    self._with_spans = with_spans
    self._mismatched_tokens = mismatched_tokens
    self._limit = limit
    self._random_sample = random_sample
    self._random_seed = random_seed
    self._max_sequence_length = 0
    self._print_violations = print_violations
    self._label_namespace = label_namespace

    if bert_model_name is not None:
        self.bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
        if with_spans:
            self.bert_tokenizer_allennlp = PretrainedTransformerTokenizer(model_name=bert_model_name)
        self.lowercase_input = "uncased" in bert_model_name
        self.bert_config = AutoConfig.from_pretrained(bert_model_name)
    else:
        self.bert_tokenizer = None
        self.lowercase_input = False
def __init__(
    self,
    wordpiece_tokenizer: Tokenizer = None,
    token_indexers: Dict[str, TokenIndexer] = None,
    combine_input_fields: bool = None,
    input_parsed: bool = None,
    parser: StanzaPipeline = None,
    input_fields: List = None,
    **kwargs,
) -> None:
    # super().__init__(manual_distributed_sharding=True, **kwargs)
    super(NLIGraphReader, self).__init__(**kwargs)
    self._wordpiece_tokenizer = wordpiece_tokenizer or PretrainedTransformerTokenizer(config.TRANSFORMER_NAME)
    self._token_indexers = token_indexers or {
        "tokens": PretrainedTransformerMismatchedIndexer(config.TRANSFORMER_NAME)
    }
    self._combine_input_fields = combine_input_fields or False
    self._input_parsed = input_parsed if (input_parsed is not None) else True
    self._parser = parser or None
    self.f = input_fields or config.default_fields
def test_token_without_wordpieces(self):
    token_indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased")

    sentence1 = ["A", "", "AllenNLP", "sentence", "."]
    sentence2 = ["AllenNLP", "", "great"]
    tokens1 = [Token(word) for word in sentence1]
    tokens2 = [Token(word) for word in sentence2]
    vocab = Vocabulary()

    params = Params({
        "token_embedders": {
            "bert": {
                "type": "pretrained_transformer_mismatched",
                "model_name": "bert-base-uncased",
            }
        }
    })
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    assert tokens["bert"]["offsets"].tolist() == [
        [[1, 1], [-1, -1], [2, 4], [5, 5], [6, 6]],
        [[1, 3], [-1, -1], [4, 4], [0, 0], [0, 0]],
    ]

    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, max(len(sentence1), len(sentence2)), 768)
    assert not torch.isnan(bert_vectors).any()
    assert all(bert_vectors[0, 1] == 0)
    assert all(bert_vectors[1, 1] == 0)
def test_throws_error_on_incorrect_sub_token_mode(self, sub_token_mode: str):
    token_indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased")

    sentence1 = ["A", ",", "AllenNLP", "sentence", "."]
    sentence2 = ["AllenNLP", "is", "open", "source", "NLP", "library"]
    tokens1 = [Token(word) for word in sentence1]
    tokens2 = [Token(word) for word in sentence2]
    vocab = Vocabulary()

    params = Params({
        "token_embedders": {
            "bert": {
                "type": "pretrained_transformer_mismatched",
                "model_name": "bert-base-uncased",
                "sub_token_mode": sub_token_mode,
            }
        }
    })
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    with pytest.raises(ConfigurationError):
        token_embedder(tokens)
def test_exotic_tokens_no_nan_grads(self):
    token_indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased")

    sentence1 = ["A", "", "AllenNLP", "sentence", "."]
    sentence2 = ["A", "\uf732\uf730\uf730\uf733", "AllenNLP", "sentence", "."]
    tokens1 = [Token(word) for word in sentence1]
    tokens2 = [Token(word) for word in sentence2]
    vocab = Vocabulary()

    token_embedder = BasicTextFieldEmbedder(
        {"bert": PretrainedTransformerMismatchedEmbedder("bert-base-uncased")}
    )

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    bert_vectors = token_embedder(tokens)
    test_loss = bert_vectors.mean()
    test_loss.backward()

    for name, param in token_embedder.named_parameters():
        grad = param.grad
        assert (grad is None) or (not torch.any(torch.isnan(grad)).item())
from allennlp.data import Vocabulary, DataLoader
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer
from nlp_tools.nlp_models.multilayer_pretrained_transformer_mismatched_embedder import \
    MultilayerPretrainedTransformerMismatchedEmbedder
from nlp_tools.data_io.datasets import TokenizedSentencesDataset

import numpy as np

if __name__ == "__main__":
    encoder_name = "bert-large-cased"
    print("loading indexer")
    indexer = PretrainedTransformerMismatchedIndexer(encoder_name)
    print("loading embedder")
    embedder = MultilayerPretrainedTransformerMismatchedEmbedder(
        encoder_name,
        layers_to_merge=[-1, -2, -3, -4],
    )

    s1 = "When the New York Stock Exchange heard the announcement , equities plummeted , causing a chain reaction of bank runs and failures throughout the United States that signaled the arrival of the Panic of 1873 to American shores .".split()
    s2 = "On the north bank of the river , WIS 42 turns northwest onto North Water Street and follows the river in a northerly direction to Forestville in Door County , where it becomes Forestville Avenue .".split()
    sentences = [s1, s2]

    dataset = TokenizedSentencesDataset(sentences, indexer)
    dataset.index_with(Vocabulary())
    # iterator = get_bucket_iterator(dataset, 1000, is_trainingset=False,
    #                                device=torch.device("cuda"))
    iterator = DataLoader(dataset, batch_size=2)

    outputs = list()
    for batch in iterator:
        batch_output = list()
        net_out = embedder(**batch["tokens"]["tokens"])
        # Contextual vectors for the token "bank" in each sentence.
        a = net_out[0][18].detach().numpy()
        b = net_out[1][3].detach().numpy()
        a = a / np.linalg.norm(a, keepdims=True)
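        # (Added sketch, not part of the original script: it presumably goes on
        # to compare the two contextual vectors for "bank". Assuming cosine
        # similarity is the intended measure, the comparison could look like this.)
        b = b / np.linalg.norm(b, keepdims=True)
        print("cosine similarity between the two 'bank' tokens:", float(np.dot(a, b)))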
import logging
import shutil
import sys

import numpy as np
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

from allennlp.data import Vocabulary
from allennlp.data.batch import Batch
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer
from allennlp.modules.token_embedders import PretrainedTransformerMismatchedEmbedder
# Conll2003DatasetReader ships with the allennlp-models package.
from allennlp_models.tagging.dataset_readers import Conll2003DatasetReader

# CONFIG SECTION
expname = "exp1"
logger = logging.getLogger(__name__)
model_name = "bert-base-cased"

indexers = {"bert": PretrainedTransformerMismatchedIndexer(model_name, namespace="bert")}
reader = Conll2003DatasetReader(token_indexers=indexers)
train_dataset = reader.read("conll2003/eng.train")
validation_dataset = reader.read("conll2003/eng.testa")
test_dataset = reader.read("conll2003/eng.testb")

all_insts = train_dataset + validation_dataset + test_dataset
vocab = Vocabulary.from_instances(all_insts)
dataset = Batch(all_insts)
dataset.index_instances(vocab)

embedder = PretrainedTransformerMismatchedEmbedder(model_name, last_layer_only=True)
# This pattern is typically used in cases where your input data is already
# tokenized, so we're showing that here.
text_tokens = ["This", "is", "some", "frandibulous", "text", "."]
tokens = [Token(x) for x in text_tokens]
print(tokens)

# We're using a very small transformer here so that it runs quickly in binder. You
# can change this to any transformer model name supported by Hugging Face.
transformer_model = 'google/reformer-crime-and-punishment'

# Represents the list of word tokens with a sequence of wordpieces as determined
# by the transformer's tokenizer. This actually results in a pretty complex data
# type, which you can see by running this. It's complicated because we need to
# know how to combine the wordpieces back into words after running the
# transformer.
indexer = PretrainedTransformerMismatchedIndexer(model_name=transformer_model)

text_field = TextField(tokens, {'transformer': indexer})
text_field.index(Vocabulary())
token_tensor = text_field.as_tensor(text_field.get_padding_lengths())

# There are two key things to notice in this output. First, there are two masks:
# `mask` is a word-level mask that gets used in the utility functions described in
# the last section of this chapter. `wordpiece_mask` gets used by the `Embedder`
# itself. Second, there is an `offsets` tensor that gives start and end wordpiece
# indices for the original tokens. In the embedder, we grab these, average all of
# the wordpieces for each token, and return the result.
print("Indexed tensors:", token_tensor)

embedding = PretrainedTransformerMismatchedEmbedder(model_name=transformer_model)
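# (Added sketch, not part of the original example: one way to actually run the
# embedder above on the indexed tensors. It assumes the standard AllenNLP pattern
# of wrapping the token embedder in a BasicTextFieldEmbedder; the output has one
# vector per original word token, with the wordpieces for each word averaged.)
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

text_field_embedder = BasicTextFieldEmbedder(token_embedders={'transformer': embedding})
# batch_tensors adds the batch dimension the embedder expects.
tensor_dict = text_field.batch_tensors([token_tensor])
embedded_tokens = text_field_embedder(tensor_dict)
print("Embedded tokens size:", embedded_tokens.size())  # (1, number of word tokens, hidden size)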
def test_auto_determining_num_tokens_added(self):
    indexer = PretrainedTransformerMismatchedIndexer("bert-base-cased")
    assert indexer._determine_num_special_tokens_added() == (1, 1)
import torch
from allennlp.data import Vocabulary, Instance, Token
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer
from allennlp.nn.util import get_token_ids_from_text_field_tensors
from transformers import BertTokenizer, DataCollatorForWholeWordMask

tokenizer = BertTokenizer.from_pretrained('./bert_out')
vocab = Vocabulary(non_padded_namespaces=["tokens"])
vocab.add_transformer_vocab(tokenizer, "tokens")
vocab.get_token_index("[PAD]", "tokens")
idx = PretrainedTransformerMismatchedIndexer("./bert_out", namespace="tokens")


def prepare_instance(s):
    tokens = [Token(t) for t in s.split(" ")]
    indexed = idx.tokens_to_indices(tokens, vocab)
    print([vocab.get_token_from_index(i) for i in indexed['token_ids']])
    return Instance({"tokens": TextField(tokens, {"tokens": idx})})


instances = [prepare_instance("ϩⲙⲡⲣⲁⲛ ⲙⲡⲛⲟⲩⲧⲉ ⲛϣⲟⲣⲡ ⲁⲛⲟⲕ"),
             prepare_instance("ϩⲙⲡⲣⲁⲛ ⲙⲡⲛⲟⲩⲧⲉ ⲛϣⲟⲣⲡ ⲁⲛⲟⲕ")]
for i in instances:
    i["tokens"].index(vocab)
tensors = [i.as_tensor_dict() for i in instances]

collator = DataCollatorForWholeWordMask(tokenizer=tokenizer)
ids = torch.cat([tensors[0]['tokens']['tokens']['token_ids'].unsqueeze(0),
                 tensors[1]['tokens']['tokens']['token_ids'].unsqueeze(0)], dim=0)
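# (Added sketch, not part of the original script: how the whole-word-mask collator
# might then be applied to the batched ids. This assumes DataCollatorForWholeWordMask
# accepts a list of id sequences and returns masked `input_ids` plus MLM `labels`.)
masked_batch = collator([row for row in ids])
print(masked_batch["input_ids"])
print(masked_batch["labels"])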